##http://socrates.berkeley.edu/~biostat/Courses/Spring/Ph29632/Data/index.html ##Chapter 10: p. 145 ##Assignment 10: hdat <- read.table("http://www.medepi.net/selvin/hypertension.10.data", header = FALSE, sep="") names(hdat) <- c("Hyper", "Age", "Worked", "History", "Salt", "SPL") attr(hdat, "dictionary") <- list(Hyper = "1 = hypertension, 0 = otherwise", Age= "reported age in years", Worked = "reported years worked", History = "1 = family history of hypertension, 0 = otherwise", Salt = "1 = heavy use, 0 = normal or low use", SPL = "time-weighted average of six sound pressure level measurements" ) head(hdat) str(hdat) ##Descriptive stats from Table 10.1 describe <- function(x){ n <- length(x) Mean <- mean(x) Median <- median(x) Std.Dev <- sd(x) Minimum <- min(x) Maximum <- max(x) cbind(n, Mean, Median, Std.Dev, Minimum, Maximum) } ##Create Table 10.1 tab10.1a <- rbind(describe(hdat$SPL), describe(hdat$Age), describe(hdat$Worked)) rownames(tab10.1a) <- c("SPL", "Age", "Years worked") tab10.1b <- rbind(table(hdat$Salt), table(hdat$History)) rownames(tab10.1b) <- c("Salt consumption", "Family history") tab10.1a tab10.1b ##Create Table 10.2 tab10.2 <- table(hdat$Hyper, hdat$SPL)[2:1,] tot <- apply(tab10.2, 2, sum) prop <- tab10.2[1,]/tot tab10.2 <- rbind(tab10.2, tot, prop) tab10.2 rownames(tab10.2) <- c("Hypertensive", "Nonhypertensive", "Total", "Proportion") round(tab10.2, 3) ##logistic regression on p. 149 m1 <- glm(Hyper ~ SPL, data = hdat, family = binomial) ahat <- m1$coef[1]; bhat <- m1$coef[2] x <- as.numeric(names(table(hdat$SPL))) ##Pr(Hypertension | SPL) p.SPL <- 1/(1 + exp(-(ahat + bhat*x))) p.SPL ##Create Figure 10.1 div <- 10 ##divider number used to get plot to match plot in book windows(5,5) plot(jitter(hdat$SPL), hdat$Hyper/div, type = "p", pch = 3, xlab = "Sound pressure level (SPL)", ylab = "P(hypertension)") lines(x, p.SPL) lines(x, prop, type = "b", pch = 16, lty = 2) abline(h = 0:1/div) legend(75, .07, legend = c("estimated", "data"), lty = 1:2) xadj <- c(0, 0, 0, 0.7, 0, -.8, -2) yadj <- c(.007, .004, .004, .003, -.004, .004, 0) text(x + xadj, prop + yadj, labels = round(prop,3), cex = 0.7) ##Multivariable logistic regression model, p. 149 ##Create Table 10.3 m2 <- glm(Hyper ~ SPL + Age + Worked + History + Salt, data = hdat, family = binomial) tab10.3 <- summary(m2) tab10.3 ##model removing Worked only summary(glm(Hyper ~ SPL + Age + History + Salt, data = hdat, family = binomial)) ##model removing Age only summary(glm(Hyper ~ SPL + Worked + History + Salt, data = hdat, family = binomial)) ##Decile of risk approach to assessing fit, p. 151-153 ##Create Table 10.4 phat.i <- fitted(m2) p.k <- cut(x = phat.i, breaks = quantile(phat.i, seq(0,1,.1)), include.lowest = TRUE) n.k <- as.vector(table(p.k)) n.k ##cases pbar1.k <- tapply(phat.i, p.k, mean) pbar1.k e1.k <- n.k*pbar1.k e1.k o1.k <- tapply(hdat$Hyper, p.k, sum) o1.k ##non-cases pbar0.k <- 1-pbar1.k pbar0.k e0.k <- n.k*pbar0.k e0.k o0.k <- n.k-o1.k o0.k tab10.4 <- cbind(n.k, pbar1.k, e1.k, o1.k, pbar0.k, e0.k, o0.k) round(tab10.4, 3) ##chi square goodness-of-fit approach o.k <- c(o1.k, o0.k) e.k <- c(e1.k, e0.k) chi2 <- sum(((o.k-e.k)^2)/e.k) pchisq(chi2, df = 8, lower.tail = FALSE) ##Logistic regression using polynomial terms, p. 155-157 m3 <- glm(Hyper ~ poly(SPL,3) + poly(Age,3) + poly(Worked,3) + History + Salt, data = hdat, family = binomial) summary(m3) m4 <- glm(Hyper ~ poly(SPL,3) + poly(Age,3) + Worked + History + Salt, data = hdat, family = binomial) summary(m4) m5 <- glm(Hyper ~ poly(SPL,3) + Age + poly(Worked,3) + History + Salt, data = hdat, family = binomial) summary(m5) m6 <- glm(Hyper ~ SPL + poly(Age,3) + poly(Worked,3) + History + Salt, data = hdat, family = binomial) summary(m6) ##Analysis for Table 10.5 ##Models 4, 5, 6 are compared model 3 anova(m3, m4, test = "Chisq") anova(m3, m5, test = "Chisq") anova(m3, m6, test = "Chisq") ##Generalized additive model ##Install 'gam' package from CRAN ##'gam' package is from Dr. Trevor Hastie, Professor of Biostatistics, ##Standford University library(gam) m7 <- gam(Hyper ~ s(SPL) + s(Age) + s(Worked) + History + Salt, data = hdat, family = binomial) summary(m7) m <- 1.4 ##plot windows width & height multiplier windows(5*m, 3*m) par(mfrow=c(2,3)) plot(m7) par(mfrow=c(1,1))