##http://socrates.berkeley.edu/~biostat/Courses/Spring/Ph29632/Data/index.html ##Chapter2: p. 13 ##THIS PAGE HAS CORRECTIONS: The factors were incorrectly coded previously ##PART I: ODDS RATIO, ##lowbwt.2.data lbwdat <- read.table("http://www.medepi.net/selvin/lowbwt.2.data", header = FALSE, sep="") names(lbwdat) <- c("bwt", "smk", "eth", "count") attr(lbwdat, "dict") <- c("bwt = birth weight (1: <2500 g; 0: >=2500 g)", "smk = smoking exposure (1: Yes; 0: otherwise)", "eth = race/ethnicity (1: white; 2: African American; 3: Hispanic; 4: Asian)", "count = cell frequency") str(lbwdat) lbwdat ##re-create original data set on p. 14 bwt <- rep(lbwdat$bwt, lbwdat$count) smk <- rep(lbwdat$smk, lbwdat$count) eth <- rep(lbwdat$eth, lbwdat$count) bwt <- factor(bwt, levels = 0:1, labels = c(">=2500g", "<2500g")) smk <- factor(smk, levels = 0:1, labels = c("Nonsmokers", "Smokers")) eth <- factor(eth, levels = 1:4, labels = c("White", "African American", "Hispanic", "Asian")) lbw <- data.frame(bwt, smk, eth) ##2x2 table table(Smoking = lbw$smk, "Birth weight" = lbw$bwt) ##Table 2.1 p. 16 ##2x2x4 table: 3-dimensional array (best for manipulation) tab2.1 <- table(Smoking = lbw$smk, "Birth weight" = lbw$bwt, Ethnicity = lbw$eth) tab2.1 ##alternative view of arrays: flat contingency table (easier to view) tab2.1b <- ftable(lbw$eth, lbw$smk, lbw$bwt) tab2.1b ##calculate strata-specific odds ratio and CIs for Table 2.2 ai <- tab2.1[1,1,] bi <- tab2.1[1,2,] ci <- tab2.1[2,1,] di <- tab2.1[2,2,] ni <- ai + bi + ci + di ori <- (ai*di)/(bi*ci) conf.level <- 0.95 Z <- qnorm(0.5*(1 + conf.level)) log.ori <- log(ori) var.log.ori <- 1/ai + 1/bi + 1/ci + 1/di sd.log.ori <- sqrt(var.log.ori) lower <- exp(log.ori - Z*sd.log.ori) upper <- exp(log.ori + Z*sd.log.ori) ##Table 2.2 p. 16 tab2.2 <- round(cbind(ni, ori, lower, upper), 3) tab2.2 wi <- 1/var.log.ori ##weights are inverse of variance ##Table 2.3 p. 16 tab2.3 <- round(cbind(ni, log.ori, var.log.ori, wi), 3) tab2.3 ##First, do the odds ratios systematically differ among the ethnic groups? ##More specifically, we will look at the log-odds ratios ##Let's evaluate the amount of variability among the observed odds ratios ##relative to the mean log-odds ratio log.or.bar <- sum(wi * log.ori)/sum(wi) log.or.bar zi <- (log.ori - log.or.bar)/sd.log.ori zi ##[Method 1] Woolf's chi square test for homogeneity chi2.W <- sum(zi^2) ##with k-1 d.f. chi2.W sum(wi*(log.ori - log.or.bar)^2) 1 - pchisq(q = chi2.W, df = length(zi) - 1) ##equivalent and preferred pchisq(q = chi2.W, df = length(zi) - 1, lower.tail = FALSE) ##no evidence that odds ratios differ systematically among ethnic groups ##therefore, calculate common odds ratios ##Woolf's estimate or.W <- exp(log.or.bar) or.W ##now test whether or.W differs from 1 (or log.or.bar differs from 0) var.log.or.bar <- 1/sum(wi) z <- (log.or.bar - log(1))/sqrt(var.log.or.bar) z ##P(Z>=z) pnorm(z, lower.tail=FALSE) ##CI for common odds ratio conf.level <- 0.95 Z <- qnorm(0.5 *(1 + conf.level)) lower <- exp(log.or.bar - Z*sqrt(var.log.or.W)) lower upper <- exp(log.or.bar + Z*sqrt(var.log.or.W)) upper ##Mantel-Haenszel estimate or.MH <- sum(ai*di/ni)/sum(bi*ci/ni) or.MH ##MLE estimate ##comes from logistic model (below) ##A natural summary, p. 19 ##Logistic model approach, p. 22 ##[Method 2]: Test for homogeneity (interaction) ##original data m1 <- glm(bwt ~ smk + factor(eth), weight = count, data = lbwdat, family = binomial(link = logit)) summary(m1) m2 <- glm(bwt ~ smk+factor(eth)+smk*factor(eth), weight = count, data = lbwdat, family = binomial(link = logit)) summary(m2) ##list glm model values names(m1) ##Test for homogeneity (interaction) m12.resid.dev <- m1$deviance - m2$deviance m12.resid.dev m12.df.resid <- m1$df.residual - m2$df.residual m12.df.resid pchisq(q = m12.resid.dev, df = m12.df.resid, lower.tail = FALSE) ##short cut to compare nested models anova(m1, m2, test = "Chisq") ##expanded full data m1full <- glm(bwt~smk + eth, data = lbw, family = binomial(link = logit)) summary(m1full) m2full <- glm(bwt~smk + eth + smk*eth, data = lbw, family = binomial(link = logit)) summary(m2full) anova(m1full, m2full, test = "Chisq") ##[Method 3] Chi-square good-of-fit approach ##We will use m1 (no interaction model) ##goal is to replicate table 2.6 ##to get fitted log-odds m1$linear.predictors predict(m1) ##to get fitted probabilities 1/(1 + exp(-predict(m1))) m1$fitted.values fitted(m1) ##get marginal totals mtot <- tapply(lbwdat$count, list(lbwdat$smk, lbwdat$eth), sum) mtot mtot <- as.vector(mtot) mtot mtot <- rep(mtot, rep(2, length(mtot))) mtot cellprob <- abs(rep(c(1,0), nrow(lbwdat)/2) - fitted(m1)) cellprob ei <- cellprob*mtot tab2.6 <- cbind(lbwdat[,1:3], Total = mtot, pij = cellprob, oi = lbwdat[,4], ei = ei) tab2.6 ##Pearson chi-square goodness-of-fit statistic, p. 24 chi2 <- sum((tab2.6[,"oi"] - tab2.6[,"ei"])^2/tab2.6[,"ei"]) chi2 ##recall that d.f. = (r-1)(c-1)(d-1) for 3-way array pchisq(chi2, df = 3, lower.tail = FALSE) ##assess impact of smoking, p. 24 (bottom) summary(m1)$coef bhat <- summary(m1)$coef["smk", "Estimate"] bhat Sbhat <- summary(m1)$coef["smk", "Std. Error"] Sbhat z <- bhat/Sbhat z ##P(Z>=z | b = 0) pnorm(z, lower.tail = FALSE) ##MLE log.or.bar or.MLE <- exp(log.or.bar) or.MLE ##confounding: compare crude and summary odds ratios or.crude <- (sum(ai)*sum(di))/(sum(bi)*sum(ci)) cbind(or.crude, or.W, or.MH, or.MLE) ##PART II: RELATIVE RISK ##chd.2.data chddat <- read.table("http://www.medepi.net/selvin/chd.2.data", header = FALSE, sep="") names(chddat) <- c("chd", "beh", "wt", "count") attr(chddat, "dict") <- c("chd = coronary event: 1 = yes, 0 = no", "beh = A-type or B-type behavior: 1 = A, 0 = B", "wt = body weight category: 1: <150; 2: 150-160lb; 3: 160-170lb; 4: 170-180lb; 5: >180lb", "count = cell frequencies") str(chddat) chddat ##re-create original data set chd <- rep(chddat$chd, chddat$count) beh <- rep(chddat$"beh", chddat$count) wt <- rep(chddat$wt, chddat$count) chd <- factor(chd, levels = 0:1, labels = c("No CHD", "CHD")) beh <- factor(beh, levels = 0:1, labels = c("Type B", "Type A")) wt <- factor(wt, levels = 1:5, labels = c("<150lb", "150-160lb", "160-170lb", "170-180lb", ">180lb")) ##create table 2.7, p. 30 tab2.7 <- table(beh, chd, wt)[2:1,2:1,] tab2.7 ##alternative view: flat contingency table tab2.7b <- ftable(wt, beh, chd) tab2.7b ##create table 2.8, p. 31 ai <- tab2.7[1,1,] bi <- tab2.7[1,2,] ci <- tab2.7[2,1,] di <- tab2.7[2,2,] ni <- ai + bi + ci + di Pi <- ai/(ai+bi) ##Pr(disease | exposed) pi <- ci/(ci+di) ##Pr(disease | not exposed) rri <- Pi/pi ##stratum-specific risk ratio log.rri <- log(rri) var.log.rri <- ((1 - Pi)/ai) + ((1 - pi)/ci) sd.log.rri <- sqrt(var.log.rri) wi <- 1/var.log.rri tab2.8 <- cbind(ni, rri, log.rri, var.log.rri, wi) round(tab2.8, 3) ##stratum-specific CIs conf.level <- 0.95 Z <- qnorm(0.5*(1 + conf.level)) Z logrri.L <- log.rri - Z*sqrt(var.log.rri) logrri.U <- log.rri + Z*sqrt(var.log.rri) rri.L <- exp(logrri.L) rri.U <- exp(logrri.U) ##create table 2.9, p. 32 tab2.8 <- cbind(ni, log.rri, logrri.L, logrri.U, rri, rri.L, rri.U) round(tab2.8, 3) ##risk ratios seemed approximately equal, but different than 1 ##calculate weighted average rr, p. 33 log.rr.bar <- sum(wi * log.rri)/sum(wi) log.rr.bar zi <- (log.rri - log.rr.bar)/sd.log.rri zi ##[Method 1] Woolf's chi square test for homogeneity chi2.W <- sum(zi^2) ##with k-1 d.f. chi2.W sum(wi*(log.rri - log.rr.bar)^2) 1 - pchisq(q = chi2.W, df = length(zi) - 1) ##equivalent and preferred pchisq(q = chi2.W, df = length(zi) - 1, lower.tail = FALSE) ##Woolf's estimate rr.W <- exp(log.rr.bar) rr.W ##as an exercise do the MH common risk ratio ##now test whether rr.W differs from 1 (or log.rr.bar differs from 0) var.log.rr.bar <- 1/sum(wi) z <- (log.rr.bar - log(1))/sqrt(var.log.rr.bar) z ##P(Z>=z) pnorm(z, lower.tail=FALSE) ##CI for common risk ratio (Woolf estimate), p. 33 conf.level <- 0.95 Z <- qnorm(0.5*(1 + conf.level)) Z var.log.rr.bar <- 1/sum(wi) logrrbar.L <- log.rr.bar - Z*sqrt(var.log.rr.bar) logrrbar.L logrrbar.U <- log.rr.bar + Z*sqrt(var.log.rr.bar) logrrbar.U rr.W.L <- exp(logrrbar.L) rr.W.L rr.W.U <- exp(logrrbar.U) rr.W.U ##[method 2] Poisson model ##No interaction model m1 <- glm(chd ~ beh + factor(wt), weight = count, data = chddat, family = poisson) summary(m1) ##Interaction model m2 <- glm(chd ~ beh + factor(wt) + beh*factor(wt), weight = count, data = chddat, family = poisson) summary(m2) ##Test for homogeneity (interaction) m12.resid.dev <- m1$deviance - m2$deviance m12.resid.dev m12.df.resid <- m1$df.residual - m2$df.residual m12.df.resid pchisq(q = m12.resid.dev, df = m12.df.resid, lower.tail = FALSE) ##short cut to compare nested models anova(m1, m2, test = "Chisq") ##[method 3: pearson chi2 goodness of fit] ##get marginal totals mtot <- tapply(chddat$count, list(chddat$beh, chddat$wt), sum) mtot mtot <- as.vector(mtot) mtot mtot <- rep(mtot, rep(2, length(mtot))) mtot cellprob <- abs(rep(c(1,0), nrow(chddat)/2) - fitted(m1)) cellprob ei <- cellprob*mtot tab2.11 <- cbind(chddat[,1:3], Total = mtot, pij = cellprob, oi = chddat[,4], ei = ei) tab2.11 ##Pearson chi-square goodness-of-fit statistic, p. 37 chi2 <- sum((tab2.11[,"oi"] - tab2.11[,"ei"])^2/tab2.11[,"ei"]) chi2 ##recall that d.f. = (r-1)(c-1)(d-1) for 3-way array pchisq(chi2, df = 4, lower.tail = FALSE)