#POISSON REGRESSION USING R # Read in data set; data set has one record per county, sex, race, age, half-year strata, with number of cases and population in strata gc.reg.f <- read.table("http://www.medepi.net/msamuel/PoissonReg_GC/gcregion.csv",sep=",",na.strings="",header=T) #remove "multirace" strata -- 0 numerator and only > year = 2000 gc.reg.f <- gc.reg.f[gc.reg.f$RACE != "M",] attach(gc.reg.f) # This is the basic model structure for doing Poisson regression using R, with a population/total "offset" out0 <- glm(N ~ as.factor(SEX) + as.factor(YR)+ offset(log(HALFPOP)), family=poisson) # Use standard R syntax to extract the appropriate coefficients and/or other regression "output" out0$coef exp(out0$coef) summary(out0)$coef # Default plots associated with and object generated by the glm function plot(out0) # Plot of observed versus fitted Numbers of cases--basically not interpretable because of differing population sizes of strata plot(N,out0$fitted.values) # Generate and plot Log(observed rates) versus Log(fitted rates); much more meaningful... lrhat <- log(out0$fitted.values/HALFPOP) lrate <- log(N/HALFPOP) plot(lrate,lrhat) # "Goodness of fit" test pchisq(out0$deviance,out0$df.residual,lower.tail=FALSE) # Function to generate table of "Rate Ratios" w/ Confidence Intervals, and GOF tests, and calculate Log Rates, etc. mypreg <- function(out0) { out1 <<- summary(out0)$coef rr <<- exp(out1[,"Estimate"]) lci <<- exp(out1[,"Estimate"]-1.96*out1[,"Std. Error"]) uci <<- exp(out1[,"Estimate"]+1.96*out1[,"Std. Error"]) ptab <<- round(cbind(rr,lci,uci),3) xfit <<- sum(((N-out0$fitted)^2)/out0$fitted) pfit <<- pchisq(out0$deviance,out0$df.residual,lower.tail=FALSE) lrhat <<- log(out0$fitted.values/HALFPOP) lrate <<- log(N/HALFPOP) } # Now use Poisson regression on selected subsets of the data to fit better/more meaningful models # Create new data frame that exclude strata with zero cases or age < 15 # ONLY 2002 for these models d1 <- gc.reg.f[gc.reg.f$YR == 2002 & gc.reg.f$N > 0 & gc.reg.f$AGEGP != "0-14",] attach(d1) out0 <- glm(N ~ SEX + RACE + AGEGP + REGION + offset(log(HALFPOP)), family=poisson) mypreg(out0) ptab pfit plot(lrate,lrhat) # Male ONLY Model d1.male <- d1[d1$SEX=="M",] attach(d1.male) out0 <- glm(N ~ RACE + AGEGP + REGION + HALFD + offset(log(HALFPOP)), family=poisson) mypreg(out0) ptab plot(lrate,lrhat) # Use power of R's graphical routine to further understand these data / regression models levels(RACE) # "A" "B" "H" "I" "W" race.pch <- c(4,1,2,3,0) levels(AGEGP) # "0-14" "15-19" "20-24" "25-29" "30-34" "35-39" "40+" ; but 0-14 removed from this data frame age.col <- c("ivory3","black","red","green","purple3","cyan","magenta") plot(lrate,lrhat,las=1,xlim=c(-11,-2),ylim=c(-11,-2),xlab="Log(Observed Rate)",ylab="Log(Predicted Rate)",type="n") for (i in 1:length(levels(RACE))) for (j in 1:length(levels(AGEGP))) {points(lrate[RACE == levels(RACE)[i] & AGEGP == levels(AGEGP)[j] ], lrhat[RACE == levels(RACE)[i] & AGEGP == levels(AGEGP)[j] ], pch=race.pch[i],col=age.col[j])} legend(-11,-2,legend=levels(RACE),pch=race.pch,cex=.7) legend(-9.7,-2,legend=levels(AGEGP),text.col=age.col,cex=.7) # Female ONLY Model d1.female <- d1[d1$SEX=="F",] attach(d1.female) out0 <- glm(N ~ RACE + AGEGP + REGION + HALFD + offset(log(HALFPOP)), family=poisson) mypreg(out0) ptab plot(lrate,lrhat,las=1,xlim=c(-11,-2),ylim=c(-11,-2),xlab="Log(Observed Rate)",ylab="Log(Predicted Rate)",type="n") for (i in 1:length(levels(RACE))) for (j in 1:length(levels(AGEGP))) {points(lrate[RACE == levels(RACE)[i] & AGEGP == levels(AGEGP)[j] ], lrhat[RACE == levels(RACE)[i] & AGEGP == levels(AGEGP)[j] ], pch=race.pch[i],col=age.col[j])} legend(-11,-2,legend=levels(RACE),pch=race.pch,cex=.7) legend(-9.7,-2,legend=levels(AGEGP),text.col=age.col,cex=.7)