##http://socrates.berkeley.edu/~biostat/Courses/Spring/Ph29632/Data/index.html ##Chapter7: p. 93 ##Assignment 7: ##gene.7.data gdat0 <- read.table("http://www.medepi.net/selvin/gene.7.data", header = FALSE, sep="") ##change data to Table 7.1, p. 96 gdat <- as.matrix(gdat0)/100 ##gnames.7.data genenames <- scan("http://www.medepi.net/selvin/gnames.7.data", what = "") varnames <- c("A", "B", "AB", "O", "CDE", "CDe", "Cde", "cDE", "cdE", "cDe", "cde", "M", "N") rownames(gdat) <- genenames colnames(gdat) <- varnames gdat ##function to calculate Euclidean distance ##x and y are numeric vectors euclid <- function(x, y){ sqrt(sum((x - y)^2)) } ##Red blood cell groups Euclidean distances for all pair-wise comparisons rbcdist <- matrix(NA, 26, 26) rownames(rbcdist) <- genenames colnames(rbcdist) <- genenames for(i in 1:26){ for(j in 1:26){ rbcdist[i, j] <- euclid(gdat[i, ], gdat[j, ]) } } ##set diagonal zeros to missing (NA) diag(rbcdist) <- NA ##This could have been done using the 'dist' function ##'dist(gdat)' gives same results as above ##nearest-neighbor analysis nearest.distance <- apply(rbcdist, 1, min, na.rm = TRUE) fun <- function(x){which(x==min(x, na.rm = TRUE))} min.index <- apply(rbcdist, 1, fun) nearest.neighbor <- genenames[min.index] nn <- data.frame(nearest.neighbor, nearest.distance) nn <- nn[order(nn$nearest.distance),] nn ##Dendrograms ##ABO system plot(hclust(dist(gdat[,1:4]), "single")) ##RH system plot(hclust(dist(gdat[,5:11]), "single")) ##MN system plot(hclust(dist(gdat[,12:13]), "single")) ##All RBC systems combined plot(hclust(dist(gdat), "average")) ##PRINCIPAL COMPONENTS ##uses 'prcomp' function gene.pca <- prcomp(gdat) gene.pca ##Table 7.3 a.i <- gene.pca$rotation[,1] b.i <- gene.pca$rotation[,2] tab7.3 <- round(cbind(a.i, b.i), 2) tab7.3 ##Canonical variables, p. 102 Pi <- gdat%*%a.i Pi Qi <- gdat%*%b.i Qi ##Table 7.4, p. 105 tab7.4 <- round(cbind(Pi = Pi, Qi = Qi), 3) colnames(tab7.4) <- c("Pi", "Oi") tab7.4 ##Figure 7.4, p. 104 plot(Pi, Qi, pch = letters, xlim = c(0, 1.2), ylim = c(-0.8, 0.5), cex = 0.8, main = "Figure 7.4, p. 104") legend(0.95, 0.45, legend = genenames, pch = letters, cex = 0.7) ###OPTIONAL ###S-Plus code for Figures 7.5 & 7.6 (pp. 106-107) at: http://socrates.berkeley.edu/~biostat/Courses/Spring/Ph29632/Solve/contour.txt ##This code from Selvin's site ##'gene.pca' from above p <- gene.pca$x x <- p[,1] y <- p[,2] ##set up plot xmin <- min(x) ymin <- min(y) xmax <- max(x) ymax <- max(y) k <- length(x) n <- 50 #n = grid size x0 <- seq(xmin,xmax,length=n) y0 <- seq(ymin,ymax,length=n) h <- .04 #bandwidth ##initialize arrays zz <- array(0,c(n,n,k)) z <- matrix(0,n,n) ## smoother function zfcn <- function(u,v){(exp(-0.5*(((u-m1)/h)^2+((v-m2)/h)^2)))/(2*pi)} ##calculates heights for(i in 1:k){ m1 <- x[i] m2 <- y[i] z <- outer(x0,y0,"zfcn") zz[,,i] <- z } zsum <- apply(zz,c(1,2),sum) ##3-dimensional smoothed plot ##this adopted from 'persp' help file op <- par(bg = "white") persp(x0, y0, zsum, theta = -30, phi = 30, expand = 0.5, col = "lightblue", ltheta = 120, shade = 0.75, ticktype = "detailed", xlab = "Pi", ylab = "Qi", zlab = "Height") contour(x0, y0, zsum, xlab = "Pi", ylab = "Qi")