### Sitka (79 trees with 5 time points)
# random intercept models = compound symmetry
sitka.lme <- lme(size~treat*ordered(Time), random=~1|tree, data=Sitka)
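A random intercept implies a constant within-tree correlation, sigma_b^2/(sigma_b^2+sigma^2); a minimal sketch of extracting it from the fit (standard nlme accessors):
s2b <- as.numeric(getVarCov(sitka.lme)) # between-tree variance
s2 <- sitka.lme$sigma^2 # residual variance
s2b/(s2b+s2) # implied within-tree correlation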
Mixed Models - random coefficient models
library(nlme)
coplot(Y ~ EP|No, data=petrol)
Petrol <- petrol
Petrol[,2:5] <- scale(Petrol[,2:5],scale=F)
# Y~EP for each No; no overall mean
pet1.lm <- lm(Y~No/EP-1, data=Petrol)
# Y~ mu_i + beta*EP; no overall mean
pet2.lm <- lm(Y~No-1+EP, data=Petrol)
pet3.lm <- lm(Y~SG+VP+V10+EP, data=Petrol)
# Random intercept model
pet3.lme <- lme(Y~SG+VP+V10+EP, random=~1|No, data=Petrol)
# To compare likelihood, use "ML"
pet3.lme <- update(pet3.lme, method="ML")
anova(pet3.lme, pet3.lm)
pet4.lme <- update(pet3.lme, fixed=Y~V10+EP)
# pet4.lme <- lme(Y~V10+EP, random=~1|No, method="ML", data=Petrol)
anova(pet3.lme, pet4.lme)
## Random intercept + Random coefficient
pet5.lme <- lme(Y~V10+EP, random=~1+EP|No, method="ML", data=Petrol)
anova(pet4.lme, pet5.lme)
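A quick look at what the selected model estimated, via the standard nlme extractors:
summary(pet5.lme)
VarCorr(pet5.lme) # variance components for intercept, EP slope and residual
ranef(pet5.lme) # predicted random intercept and slope for each No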
Mixed Models - RCBD
### Randomized Complete Block Design
library(nlme)
rcb <- read.csv("rcb.csv")
rcb$ingot <- as.factor(rcb$ingot)
rcb$metal <- as.factor(rcb$metal)
### lme
rcbFit <- lme(pres~ metal, random=~1|ingot, data=rcb)
### manually
t<-3; b<-7 # t treatments (metal), b blocks (ingot)
y <- rcb$pres
mean_b<-tapply(rcb[,3],rcb[,1],mean) # block means
mean_t<-tapply(rcb[,3],rcb[,2],mean)[t:1] # treatment means, reordered to match the data
msa<-b*sum((mean_t-mean(mean_t))^2)/(t-1) # MS(treatment)
msb<-t*sum((mean_b-mean(mean_b))^2)/(b-1) # MS(block)
mse<-sum((y-rep(mean_t,b)-rep(mean_b,rep(t,b))+mean(y))^2)/(t-1)/(b-1)
F<-msa/mse
p_value <- 1-pf(F,(t-1),(t-1)*(b-1))
sb<-(msb-mse)/t # sigma_b^2, the block variance component
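The manual results can be cross-checked against the lme fit:
anova(rcbFit) # F for metal should match F = msa/mse above
VarCorr(rcbFit) # ingot variance component should match sb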
/* data & SAS code */
data rcb;
input ingot metal $ pres @@;
datalines;
1 n 67.0 1 i 71.9 1 c 72.2
2 n 67.5 2 i 68.8 2 c 66.4
3 n 76.0 3 i 82.6 3 c 74.5
4 n 72.7 4 i 78.1 4 c 67.3
5 n 73.1 5 i 74.2 5 c 73.2
6 n 65.8 6 i 70.8 6 c 68.7
7 n 75.6 7 i 84.9 7 c 69.0
run;
proc mixed data=rcb;
class ingot metal;
model pres=metal;
random ingot;
run;
Structural Equation Model
### Principal Component Analysis
## with prcomp()
prcomp(USArrests, scale=T)
prcomp(~Murder+Assault+Rape, data=USArrests, scale=T)
plot(prcomp(~Murder+Assault+Rape, data=USArrests, scale=T))
summary(prcomp(~Murder+Assault+Rape, data=USArrests, scale=T))
biplot(prcomp(~Murder+Assault+Rape, data=USArrests, scale=T))
## with princomp()
princomp(USArrests, cor=T)
princomp(~Murder+Assault+Rape, data=USArrests, cor=T)
summary(princomp(~Murder+Assault+Rape, data=USArrests, cor=T))
princomp(~Murder+Assault+Rape, data=USArrests, cor=T)$scores
plot(princomp(~Murder+Assault+Rape, data=USArrests, cor=T))
biplot(princomp(~Murder+Assault+Rape, data=USArrests, cor=T))
# missing handling
USArrests[1,2]<-NA
princomp(~Murder+Assault+Rape, data=USArrests, na.action=na.exclude, cor=T)
### Factor Analysis with factanal()
factanal(~Murder+Assault+UrbanPop+Rape, factors=1, data=USArrests)
### FactoMineR
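A minimal FactoMineR sketch, assuming the package is installed (data(USArrests) first restores the copy given an NA above):
library(FactoMineR)
data(USArrests)
res <- PCA(USArrests, scale.unit=TRUE, graph=FALSE)
res$eig # eigenvalues and explained variance
plot(res, choix="var") # correlation-circle plot of the variables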
Collapse table and calculate p-value by Monte Carlo Method
# collapse rows and columns whose cell counts are < 4
rcbind <- function(tt)
{
# rearrange table
cc <- names(sort(tt[names(sort(rowSums(tt==max(tt)), decreasing=T))[1],], decreasing=T))
rr <- names(sort(tt[,names(sort(colSums(tt==max(tt)), decreasing=T))[1]], decreasing=T))
tt <- tt[rr,cc]
# p-value for Consensus Test (binomial test)
m1=max(tt)
m2=max(tt[1,2:dim(tt)[2]],tt[2:dim(tt)[1],1])
p <- binom.test(m1,m1+m2)$p.value # 2-sided
if(p>=0.05){
print("There is no concensus.")
return(list(0,tt))
} else
if(sum(tt[2:dim(tt)[1],2:dim(tt)[2]]>=4)==0){
print("There is no double mutation cell >=4.")
return(list(0,tt))
} else {
# inner table
inner <- tt[2:dim(tt)[1],2:dim(tt)[2]]
rrr <-rowSums(inner>=4)
ccc <-colSums(inner>=4)
# colnames and rownames whose cell counts are larger than 4
rnames <- c(rownames(tt)[1],names(rrr[rrr>=1]))
cnames <- c(colnames(tt)[1],names(ccc[ccc>=1]))
# colnames and rownames which is collapsed
mr <- rownames(tt)[-charmatch(rnames,rownames(tt))]
mc <- colnames(tt)[-charmatch(cnames,colnames(tt))]
# collapsing rows and columns whose cell counts are smaller than 4
return(list(1,rbind(cbind(tt[rnames,cnames],rowSums(tt[rnames,mc]))
,c(colSums(tt[mr,cnames]),sum(tt[mr,mc])))))
}
}
test_stat <- function(x){
# Pearson's Chi-square test
expect <- (matrix(rowSums(x))%*%t(matrix(colSums(x))))/sum(x)
x2 <- sum((x-expect)^2/expect)
# LRT
temp <- x*log(x/expect)
g2 <- 2*sum(temp[x!=0])
# Maximal Odds Ratio & Maximal MH
or <- rep(0,(dim(x)[1]-1)*(dim(x)[2]-1))
dim(or) <- c(dim(x)[1]-1,dim(x)[2]-1)
diff <- or
for(i in 2:dim(x)[1])
for(j in 2:dim(x)[2]){
or[i-1,j-1] <- abs(log(x[i,j]*x[1,1]/x[i,1]/x[1,j])
/sqrt(1/x[1,1]+1/x[1,j]+1/x[i,1]+1/x[i,j]))
diff[i-1,j-1] <- abs(x[i,j]*x[1,1]-x[i,1]*x[1,j])/
sqrt((x[1,1]+x[i,1])*(x[1,1]+x[1,j])*(x[1,j]+x[i,j])
*(x[i,1]+x[i,j])/(x[i,j]+x[1,1]+x[i,1]+x[1,j]-1))
}
# df <- (dim(x)[1]-1)*(dim(x)[2]-1)
# 1-pchisq(c(x2,g2),df)
c(x2,g2,max(or),max(diff))
}
exact <- function(x){
ts_o <- test_stat(x)
reference <- r2dtable(iter, rowSums(x), colSums(x))
ts_e <- sapply(reference, test_stat)
p_value <- c(fisher.test(x)$p.value,apply(ts_e>ts_o,1,mean,na.rm=T))
names(p_value) <- c("Exact","X2","LRT","Maximal OR","Maximal MH")
return(p_value)
}
nsi<-read.csv("g:/programming/R/nsi.csv")
iter <- 10000
position1 <- nsi$x18
position2 <- nsi$x19
ss <- rcbind(table(position1,position2))
if(ss[[1]]) p_value <- exact(ss[[2]])
ss[[2]]
p_value
aa <- r2dtable(1,c(90,50,40),c(90,45,45))
exact(aa[[1]])
Mortgage
mortgage <- function(P, r, year, py){
m <- year*12 # number of monthly payments
mr <- r/12 # monthly interest rate
E <- P*mr/((1+mr)^m -1) # principal portion of the first payment
pmt <- P*mr+E # fixed monthly payment
Ei <- E*(1+mr)^((1:m)-1) # principal portion in month i (grows geometrically)
Ii <- pmt-Ei # interest portion in month i
cat(year, "year at", r*100, "%\n")
cat("Monthly Payment:", pmt, "\n")
cat("Total Payment:", pmt*m, ", Total Interest:", pmt*m-P, "\n")
cat("Total Principal:", sum(Ei[1:(12*py)]),
", Total Interest:", sum(Ii[1:(12*py)]),"(After", py, "years)\n")
}
P <- 210000 # principal
r <- 0.05625 # interest rate
year <- 15 # loan period
py <- 5 # report running totals after this many years
mortgage(P,r,year,py)
apply, sapply, tapply
### apply
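apply() sweeps a function over the rows (MARGIN=1) or columns (MARGIN=2) of a matrix, e.g.:
m <- matrix(1:6, nrow=2)
apply(m, 1, sum) # row sums
apply(m, 2, mean) # column means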
### sapply
> sapply(ChickWeight[,c(1,2)], mean, na.rm=T)
> sapply(ChickWeight[,c(1,2)], summary)
### tapply
tapply(ChickWeight$weight, ChickWeight$Diet, mean)
tapply(ChickWeight$weight, ChickWeight[,c(2,4)], mean)
coplot(weight ~ Time | Chick, data = ChickWeight, type = "b", show = FALSE)
coplot(weight ~ Time | Diet, data = ChickWeight, type = "b", show = FALSE)
Generalized Linear Mixed Models
### Linear Mixed Models
library(lme4)
ChickWeight <- data.frame(ChickWeight)
class(ChickWeight)
gsummary(ChickWeight, groups=ChickWeight$Chick) # gsummary() comes from nlme
fits <- lmList(weight~Time |Chick, data=ChickWeight) # avoid masking base list()
pooledSD(fits) # exact pooled standard deviation
> lm1<-lmer(weight~Time+(Time|Chick), data=ChickWeight)
> lm2<-lmer(weight~Time+Diet+(Time|Chick), data=ChickWeight)
> anova(lm1,lm2)
### Generalized Linear Mixed Models
The likelihood can be approximated in three ways:
PQL (penalized quasi-likelihood) - fastest but least accurate
Laplace method - a compromise between speed and accuracy
AGQ (adaptive Gauss quadrature) - most accurate but slow
library(lme4)
# older lme4: lmer(..., method="PQL"), "Laplace" or "AGQ"; see the glmer() sketch below
GLMMGibbs: Gibbs sampling
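In current lme4 the integral accuracy is set via nAGQ in glmer() (nAGQ=1 is the Laplace approximation, larger values give AGQ); a sketch with the cbpp data shipped in lme4:
library(lme4)
gm1 <- glmer(cbind(incidence, size-incidence) ~ period + (1|herd),
family=binomial, data=cbpp, nAGQ=9)
summary(gm1)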
Survival Data Analysis
library(survival)
library(ISwR)
data(melanom)
str(melanom)
### The Kaplan-Meier estimates
# Surv() creates a survival object
mfit <- survfit(Surv(days, status==1) ~ 1, data=melanom)
summary(mfit)
plot(mfit)
## By sex
mfit.bysex <- survfit(Surv(days, status==1)~sex, data=melanom)
plot(mfit.bysex, col=c("black", "grey"))
legend("bottomleft", c("Female", "Male"), col=c("black", "grey"), lty=1)
# log-transformation
plot(mfit.bysex, fun="log", col=c("black", "grey"))
# cloglog-transformation
plot(mfit.bysex, fun="cloglog", col=c("black", "grey"))
### The log-rank test
survdiff(Surv(days, status==1)~sex, data=melanom)
### Cox Regression Model
data(ovarian)
ovcph <- coxph(Surv(futime, fustat) ~ age + factor(resid.ds) + factor(rx), x=T, data=ovarian)
summary(ovcph)
# Kaplan-Meier
ovarian.surv <- survfit(Surv(futime, fustat) ~ rx, data=ovarian)
plot(ovarian.surv, lty=2:3)
legend(600, .3, c("rx=1", "rx=2"), lty=2:3)
# Examine outliers and check proportional hazards assumption
temp <- cox.zph(ovcph)
print(temp)
plot(temp[1]) # Schoenfeld plot for age
split.screen(c(2,2))
plot(temp[3])
screen(2)
plot(temp[2])
screen(3)
plot(temp[1])
screen(4)
plot(ovarian$futime, ovcph$residuals, type="n", xlab="Time", ylab="Mart Res")
text(ovarian$futime, ovcph$residuals, labels=ovarian$rx, cex=.6)
Nonparametric Regression
library(car) # Prestige data
attach(Prestige)
### Simple regression
plot(income, prestige)
lines(lowess(income, prestige,f=0.7,iter=0), lwd=2) # f: the smoother span
### Multiple regression
mod.lo <- loess(prestige~income+education, span=.7, degree=1)
summary(mod.lo)
inc <- seq(min(income), max(income), len=25)
ed <- seq(min(education), max(education), len=25)
# 25*25 combinations of inc and ed
newdata <- expand.grid(income=inc, education=ed)
fit.prestige <- matrix(predict(mod.lo, newdata), 25, 25)
persp(inc, ed, fit.prestige, theta=45, phi=30, ticktype='detailed',
xlab='Income', ylab='Education', zlab='Prestige', expand=2/3, shade=0.5)
### Statistical significance of predictors
mod.lo.inc <- loess(prestige~income, span = 0.7, degree=1)
mod.lo.ed <- loess(prestige~education, span=0.7, degree=1)
anova(mod.lo.inc, mod.lo)
anova(mod.lo.ed, mod.lo)
### Smoothing Splines
inc.100 <- seq(min(income), max(income), len=100)
mod.inc <- loess(prestige~income, span=.7, degree=1) # fit local-regression
pres <- predict(mod.inc, data.frame(income=inc.100)) # fitted values
plot(income, prestige)
lines(inc.100, pres, lty=2, lwd=2) # loess curve
lines(smooth.spline(income, prestige, df=3.85), lwd=2) # smoothing spline
### Additive Nonparametric Regression
library(mgcv)
mod.gam <- gam(prestige~s(income)+s(education))
mod.gam
fit.prestige <- matrix(predict(mod.gam, newdata), 25, 25)
persp(inc, ed, fit.prestige, theta=45, phi=30, ticktype='detailed',
xlab='Income', ylab='Education', zlab='Prestige', expand=2/3, shade=0.5)
plot(mod.gam)
# interaction term (in current mgcv, ti() is preferred: s(income)+s(education)+ti(income,education))
mod.gam <- gam(prestige~s(income)+s(education)+s(income, education))
# semi-parametric
mod.gam <- gam(prestige~s(income)+education)
# Generalized Nonparametric Regression
mod.gam <- gam(prestige~s(income)+education, family=poisson)
sample()
sample(n) // select a random permutation from 1,...,n
sample(x) // randomly permute x
sample(x, replace=T) // a bootstrap sample
sample(x, n) // sample n items from x w/o replacement
sample(x, n, replace=T) // sample n items from x w/ replacement
sample(x, n, replace=T, prob=p)
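A small reproducible demo of the calls above:
set.seed(1)
x <- c(10, 20, 30, 40, 50)
sample(5) // a permutation of 1,...,5
sample(x, 3) // 3 items w/o replacement
sample(x, 8, replace=T) // 8 items w/ replacement
sample(x, 3, prob=c(.4,.3,.2,.05,.05)) // weighted sampling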
Read and Manipulate Data
### Data
logical << numeric << complex << character
> complex(real=1, imaginary=pi)
> mode(x)
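c() coerces mixed elements up this hierarchy, e.g.
> mode(c(TRUE, 2L, 3.5)) // "numeric": the logical is promoted
> mode(c(1, "a")) // "character": the number is promoted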
### Reading Data
> dna <- read.table("d:/data/dna.txt", header=T)
> dna <- read.table(" ", col.names=NULL)
> nsi <- read.table("nsi.dat")
> dimnames(nsi) <- list(NULL, c('x','y'))
> dimnames(nsi) <- list(NULL, paste("x", 1:35, sep=""))
> dimnames(nsi)[[1]] // rownames(nsi)
> dimnames(nsi)[[2]] // colnames(nsi)
# Read tab delimiters
read.table("tab.dat", header=T, sep="t")
read.delim("tab.dat", header=T)
# Read SAS dataset (foreign package)
sashome <- "/Program Files/SAS/SAS 9.1" # The location where sas.exe is
example <- read.ssd(file.path("c:/cahn"), "example", sascmd = file.path(sashome, "sas.exe"))
### paste()
> paste('x',1:5,sep='')
> paste("A",1:5,sep=":") [1] "A:1" "A:2" "A:3" "A:4" "A:5"
> paste(1:5,"A",sep="/") [1] "1/A" "2/A" "3/A" "4/A" "5/A"
### is.*
vector, matrix, array, list, factor, function,
category, integer, numeric, character, logical, complex,
single, double, na, name, null, ts
# missing
> y<-c(1,NA,3,0,NA)
> y[is.na(y)] <- 0
### as
as.numeric, as.character, as.factor
### Sort data.frame
# sort a data.frame "design" with the order of design$ID
design <- design[order(design$ID),]
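order() accepts several keys, so ties can be broken by a second column (the name "visit" here is only illustrative):
design <- design[order(design$ID, design$visit),]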
Vector
# initialization
> vector('logical',3) // same as logical(3)
> vector('numeric',4) // same as numeric(4)
> vector('complex',5) // same as complex(5)
> vector('character',5) // same as character(5)
# vector
length( ), mode( ), names( )
### examples for vector
> X <- 2*(1:20)
> X[X<=10] [1] 2 4 6 8 10
> X[-(1:17)] [1] 36 38 40
> X[-c(4,7)]
> Y[c(1,5,20)] <- 1 // assign the value 1 to elements 1, 5, 20
> y[y<0] <- -y[y<0] // same as y <- abs(y)
> y <- c(6,7,8)
> z <- c(y,x,y)
> u <- scan() // input by keyboard
> rep(c(4,2),times=2)
> rep(c(4,2),times=c(3,2)) [1] 4 4 4 2 2
> rep(c(1,5),length=5) [1] 1 5 1 5 1
> seq(1,11,by=3) [1] 1 4 7 10
> seq(1,11,length=4) [1] 1.000000 4.333333 7.666667 11.000000
> rev(seq(1,9,by=2)) [1] 9 7 5 3 1 // reverse
> unique(x) // return the value which is unique
> sort(x) // sorts data in ascending order
> rev(sort(x)) // descending order
> rank(x) // returns the ranks of the input, average
> order(x) // x[order(x)] is the same as sort(x)
> rle(x) // length and value of runs of the same value
> height <- c(170,180,163); names(height) <- c('a','b','c')
> height <- append(height, 200, after=2)
> height <- replace(height, 2, 192) // same as height[2]<-192
> height <- replace(height, c(1,3), c(166,159))
> x <- c(-1.90691,0.76018,-0.26556,0.08571,NA)
> ceiling(x) [1] -1 1 0 1 NA
> floor(x) [1] -2 0 -1 0 NA
> trunc(x) [1] -1 0 0 0 NA
> round(x,1) [1] -1.9 0.8 -0.3 0.1 NA
> signif(x,2) [1] -1.900 0.760 -0.270 0.086 NA
> print(x,digits=1) [1] -1.91 0.76 -0.27 0.09 NA
> print(x,digits=2) [1] -1.907 0.760 -0.266 0.086 NA
# functions
> sum(x)
> prod(x) // product of all the elements
> cumsum(x) // cumulative sum
> cumprod(x) // cumulative product
> diff(x)
> diff(x, lag=2)
# factors
gender <- c(rep("female", 691), rep("male", 692))
gender <- factor(gender)
gender <- relevel(gender, ref="male")
gender <- factor(gender, levels=c("male", "female"))
Linear Models
### Model Fitting
> lm.out <- lm(y~x, data= )
> summary(lm.out)
> names(lm.out)
> coef(lm.out)
> resid(lm.out)
> predict(lm.out)
> model.matrix(lm.out)
> y ~ x + fac + fac:x
> y ~ x | fac1 + fac2
> y ~ x + I(x^2) # polynomial term
> anova(lm(y~x), lm(y~x+I(x^2))) # anova table testing x^2
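A worked version with the built-in cars data, testing the quadratic term:
> anova(lm(dist~speed, data=cars), lm(dist~speed+I(speed^2), data=cars))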
### Add a trend line
> plot(dist~speed, data=cars)
> abline(lm(dist~speed,data=cars))
Matrix, Array and List
### define a matrix
> A = matrix(c(1,2,3,4,5,6,7,8,9),ncol=3)
> A
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
### operations for matrix and array
diag(10) // identity matrix
t(A), A%*%B
crossprod(A,B) // t(A)%*%B
outer(A,B) // outer product
kronecker(A,B) // Kronecker product
solve(A) // inverse matrix of A
solve(A,b) // solves Ax=b, i.e. solve(A)%*%b
svd(A) // singular value decomposition
qr(A) // QR decomposition
qr(A)$rank // rank of A
eigen(A) // eigenvalues and eigenvectors
chol(A) // Choleski decomposition
### matrix(data=NA, nrow=, ncol=, byrow=F)
> mm<-matrix( scan("mfile"), ncol=5, byrow=T) // read all rows from the file
> M[1,] // row1
> M[,2] // column2
> M[1,2] // element row1, column2
> M[,c(1,3,5)] // column 1, 3, 5
> M[,-c(1,5)] // remove column 1, 5
> M[M[,3]>7,1]
> X1 <- matrix(1:10,ncol=2)
> Y1 <- matrix(1:20,ncol=4)
> Z1 <- cbind(X1,Y1) // column bind
> rbind(X1,Y1[,1:2]) // row bind
> colnames(marks)
> marks["test3"]
> aa <- array(c(1:6,7:12,13:18),dim=c(2,3,3))
> aa <- 1:18 ; dim(aa)<-c(2,3,3)
> col(aa)
> row(aa)
# matrix, array
length( ), ncol( ), nrow( ), mode( ), dim( ), dimnames( )
### list()
> alist<-list(c(0,1,2),1:10)
> alist[[1]]
> alist[[2]]
> blist <- list(c("A","B","C"),1:5,"Name")
> clist <- list(x=matrix(1:10,ncol=2),y=c("A","B"),z=blist)
> clist$x
> clist$x[1:3,1:2]
> clist$z[[2]]
### list -> array
array(unlist(aa), dim=c(3,3,10))
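This assumes aa is a list of equally sized matrices, e.g. ten 3x3 matrices:
> aa <- lapply(1:10, function(i) matrix(rnorm(9), 3, 3))
> dim(array(unlist(aa), dim=c(3,3,10))) // 3 3 10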
Categorical Data Analysis
### Binomial Test
## Approximate
prop.test(x, n, p=NULL, alternative=c("two.sided", "less", "greater"), conf.level=0.95, correct=TRUE)
# chi-squared with continuity correction 1/(2n)
((0.7-0.5-1/20)/(sqrt(0.5*0.5/10)))^2
> prop.test(7, 10)
# without continuity correction
> prop.test(7, 10, correct=F)
## Exact
binom.test(x, n, p = 0.5, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
### Comparing proportions in 2*2 Tables
prop.test(x) # x is a 2*2 table
### Two-Way Contingency Tables
respire <- matrix(c(16,48,40,20), byrow=T, nrow=2)
dimnames(respire) <- list(treat=c("placebo","test"), outcome=c("f","u"))
# chi-squared test
chisq.test(respire)
chisq.test(respire, correct=F)
# Monte Carlo method
chisq.test(respire, simulate.p.value=T, B=10000)
# Fisher's exact method
fisher.test(respire)
# For non-aggregate data
chisq.test(table(cat1, cat2))
# Mantel-Haenszel Chi-square Test (z=stratum)
mantelhaen.test(x, y = NULL, z = NULL, alternative = c("two.sided", "less", "greater"), correct = TRUE, exact = FALSE, conf.level = 0.95)
# McNemar's Chi-square test
mcnemar.test(x, y=NULL, correct=TRUE)
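A concrete Mantel-Haenszel call on the built-in UCBAdmissions table (admission by gender in 2x2 tables, stratified by department):
mantelhaen.test(UCBAdmissions)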
Logistic Regression
### Logistic regression
> glm(y ~ x1 + x2, family = binomial, data= )
### Cumulative logistic regression
> polr(factor(y) ~ c + t, weights=, Hess=T, data= ) // polr() is in MASS; Hess=F is the default, but summary() sometimes fails without the Hessian
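A runnable polr() sketch with the housing data from MASS:
> library(MASS)
> house.plr <- polr(Sat ~ Infl + Type + Cont, weights=Freq, data=housing, Hess=TRUE)
> summary(house.plr)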
Nonlinear Models
### Least Square Estimation
x <- c(0.02,0.02,0.06,0.06,0.11,0.11,0.22,0.22,0.56,0.56,1.1,1.1)
y <- c(76,47,97,107,123,139,159,152,191,201,207,200)
# mu(x) = p[1]*x / (p[2]+x)
fn <- function(p) sum((y-(p[1]*x)/(p[2]+x))^2)
out <- nlm(fn, p=c(200, 0.1), hessian=T)
xfit <-seq(.02, 1.1, .05)
yfit <- out$estimate[1]*xfit/(out$estimate[2]+xfit)
plot(x,y)
lines(spline(xfit,yfit))
### Maximum Likelihood Estimation
# negative log-likelihood for N(mean=p[1], variance=p[2]), up to a constant
fn <- function(p) length(x)/2*log(p[2])+sum((x-p[1])^2)/(2*p[2])
# its gradient
gr <- function(p) c(-sum(x-p[1])/p[2], 0.5*length(x)/p[2]-0.5*sum((x-p[1])^2)/p[2]^2)
# 100 random samples from N(20, 9)
x <- rnorm(100, 20, 3)
# MLEs
mean(x); var(x)*(length(x)-1)/length(x)
nlm(p=c(20,9), fn, gr)$estimate
optim(c(20,9), fn, gr)$par
# nls() (x was overwritten above, so restore the Michaelis-Menten data first)
x <- c(0.02,0.02,0.06,0.06,0.11,0.11,0.22,0.22,0.56,0.56,1.1,1.1)
y <- c(76,47,97,107,123,139,159,152,191,201,207,200)
out<-nls(y ~ alpha*x/(beta+x), start=list(alpha=200, beta=0.1))
plot(y~x)
lines(x, predict(out), col='blue')
Comparing endpoints
x <- rnorm(15,1,1)
y <- rnorm(15,1.2,1.5)
summary(x); summary(y)
fivenum(x); fivenum(y)
boxplot(x, y)
var.test(x, y)
# Test for Zero Correlation
cor.test(x, y, alternative = c("two.sided", "less", "greater"),
method = c("pearson", "kendall", "spearman"),
exact = NULL, conf.level = 0.95, ...)
# Bartlett Test for Homogeneity of Variances
bartlett.test(x, g)
# F Test to Compare Two Variances
var.test(x, y, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95, ...)
# t-test
t.test(x, y = NULL, alternative = c("two.sided", "less", "greater"), mu = 0,
paired = FALSE, var.equal = FALSE, conf.level = 0.95, ...)
ex1) t.test(x, y)
ex2) t.test(x, y, var.equal=T)
ex3) t.test(x, y, paired=T) # paired t-test
ex4) t.test(x, y, alternative="less") # one-sided
ex5) t.test(x, mu=1) # H0: E(x) = 1
# Wilcoxon test
wilcox.test(x, y = NULL, alternative = c("two.sided", "less", "greater"), mu = 0, paired = FALSE, exact = NULL, correct = TRUE, conf.int = FALSE, conf.level =0.95, ...)
ex1) wilcox.test(x, y) # Mann-Whitney test
ex2) wilcox.test(x, y, paired = T) # Wilcoxon Signed Rank Sum test
# two-sample Kolmogorov-Smirnov test
ks.test(x, y)
# Friedman rank sum test with unreplicated blocked data
friedman.test(y, groups, blocks)
# Kruskal-Wallis rank sum test (one-way anova)
kruskal.test(x, g)
# Mood's two-sample test for a difference in scale parameters
mood.test(x, y, alternative = c("two.sided", "less", "greater"), ...)
# Ansari-Bradley two-sample test for a difference in scale parameters
ansari.test(x, y, alternative = c("two.sided", "less", "greater"),
exact = NULL, conf.int = FALSE, conf.level = 0.95, ...)
### Normality Test
#Shapiro-Wilk test of normality
shapiro.test(x)
Graphics
> dev.list(), dev.cur(), dev.set(2)
> dev.copy(3), dev.prev(), dev.next()
> dev.off(), dev.off(3), graphics.off()
# Pairs plots
pairs(trees, pch=15) # pch: plotting character
# histogram with density
hist(ChickWeight$weight, probability=T)
lines(density(ChickWeight$weight))
# histogram & densityplot (lattice)
library(lattice)
attach(ChickWeight)
histogram(~weight | Diet)
densityplot(~weight | Diet)
# barplot
barplot(tapply(weight, Diet, mean, na.rm=T))
tt<-t(tapply(warpbreaks$breaks,warpbreaks[,2:3],mean))
barplot(tt,beside=T,legend=c("L", "M", "H"))
barplot(VADeaths)
ymean=c(1.25,2.65,3.45)
ysd=c(0.35,0.65,0.50)
xpos=barplot(ymean,ylim=c(0,max(ymean)+max(ysd)),col='yellow')
arrows(xpos,ymean-ysd,xpos,ymean+ysd)
# pie
pie(tapply(weight, Diet, mean, na.rm=T))
# bwplot
bwplot(weight~factor(Time), ChickWeight)
bwplot(weight~factor(Diet), ChickWeight)
bwplot(weight~factor(Time)|Diet, ChickWeight)
# X-Y plot
attach(Puromycin)
plot(rate~conc)
lines(lowess(conc,rate),col="blue")
# Extensions
plot(conc,rate,pch=as.numeric(state))
legend(3300,35,legend=c("treated","untreated"),pch=1:2)
# Coplots
coplot(rate~conc|state)
# Density plot
plot(density(rate))
qqnorm(rate)
qqline(rate)
hist(x)
hist(x, breaks=14) // # of classes is 14
hist(x, breaks=seq(60, 140, by=8), col=2) // color =2
hist(x, breaks=c(70,80,90,100,140), xlab="X label", ylab="Y label", main="Title", ylim=c(0,0.05)) // ylim
stem(x)
stem(x, scale=2) // double # of stems
qqplot(x, y)
qqnorm(x); qqline(x)
plot( qt(ppoints(x),9), sort(x) ) // t(9) Q-Q plot built from ppoints(x)
# boxplot
male <- c(6,0,2,1,2,4.5,8,3,17,4.5,4,5)
female <- c(5,13,3,2,6,14,3,1,1.5,1.5,3,8,4)
boxplot(list(male=male,female=female))
hours <- c(male,female)
sex <- c(rep("male",12), rep("female",13))
x <- data.frame(hours=hours, sex=sex)
attach(x) // add the names of the variables to the search path
boxplot(split(hours, sex))
# 2 by 2 pictures
par(mfrow=c(2,2), pch=16)
# back to 1
par(mfrow=c(1,1), pch=1)
### plots
x <- (1:100)/10
plot(x, x^3-13*x^2+39*x)
plot(x, x^3-13*x^2+39*x, type='l') # line
plot(x, x^3-13*x^2+39*x, type='l', xlab='Time', ylab='Intensity') # label
# mathematical symbols
plot(x, x^3-13*x^2+39*x, type='l', xlab='Time', ylab=expression(Intensity==x^3-13*x^2+39*x))
plot(x, pi*x^2, type='l', xlab="Radius", ylab=expression(Area==pi*r^2))
# add points, arrows, text and lines to existing plots
points(2, 34, col='red', pch=16, cex=2)
arrows(4, 50, 2.2, 34.5)
text(4.15, 50, 'local max', adj=0, col='blue', cex=1.5)
lines(x, 30-50*sin(x/2), col='blue')
# add legend
legend(0, 80, c('poly', 'sine'), col=c('black', 'blue'), lwd=2)
# change options before plotting
windows(width=8, height=5, pointsize=14)
par(mai=c(1,1,0.1,0.1), lwd=3)
Making Functions
> std.dev <- function(x)
{
std.x <- sqrt(var(x));
std.x;
}
> t.stat <- function(x, mu=0)
{
n <- length(x)
t <- sqrt(n)*(mean(x)-mu)/std.dev(x)
list(t=t, p=2*(1- pt(abs(t), n-1)))
}
> z <- rnorm(30) // example data
> unlist(t.stat(z,1))
Iteration
# for, while, repeat
# examples
> x <- matrix(0, nrow=3, ncol=4)
> for(i in 1:3){ for(j in 1:4){ x[i,j]<-i+j } }
> x <- 1; i <- 0
> while(x*2<10000){ x<-x*2; i<-i+1 }
> x <- 1; i <- 0
> repeat{ i<-i+1; x<-x*2; if(x*2>10000) break }
> y <- 0
> for(i in 1:3) { y<-y+i; cat('i is',i, '\n') }
Descriptive Statistics
min(x)
max(x)
median(x)
quantile(x)
summary(x)
mean(x)
var(x)
sd(x)
quantile(x, probs=c(0,0.1,0.9), na.rm=T)
range(x, na.rm=T)
IQR(x)
pmax(x,y,z), pmin(x,y,z) # parallel (elementwise) max and min
mean(x, trim=0.2, na.rm=T) # trim 20% from each end
# "hills" is a data.frame
summary(trees)pairs(trees)
options(digits=3)cor(trees)cor(log(trees))
Statistical & Mathematical functions
### Statistical Functions
r : random number generation
d : density
p : cumulative probability (CDF)
q : quantile (inverse CDF)
binom(size,prob)
multinom(c(x1,x2,...,xr),n,c(p1,p2,...,pr))
geom(prob)
hyper(m,n,k)
pois(lambda)
unif(min,max)
norm(mean,sd)
mvnorm(mean,sigma) // multivariate normal (package mvtnorm)
t(df)
chisq(df)
f(df1,df2)
exp(rate(=1/mean))
gamma(shape,rate)
beta(shape1,shape2)
cauchy(location, scale)
lnorm(mean,sd(of log))
logis(location,scale)
weibull(shape,scale)
wilcox(m,n)
ex1) pnorm(1.) // Pr(Z < 1.0)
ex2) rt(5,7) // generating 5 random numbers from t distribution with df=7
ex3) dpois(1,3) // Pr(X = 1) = dpois(1, 3)
ex4) qnorm(0.975) // z which satisfies Pr(Z < z) = 0.975
### Mathematical Functions
sqrt, abs
sin, cos, tan
asin, acos, atan
sinh, cosh, tanh
asinh, acosh, atanh
exp, log, log10
### Beta and Gamma Functions
gamma(x)
lgamma(x)
psigamma(x, deriv=0)
digamma(x)
trigamma(x)
beta(a,b)
lbeta(a,b)
choose(n,k)
lchoose(n,k)
factorial(x)
lfactorial(x)
### numerical functions
Comparison operators: <, >, ==, !=, <=, >=
Logical operators: !, &, |, xor(), &&, ||
Arithmetic operators: +, -, *, /, ^, %/% (integer divide), %% (modulo)
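For example, integer division and the modulo operator:
ex1) 17 %/% 5 // 3 (integer division)
ex2) 17 %% 5 // 2 (remainder)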
Introduction
### Installing Packages (Linux)
> options(CRAN = "http://cran.us.r-project.org/")
> install.packages("qtl")
> source("http://www.bioconductor.org/getBioC.R")
> getBioC("exprs")
### Install Bioconductor
> source("http://www.bioconductor.org/biocLite.R")
> biocLite()
### Update Packages (Linux)
> update.packages()
### RUNNING R
> help(polr)
> ?polr // the same
> help.search("polr")
> source("c:/power.txt")
> sink("a:/power.lst") // subsequent output from terminal can be saved as "a:/power.lst")
> sink() // restore output to the terminal
> ls() // dir
> ls(pat=".dat$") // dir *.dat
> rm(x,X) // delete x & X (case-sensitive)
### RUNNING R in batch mode
# Windows
> Rcmd BATCH example.R
# Unix/Linux
> R CMD BATCH example.R
### Add default package and working directory
options(defaultPackages=c(getOption("defaultPackages"),"RWinEdt"))
setwd("G:/programming/R")
### Data
> library(affy)
> data(package="affy") // show all datasets in the package "affy"
> data(package="datasets")
### useful datasets
## PK/PD
# Indometh conc~time | Subject (2 compartment)
# Theoph (oral dose)
# Puromycin rate ~ conc state (PD)
# ChickWeight
# trees
# InsectSprays (1-way anova)
# Orange: circumference ~ age | Tree
# OrchardSprays (Latin square): decrease ~ rowpos + colpos + treatment
# PlantGrowth: weight ~ group (1 control, 2 treatments)
# ability.cov : covariance matrix
# airquality
# attitude
# chickwts (1-way anova)
# esoph (glm): Smoking, Alcohol and (O)esophageal Cancer
# iris (cluster?)
# morley (speed of light)
# warpbreaks (2-way anova)