# R program for logistic regression exercise # 3 (VER 16.3)
# Jenny Yu / Henrik Stryhn, March 2014

calf <- read.csv("h:/vhm/vhm802/data_csv/calf.csv")
# restrict the data to cases actually used in final model
calf2 <- na.omit(calf[,c("sepsis", "post","umb","age")])

#Q1: Goodness-of-fit tests
calf.final<- glm(sepsis ~ I(age-10)+I((age-10)^2)+umb+as.factor(post), family=binomial(), data=calf2)
summary(calf.final)
# Hosmer-Lemeshow test; code from http://http://sas-and-r.blogspot.ca/2010/09/example-87-hosmer-and-lemeshow-goodness.html
hosmerlem = function(y, yhat, g=10) {
  cutyhat = cut(yhat, breaks = quantile(yhat, probs=seq(0, 1, 1/g)), include.lowest=TRUE)
  obs = xtabs(cbind(1 - y, y) ~ cutyhat)
  print(obs)
  expect = xtabs(cbind(1 - yhat, yhat) ~ cutyhat)
  print(expect)
  chisq = sum((obs - expect)^2/expect)
  P = 1 - pchisq(chisq, g - 2)
  #return(list(chisq=chisq,p.value=P))
  c("X^2" = chisq, Df = g - 2, "P(>Chi)" = P)
}
hosmerlem(y=calf2$sepsis, yhat=fitted(calf.final)) 
# see below for Pearson X2 statistic
# statistics for logistic model fit at defined threshold
thresh <- 0.5
predpos <- as.numeric(fitted(calf.final)>=thresh)
table(predpos, calf2$sepsis==1)
mean(predpos[calf2$sepsis==1]) # Se
1-mean(predpos[calf2$sepsis==0]) # Sp
mean(calf2$sepsis[predpos==1]) # positive predictive value
1-mean(calf2$sepsis[predpos==0]) # negative predictive value
mean(as.numeric(predpos==calf2$sepsis)) # prop. correctly classified
# for ROC curves, see Q3 

#Q2: logistic regression diagnostics
# first collapse to covariate patterns
library(doBy) # package to facilitate processing by groups
calf2.agg <- data.frame( summaryBy( sepsis~ age+post+umb, data=calf2, FUN=c(sum,length)))
calf2.agg$Ymat <- cbind( calf2.agg$sepsis.sum, calf2.agg$sepsis.length-calf2.agg$sepsis.sum)
calfagg.final<- glm(Ymat ~ I(age-10)+I((age-10)^2)+umb+as.factor(post), family=binomial(), data=calf2.agg)
summary(calfagg.final) # same as for calf.final
# note: listing gives Deviance goodness-of-fit test (Residual deviance)
sum( residuals( calfagg.final, type="pearson")^2) # Pearson goodness-of-fit test, same DF as for Deviance test

calf.diag <- calf2.agg #all diagnostics to be stored in calf.diag (for convenience, not at all necessary)
calf.diag$obsprob <- calf.diag$sepsis.sum/calf.diag$sepsis.length
calf.diag$fitprob <- fitted(calfagg.final) 
calf.diag$devres <- residuals(calfagg.final)
calf.diag$pearsres <- residuals(calfagg.final, type="pearson")
calf.diag$lev <- hatvalues(calfagg.final)
calf.diag$stdpears <- calf.diag$pearsres/sqrt(1-calf.diag$lev)
calf.diag$dfbeta <- dfbeta(calfagg.final) # separate dfbetas for each parameter
calf.diag$cooksd <- cooks.distance(calfagg.final) # Cook's D influence diagnostic
summary(calf.diag)
plot(stdpears~lev, data=calf.diag)
plot(stdpears~cooksd, data=calf.diag)
plot(cooksd~fitprob, data=calf.diag)
plot(stdpears~fitprob, data=calf.diag)
# no delta X2 or delta deviance statistics
# additional data manipulations (e.g. analysis without certain observations) not shown here, 
# see ver14_3sol.R for sample code

#Q3: predicting sepsis
# ROC curves apparently included in epicalc package (not shown here)

#Q4: choosing an appropriate cutpoint
# use code from Q1 or ROC curve function in epicalc package
thresh <- 0.2
predpos <- as.numeric(fitted(calf.final)>=thresh) # note: using calf.final model fit
table(predpos, calf2$sepsis==1)
mean(predpos[calf2$sepsis==1]) # Se
1-mean(predpos[calf2$sepsis==0]) # Sp
mean(calf2$sepsis[predpos==1]) # positive predictive value
1-mean(calf2$sepsis[predpos==0]) # negative predictive value
mean(as.numeric(predpos==calf2$sepsis)) # prop. correctly classified
# same thing for a threshold of 0.6