# R program for linear regression exercise # 3 (VER 14.3)

btb <- read.csv("h:/vhm/vhm802/data_csv/btb_episodes.csv")
btb$intvl.ln <- log(btb$intvl) #natural log
btb <- btb[with(btb, order(herd, p_year)),] #sorting by herd and p_year
btb$id <- as.numeric(row.names(btb)) # row/observation numbers
#run model (without excluding data rows with missing values)
btb.mod <- lm(intvl.ln~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb, na.action=na.exclude)
summary(btb.mod)

#Q1
btb.diag <- btb #all diagnostics to be stored in btb.diag (for convenience, not at all necessary)
btb.diag$fit <- fitted(btb.mod) 
btb.diag$stdres <- rstandard(btb.mod)
plot(stdres~fit, data=btb.diag)
#no tests for homoscedasticity implemented in standard library

#Q2
hist(btb.diag$stdres, nclass=20) # no overlaid normal distribution
qqnorm(btb.diag$stdres)
qqline(btb.diag$stdres)
shapiro.test(btb.diag$stdres)

#Q3
library(MASS)
bc <- boxcox(intvl ~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb, lambda=seq(-1, 1, 0.01))
bc <- boxcox(intvl ~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb, lambda=seq(0.1, 0.2, 0.001))
bc
btb.modbc <- lm(intvl^.165 ~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb, na.action=na.exclude)
summary(btb.modbc)
#diagnostics can be redone for this model

bc <- boxcox(intvl.ln ~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb, lambda=seq(0, 2, 0.01))
btb.modbc2 <- lm(intvl.ln^1.89 ~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb, na.action=na.exclude)
summary(btb.modbc)
#diagnostics can also be redone for this model

#Q4
#(a) extreme residuals
print(btb.diag[abs(btb.diag$stdres)>3 & !is.na(btb.diag$stdres),]) #no values listed
print(btb.diag[abs(btb.diag$stdres)>2.5 & !is.na(btb.diag$stdres),])

#(b) high leverage
btb.diag$hat <- hatvalues(btb.mod)
print(t<-btb.diag[btb.diag$hat>3*5/2987 & !is.na(btb.diag$hat),])
head(t [with( t, order(-t$hat)),], 20) #20 highest leverages

#(c) high influence 
btb.diag$cook <- cooks.distance(btb.mod)
btb.diag$dfit <- dffits(btb.mod)
print(t<-btb.diag[(btb.diag$cook>4/2987 | btb.diag$dfit>2*sqrt(5/2987))& !is.na(btb.diag$cook),])
head(t [with( t, order(-t$cook)),], 20) #20 highest cookd
head(t [with( t, order(-abs(t$dfit))),], 20) #20 most extreme dfit

#(d) extreme dfbeta
btb.diag$dfbeta <- dfbetas(btb.mod)[,2] #values for p_rct in 2nd column
print(t<-btb.diag[abs(btb.diag$dfbeta)>2/sqrt(2987) & !is.na(btb.diag$dfbeta),])
head(t [with( t, order(-abs(t$dfbeta))),], 20) #20 most extreme dfbeta

#(e) refit without observations where cooks>0.007
btb.mod2 <- lm(intvl.ln~ p_rct+hdsize+I(p_year-2000)+I((p_year-2000)^2), data=btb[btb.diag$cook<=0.007,], na.action=na.exclude)
summary(btb.mod2)