* do-file for lecture 4b of VHM 802/812, Winter 2014
version 13
set more off
cd "h:\vhm\vhm802\data_stata"

use nocardia.dta, clear
* multiple logistic regression model
logit casecont dneo dclox dcpct 
logistic casecont dneo dclox dcpct 

* confounding effect by dcpct?
logit casecont dneo dclox dcpct 
estimates store full
logit casecont dneo dclox
estimates store red
estimates table *, se
di (2.3774092-2.2125639)/2.3774092 /* percent change for dneo */
di (1.412516-1.0097292)/1.0097292 /* percent change for dclox */
* check the association between dcpct and the exposures
kwallis dcpct if casecont==0, by(dneo)
kwallis dcpct if casecont==0, by(dclox)

* interaction dneo*dclox?
generate neoclox=dneo*dclox
logit casecont dcpct dneo dclox neoclox
logistic casecont dcpct dneo dclox neoclox
logit casecont dcpct dneo##dclox /* same thing with factor notation */
lincom 1.dneo+1.dneo#1.dclox /* log OR for dneo at dclox=1 */
lincom 1.dclox+1.dneo#1.dclox /* log OR for dclox at dneo=1 */

* evaluating linearity graphically
logit casecont numcow
summarize numcow, detail
scatter casecont numcow, ylabel(0 1)
tabulate numcow casecont /* little replication of numcow */
lowess casecont numcow, logit /* smoothed scatterplot */
* added code for smoothed scatterplot with restricted range
lowess casecont numcow, logit nograph generate(smoothy)
twoway (line smoothy numcow if numcow<100, sort)
lowess casecont numcow if numcow<100, logit /* "intuitive" but totally wrong! */

* evaluating linearity by categorization, using lintrend add-on command
lintrend casecont numcow, groups(4) plot(log) /* linear trend plot */
egen numcow4=cut(numcow), at(16,32,42,55,200) /* note: left endpoints included */
sort numcow
browse numcow numcow4
logit casecont i.numcow4 /* model with numcow categorized */
testparm i.numcow4 /* combined Wald test; could also have used LR test */

* evaluating linearity by polynomials
generate numcowsq=numcow^2
logit casecont numcow numcowsq
estat vce, corr /* strong correlation of estimates */
* code to illustrate reduction in collinearity by centering
generate numcowct=numcow-75 /* mean numcow=49, but distrib is right-skewed */
generate numcowctsq=numcowct^2
logit casecont numcowct numcowctsq
estat vce, corr

* other polynomials (advanced)
* orthogonal polynomials
orthpoly numcow, degree(2) generate(numcowop1 numcowop2)
pwcorr numcowop1 numcowop2 /* no correlation */
logit casecont numcowop1 numcowop2 /* same as quadratic model above */
estat vce, corr
drop numcowop1 numcowop2
* fractional polynomials
fp <numcow>,  scale center replace: logit casecont <numcow>
fp plot, r(none)
predict linp_fp, xb
scatter linp_fp numcow
* piecewise linear curve
mkspline numcowpl1 50 numcowpl2 75 numcowpl3=numcow /* knots at 50,75 */
logit casecont numcowpl1 numcowpl2 numcowpl3
predict linp_pwl, xb
scatter linp_pwl numcow
* close to significance with quadratic polynomial and piecewise linear (50,75)

* reload clean dataset
use nocardia.dta, clear
* preparation for model building
codebook dout /* exclude due to too little variation */
codebook dcprep /* exclude due to too little variation */
codebook dbarn /* generate dichotomous barn type, due to too sparse category */
generate dbarn2=dbarn 
replace dbarn2=1 if dbarn==3
codebook doth /* good distribution on two categories */
* stepwise model selection procedures
stepwise, pr(.1): logit casecont dneo dclox dcpct doth numcow dbarn2 bscc /* backwards elimination */
stepwise, pe(.099): logit casecont dneo dclox dcpct doth numcow dbarn2 bscc /* forward selection */
stepwise, pe(.099) pr(.1): logit casecont dneo dclox dcpct doth numcow dbarn2 bscc /* backwise stepwise */
* conclusion: all point to same model
* refit with all observations included
logit casecont dneo dclox dcpct doth dbarn2
* continue manually from model with interaction dneo*dclox included
logit casecont dneo##dclox dcpct doth dbarn2 
estat vce, corr /* look for correlation doth vs dneo#dclox */
table casecont dneo dclox, by(doth) /* table to investigate change in doth estimate */
logit casecont dneo##dclox dcpct dbarn2 /* final model */
* display statistics and compute tests for dclox
estimates store full
estimates stats /* display model-fit statistics */
testparm i.dclox dneo#dclox /* Wald test for dclox */
logit casecont dneo dcpct dbarn2 /* without dclox */
estimates store red
lrtest full, stats /* LR test for dclox */

* categorical predictor effects: effects of dbarn and dneo#dclox
logit casecont dneo##dclox dcpct i.dbarn /* final model */
testparm i.dbarn /* overall test */
lincom 2.dbarn-1.dbarn /* already in table of estimates */
lincom 3.dbarn-2.dbarn /* not in table of estimates */
pwcompare dbarn dneo#dclox
pwcompare dbarn dneo#dclox, pv /* option pv to get P-values */
pwcompare dbarn dneo#dclox, pv mcompare(bon) /* bonferroni adjustment */

* generalised linear models for binary/binomial data
glm casecont dcpct dneo dclox, family(binomial) /* same as logit */
glm casecont dcpct dneo dclox, family(binomial) link(probit) 
