. * do-file (updated) for lecture 1a of VHM 802/812, Winter 2015 . version 13 . set more off . cd "h:\vhm\vhm802\data_stata" h:\vhm\vhm802\data_stata . . use daisy2red, clear . twoway (scatter milk120 parity) (lfit milk120 parity) . twoway (scatter milk120 parity, msymbol(circle_hollow)) (lfit milk120 parity, range(0 8)), ytitle("milk120") xtitle( > "parity") . histogram milk120 (bin=31, start=1110.2, width=145.80967) . . * regression model with prediction . regress milk120 parity Source | SS df MS Number of obs = 1536 -------------+------------------------------ F( 1, 1534) = 262.27 Model | 109234227 1 109234227 Prob > F = 0.0000 Residual | 638905966 1534 416496.718 R-squared = 0.1460 -------------+------------------------------ Adj R-squared = 0.1455 Total | 748140192 1535 487387.748 Root MSE = 645.37 ------------------------------------------------------------------------------ milk120 | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | 178.347 11.01266 16.19 0.000 156.7455 199.9484 _cons | 2727.08 34.33991 79.41 0.000 2659.722 2794.438 ------------------------------------------------------------------------------ . display invttail(1534,0.025) /* t* from Stata */ 1.9615116 . predict fit (option xb assumed; fitted values) . predict se_mean, stdp /* SE(yhat), for confidence interval */ . predict se_ind, stdf /* prediction error, for prediction interval */ . generate pred_low = fit - invttail(1534,0.025)*se_ind . generate pred_upp = fit + invttail(1534,0.025)*se_ind . list cow parity milk120 fit se_mean se_ind pred_low pred_upp in 1/10 +-------------------------------------------------------------------------------+ | cow parity milk120 fit se_mean se_ind pred_low pred_upp | |-------------------------------------------------------------------------------| 1. | 3 5 4173.0 3618.815 29.87665 646.0568 2351.567 4886.063 | 2. | 4 5 3727.3 3618.815 29.87665 646.0568 2351.567 4886.063 | 3. | 5 5 3090.8 3618.815 29.87665 646.0568 2351.567 4886.063 | 4. | 8 5 4228.4 3618.815 29.87665 646.0568 2351.567 4886.063 | 5. | 9 6 3431.1 3797.162 39.53432 646.5754 2528.896 5065.427 | |-------------------------------------------------------------------------------| 6. | 11 5 3984.7 3618.815 29.87665 646.0568 2351.567 4886.063 | 7. | 12 5 4064.7 3618.815 29.87665 646.0568 2351.567 4886.063 | 8. | 15 4 4241.5 3440.468 21.55974 645.7256 2173.869 4707.066 | 9. | 17 2 3416.6 3083.774 18.35515 645.6265 1817.37 4350.178 | 10. | 17 3 3529.5 3262.121 16.7209 645.5822 1995.804 4528.438 | +-------------------------------------------------------------------------------+ . * code modified from VER2 Fig 14.1, works for simple linear regression only . twoway (scatter milk120 parity, msymbol(circle_hollow)) (lfit milk120 parity, range(0,8)) /// > (lfitci milk120 parity, ciplot(rline) blcolor(black) blpattern(dash) range(0,8)) /// > (lfitci milk120 parity, stdf ciplot(rline) blcolor(black) blwidth(medium) blpattern(shortdash_dot_dot) range(0,8)), > /// > legend(off) ytitle("milk120") xtitle("parity") . . * residuals: individual observations . predict rawres, residuals (38 missing values generated) . predict stdres, rstandard (38 missing values generated) . predict delres, rstudent /* note: rstudent option for deletion residuals */ (38 missing values generated) . summarize rawres stdres delres Variable | Obs Mean Std. Dev. Min Max -------------+-------------------------------------------------------- rawres | 1536 -2.65e-06 645.1553 -1973.574 2137.779 stdres | 1536 -.0000951 1.000306 -3.059309 3.313621 delres | 1536 -.0000529 1.000989 -3.067684 3.324461 . list cow parity milk120 fit rawres stdres delres in 1/10 +-----------------------------------------------------------------------+ | cow parity milk120 fit rawres stdres delres | |-----------------------------------------------------------------------| 1. | 3 5 4173.0 3618.815 554.1854 .8596371 .8595639 | 2. | 4 5 3727.3 3618.815 108.4854 .1682796 .1682263 | 3. | 5 5 3090.8 3618.815 -528.0146 -.8190417 -.8189538 | 4. | 8 5 4228.4 3618.815 609.5853 .9455719 .9455392 | 5. | 9 6 3431.1 3797.162 -366.0615 -.568283 -.5681576 | |-----------------------------------------------------------------------| 6. | 11 5 3984.7 3618.815 365.8853 .5675513 .5674258 | 7. | 12 5 4064.7 3618.815 445.8853 .691645 .6915274 | 8. | 15 4 4241.5 3440.468 801.0323 1.2419 1.24212 | 9. | 17 2 3416.6 3083.774 332.8264 .5159264 .5158029 | 10. | 17 3 3529.5 3262.121 267.3793 .4144459 .414334 | +-----------------------------------------------------------------------+ . list cow parity milk120 fit rawres stdres delres if stdres~=. & abs(stdres)>3 +------------------------------------------------------------------------+ | cow parity milk120 fit rawres stdres delres | |------------------------------------------------------------------------| 669. | 765 2 1110.2 3083.774 -1973.574 -3.059309 -3.067684 | 1319. | 2440 5 5630.3 3618.815 2011.485 3.12016 3.129088 | 1341. | 2460 3 5399.9 3262.121 2137.779 3.313621 3.324461 | 1370. | 2489 3 5278.0 3262.121 2015.879 3.124673 3.133643 | +------------------------------------------------------------------------+ . display 2*1536*ttail(1533,3.324461) /* Bonferroni-corrected P-value */ 1.3928477 . display invttail(1533,.025/1536) /* cut-off for signif at 5% level */ 4.1672429 . . * assessing normality using residuals . qnorm stdres . histogram stdres, normal (bin=31, start=-3.0593085, width=.20557838) . summarize stdres, detail Standardized residuals ------------------------------------------------------------- Percentiles Smallest 1% -2.23909 -3.059309 5% -1.601332 -2.949112 10% -1.267381 -2.927706 Obs 1536 25% -.670562 -2.849384 Sum of Wgt. 1536 50% -.0180296 Mean -.0000951 Largest Std. Dev. 1.000306 75% .6480753 2.97556 90% 1.281694 3.12016 Variance 1.000612 95% 1.698061 3.124673 Skewness .129289 99% 2.410189 3.313621 Kurtosis 3.089667 . swilk stdres Shapiro-Wilk W test for normal data Variable | Obs W V z Prob>z -------------+-------------------------------------------------- stdres | 1536 0.99804 1.826 1.517 0.06468 . . * assessing linearity using residuals . scatter rawres fit . rvfplot /* same plot without generating variables; raw residuals only */ . scatter stdres fit /* most interesting residual plot */ . scatter stdres parity /* in simple regression, essentially the same as above */ . lowess stdres parity, xtitle("parity") . lowess milk120 parity, ytitle ("milk120") xtitle("parity") . . * assessing homoscedasticity . scatter stdres fit /* same as above */ . tabstat stdres, statistics(mean sd count) by(parity) Summary for variables: stdres by categories of: parity (Lactation number) parity | mean sd N ---------+------------------------------ 1 | -.4121462 .7542639 403 2 | .4093676 .9711145 369 3 | .2594253 1.015435 308 4 | .0739393 1.017689 219 5 | -.2971432 1.074628 164 6 | -.4336789 .8422888 69 7 | -.6370535 .1613464 4 ---------+------------------------------ Total | -.0000951 1.000306 1536 ---------------------------------------- . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of milk120 chi2(1) = 7.29 Prob > chi2 = 0.0069 . estat imtest Cameron & Trivedi's decomposition of IM-test --------------------------------------------------- Source | chi2 df p ---------------------+----------------------------- Heteroskedasticity | 16.48 2 0.0003 Skewness | 7.22 1 0.0072 Kurtosis | 0.75 1 0.3874 ---------------------+----------------------------- Total | 24.46 4 0.0001 --------------------------------------------------- . . * transformation . boxcox milk120 parity, model(lhs) /* lhs=left-hand-side is the default */ Fitting comparison model Iteration 0: log likelihood = -12237.344 Iteration 1: log likelihood = -12235.143 Iteration 2: log likelihood = -12235.143 Fitting full model Iteration 0: log likelihood = -12116.128 Iteration 1: log likelihood = -12112.932 Iteration 2: log likelihood = -12112.931 Number of obs = 1536 LR chi2(1) = 244.42 Log likelihood = -12112.931 Prob > chi2 = 0.000 ------------------------------------------------------------------------------ milk120 | Coef. Std. Err. z P>|z| [95% Conf. Interval] -------------+---------------------------------------------------------------- /theta | .7656256 .0921533 8.31 0.000 .5850085 .9462428 ------------------------------------------------------------------------------ Estimates of scale-variant parameters ---------------------------- | Coef. -------------+-------------- Notrans | parity | 27.0975 _cons | 554.488 -------------+-------------- /sigma | 97.53378 ---------------------------- --------------------------------------------------------- Test Restricted LR statistic P-value H0: log likelihood chi2 Prob > chi2 --------------------------------------------------------- theta = -1 -12314.672 403.48 0.000 theta = 0 -12148.834 71.81 0.000 theta = 1 -12116.128 6.39 0.011 --------------------------------------------------------- . * analysis of power-transformed outcome . generate tmilk=milk120^.75 (38 missing values generated) . regress tmilk parity Source | SS df MS Number of obs = 1536 -------------+------------------------------ F( 1, 1534) = 264.86 Model | 1103464.63 1 1103464.63 Prob > F = 0.0000 Residual | 6390947.69 1534 4166.19798 R-squared = 0.1472 -------------+------------------------------ Adj R-squared = 0.1467 Total | 7494412.32 1535 4882.3533 Root MSE = 64.546 ------------------------------------------------------------------------------ tmilk | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | 17.92527 1.101429 16.27 0.000 15.7648 20.08573 _cons | 375.9898 3.434499 109.47 0.000 369.253 382.7267 ------------------------------------------------------------------------------ . predict fit1 (option xb assumed; fitted values) . predict stdres1, rstandard (38 missing values generated) . scatter stdres1 fit1 . qnorm stdres1 . summarize stdres1, detail Standardized residuals ------------------------------------------------------------- Percentiles Smallest 1% -2.313608 -3.402181 5% -1.658406 -3.083472 10% -1.283043 -3.058562 Obs 1536 25% -.6623973 -3.01349 Sum of Wgt. 1536 50% .0068648 Mean -.0000956 Largest Std. Dev. 1.000307 75% .6662298 2.804832 90% 1.270427 2.859347 Variance 1.000614 95% 1.657495 2.936321 Skewness -.0226933 99% 2.27907 3.102081 Kurtosis 3.070858 . swilk stdres1 Shapiro-Wilk W test for normal data Variable | Obs W V z Prob>z -------------+-------------------------------------------------- stdres1 | 1536 0.99904 0.891 -0.291 0.61429 . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of tmilk chi2(1) = 3.31 Prob > chi2 = 0.0689 . * analysis of square-root transformed outcome . generate rmilk=milk120^.5 (38 missing values generated) . regress rmilk parity Source | SS df MS Number of obs = 1536 -------------+------------------------------ F( 1, 1534) = 266.23 Model | 8851.08257 1 8851.08257 Prob > F = 0.0000 Residual | 50998.9065 1534 33.2457018 R-squared = 0.1479 -------------+------------------------------ Adj R-squared = 0.1473 Total | 59849.9891 1535 38.9902209 Root MSE = 5.7659 ------------------------------------------------------------------------------ rmilk | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- parity | 1.605405 .0983907 16.32 0.000 1.41241 1.798399 _cons | 51.96426 .3068041 169.37 0.000 51.36246 52.56606 ------------------------------------------------------------------------------ . predict fit2 (option xb assumed; fitted values) . predict stdres2, rstandard (38 missing values generated) . scatter stdres2 fit2 . qnorm stdres2 . summarize stdres2, detail Standardized residuals ------------------------------------------------------------- Percentiles Smallest 1% -2.486428 -3.791986 5% -1.729319 -3.220177 10% -1.306212 -3.191329 Obs 1536 25% -.6381913 -3.184865 Sum of Wgt. 1536 50% .0332434 Mean -.0000959 Largest Std. Dev. 1.000308 75% .6782325 2.611937 90% 1.257776 2.637807 Variance 1.000616 95% 1.624026 2.753201 Skewness -.1790609 99% 2.184529 2.897921 Kurtosis 3.137836 . swilk stdres2 Shapiro-Wilk W test for normal data Variable | Obs W V z Prob>z -------------+-------------------------------------------------- stdres2 | 1536 0.99739 2.436 2.243 0.01246 . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of rmilk chi2(1) = 0.83 Prob > chi2 = 0.3616 . . * transformation: multiple regression for wpc outcome . use daisy2red, clear . generate parity1=parity-1 . generate calv_mth=month(calv_dt) . generate aut_calv=(calv_mth>=2 & calv_mth<=7) if !missing(calv_mth) . label value aut_calv noyes . generate hs100_ctr=(herd_size-251)/100 . generate hs100_ctr_sq=hs100_ctr^2 . * basic model . regress wpc hs100_ctr hs100_ctr_sq parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch Source | SS df MS Number of obs = 1574 -------------+------------------------------ F( 9, 1564) = 13.22 Model | 296062.694 9 32895.8549 Prob > F = 0.0000 Residual | 3892027.86 1564 2488.50886 R-squared = 0.0707 -------------+------------------------------ Adj R-squared = 0.0653 Total | 4188090.56 1573 2662.48605 Root MSE = 49.885 ------------------------------------------------------------------------------ wpc | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- hs100_ctr | 19.85708 2.163397 9.18 0.000 15.61361 24.10054 hs100_ctr_sq | 11.13827 3.111145 3.58 0.000 5.035817 17.24073 parity1 | 1.13721 .8583103 1.32 0.185 -.5463501 2.82077 | aut_calv | yes | -8.263839 2.537751 -3.26 0.001 -13.24159 -3.286086 | twin | yes | 20.68314 9.845165 2.10 0.036 1.37203 39.99425 | dyst | yes | 11.70041 5.462576 2.14 0.032 .985666 22.41516 | rp | yes | 5.98687 4.811976 1.24 0.214 -3.451734 15.42547 | vag_disch | yes | 1.228196 7.161395 0.17 0.864 -12.81875 15.27514 | rp#vag_disch | yes#yes | 22.85194 12.51605 1.83 0.068 -1.698056 47.40194 | _cons | 64.33029 2.634114 24.42 0.000 59.16352 69.49705 ------------------------------------------------------------------------------ . xi: regress wpc hs100_ctr hs100_ctr_sq parity1 i.aut_calv i.twin i.dyst i.rp*i.vag_disch i.aut_calv _Iaut_calv_0-1 (naturally coded; _Iaut_calv_0 omitted) i.twin _Itwin_0-1 (naturally coded; _Itwin_0 omitted) i.dyst _Idyst_0-1 (naturally coded; _Idyst_0 omitted) i.rp _Irp_0-1 (naturally coded; _Irp_0 omitted) i.vag_disch _Ivag_disch_0-1 (naturally coded; _Ivag_disch_0 omitted) i.rp*i.vag_di~h _IrpXvag_#_# (coded as above) Source | SS df MS Number of obs = 1574 -------------+------------------------------ F( 9, 1564) = 13.22 Model | 296062.694 9 32895.8549 Prob > F = 0.0000 Residual | 3892027.86 1564 2488.50886 R-squared = 0.0707 -------------+------------------------------ Adj R-squared = 0.0653 Total | 4188090.56 1573 2662.48605 Root MSE = 49.885 ------------------------------------------------------------------------------- wpc | Coef. Std. Err. t P>|t| [95% Conf. Interval] --------------+---------------------------------------------------------------- hs100_ctr | 19.85708 2.163397 9.18 0.000 15.61361 24.10054 hs100_ctr_sq | 11.13827 3.111145 3.58 0.000 5.035817 17.24073 parity1 | 1.13721 .8583103 1.32 0.185 -.5463501 2.82077 _Iaut_calv_1 | -8.263839 2.537751 -3.26 0.001 -13.24159 -3.286086 _Itwin_1 | 20.68314 9.845165 2.10 0.036 1.37203 39.99425 _Idyst_1 | 11.70041 5.462576 2.14 0.032 .985666 22.41516 _Irp_1 | 5.98687 4.811976 1.24 0.214 -3.451734 15.42547 _Ivag_disch_1 | 1.228196 7.161395 0.17 0.864 -12.81875 15.27514 _IrpXvag_1_1 | 22.85194 12.51605 1.83 0.068 -1.698056 47.40194 _cons | 64.33029 2.634114 24.42 0.000 59.16352 69.49705 ------------------------------------------------------------------------------- . . * box-cox analysis: note different specification of interactions . xi: boxcox wpc hs100_ctr hs100_ctr_sq parity1 i.aut_calv i.twin i.dyst i.rp*i.vag_disch i.aut_calv _Iaut_calv_0-1 (naturally coded; _Iaut_calv_0 omitted) i.twin _Itwin_0-1 (naturally coded; _Itwin_0 omitted) i.dyst _Idyst_0-1 (naturally coded; _Idyst_0 omitted) i.rp _Irp_0-1 (naturally coded; _Irp_0 omitted) i.vag_disch _Ivag_disch_0-1 (naturally coded; _Ivag_disch_0 omitted) i.rp*i.vag_di~h _IrpXvag_#_# (coded as above) Fitting comparison model Iteration 0: log likelihood = -8439.9903 Iteration 1: log likelihood = -8082.9461 Iteration 2: log likelihood = -8035.2171 Iteration 3: log likelihood = -8034.9512 Iteration 4: log likelihood = -8034.9511 Fitting full model Iteration 0: log likelihood = -8382.2918 Iteration 1: log likelihood = -8022.9541 Iteration 2: log likelihood = -7958.3552 Iteration 3: log likelihood = -7957.985 Iteration 4: log likelihood = -7957.9849 Number of obs = 1574 LR chi2(9) = 153.93 Log likelihood = -7957.9849 Prob > chi2 = 0.000 ------------------------------------------------------------------------------ wpc | Coef. Std. Err. z P>|z| [95% Conf. Interval] -------------+---------------------------------------------------------------- /theta | .1097595 .0270646 4.06 0.000 .0567138 .1628052 ------------------------------------------------------------------------------ Estimates of scale-variant parameters ---------------------------- | Coef. -------------+-------------- Notrans | hs100_ctr | .5236879 hs100_ctr_sq | .316867 parity1 | .0207726 _Iaut_calv_1 | -.2119782 _Itwin_1 | .5994942 _Idyst_1 | .1803076 _Irp_1 | .1697644 _Ivag_disc~1 | -.0333896 _IrpXvag_1_1 | .6352271 _cons | 4.90172 -------------+-------------- /sigma | 1.119778 ---------------------------- --------------------------------------------------------- Test Restricted LR statistic P-value H0: log likelihood chi2 Prob > chi2 --------------------------------------------------------- theta = -1 -9664.734 3413.50 0.000 theta = 0 -7966.6087 17.25 0.000 theta = 1 -8382.2918 848.61 0.000 --------------------------------------------------------- . generate lnwpc=ln(wpc) . regress lnwpc hs100_ctr hs100_ctr_sq parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch Source | SS df MS Number of obs = 1574 -------------+------------------------------ F( 9, 1564) = 18.05 Model | 86.9474063 9 9.66082293 Prob > F = 0.0000 Residual | 836.874538 1564 .535086022 R-squared = 0.0941 -------------+------------------------------ Adj R-squared = 0.0889 Total | 923.821945 1573 .587299393 Root MSE = .7315 ------------------------------------------------------------------------------ lnwpc | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- hs100_ctr | .343658 .0317233 10.83 0.000 .2814333 .4058827 hs100_ctr_sq | .2118728 .0456207 4.64 0.000 .1223885 .301357 parity1 | .0129844 .012586 1.03 0.302 -.0117028 .0376715 | aut_calv | yes | -.1371621 .0372127 -3.69 0.000 -.2101542 -.0641701 | twin | yes | .3927426 .1443661 2.72 0.007 .1095711 .6759141 | dyst | yes | .1109405 .0801013 1.39 0.166 -.0461768 .2680578 | rp | yes | .1123166 .0705612 1.59 0.112 -.0260878 .250721 | vag_disch | yes | -.0260135 .1050122 -0.25 0.804 -.231993 .1799661 | rp#vag_disch | yes#yes | .4136999 .183531 2.25 0.024 .0537072 .7736926 | _cons | 3.888587 .0386257 100.67 0.000 3.812823 3.96435 ------------------------------------------------------------------------------ . predict fit, xb . predict stdres, rstandard . scatter stdres fit . qnorm stdres . estat hettest Breusch-Pagan / Cook-Weisberg test for heteroskedasticity Ho: Constant variance Variables: fitted values of lnwpc chi2(1) = 19.98 Prob > chi2 = 0.0000 . estat imtest Cameron & Trivedi's decomposition of IM-test --------------------------------------------------- Source | chi2 df p ---------------------+----------------------------- Heteroskedasticity | 48.63 44 0.2919 Skewness | 8.75 9 0.4605 Kurtosis | 0.52 1 0.4698 ---------------------+----------------------------- Total | 57.91 54 0.3332 --------------------------------------------------- . * backtransformation of coef for dystocia . display "e^b= " exp(.1109405) ", 95% CI: " exp(-.0461768) " , " exp(.2680578) e^b= 1.1173284, 95% CI: .95487313 , 1.3074227 . * backtransformation of intercept . display "e^b= " exp(3.888587) ", 95% CI: " exp(3.812823) " , " exp(3.96435) e^b= 48.841824, 95% CI: 45.278079 , 52.686012 . * backtransformation of dystocia prediction (no contrib from other predictors) . lincom _cons+1.dyst ( 1) 1.dyst + _cons = 0 ------------------------------------------------------------------------------ lnwpc | Coef. Std. Err. t P>|t| [95% Conf. Interval] -------------+---------------------------------------------------------------- (1) | 3.999527 .081327 49.18 0.000 3.840006 4.159049 ------------------------------------------------------------------------------ . display "e^b= " exp(3.999527) ", 95% CI: " exp(3.840006) " , " exp(4.159049) e^b= 54.572331, 95% CI: 46.525754 , 64.01062