. * do-file for lecture 2b of VHM 812/802, Winter 2025
. version 18 /* works also with versions 14-17 */

. set more off

. set scheme stcolor_alt

. cd "r:\"
r:\

. 
. * Coleman data to illustrate model checking procedures
. use coleman.dta, clear

. gen obs=_n

. regress y x1-x5

      Source |       SS           df       MS      Number of obs   =        20
-------------+----------------------------------   F(5, 14)        =     27.08
       Model |  582.686401         5   116.53728   Prob > F        =    0.0000
    Residual |  60.2378924        14   4.3027066   R-squared       =    0.9063
-------------+----------------------------------   Adj R-squared   =    0.8728
       Total |  642.924294        19  33.8381207   Root MSE        =    2.0743

------------------------------------------------------------------------------
           y | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
          x1 |  -1.793333   1.233397    -1.45   0.168    -4.438706    .8520392
          x2 |   .0436015   .0532589     0.82   0.427    -.0706275    .1578304
          x3 |   .5557601   .0929564     5.98   0.000     .3563884    .7551318
          x4 |   1.110168   .4337681     2.56   0.023     .1798279    2.040508
          x5 |  -1.810919    2.02739    -0.89   0.387    -6.159239      2.5374
       _cons |   19.94857   13.62755     1.46   0.165    -9.279627    49.17676
------------------------------------------------------------------------------

. boxcox y x1-x5 // lambda=2.25 but CI includes 1 => stay at original scale
Fitting comparison model

Iteration 0:  Log likelihood = -63.081718  
Iteration 1:  Log likelihood = -62.012445  
Iteration 2:  Log likelihood = -62.011492  
Iteration 3:  Log likelihood = -62.011492  

Fitting full model

Iteration 0:  Log likelihood = -39.404464  
Iteration 1:  Log likelihood = -38.111357  
Iteration 2:  Log likelihood = -38.109884  
Iteration 3:  Log likelihood = -38.109884  

                                                  Number of obs   =         20
                                                  LR chi2(5)      =      47.80
Log likelihood = -38.109884                       Prob > chi2     =      0.000
 
------------------------------------------------------------------------------
           y | Coefficient  Std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
      /theta |   2.252691   .8174182     2.76   0.006     .6505804    3.854801
------------------------------------------------------------------------------
 
Estimates of scale-variant parameters
----------------------------
             | Coefficient
-------------+--------------
Notrans      |
          x1 |  -170.9819
          x2 |   6.214605
          x3 |   38.62343
          x4 |   101.1014
          x5 |  -148.1153
       _cons |  -125.5757
-------------+--------------
      /sigma |   137.6627
----------------------------

---------------------------------------------------------
   Test         Restricted     LR statistic
    H0:       log likelihood       chi2       Prob > chi2
---------------------------------------------------------
theta = -1      -46.685628        17.15           0.000
theta =  0      -42.360532         8.50           0.004
theta =  1      -39.404464         2.59           0.108
---------------------------------------------------------

. regress y x1-x5

      Source |       SS           df       MS      Number of obs   =        20
-------------+----------------------------------   F(5, 14)        =     27.08
       Model |  582.686401         5   116.53728   Prob > F        =    0.0000
    Residual |  60.2378924        14   4.3027066   R-squared       =    0.9063
-------------+----------------------------------   Adj R-squared   =    0.8728
       Total |  642.924294        19  33.8381207   Root MSE        =    2.0743

------------------------------------------------------------------------------
           y | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
          x1 |  -1.793333   1.233397    -1.45   0.168    -4.438706    .8520392
          x2 |   .0436015   .0532589     0.82   0.427    -.0706275    .1578304
          x3 |   .5557601   .0929564     5.98   0.000     .3563884    .7551318
          x4 |   1.110168   .4337681     2.56   0.023     .1798279    2.040508
          x5 |  -1.810919    2.02739    -0.89   0.387    -6.159239      2.5374
       _cons |   19.94857   13.62755     1.46   0.165    -9.279627    49.17676
------------------------------------------------------------------------------

. predict fit
(option xb assumed; fitted values)

. predict stdres, rstandard

. qnorm stdres

. scatter stdres fit

. estat hettest // P=0.06

Breusch–Pagan/Cook–Weisberg test for heteroskedasticity 
Assumption: Normal error terms
Variable: Fitted values of y

H0: Constant variance

    chi2(1) =   3.53
Prob > chi2 = 0.0603

. predict delres, rstudent

. summarize stdres delres

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
      stdres |         20    .0141447    1.011249  -2.045073   2.936169
      delres |         20    .0774767    1.315312  -2.353293    4.56463

. display 2*20*ttail(13,4.56463)
.01061588

. display invttail(13,.025/20) // one significant outlier: obs=18
3.7345363

. * leverage
. predict lev, hat

. scalar nobs=20

. scalar nparam=6 // includes intercept

. display "leverage cutoff:  " 2*nparam/nobs
leverage cutoff:  .6

. list x1-x5 lev // no value exceeds cutoff

     +-------------------------------------------------+
     |   x1      x2       x3      x4     x5        lev |
     |-------------------------------------------------|
  1. | 3.83   28.87      7.2    26.6   6.19   .4824874 |
  2. | 2.89    20.1   -11.71    24.4   5.17   .4862099 |
  3. | 2.86   69.05    12.32    25.7   7.04     .13309 |
  4. | 2.92    65.4    14.28    25.7    7.1   .1711694 |
  5. | 3.06   29.59     6.31    25.4   6.15   .1781282 |
     |-------------------------------------------------|
  6. | 2.07   44.82     6.16    21.6   6.41   .4999116 |
  7. | 2.52   77.37     12.7    24.9   6.86   .2391022 |
  8. | 2.45   24.67     -.17   25.01   5.78   .1074992 |
  9. | 3.13   65.01     9.85    26.6   6.51   .2853501 |
 10. | 2.44    9.99     -.05   28.01   5.57    .618044 |
     |-------------------------------------------------|
 11. | 2.09    12.2   -12.86   23.51   5.62   .2914316 |
 12. | 2.52   22.55      .92    23.6   5.34   .4027088 |
 13. | 2.22    14.3     4.77   24.51    5.8   .3693005 |
 14. | 2.67   31.79     -.96    25.8   6.19   .1088051 |
 15. | 2.71    11.6   -16.04    25.2   5.62   .3464445 |
     |-------------------------------------------------|
 16. | 3.14   68.47    10.62   25.01   6.94   .1568976 |
 17. | 3.54   42.64     2.66   25.01   6.33    .290619 |
 18. | 2.52    16.7   -10.99    24.8   6.01   .3255631 |
 19. | 2.68   86.27    15.03   25.51   7.51    .284568 |
 20. | 2.37   76.73    12.77   24.51   6.96     .22267 |
     +-------------------------------------------------+

. * influence on fit: cooksd, dfits
. predict cook, cooksd

. predict dfit, dfits

. display "Cook's D cutoff:   " 4/nobs
Cook's D cutoff:   .2

. display "DFITS cutoff:   " 2*sqrt(nparam/nobs)*(nobs>=120)+1*(nobs<120)
DFITS cutoff:   1

. list stdres delres lev cook dfit // only obs 18 has high influence

     +---------------------------------------------------------+
     |    stdres      delres        lev       cook        dfit |
     |---------------------------------------------------------|
  1. |  .2337315      .22567   .4824874   .0084888    .2178996 |
  2. | -.2353406   -.2272298   .4862099   .0087354   -.2210468 |
  3. | -2.045073   -2.353293     .13309   .1070133   -.9220654 |
  4. | -.2507936   -.2422155   .1711694   .0021649   -.1100733 |
  5. |  .4152676    .4026494   .1781282   .0062292    .1874525 |
     |---------------------------------------------------------|
  6. |  -.058422   -.0563037   .4999116   .0005687   -.0562938 |
  7. |  .3972383     .384964   .2391022   .0082643    .2157987 |
  8. | -.2215979   -.2139125   .1074992   .0009858   -.0742394 |
  9. |   .356088    .3446995   .2853501   .0084382    .2178126 |
 10. |    .16412    .1583024    .618044    .007264     .201368 |
     |---------------------------------------------------------|
 11. | -1.264579   -1.294768   .2914316   .1096215    -.830366 |
 12. |  1.089425    1.097339   .4027088   .1333669    .9010385 |
 13. | -.6366176   -.6225371   .3693005   .0395515   -.4763695 |
 14. |  -.176523   -.1702914   .1088051   .0006341   -.0595019 |
 15. | -1.060803   -1.065953   .3464445    .099419   -.7760926 |
     |---------------------------------------------------------|
 16. |  .6811914    .6675688   .1568976   .0143921    .2879811 |
 17. | -.8241417   -.8141577    .290619   .0463764   -.5211117 |
 18. |  2.936169     4.56463   .3255631   .6935931    3.171411 |
 19. |  .6398469    .6257898    .284568   .0271405    .3946729 |
 20. |  .1437086    .1385834     .22267    .000986    .0741719 |
     +---------------------------------------------------------+

. * influence on coefficients: dfbeta
. dfbeta, stub(dfb_x)

Generating DFBETA variables ...

       dfb_x1: DFBETA x1
       dfb_x2: DFBETA x2
       dfb_x3: DFBETA x3
       dfb_x4: DFBETA x4
       dfb_x5: DFBETA x5

. display "DFBETA cutoff:   " 2/sqrt(nobs)*(nobs>=120)+1*(nobs<120)
DFBETA cutoff:   1

. list dfb* // obs 18 influential for coeff of x2,x3,x5

     +-----------------------------------------------------------+
     |    dfb_x1      dfb_x2      dfb_x3      dfb_x4      dfb_x5 |
     |-----------------------------------------------------------|
  1. |  .1614153   -.0931844    .0717191    -.029151    .0357828 |
  2. | -.0592324   -.1525698    .0757882    .0132224    .1526851 |
  3. |  .0756827   -.0205473    .0571031   -.2099382   -.2564127 |
  4. | -.0033554    .0437473   -.0224562   -.0051965   -.0574873 |
  5. |  .0786158   -.1147672     .109427   -.0463141    .0402474 |
     |-----------------------------------------------------------|
  6. |  .0037679    .0164793   -.0162754    .0442629   -.0111139 |
  7. |  -.076813    .1451226   -.0037053    .0390117   -.0999592 |
  8. |  .0317623   -.0014475   -.0224271   -.0144413    .0254423 |
  9. |  .0133586    .1544458   -.0107843    .1078393   -.1366929 |
 10. | -.1097971   -.0158453    .0435539    .1622572   -.0303635 |
     |-----------------------------------------------------------|
 11. |  .2553824    .0826239    .4103158    .1565751   -.2130188 |
 12. |  .0467161    .2789285     .433746   -.2963116   -.6600307 |
 13. |  .1653133    .2867755   -.3371399    .0617986   -.0623551 |
 14. |  .0161378    .0147256    .0207168   -.0280619   -.0240135 |
 15. | -.0475978   -.0470727    .6261342   -.1228394   -.1847286 |
     |-----------------------------------------------------------|
 16. |  .1502512    .0349931   -.0387388   -.0858724    .0423927 |
 17. |  -.470125    .0443447    .0614843    .2344617    -.059246 |
 18. | -.2218324   -1.221368   -1.976353    .0014652    2.116274 |
 19. | -.1037023    .0363976   -.1032856    .0828336    .1429687 |
 20. | -.0344354     .034995   -.0005397    .0049002   -.0168981 |
     +-----------------------------------------------------------+

. 
. * explore impact of removing obs 18
. regress y x1-x5 if obs~=18

      Source |       SS           df       MS      Number of obs   =        19
-------------+----------------------------------   F(5, 13)        =     68.27
       Model |  607.736946         5  121.547389   Prob > F        =    0.0000
    Residual |  23.1438749        13  1.78029807   R-squared       =    0.9633
-------------+----------------------------------   Adj R-squared   =    0.9492
       Total |  630.880821        18  35.0489345   Root MSE        =    1.3343

------------------------------------------------------------------------------
           y | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
          x1 |  -1.617337   .7943108    -2.04   0.063    -3.333342     .098667
          x2 |   .0854436   .0354636     2.41   0.032     .0088292    .1620581
          x3 |   .6739334   .0651576    10.34   0.000     .5331689    .8146978
          x4 |   1.109759   .2790186     3.98   0.002     .5069761    1.712542
          x5 |  -4.570765   1.437446    -3.18   0.007    -7.676179   -1.465351
       _cons |   34.28739   9.311691     3.68   0.003      14.1707    54.40407
------------------------------------------------------------------------------

. predict fit18 if e(sample)
(option xb assumed; fitted values)
(1 missing value generated)

. predict stdres18 if e(sample), rstandard
(1 missing value generated)

. qnorm stdres18

. scatter stdres18 fit

. estat hettest // P=0.05

Breusch–Pagan/Cook–Weisberg test for heteroskedasticity 
Assumption: Normal error terms
Variable: Fitted values of y

H0: Constant variance

    chi2(1) =   3.92
Prob > chi2 = 0.0476

. predict delres18 if e(sample), rstudent
(1 missing value generated)

. summarize stdres18 delres18

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
    stdres18 |         19    .0181177    .9544079  -2.978977   1.481831
    delres18 |         19   -.0893966    1.360745  -5.080531   1.561686

. display 2*19*ttail(12,5.080531)
.00513785

. display invttail(12,.025/19) // new significant outlier: obs=3
3.7783276

. predict lev18 if e(sample), hat
(1 missing value generated)

. predict cook18 if e(sample), cooksd
(1 missing value generated)

. predict dfit18 if e(sample), dfits
(1 missing value generated)

. list stdres18 delres18 lev18 cook18 dfit18 // obs 3 and 19 extreme on dfit

     +---------------------------------------------------------+
     |  stdres18    delres18      lev18     cook18      dfit18 |
     |---------------------------------------------------------|
  1. |  .3869159    .3738959   .4825011   .0232633    .3610315 |
  2. | -.3933622   -.3801996   .4862285   .0244064   -.3698681 |
  3. | -2.978977   -5.080531   .1348095   .2304579   -2.005457 |
  4. | -.1195133   -.1148878   .1740819   .0005018   -.0527451 |
  5. |  .7468585    .7334666   .1785313   .0202045    .3419337 |
     |---------------------------------------------------------|
  6. |  .3270415    .3155119   .5040752    .018119     .318094 |
  7. |  .1218885    .1171736   .2481005    .000817    .0673076 |
  8. | -.2018298   -.1942163   .1083723   .0008252     -.06771 |
  9. | -.1543165   -.1483985    .302448   .0017209   -.0977162 |
 10. |  .5025933    .4876368   .6191598    .068445    .6217653 |
     |---------------------------------------------------------|
 11. |  -.367237   -.3546745   .3809769   .0138335    -.278244 |
 12. |   .656935    .6419074   .4345837   .0552839    .5627622 |
 13. | -.8714966   -.8628929   .3697256   .0742557   -.6608939 |
 14. |  .5840698     .568667   .1395861   .0092239    .2290474 |
 15. |  .3450564    .3330481   .4670354   .0173892    .3117688 |
     |---------------------------------------------------------|
 16. |   1.18364    1.203926   .1575218   .0436586    .5205842 |
 17. | -.9108062   -.9044064   .2953679   .0579564   -.5855501 |
 18. |         .           .          .          .           . |
 19. |  1.481831    1.561686    .292444   .1512615    1.004002 |
 20. |   .004945     .004751   .2244506   1.18e-06    .0025559 |
     +---------------------------------------------------------+

. dfbeta, stub(dfb18_x)

Generating DFBETA variables ...

(1 missing value generated)
     dfb18_x1: DFBETA x1
(1 missing value generated)
     dfb18_x2: DFBETA x2
(1 missing value generated)
     dfb18_x3: DFBETA x3
(1 missing value generated)
     dfb18_x4: DFBETA x4
(1 missing value generated)
     dfb18_x5: DFBETA x5

. list dfb18* // no influence on coeff

     +-----------------------------------------------------------+
     |  dfb18_x1    dfb18_x2    dfb18_x3    dfb18_x4    dfb18_x5 |
     |-----------------------------------------------------------|
  1. |  .2670316   -.1496442     .108279   -.0482982    .0545984 |
  2. | -.0991034   -.2472001    .1154615    .0221248    .2327411 |
  3. |  .1743552    .0156491    .2032355    -.453759   -.5979855 |
  4. | -.0012613    .0218439   -.0070811   -.0024714   -.0276513 |
  5. |  .1422842   -.2062048    .1765119   -.0843812    .0733644 |
     |-----------------------------------------------------------|
  6. | -.0225812   -.0970537    .0725591   -.2490679    .0688991 |
  7. | -.0228695    .0462385    .0040519    .0119409   -.0331591 |
  8. |  .0291128    .0003007   -.0162802     -.01312    .0184108 |
  9. |  -.006942   -.0710195   -.0049188   -.0469846    .0638122 |
 10. | -.3395975   -.0540428    .1128121    .5005594   -.0738779 |
     |-----------------------------------------------------------|
 11. |  .0813053    .0582596    .1639505    .0458446   -.1133784 |
 12. |  .0354521    .2013958    .2998692   -.1782003   -.4241256 |
 13. |   .230034    .3899101   -.4200776    .0856801    -.087864 |
 14. | -.0600019   -.0761471   -.1073475    .0954051    .1192823 |
 15. |  .0087589   -.0252156   -.2617462    .0425518    .1246201 |
     |-----------------------------------------------------------|
 16. |  .2691604    .0525153    -.077157   -.1549134    .0831713 |
 17. | -.5197731    .0669376    .0923884    .2613039   -.0911388 |
 18. |         .           .           .           .           . |
 19. |  -.267921    .0456437   -.3033134     .207915    .3947883 |
 20. | -.0011694    .0012191    .0000735    .0001681   -.0006219 |
     +-----------------------------------------------------------+

. * also explore impact of removing obs 3 & 18
. regress y x1-x5 if obs~=18 & obs~=3

      Source |       SS           df       MS      Number of obs   =        18
-------------+----------------------------------   F(5, 12)        =    203.20
       Model |  621.887937         5  124.377587   Prob > F        =    0.0000
    Residual |  7.34497054        12  .612080879   R-squared       =    0.9883
-------------+----------------------------------   Adj R-squared   =    0.9835
       Total |  629.232907        17  37.0137004   Root MSE        =    .78236

------------------------------------------------------------------------------
           y | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
          x1 |  -1.698542   .4660195    -3.64   0.003    -2.713912   -.6831731
          x2 |   .0851182   .0207942     4.09   0.001     .0398115     .130425
          x3 |   .6661687   .0382358    17.42   0.000       .58286    .7494774
          x4 |   1.183995   .1642542     7.21   0.000     .8261162    1.541874
          x5 |  -4.066754   .8486669    -4.79   0.000     -5.91584   -2.217667
       _cons |   29.75773   5.532239     5.38   0.000     17.70401    41.81144
------------------------------------------------------------------------------

. 
. * wpc model in daisy2red data for more complex example of model checking
. use daisy2red, clear

. gen calv_mth=month(calv_dt)

. gen aut_calv=(calv_mth>=2 & calv_mth<=7) if !missing(calv_mth)

. gen hs100=herd_size/100

. gen hs100sq=hs100^2

. gen parity1=parity-1

. regress wpc c.hs100##c.hs100 parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(9, 1564)      =     13.22
       Model |  296062.681         9  32895.8535   Prob > F        =    0.0000
    Residual |  3892027.88     1,564  2488.50887   R-squared       =    0.0707
-------------+----------------------------------   Adj R-squared   =    0.0653
       Total |  4188090.56     1,573  2662.48605   Root MSE        =    49.885

---------------------------------------------------------------------------------
            wpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -36.05705   15.05032    -2.40   0.017    -65.57798    -6.53612
                |
c.hs100#c.hs100 |   11.13827   3.111145     3.58   0.000     5.035818    17.24073
                |
        parity1 |    1.13721   .8583103     1.32   0.185      -.54635     2.82077
     1.aut_calv |  -8.263839   2.537751    -3.26   0.001    -13.24159   -3.286086
                |
           twin |
           yes  |   20.68314   9.845165     2.10   0.036      1.37203    39.99425
                |
           dyst |
           yes  |   11.70041   5.462576     2.14   0.032     .9856659    22.41516
                |
             rp |
           yes  |    5.98687   4.811976     1.24   0.214    -3.451734    15.42547
                |
      vag_disch |
           yes  |   1.228195   7.161395     0.17   0.864    -12.81875    15.27514
                |
   rp#vag_disch |
       yes#yes  |   22.85194   12.51605     1.83   0.068    -1.698056    47.40194
                |
          _cons |   84.66125   17.61671     4.81   0.000     50.10639    119.2161
---------------------------------------------------------------------------------

. predict fit
(option xb assumed; fitted values)

. predict stdres, rstandard

. qnorm stdres

. summarize stdres, detail

                   Standardized residuals
-------------------------------------------------------------
      Percentiles      Smallest
 1%    -1.295559      -1.543635
 5%    -1.085226      -1.491268
10%    -.9411177      -1.445124       Obs               1,574
25%    -.7014489      -1.439589       Sum of wgt.       1,574

50%    -.3049904                      Mean           .0000153
                        Largest       Std. dev.      1.000497
75%     .4302663       3.716394
90%     1.438834       3.910809       Variance       1.000993
95%     2.065976       4.020359       Skewness       1.371514
99%      3.24163        4.31282       Kurtosis        4.72451

. histogram stdres // right-skewed
(bin=31, start=-1.5436347, width=.18891791)

. scatter stdres fit

. estat hettest // P<0.00005

Breusch–Pagan/Cook–Weisberg test for heteroskedasticity 
Assumption: Normal error terms
Variable: Fitted values of wpc

H0: Constant variance

    chi2(1) =  20.58
Prob > chi2 = 0.0000

. estat imtest // all P-values small

Cameron & Trivedi's decomposition of IM-test

--------------------------------------------------
              Source |       chi2     df         p
---------------------+----------------------------
  Heteroskedasticity |      74.11     44    0.0030
            Skewness |     143.84      9    0.0000
            Kurtosis |      33.27      1    0.0000
---------------------+----------------------------
               Total |     251.22     54    0.0000
--------------------------------------------------

. * try transformation, note boxcox command uses old Stata syntax
. xi: boxcox wpc hs100 hs100sq aut_calv parity1 twin dyst i.rp*i.vag_disch
i.rp              _Irp_0-1            (naturally coded; _Irp_0 omitted)
i.vag_disch       _Ivag_disch_0-1     (naturally coded; _Ivag_disch_0 omitted)
i.rp*i.vag_di~h   _IrpXvag_#_#        (coded as above)
Fitting comparison model

Iteration 0:  Log likelihood = -8439.9903  
Iteration 1:  Log likelihood = -8082.9461  
Iteration 2:  Log likelihood = -8035.2171  
Iteration 3:  Log likelihood = -8034.9512  
Iteration 4:  Log likelihood = -8034.9511  

Fitting full model

Iteration 0:  Log likelihood = -8382.2918  
Iteration 1:  Log likelihood = -8022.9541  
Iteration 2:  Log likelihood = -7958.3502  
Iteration 3:  Log likelihood =  -7957.985  
Iteration 4:  Log likelihood = -7957.9849  

                                                  Number of obs   =      1,574
                                                  LR chi2(9)      =     153.93
Log likelihood = -7957.9849                       Prob > chi2     =      0.000
 
------------------------------------------------------------------------------
         wpc | Coefficient  Std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
      /theta |   .1097595   .0270646     4.06   0.000     .0567138    .1628052
------------------------------------------------------------------------------
 
Estimates of scale-variant parameters
----------------------------
             | Coefficient
-------------+--------------
Notrans      |
       hs100 |  -1.066984
     hs100sq |    .316867
    aut_calv |  -.2119782
     parity1 |   .0207726
        twin |   .5994942
        dyst |   .1803076
      _Irp_1 |   .1697644
_Ivag_disc~1 |  -.0333896
_IrpXvag_1_1 |   .6352271
       _cons |   5.583557
-------------+--------------
      /sigma |   1.119778
----------------------------

---------------------------------------------------------
   Test         Restricted     LR statistic
    H0:       log likelihood       chi2       Prob > chi2
---------------------------------------------------------
theta = -1       -9664.734      3413.50           0.000
theta =  0      -7966.6087        17.25           0.000
theta =  1      -8382.2918       848.61           0.000
---------------------------------------------------------

. generate lnwpc=ln(wpc)

. regress lnwpc c.hs100##c.hs100 parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(9, 1564)      =     18.05
       Model |  86.9474034         9   9.6608226   Prob > F        =    0.0000
    Residual |  836.874541     1,564  .535086024   R-squared       =    0.0941
-------------+----------------------------------   Adj R-squared   =    0.0889
       Total |  923.821945     1,573  .587299393   Root MSE        =     .7315

---------------------------------------------------------------------------------
          lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -.7199434   .2206927    -3.26   0.001    -1.152828   -.2870587
                |
c.hs100#c.hs100 |   .2118728   .0456208     4.64   0.000     .1223885    .3013571
                |
        parity1 |   .0129844    .012586     1.03   0.302    -.0117028    .0376715
     1.aut_calv |  -.1371621   .0372127    -3.69   0.000    -.2101542   -.0641701
                |
           twin |
           yes  |   .3927426   .1443661     2.72   0.007     .1095711    .6759141
                |
           dyst |
           yes  |   .1109405   .0801013     1.39   0.166    -.0461768    .2680578
                |
             rp |
           yes  |   .1123166   .0705612     1.59   0.112    -.0260878     .250721
                |
      vag_disch |
           yes  |  -.0260135   .1050122    -0.25   0.804     -.231993    .1799661
                |
   rp#vag_disch |
       yes#yes  |   .4136999    .183531     2.25   0.024     .0537072    .7736926
                |
          _cons |   4.360825   .2583254    16.88   0.000     3.854124    4.867525
---------------------------------------------------------------------------------

. drop fit stdres // revise if variables don't exist; alternatively use capture drop...

. predict fit
(option xb assumed; fitted values)

. predict stdres, rstandard

. qnorm stdres

. summarize stdres, detail

                   Standardized residuals
-------------------------------------------------------------
      Percentiles      Smallest
 1%    -2.211829      -5.399607
 5%    -1.564966      -4.989504
10%    -1.242075       -3.61681       Obs               1,574
25%    -.7143057         -3.316       Sum of wgt.       1,574

50%    -.0133218                      Mean            .000018
                        Largest       Std. dev.      1.000267
75%     .7199248       2.358784
90%     1.333008       2.373729       Variance       1.000533
95%     1.628151       2.452677       Skewness      -.1738834
99%      2.15899       2.507594       Kurtosis       3.382854

. histogram stdres // a few low outliers
(bin=31, start=-5.3996067, width=.25507099)

. scatter stdres fit

. estat hettest // P<0.00005

Breusch–Pagan/Cook–Weisberg test for heteroskedasticity 
Assumption: Normal error terms
Variable: Fitted values of lnwpc

H0: Constant variance

    chi2(1) =  19.98
Prob > chi2 = 0.0000

. estat imtest // all P-values>0.05 - conflicting evidence by the two tests

Cameron & Trivedi's decomposition of IM-test

--------------------------------------------------
              Source |       chi2     df         p
---------------------+----------------------------
  Heteroskedasticity |      48.63     44    0.2919
            Skewness |       8.75      9    0.4605
            Kurtosis |       0.52      1    0.4698
---------------------+----------------------------
               Total |      57.91     54    0.3332
--------------------------------------------------

. egen fitcat=cut(fit), group(10)

. tabstat stdres, statistic(mean sd count) by(fitcat) // sd looks reasonably constant

Summary for variables: stdres
Group variable: fitcat 

  fitcat |      Mean        SD         N
---------+------------------------------
       0 | -.0991147  1.049569       152
       1 | -.0123365  1.063479       154
       2 |  .0667896  1.095383       164
       3 |  .0167708  1.016686       159
       4 |  .0428468  1.004965       157
       5 |  .0531342  1.065621       154
       6 | -.0264528  1.098681       160
       7 | -.0602804  .9448753       158
       8 |  .0137793  .8347668       137
       9 |  .0018601  .7991401       179
---------+------------------------------
   Total |   .000018  1.000267      1574
----------------------------------------

. * VER uses square-root transform, but we will stick to (natural) log transform
. 
. * reload data to start from scratch
. use daisy2red, clear

. gen calv_mth=month(calv_dt)

. gen aut_calv=(calv_mth>=2 & calv_mth<=7) if !missing(calv_mth)

. gen hs100=herd_size/100

. gen hs100sq=hs100^2

. gen parity1=parity-1

. generate lnwpc=ln(wpc)

. regress lnwpc c.hs100##c.hs100 parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(9, 1564)      =     18.05
       Model |  86.9474034         9   9.6608226   Prob > F        =    0.0000
    Residual |  836.874541     1,564  .535086024   R-squared       =    0.0941
-------------+----------------------------------   Adj R-squared   =    0.0889
       Total |  923.821945     1,573  .587299393   Root MSE        =     .7315

---------------------------------------------------------------------------------
          lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -.7199434   .2206927    -3.26   0.001    -1.152828   -.2870587
                |
c.hs100#c.hs100 |   .2118728   .0456208     4.64   0.000     .1223885    .3013571
                |
        parity1 |   .0129844    .012586     1.03   0.302    -.0117028    .0376715
     1.aut_calv |  -.1371621   .0372127    -3.69   0.000    -.2101542   -.0641701
                |
           twin |
           yes  |   .3927426   .1443661     2.72   0.007     .1095711    .6759141
                |
           dyst |
           yes  |   .1109405   .0801013     1.39   0.166    -.0461768    .2680578
                |
             rp |
           yes  |   .1123166   .0705612     1.59   0.112    -.0260878     .250721
                |
      vag_disch |
           yes  |  -.0260135   .1050122    -0.25   0.804     -.231993    .1799661
                |
   rp#vag_disch |
       yes#yes  |   .4136999    .183531     2.25   0.024     .0537072    .7736926
                |
          _cons |   4.360825   .2583254    16.88   0.000     3.854124    4.867525
---------------------------------------------------------------------------------

. * check for collinearity issues
. estat vif

    Variable |       VIF       1/VIF  
-------------+----------------------
       hs100 |     55.07    0.018159
     c.hs100#|
     c.hs100 |     55.24    0.018103
     parity1 |      1.04    0.962306
  1.aut_calv |      1.02    0.985045
      1.twin |      1.03    0.967484
      1.dyst |      1.06    0.943538
        1.rp |      1.26    0.796702
 1.vag_disch |      1.60    0.624261
rp#vag_disch |
        1 1  |      1.85    0.539810
-------------+----------------------
    Mean VIF |     13.24

. estat vce, corr

Correlation matrix of coefficients of regress model

             |            c.hs100#                  1.        1.        1.        1.        1.     1.rp#          
        e(V) |    hs100   c.hs100   parity1  aut_calv      twin      dyst        rp  vag_di~h  1.vag_~h     _cons 
-------------+---------------------------------------------------------------------------------------------------
       hs100 |   1.0000                                                                                           
     c.hs100#|                                                                                                   
     c.hs100 |  -0.9907    1.0000                                                                                 
     parity1 |   0.0426   -0.0468    1.0000                                                                       
  1.aut_calv |   0.0349   -0.0265   -0.0500    1.0000                                                             
      1.twin |  -0.0435    0.0464   -0.0666   -0.0487    1.0000                                                   
      1.dyst |  -0.0579    0.0717    0.1303    0.0023   -0.0214    1.0000                                         
        1.rp |  -0.0527    0.0540    0.0318    0.0438   -0.0807   -0.0656    1.0000                               
 1.vag_disch |  -0.0680    0.0698    0.0649    0.0387   -0.0383   -0.1188    0.0742    1.0000                     
        1.rp#|                                                                                                   
 1.vag_disch |   0.0566   -0.0571   -0.0644   -0.0024   -0.0429    0.0861   -0.3873   -0.5752    1.0000           
       _cons |  -0.9772    0.9458   -0.1201   -0.1090    0.0417    0.0132    0.0235    0.0427   -0.0410    1.0000 

. * check modelling of herd_size and parity
. predict fit
(option xb assumed; fitted values)

. predict stdres, rstandard

. lowess stdres herd_size

. lowess stdres parity

. * looking at herd_size without herd_size in the model
. preserve

. regress lnwpc parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(7, 1566)      =      5.82
       Model |  23.4240047         7  3.34628639   Prob > F        =    0.0000
    Residual |   900.39794     1,566  .574966756   R-squared       =    0.0254
-------------+----------------------------------   Adj R-squared   =    0.0210
       Total |  923.821945     1,573  .587299393   Root MSE        =    .75827

------------------------------------------------------------------------------
       lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     parity1 |   .0191623   .0130272     1.47   0.142    -.0063904    .0447149
  1.aut_calv |   -.156154   .0384813    -4.06   0.000    -.2316343   -.0806737
             |
        twin |
        yes  |   .3353984   .1494624     2.24   0.025     .0422309    .6285659
             |
        dyst |
        yes  |   .0081531   .0824313     0.10   0.921    -.1535343    .1698404
             |
          rp |
        yes  |   .0906877   .0730356     1.24   0.215    -.0525701    .2339455
             |
   vag_disch |
        yes  |  -.0684004   .1085865    -0.63   0.529    -.2813906    .1445898
             |
rp#vag_disch |
    yes#yes  |   .4618901   .1899367     2.43   0.015      .089333    .8344472
             |
       _cons |   3.978786   .0359075   110.81   0.000     3.908354    4.049218
------------------------------------------------------------------------------

. predict stdres1, rstandard

. lowess stdres1 herd_size

. restore

. * lack of fit test for herd_size
. regress lnwpc i.herd_size parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(13, 1560)     =     13.30
       Model |  92.1852073        13  7.09116979   Prob > F        =    0.0000
    Residual |  831.636737     1,560  .533100473   R-squared       =    0.0998
-------------+----------------------------------   Adj R-squared   =    0.0923
       Total |  923.821945     1,573  .587299393   Root MSE        =    .73014

------------------------------------------------------------------------------
       lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
   herd_size |
        185  |  -.0026548   .0861735    -0.03   0.975    -.1716829    .1663733
        201  |  -.0786045   .0825403    -0.95   0.341     -.240506    .0832971
        235  |   .1550797   .0809602     1.92   0.056    -.0037225     .313882
        263  |   .0589591   .0796896     0.74   0.459     -.097351    .2152691
        294  |   .3074852   .0782635     3.93   0.000     .1539724     .460998
        333  |    .528529   .0763696     6.92   0.000     .3787311    .6783269
             |
     parity1 |   .0130744   .0126014     1.04   0.300    -.0116431     .037792
  1.aut_calv |  -.1315347   .0373915    -3.52   0.000    -.2048776   -.0581919
             |
        twin |
        yes  |   .3936493   .1442976     2.73   0.006     .1106116     .676687
             |
        dyst |
        yes  |   .1070154   .0806696     1.33   0.185    -.0512169    .2652476
             |
          rp |
        yes  |   .1044637   .0705255     1.48   0.139     -.033871    .2427983
             |
   vag_disch |
        yes  |   .0119934   .1071589     0.11   0.911    -.1981972    .2221841
             |
rp#vag_disch |
    yes#yes  |   .4060258   .1836165     2.21   0.027     .0458647    .7661869
             |
       _cons |   3.783685   .0710932    53.22   0.000     3.644237    3.923133
------------------------------------------------------------------------------

. scalar F_lackfit=(836.874541-831.636737)/(1564-1560)/.533100473

. display "F=" F_lackfit  "  P=" Ftail(4,1560,F_lackfit)
F=2.4562931  P=.04392327

. * lack of fit test for parity
. regress lnwpc c.hs100##c.hs100 i.parity i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(14, 1559)     =     11.62
       Model |  87.2631668        14  6.23308334   Prob > F        =    0.0000
    Residual |  836.558778     1,559  .536599601   R-squared       =    0.0945
-------------+----------------------------------   Adj R-squared   =    0.0863
       Total |  923.821945     1,573  .587299393   Root MSE        =    .73253

---------------------------------------------------------------------------------
          lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -.7237872   .2233335    -3.24   0.001    -1.161853   -.2857215
                |
c.hs100#c.hs100 |   .2123472   .0461856     4.60   0.000     .1217549    .3029396
                |
         parity |
             2  |   .0045233   .0527451     0.09   0.932    -.0989355    .1079821
             3  |   .0222099    .055756     0.40   0.690    -.0871549    .1315746
             4  |   .0547609    .061697     0.89   0.375     -.066257    .1757788
             5  |   .0211432   .0677532     0.31   0.755    -.1117537    .1540402
             6  |   .0775801   .0962505     0.81   0.420    -.1112139    .2663741
             7  |   .2140779    .368691     0.58   0.562    -.5091047    .9372604
                |
     1.aut_calv |  -.1363354   .0373412    -3.65   0.000    -.2095797   -.0630911
                |
           twin |
           yes  |   .3949597   .1446544     2.73   0.006      .111222    .6786975
                |
           dyst |
           yes  |   .1101247   .0808516     1.36   0.173    -.0484646     .268714
                |
             rp |
           yes  |   .1100733    .070909     1.55   0.121    -.0290138    .2491604
                |
      vag_disch |
           yes  |  -.0273856   .1052126    -0.26   0.795    -.2337588    .1789877
                |
   rp#vag_disch |
       yes#yes  |    .415274   .1839199     2.26   0.024     .0545176    .7760304
                |
          _cons |   4.370124   .2625064    16.65   0.000     3.855221    4.885027
---------------------------------------------------------------------------------

. scalar F_lackfit=(836.874541-836.558778)/(1564-1559)/.536599601

. display "F=" F_lackfit  "  P=" Ftail(5,1559,F_lackfit)
F=.11769036  P=.98850552

. * refit model of interest, Stata always works from the last fitted model!
. regress lnwpc c.hs100##c.hs100 parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(9, 1564)      =     18.05
       Model |  86.9474034         9   9.6608226   Prob > F        =    0.0000
    Residual |  836.874541     1,564  .535086024   R-squared       =    0.0941
-------------+----------------------------------   Adj R-squared   =    0.0889
       Total |  923.821945     1,573  .587299393   Root MSE        =     .7315

---------------------------------------------------------------------------------
          lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -.7199434   .2206927    -3.26   0.001    -1.152828   -.2870587
                |
c.hs100#c.hs100 |   .2118728   .0456208     4.64   0.000     .1223885    .3013571
                |
        parity1 |   .0129844    .012586     1.03   0.302    -.0117028    .0376715
     1.aut_calv |  -.1371621   .0372127    -3.69   0.000    -.2101542   -.0641701
                |
           twin |
           yes  |   .3927426   .1443661     2.72   0.007     .1095711    .6759141
                |
           dyst |
           yes  |   .1109405   .0801013     1.39   0.166    -.0461768    .2680578
                |
             rp |
           yes  |   .1123166   .0705612     1.59   0.112    -.0260878     .250721
                |
      vag_disch |
           yes  |  -.0260135   .1050122    -0.25   0.804     -.231993    .1799661
                |
   rp#vag_disch |
       yes#yes  |   .4136999    .183531     2.25   0.024     .0537072    .7736926
                |
          _cons |   4.360825   .2583254    16.88   0.000     3.854124    4.867525
---------------------------------------------------------------------------------

. * residuals and diagnostics, note: fit and stdres already generated above
. *predict fit
. *predict stdres, rstandard
. predict delres, rstudent

. predict lev, lev

. predict cook, cooksd

. predict dfit, dfit

. predict dfb_dyst, dfbeta(1.dyst)

. predict dfb_rp, dfbeta(1.rp)

. predict dfb_vd, dfbeta(1.vag_disch)

. predict dfb_int, dfbeta(1.rp#1.vag_disch)

. * alternatively, use dfbeta command to get all DFBETAs
. scalar nobs=1574

. scalar nparam=10 // includes intercept

. * examine outliers
. summarize stdres delres wpc lnwpc fit

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
      stdres |      1,574     .000018    1.000267  -5.399607   2.507594
      delres |      1,574   -.0000384    1.001035  -5.448908   2.511847
         wpc |      1,574    68.79924    51.59928          1        298
       lnwpc |      1,574     3.95829    .7663546          0   5.697093
         fit |      1,574     3.95829    .2351061   3.616902   4.994249

. * outlier test based on deletion residuals
. display 2*nobs*ttail(nobs-nparam-1,5.448908) // P=0.0001
.00009254

. display invttail(nobs-nparam-1,.025/nobs) // cutoff=4.17
4.1726361

. * number of standardized residuals outside -2/+2 and -3/+3
. * expect 5% (n=72) outside -2,+2 and 0.27% (n=4) outside -3,+3
. count if abs(stdres)>2 // n=50 ~ less than expected
  50

. count if abs(stdres)>3 // n=6 ~ slightly more than expected
  6

. * browse most extreme residuals
. order cow herd_size parity aut_calv dyst rp vag_disch fit stdres delres lev cook dfit dfb*

. sort stdres

. browse in 1/5 // most extreme negative residuals: two exceed cutoff, both have wpc=1

. browse in -5/-1 // most extreme positive residuals: none exceed cutoff

. * leverage and influence diagnostics
. summarize lev cook dfit

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
         lev |      1,574    .0063532    .0083435   .0016636   .0642237
        cook |      1,574    .0006351    .0013679   5.57e-10   .0174304
        dfit |      1,574    .0003931    .0797594  -.4179343   .3664328

. display "leverage cutoff:  " 2*nparam/nobs
leverage cutoff:  .01270648

. display "conservative leverage cutoff:  " 3*nparam/nobs
conservative leverage cutoff:  .01905972

. display "Cook's D cutoff:   " 4/nobs
Cook's D cutoff:   .0025413

. display "DFITS cutoff:   " 2*sqrt(nparam/nobs)*(nobs>=120)+1*(nobs<120)
DFITS cutoff:   .15941443

. * large leverage values
. count if lev>=.01905972 // many (n=108) high leverage values
  108

. sort lev

. browse in -20/-1 // most extreme leverages

. * large Cook's D values
. count if cook>=.0025413 // many (n=92) high Cook's D values
  92

. sort cook

. browse in -20/-1 // most extreme Cook's D values

. scatter cook lev, xline(0.019) yline(0.0025)

. * large DFITS values
. count if abs(dfit)>=.15941443 // many (n=92) high DFITS values
  92

. count if cook>=.0025413 & abs(dfit)>=.15941443
  92

. sort dfit

. browse in 1/10 // most extreme negative DFITS values 

. browse in -10/-1 // most extreme positive DFITS values

. scatter dfit lev, xline(0.019) yline(-0.159 0.159)

. * dfbeta diagnostics (for a subset of predictors)
. sum dfb_dyst dfb_rp dfb_vd dfb_int

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
    dfb_dyst |      1,574   -.0000162    .0299013  -.2078438   .2472617
      dfb_rp |      1,574    4.82e-07    .0254968  -.1650124   .1895015
      dfb_vd |      1,574    2.40e-06     .029381  -.2821976   .3064255
     dfb_int |      1,574   -7.50e-06    .0236208  -.2360168   .2135576

. display "DFBETA cutoff:   " 2/sqrt(nobs)*(nobs>=120)+1*(nobs<120)
DFBETA cutoff:   .05041127

. count if abs(dfb_dyst)>.05041127 // many (n=74) high DFBETA values
  74

. sort dfb_dyst

. browse in 1/10 // most extreme negative dfb_dyst values

. browse in -10/-1 // most extreme positive dfb_dyst values

. * same approach for other DFBETAs
. 
. * next explore impact of the two outliers
. list wpc lnwpc fit stdres delres lev cook dfit dfb* if abs(delres)>4.17

      +-------------------------------------------------------------------------------------------------------------------------------+
      | wpc   lnwpc        fit      stdres      delres        lev       cook        dfit   dfb_dyst     dfb_rp     dfb_vd     dfb_int |
      |-------------------------------------------------------------------------------------------------------------------------------|
1538. |   1       0   3.945861   -5.399607   -5.448908   .0019881   .0058081   -.2432013   .0444021   .0529631    .041972   -.0257843 |
1540. |   1       0   3.645548   -4.989504   -5.028087   .0023291   .0058118   -.2429413   .0506043   .0367807   .0277904   -.0214488 |
      +-------------------------------------------------------------------------------------------------------------------------------+

. regress lnwpc c.hs100##c.hs100 parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch

      Source |       SS           df       MS      Number of obs   =     1,574
-------------+----------------------------------   F(9, 1564)      =     18.05
       Model |  86.9474034         9   9.6608226   Prob > F        =    0.0000
    Residual |  836.874541     1,564  .535086024   R-squared       =    0.0941
-------------+----------------------------------   Adj R-squared   =    0.0889
       Total |  923.821945     1,573  .587299393   Root MSE        =     .7315

---------------------------------------------------------------------------------
          lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -.7199434   .2206927    -3.26   0.001    -1.152828   -.2870587
                |
c.hs100#c.hs100 |   .2118728   .0456208     4.64   0.000     .1223885    .3013571
                |
        parity1 |   .0129844    .012586     1.03   0.302    -.0117028    .0376715
     1.aut_calv |  -.1371621   .0372127    -3.69   0.000    -.2101542   -.0641701
                |
           twin |
           yes  |   .3927426   .1443661     2.72   0.007     .1095711    .6759141
                |
           dyst |
           yes  |   .1109405   .0801013     1.39   0.166    -.0461768    .2680578
                |
             rp |
           yes  |   .1123166   .0705612     1.59   0.112    -.0260878     .250721
                |
      vag_disch |
           yes  |  -.0260135   .1050122    -0.25   0.804     -.231993    .1799661
                |
   rp#vag_disch |
       yes#yes  |   .4136999    .183531     2.25   0.024     .0537072    .7736926
                |
          _cons |   4.360825   .2583254    16.88   0.000     3.854124    4.867525
---------------------------------------------------------------------------------

. estimates store full

. * refit model and redo (some) diagnostics without the two outliers
. regress lnwpc c.hs100##c.hs100 parity1 i.aut_calv i.twin i.dyst i.rp##i.vag_disch if abs(delres)<4.17

      Source |       SS           df       MS      Number of obs   =     1,572
-------------+----------------------------------   F(9, 1562)      =     18.15
       Model |  84.5110911         9  9.39012123   Prob > F        =    0.0000
    Residual |  807.934867     1,562  .517243833   R-squared       =    0.0947
-------------+----------------------------------   Adj R-squared   =    0.0895
       Total |  892.445959     1,571  .568075085   Root MSE        =     .7192

---------------------------------------------------------------------------------
          lnwpc | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
----------------+----------------------------------------------------------------
          hs100 |  -.6783733   .2170552    -3.13   0.002    -1.104124    -.252623
                |
c.hs100#c.hs100 |   .2026965   .0448706     4.52   0.000     .1146836    .2907094
                |
        parity1 |   .0113333   .0123763     0.92   0.360    -.0129427    .0356093
     1.aut_calv |  -.1369515   .0366092    -3.74   0.000    -.2087598   -.0651432
                |
           twin |
           yes  |   .3894441   .1419397     2.74   0.006     .1110316    .6678566
                |
           dyst |
           yes  |   .1033889   .0787611     1.31   0.189    -.0510998    .2578777
                |
             rp |
           yes  |   .1060341   .0693799     1.53   0.127    -.0300535    .2421216
                |
      vag_disch |
           yes  |  -.0332815   .1032512    -0.32   0.747    -.2358071    .1692441
                |
   rp#vag_disch |
       yes#yes  |   .4223009   .1804488     2.34   0.019     .0683535    .7762484
                |
          _cons |   4.326733   .2540256    17.03   0.000     3.828466       4.825
---------------------------------------------------------------------------------

. estimates store fullminus2

. estimates table full fullminus2, se // estimates look very similar, but rootMSE went down a bit

----------------------------------------
    Variable |    full      fullminus2  
-------------+--------------------------
       hs100 | -.71994342   -.67837329  
             |  .22069268    .21705522  
             |
     c.hs100#|
     c.hs100 |  .21187279    .20269652  
             |  .04562075    .04487058  
             |
     parity1 |  .01298436     .0113333  
             |  .01258597    .01237635  
             |
    aut_calv |
          1  | -.13716215   -.13695147  
             |   .0372127    .03660917  
             |
        twin |
        yes  |  .39274261     .3894441  
             |  .14436609    .14193974  
             |
        dyst |
        yes  |   .1109405    .10338895  
             |  .08010133    .07876115  
             |
          rp |
        yes  |  .11231661    .10603408  
             |  .07056115     .0693799  
             |
   vag_disch |
        yes  | -.02601348   -.03328153  
             |  .10501221    .10325122  
             |
rp#vag_disch |
    yes#yes  |  .41369993    .42230093  
             |  .18353098    .18044883  
             |
       _cons |  4.3608249     4.326733  
             |  .25832536     .2540256  
----------------------------------------
                            Legend: b/se

. estat vif

    Variable |       VIF       1/VIF  
-------------+----------------------
       hs100 |     55.08    0.018155
     c.hs100#|
     c.hs100 |     55.25    0.018098
     parity1 |      1.04    0.962288
  1.aut_calv |      1.02    0.985113
      1.twin |      1.03    0.967492
      1.dyst |      1.06    0.943456
        1.rp |      1.26    0.796690
 1.vag_disch |      1.60    0.624248
rp#vag_disch |
        1 1  |      1.85    0.539802
-------------+----------------------
    Mean VIF |     13.24

. estat vce, corr

Correlation matrix of coefficients of regress model

             |            c.hs100#                  1.        1.        1.        1.        1.     1.rp#          
        e(V) |    hs100   c.hs100   parity1  aut_calv      twin      dyst        rp  vag_di~h  1.vag_~h     _cons 
-------------+---------------------------------------------------------------------------------------------------
       hs100 |   1.0000                                                                                           
     c.hs100#|                                                                                                   
     c.hs100 |  -0.9907    1.0000                                                                                 
     parity1 |   0.0421   -0.0463    1.0000                                                                       
  1.aut_calv |   0.0348   -0.0264   -0.0501    1.0000                                                             
      1.twin |  -0.0436    0.0465   -0.0666   -0.0487    1.0000                                                   
      1.dyst |  -0.0582    0.0720    0.1305    0.0022   -0.0213    1.0000                                         
        1.rp |  -0.0530    0.0543    0.0320    0.0438   -0.0807   -0.0654    1.0000                               
 1.vag_disch |  -0.0682    0.0700    0.0650    0.0387   -0.0383   -0.1186    0.0743    1.0000                     
        1.rp#|                                                                                                   
 1.vag_disch |   0.0568   -0.0573   -0.0645   -0.0024   -0.0429    0.0860   -0.3873   -0.5752    1.0000           
       _cons |  -0.9772    0.9458   -0.1197   -0.1087    0.0417    0.0134    0.0237    0.0428   -0.0411    1.0000 

. predict fit2 if e(sample)
(option xb assumed; fitted values)
(2 missing values generated)

. predict stdres2 if e(sample), rstandard
(2 missing values generated)

. predict delres2 if e(sample), rstudent
(2 missing values generated)

. predict lev2 if e(sample), lev
(2 missing values generated)

. predict cook2 if e(sample), cooksd
(2 missing values generated)

. predict dfit2 if e(sample), dfit
(2 missing values generated)

. summarize stdres2, detail

                   Standardized residuals
-------------------------------------------------------------
      Percentiles      Smallest
 1%    -2.144261      -3.690055
 5%    -1.580315      -3.383067
10%     -1.25911      -3.145711       Obs               1,572
25%    -.7282061      -3.089859       Sum of wgt.       1,572

50%    -.0183884                      Mean           .0000171
                        Largest       Std. dev.      1.000335
75%     .7257645       2.399085
90%     1.349566       2.400366       Variance       1.000671
95%     1.644327       2.491705       Skewness      -.0164227
99%     2.181393       2.554476       Kurtosis       2.625953

. histogram stdres2 // a few low outliers
(bin=31, start=-3.6900551, width=.20143648)

. scatter stdres2 fit2

. estat hettest // P=0.0001

Breusch–Pagan/Cook–Weisberg test for heteroskedasticity 
Assumption: Normal error terms
Variable: Fitted values of lnwpc

H0: Constant variance

    chi2(1) =  16.22
Prob > chi2 = 0.0001

. estat imtest // now significant values, very strange?

Cameron & Trivedi's decomposition of IM-test

--------------------------------------------------
              Source |       chi2     df         p
---------------------+----------------------------
  Heteroskedasticity |      71.76     44    0.0051
            Skewness |      14.82      9    0.0960
            Kurtosis |       9.69      1    0.0019
---------------------+----------------------------
               Total |      96.27     54    0.0004
--------------------------------------------------

. scalar nobs=1572

. scalar nparam=10 // includes intercept

. * examine outliers
. summarize stdres2 delres2 wpc lnwpc fit

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
     stdres2 |      1,572    .0000171    1.000335  -3.690055   2.554476
     delres2 |      1,572    .0000118    1.000858  -3.705058   2.559009
         wpc |      1,574    68.79924    51.59928          1        298
       lnwpc |      1,574     3.95829    .7663546          0   5.697093
         fit |      1,574     3.95829    .2351061   3.616902   4.994249

. * outlier test based on deletion residuals
. display 2*nobs*ttail(nobs-nparam-1,3.705) // P=0.34
.34389531

. display invttail(nobs-nparam-1,.025/nobs) // cutoff=4.17
4.1723589

. * no new outliers
. summarize lev cook dfit lev2 cook2 dfit2

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
         lev |      1,574    .0063532    .0083435   .0016636   .0642237
        cook |      1,574    .0006351    .0013679   5.57e-10   .0174304
        dfit |      1,574    .0003931    .0797594  -.4179343   .3664328
        lev2 |      1,572    .0063613    .0083469   .0016659   .0642244
       cook2 |      1,572    .0006499     .001402   1.24e-09   .0181919
-------------+---------------------------------------------------------
       dfit2 |      1,572    .0003723    .0806768  -.4269926   .3736203

. 
end of do-file