Chapter 9. More on Specification and Data Issues#

Home | Stata | R | April 20, 2020

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from wooldridge import *

Example 9.1. Economic Model of Crime#

df = dataWoo('crime1')
crime_hetr_r = smf.ols('narr86  ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + 1', 
                       data=df).fit()
crime_hetr = smf.ols('narr86  ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + pcnvsq + pt86sq + inc86sq + 1', 
                     data=df).fit()
crime_robust = smf.ols('narr86  ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + pcnvsq + pt86sq + inc86sq + 1',
                       data=df).fit(cov_type='HC1')

print(summary_col([crime_hetr_r, crime_hetr, crime_robust],stars=True,float_format='%0.3f',
                  model_names=['Hetrosced_r\n(b/se)','Hetrosced\n(b/se)','Robust\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared)}))
==============================================
               Hetrosced_r Hetrosced   Robust 
                  (b/se)     (b/se)    (b/se) 
----------------------------------------------
Intercept      0.569***    0.505***  0.505*** 
               (0.036)     (0.037)   (0.039)  
R-squared      0.072       0.103     0.103    
R-squared Adj. 0.070       0.100     0.100    
avgsen         -0.011      -0.017    -0.017   
               (0.012)     (0.012)   (0.014)  
black          0.327***    0.292***  0.292*** 
               (0.045)     (0.045)   (0.058)  
hispan         0.194***    0.164***  0.164*** 
               (0.040)     (0.039)   (0.040)  
inc86          -0.001***   -0.003*** -0.003***
               (0.000)     (0.001)   (0.001)  
inc86sq                    0.000***  0.000*** 
                           (0.000)   (0.000)  
pcnv           -0.133***   0.553***  0.553*** 
               (0.040)     (0.154)   (0.170)  
pcnvsq                     -0.730*** -0.730***
                           (0.156)   (0.172)  
pt86sq                     -0.030*** -0.030***
                           (0.004)   (0.006)  
ptime86        -0.041***   0.287***  0.287*** 
               (0.009)     (0.044)   (0.069)  
qemp86         -0.051***   -0.014    -0.014   
               (0.014)     (0.017)   (0.017)  
tottime        0.012       0.012     0.012    
               (0.009)     (0.009)   (0.013)  
N              2725        2725      2725     
R2             0.072       0.103     0.103    
==============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Example 9.2. Housing Price Equation#

df = dataWoo("hprice1")

hprice_reg = smf.ols('price ~ lotsize + sqrft + bdrms', data=df).fit()
prhat2=hprice_reg.predict()**2
prhat3=hprice_reg.predict()**3
hprice_reg_pol = smf.ols('price ~ lotsize + sqrft + bdrms + prhat2 + prhat3 + 1 ', data=df).fit()

print(summary_col([hprice_reg, hprice_reg_pol],stars=True,float_format='%0.3f',
                  model_names=['Linear\n(b/se)','Polynomial\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared),
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
==================================
                Linear  Polynomial
                (b/se)    (b/se)  
----------------------------------
Intercept      -21.770  166.097   
               (29.475) (317.433) 
R-squared      0.672    0.706     
R-squared Adj. 0.661    0.688     
bdrms          13.853   2.175     
               (9.010)  (33.888)  
lotsize        0.002*** 0.000     
               (0.001)  (0.005)   
prhat2                  0.000     
                        (0.007)   
prhat3                  0.000     
                        (0.000)   
sqrft          0.123*** 0.018     
               (0.013)  (0.299)   
N              88       88        
R2             0.672    0.706     
Adj.R2         0.661    0.688     
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
hypotheses = '(prhat2  = prhat3 = 0)'
f_test = hprice_reg_pol.f_test(hypotheses)
print(f_test)
<F test: F=4.668205534947545, p=0.01202171144289627, df_denom=82, df_num=2>

Logarithmic form#

lhprice_reg = smf.ols('lprice ~ llotsize + lsqrft + bdrms', data=df).fit()
lprhat2=lhprice_reg.predict()**2
lprhat3=lhprice_reg.predict()**3
lhprice_reg_pol = smf.ols('lprice ~ llotsize + lsqrft + bdrms + lprhat2 + lprhat3 + 1 ', data=df).fit()

print(summary_col([lhprice_reg, lhprice_reg_pol],stars=True,float_format='%0.3f',
                  model_names=['Linear_L\n(b/se)','Polynomial_L\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared), 
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
====================================
               Linear_L Polynomial_L
                (b/se)     (b/se)   
------------------------------------
Intercept      -1.297** 87.886      
               (0.651)  (240.974)   
R-squared      0.643    0.664       
R-squared Adj. 0.630    0.643       
bdrms          0.037    -0.925      
               (0.028)  (2.770)     
llotsize       0.168*** -4.181      
               (0.038)  (12.595)    
lprhat2                 3.910       
                        (13.014)    
lprhat3                 -0.193      
                        (0.752)     
lsqrft         0.700*** -17.349     
               (0.093)  (52.490)    
N              88       88          
R2             0.643    0.664       
Adj.R2         0.630    0.643       
====================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

hypotheses = '(lprhat2  = lprhat3 = 0)'
f_test = lhprice_reg_pol.f_test(hypotheses)
print(f_test)
<F test: F=2.565046204755637, p=0.0830754662426802, df_denom=82, df_num=2>

Example 9.3. IQ as a Proxy for Ability#

df = dataWoo("wage2")
IQA = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + 1', data=df).fit()
IQB = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + IQ + 1', data=df).fit()
IQC = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + IQ + educ:IQ + 1', data=df).fit()

print(summary_col([IQA, IQB, IQC],stars=True,float_format='%0.3f',
                  model_names=['IQA\n(b/se)','IQB\n(b/se)','IQC\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared), 
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
============================================
                  IQA       IQB       IQC   
                 (b/se)    (b/se)    (b/se) 
--------------------------------------------
IQ                       0.004***  -0.001   
                         (0.001)   (0.005)  
Intercept      5.395***  5.176***  5.648*** 
               (0.113)   (0.128)   (0.546)  
R-squared      0.253     0.263     0.263    
R-squared Adj. 0.247     0.256     0.256    
black          -0.188*** -0.143*** -0.147***
               (0.038)   (0.039)   (0.040)  
educ           0.065***  0.054***  0.018    
               (0.006)   (0.007)   (0.041)  
educ:IQ                            0.000    
                                   (0.000)  
exper          0.014***  0.014***  0.014*** 
               (0.003)   (0.003)   (0.003)  
married        0.199***  0.200***  0.201*** 
               (0.039)   (0.039)   (0.039)  
south          -0.091*** -0.080*** -0.080***
               (0.026)   (0.026)   (0.026)  
tenure         0.012***  0.011***  0.011*** 
               (0.002)   (0.002)   (0.002)  
urban          0.184***  0.182***  0.184*** 
               (0.027)   (0.027)   (0.027)  
N              935       935       935      
R2             0.253     0.263     0.263    
Adj.R2         0.247     0.256     0.256    
============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Example 9.4. City Crime Rates#

df = dataWoo("crime2")
df = df[(df['year']==87)]
crimeA = smf.ols('lcrmrte ~ unem + llawexpc + 1', data=df).fit()
crimeB = smf.ols('lcrmrte ~ unem + llawexpc + lcrmrt_1 + 1', data=df).fit()

print(summary_col([crimeA, crimeB],stars=True,float_format='%0.3f',
                  model_names=['crimeA\n(b/se)','crimeA\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared), 
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
=================================
                crimeA    crimeA 
               (b/se) I (b/se) II
---------------------------------
Intercept      3.343**  0.076    
               (1.251)  (0.821)  
R-squared      0.057    0.680    
R-squared Adj. 0.013    0.657    
lcrmrt_1                1.194*** 
                        (0.132)  
llawexpc       0.203    -0.140   
               (0.173)  (0.109)  
unem           -0.029   0.009    
               (0.032)  (0.020)  
N              46       46       
R2             0.057    0.680    
Adj.R2         0.013    0.657    
=================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Example 9.8. R&D Intensity and Firm Size#

df = dataWoo("rdchem")
RD1 = smf.ols('rdintens ~ sales + profmarg + 1', data=df).fit()
RD2 = smf.ols('rdintens ~ sales + profmarg + 1', data=df[(df['sales']<30000)]).fit()

print(summary_col([RD1, RD2],stars=True,float_format='%0.3f',
                  model_names=['RD1\n(b/se)','RD2\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared), 
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
================================
                 RD1      RD2   
                (b/se)   (b/se) 
--------------------------------
Intercept      2.625*** 2.297***
               (0.586)  (0.592) 
sales          0.000    0.000** 
               (0.000)  (0.000) 
profmarg       0.045    0.048   
               (0.046)  (0.044) 
R-squared      0.076    0.173   
R-squared Adj. 0.012    0.114   
N              32       31      
R2             0.076    0.173   
Adj.R2         0.012    0.114   
================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Example 9.9. R&D Intensity#

df = dataWoo("rdchem")
lRD1 = smf.ols('lrd ~ lsales + profmarg + 1', data=df).fit()
lRD2 = smf.ols('lrd ~ lsales + profmarg + 1', data=df[(df['sales']<30000)]).fit()

print(summary_col([lRD1, lRD2],stars=True,float_format='%0.3f',
                  model_names=['RD1_Log\n(b/se)','RD2_Log\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared), 
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
==================================
                RD1_Log   RD2_Log 
                 (b/se)    (b/se) 
----------------------------------
Intercept      -4.378*** -4.404***
               (0.468)   (0.511)  
lsales         1.084***  1.088*** 
               (0.060)   (0.067)  
profmarg       0.022     0.022    
               (0.013)   (0.013)  
R-squared      0.918     0.904    
R-squared Adj. 0.912     0.897    
N              32        31       
R2             0.918     0.904    
Adj.R2         0.912     0.897    
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

Example 9.10. State Infant Mortality Rates#

df = dataWoo("infmrt")
df = df[(df['year']==1990)]
infant1  = smf.ols('infmort ~ lpcinc + lphysic + lpopul + 1', data=df).fit()
infant2  = smf.ols('infmort ~ lpcinc + lphysic + lpopul + 1 ', data=df[(df['DC']==0)]).fit()

print(summary_col([infant1, infant2],stars=True,float_format='%0.3f',
                  model_names=['Infmort1\n(b/se)','Infmort2\n(b/se)'],
                 info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                             'R2':lambda x: "{:.3f}".format(x.rsquared), 
                           'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
================================
               Infmort1 Infmort2
                (b/se)   (b/se) 
--------------------------------
Intercept      33.859   23.955* 
               (20.428) (12.419)
lpcinc         -4.685*  -0.567  
               (2.604)  (1.641) 
lphysic        4.153*** -2.742**
               (1.513)  (1.191) 
lpopul         -0.088   0.629***
               (0.287)  (0.191) 
R-squared      0.139    0.273   
R-squared Adj. 0.084    0.226   
N              51       50      
R2             0.139    0.273   
Adj.R2         0.084    0.226   
================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01