Chapter 9. More on Specification and Data Issues

Home | Stata | R | April 20, 2020

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from wooldridge import *


Example 9.1. Economic Model of Crime

# Example 9.1: three OLS fits of narr86 on the crime1 data set.
df = dataWoo('crime1')

# Specification without the squared terms.
crime_hetr_r = smf.ols(
    formula='narr86  ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + 1',
    data=df,
).fit()

# Specification that adds pcnvsq, pt86sq and inc86sq; it is fitted twice:
# once with the default covariance and once with cov_type='HC1'
# (the column labelled 'Robust' in the table).
crime_quad_formula = 'narr86  ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + pcnvsq + pt86sq + inc86sq + 1'
crime_hetr = smf.ols(formula=crime_quad_formula, data=df).fit()
crime_robust = smf.ols(formula=crime_quad_formula, data=df).fit(cov_type='HC1')

# Side-by-side coefficient table with N and R2 appended as extra rows.
crime_table = summary_col(
    [crime_hetr_r, crime_hetr, crime_robust],
    stars=True,
    float_format='%0.3f',
    model_names=['Hetrosced_r\n(b/se)', 'Hetrosced\n(b/se)', 'Robust\n(b/se)'],
    info_dict={
        'N': lambda res: "{0:d}".format(int(res.nobs)),
        'R2': lambda res: "{:.3f}".format(res.rsquared),
    },
)
print(crime_table)

==============================================
Hetrosced_r Hetrosced   Robust
(b/se)     (b/se)    (b/se)
----------------------------------------------
Intercept      0.569***    0.505***  0.505***
(0.036)     (0.037)   (0.039)
R-squared      0.072       0.103     0.103
avgsen         -0.011      -0.017    -0.017
(0.012)     (0.012)   (0.014)
black          0.327***    0.292***  0.292***
(0.045)     (0.045)   (0.058)
hispan         0.194***    0.164***  0.164***
(0.040)     (0.039)   (0.040)
inc86          -0.001***   -0.003*** -0.003***
(0.000)     (0.001)   (0.001)
inc86sq                    0.000***  0.000***
(0.000)   (0.000)
pcnv           -0.133***   0.553***  0.553***
(0.040)     (0.154)   (0.170)
pcnvsq                     -0.730*** -0.730***
(0.156)   (0.172)
pt86sq                     -0.030*** -0.030***
(0.004)   (0.006)
ptime86        -0.041***   0.287***  0.287***
(0.009)     (0.044)   (0.069)
qemp86         -0.051***   -0.014    -0.014
(0.014)     (0.017)   (0.017)
tottime        0.012       0.012     0.012
(0.009)     (0.009)   (0.013)
N              2725        2725      2725
R2             0.072       0.103     0.103
==============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Example 9.2. Housing Price Equation

# Example 9.2: linear house-price model, then the same model augmented
# with the squared and cubed fitted values.
df = dataWoo("hprice1")

hprice_reg = smf.ols('price ~ lotsize + sqrft + bdrms', data=df).fit()

# Compute the fitted values once and reuse them for both powers.
# prhat2 / prhat3 are NOT added to df; the formula below refers to them by
# name, so they must stay defined at this scope.
price_hat = hprice_reg.predict()
prhat2 = price_hat**2
prhat3 = price_hat**3
hprice_reg_pol = smf.ols('price ~ lotsize + sqrft + bdrms + prhat2 + prhat3 + 1 ', data=df).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# (The original call was left unterminated; the dict and both calls are
# closed here.)
print(summary_col([hprice_reg, hprice_reg_pol], stars=True, float_format='%0.3f',
                  model_names=['Linear\n(b/se)', 'Polynomial\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

==================================
Linear  Polynomial
(b/se)    (b/se)
----------------------------------
Intercept      -21.770  166.097
(29.475) (317.433)
R-squared      0.672    0.706
bdrms          13.853   2.175
(9.010)  (33.888)
lotsize        0.002*** 0.000
(0.001)  (0.005)
prhat2                  0.000
(0.007)
prhat3                  0.000
(0.000)
sqrft          0.123*** 0.018
(0.013)  (0.299)
N              88       88
R2             0.672    0.706
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

# Joint F test of the restriction prhat2 = prhat3 = 0 in the augmented
# price model.
hypotheses = '(prhat2  = prhat3 = 0)'
f_test = hprice_reg_pol.f_test(r_matrix=hypotheses)
print(f_test)

<F test: F=4.668205534947545, p=0.01202171144289627, df_denom=82, df_num=2>


Logarithmic form

# Logarithmic form of Example 9.2: same two-step procedure using lprice,
# llotsize and lsqrft (df is still the hprice1 data loaded earlier).
lhprice_reg = smf.ols('lprice ~ llotsize + lsqrft + bdrms', data=df).fit()

# Compute the fitted values once and reuse them for both powers.
# lprhat2 / lprhat3 are NOT added to df; the formula below refers to them
# by name, so they must stay defined at this scope.
lprice_hat = lhprice_reg.predict()
lprhat2 = lprice_hat**2
lprhat3 = lprice_hat**3
lhprice_reg_pol = smf.ols('lprice ~ llotsize + lsqrft + bdrms + lprhat2 + lprhat3 + 1 ', data=df).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# (The original call was left unterminated; the dict and both calls are
# closed here.)
print(summary_col([lhprice_reg, lhprice_reg_pol], stars=True, float_format='%0.3f',
                  model_names=['Linear_L\n(b/se)', 'Polynomial_L\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

====================================
Linear_L Polynomial_L
(b/se)     (b/se)
------------------------------------
Intercept      -1.297** 87.886
(0.651)  (240.974)
R-squared      0.643    0.664
bdrms          0.037    -0.925
(0.028)  (2.770)
llotsize       0.168*** -4.181
(0.038)  (12.595)
lprhat2                 3.910
(13.014)
lprhat3                 -0.193
(0.752)
lsqrft         0.700*** -17.349
(0.093)  (52.490)
N              88       88
R2             0.643    0.664
====================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01



# Joint F test of the restriction lprhat2 = lprhat3 = 0 in the augmented
# log-price model.
hypotheses = '(lprhat2  = lprhat3 = 0)'
f_test = lhprice_reg_pol.f_test(r_matrix=hypotheses)
print(f_test)

<F test: F=2.565046204755637, p=0.0830754662426802, df_denom=82, df_num=2>


Example 9.3. IQ as a Proxy for Ability

# Example 9.3: three nested log-wage regressions on the wage2 data set.
df = dataWoo("wage2")

# IQA: baseline regressors only.
IQA = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + 1', data=df).fit()
# IQB: baseline plus IQ.
IQB = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + IQ + 1', data=df).fit()
# IQC: baseline plus IQ and the educ:IQ term.
IQC = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + IQ + educ:IQ + 1', data=df).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# (The original call was left unterminated; the dict and both calls are
# closed here.)
print(summary_col([IQA, IQB, IQC], stars=True, float_format='%0.3f',
                  model_names=['IQA\n(b/se)', 'IQB\n(b/se)', 'IQC\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

============================================
IQA       IQB       IQC
(b/se)    (b/se)    (b/se)
--------------------------------------------
IQ                       0.004***  -0.001
(0.001)   (0.005)
Intercept      5.395***  5.176***  5.648***
(0.113)   (0.128)   (0.546)
R-squared      0.253     0.263     0.263
black          -0.188*** -0.143*** -0.147***
(0.038)   (0.039)   (0.040)
educ           0.065***  0.054***  0.018
(0.006)   (0.007)   (0.041)
educ:IQ                            0.000
(0.000)
exper          0.014***  0.014***  0.014***
(0.003)   (0.003)   (0.003)
married        0.199***  0.200***  0.201***
(0.039)   (0.039)   (0.039)
south          -0.091*** -0.080*** -0.080***
(0.026)   (0.026)   (0.026)
tenure         0.012***  0.011***  0.011***
(0.002)   (0.002)   (0.002)
urban          0.184***  0.182***  0.184***
(0.027)   (0.027)   (0.027)
N              935       935       935
R2             0.253     0.263     0.263
============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Example 9.4. City Crime Rates

# Example 9.4: log crime-rate regressions on the crime2 data, keeping only
# the rows where year == 87.
df = dataWoo("crime2")
df = df[df['year'] == 87]

# crimeA: unem and llawexpc only.
crimeA = smf.ols('lcrmrte ~ unem + llawexpc + 1', data=df).fit()
# crimeB: adds lcrmrt_1 as a regressor.
crimeB = smf.ols('lcrmrte ~ unem + llawexpc + lcrmrt_1 + 1', data=df).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# The second column is labelled 'crimeB' (it was 'crimeA' for both models,
# which made the two columns indistinguishable by name). The original call
# was also left unterminated; the dict and both calls are closed here.
print(summary_col([crimeA, crimeB], stars=True, float_format='%0.3f',
                  model_names=['crimeA\n(b/se)', 'crimeB\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

=================================
crimeA    crimeA
(b/se) I (b/se) II
---------------------------------
Intercept      3.343**  0.076
(1.251)  (0.821)
R-squared      0.057    0.680
lcrmrt_1                1.194***
(0.132)
llawexpc       0.203    -0.140
(0.173)  (0.109)
unem           -0.029   0.009
(0.032)  (0.020)
N              46       46
R2             0.057    0.680
=================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Example 9.8. R&D Intensity and Firm Size

# Example 9.8: rdintens regressions on the rdchem data set.
df = dataWoo("rdchem")

# RD1: all observations.
RD1 = smf.ols('rdintens ~ sales + profmarg + 1', data=df).fit()
# RD2: same specification, restricted to rows with sales < 30000.
RD2 = smf.ols('rdintens ~ sales + profmarg + 1', data=df[df['sales'] < 30000]).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# (The original call was left unterminated; the dict and both calls are
# closed here.)
print(summary_col([RD1, RD2], stars=True, float_format='%0.3f',
                  model_names=['RD1\n(b/se)', 'RD2\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

================================
RD1      RD2
(b/se)   (b/se)
--------------------------------
Intercept      2.625*** 2.297***
(0.586)  (0.592)
sales          0.000    0.000**
(0.000)  (0.000)
profmarg       0.045    0.048
(0.046)  (0.044)
R-squared      0.076    0.173
N              32       31
R2             0.076    0.173
================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Example 9.9. R&D Intensity

# Example 9.9: lrd regressed on lsales and profmarg, rdchem data set.
df = dataWoo("rdchem")

# lRD1: all observations.
lRD1 = smf.ols('lrd ~ lsales + profmarg + 1', data=df).fit()
# lRD2: same specification, restricted to rows with sales < 30000.
lRD2 = smf.ols('lrd ~ lsales + profmarg + 1', data=df[df['sales'] < 30000]).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# (The original call was left unterminated; the dict and both calls are
# closed here.)
print(summary_col([lRD1, lRD2], stars=True, float_format='%0.3f',
                  model_names=['RD1_Log\n(b/se)', 'RD2_Log\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

==================================
RD1_Log   RD2_Log
(b/se)    (b/se)
----------------------------------
Intercept      -4.378*** -4.404***
(0.468)   (0.511)
lsales         1.084***  1.088***
(0.060)   (0.067)
profmarg       0.022     0.022
(0.013)   (0.013)
R-squared      0.918     0.904
N              32        31
R2             0.918     0.904
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


Example 9.10. State Infant Mortality Rates

# Example 9.10: infmort regressions on the infmrt data, keeping only the
# rows where year == 1990.
df = dataWoo("infmrt")
df = df[df['year'] == 1990]

# infant1: all 1990 rows.
infant1 = smf.ols('infmort ~ lpcinc + lphysic + lpopul + 1', data=df).fit()
# infant2: same specification, restricted to rows with DC == 0.
infant2 = smf.ols('infmort ~ lpcinc + lphysic + lpopul + 1 ', data=df[df['DC'] == 0]).fit()

# Side-by-side table; N and R2 are appended as extra rows.
# (The original call was left unterminated; the dict and both calls are
# closed here.)
print(summary_col([infant1, infant2], stars=True, float_format='%0.3f',
                  model_names=['Infmort1\n(b/se)', 'Infmort2\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

================================
Infmort1 Infmort2
(b/se)   (b/se)
--------------------------------
Intercept      33.859   23.955*
(20.428) (12.419)
lpcinc         -4.685*  -0.567
(2.604)  (1.641)
lphysic        4.153*** -2.742**
(1.513)  (1.191)
lpopul         -0.088   0.629***
(0.287)  (0.191)
R-squared      0.139    0.273