Chapter 9. More on Specification and Data Issues#
Home | Stata | R | April 20, 2020
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from wooldridge import *
Example 9.1. Economic Model of Crime#
df = dataWoo('crime1')
crime_hetr_r = smf.ols('narr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + 1',
data=df).fit()
crime_hetr = smf.ols('narr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + pcnvsq + pt86sq + inc86sq + 1',
data=df).fit()
crime_robust = smf.ols('narr86 ~ pcnv + avgsen + tottime + ptime86 + qemp86 + inc86 + black + hispan + pcnvsq + pt86sq + inc86sq + 1',
data=df).fit(cov_type='HC1')
print(summary_col([crime_hetr_r, crime_hetr, crime_robust],stars=True,float_format='%0.3f',
model_names=['Hetrosced_r\n(b/se)','Hetrosced\n(b/se)','Robust\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
==============================================
Hetrosced_r Hetrosced Robust
(b/se) (b/se) (b/se)
----------------------------------------------
Intercept 0.569*** 0.505*** 0.505***
(0.036) (0.037) (0.039)
R-squared 0.072 0.103 0.103
R-squared Adj. 0.070 0.100 0.100
avgsen -0.011 -0.017 -0.017
(0.012) (0.012) (0.014)
black 0.327*** 0.292*** 0.292***
(0.045) (0.045) (0.058)
hispan 0.194*** 0.164*** 0.164***
(0.040) (0.039) (0.040)
inc86 -0.001*** -0.003*** -0.003***
(0.000) (0.001) (0.001)
inc86sq 0.000*** 0.000***
(0.000) (0.000)
pcnv -0.133*** 0.553*** 0.553***
(0.040) (0.154) (0.170)
pcnvsq -0.730*** -0.730***
(0.156) (0.172)
pt86sq -0.030*** -0.030***
(0.004) (0.006)
ptime86 -0.041*** 0.287*** 0.287***
(0.009) (0.044) (0.069)
qemp86 -0.051*** -0.014 -0.014
(0.014) (0.017) (0.017)
tottime 0.012 0.012 0.012
(0.009) (0.009) (0.013)
N 2725 2725 2725
R2 0.072 0.103 0.103
==============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Example 9.2. Housing Price Equation#
df = dataWoo("hprice1")
hprice_reg = smf.ols('price ~ lotsize + sqrft + bdrms', data=df).fit()
prhat2=hprice_reg.predict()**2
prhat3=hprice_reg.predict()**3
hprice_reg_pol = smf.ols('price ~ lotsize + sqrft + bdrms + prhat2 + prhat3 + 1 ', data=df).fit()
print(summary_col([hprice_reg, hprice_reg_pol],stars=True,float_format='%0.3f',
model_names=['Linear\n(b/se)','Polynomial\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
==================================
Linear Polynomial
(b/se) (b/se)
----------------------------------
Intercept -21.770 166.097
(29.475) (317.433)
R-squared 0.672 0.706
R-squared Adj. 0.661 0.688
bdrms 13.853 2.175
(9.010) (33.888)
lotsize 0.002*** 0.000
(0.001) (0.005)
prhat2 0.000
(0.007)
prhat3 0.000
(0.000)
sqrft 0.123*** 0.018
(0.013) (0.299)
N 88 88
R2 0.672 0.706
Adj.R2 0.661 0.688
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
hypotheses = '(prhat2 = prhat3 = 0)'
f_test = hprice_reg_pol.f_test(hypotheses)
print(f_test)
<F test: F=4.668205534947545, p=0.01202171144289627, df_denom=82, df_num=2>
Logarithmic form#
lhprice_reg = smf.ols('lprice ~ llotsize + lsqrft + bdrms', data=df).fit()
lprhat2=lhprice_reg.predict()**2
lprhat3=lhprice_reg.predict()**3
lhprice_reg_pol = smf.ols('lprice ~ llotsize + lsqrft + bdrms + lprhat2 + lprhat3 + 1 ', data=df).fit()
print(summary_col([lhprice_reg, lhprice_reg_pol],stars=True,float_format='%0.3f',
model_names=['Linear_L\n(b/se)','Polynomial_L\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
====================================
Linear_L Polynomial_L
(b/se) (b/se)
------------------------------------
Intercept -1.297** 87.886
(0.651) (240.974)
R-squared 0.643 0.664
R-squared Adj. 0.630 0.643
bdrms 0.037 -0.925
(0.028) (2.770)
llotsize 0.168*** -4.181
(0.038) (12.595)
lprhat2 3.910
(13.014)
lprhat3 -0.193
(0.752)
lsqrft 0.700*** -17.349
(0.093) (52.490)
N 88 88
R2 0.643 0.664
Adj.R2 0.630 0.643
====================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
hypotheses = '(lprhat2 = lprhat3 = 0)'
f_test = lhprice_reg_pol.f_test(hypotheses)
print(f_test)
<F test: F=2.565046204755637, p=0.0830754662426802, df_denom=82, df_num=2>
Example 9.3. IQ as a Proxy for Ability#
df = dataWoo("wage2")
IQA = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + 1', data=df).fit()
IQB = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + IQ + 1', data=df).fit()
IQC = smf.ols('lwage ~ educ + exper + tenure + married + south + urban + black + IQ + educ:IQ + 1', data=df).fit()
print(summary_col([IQA, IQB, IQC],stars=True,float_format='%0.3f',
model_names=['IQA\n(b/se)','IQB\n(b/se)','IQC\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
============================================
IQA IQB IQC
(b/se) (b/se) (b/se)
--------------------------------------------
IQ 0.004*** -0.001
(0.001) (0.005)
Intercept 5.395*** 5.176*** 5.648***
(0.113) (0.128) (0.546)
R-squared 0.253 0.263 0.263
R-squared Adj. 0.247 0.256 0.256
black -0.188*** -0.143*** -0.147***
(0.038) (0.039) (0.040)
educ 0.065*** 0.054*** 0.018
(0.006) (0.007) (0.041)
educ:IQ 0.000
(0.000)
exper 0.014*** 0.014*** 0.014***
(0.003) (0.003) (0.003)
married 0.199*** 0.200*** 0.201***
(0.039) (0.039) (0.039)
south -0.091*** -0.080*** -0.080***
(0.026) (0.026) (0.026)
tenure 0.012*** 0.011*** 0.011***
(0.002) (0.002) (0.002)
urban 0.184*** 0.182*** 0.184***
(0.027) (0.027) (0.027)
N 935 935 935
R2 0.253 0.263 0.263
Adj.R2 0.247 0.256 0.256
============================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Example 9.4. City Crime Rates#
df = dataWoo("crime2")
df = df[(df['year']==87)]
crimeA = smf.ols('lcrmrte ~ unem + llawexpc + 1', data=df).fit()
crimeB = smf.ols('lcrmrte ~ unem + llawexpc + lcrmrt_1 + 1', data=df).fit()
print(summary_col([crimeA, crimeB],stars=True,float_format='%0.3f',
model_names=['crimeA\n(b/se)','crimeA\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
=================================
crimeA crimeA
(b/se) I (b/se) II
---------------------------------
Intercept 3.343** 0.076
(1.251) (0.821)
R-squared 0.057 0.680
R-squared Adj. 0.013 0.657
lcrmrt_1 1.194***
(0.132)
llawexpc 0.203 -0.140
(0.173) (0.109)
unem -0.029 0.009
(0.032) (0.020)
N 46 46
R2 0.057 0.680
Adj.R2 0.013 0.657
=================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Example 9.8. R&D Intensity and Firm Size#
df = dataWoo("rdchem")
RD1 = smf.ols('rdintens ~ sales + profmarg + 1', data=df).fit()
RD2 = smf.ols('rdintens ~ sales + profmarg + 1', data=df[(df['sales']<30000)]).fit()
print(summary_col([RD1, RD2],stars=True,float_format='%0.3f',
model_names=['RD1\n(b/se)','RD2\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
================================
RD1 RD2
(b/se) (b/se)
--------------------------------
Intercept 2.625*** 2.297***
(0.586) (0.592)
sales 0.000 0.000**
(0.000) (0.000)
profmarg 0.045 0.048
(0.046) (0.044)
R-squared 0.076 0.173
R-squared Adj. 0.012 0.114
N 32 31
R2 0.076 0.173
Adj.R2 0.012 0.114
================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Example 9.9. R&D Intensity#
df = dataWoo("rdchem")
lRD1 = smf.ols('lrd ~ lsales + profmarg + 1', data=df).fit()
lRD2 = smf.ols('lrd ~ lsales + profmarg + 1', data=df[(df['sales']<30000)]).fit()
print(summary_col([lRD1, lRD2],stars=True,float_format='%0.3f',
model_names=['RD1_Log\n(b/se)','RD2_Log\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
==================================
RD1_Log RD2_Log
(b/se) (b/se)
----------------------------------
Intercept -4.378*** -4.404***
(0.468) (0.511)
lsales 1.084*** 1.088***
(0.060) (0.067)
profmarg 0.022 0.022
(0.013) (0.013)
R-squared 0.918 0.904
R-squared Adj. 0.912 0.897
N 32 31
R2 0.918 0.904
Adj.R2 0.912 0.897
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
Example 9.10. State Infant Mortality Rates#
df = dataWoo("infmrt")
df = df[(df['year']==1990)]
infant1 = smf.ols('infmort ~ lpcinc + lphysic + lpopul + 1', data=df).fit()
infant2 = smf.ols('infmort ~ lpcinc + lphysic + lpopul + 1 ', data=df[(df['DC']==0)]).fit()
print(summary_col([infant1, infant2],stars=True,float_format='%0.3f',
model_names=['Infmort1\n(b/se)','Infmort2\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared),
'Adj.R2':lambda x: "{:.3f}".format(x.rsquared_adj)}))
================================
Infmort1 Infmort2
(b/se) (b/se)
--------------------------------
Intercept 33.859 23.955*
(20.428) (12.419)
lpcinc -4.685* -0.567
(2.604) (1.641)
lphysic 4.153*** -2.742**
(1.513) (1.191)
lpopul -0.088 0.629***
(0.287) (0.191)
R-squared 0.139 0.273
R-squared Adj. 0.084 0.226
N 51 50
R2 0.139 0.273
Adj.R2 0.084 0.226
================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01