Chapter 5. IV Estimation of Single-Equation Linear Models
Examples
import pandas as pd
import scipy.stats as ss
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from wooldridge import dataWoo
from linearmodels.iv import IV2SLS
Example 5.3 Parents’ and Husband’s Education as IVs
# Married women's labor-supply data (MROZ); keep the 428 observations with an observed wage
df = dataWoo("mroz").dropna(subset=['lwage'])
print(smf.ols('lwage ~ exper + expersq + educ', data=df).fit().summary())
OLS Regression Results
==============================================================================
Dep. Variable: lwage R-squared: 0.157
Model: OLS Adj. R-squared: 0.151
Method: Least Squares F-statistic: 26.29
Date: Mon, 11 Dec 2023 Prob (F-statistic): 1.30e-15
Time: 22:32:33 Log-Likelihood: -431.60
No. Observations: 428 AIC: 871.2
Df Residuals: 424 BIC: 887.4
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -0.5220 0.199 -2.628 0.009 -0.912 -0.132
exper 0.0416 0.013 3.155 0.002 0.016 0.067
expersq -0.0008 0.000 -2.063 0.040 -0.002 -3.82e-05
educ 0.1075 0.014 7.598 0.000 0.080 0.135
==============================================================================
Omnibus: 77.792 Durbin-Watson: 1.961
Prob(Omnibus): 0.000 Jarque-Bera (JB): 300.917
Skew: -0.753 Prob(JB): 4.54e-66
Kurtosis: 6.822 Cond. No. 2.21e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.21e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# 2SLS: educ is endogenous, instrumented by mother's, father's, and husband's education
IV1 = IV2SLS.from_formula(
    'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
    data=df).fit()
print(IV1)
IV-2SLS Estimation Summary
==============================================================================
Dep. Variable: lwage R-squared: 0.1495
Estimator: IV-2SLS Adj. R-squared: 0.1435
No. Observations: 428 F-statistic: 27.835
Date: Mon, Dec 11 2023 P-value (F-stat) 0.0000
Time: 22:32:33 Distribution: chi2(3)
Cov. Estimator: robust
Parameter Estimates
==============================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
------------------------------------------------------------------------------
Intercept -0.1869 0.2999 -0.6232 0.5332 -0.7746 0.4008
exper 0.0431 0.0152 2.8289 0.0047 0.0132 0.0730
expersq -0.0009 0.0004 -2.0558 0.0398 -0.0017 -4.023e-05
educ 0.0804 0.0216 3.7216 0.0002 0.0381 0.1227
==============================================================================
Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False
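linearmodels also bundles first-stage diagnostics with the fitted results; assuming the installed version exposes the first_stage attribute on the results object, the report below gives instrument-relevance measures (such as the partial F-statistic for educ) that should tell the same story as the manual first-stage regression that follows.

# First-stage diagnostics taken directly from the 2SLS results
# (assumes IVResults.first_stage is available in the installed linearmodels version)
print(IV1.first_stage)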
# First stage: regress educ on the instruments and the exogenous regressors
stage1 = smf.ols('educ ~ motheduc + fatheduc + huseduc + exper + expersq', data=df).fit()
print(stage1.summary())
OLS Regression Results
==============================================================================
Dep. Variable: educ R-squared: 0.429
Model: OLS Adj. R-squared: 0.422
Method: Least Squares F-statistic: 63.30
Date: Mon, 11 Dec 2023 Prob (F-statistic): 3.43e-49
Time: 22:32:33 Log-Likelihood: -840.80
No. Observations: 428 AIC: 1694.
Df Residuals: 422 BIC: 1718.
Df Model: 5
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.5383 0.460 12.046 0.000 4.635 6.442
motheduc 0.1142 0.031 3.708 0.000 0.054 0.175
fatheduc 0.1061 0.030 3.594 0.000 0.048 0.164
huseduc 0.3753 0.030 12.663 0.000 0.317 0.434
exper 0.0375 0.034 1.093 0.275 -0.030 0.105
expersq -0.0006 0.001 -0.585 0.559 -0.003 0.001
==============================================================================
Omnibus: 7.891 Durbin-Watson: 1.941
Prob(Omnibus): 0.019 Jarque-Bera (JB): 11.619
Skew: -0.105 Prob(JB): 0.00300
Kurtosis: 3.780 Cond. No. 1.96e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.96e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# Joint significance of the three instruments in the first stage
print(stage1.f_test('(motheduc=fatheduc=huseduc=0)'))
<F test: F=104.29424463273558, p=1.5857824440176843e-50, df_denom=422, df_num=3>
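To see where the 2SLS point estimates come from, the second stage can be run by hand with educ replaced by its first-stage fitted values. This is only a sketch (the educ_hat column name is mine): the coefficients should match IV1 above, but the OLS standard errors from this manual second stage are not valid because they ignore the first-stage estimation.

# Manual second stage: OLS of lwage on the exogenous regressors and the
# first-stage fitted values of educ; report only the point estimates
df['educ_hat'] = stage1.fittedvalues
manual_2sls = smf.ols('lwage ~ exper + expersq + educ_hat', data=df).fit()
print(manual_2sls.params)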
Example 5.4 Parents’ and Husband’s Education as IVs, cont’d
# Add kidslt6 and kidsge6 to the wage equation; conventional (homoskedastic) standard errors
IV2 = IV2SLS.from_formula(
    'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc] + kidslt6 + kidsge6',
    data=df).fit(cov_type='unadjusted')
print(IV2)
IV-2SLS Estimation Summary
==============================================================================
Dep. Variable: lwage R-squared: 0.1505
Estimator: IV-2SLS Adj. R-squared: 0.1404
No. Observations: 428 F-statistic: 35.923
Date: Mon, Dec 11 2023 P-value (F-stat) 0.0000
Time: 22:32:33 Distribution: chi2(5)
Cov. Estimator: unadjusted
Parameter Estimates
==============================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
------------------------------------------------------------------------------
Intercept -0.1315 0.3017 -0.4359 0.6629 -0.7229 0.4598
exper 0.0415 0.0134 3.1080 0.0019 0.0153 0.0677
expersq -0.0009 0.0004 -2.1743 0.0297 -0.0016 -8.456e-05
kidslt6 -0.0313 0.0855 -0.3665 0.7140 -0.1989 0.1362
kidsge6 -0.0182 0.0270 -0.6761 0.4990 -0.0710 0.0346
educ 0.0799 0.0222 3.5976 0.0003 0.0364 0.1234
==============================================================================
Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Unadjusted Covariance (Homoskedastic)
Debiased: False
# 2SLS residuals from the Example 5.3 model (without the kids variables)
df['uhat'] = IV1.resids
# Regress those residuals on all exogenous variables, with educ still instrumented;
# N * R-squared from this regression gives an LM test that kidslt6 and kidsge6 can be excluded
IV3 = IV2SLS.from_formula(
    'uhat ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc] + kidslt6 + kidsge6',
    data=df).fit()
print(IV3)
IV-2SLS Estimation Summary
==============================================================================
Dep. Variable: uhat R-squared: 0.0011
Estimator: IV-2SLS Adj. R-squared: -0.0107
No. Observations: 428 F-statistic: 0.5040
Date: Mon, Dec 11 2023 P-value (F-stat) 0.9920
Time: 22:32:33 Distribution: chi2(5)
Cov. Estimator: robust
Parameter Estimates
==============================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
------------------------------------------------------------------------------
Intercept 0.0553 0.3184 0.1738 0.8620 -0.5687 0.6793
exper -0.0016 0.0153 -0.1045 0.9167 -0.0317 0.0285
expersq 5.169e-06 0.0004 0.0124 0.9901 -0.0008 0.0008
kidslt6 -0.0313 0.1007 -0.3112 0.7556 -0.2287 0.1660
kidsge6 -0.0182 0.0282 -0.6471 0.5175 -0.0734 0.0370
educ -0.0005 0.0223 -0.0235 0.9813 -0.0442 0.0432
==============================================================================
Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False
# Control-function form of the same test: v2 is the first-stage residual for educ, so OLS on the
# equation below reproduces the 2SLS point estimates and permits a regression-based F test
v2 = smf.ols('educ ~ motheduc + fatheduc + huseduc + exper + expersq + kidslt6 + kidsge6', data=df).fit().resid
IV2f = smf.ols('lwage ~ 1 + educ + exper + expersq + kidslt6 + kidsge6 + v2', data=df).fit()
# F test that kidslt6 and kidsge6 can be excluded from the wage equation
print(IV2f.f_test('(kidslt6=kidsge6=0)'))
<F test: F=0.30975430489632627, p=0.7337942346879394, df_denom=421, df_num=2>
LM1 = IV3.nobs * IV3.rsquared
P = ss.chi2.sf(LM1, 2)
print(LM1, P)
0.47737716303172517 0.7876601361080349
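The statistic is compared with a chi-squared distribution with two degrees of freedom because exactly two variables, kidslt6 and kidsge6, are being tested for exclusion. As a quick check, the value of roughly 0.48 is far below the 5% critical value computed below, so the kids variables are not jointly significant.

# 5% critical value of a chi-squared distribution with 2 degrees of freedom (about 5.99)
print(ss.chi2.ppf(0.95, 2))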
# Re-estimate with the default heteroskedasticity-robust covariance
IV2 = IV2SLS.from_formula(
    'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc] + kidslt6 + kidsge6',
    data=df).fit()
print(IV2)
IV-2SLS Estimation Summary
==============================================================================
Dep. Variable: lwage R-squared: 0.1505
Estimator: IV-2SLS Adj. R-squared: 0.1404
No. Observations: 428 F-statistic: 30.243
Date: Mon, Dec 11 2023 P-value (F-stat) 0.0000
Time: 22:32:33 Distribution: chi2(5)
Cov. Estimator: robust
Parameter Estimates
==============================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
------------------------------------------------------------------------------
Intercept -0.1315 0.3184 -0.4131 0.6795 -0.7555 0.4925
exper 0.0415 0.0153 2.7054 0.0068 0.0114 0.0716
expersq -0.0009 0.0004 -2.0505 0.0403 -0.0017 -3.789e-05
kidslt6 -0.0313 0.1007 -0.3112 0.7556 -0.2287 0.1660
kidsge6 -0.0182 0.0282 -0.6471 0.5175 -0.0734 0.0370
educ 0.0799 0.0223 3.5810 0.0003 0.0362 0.1236
==============================================================================
Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False
LM test (page 107)
# Heteroskedasticity-robust score (LM) test of the exclusion of kidslt6 and kidsge6:
# 1) residuals from the restricted 2SLS model,
# 2) 2SLS residuals from regressing each added variable on the restricted model's regressors,
# 3) regress 1 on the products of (1) and (2) without an intercept; LM = N * uncentered R-squared
df['uhat'] = IV2SLS.from_formula(
    'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
    data=df).fit().resids
df['u_klt6'] = IV2SLS.from_formula(
    'kidslt6 ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
    data=df).fit().resids
df['u_kage6'] = IV2SLS.from_formula(
    'kidsge6 ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
    data=df).fit().resids
df['one'] = 1
long = smf.ols('one ~ u_klt6:uhat + u_kage6:uhat + 0', data=df).fit()
print(long.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: one R-squared (uncentered): 0.001
Model: OLS Adj. R-squared (uncentered): -0.004
Method: Least Squares F-statistic: 0.2330
Date: Mon, 11 Dec 2023 Prob (F-statistic): 0.792
Time: 22:32:33 Log-Likelihood: -607.07
No. Observations: 428 AIC: 1218.
Df Residuals: 426 BIC: 1226.
Df Model: 2
Covariance Type: nonrobust
================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------
u_klt6:uhat -0.0848 0.163 -0.520 0.603 -0.405 0.236
u_kage6:uhat -0.0218 0.058 -0.379 0.705 -0.135 0.091
==============================================================================
Omnibus: 130.945 Durbin-Watson: 0.002
Prob(Omnibus): 0.000 Jarque-Bera (JB): 6061.081
Skew: 0.439 Prob(JB): 0.00
Kurtosis: 21.415 Cond. No. 2.85
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
LM1 = long.nobs * long.rsquared
P = ss.chi2.sf(LM1, 2)
print(LM1, P)
0.46762334065794464 0.7915108668409525
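Because the auxiliary regression has no intercept and its dependent variable is identically one, N times the uncentered R-squared equals N minus the sum of squared residuals, which is the form in which the robust score statistic is usually written. A quick equivalence check:

# N - SSR from the auxiliary regression; should reproduce LM1 above
print(long.nobs - long.ssr)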
Example 5.5 IQ and KWW as Indicators of Ability
df = pd.read_csv("nls80.csv")
# Both IQ and KWW are ability indicators: treat iq as the endogenous proxy and instrument it with kww
print(IV2SLS.from_formula(
    'lwage ~ 1 + exper + tenure + married + south + urban + black + educ + [iq ~ kww]',
    data=df).fit())
IV-2SLS Estimation Summary
==============================================================================
Dep. Variable: lwage R-squared: 0.1900
Estimator: IV-2SLS Adj. R-squared: 0.1830
No. Observations: 935 F-statistic: 356.33
Date: Mon, Dec 11 2023 P-value (F-stat) 0.0000
Time: 22:32:34 Distribution: chi2(8)
Cov. Estimator: robust
Parameter Estimates
==============================================================================
Parameter Std. Err. T-stat P-value Lower CI Upper CI
------------------------------------------------------------------------------
Intercept 4.5925 0.3501 13.117 0.0000 3.9063 5.2786
exper 0.0144 0.0034 4.2234 0.0000 0.0077 0.0211
tenure 0.0105 0.0028 3.7258 0.0002 0.0050 0.0160
married 0.2007 0.0404 4.9616 0.0000 0.1214 0.2800
south -0.0516 0.0339 -1.5201 0.1285 -0.1180 0.0149
urban 0.1767 0.0274 6.4470 0.0000 0.1230 0.2304
black -0.0226 0.0798 -0.2826 0.7775 -0.1790 0.1339
educ 0.0250 0.0187 1.3410 0.1799 -0.0116 0.0616
iq 0.0130 0.0055 2.3835 0.0171 0.0023 0.0238
==============================================================================
Endogenous: iq
Instruments: kww
Robust Covariance (Heteroskedastic)
Debiased: False
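As in Example 5.3, instrument relevance can be checked with the first stage. The sketch below (stage1_iq is my name for the fitted model) regresses iq on kww and the exogenous regressors and tests the partial significance of kww.

# First stage for Example 5.5: how strongly does kww predict iq after
# controlling for the exogenous regressors?
stage1_iq = smf.ols('iq ~ kww + exper + tenure + married + south + urban + black + educ',
                    data=df).fit()
print(stage1_iq.f_test('kww = 0'))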