import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as ss
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from wooldridge import *
df=dataWoo('wage1')
df['marrmale'] = 0
df.loc[(df['female'] ==0) & (df['married'] == 1), 'marrmale'] = 1
df['marrfem'] = 0
df.loc[(df['female'] ==1) & (df['married'] == 1), 'marrfem'] = 1
df['singfem'] = 0
df.loc[(df['female'] ==1) & (df['married'] == 0), 'singfem'] = 1
wage_hetr = smf.ols('lwage ~ marrmale + marrfem + singfem + educ + exper + expersq + tenure + tenursq + 1', data=df).fit()
wage_robust = smf.ols('lwage ~ marrmale + marrfem + singfem + educ + exper + expersq + tenure + tenursq + 1', data=df).fit(cov_type='HC1')
print(summary_col([wage_hetr, wage_robust],stars=True,float_format='%0.3f',
model_names=['Hetroskedastic\n(b/se)','Robust\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
df = dataWoo('gpa3')
df = df[(df['spring']==1)]
gpa_hetr = smf.ols('cumgpa ~ sat + hsperc + tothrs + female + black + white + 1', data=df).fit()
gpa_robust = smf.ols('cumgpa ~ sat + hsperc + tothrs + female + black + white + 1', data=df).fit(cov_type='HC1')
print(summary_col([gpa_hetr, gpa_robust],stars=True,float_format='%0.3f',
model_names=['Hetroskedastic\n(b/se)','Robust\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
df = dataWoo('crime1')
df['avgsensq'] = np.square(df['avgsen'])
crime_hetr = smf.ols('narr86 ~ pcnv + avgsen + avgsensq + ptime86 + qemp86 + inc86 + black + hispan + 1', data=df).fit()
crime_robust = smf.ols('narr86 ~ pcnv + avgsen + avgsensq + ptime86 + qemp86 + inc86 + black + hispan + 1', data=df).fit(cov_type='HC1')
print(summary_col([crime_hetr, crime_robust],stars=True,float_format='%0.3f',
model_names=['Hetroskedastic\n(b/se)','Robust\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
crime_hetr = smf.ols('narr86 ~ pcnv + ptime86 + qemp86 + inc86 + black + hispan + 1', data=df).fit()
uhat = df.narr86 - crime_hetr.predict()
uhat_reg = smf.ols('uhat ~ pcnv + avgsen + avgsensq + ptime86 + qemp86 + inc86 + black + hispan + 1', data=df).fit()
print(uhat_reg.summary())
NRsq= 2725 * 0.00127
NRsq
avgsen_reg = smf.ols('avgsen ~ pcnv + ptime86 + qemp86 + inc86 + black + hispan + 1', data=df).fit()
df['uhat_avgsen'] = (df.avgsen - avgsen_reg.predict())*uhat
avgsensq_reg = smf.ols('avgsensq ~ pcnv + ptime86 + qemp86 + inc86 + black + hispan + 1', data=df).fit()
df['uhat_avgsensq'] = (df.avgsensq - avgsensq_reg.predict())*uhat
df['one']= 1
lm_robust = smf.ols('one ~ uhat_avgsen + uhat_avgsensq + 0', data=df).fit()
print(lm_robust.summary())
robust_NRsq= 2725 * 0.001467
robust_NRsq
df = dataWoo('hprice1')
hprice_hetr= smf.ols('price ~ lotsize + sqrft + bdrms + 1', data=df).fit()
df['uhat_hp'] = np.square(df.price - hprice_hetr.predict())
uhat_hp_reg = smf.ols('uhat_hp ~ lotsize + sqrft + bdrms + 1', data=df).fit()
print(uhat_hp_reg.summary())
NRsq = 88*0.160
NRsq
hprice_hetr_lp= smf.ols('lprice ~ llotsize + lsqrft + bdrms + 1', data=df).fit()
df['uhat_hp_lp'] = np.square(df.lprice - hprice_hetr_lp.predict())
uhat_hp_lp_reg = smf.ols('uhat_hp_lp ~ llotsize + lsqrft + bdrms + 1', data=df).fit()
print(uhat_hp_lp_reg.summary())
NRsq = 88 * 0.048
NRsq
yhat = hprice_hetr_lp.predict()
yhat_sq = np.square(hprice_hetr_lp.predict())
special_reg = smf.ols('uhat_hp_lp ~ yhat + yhat_sq + 1', data=df).fit()
print(special_reg.summary())
NRsq = 88 * 0.0392
NRsq
df = dataWoo('401ksubs')
df = df[(df['fsize']==1)]
df['age25sq'] = np.square(df['age'] - 25)
OLS1 = smf.ols('nettfa ~ inc + 1', data=df).fit(cov_type='HC1')
OLS2 = smf.ols('nettfa ~ inc + age25sq + male + e401k + 1', data=df).fit(cov_type='HC1')
df['w']= 1/df.inc
WLS1 = smf.wls('nettfa ~ inc + 1', weights=df.w, data=df).fit()
WLS2 = smf.wls('nettfa ~ inc + age25sq + male + e401k+ 1', weights=df.w, data=df).fit()
WLS3 = smf.wls('nettfa ~ inc + age25sq + male + e401k+ 1', weights=df.w, data=df).fit(cov_type='HC1') # Table 8.2
print(summary_col([OLS1, OLS2, WLS1, WLS2, WLS3 ],stars=True,float_format='%0.3f',
model_names=['OLS1\n(b/se)','OLS2\n(b/se)', 'WLS1\n(b/se)','WLS2\n(b/se)', 'WLS3\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
df = dataWoo('smoke')
OLS_smk= smf.ols('cigs ~ lincome + lcigpric + educ + age + agesq + restaurn + 1', data=df).fit()
print(OLS_smk.summary())
luhat_sq = np.log(np.square(df.cigs - OLS_smk.predict()))
luhat_sq_reg = smf.ols('luhat_sq ~ lincome + lcigpric + educ + age + agesq + restaurn + 1', data=df).fit()
luhat_sq_hat = np.exp(luhat_sq_reg.predict())
w = 1/luhat_sq_hat
GLS_smk = smf.wls('cigs ~ lincome + lcigpric + educ + age + agesq + restaurn + 1', weights = w, data=df).fit()
print(GLS_smk.summary())
print(summary_col([OLS_smk, GLS_smk ],stars=True,float_format='%0.3f',
model_names=['OLS_smk\n(b/se)','GLS_smk\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
df = dataWoo('mroz')
mroz_hetr = smf.ols('inlf ~ nwifeinc + educ + exper + expersq + age + kidslt6 + kidsge6 + 1', data=df).fit()
mroz_robust = smf.ols('inlf ~ nwifeinc + educ + exper + expersq + age + kidslt6 + kidsge6 + 1', data=df).fit(cov_type='HC1')
print(summary_col([mroz_hetr, mroz_robust ],stars=True,float_format='%0.3f',
model_names=['Hetroskedastic\n(b/se)','Robust\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))
df = dataWoo('gpa1')
df['parcoll'] = 0
df.loc[(df['fathcoll'] ==1) | (df['mothcoll'] == 1), 'parcoll'] = 1
gpa_hetr = smf.ols('PC ~ hsGPA + ACT + parcoll + 1', data=df).fit()
gpa_robust = smf.ols('PC ~ hsGPA + ACT + parcoll + 1', data=df).fit(cov_type='HC1')
w = 1/(gpa_hetr.predict() - np.square(gpa_hetr.predict()))
gpa_gls = smf.wls('PC ~ hsGPA + ACT + parcoll + 1', weights = w, data=df).fit()
print(summary_col([gpa_hetr, gpa_robust, gpa_gls],stars=True,float_format='%0.3f',
model_names=['Hetroskedastic\n(b/se)','Robust\n(b/se)', 'GLS\n(b/se)'],
info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
'R2':lambda x: "{:.3f}".format(x.rsquared)}))