# Chapter 13. Pooling Cross Sections across Time#

import numpy as np
import pandas as pd

import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from wooldridge import *


## Example 13.1. Women’s Fertility over Time#

# Example 13.1: pooled OLS of `kids` on the regressors named in the formula,
# which include the year indicators y74 ... y84.
df = dataWoo('fertil1')

fert_formula = 'kids ~ educ + age + agesq + black + east + northcen + west + farm + othrural + town + smcity + y74 + y76 + y78 + y80 + y82 + y84'
fert_model = smf.ols(formula=fert_formula, data=df)
fert_reg = fert_model.fit()
print(fert_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                   kids   R-squared:                       0.130
Method:                 Least Squares   F-statistic:                     9.723
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           2.42e-24
Time:                        18:37:36   Log-Likelihood:                -2091.2
No. Observations:                1129   AIC:                             4218.
Df Residuals:                    1111   BIC:                             4309.
Df Model:                          17
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -7.7425      3.052     -2.537      0.011     -13.730      -1.755
educ          -0.1284      0.018     -6.999      0.000      -0.164      -0.092
age            0.5321      0.138      3.845      0.000       0.261       0.804
agesq         -0.0058      0.002     -3.710      0.000      -0.009      -0.003
black          1.0757      0.174      6.198      0.000       0.735       1.416
east           0.2173      0.133      1.637      0.102      -0.043       0.478
northcen       0.3631      0.121      3.004      0.003       0.126       0.600
west           0.1976      0.167      1.184      0.237      -0.130       0.525
farm          -0.0526      0.147     -0.357      0.721      -0.341       0.236
othrural      -0.1629      0.175     -0.928      0.353      -0.507       0.181
town           0.0844      0.125      0.677      0.498      -0.160       0.329
smcity         0.2119      0.160      1.322      0.187      -0.103       0.526
y74            0.2682      0.173      1.553      0.121      -0.071       0.607
y76           -0.0974      0.179     -0.544      0.587      -0.449       0.254
y78           -0.0687      0.182     -0.378      0.706      -0.425       0.288
y80           -0.0713      0.183     -0.390      0.697      -0.430       0.287
y82           -0.5225      0.172     -3.030      0.003      -0.861      -0.184
y84           -0.5452      0.175     -3.124      0.002      -0.888      -0.203
==============================================================================
Omnibus:                        9.775   Durbin-Watson:                   2.011
Prob(Omnibus):                  0.008   Jarque-Bera (JB):                9.966
Skew:                           0.227   Prob(JB):                      0.00685
Kurtosis:                       2.920   Cond. No.                     1.32e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.32e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

# Joint F test that all year-dummy coefficients in fert_reg are zero.
hypotheses = '(y74 =y76 = y78 = y80 = y82 = y84 = 0)'
f_test = fert_reg.f_test(r_matrix=hypotheses)
print(f_test)

<F test: F=5.869508671580268, p=4.85518986757229e-06, df_denom=1.11e+03, df_num=6>

# Breusch-Pagan test on the fert_reg residuals; the first two elements of the
# result are reported as the chi-square statistic and its p-value.
bptest = sms.diagnostic.het_breuschpagan(fert_reg.resid, fert_reg.model.exog)
df2 = pd.DataFrame([bptest[:2]], columns=['Chi-Sq', 'Prob>Chi-Sq'])
print(df2)

      Chi-Sq  Prob>Chi-Sq
0  55.315373     0.000006


## Example 13.2. Changes in the Return to Education and the Gender Wage Gap#

# Example 13.2: log-wage regression on the cps78_85 data, with y85
# interactions on educ (y85educ) and female (y85fem).
cps_data = dataWoo("cps78_85")
wage_formula = 'lwage ~ y85 + educ + y85educ + exper + expersq + union + female + y85fem'
wage_reg = smf.ols(wage_formula, data=cps_data).fit()
print(wage_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                  lwage   R-squared:                       0.426
Method:                 Least Squares   F-statistic:                     99.80
Date:                Mon, 11 Dec 2023   Prob (F-statistic):          4.46e-124
Time:                        18:37:36   Log-Likelihood:                -574.24
No. Observations:                1084   AIC:                             1166.
Df Residuals:                    1075   BIC:                             1211.
Df Model:                           8
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4589      0.093      4.911      0.000       0.276       0.642
y85            0.1178      0.124      0.952      0.341      -0.125       0.361
educ           0.0747      0.007     11.192      0.000       0.062       0.088
y85educ        0.0185      0.009      1.974      0.049       0.000       0.037
exper          0.0296      0.004      8.293      0.000       0.023       0.037
expersq       -0.0004   7.75e-05     -5.151      0.000      -0.001      -0.000
union          0.2021      0.030      6.672      0.000       0.143       0.262
female        -0.3167      0.037     -8.648      0.000      -0.389      -0.245
y85fem         0.0851      0.051      1.658      0.098      -0.016       0.186
==============================================================================
Omnibus:                       83.747   Durbin-Watson:                   1.918
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              317.985
Skew:                          -0.271   Prob(JB):                     8.92e-70
Kurtosis:                       5.597   Cond. No.                     8.77e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.


## Example 13.3. Effect of a Garbage Incinerator’s Location on Housing Prices#

# Example 13.3: regress rprice on nearinc separately for the 1981 and 1978
# rows of the kielmc data, then print both fits side by side.
df = dataWoo("kielmc")
garb81_reg = smf.ols('rprice ~ nearinc', data=df[(df['year']==1981)]).fit()
garb78_reg = smf.ols('rprice ~ nearinc', data=df[(df['year']==1978)]).fit()

# The summary_col call was previously left unterminated (info_dict brace and
# both parentheses missing); it is closed here. Column labels are unchanged so
# they still match the recorded output.
print(summary_col([garb81_reg, garb78_reg], stars=True, float_format='%0.3f',
                  model_names=['grab81\n(b/se)', 'grab78\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

==========================================
grab81        grab78
(b/se)        (b/se)
------------------------------------------
Intercept      101307.514*** 82517.228***
(3093.027)    (2653.790)
nearinc        -30688.274*** -18824.370***
(5827.709)    (4744.594)
R-squared      0.165         0.082
N              142           179
R2             0.165         0.082
==========================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## Table 13.2 Effects of Incinerator Location on Housing Prices (rprice)#

# Table 13.2: three pooled specifications for rprice on the kielmc data held
# in `df`. Each adds controls to the previous one; all include y81, nearinc
# and y81nrinc (presumably the y81*nearinc interaction -- confirm against the
# dataset documentation).
One = smf.ols('rprice ~ y81 + nearinc + y81nrinc', data=df).fit()
Two = smf.ols('rprice ~ y81 + nearinc + y81nrinc + age + agesq', data=df).fit()
Three = smf.ols('rprice ~ y81 + nearinc + y81nrinc + age + agesq + intst + land + area + rooms + baths', data=df).fit()

# The summary_col call was previously left unterminated; the info_dict brace
# and both closing parentheses are restored here.
print(summary_col([One, Two, Three], stars=True, float_format='%0.3f',
                  model_names=['One\n(b/se)', 'Two\n(b/se)', 'Three\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

========================================================
One           Two          Three
(b/se)        (b/se)        (b/se)
--------------------------------------------------------
Intercept      82517.228***  89116.535***  13807.665
(2726.910)    (2406.051)    (11166.594)
R-squared      0.174         0.414         0.660
age                          -1494.424***  -739.451***
(131.860)     (131.127)
agesq                        8.691***      3.453***
(0.848)       (0.813)
area                                       18.086***
(2.306)
baths                                      6977.317***
(2581.321)
intst                                      -0.539***
(0.196)
land                                       0.141***
(0.031)
nearinc        -18824.370*** 9397.936*     3780.337
(4875.322)    (4812.222)    (4453.415)
rooms                                      3304.227**
(1661.248)
y81            18790.286***  21321.042***  13928.476***
(4050.065)    (3443.631)    (2798.747)
y81nrinc       -11863.903    -21920.270*** -14177.934***
(7456.646)    (6359.745)    (4987.267)
N              321           321           321
R2             0.174         0.414         0.660
========================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

# Log-price versions of the first and third Table 13.2 specifications:
# lprice replaces rprice, and lintst/lland/larea replace intst/land/area.
lOne = smf.ols('lprice ~ y81 + nearinc + y81nrinc', data=df).fit()
lThree = smf.ols('lprice ~ y81 + nearinc + y81nrinc + age + agesq + lintst + lland + larea + rooms + baths', data=df).fit()

# The summary_col call was previously left unterminated; the info_dict brace
# and both closing parentheses are restored here.
print(summary_col([lOne, lThree], stars=True, float_format='%0.3f',
                  model_names=['lOne\n(b/se)', 'lThree\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

==================================
lOne     lThree
(b/se)    (b/se)
----------------------------------
Intercept      11.285*** 7.652***
(0.031)   (0.416)
R-squared      0.409     0.790
age                      -0.008***
(0.001)
agesq                    0.000***
(0.000)
baths                    0.094***
(0.028)
larea                    0.351***
(0.051)
lintst                   -0.061*
(0.032)
lland                    0.100***
(0.024)
nearinc        -0.340*** 0.032
(0.055)   (0.047)
rooms                    0.047***
(0.017)
y81            0.457***  0.426***
(0.045)   (0.028)
y81nrinc       -0.063    -0.132**
(0.083)   (0.052)
N              321       321
R2             0.409     0.790
==================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## Example 13.4. Effect of Worker Compensation Laws on Weeks out of Work#

# Example 13.4: regress ldurat on afchnge, highearn and afhigh using only the
# rows of the injury data where ky == 1.
df = dataWoo("injury")
ky_rows = df[df['ky'] == 1]
injury_reg = smf.ols('ldurat~ afchnge + highearn + afhigh', data=ky_rows).fit()
print(injury_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                 ldurat   R-squared:                       0.021
Method:                 Least Squares   F-statistic:                     39.54
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           2.81e-25
Time:                        18:37:37   Log-Likelihood:                -9322.0
No. Observations:                5626   AIC:                         1.865e+04
Df Residuals:                    5622   BIC:                         1.868e+04
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.1256      0.031     36.621      0.000       1.065       1.186
afchnge        0.0077      0.045      0.171      0.864      -0.080       0.095
highearn       0.2565      0.047      5.406      0.000       0.163       0.349
afhigh         0.1906      0.069      2.782      0.005       0.056       0.325
==============================================================================
Omnibus:                       29.931   Durbin-Watson:                   1.905
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               41.672
Skew:                           0.037   Prob(JB):                     8.93e-10
Kurtosis:                       3.415   Cond. No.                         6.38
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


## Example 13.5. Sleeping versus Working#

# Example 13.5: OLS of cslpnap on ctotwrk, ceduc, cmarr, cyngkid and cgdhlth
# using the slp75_81 data.
sleep_data = dataWoo("slp75_81")
sleep_formula = 'cslpnap ~ ctotwrk + ceduc + cmarr + cyngkid + cgdhlth'
sleep_reg = smf.ols(sleep_formula, data=sleep_data).fit()
print(sleep_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                cslpnap   R-squared:                       0.150
Method:                 Least Squares   F-statistic:                     8.191
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           3.83e-07
Time:                        18:37:37   Log-Likelihood:                -1864.4
No. Observations:                 239   AIC:                             3741.
Df Residuals:                     233   BIC:                             3762.
Df Model:                           5
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -92.6340     45.866     -2.020      0.045    -182.999      -2.269
ctotwrk       -0.2267      0.036     -6.287      0.000      -0.298      -0.156
ceduc         -0.0245     48.759     -0.001      1.000     -96.090      96.041
cmarr        104.2139     92.855      1.122      0.263     -78.729     287.157
cyngkid       94.6654     87.653      1.080      0.281     -78.027     267.358
cgdhlth       87.5778     76.599      1.143      0.254     -63.338     238.493
==============================================================================
Omnibus:                       31.927   Durbin-Watson:                   1.890
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               57.378
Skew:                          -0.719   Prob(JB):                     3.47e-13
Kurtosis:                       4.922   Cond. No.                     2.72e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.72e+03. This might indicate that there are
strong multicollinearity or other numerical problems.


## Example 13.6. Distributed Lag of Crime Rate on Clear-Up Rate#

# OLS of clcrime on cclrprc1 and cclrprc2 using the crime3 data.
crime3_data = dataWoo("crime3")
crime3_reg = smf.ols('clcrime ~ cclrprc1 + cclrprc2', data=crime3_data).fit()
print(crime3_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                clcrime   R-squared:                       0.193
Method:                 Least Squares   F-statistic:                     5.992
Date:                Mon, 11 Dec 2023   Prob (F-statistic):            0.00465
Time:                        18:37:37   Log-Likelihood:                -17.194
No. Observations:                  53   AIC:                             40.39
Df Residuals:                      50   BIC:                             46.30
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0857      0.064      1.343      0.185      -0.042       0.214
cclrprc1      -0.0040      0.005     -0.858      0.395      -0.014       0.005
cclrprc2      -0.0132      0.005     -2.540      0.014      -0.024      -0.003
==============================================================================
Omnibus:                        3.032   Durbin-Watson:                   2.203
Prob(Omnibus):                  0.220   Jarque-Bera (JB):                2.071
Skew:                          -0.344   Prob(JB):                        0.355
Kurtosis:                       3.681   Cond. No.                         23.6
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


## Example 13.8. Effect of Enterprise Zones on Unemployment Claims#

# OLS of guclms on the year dummies d82 ... d88 and cez, using the ezunem data.
ezunem_data = dataWoo("ezunem")
ezon_model = smf.ols('guclms ~ d82 + d83 + d84 + d85 + d86 + d87 + d88 + cez', data=ezunem_data)
ezon_reg = ezon_model.fit()
print(ezon_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                 guclms   R-squared:                       0.623
Method:                 Least Squares   F-statistic:                     34.50
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           1.08e-31
Time:                        18:37:37   Log-Likelihood:                 24.553
No. Observations:                 176   AIC:                            -31.11
Df Residuals:                     167   BIC:                            -2.573
Df Model:                           8
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.3216      0.046     -6.982      0.000      -0.413      -0.231
d82            0.7788      0.065     11.954      0.000       0.650       0.907
d83           -0.0331      0.065     -0.508      0.612      -0.162       0.095
d84           -0.0171      0.069     -0.250      0.803      -0.152       0.118
d85            0.3231      0.067      4.845      0.000       0.191       0.455
d86            0.2922      0.065      4.485      0.000       0.164       0.421
d87            0.0539      0.065      0.828      0.409      -0.075       0.183
d88           -0.0171      0.065     -0.262      0.794      -0.146       0.112
cez           -0.1819      0.078     -2.326      0.021      -0.336      -0.028
==============================================================================
Omnibus:                        0.858   Durbin-Watson:                   2.370
Prob(Omnibus):                  0.651   Jarque-Bera (JB):                0.871
Skew:                           0.166   Prob(JB):                        0.647
Kurtosis:                       2.905   Cond. No.                         8.96
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Convert the log-point estimate -0.1819 (the cez coefficient reported in the
# regression output) into an exact percentage change: 100 * (exp(b) - 1).
cez_coef = -.1819
cez = 100 * (np.exp(cez_coef) - 1)
cez

-16.631529528207743

# Breusch-Pagan test on the ezon_reg residuals; the first two elements of the
# result are reported as the chi-square statistic and its p-value.
bptest = sms.diagnostic.het_breuschpagan(ezon_reg.resid, ezon_reg.model.exog)
bptest2 = pd.DataFrame([bptest[:2]], columns=['Chi-Sq', 'Prob>Chi-Sq'])
print(bptest2)

     Chi-Sq  Prob>Chi-Sq
0  6.913966     0.545943


## Example 13.9. County Crime Rates in North Carolina#

# Example 13.9: the same specification for clcrmrte is fitted twice on the
# crime4 data -- once with the default (nonrobust) covariance and once with
# HC1 heteroskedasticity-robust standard errors -- so only the standard
# errors differ between the two columns.
df = dataWoo("crime4")
crime4_formula = 'clcrmrte ~  d83 + d84 + d85 + d86 + d87 + clprbarr + clprbcon + clprbpri + clavgsen + clpolpc + 1'
hetrosced_r = smf.ols(crime4_formula, data=df).fit()
robust_r = smf.ols(crime4_formula, data=df).fit(cov_type='HC1')

# The summary_col call was previously left unterminated; the info_dict brace
# and both closing parentheses are restored here. Column labels are unchanged
# so they still match the recorded output.
print(summary_col([hetrosced_r, robust_r], stars=True, float_format='%0.3f',
                  model_names=['Hetrosced\n(b/se)', 'Robust\n(b/se)'],
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared)}))

==================================
Hetrosced   Robust
(b/se)    (b/se)
----------------------------------
Intercept      0.008     0.008
(0.017)   (0.015)
d83            -0.100*** -0.100***
(0.024)   (0.022)
d84            -0.048**  -0.048**
(0.024)   (0.020)
d85            -0.005    -0.005
(0.023)   (0.024)
d86            0.028     0.028
(0.024)   (0.021)
d87            0.041*    0.041*
(0.024)   (0.024)
clprbarr       -0.327*** -0.327***
(0.030)   (0.051)
clprbcon       -0.238*** -0.238***
(0.018)   (0.031)
clprbpri       -0.165*** -0.165***
(0.026)   (0.035)
clavgsen       -0.022    -0.022
(0.022)   (0.025)
clpolpc        0.398***  0.398***
(0.027)   (0.076)
R-squared      0.433     0.433
N              540       540
R2             0.433     0.433

# Breusch-Pagan heteroskedasticity test on the residuals and design matrix of
# hetrosced_r (the fit that uses the default, nonrobust covariance).
bptest = sms.diagnostic.het_breuschpagan(hetrosced_r.resid, hetrosced_r.model.exog)

      Chi-Sq  Prob>Chi-Sq