import numpy as np
import pandas as pd
import scipy as sp
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from wooldridge import *
df = dataWoo('wage1')
# Log-wage on education, experience and tenure (Wooldridge wage1 data).
wage_model = smf.ols(formula='lwage ~ educ + exper + tenure + 1', data=df)
wage_multiple = wage_model.fit()
print(wage_multiple.summary())
OLS Regression Results ============================================================================== Dep. Variable: lwage R-squared: 0.316 Model: OLS Adj. R-squared: 0.312 Method: Least Squares F-statistic: 80.39 Date: Thu, 09 Apr 2020 Prob (F-statistic): 9.13e-43 Time: 17:08:10 Log-Likelihood: -313.55 No. Observations: 526 AIC: 635.1 Df Residuals: 522 BIC: 652.2 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 0.2844 0.104 2.729 0.007 0.080 0.489 educ 0.0920 0.007 12.555 0.000 0.078 0.106 exper 0.0041 0.002 2.391 0.017 0.001 0.008 tenure 0.0221 0.003 7.133 0.000 0.016 0.028 ============================================================================== Omnibus: 11.534 Durbin-Watson: 1.769 Prob(Omnibus): 0.003 Jarque-Bera (JB): 20.941 Skew: 0.021 Prob(JB): 2.84e-05 Kurtosis: 3.977 Cond. No. 135. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df = dataWoo('meap93')
# Level-level specification: math pass rate on compensation, staff and
# enrollment (Wooldridge meap93 data).
lin_lin_model = smf.ols(formula='math10 ~ totcomp + staff + enroll + 1', data=df)
math_lin_lin = lin_lin_model.fit()
print(math_lin_lin.summary())
OLS Regression Results ============================================================================== Dep. Variable: math10 R-squared: 0.054 Model: OLS Adj. R-squared: 0.047 Method: Least Squares F-statistic: 7.697 Date: Thu, 09 Apr 2020 Prob (F-statistic): 5.18e-05 Time: 17:08:10 Log-Likelihood: -1526.2 No. Observations: 408 AIC: 3060. Df Residuals: 404 BIC: 3076. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 2.2740 6.114 0.372 0.710 -9.745 14.293 totcomp 0.0005 0.000 4.570 0.000 0.000 0.001 staff 0.0479 0.040 1.204 0.229 -0.030 0.126 enroll -0.0002 0.000 -0.918 0.359 -0.001 0.000 ============================================================================== Omnibus: 31.666 Durbin-Watson: 1.669 Prob(Omnibus): 0.000 Jarque-Bera (JB): 40.628 Skew: 0.615 Prob(JB): 1.51e-09 Kurtosis: 3.936 Cond. No. 4.68e+05 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 4.68e+05. This might indicate that there are strong multicollinearity or other numerical problems.
# Level-log specification: same outcome, regressors in logs.
lin_log_model = smf.ols(formula='math10 ~ ltotcomp + lstaff + lenroll + 1', data=df)
math_lin_log = lin_log_model.fit()
print(math_lin_log.summary())
OLS Regression Results ============================================================================== Dep. Variable: math10 R-squared: 0.065 Model: OLS Adj. R-squared: 0.058 Method: Least Squares F-statistic: 9.420 Date: Thu, 09 Apr 2020 Prob (F-statistic): 4.97e-06 Time: 17:08:10 Log-Likelihood: -1523.7 No. Observations: 408 AIC: 3055. Df Residuals: 404 BIC: 3072. Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -207.6648 48.703 -4.264 0.000 -303.408 -111.922 ltotcomp 21.1550 4.056 5.216 0.000 13.182 29.128 lstaff 3.9800 4.190 0.950 0.343 -4.256 12.216 lenroll -1.2680 0.693 -1.829 0.068 -2.631 0.095 ============================================================================== Omnibus: 27.703 Durbin-Watson: 1.666 Prob(Omnibus): 0.000 Jarque-Bera (JB): 34.495 Skew: 0.568 Prob(JB): 3.23e-08 Kurtosis: 3.860 Cond. No. 1.34e+03 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.34e+03. This might indicate that there are strong multicollinearity or other numerical problems.
from statsmodels.iolib.summary2 import summary_col

# Side-by-side table of the level-level and level-log specifications,
# with observation count and R-squared appended below the coefficients.
extra_rows = {
    'N': lambda x: "{0:d}".format(int(x.nobs)),
    'R2': lambda x: "{:.3f}".format(x.rsquared),
}
comparison = summary_col(
    [math_lin_lin, math_lin_log],
    stars=True,
    float_format='%0.3f',
    model_names=['math10\n(Lin_Lin)', 'math10\n(Lin_Log)'],
    info_dict=extra_rows,
)
print(comparison)
=============================== math10 math10 (Lin_Lin) (Lin_Log) ------------------------------- Intercept 2.274 -207.665*** (6.114) (48.703) enroll -0.000 (0.000) lenroll -1.268* (0.693) lstaff 3.980 (4.190) ltotcomp 21.155*** (4.056) staff 0.048 (0.040) totcomp 0.000*** (0.000) N 408 408 R2 0.054 0.065 =============================== Standard errors in parentheses. * p<.1, ** p<.05, ***p<.01
df = dataWoo('gpa1')
# College GPA on high-school GPA, ACT score and classes skipped
# (Wooldridge gpa1 data).
gpa_model = smf.ols(formula='colGPA ~ hsGPA + ACT + skipped + 1', data=df)
gpa_mols = gpa_model.fit()
print(gpa_mols.summary())
OLS Regression Results ============================================================================== Dep. Variable: colGPA R-squared: 0.234 Model: OLS Adj. R-squared: 0.217 Method: Least Squares F-statistic: 13.92 Date: Thu, 09 Apr 2020 Prob (F-statistic): 5.65e-08 Time: 17:08:10 Log-Likelihood: -41.501 No. Observations: 141 AIC: 91.00 Df Residuals: 137 BIC: 102.8 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 1.3896 0.332 4.191 0.000 0.734 2.045 hsGPA 0.4118 0.094 4.396 0.000 0.227 0.597 ACT 0.0147 0.011 1.393 0.166 -0.006 0.036 skipped -0.0831 0.026 -3.197 0.002 -0.135 -0.032 ============================================================================== Omnibus: 1.917 Durbin-Watson: 1.881 Prob(Omnibus): 0.383 Jarque-Bera (JB): 1.636 Skew: 0.125 Prob(JB): 0.441 Kurtosis: 2.535 Cond. No. 300. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df = dataWoo('campus')
# Log campus crime on log enrollment (constant-elasticity model).
crime_model = smf.ols(formula='lcrime ~ lenroll + 1', data=df)
crime_ols = crime_model.fit()
print(crime_ols.summary())
OLS Regression Results ============================================================================== Dep. Variable: lcrime R-squared: 0.585 Model: OLS Adj. R-squared: 0.580 Method: Least Squares F-statistic: 133.8 Date: Thu, 09 Apr 2020 Prob (F-statistic): 7.83e-20 Time: 17:08:10 Log-Likelihood: -125.83 No. Observations: 97 AIC: 255.7 Df Residuals: 95 BIC: 260.8 Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -6.6314 1.034 -6.416 0.000 -8.683 -4.580 lenroll 1.2698 0.110 11.567 0.000 1.052 1.488 ============================================================================== Omnibus: 51.350 Durbin-Watson: 1.548 Prob(Omnibus): 0.000 Jarque-Bera (JB): 247.879 Skew: -1.628 Prob(JB): 1.49e-54 Kurtosis: 10.123 Cond. No. 108. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df = dataWoo('hprice2')
# ldist is computed here, not read from the dataset. Store it as a column of
# df so the patsy formula resolves it from `data=` directly; the original
# free-standing Series was only found via patsy's fallback lookup in the
# calling environment, which is fragile (breaks if df is re-filtered or the
# name is shadowed).
df['ldist'] = np.log(df.dist)
hprice_mols = smf.ols(formula='lprice ~ lnox +ldist + rooms + stratio + 1', data=df).fit()
print(hprice_mols.summary())
OLS Regression Results ============================================================================== Dep. Variable: lprice R-squared: 0.584 Model: OLS Adj. R-squared: 0.581 Method: Least Squares F-statistic: 175.9 Date: Thu, 09 Apr 2020 Prob (F-statistic): 5.53e-94 Time: 17:08:10 Log-Likelihood: -43.495 No. Observations: 506 AIC: 96.99 Df Residuals: 501 BIC: 118.1 Df Model: 4 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 11.0839 0.318 34.843 0.000 10.459 11.709 lnox -0.9535 0.117 -8.168 0.000 -1.183 -0.724 ldist -0.1343 0.043 -3.117 0.002 -0.219 -0.050 rooms 0.2545 0.019 13.736 0.000 0.218 0.291 stratio -0.0525 0.006 -8.894 0.000 -0.064 -0.041 ============================================================================== Omnibus: 61.317 Durbin-Watson: 0.682 Prob(Omnibus): 0.000 Jarque-Bera (JB): 480.143 Skew: 0.051 Prob(JB): 5.47e-105 Kurtosis: 7.771 Cond. No. 560. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df = dataWoo('401k')
# 401(k) participation rate on match rate, plan age and total employment.
pension_model = smf.ols(formula='prate ~ mrate + age + totemp + 1', data=df)
pension_multiple = pension_model.fit()
print(pension_multiple.summary())
OLS Regression Results ============================================================================== Dep. Variable: prate R-squared: 0.100 Model: OLS Adj. R-squared: 0.098 Method: Least Squares F-statistic: 56.38 Date: Thu, 09 Apr 2020 Prob (F-statistic): 1.45e-34 Time: 17:08:10 Log-Likelihood: -6416.1 No. Observations: 1534 AIC: 1.284e+04 Df Residuals: 1530 BIC: 1.286e+04 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 80.2941 0.778 103.242 0.000 78.769 81.820 mrate 5.4422 0.524 10.378 0.000 4.414 6.471 age 0.2692 0.045 5.963 0.000 0.181 0.358 totemp -0.0001 3.67e-05 -3.521 0.000 -0.000 -5.72e-05 ============================================================================== Omnibus: 364.684 Durbin-Watson: 1.906 Prob(Omnibus): 0.000 Jarque-Bera (JB): 755.421 Skew: -1.367 Prob(JB): 9.17e-165 Kurtosis: 5.083 Cond. No. 2.38e+04 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.38e+04. This might indicate that there are strong multicollinearity or other numerical problems.
df = dataWoo('jtrain')
# Show the dataset's variable descriptions. NOTE(review): in a plain script
# this bare call's return value is discarded — confirm whether dataWoo
# prints the description itself or relies on notebook echo.
dataWoo('jtrain', description=True)
# Restrict the sample to 1987 observations at non-union plants.
keep_1987_nonunion = (df['year'] == 1987) & (df['union'] == 0)
df = df[keep_1987_nonunion]
job_model = smf.ols(formula='lscrap ~ hrsemp + lsales + lemploy + 1', data=df)
job_multiple = job_model.fit()
print(job_multiple.summary())
OLS Regression Results ============================================================================== Dep. Variable: lscrap R-squared: 0.262 Model: OLS Adj. R-squared: 0.174 Method: Least Squares F-statistic: 2.965 Date: Thu, 09 Apr 2020 Prob (F-statistic): 0.0513 Time: 17:08:10 Log-Likelihood: -48.254 No. Observations: 29 AIC: 104.5 Df Residuals: 25 BIC: 110.0 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 12.4584 5.687 2.191 0.038 0.746 24.170 hrsemp -0.0293 0.023 -1.283 0.211 -0.076 0.018 lsales -0.9620 0.453 -2.126 0.044 -1.894 -0.030 lemploy 0.7615 0.407 1.869 0.073 -0.078 1.601 ============================================================================== Omnibus: 0.035 Durbin-Watson: 2.255 Prob(Omnibus): 0.983 Jarque-Bera (JB): 0.202 Skew: 0.066 Prob(JB): 0.904 Kurtosis: 2.613 Cond. No. 417. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df = dataWoo('rdchem')
# Log R&D spending on log sales and profit margin (Wooldridge rdchem data).
rd_model = smf.ols(formula='lrd ~ lsales + profmarg + 1', data=df)
lrd_ols = rd_model.fit()
print(lrd_ols.summary())
OLS Regression Results ============================================================================== Dep. Variable: lrd R-squared: 0.918 Model: OLS Adj. R-squared: 0.912 Method: Least Squares F-statistic: 162.2 Date: Thu, 09 Apr 2020 Prob (F-statistic): 1.79e-16 Time: 17:08:10 Log-Likelihood: -22.511 No. Observations: 32 AIC: 51.02 Df Residuals: 29 BIC: 55.42 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -4.3783 0.468 -9.355 0.000 -5.335 -3.421 lsales 1.0842 0.060 18.012 0.000 0.961 1.207 profmarg 0.0217 0.013 1.694 0.101 -0.004 0.048 ============================================================================== Omnibus: 0.670 Durbin-Watson: 1.859 Prob(Omnibus): 0.715 Jarque-Bera (JB): 0.671 Skew: 0.308 Prob(JB): 0.715 Kurtosis: 2.649 Cond. No. 70.6 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
df = dataWoo('bwght')
# motheduc and fatheduc contain missing values, so the unrestricted model
# drops rows the restricted one keeps (1191 vs 1388 obs in the original run).
# A nested-model F test is only valid when both models are estimated on the
# same observations, so restrict to the common estimation sample up front.
df = df.dropna(subset=['bwght', 'cigs', 'parity', 'faminc', 'motheduc', 'fatheduc'])
# Unrestricted model: includes parents' education.
birthw_ols = smf.ols(formula='bwght ~ cigs + parity + faminc + motheduc + fatheduc + 1', data=df).fit()
print(birthw_ols.summary())
# Restricted model: drops motheduc and fatheduc (H0: both coefficients = 0).
birthw_ols_r = smf.ols(formula='bwght ~ cigs + parity + faminc + 1', data=df).fit()
print(birthw_ols_r.summary())
OLS Regression Results ============================================================================== Dep. Variable: bwght R-squared: 0.035 Model: OLS Adj. R-squared: 0.033 Method: Least Squares F-statistic: 16.63 Date: Thu, 09 Apr 2020 Prob (F-statistic): 1.28e-10 Time: 17:08:11 Log-Likelihood: -6126.8 No. Observations: 1388 AIC: 1.226e+04 Df Residuals: 1384 BIC: 1.228e+04 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 114.2143 1.469 77.734 0.000 111.332 117.097 cigs -0.4772 0.092 -5.214 0.000 -0.657 -0.298 parity 1.6164 0.604 2.676 0.008 0.432 2.801 faminc 0.0979 0.029 3.355 0.001 0.041 0.155 ============================================================================== Omnibus: 118.381 Durbin-Watson: 1.922 Prob(Omnibus): 0.000 Jarque-Bera (JB): 638.175 Skew: -0.156 Prob(JB): 2.64e-139 Kurtosis: 6.307 Cond. No. 98.8 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import statsmodels.stats as ss
# Import anova_lm explicitly: `ss.anova` only resolves because importing
# statsmodels.api happened to load the submodule earlier, which is fragile.
from statsmodels.stats.anova import anova_lm

# F test of the restricted vs. unrestricted birth-weight models.
# print() is required in a plain script — a bare expression's return value
# (the anova DataFrame) would otherwise be silently discarded.
print(anova_lm(birthw_ols_r, birthw_ols))
df_resid | ssr | df_diff | ss_diff | F | Pr(>F) | |
---|---|---|---|---|---|---|
0 | 1384.0 | 554615.198655 | 0.0 | NaN | NaN | NaN |
1 | 1185.0 | 464041.135130 | 199.0 | 90574.063525 | 1.162285 | 0.075222 |
df = dataWoo('attend')
# Simple regression: attendance rate on prior GPA only.
attend_simple = smf.ols(formula='atndrte ~ priGPA + 1', data=df)
attend_ols_r = attend_simple.fit()
print(attend_ols_r.summary())
OLS Regression Results ============================================================================== Dep. Variable: atndrte R-squared: 0.182 Model: OLS Adj. R-squared: 0.181 Method: Least Squares F-statistic: 151.3 Date: Thu, 09 Apr 2020 Prob (F-statistic): 1.54e-31 Time: 17:08:11 Log-Likelihood: -2824.3 No. Observations: 680 AIC: 5653. Df Residuals: 678 BIC: 5662. Df Model: 1 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 47.1270 2.873 16.406 0.000 41.487 52.767 priGPA 13.3690 1.087 12.302 0.000 11.235 15.503 ============================================================================== Omnibus: 167.813 Durbin-Watson: 1.942 Prob(Omnibus): 0.000 Jarque-Bera (JB): 368.128 Skew: -1.331 Prob(JB): 1.15e-80 Kurtosis: 5.431 Cond. No. 14.6 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Multiple regression: add ACT score to the attendance equation.
attend_full = smf.ols(formula='atndrte ~ priGPA + ACT + 1', data=df)
attend_ols = attend_full.fit()
print(attend_ols.summary())
OLS Regression Results ============================================================================== Dep. Variable: atndrte R-squared: 0.291 Model: OLS Adj. R-squared: 0.288 Method: Least Squares F-statistic: 138.7 Date: Thu, 09 Apr 2020 Prob (F-statistic): 3.39e-51 Time: 17:08:11 Log-Likelihood: -2776.1 No. Observations: 680 AIC: 5558. Df Residuals: 677 BIC: 5572. Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 75.7004 3.884 19.490 0.000 68.074 83.327 priGPA 17.2606 1.083 15.936 0.000 15.134 19.387 ACT -1.7166 0.169 -10.156 0.000 -2.048 -1.385 ============================================================================== Omnibus: 126.367 Durbin-Watson: 2.011 Prob(Omnibus): 0.000 Jarque-Bera (JB): 237.444 Skew: -1.079 Prob(JB): 2.75e-52 Kurtosis: 4.929 Cond. No. 163. ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Tabulate the simple and multiple attendance regressions side by side.
attend_info = {
    'N': lambda x: "{0:d}".format(int(x.nobs)),
    'R2': lambda x: "{:.3f}".format(x.rsquared),
}
attend_table = summary_col(
    [attend_ols_r, attend_ols],
    stars=True,
    float_format='%0.3f',
    model_names=['attend_ols_r', 'attend_ols'],
    info_dict=attend_info,
)
print(attend_table)
================================= attend_ols_r attend_ols --------------------------------- ACT -1.717*** (0.169) Intercept 47.127*** 75.700*** (2.873) (3.884) priGPA 13.369*** 17.261*** (1.087) (1.083) N 680 680 R2 0.182 0.291 ================================= Standard errors in parentheses. * p<.1, ** p<.05, ***p<.01
df = dataWoo('meap93')
# Three nested specifications for log teacher salary, adding school-size
# controls and then dropout/graduation rates.
meap_ols1 = smf.ols(formula='lsalary ~ bensal + 1', data=df).fit()
meap_ols2 = smf.ols(formula='lsalary ~ bensal + lenroll + lstaff + 1', data=df).fit()
meap_ols3 = smf.ols(formula='lsalary ~ bensal + lenroll + lstaff + droprate + gradrate + 1', data=df).fit()
meap_info = {
    'N': lambda x: "{0:d}".format(int(x.nobs)),
    'R2': lambda x: "{:.2f}".format(x.rsquared),
}
meap_table = summary_col(
    [meap_ols1, meap_ols2, meap_ols3],
    stars=True,
    float_format='%0.2f',
    model_names=['meap_ols1', 'meap_ols2', 'meap_ols3'],
    info_dict=meap_info,
)
print(meap_table)
======================================= meap_ols1 meap_ols2 meap_ols3 --------------------------------------- Intercept 10.52*** 10.84*** 10.74*** (0.04) (0.25) (0.26) bensal -0.83*** -0.60*** -0.59*** (0.20) (0.17) (0.16) droprate -0.00 (0.00) gradrate 0.00 (0.00) lenroll 0.09*** 0.09*** (0.01) (0.01) lstaff -0.22*** -0.22*** (0.05) (0.05) N 408 408 408 R2 0.04 0.35 0.36 ======================================= Standard errors in parentheses. * p<.1, ** p<.05, ***p<.01