Verbeek5. Modern Econometrics Using R - Chap 3

### Chapter 3. Interpreting and Comparing Regression Models

#### Table 3.1

3.4 Illustration: Explaining House Prices Load libraries

library(haven)
library(stargazer)
library(dplyr)
library(MASS)
library(AER)

OLS results hedonic price function

df <- read_dta("Data/housing.dta")
OLS1 = lm(log(price)~log(lotsize) + bedrooms + bathrms + airco,data=df)
summary(OLS1)
##
## Call:
## lm(formula = log(price) ~ log(lotsize) + bedrooms + bathrms +
##     airco, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -0.81782 -0.15562  0.00778  0.16468  0.84143
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)   7.09378    0.23155  30.636  < 2e-16 ***
## log(lotsize)  0.40042    0.02781  14.397  < 2e-16 ***
## bedrooms      0.07770    0.01549   5.017 7.11e-07 ***
## bathrms       0.21583    0.02300   9.386  < 2e-16 ***
## airco         0.21167    0.02372   8.923  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2456 on 541 degrees of freedom
## Multiple R-squared:  0.5674, Adjusted R-squared:  0.5642
## F-statistic: 177.4 on 4 and 541 DF,  p-value: < 2.2e-16

#### Table 3.2

OLS results hedonic price function, extended model

OLS2 = lm(log(price)~ log(lotsize) + bedrooms + bathrms + airco + driveway +
recroom + fullbase + gashw + garagepl + prefarea + stories, data=df)
summary(OLS2)
##
## Call:
## lm(formula = log(price) ~ log(lotsize) + bedrooms + bathrms +
##     airco + driveway + recroom + fullbase + gashw + garagepl +
##     prefarea + stories, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -0.68355 -0.12247  0.00802  0.12780  0.67564
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)   7.74509    0.21634  35.801  < 2e-16 ***
## log(lotsize)  0.30313    0.02669  11.356  < 2e-16 ***
## bedrooms      0.03440    0.01427   2.410 0.016294 *
## bathrms       0.16576    0.02033   8.154 2.52e-15 ***
## airco         0.16642    0.02134   7.799 3.29e-14 ***
## driveway      0.11020    0.02823   3.904 0.000107 ***
## recroom       0.05797    0.02605   2.225 0.026482 *
## fullbase      0.10449    0.02169   4.817 1.90e-06 ***
## gashw         0.17902    0.04389   4.079 5.22e-05 ***
## garagepl      0.04795    0.01148   4.178 3.43e-05 ***
## prefarea      0.13185    0.02267   5.816 1.04e-08 ***
## stories       0.09169    0.01261   7.268 1.30e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2104 on 534 degrees of freedom
## Multiple R-squared:  0.6865, Adjusted R-squared:  0.6801
## F-statistic: 106.3 on 11 and 534 DF,  p-value: < 2.2e-16
linearHypothesis(OLS2, c("driveway = 0", "recroom =0","fullbase = 0", "gashw =0","garagepl = 0", "prefarea =0", "stories=0"))
## Linear hypothesis test
##
## Hypothesis:
## driveway = 0
## recroom = 0
## fullbase = 0
## gashw = 0
## garagepl = 0
## prefarea = 0
## stories = 0
##
## Model 1: restricted model
## Model 2: log(price) ~ log(lotsize) + bedrooms + bathrms + airco + driveway +
##     recroom + fullbase + gashw + garagepl + prefarea + stories
##
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1    541 32.622
## 2    534 23.638  7    8.9839 28.993 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#### Table 3.3

OLS results hedonic price function, linear model

summary(OLS3 <- lm(price~lotsize + bedrooms + bathrms + airco +  driveway +
recroom + fullbase + gashw + garagepl + prefarea + stories,data=df))
##
## Call:
## lm(formula = price ~ lotsize + bedrooms + bathrms + airco + driveway +
##     recroom + fullbase + gashw + garagepl + prefarea + stories,
##     data = df)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -41389  -9307   -591   7353  74875
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4038.3504  3409.4713  -1.184 0.236762
## lotsize         3.5463     0.3503  10.124  < 2e-16 ***
## bedrooms     1832.0035  1047.0002   1.750 0.080733 .
## bathrms     14335.5585  1489.9209   9.622  < 2e-16 ***
## airco       12632.8904  1555.0211   8.124 3.15e-15 ***
## driveway     6687.7789  2045.2458   3.270 0.001145 **
## recroom      4511.2838  1899.9577   2.374 0.017929 *
## fullbase     5452.3855  1588.0239   3.433 0.000642 ***
## gashw       12831.4063  3217.5971   3.988 7.60e-05 ***
## garagepl     4244.8290   840.5442   5.050 6.07e-07 ***
## prefarea     9369.5132  1669.0907   5.614 3.19e-08 ***
## stories      6556.9457   925.2899   7.086 4.37e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15420 on 534 degrees of freedom
## Multiple R-squared:  0.6731, Adjusted R-squared:  0.6664
## F-statistic: 99.97 on 11 and 534 DF,  p-value: < 2.2e-16

#### Table 3.4

df <- read_dta("Data/predictsp5.dta")
#library(dplyr)
df <- df %>%
mutate(bm1 = lag(b_m, n = 1),
dfr1 = lag(dfr, n = 1),
dfy1 = lag(dfy, n = 1),
logdp1 = lag(logdp, n = 1),
logdy1 = lag(logdy, n = 1),
logep1 = lag(logep, n = 1),
infl2 = lag(infl, n = 2),
ltr1 = lag(ltr, n = 1),
lty1 = lag(lty, n = 1),
tms1 = lag(tms, n = 1))
summary(Full <- lm(exret ~ bm1 + dfr1 + dfy1 + logdp1 + logdy1 + logep1 + infl2 + ltr1 + lty1 + tms1 + winter,
data= df, subset = yyyymm<="200312" ))
##
## Call:
## lm(formula = exret ~ bm1 + dfr1 + dfy1 + logdp1 + logdy1 + logep1 +
##     infl2 + ltr1 + lty1 + tms1 + winter, data = df, subset = yyyymm <=
##     "200312")
##
## Residuals:
##       Min        1Q    Median        3Q       Max
## -0.212034 -0.023642  0.000755  0.026591  0.158504
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.201192   0.062923   3.197 0.001456 **
## bm1         -0.036863   0.017917  -2.057 0.040054 *
## dfr1         0.174267   0.168065   1.037 0.300177
## dfy1         1.705778   0.687145   2.482 0.013307 *
## logdp1       0.051822   0.042463   1.220 0.222768
## logdy1      -0.055053   0.040887  -1.346 0.178627
## logep1       0.037715   0.012417   3.037 0.002485 **
## infl2       -0.360527   0.647676  -0.557 0.577965
## ltr1         0.176099   0.075575   2.330 0.020113 *
## lty1        -0.349943   0.097892  -3.575 0.000377 ***
## tms1         0.414244   0.149456   2.772 0.005741 **
## winter       0.009545   0.003262   2.927 0.003549 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0409 on 634 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.07494,    Adjusted R-squared:  0.05889
## F-statistic: 4.669 on 11 and 634 DF,  p-value: 6.99e-07

Stepwise regression: excludes regressors with t-ratio smaller than 1.96

df2 <- subset(df, df$yyyymm<="200312") summary(Stepwise <- lm(exret ~ bm1 + dfy1 + logep1 + lty1 + tms1 + winter, data= df2 )) ## ## Call: ## lm(formula = exret ~ bm1 + dfy1 + logep1 + lty1 + tms1 + winter, ## data = df2) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.220090 -0.024466 0.000701 0.026150 0.167360 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.208080 0.053876 3.862 0.000124 *** ## bm1 -0.041395 0.015614 -2.651 0.008219 ** ## dfy1 2.012768 0.653421 3.080 0.002156 ** ## logep1 0.035245 0.008954 3.936 9.18e-05 *** ## lty1 -0.366410 0.086869 -4.218 2.82e-05 *** ## tms1 0.401002 0.134763 2.976 0.003034 ** ## winter 0.009089 0.003238 2.807 0.005156 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.04093 on 640 degrees of freedom ## (1 observation deleted due to missingness) ## Multiple R-squared: 0.06538, Adjusted R-squared: 0.05662 ## F-statistic: 7.462 on 6 and 640 DF, p-value: 9.683e-08 Max adjusted R-square / Min AIC library("leaps") leaps <- regsubsets( exret ~ bm1 + dfr1 + dfy1 + logdp1 + logdy1 + logep1 + infl2 + ltr1 + lty1 + tms1 + winter, data= df2, nbest=10) par(mar=c(1, 2.1, 1.1, 1)) plot(leaps,scale="adjr2") summary(minAIC <- lm(exret~bm1 + dfy1 + logep1 + ltr1 + lty1 + tms1 + winter, data=df2)) ## ## Call: ## lm(formula = exret ~ bm1 + dfy1 + logep1 + ltr1 + lty1 + tms1 + ## winter, data = df2) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.214873 -0.023876 0.001484 0.026213 0.163967 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.201568 0.053865 3.742 0.000199 *** ## bm1 -0.038220 0.015666 -2.440 0.014970 * ## dfy1 1.737098 0.667310 2.603 0.009452 ** ## logep1 0.034227 0.008950 3.824 0.000144 *** ## ltr1 0.122555 0.063155 1.941 0.052755 . ## lty1 -0.350817 0.087053 -4.030 6.25e-05 *** ## tms1 0.417569 0.134743 3.099 0.002027 ** ## winter 0.009189 0.003232 2.844 0.004603 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.04084 on 639 degrees of freedom ## (1 observation deleted due to missingness) ## Multiple R-squared: 0.07086, Adjusted R-squared: 0.06068 ## F-statistic: 6.961 on 7 and 639 DF, p-value: 5.234e-08 Alternatively, #library(MASS) df <- na.omit(df) AIC <- stepAIC(Full, direction = "both", trace = F) summary(AIC) ## ## Call: ## lm(formula = exret ~ bm1 + dfy1 + logep1 + ltr1 + lty1 + tms1 + ## winter, data = df, subset = yyyymm <= "200312") ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.214809 -0.023822 0.001438 0.026190 0.163974 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.203662 0.054140 3.762 0.000184 *** ## bm1 -0.038576 0.015700 -2.457 0.014272 * ## dfy1 1.747346 0.668210 2.615 0.009135 ** ## logep1 0.034574 0.008996 3.843 0.000133 *** ## ltr1 0.122084 0.063207 1.931 0.053865 . ## lty1 -0.353789 0.087410 -4.047 5.81e-05 *** ## tms1 0.418881 0.134869 3.106 0.001982 ** ## winter 0.009236 0.003236 2.854 0.004451 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.04086 on 638 degrees of freedom ## Multiple R-squared: 0.07098, Adjusted R-squared: 0.06078 ## F-statistic: 6.963 on 7 and 638 DF, p-value: 5.21e-08 Min BIC par(mar=c(1, 2.1, 1.1, 1)) plot(leaps, scale="bic") summary(minBIC <- lm(exret ~ logep1 + ltr1 + lty1 + tms1 + winter, data=df2)) ## ## Call: ## lm(formula = exret ~ logep1 + ltr1 + lty1 + tms1 + winter, data = df2) ## ## Residuals: ## Min 1Q Median 3Q Max ## -0.213682 -0.024862 0.000898 0.026814 0.154582 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 0.094022 0.024134 3.896 0.000108 *** ## logep1 0.016817 0.004437 3.790 0.000165 *** ## ltr1 0.158186 0.062029 2.550 0.010998 * ## lty1 -0.212101 0.062482 -3.395 0.000730 *** ## tms1 0.512621 0.130553 3.927 9.55e-05 *** ## winter 0.010111 0.003232 3.128 0.001838 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.04105 on 641 degrees of freedom ## (1 observation deleted due to missingness) ## Multiple R-squared: 0.05816, Adjusted R-squared: 0.05082 ## F-statistic: 7.917 on 5 and 641 DF, p-value: 2.993e-07 #### Table 3.4 Contâ€™d stargazer(Full, minAIC, minBIC, Stepwise, no.space=TRUE, type="text", keep.stat = c("ser", "N", "rsq", "adj.rsq" ), column.labels = c("Full", "Min AIC", "Min BIC", "Stepwise"), title ="Table 3.4 Forecasting equation S&P 500 excess returns") ## ## Table 3.4 Forecasting equation S&P 500 excess returns ## ======================================================================================= ## Dependent variable: ## ------------------------------------------------------------------- ## exret ## Full Min AIC Min BIC Stepwise ## (1) (2) (3) (4) ## --------------------------------------------------------------------------------------- ## bm1 -0.037** -0.038** -0.041*** ## (0.018) (0.016) (0.016) ## dfr1 0.174 ## (0.168) ## dfy1 1.706** 1.737*** 2.013*** ## (0.687) (0.667) (0.653) ## logdp1 0.052 ## (0.042) ## logdy1 -0.055 ## (0.041) ## logep1 0.038*** 0.034*** 0.017*** 0.035*** ## (0.012) (0.009) (0.004) (0.009) ## infl2 -0.361 ## (0.648) ## ltr1 0.176** 0.123* 0.158** ## (0.076) (0.063) (0.062) ## lty1 -0.350*** -0.351*** -0.212*** -0.366*** ## (0.098) (0.087) (0.062) (0.087) ## tms1 0.414*** 0.418*** 0.513*** 0.401*** ## (0.149) (0.135) (0.131) (0.135) ## winter 0.010*** 0.009*** 0.010*** 0.009*** ## (0.003) (0.003) (0.003) (0.003) ## Constant 0.201*** 0.202*** 0.094*** 0.208*** ## (0.063) (0.054) (0.024) (0.054) ## --------------------------------------------------------------------------------------- ## Observations 646 647 647 647 ## R2 0.075 0.071 0.058 0.065 ## Adjusted R2 0.059 0.061 0.051 0.057 ## Residual Std. Error 0.041 (df = 634) 0.041 (df = 639) 0.041 (df = 641) 0.041 (df = 640) ## ======================================================================================= ## Note: *p<0.1; **p<0.05; ***p<0.01 #### Table 3.5 Out-of-sample forecasting performance Jan 2004-Dec 2013 Forcast df3 <- subset(df, df$yyyymm>"200312")
attach(df3)
Full_frc = cbind(1, bm1, dfr1, dfy1, logdp1, logdy1, logep1, infl2, ltr1, lty1, tms1, winter)%*%Full$coef minAIC_frc <- cbind(1, bm1, dfy1, logep1, ltr1, lty1, tms1, winter)%*%minAIC$coef
minBIC_frc <- cbind(1, logep1, ltr1, lty1, tms1, winter)%*%minBIC$coef Stepwise_frc <- cbind(1, bm1, dfy1, logep1, lty1, tms1, winter)%*%Stepwise$coef

Forecast performance: Full model

Full_frc_er <- Full_frc - exret
Full_rmse <- sqrt(sum(Full_frc_er^2)/length(Full_frc_er))*100
Full_r2os = (1-sum(Full_frc_er^2)/sum((mean(df2$exret)-df3$exret)^2) )*100
Full_r2ps <- cor(Full_frc, exret, method="pearson")*100

Hit <- data.frame(forecast <- ifelse(Full_frc>0,1,0),
actual=ifelse(df3$exret>0,1,0)) Hit_matrix <- xtabs(~Hit$forecast + Hit$actual) Full_hit_rate <- (sum(diag(Hit_matrix))/nrow(Hit))*100 Full_forecast_perf <- round(c(Full_rmse, Full_mad, Full_r2os, Full_r2ps, Full_hit_rate),3) Full_forecast_perf ## [1] 4.800 3.416 -31.412 0.040 58.333 Forecast performance: Max R-sqaure / Min AIC model minAIC_frc_er <- minAIC_frc - exret minAIC_rmse <- sqrt(sum(minAIC_frc_er^2)/length(minAIC_frc_er))*100 minAIC_mad <- sum(abs(minAIC_frc_er))/length(minAIC_frc_er)*100 minAIC_r2os = (1-sum(minAIC_frc_er^2)/sum((mean(df2$exret)-df3$exret)^2) )*100 minAIC_r2ps <- cor(minAIC_frc, exret, method="pearson")*100 Hit <- data.frame(forecast <- ifelse(minAIC_frc>0,1,0), actual=ifelse(df3$exret>0,1,0))
Hit_matrix <- xtabs(~Hit$forecast + Hit$actual)
minAIC_hit_rate <- (sum(diag(Hit_matrix))/nrow(Hit))*100

minAIC_forecast_perf <- round(c(minAIC_rmse, minAIC_mad, minAIC_r2os, minAIC_r2ps, minAIC_hit_rate),3)
minAIC_forecast_perf #In percentage 
## [1]   4.726   3.339 -27.345  -0.535  59.167

Forecast performance: Min BIC model

minBIC_frc_er <- minBIC_frc - exret
minBIC_rmse <- sqrt(sum(minBIC_frc_er^2)/length(minBIC_frc_er))*100
minBIC_r2os = (1-sum(minBIC_frc_er^2)/sum((mean(df2$exret)-df3$exret)^2) )*100
minBIC_r2ps <- cor(minBIC_frc, exret, method="pearson")*100

Hit <- data.frame(forecast <- ifelse(minBIC_frc>0,1,0),
actual=ifelse(exret>0,1,0))
Hit_matrix <- xtabs(~Hit$forecast + Hit$actual)
minBIC_hit_rate <- (sum(diag(Hit_matrix))/nrow(Hit))*100

minBIC_forecast_perf <- round(c(minBIC_rmse, minBIC_mad, minBIC_r2os, minBIC_r2ps, minBIC_hit_rate),3)
minBIC_forecast_perf
## [1]  4.306  3.107 -5.725  7.522 56.667
Stepwise_frc_er <- Stepwise_frc - exret
Stepwise_rmse <- sqrt(sum(Stepwise_frc_er^2)/length(Stepwise_frc_er))*100
Stepwise_r2os = (1-sum(Stepwise_frc_er^2)/sum((mean(df2$exret)-df3$exret)^2) )*100
Stepwise_r2ps <- cor(Stepwise_frc, exret, method="pearson")*100

Hit <- data.frame(forecast <- ifelse(Stepwise_frc>0,1,0),
actual=ifelse(df3$exret>0,1,0)) Hit_matrix <- xtabs(~Hit$forecast + Hit$actual) Stepwise_hit_rate <- (sum(diag(Hit_matrix))/nrow(Hit))*100 Stepwise_forecast_perf <- round(c(Stepwise_rmse, Stepwise_mad, Stepwise_r2os, Stepwise_r2ps, Stepwise_hit_rate),3) Stepwise_forecast_perf ## [1] 4.787 3.402 -30.685 -3.309 60.000 Table 3.5 final forecast_perf = cbind(Full_forecast_perf, minBIC_forecast_perf, minBIC_forecast_perf, Stepwise_forecast_perf) colnames(forecast_perf)=c("Full","Min AIC","Min BIC", "Stepwise") rownames(forecast_perf)=c("RMSE","MAD","R2os","R2ps","HitRate") forecast_perf ## Full Min AIC Min BIC Stepwise ## RMSE 4.800 4.306 4.306 4.787 ## MAD 3.416 3.107 3.107 3.402 ## R2os -31.412 -5.725 -5.725 -30.685 ## R2ps 0.040 7.522 7.522 -3.309 ## HitRate 58.333 56.667 56.667 60.000 detach(df3) #### Table 3.6 Summary statistics, 1472 individuals df <- read_dta("Data/Bwages.dta") df2 <- as.data.frame(cbind(wage=df$wage, educ=df$educ, exper=df$exper, male=df\$male))
ag <- aggregate(. ~ male, df2, function(x) c(mean = mean(x), sd = sd(x)))
table3.2 <- do.call("data.frame", ag)
round(table3.2, 2)
##   male wage.mean wage.sd educ.mean educ.sd exper.mean exper.sd
## 1    0     10.26    3.81      3.59    1.09      15.20     9.70
## 2    1     11.56    4.75      3.24    1.26      18.52    10.25

#### Table 3.7

OLS results specification 1

summary(OLS1<-lm(wage ~ male + educ + exper, data=df))
##
## Call:
## lm(formula = wage ~ male + educ + exper, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -13.5294  -1.9686  -0.3124   1.5679  30.7015
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.213692   0.386895   0.552    0.581
## male        1.346144   0.192736   6.984 4.32e-12 ***
## educ        1.986090   0.080640  24.629  < 2e-16 ***
## exper       0.192275   0.009583  20.064  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.548 on 1468 degrees of freedom
## Multiple R-squared:  0.3656, Adjusted R-squared:  0.3643
## F-statistic:   282 on 3 and 1468 DF,  p-value: < 2.2e-16

#### Table 3.8

OLS results specification 2

summary(OLS2<-lm(wage ~ male + educ + exper + I(exper^2),data=df))
##
## Call:
## lm(formula = wage ~ male + educ + exper + I(exper^2), data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -12.7246  -1.9519  -0.3107   1.5117  30.5951
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8924851  0.4329127  -2.062   0.0394 *
## male         1.3336934  0.1908668   6.988 4.23e-12 ***
## educ         1.9881267  0.0798526  24.897  < 2e-16 ***
## exper        0.3579993  0.0316566  11.309  < 2e-16 ***
## I(exper^2)  -0.0043692  0.0007962  -5.487 4.80e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.514 on 1467 degrees of freedom
## Multiple R-squared:  0.3783, Adjusted R-squared:  0.3766
## F-statistic: 223.2 on 4 and 1467 DF,  p-value: < 2.2e-16

#### Figure 3.1

Residual <- resid(OLS2)
Fitted <- predict(OLS2)
par(mar=c(2, 2.1, 1.1, 1))
plot(Fitted, Residual, xlim = c(0, 20), ylim = c(-20, 40),
xlab="Fitted value", ylab="Residual", cex.main=0.8,
main = "Figure 3.1 Residuals versus fitted values, linear model")

#### Table 3.9

OLS results specification 3 and the F-test

summary(OLS3 <- lm(lnwage ~ male + lneduc + lnexper + I(lnexper^2), data=df))
##
## Call:
## lm(formula = lnwage ~ male + lneduc + lnexper + I(lnexper^2),
##     data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.75085 -0.15921  0.00618  0.17145  1.10533
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)   1.26271    0.06634  19.033  < 2e-16 ***
## male          0.11794    0.01557   7.574 6.35e-14 ***
## lneduc        0.44218    0.01819  24.306  < 2e-16 ***
## lnexper       0.10982    0.05438   2.019   0.0436 *
## I(lnexper^2)  0.02601    0.01148   2.266   0.0236 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2862 on 1467 degrees of freedom
## Multiple R-squared:  0.3783, Adjusted R-squared:  0.3766
## F-statistic: 223.1 on 4 and 1467 DF,  p-value: < 2.2e-16
linearHypothesis(OLS3, c("lnexper = 0", "I(lnexper^2) =0"))
## Linear hypothesis test
##
## Hypothesis:
## lnexper = 0
## I(lnexper^2) = 0
##
## Model 1: restricted model
## Model 2: lnwage ~ male + lneduc + lnexper + I(lnexper^2)
##
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1   1469 158.57
## 2   1467 120.20  2    38.362 234.09 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#### Figure 3.2

Residual <- resid(OLS3)
Fitted <- predict(OLS3)
par(mar=c(2, 3.1, 1.1, 2.1))
plot(Fitted, Residual, xlim = c(1, 3), ylim = c(-2, 2),
main = "Figure 3.1 Residuals against fitted values, loglinear model",
cex.main=0.8)

#### Table 3.10

OLS results specification 4

summary(OLS4 <- lm(lnwage ~ male + lneduc + lnexper, data=df))
##
## Call:
## lm(formula = lnwage ~ male + lneduc + lnexper, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.70477 -0.15862  0.00485  0.17366  1.11815
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.14473    0.04118  27.798  < 2e-16 ***
## male         0.12008    0.01556   7.715 2.22e-14 ***
## lneduc       0.43662    0.01805  24.188  < 2e-16 ***
## lnexper      0.23065    0.01073  21.488  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2867 on 1468 degrees of freedom
## Multiple R-squared:  0.3761, Adjusted R-squared:  0.3748
## F-statistic:   295 on 3 and 1468 DF,  p-value: < 2.2e-16

#### Table 3.11

OLS results specification 5 and the F-test

summary(OLS5 <- lm(lnwage ~ male + factor(educ) + lnexper, data=df))
##
## Call:
## lm(formula = lnwage ~ male + factor(educ) + lnexper, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.65548 -0.15138  0.01324  0.16998  1.11684
##
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)    1.27189    0.04483  28.369  < 2e-16 ***
## male           0.11762    0.01546   7.610 4.88e-14 ***
## factor(educ)2  0.14364    0.03336   4.306 1.77e-05 ***
## factor(educ)3  0.30487    0.03202   9.521  < 2e-16 ***
## factor(educ)4  0.47428    0.03301  14.366  < 2e-16 ***
## factor(educ)5  0.63910    0.03322  19.237  < 2e-16 ***
## lnexper        0.23022    0.01056  21.804  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.282 on 1465 degrees of freedom
## Multiple R-squared:  0.3976, Adjusted R-squared:  0.3951
## F-statistic: 161.1 on 6 and 1465 DF,  p-value: < 2.2e-16
anova(OLS4,OLS5)
## Analysis of Variance Table
##
## Model 1: lnwage ~ male + lneduc + lnexper
## Model 2: lnwage ~ male + factor(educ) + lnexper
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)
## 1   1468 120.62
## 2   1465 116.47  3    4.1563 17.427 4.047e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#### Table 3.12

OLS results specification 6 and the F-test

summary(OLS6 <- lm(lnwage ~ male*factor(educ) + lnexper + lnexper*male, data=df))
##
## Call:
## lm(formula = lnwage ~ male * factor(educ) + lnexper + lnexper *
##     male, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.63955 -0.15328  0.01225  0.16647  1.11698
##
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)         1.21584    0.07768  15.652  < 2e-16 ***
## male                0.15375    0.09522   1.615 0.106595
## factor(educ)2       0.22411    0.06758   3.316 0.000935 ***
## factor(educ)3       0.43319    0.06323   6.851 1.08e-11 ***
## factor(educ)4       0.60191    0.06280   9.585  < 2e-16 ***
## factor(educ)5       0.75491    0.06467  11.673  < 2e-16 ***
## lnexper             0.20744    0.01655  12.535  < 2e-16 ***
## male:factor(educ)2 -0.09651    0.07770  -1.242 0.214381
## male:factor(educ)3 -0.16677    0.07340  -2.272 0.023215 *
## male:factor(educ)4 -0.17236    0.07440  -2.317 0.020663 *
## male:factor(educ)5 -0.14616    0.07551  -1.935 0.053123 .
## male:lnexper        0.04063    0.02149   1.891 0.058875 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2811 on 1460 degrees of freedom
## Multiple R-squared:  0.4032, Adjusted R-squared:  0.3988
## F-statistic: 89.69 on 11 and 1460 DF,  p-value: < 2.2e-16
anova(OLS5, OLS6)
## Analysis of Variance Table
##
## Model 1: lnwage ~ male + factor(educ) + lnexper
## Model 2: lnwage ~ male * factor(educ) + lnexper + lnexper * male
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)
## 1   1465 116.47
## 2   1460 115.37  5    1.0957 2.7732 0.01683 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#### Table 3.13

OLS results specification 7

summary(OLS7 <- lm(lnwage ~ male + lnexper*factor(educ), data=df))
##
## Call:
## lm(formula = lnwage ~ male + lnexper * factor(educ), data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.63623 -0.15046  0.00831  0.16713  1.12415
##
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)            1.48891    0.21203   7.022 3.34e-12 ***
## male                   0.11597    0.01548   7.493 1.16e-13 ***
## lnexper                0.16312    0.06539   2.494   0.0127 *
## factor(educ)2          0.06727    0.22628   0.297   0.7663
## factor(educ)3          0.13525    0.21889   0.618   0.5367
## factor(educ)4          0.20495    0.21946   0.934   0.3505
## factor(educ)5          0.34130    0.21808   1.565   0.1178
## lnexper:factor(educ)2  0.01933    0.07049   0.274   0.7839
## lnexper:factor(educ)3  0.04988    0.06821   0.731   0.4647
## lnexper:factor(educ)4  0.08784    0.06877   1.277   0.2017
## lnexper:factor(educ)5  0.09996    0.06822   1.465   0.1430
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2815 on 1461 degrees of freedom
## Multiple R-squared:  0.4012, Adjusted R-squared:  0.3971
## F-statistic:  97.9 on 10 and 1461 DF,  p-value: < 2.2e-16