15. CHAPTER 15. REGRESSION WITH TRANSFORMED VARIABLES#
SET UP
library(foreign) # to open stata.dta files
library(psych) # for better sammary of descriptive statistics
library(repr) # to combine graphs with adjustable plot dimensions
options(repr.plot.width = 12, repr.plot.height = 6) # Plot dimensions (in inches)
options(width = 150) # To increase character width of printed output
Load Libraries
Show code cell source
pkgs <- c("car", "jtools", "sandwich","huxtable","margins","QuantPsyc")
invisible(lapply(pkgs, function(pkg) suppressPackageStartupMessages(library(pkg, character.only = TRUE))))
15.1. 15.1 EXAMPLE: EARNINGS, GENDER, EDUCATION and TYPE OF WORKER#
df = read.dta(file = "Dataset/AED_EARNINGS_COMPLETE.DTA")
attach(df)
print(describe(df))
vars n mean sd median trimmed mad min max range skew kurtosis se
earnings 1 872 56368.69 51516.05 44200.00 47580.66 24907.68 4000.00 504000.00 500000.00 4.17 25.48 1744.55
lnearnings 2 872 10.69 0.68 10.70 10.69 0.57 8.29 13.13 4.84 0.10 1.01 0.02
dearnings 3 872 0.16 0.37 0.00 0.08 0.00 0.00 1.00 1.00 1.81 1.28 0.01
gender 4 872 0.43 0.50 0.00 0.42 0.00 0.00 1.00 1.00 0.27 -1.93 0.02
age 5 872 43.31 10.68 44.00 43.22 13.34 25.00 65.00 40.00 0.04 -1.03 0.36
lnage 6 872 3.74 0.26 3.78 3.75 0.30 3.22 4.17 0.96 -0.33 -0.93 0.01
agesq 7 872 1989.67 935.69 1936.00 1935.44 1054.13 625.00 4225.00 3600.00 0.39 -0.83 31.69
education 8 872 13.85 2.88 13.00 13.87 1.48 0.00 20.00 20.00 -1.04 4.67 0.10
educsquared 9 872 200.22 73.91 169.00 195.67 40.03 0.00 400.00 400.00 0.39 0.00 2.50
agebyeduc 10 872 598.82 193.69 588.00 593.70 187.55 0.00 1260.00 1260.00 0.14 0.54 6.56
genderbyage 11 872 19.04 22.87 0.00 16.59 0.00 0.00 65.00 65.00 0.53 -1.43 0.77
genderbyeduc 12 872 6.08 7.17 0.00 5.45 0.00 0.00 20.00 20.00 0.42 -1.65 0.24
hours 13 872 44.34 8.50 40.00 42.66 0.00 35.00 99.00 64.00 2.32 6.65 0.29
lnhours 14 872 3.78 0.16 3.69 3.75 0.00 3.56 4.60 1.04 1.69 3.02 0.01
genderbyhours 15 872 18.56 21.76 0.00 16.56 0.00 0.00 80.00 80.00 0.45 -1.42 0.74
dself 16 872 0.09 0.29 0.00 0.00 0.00 0.00 1.00 1.00 2.85 6.12 0.01
dprivate 17 872 0.76 0.43 1.00 0.83 0.00 0.00 1.00 1.00 -1.22 -0.52 0.01
dgovt 18 872 0.15 0.36 0.00 0.06 0.00 0.00 1.00 1.00 1.97 1.87 0.01
state* 19 872 24.40 14.83 24.00 24.23 20.76 1.00 50.00 49.00 0.01 -1.39 0.50
statefip* 20 872 24.71 15.20 24.00 24.50 20.76 1.00 51.00 50.00 0.04 -1.39 0.51
stateunemp 21 872 9.60 1.65 9.45 9.59 1.85 4.80 14.40 9.60 0.04 -0.65 0.06
stateincomepc 22 872 40772.99 5558.63 39493.00 40392.34 5353.67 31186.00 71044.00 39858.00 0.97 2.16 188.24
year* 23 872 25.00 0.00 25.00 25.00 0.00 25.00 25.00 0.00 NaN NaN 0.00
pernum 24 872 1.54 0.89 1.00 1.37 0.00 1.00 8.00 7.00 2.86 12.31 0.03
perwt 25 872 145.78 90.99 109.00 132.50 56.34 14.00 626.00 612.00 1.44 2.15 3.08
relate* 26 872 2.10 2.47 1.00 1.42 0.00 1.00 12.00 11.00 3.01 8.03 0.08
region* 27 872 6.81 3.50 7.00 6.82 4.45 1.00 12.00 11.00 0.00 -1.17 0.12
metro* 28 872 3.70 1.22 4.00 3.82 1.48 1.00 5.00 4.00 -0.64 -0.68 0.04
marst* 29 872 2.55 2.05 1.00 2.32 0.00 1.00 6.00 5.00 0.76 -1.16 0.07
race* 30 872 1.70 1.69 1.00 1.18 0.00 1.00 8.00 7.00 2.52 4.94 0.06
raced* 31 872 10.67 26.61 1.00 2.66 0.00 1.00 152.00 151.00 2.97 8.01 0.90
hispan* 32 872 1.24 0.78 1.00 1.03 0.00 1.00 5.00 4.00 3.88 14.95 0.03
racesing* 33 872 1.33 0.81 1.00 1.10 0.00 1.00 5.00 4.00 2.70 6.41 0.03
hcovany* 34 872 1.87 0.34 2.00 1.96 0.00 1.00 2.00 1.00 -2.17 2.72 0.01
attainededuc* 35 872 8.77 2.29 8.00 8.82 1.48 1.00 12.00 11.00 -0.37 0.16 0.08
detailededuc* 36 872 30.58 6.87 29.00 30.59 5.93 3.00 43.00 40.00 -0.46 1.40 0.23
empstat* 37 872 2.02 0.16 2.00 2.00 0.00 2.00 4.00 2.00 9.88 103.99 0.01
classwkr* 38 872 2.91 0.29 3.00 3.00 0.00 2.00 3.00 1.00 -2.87 6.26 0.01
classwkrd* 39 872 9.51 2.21 9.00 9.33 0.00 5.00 16.00 11.00 0.84 1.58 0.07
wkswork2* 40 872 6.99 0.10 7.00 7.00 0.00 6.00 7.00 1.00 -9.67 91.68 0.00
workedyr* 41 872 4.00 0.00 4.00 4.00 0.00 4.00 4.00 0.00 NaN NaN 0.00
inctot 42 872 57718.70 53049.14 45000.00 48533.48 25204.20 4000.00 548000.00 544000.00 4.17 25.73 1796.47
incwage 43 872 52828.21 48915.23 41900.00 45591.83 26538.54 0.00 498000.00 498000.00 3.96 24.64 1656.48
incbus00 44 872 3540.48 20495.13 0.00 0.00 0.00 -7500.00 285000.00 292500.00 8.78 92.01 694.05
incearn 45 872 56368.69 51516.05 44200.00 47580.66 24907.68 4000.00 504000.00 500000.00 4.17 25.48 1744.55
15.1.1. Table 15.1#
table151vars = c("earnings", "lnearnings", "age", "agesq", "education", "agebyeduc",
"lnage", "gender", "dself", "dprivate", "dgovt", "hours", "lnhours")
print(describe(df[table151vars]))
vars n mean sd median trimmed mad min max range skew kurtosis se
earnings 1 872 56368.69 51516.05 44200.00 47580.66 24907.68 4000.00 504000.00 5.00e+05 4.17 25.48 1744.55
lnearnings 2 872 10.69 0.68 10.70 10.69 0.57 8.29 13.13 4.84e+00 0.10 1.01 0.02
age 3 872 43.31 10.68 44.00 43.22 13.34 25.00 65.00 4.00e+01 0.04 -1.03 0.36
agesq 4 872 1989.67 935.69 1936.00 1935.44 1054.13 625.00 4225.00 3.60e+03 0.39 -0.83 31.69
education 5 872 13.85 2.88 13.00 13.87 1.48 0.00 20.00 2.00e+01 -1.04 4.67 0.10
agebyeduc 6 872 598.82 193.69 588.00 593.70 187.55 0.00 1260.00 1.26e+03 0.14 0.54 6.56
lnage 7 872 3.74 0.26 3.78 3.75 0.30 3.22 4.17 9.60e-01 -0.33 -0.93 0.01
gender 8 872 0.43 0.50 0.00 0.42 0.00 0.00 1.00 1.00e+00 0.27 -1.93 0.02
dself 9 872 0.09 0.29 0.00 0.00 0.00 0.00 1.00 1.00e+00 2.85 6.12 0.01
dprivate 10 872 0.76 0.43 1.00 0.83 0.00 0.00 1.00 1.00e+00 -1.22 -0.52 0.01
dgovt 11 872 0.15 0.36 0.00 0.06 0.00 0.00 1.00 1.00e+00 1.97 1.87 0.01
hours 12 872 44.34 8.50 40.00 42.66 0.00 35.00 99.00 6.40e+01 2.32 6.65 0.29
lnhours 13 872 3.78 0.16 3.69 3.75 0.00 3.56 4.60 1.04e+00 1.69 3.02 0.01
15.2. 15.3 QUADRATIC AND POLYNOMIAL MODELS#
15.2.1. Figure 15.2 - generated data#
Quadratic Model for Earnings on Age with Education as a regressor as well
# Linear Model
ols.linear = lm(earnings ~ age+education)
summ(ols.linear, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 56.455, p = 0.000
R² = 0.115
Adj. R² = 0.113
Standard errors: Robust, type = HC1
-----------------------------------------------------------
Est. S.E. t val. p
----------------- ------------ ----------- -------- -------
(Intercept) -46875.360 11306.330 -4.146 0.000
age 524.995 151.387 3.468 0.001
education 5811.367 641.533 9.059 0.000
-----------------------------------------------------------
Quadratic model
ols.quad = lm(earnings ~ age+agesq+education)
summ(ols.quad, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(3,868) = 39.133, p = 0.000
R² = 0.119
Adj. R² = 0.116
Standard errors: Robust, type = HC1
-----------------------------------------------------------
Est. S.E. t val. p
----------------- ------------ ----------- -------- -------
(Intercept) -98620.326 24524.116 -4.021 0.000
age 3104.916 1087.323 2.856 0.004
agesq -29.658 12.456 -2.381 0.017
education 5740.398 642.024 8.941 0.000
-----------------------------------------------------------
Turning point
bage = summary(ols.quad)$coefficients["age",1]
bagesq = summary(ols.quad)$coefficients["agesq",1]
turning.point = -bage/(2*bagesq)
turning.point
Marginal effects
mequad = bage + 2*bagesq*age
print(describe(mequad))
AME.age = mean(mequad)
AME.age
MEM.age = bage + 2*bagesq*mean(age)
MEM.age
MER.age25 = bage + 2*bagesq*25
MER.age25
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 872 535.87 633.27 494.99 540.96 791.49 -750.66 1622 2372.66 -0.04 -1.03 21.45
Joint hypothesis test (requires packages car and sandwich)
robvar = vcovHC(ols.quad, type="HC1")
print(
linearHypothesis(ols.quad,c("age=0", "agesq=0"), vcov=robvar)
)
Linear hypothesis test:
age = 0
agesq = 0
Model 1: restricted model
Model 2: earnings ~ age + agesq + education
Note: Coefficient covariance matrix supplied.
Res.Df Df F Pr(>F)
1 870
2 868 2 9.2919 0.0001017 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ME for age using margins package and R’s factor variable notation - age*age gives age and age^2
and use heteroskedastic-robust standard errors which uses sandwich package
ols.factor.quad = lm(earnings ~ age*age + education)
summ(ols.factor.quad, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 56.455, p = 0.000
R² = 0.115
Adj. R² = 0.113
Standard errors: Robust, type = HC1
-----------------------------------------------------------
Est. S.E. t val. p
----------------- ------------ ----------- -------- -------
(Intercept) -46875.360 11306.330 -4.146 0.000
age 524.995 151.387 3.468 0.001
education 5811.367 641.533 9.059 0.000
-----------------------------------------------------------
AMEs only
print(margins(ols.factor.quad))
Average marginal effects
lm(formula = earnings ~ age * age + education)
age education
525 5811
AMES with standard errors, z-statistic, confidence interval
print(summary(margins(ols.factor.quad, vcov = vcovHC(ols.factor.quad, type="HC1"))))
factor AME SE z p lower upper
age 524.9953 151.3924 3.4678 0.0005 228.2716 821.7190
education 5811.3673 639.2632 9.0907 0.0000 4558.4344 7064.3002
AME for a single regressor
print(
summary(margins(ols.factor.quad, variables = "age",
vcov = vcovHC(ols.factor.quad, type="HC1")))
)
factor AME SE z p lower upper
age 524.9953 151.3924 3.4678 0.0005 228.2716 821.7190
Note: MEM is not given by margins
MER for a single regressor at two values
print(summary(margins(ols.factor.quad, variables = "age",
at=list(age=c(25,65)), vcov = vcovHC(ols.factor.quad, type="HC1"))))
factor age AME SE z p lower upper
age 25.0000 524.9953 152.1844 3.4497 0.0006 226.7195 823.2712
age 65.0000 524.9953 151.3784 3.4681 0.0005 228.2992 821.6915
15.3. 15.4 INTERACTED REGRESSORS#
Regression with interactions
ols.interact = lm(earnings ~ age+education+agebyeduc)
summ(ols.interact, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(3,868) = 37.713, p = 0.000
R² = 0.115
Adj. R² = 0.112
Standard errors: Robust, type = HC1
-----------------------------------------------------------
Est. S.E. t val. p
----------------- ------------ ----------- -------- -------
(Intercept) -29089.382 30958.508 -0.940 0.348
age 127.492 719.280 0.177 0.859
education 4514.987 2401.517 1.880 0.060
agebyeduc 29.039 56.052 0.518 0.605
-----------------------------------------------------------
Joint test for statistical significance of age
robvar = vcovHC(ols.interact, type="HC1")
print(
linearHypothesis(ols.interact,c("age=0", "agebyeduc=0"), vcov=robvar)
)
Linear hypothesis test:
age = 0
agebyeduc = 0
Model 1: restricted model
Model 2: earnings ~ age + education + agebyeduc
Note: Coefficient covariance matrix supplied.
Res.Df Df F Pr(>F)
1 870
2 868 2 6.4896 0.001594 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The regressors are highly correlated
print(
cor(cbind(age, education, agebyeduc))
)
age education agebyeduc
age 1.0000000 -0.0381526 0.7291365
education -0.0381526 1.0000000 0.6359608
agebyeduc 0.7291365 0.6359608 1.0000000
Marginal effects manually
beducation = summary(ols.interact)$coefficients["education",1]
bagebyeduc = summary(ols.interact)$coefficients["agebyeduc",1]
meinteract = beducation + bagebyeduc*age
AME.educ = summary(meinteract)
AME.educ
MEM.educ = beducation + bagebyeduc*summary(age)
MEM.educ
MER.educ.25 = beducation + bagebyeduc*25
MER.educ.25
Min. 1st Qu. Median Mean 3rd Qu. Max.
5241 5531 5793 5773 6003 6403
Min. 1st Qu. Median Mean 3rd Qu. Max.
5241 5531 5793 5773 6003 6403
ME for education using margins package R’s factor variable notation - \(age*educ\) gives \(age\), \(educ\) and \(age*educ\) and use heteroskedastic-robust standard errors which uses sandwich package
ols.factor.interact = lm(earnings ~ age*education)
summ(ols.factor.interact, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(3,868) = 37.713, p = 0.000
R² = 0.115
Adj. R² = 0.112
Standard errors: Robust, type = HC1
-------------------------------------------------------------
Est. S.E. t val. p
------------------- ------------ ----------- -------- -------
(Intercept) -29089.382 30958.508 -0.940 0.348
age 127.492 719.280 0.177 0.859
education 4514.987 2401.517 1.880 0.060
age:education 29.039 56.052 0.518 0.605
-------------------------------------------------------------
# AMEs only
print(margins(ols.factor.interact))
Average marginal effects
lm(formula = earnings ~ age * education)
age education
529.8 5773
# AMES with standard errors, z-statistic, confidence interval
print(
summary(margins(ols.factor.interact, vcov = vcovHC(ols.factor.interact, type="HC1")))
)
factor AME SE z p lower upper
age 529.7778 155.2115 3.4133 0.0006 225.5689 833.9867
education 5772.6953 626.7852 9.2100 0.0000 4544.2189 7001.1717
# AME for a single regressor
print(
summary(margins(ols.factor.interact, variables = "education",
vcov = vcovHC(ols.factor.interact, type="HC1")))
)
factor AME SE z p lower upper
education 5772.6953 626.7852 9.2100 0.0000 4544.2189 7001.1717
# MEM is not given by margins
# MER for a single regressor at two values
print(summary(margins(ols.factor.interact, variables = "education",
vcov = vcovHC(ols.factor.interact, type="HC1")))
)
print(summary(margins(ols.factor.interact, variables = "education",
at=list(age=c(25,65)), vcov = vcovHC(ols.factor.interact, type="HC1")))
)
factor AME SE z p lower upper
education 5772.6953 626.7852 9.2100 0.0000 4544.2189 7001.1717
factor age AME SE z p lower upper
education 25.0000 5240.9657 1109.5069 4.7237 0.0000 3066.3722 7415.5591
education 65.0000 6402.5321 1457.8826 4.3917 0.0000 3545.1346 9259.9295
# Use package jtools which also needs huxtable for tables
#library(huxtable)
export_summs(ols.linear, ols.quad, ols.interact, scale = TRUE,
error_format = "({statistic})", robust = "HC1")
names | Model 1 | Model 2 | Model 3 | |
---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | |
Model 1 | Model 2 | Model 3 | ||
1 | (Intercept) | 56368.6938073394 *** | 56368.6938073394 *** | 56368.6938073394 *** |
2 | (34.3067991492708) | (34.3677132279417) | (34.293317153113) | |
3 | age | 5604.8734461588 *** | 33148.2247664834 ** | 1361.11194147122 |
4 | (3.46789224399738) | (2.85555972562047) | (0.177249741677106) | |
5 | education | 16760.8009996877 *** | 16556.1149293775 *** | 13021.8567165293 |
6 | (9.05856467518862) | (8.94110156422714) | (1.88005594862479) | |
7 | agesq | -27751.0364917983 * | ||
8 | (-2.3810493397332) | |||
9 | agebyeduc | 5624.61362960973 | ||
10 | (0.518075029268155) | |||
1.1 | N | 872 | 872 | 872 |
2.1 | R2 | 0.114989391658086 | 0.119138652546052 | 0.115312617880932 |
.1 | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. |
15.4. 15.5 LOG-LINEAR AND LOG-LOG MODELS#
Levels model
ols.linear2 <- lm(earnings ~ age+education)
summ(ols.linear2, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 56.455, p = 0.000
R² = 0.115
Adj. R² = 0.113
Standard errors: Robust, type = HC1
-----------------------------------------------------------
Est. S.E. t val. p
----------------- ------------ ----------- -------- -------
(Intercept) -46875.360 11306.330 -4.146 0.000
age 524.995 151.387 3.468 0.001
education 5811.367 641.533 9.059 0.000
-----------------------------------------------------------
# Log-linear model
ols.loglin2 <- lm(lnearnings ~ age+education)
summ(ols.loglin2, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 102.198, p = 0.000
R² = 0.190
Adj. R² = 0.189
Standard errors: Robust, type = HC1
--------------------------------------------------
Est. S.E. t val. p
----------------- ------- ------- -------- -------
(Intercept) 8.962 0.150 59.626 0.000
age 0.008 0.002 3.832 0.000
education 0.101 0.009 11.683 0.000
--------------------------------------------------
# Log-log model
ols.loglog2 <- lm(lnearnings ~ lnage+education)
summ(ols.loglog2, digits=3, robust = "HC1")
MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 103.743, p = 0.000
R² = 0.193
Adj. R² = 0.191
Standard errors: Robust, type = HC1
--------------------------------------------------
Est. S.E. t val. p
----------------- ------- ------- -------- -------
(Intercept) 8.009 0.331 24.229 0.000
lnage 0.346 0.082 4.211 0.000
education 0.100 0.009 11.665 0.000
--------------------------------------------------
# Use package jtools which also needs huxtable for tables
# For some reason gives wrong results
export_summs(ols.linear2, ols.loglin2, ols.loglog2, scale = TRUE,
error_format = "({statistic})", robust = "HC1")
names | Model 1 | Model 2 | Model 3 | |
---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | |
Model 1 | Model 2 | Model 3 | ||
1 | (Intercept) | 56368.6938073394 *** | 10.6911645176214 *** | 10.6911645176214 *** |
2 | (34.3067991492708) | (512.201453131804) | (512.938183351903) | |
3 | age | 5604.8734461588 *** | 0.0828323181429894 *** | |
4 | (3.46789224399738) | (3.83168075340195) | ||
5 | education | 16760.8009996877 *** | 0.290043843291207 *** | 0.289516927075926 *** |
6 | (9.05856467518862) | (11.6833123854303) | (11.6653455649129) | |
7 | lnage | 0.0891413931083605 *** | ||
8 | (4.21055308818822) | |||
1.1 | N | 872 | 872 | 872 |
2.1 | R2 | 0.114989391658086 | 0.190419488464102 | 0.192743410193262 |
.1 | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. | All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. *** p < 0.001; ** p < 0.01; * p < 0.05. |
15.5. 15.6 PREDICTION FROM LOG-LINEAR AND LOG-LOG MODELS#
Levels model
ols.linear <- lm(earnings ~ age+education)
linear.predict = predict(ols.linear)
# Retransformation bias in log-linear model
ols.loglin = lm(lnearnings ~ age+education)
summ(ols.loglin)
predict.log = predict(ols.loglin)
biased.predict = exp(predict.log)
rmse = summary(ols.loglin)$sigma
rmse
exp(rmse^2/2)
adjusted.predict = exp(rmse^2/2)*biased.predict
print(summary(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
print(cor(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 102.20, p = 0.00
R² = 0.19
Adj. R² = 0.19
Standard errors:OLS
-----------------------------------------------
Est. S.E. t val. p
----------------- ------ ------ -------- ------
(Intercept) 8.96 0.14 66.15 0.00
age 0.01 0.00 3.96 0.00
education 0.10 0.01 13.88 0.00
-----------------------------------------------
earnings linear.predict biased.predict adjusted.predict
Min. : 4000 Min. :-33750 Min. : 9471 Min. : 11452
1st Qu.: 29000 1st Qu.: 46486 1st Qu.:37182 1st Qu.: 44960
Median : 44200 Median : 54361 Median :41858 Median : 50615
Mean : 56369 Mean : 56369 Mean :45838 Mean : 55427
3rd Qu.: 64250 3rd Qu.: 67631 3rd Qu.:53280 3rd Qu.: 64426
Max. :504000 Max. :102427 Max. :95043 Max. :114925
earnings linear.predict biased.predict adjusted.predict
earnings 1.0000000 0.3391009 0.3631965 0.3631965
linear.predict 0.3391009 1.0000000 0.9621589 0.9621589
biased.predict 0.3631965 0.9621589 1.0000000 1.0000000
adjusted.predict 0.3631965 0.9621589 1.0000000 1.0000000
# Retransformation bias in log-log model
ols.loglog <- lm(lnearnings ~ lnage+education)
summ(ols.loglog)
predict.loglog = predict(ols.loglog)
biased.predict = exp(predict.loglog)
rmse = summary(ols.loglog)$sigma
rmse
exp(rmse^2/2)
adjusted.predict = exp(rmse^2/2)*biased.predict
print(summary(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
print(cor(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression
MODEL FIT:
F(2,869) = 103.74, p = 0.00
R² = 0.19
Adj. R² = 0.19
Standard errors:OLS
-----------------------------------------------
Est. S.E. t val. p
----------------- ------ ------ -------- ------
(Intercept) 8.01 0.32 24.88 0.00
lnage 0.35 0.08 4.27 0.00
education 0.10 0.01 13.88 0.00
-----------------------------------------------
earnings linear.predict biased.predict adjusted.predict
Min. : 4000 Min. :-33750 Min. : 9152 Min. : 11060
1st Qu.: 29000 1st Qu.: 46486 1st Qu.:37401 1st Qu.: 45201
Median : 44200 Median : 54361 Median :41977 Median : 50731
Mean : 56369 Mean : 56369 Mean :45861 Mean : 55425
3rd Qu.: 64250 3rd Qu.: 67631 3rd Qu.:53767 3rd Qu.: 64980
Max. :504000 Max. :102427 Max. :93791 Max. :113350
earnings linear.predict biased.predict adjusted.predict
earnings 1.0000000 0.3391009 0.3659330 0.3659330
linear.predict 0.3391009 1.0000000 0.9620711 0.9620711
biased.predict 0.3659330 0.9620711 1.0000000 1.0000000
adjusted.predict 0.3659330 0.9620711 1.0000000 1.0000000
15.6. 15.7 MODELS WITH A MIX OF REGRESSOR TYPES#
Linear dependent variable
ols.linear <- lm(earnings ~ gender+age+agesq+education+dself+dgovt+lnhours)
summ(ols.linear, digits=3, robust = "HC1")
linear.predict = predict(ols.linear)
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(7,864) = 31.939, p = 0.000
R² = 0.206
Adj. R² = 0.199
Standard errors: Robust, type = HC1
------------------------------------------------------------
Est. S.E. t val. p
----------------- ------------- ----------- -------- -------
(Intercept) -356631.619 66302.731 -5.379 0.000
gender -14330.015 2696.808 -5.314 0.000
age 3282.868 1064.806 3.083 0.002
agesq -31.578 12.214 -2.585 0.010
education 5399.360 609.862 8.853 0.000
dself 9360.500 8711.602 1.074 0.283
dgovt -291.136 2914.162 -0.100 0.920
lnhours 69963.734 16103.000 4.345 0.000
------------------------------------------------------------
# Log-transformed dependent variable
ols.log <- lm(lnearnings ~ gender+age+agesq+education+dself+dgovt+lnhours)
summ(ols.log, digits=3, robust = "HC1")
predict.log = predict(ols.log)
biased.predict = exp(predict.log)
rmse = summary(ols.log)$sigma
rmse
exp(rmse^2/2)
adjusted.predict = exp(rmse^2/2)*biased.predict
print(summary(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
print(cor(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression
MODEL FIT:
F(7,864) = 48.314, p = 0.000
R² = 0.281
Adj. R² = 0.275
Standard errors: Robust, type = HC1
---------------------------------------------------
Est. S.E. t val. p
----------------- -------- ------- -------- -------
(Intercept) 4.459 0.648 6.885 0.000
gender -0.193 0.039 -4.881 0.000
age 0.056 0.016 3.550 0.000
agesq -0.001 0.000 -2.992 0.003
education 0.093 0.008 11.168 0.000
dself -0.118 0.101 -1.166 0.244
dgovt 0.070 0.045 1.534 0.125
lnhours 0.975 0.142 6.882 0.000
---------------------------------------------------
earnings linear.predict biased.predict adjusted.predict
Min. : 4000 Min. :-36208 Min. : 8738 Min. : 10353
1st Qu.: 29000 1st Qu.: 40778 1st Qu.: 35132 1st Qu.: 41626
Median : 44200 Median : 53840 Median : 42444 Median : 50290
Mean : 56369 Mean : 56369 Mean : 46906 Mean : 55576
3rd Qu.: 64250 3rd Qu.: 71050 3rd Qu.: 55612 3rd Qu.: 65891
Max. :504000 Max. :134977 Max. :127693 Max. :151295
earnings linear.predict biased.predict adjusted.predict
earnings 1.0000000 0.4533973 0.4739243 0.4739243
linear.predict 0.4533973 1.0000000 0.9352312 0.9352312
biased.predict 0.4739243 0.9352312 1.0000000 1.0000000
adjusted.predict 0.4739243 0.9352312 1.0000000 1.0000000
# Levels dependent variable and standardized coefficients
summ(ols.linear <- lm(earnings ~ gender+age+agesq+education+dself+dgovt+lnhours))
print(lm.beta(ols.linear))
MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression
MODEL FIT:
F(7,864) = 31.94, p = 0.00
R² = 0.21
Adj. R² = 0.20
Standard errors:OLS
---------------------------------------------------------
Est. S.E. t val. p
----------------- ------------ ---------- -------- ------
(Intercept) -356631.62 44767.35 -7.97 0.00
gender -14330.02 3222.16 -4.45 0.00
age 3282.87 1224.05 2.68 0.01
agesq -31.58 13.97 -2.26 0.02
education 5399.36 550.41 9.81 0.00
dself 9360.50 5580.50 1.68 0.09
dgovt -291.14 4443.20 -0.07 0.95
lnhours 69963.73 9785.19 7.15 0.00
---------------------------------------------------------
gender age agesq education dself dgovt lnhours
-0.137925992 0.680332423 -0.573556814 0.302284707 0.052184230 -0.002014001 0.223770013