CHAPTER 15. REGRESSION WITH TRANSFORMED VARIABLES

15. CHAPTER 15. REGRESSION WITH TRANSFORMED VARIABLES#

SET UP

library(foreign) # to open stata.dta files
library(psych) # for better sammary of descriptive statistics
library(repr) # to combine graphs with adjustable plot dimensions
options(repr.plot.width = 12, repr.plot.height = 6) # Plot dimensions (in inches)
options(width = 150) # To increase character width of printed output

Load Libraries

15.1. 15.1 EXAMPLE: EARNINGS, GENDER, EDUCATION and TYPE OF WORKER#

df = read.dta(file = "Dataset/AED_EARNINGS_COMPLETE.DTA")
attach(df)
print(describe(df))

              vars   n     mean       sd   median  trimmed      mad      min       max     range  skew kurtosis      se
earnings         1 872 56368.69 51516.05 44200.00 47580.66 24907.68  4000.00 504000.00 500000.00  4.17    25.48 1744.55
lnearnings       2 872    10.69     0.68    10.70    10.69     0.57     8.29     13.13      4.84  0.10     1.01    0.02
dearnings        3 872     0.16     0.37     0.00     0.08     0.00     0.00      1.00      1.00  1.81     1.28    0.01
gender           4 872     0.43     0.50     0.00     0.42     0.00     0.00      1.00      1.00  0.27    -1.93    0.02
age              5 872    43.31    10.68    44.00    43.22    13.34    25.00     65.00     40.00  0.04    -1.03    0.36
lnage            6 872     3.74     0.26     3.78     3.75     0.30     3.22      4.17      0.96 -0.33    -0.93    0.01
agesq            7 872  1989.67   935.69  1936.00  1935.44  1054.13   625.00   4225.00   3600.00  0.39    -0.83   31.69
education        8 872    13.85     2.88    13.00    13.87     1.48     0.00     20.00     20.00 -1.04     4.67    0.10
educsquared      9 872   200.22    73.91   169.00   195.67    40.03     0.00    400.00    400.00  0.39     0.00    2.50
agebyeduc       10 872   598.82   193.69   588.00   593.70   187.55     0.00   1260.00   1260.00  0.14     0.54    6.56
genderbyage     11 872    19.04    22.87     0.00    16.59     0.00     0.00     65.00     65.00  0.53    -1.43    0.77
genderbyeduc    12 872     6.08     7.17     0.00     5.45     0.00     0.00     20.00     20.00  0.42    -1.65    0.24
hours           13 872    44.34     8.50    40.00    42.66     0.00    35.00     99.00     64.00  2.32     6.65    0.29
lnhours         14 872     3.78     0.16     3.69     3.75     0.00     3.56      4.60      1.04  1.69     3.02    0.01
genderbyhours   15 872    18.56    21.76     0.00    16.56     0.00     0.00     80.00     80.00  0.45    -1.42    0.74
dself           16 872     0.09     0.29     0.00     0.00     0.00     0.00      1.00      1.00  2.85     6.12    0.01
dprivate        17 872     0.76     0.43     1.00     0.83     0.00     0.00      1.00      1.00 -1.22    -0.52    0.01
dgovt           18 872     0.15     0.36     0.00     0.06     0.00     0.00      1.00      1.00  1.97     1.87    0.01
state*          19 872    24.40    14.83    24.00    24.23    20.76     1.00     50.00     49.00  0.01    -1.39    0.50
statefip*       20 872    24.71    15.20    24.00    24.50    20.76     1.00     51.00     50.00  0.04    -1.39    0.51
stateunemp      21 872     9.60     1.65     9.45     9.59     1.85     4.80     14.40      9.60  0.04    -0.65    0.06
stateincomepc   22 872 40772.99  5558.63 39493.00 40392.34  5353.67 31186.00  71044.00  39858.00  0.97     2.16  188.24
year*           23 872    25.00     0.00    25.00    25.00     0.00    25.00     25.00      0.00   NaN      NaN    0.00
pernum          24 872     1.54     0.89     1.00     1.37     0.00     1.00      8.00      7.00  2.86    12.31    0.03
perwt           25 872   145.78    90.99   109.00   132.50    56.34    14.00    626.00    612.00  1.44     2.15    3.08
relate*         26 872     2.10     2.47     1.00     1.42     0.00     1.00     12.00     11.00  3.01     8.03    0.08
region*         27 872     6.81     3.50     7.00     6.82     4.45     1.00     12.00     11.00  0.00    -1.17    0.12
metro*          28 872     3.70     1.22     4.00     3.82     1.48     1.00      5.00      4.00 -0.64    -0.68    0.04
marst*          29 872     2.55     2.05     1.00     2.32     0.00     1.00      6.00      5.00  0.76    -1.16    0.07
race*           30 872     1.70     1.69     1.00     1.18     0.00     1.00      8.00      7.00  2.52     4.94    0.06
raced*          31 872    10.67    26.61     1.00     2.66     0.00     1.00    152.00    151.00  2.97     8.01    0.90
hispan*         32 872     1.24     0.78     1.00     1.03     0.00     1.00      5.00      4.00  3.88    14.95    0.03
racesing*       33 872     1.33     0.81     1.00     1.10     0.00     1.00      5.00      4.00  2.70     6.41    0.03
hcovany*        34 872     1.87     0.34     2.00     1.96     0.00     1.00      2.00      1.00 -2.17     2.72    0.01
attainededuc*   35 872     8.77     2.29     8.00     8.82     1.48     1.00     12.00     11.00 -0.37     0.16    0.08
detailededuc*   36 872    30.58     6.87    29.00    30.59     5.93     3.00     43.00     40.00 -0.46     1.40    0.23
empstat*        37 872     2.02     0.16     2.00     2.00     0.00     2.00      4.00      2.00  9.88   103.99    0.01
classwkr*       38 872     2.91     0.29     3.00     3.00     0.00     2.00      3.00      1.00 -2.87     6.26    0.01
classwkrd*      39 872     9.51     2.21     9.00     9.33     0.00     5.00     16.00     11.00  0.84     1.58    0.07
wkswork2*       40 872     6.99     0.10     7.00     7.00     0.00     6.00      7.00      1.00 -9.67    91.68    0.00
workedyr*       41 872     4.00     0.00     4.00     4.00     0.00     4.00      4.00      0.00   NaN      NaN    0.00
inctot          42 872 57718.70 53049.14 45000.00 48533.48 25204.20  4000.00 548000.00 544000.00  4.17    25.73 1796.47
incwage         43 872 52828.21 48915.23 41900.00 45591.83 26538.54     0.00 498000.00 498000.00  3.96    24.64 1656.48
incbus00        44 872  3540.48 20495.13     0.00     0.00     0.00 -7500.00 285000.00 292500.00  8.78    92.01  694.05
incearn         45 872 56368.69 51516.05 44200.00 47580.66 24907.68  4000.00 504000.00 500000.00  4.17    25.48 1744.55

15.1.1. Table 15.1#

table151vars = c("earnings", "lnearnings", "age", "agesq", "education", "agebyeduc",  
   "lnage", "gender", "dself", "dprivate", "dgovt", "hours", "lnhours")
print(describe(df[table151vars]))

           vars   n     mean       sd   median  trimmed      mad     min       max    range  skew kurtosis      se
earnings      1 872 56368.69 51516.05 44200.00 47580.66 24907.68 4000.00 504000.00 5.00e+05  4.17    25.48 1744.55
lnearnings    2 872    10.69     0.68    10.70    10.69     0.57    8.29     13.13 4.84e+00  0.10     1.01    0.02
age           3 872    43.31    10.68    44.00    43.22    13.34   25.00     65.00 4.00e+01  0.04    -1.03    0.36
agesq         4 872  1989.67   935.69  1936.00  1935.44  1054.13  625.00   4225.00 3.60e+03  0.39    -0.83   31.69
education     5 872    13.85     2.88    13.00    13.87     1.48    0.00     20.00 2.00e+01 -1.04     4.67    0.10
agebyeduc     6 872   598.82   193.69   588.00   593.70   187.55    0.00   1260.00 1.26e+03  0.14     0.54    6.56
lnage         7 872     3.74     0.26     3.78     3.75     0.30    3.22      4.17 9.60e-01 -0.33    -0.93    0.01
gender        8 872     0.43     0.50     0.00     0.42     0.00    0.00      1.00 1.00e+00  0.27    -1.93    0.02
dself         9 872     0.09     0.29     0.00     0.00     0.00    0.00      1.00 1.00e+00  2.85     6.12    0.01
dprivate     10 872     0.76     0.43     1.00     0.83     0.00    0.00      1.00 1.00e+00 -1.22    -0.52    0.01
dgovt        11 872     0.15     0.36     0.00     0.06     0.00    0.00      1.00 1.00e+00  1.97     1.87    0.01
hours        12 872    44.34     8.50    40.00    42.66     0.00   35.00     99.00 6.40e+01  2.32     6.65    0.29
lnhours      13 872     3.78     0.16     3.69     3.75     0.00    3.56      4.60 1.04e+00  1.69     3.02    0.01

15.2. 15.3 QUADRATIC AND POLYNOMIAL MODELS#

15.2.1. Figure 15.2 - generated data#

Quadratic Model for Earnings on Age with Education as a regressor as well

# Linear Model
ols.linear = lm(earnings ~ age+education)
summ(ols.linear, digits=3, robust = "HC1")

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 56.455, p = 0.000
R² = 0.115
Adj. R² = 0.113 

Standard errors: Robust, type = HC1
-----------------------------------------------------------
                          Est.        S.E.   t val.       p
----------------- ------------ ----------- -------- -------
(Intercept)         -46875.360   11306.330   -4.146   0.000
age                    524.995     151.387    3.468   0.001
education             5811.367     641.533    9.059   0.000
-----------------------------------------------------------

Quadratic model

ols.quad = lm(earnings ~ age+agesq+education)
summ(ols.quad, digits=3, robust = "HC1")

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(3,868) = 39.133, p = 0.000
R² = 0.119
Adj. R² = 0.116 

Standard errors: Robust, type = HC1
-----------------------------------------------------------
                          Est.        S.E.   t val.       p
----------------- ------------ ----------- -------- -------
(Intercept)         -98620.326   24524.116   -4.021   0.000
age                   3104.916    1087.323    2.856   0.004
agesq                  -29.658      12.456   -2.381   0.017
education             5740.398     642.024    8.941   0.000
-----------------------------------------------------------

Turning point

bage = summary(ols.quad)$coefficients["age",1]
bagesq = summary(ols.quad)$coefficients["agesq",1]
turning.point = -bage/(2*bagesq)
turning.point

52.3448020156631

Marginal effects

mequad = bage + 2*bagesq*age
print(describe(mequad))
AME.age = mean(mequad)
AME.age
MEM.age = bage + 2*bagesq*mean(age)
MEM.age
MER.age25 = bage + 2*bagesq*25
MER.age25

   vars   n   mean     sd median trimmed    mad     min  max   range  skew kurtosis    se
X1    1 872 535.87 633.27 494.99  540.96 791.49 -750.66 1622 2372.66 -0.04    -1.03 21.45

535.867577076173

1622.00097351331

Joint hypothesis test (requires packages car and sandwich)

robvar = vcovHC(ols.quad, type="HC1")
print(
    linearHypothesis(ols.quad,c("age=0", "agesq=0"), vcov=robvar)
)

Linear hypothesis test:
age = 0
agesq = 0

Model 1: restricted model
Model 2: earnings ~ age + agesq + education

Note: Coefficient covariance matrix supplied.

  Res.Df Df      F    Pr(>F)    
1    870                        
2    868  2 9.2919 0.0001017 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ME for age using margins package and R’s factor variable notation - age*age gives age and age^2

and use heteroskedastic-robust standard errors which uses sandwich package

ols.factor.quad = lm(earnings ~ age*age + education)
summ(ols.factor.quad, digits=3, robust = "HC1")

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 56.455, p = 0.000
R² = 0.115
Adj. R² = 0.113 

Standard errors: Robust, type = HC1
-----------------------------------------------------------
                          Est.        S.E.   t val.       p
----------------- ------------ ----------- -------- -------
(Intercept)         -46875.360   11306.330   -4.146   0.000
age                    524.995     151.387    3.468   0.001
education             5811.367     641.533    9.059   0.000
-----------------------------------------------------------

AMEs only

print(margins(ols.factor.quad))

Average marginal effects

lm(formula = earnings ~ age * age + education)

 age education
 525      5811

AMES with standard errors, z-statistic, confidence interval

print(summary(margins(ols.factor.quad, vcov = vcovHC(ols.factor.quad, type="HC1"))))

    factor       AME       SE      z      p     lower     upper
       age  524.9953 151.3924 3.4678 0.0005  228.2716  821.7190
 education 5811.3673 639.2632 9.0907 0.0000 4558.4344 7064.3002

AME for a single regressor

print(
    summary(margins(ols.factor.quad, variables = "age",
            vcov = vcovHC(ols.factor.quad, type="HC1")))
    )

 factor      AME       SE      z      p    lower    upper
    age 524.9953 151.3924 3.4678 0.0005 228.2716 821.7190

Note: MEM is not given by margins

MER for a single regressor at two values

print(summary(margins(ols.factor.quad, variables = "age", 
            at=list(age=c(25,65)), vcov = vcovHC(ols.factor.quad, type="HC1"))))

 factor     age      AME       SE      z      p    lower    upper
    age 25.0000 524.9953 152.1844 3.4497 0.0006 226.7195 823.2712
    age 65.0000 524.9953 151.3784 3.4681 0.0005 228.2992 821.6915

15.3. 15.4 INTERACTED REGRESSORS#

Regression with interactions

ols.interact = lm(earnings ~ age+education+agebyeduc)
summ(ols.interact, digits=3, robust = "HC1")

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(3,868) = 37.713, p = 0.000
R² = 0.115
Adj. R² = 0.112 

Standard errors: Robust, type = HC1
-----------------------------------------------------------
                          Est.        S.E.   t val.       p
----------------- ------------ ----------- -------- -------
(Intercept)         -29089.382   30958.508   -0.940   0.348
age                    127.492     719.280    0.177   0.859
education             4514.987    2401.517    1.880   0.060
agebyeduc               29.039      56.052    0.518   0.605
-----------------------------------------------------------

Joint test for statistical significance of age

robvar = vcovHC(ols.interact, type="HC1")
print(
    linearHypothesis(ols.interact,c("age=0", "agebyeduc=0"), vcov=robvar)
    )

Linear hypothesis test:
age = 0
agebyeduc = 0

Model 1: restricted model
Model 2: earnings ~ age + education + agebyeduc

Note: Coefficient covariance matrix supplied.

  Res.Df Df      F   Pr(>F)   
1    870                      
2    868  2 6.4896 0.001594 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The regressors are highly correlated

print(
    cor(cbind(age, education, agebyeduc))
)

                 age  education agebyeduc
age        1.0000000 -0.0381526 0.7291365
education -0.0381526  1.0000000 0.6359608
agebyeduc  0.7291365  0.6359608 1.0000000

Marginal effects manually

beducation = summary(ols.interact)$coefficients["education",1]
bagebyeduc = summary(ols.interact)$coefficients["agebyeduc",1]
meinteract = beducation + bagebyeduc*age
AME.educ = summary(meinteract)
AME.educ
MEM.educ = beducation + bagebyeduc*summary(age)
MEM.educ
MER.educ.25 = beducation + bagebyeduc*25
MER.educ.25
     

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5241    5531    5793    5773    6003    6403 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5241    5531    5793    5773    6003    6403 

5240.96565954994

ME for education using margins package R’s factor variable notation - \(age*educ\) gives \(age\), \(educ\) and \(age*educ\) and use heteroskedastic-robust standard errors which uses sandwich package

ols.factor.interact = lm(earnings ~ age*education)
summ(ols.factor.interact, digits=3, robust = "HC1")

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(3,868) = 37.713, p = 0.000
R² = 0.115
Adj. R² = 0.112 

Standard errors: Robust, type = HC1
-------------------------------------------------------------
                            Est.        S.E.   t val.       p
------------------- ------------ ----------- -------- -------
(Intercept)           -29089.382   30958.508   -0.940   0.348
age                      127.492     719.280    0.177   0.859
education               4514.987    2401.517    1.880   0.060
age:education             29.039      56.052    0.518   0.605
-------------------------------------------------------------

# AMEs only

print(margins(ols.factor.interact))

Average marginal effects

lm(formula = earnings ~ age * education)

   age education
 529.8      5773

# AMES with standard errors, z-statistic, confidence interval

print(
    summary(margins(ols.factor.interact, vcov = vcovHC(ols.factor.interact, type="HC1")))
    )

    factor       AME       SE      z      p     lower     upper
       age  529.7778 155.2115 3.4133 0.0006  225.5689  833.9867
 education 5772.6953 626.7852 9.2100 0.0000 4544.2189 7001.1717

# AME for a single regressor

print(
    summary(margins(ols.factor.interact, variables = "education",
                vcov = vcovHC(ols.factor.interact, type="HC1")))
    )

    factor       AME       SE      z      p     lower     upper
 education 5772.6953 626.7852 9.2100 0.0000 4544.2189 7001.1717

# MEM is not given by margins
# MER for a single regressor at two values

print(summary(margins(ols.factor.interact, variables = "education",
        vcov = vcovHC(ols.factor.interact, type="HC1")))
     )
print(summary(margins(ols.factor.interact, variables = "education", 
        at=list(age=c(25,65)), vcov = vcovHC(ols.factor.interact, type="HC1")))
      )

    factor       AME       SE      z      p     lower     upper
 education 5772.6953 626.7852 9.2100 0.0000 4544.2189 7001.1717

    factor     age       AME        SE      z      p     lower     upper
 education 25.0000 5240.9657 1109.5069 4.7237 0.0000 3066.3722 7415.5591
 education 65.0000 6402.5321 1457.8826 4.3917 0.0000 3545.1346 9259.9295

# Use package jtools which also needs huxtable for tables

#library(huxtable)
export_summs(ols.linear, ols.quad, ols.interact, scale = TRUE, 
             error_format = "({statistic})", robust = "HC1")

A huxtable: 14 × 4
	names	Model 1	Model 2	Model 3
	<chr>	<chr>	<chr>	<chr>
		Model 1	Model 2	Model 3
1	(Intercept)	56368.6938073394 ***	56368.6938073394 ***	56368.6938073394 ***
2		(34.3067991492708)	(34.3677132279417)	(34.293317153113)
3	age	5604.8734461588 ***	33148.2247664834 **	1361.11194147122
4		(3.46789224399738)	(2.85555972562047)	(0.177249741677106)
5	education	16760.8009996877 ***	16556.1149293775 ***	13021.8567165293
6		(9.05856467518862)	(8.94110156422714)	(1.88005594862479)
7	agesq		-27751.0364917983 *
8			(-2.3810493397332)
9	agebyeduc			5624.61362960973
10				(0.518075029268155)
1.1	N	872	872	872
2.1	R2	0.114989391658086	0.119138652546052	0.115312617880932
.1	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.

15.4. 15.5 LOG-LINEAR AND LOG-LOG MODELS#

Levels model

ols.linear2 <- lm(earnings ~ age+education)
summ(ols.linear2, digits=3, robust = "HC1") 

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 56.455, p = 0.000
R² = 0.115
Adj. R² = 0.113 

Standard errors: Robust, type = HC1
-----------------------------------------------------------
                          Est.        S.E.   t val.       p
----------------- ------------ ----------- -------- -------
(Intercept)         -46875.360   11306.330   -4.146   0.000
age                    524.995     151.387    3.468   0.001
education             5811.367     641.533    9.059   0.000
-----------------------------------------------------------

# Log-linear model

ols.loglin2 <- lm(lnearnings ~ age+education)
summ(ols.loglin2, digits=3, robust = "HC1") 

MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 102.198, p = 0.000
R² = 0.190
Adj. R² = 0.189 

Standard errors: Robust, type = HC1
--------------------------------------------------
                     Est.    S.E.   t val.       p
----------------- ------- ------- -------- -------
(Intercept)         8.962   0.150   59.626   0.000
age                 0.008   0.002    3.832   0.000
education           0.101   0.009   11.683   0.000
--------------------------------------------------

# Log-log model

ols.loglog2 <- lm(lnearnings ~ lnage+education)
summ(ols.loglog2, digits=3, robust = "HC1")

MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 103.743, p = 0.000
R² = 0.193
Adj. R² = 0.191 

Standard errors: Robust, type = HC1
--------------------------------------------------
                     Est.    S.E.   t val.       p
----------------- ------- ------- -------- -------
(Intercept)         8.009   0.331   24.229   0.000
lnage               0.346   0.082    4.211   0.000
education           0.100   0.009   11.665   0.000
--------------------------------------------------

# Use package jtools which also needs huxtable for tables
# For some reason gives wrong results

export_summs(ols.linear2, ols.loglin2, ols.loglog2, scale = TRUE, 
             error_format = "({statistic})", robust = "HC1")

A huxtable: 12 × 4
	names	Model 1	Model 2	Model 3
	<chr>	<chr>	<chr>	<chr>
		Model 1	Model 2	Model 3
1	(Intercept)	56368.6938073394 ***	10.6911645176214 ***	10.6911645176214 ***
2		(34.3067991492708)	(512.201453131804)	(512.938183351903)
3	age	5604.8734461588 ***	0.0828323181429894 ***
4		(3.46789224399738)	(3.83168075340195)
5	education	16760.8009996877 ***	0.290043843291207 ***	0.289516927075926 ***
6		(9.05856467518862)	(11.6833123854303)	(11.6653455649129)
7	lnage			0.0891413931083605 ***
8				(4.21055308818822)
1.1	N	872	872	872
2.1	R2	0.114989391658086	0.190419488464102	0.192743410193262
.1	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.	All continuous predictors are mean-centered and scaled by 1 standard deviation. The outcome variable is in its original units. Standard errors are heteroskedasticity robust. * p < 0.001; p < 0.01; * p < 0.05.

15.5. 15.6 PREDICTION FROM LOG-LINEAR AND LOG-LOG MODELS#

Levels model

ols.linear <- lm(earnings ~ age+education)
linear.predict = predict(ols.linear)

# Retransformation bias in log-linear model

ols.loglin = lm(lnearnings ~ age+education)
summ(ols.loglin)
predict.log = predict(ols.loglin)
biased.predict = exp(predict.log)
rmse = summary(ols.loglin)$sigma
rmse
exp(rmse^2/2)
adjusted.predict = exp(rmse^2/2)*biased.predict
print(summary(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
print(cor(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))

MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 102.20, p = 0.00
R² = 0.19
Adj. R² = 0.19 

Standard errors:OLS
-----------------------------------------------
                    Est.   S.E.   t val.      p
----------------- ------ ------ -------- ------
(Intercept)         8.96   0.14    66.15   0.00
age                 0.01   0.00     3.96   0.00
education           0.10   0.01    13.88   0.00
-----------------------------------------------

0.616371357189044

1.20919738941059

    earnings      linear.predict   biased.predict  adjusted.predict
 Min.   :  4000   Min.   :-33750   Min.   : 9471   Min.   : 11452  
 1st Qu.: 29000   1st Qu.: 46486   1st Qu.:37182   1st Qu.: 44960  
 Median : 44200   Median : 54361   Median :41858   Median : 50615  
 Mean   : 56369   Mean   : 56369   Mean   :45838   Mean   : 55427  
 3rd Qu.: 64250   3rd Qu.: 67631   3rd Qu.:53280   3rd Qu.: 64426  
 Max.   :504000   Max.   :102427   Max.   :95043   Max.   :114925  

                  earnings linear.predict biased.predict adjusted.predict
earnings         1.0000000      0.3391009      0.3631965        0.3631965
linear.predict   0.3391009      1.0000000      0.9621589        0.9621589
biased.predict   0.3631965      0.9621589      1.0000000        1.0000000
adjusted.predict 0.3631965      0.9621589      1.0000000        1.0000000

# Retransformation bias in log-log model

ols.loglog <- lm(lnearnings ~ lnage+education)
summ(ols.loglog)
predict.loglog = predict(ols.loglog)
biased.predict = exp(predict.loglog)
rmse = summary(ols.loglog)$sigma
rmse
exp(rmse^2/2)
adjusted.predict = exp(rmse^2/2)*biased.predict
print(summary(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
print(cor(cbind(earnings, linear.predict, biased.predict, adjusted.predict))) 

MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression 

MODEL FIT:
F(2,869) = 103.74, p = 0.00
R² = 0.19
Adj. R² = 0.19 

Standard errors:OLS
-----------------------------------------------
                    Est.   S.E.   t val.      p
----------------- ------ ------ -------- ------
(Intercept)         8.01   0.32    24.88   0.00
lnage               0.35   0.08     4.27   0.00
education           0.10   0.01    13.88   0.00
-----------------------------------------------

0.61548606648466

1.20853822286961

    earnings      linear.predict   biased.predict  adjusted.predict
 Min.   :  4000   Min.   :-33750   Min.   : 9152   Min.   : 11060  
 1st Qu.: 29000   1st Qu.: 46486   1st Qu.:37401   1st Qu.: 45201  
 Median : 44200   Median : 54361   Median :41977   Median : 50731  
 Mean   : 56369   Mean   : 56369   Mean   :45861   Mean   : 55425  
 3rd Qu.: 64250   3rd Qu.: 67631   3rd Qu.:53767   3rd Qu.: 64980  
 Max.   :504000   Max.   :102427   Max.   :93791   Max.   :113350  

                  earnings linear.predict biased.predict adjusted.predict
earnings         1.0000000      0.3391009      0.3659330        0.3659330
linear.predict   0.3391009      1.0000000      0.9620711        0.9620711
biased.predict   0.3659330      0.9620711      1.0000000        1.0000000
adjusted.predict 0.3659330      0.9620711      1.0000000        1.0000000

15.6. 15.7 MODELS WITH A MIX OF REGRESSOR TYPES#

Linear dependent variable

ols.linear <- lm(earnings ~ gender+age+agesq+education+dself+dgovt+lnhours)
summ(ols.linear, digits=3, robust = "HC1") 
linear.predict = predict(ols.linear)

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(7,864) = 31.939, p = 0.000
R² = 0.206
Adj. R² = 0.199 

Standard errors: Robust, type = HC1
------------------------------------------------------------
                           Est.        S.E.   t val.       p
----------------- ------------- ----------- -------- -------
(Intercept)         -356631.619   66302.731   -5.379   0.000
gender               -14330.015    2696.808   -5.314   0.000
age                    3282.868    1064.806    3.083   0.002
agesq                   -31.578      12.214   -2.585   0.010
education              5399.360     609.862    8.853   0.000
dself                  9360.500    8711.602    1.074   0.283
dgovt                  -291.136    2914.162   -0.100   0.920
lnhours               69963.734   16103.000    4.345   0.000
------------------------------------------------------------

# Log-transformed dependent variable

ols.log <- lm(lnearnings ~ gender+age+agesq+education+dself+dgovt+lnhours)
summ(ols.log, digits=3, robust = "HC1") 
predict.log = predict(ols.log)
biased.predict = exp(predict.log)
rmse = summary(ols.log)$sigma
rmse
exp(rmse^2/2)
adjusted.predict = exp(rmse^2/2)*biased.predict
print(summary(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))
print(cor(cbind(earnings, linear.predict, biased.predict, adjusted.predict)))

MODEL INFO:
Observations: 872
Dependent Variable: lnearnings
Type: OLS linear regression 

MODEL FIT:
F(7,864) = 48.314, p = 0.000
R² = 0.281
Adj. R² = 0.275 

Standard errors: Robust, type = HC1
---------------------------------------------------
                      Est.    S.E.   t val.       p
----------------- -------- ------- -------- -------
(Intercept)          4.459   0.648    6.885   0.000
gender              -0.193   0.039   -4.881   0.000
age                  0.056   0.016    3.550   0.000
agesq               -0.001   0.000   -2.992   0.003
education            0.093   0.008   11.168   0.000
dself               -0.118   0.101   -1.166   0.244
dgovt                0.070   0.045    1.534   0.125
lnhours              0.975   0.142    6.882   0.000
---------------------------------------------------

0.582417015682795

1.18483649963775

    earnings      linear.predict   biased.predict   adjusted.predict
 Min.   :  4000   Min.   :-36208   Min.   :  8738   Min.   : 10353  
 1st Qu.: 29000   1st Qu.: 40778   1st Qu.: 35132   1st Qu.: 41626  
 Median : 44200   Median : 53840   Median : 42444   Median : 50290  
 Mean   : 56369   Mean   : 56369   Mean   : 46906   Mean   : 55576  
 3rd Qu.: 64250   3rd Qu.: 71050   3rd Qu.: 55612   3rd Qu.: 65891  
 Max.   :504000   Max.   :134977   Max.   :127693   Max.   :151295  

                  earnings linear.predict biased.predict adjusted.predict
earnings         1.0000000      0.4533973      0.4739243        0.4739243
linear.predict   0.4533973      1.0000000      0.9352312        0.9352312
biased.predict   0.4739243      0.9352312      1.0000000        1.0000000
adjusted.predict 0.4739243      0.9352312      1.0000000        1.0000000

# Levels dependent variable and standardized coefficients

summ(ols.linear <- lm(earnings ~ gender+age+agesq+education+dself+dgovt+lnhours))
print(lm.beta(ols.linear))

MODEL INFO:
Observations: 872
Dependent Variable: earnings
Type: OLS linear regression 

MODEL FIT:
F(7,864) = 31.94, p = 0.00
R² = 0.21
Adj. R² = 0.20 

Standard errors:OLS
---------------------------------------------------------
                          Est.       S.E.   t val.      p
----------------- ------------ ---------- -------- ------
(Intercept)         -356631.62   44767.35    -7.97   0.00
gender               -14330.02    3222.16    -4.45   0.00
age                    3282.87    1224.05     2.68   0.01
agesq                   -31.58      13.97    -2.26   0.02
education              5399.36     550.41     9.81   0.00
dself                  9360.50    5580.50     1.68   0.09
dgovt                  -291.14    4443.20    -0.07   0.95
lnhours               69963.73    9785.19     7.15   0.00
---------------------------------------------------------

      gender          age        agesq    education        dself        dgovt      lnhours 
-0.137925992  0.680332423 -0.573556814  0.302284707  0.052184230 -0.002014001  0.223770013