2 An Introduction to Linear Regression

2.1 Figure 2.1

Simple linear regression: fitted line and observation points, hypothetical data

# Figure 2.1: simulate 100 observations from y = 0.5 + 10 * x + noise,
# fit the regression by OLS and draw the fitted line over the scatter.
set.seed(123)
x <- rnorm(100) / 10
# One standard-normal disturbance per observation of x.
eps <- rnorm(length(x), mean = 0, sd = 1)
y <- 0.5 + 10 * x + eps
fm <- lm(y ~ x)
plot(
  x, y,
  pch = 16,
  cex = 0.4,
  cex.main = 0.8,
  main = "Figure 2.1 Simple linear regression: fitted line and observation points"
)
# Overlay the fitted regression line.
abline(fm, col = "red")

2.2 2.1.3 Example

Load Libraries

library(haven)
library(stargazer)
library(AER)

Individual Wages

# Load the individual wage data and regress the wage on the male dummy only.
df <- read_dta("Data/Wages1.dta")
OLS1 <- lm(wage ~ male, data = df)
# Print the coefficient table and fit statistics.
summary(OLS1)

## 
## Call:
## lm(formula = wage ~ male, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.160 -2.102 -0.554  1.487 33.496 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.14692    0.08122   63.37   <2e-16 ***
## male         1.16610    0.11224   10.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.217 on 3292 degrees of freedom
## Multiple R-squared:  0.03175,    Adjusted R-squared:  0.03145 
## F-statistic: 107.9 on 1 and 3292 DF,  p-value: < 2.2e-16

2.3 Table 2.1

Also Section 2.3.3, Example: Individual Wages (Continued)

# Table 2.1: text rendering of the wage regression, with each coefficient
# and its standard error on a single row.
stargazer(
  OLS1,
  type = "text",
  title = "Table 2.1 OLS results wage equation",
  single.row = TRUE,
  no.space = TRUE
)

## 
## Table 2.1 OLS results wage equation
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                wage            
## -----------------------------------------------
## male                     1.166*** (0.112)      
## Constant                 5.147*** (0.081)      
## -----------------------------------------------
## Observations                   3,294           
## R2                             0.032           
## Adjusted R2                    0.031           
## Residual Std. Error      3.217 (df = 3292)     
## F Statistic          107.934*** (df = 1; 3292) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

2.4 2.5.2 Example

Individual Wages (Continued); Confidence interval

# Same regression, but report 95% confidence intervals next to the estimates
# and keep only the number of observations, R2 and residual standard error.
stargazer(
  OLS1,
  type = "text",
  ci = TRUE,
  ci.level = 0.95,
  single.row = TRUE,
  keep.stat = c("n", "rsq", "ser"),
  no.space = TRUE
)

## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                wage            
## -----------------------------------------------
## male                  1.166*** (0.946, 1.386)  
## Constant              5.147*** (4.988, 5.306)  
## -----------------------------------------------
## Observations                   3,294           
## R2                             0.032           
## Residual Std. Error      3.217 (df = 3292)     
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

2.5 Table 2.2

# Table 2.2: extend the wage equation with schooling and experience.
OLS2 <- lm(wage ~ male + school + exper, data = df)
summary(OLS2)

## 
## Call:
## lm(formula = wage ~ male + school + exper, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.654 -1.967 -0.457  1.444 34.194 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.38002    0.46498  -7.269 4.50e-13 ***
## male         1.34437    0.10768  12.485  < 2e-16 ***
## school       0.63880    0.03280  19.478  < 2e-16 ***
## exper        0.12483    0.02376   5.253 1.59e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.046 on 3290 degrees of freedom
## Multiple R-squared:  0.1326, Adjusted R-squared:  0.1318 
## F-statistic: 167.6 on 3 and 3290 DF,  p-value: < 2.2e-16

F-test

# Joint F-test that the coefficients on school and exper are both zero.
linearHypothesis(
  model = OLS2,
  hypothesis.matrix = c("school = 0", "exper =0")
)

## Linear hypothesis test
## 
## Hypothesis:
## school = 0
## exper = 0
## 
## Model 1: restricted model
## Model 2: wage ~ male + school + exper
## 
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   3292 34077                                  
## 2   3290 30528  2      3549 191.24 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.6 Figure 2.2

Monte Carlo Simulation

# Figure 2.2: Monte Carlo distribution of the OLS slope estimator.
#
# The regressor x is drawn once and held fixed. Every replication draws a
# fresh error vector, rebuilds y = 0 + 1 * x + eps and re-estimates the slope,
# so b collects 1000 different estimates of the true slope 1.
# (The earlier version refitted the very same y ~ x in every pass - the
# resampled index was never used - so all 1000 entries of b were identical.)
set.seed(123)
n_obs <- 100
n_rep <- 1000
x <- rnorm(n_obs)
b <- numeric(n_rep)
for (i in seq_len(n_rep)) {
  eps <- 0.5 * rnorm(n_obs)
  y <- 0 + 1 * x + eps
  # Extract the slope by name rather than via partial matching on `$coef`.
  b[i] <- coef(lm(y ~ x))[["x"]]
}

# Histogram on the density scale so the normal curve can be overlaid,
# as the figure title promises.
hist(b,
     breaks = 30, freq = FALSE,
     main = "Figure 2.2 Histogram of 1000 OLS estimates with normal density",
     cex.main = 0.8, ylab = "", xlab = "b")
b_grid <- seq(min(b), max(b), length.out = 200)
# Normal density with the simulated mean and standard deviation of b.
lines(b_grid, dnorm(b_grid, mean = mean(b), sd = sd(b)), col = "red")

2.7 Table 2.3

CAPM regression (without intercept)

# Table 2.3: CAPM regressions of three industry portfolio excess returns on
# the market excess return, with the intercept suppressed (`0 +`).
df <- read_dta("Data/Capm5.dta")

Food <- lm(foodrf ~ 0 + rmrf, data = df)
Durables <- lm(durblrf ~ 0 + rmrf, data = df)
Construction <- lm(cnstrrf ~ 0 + rmrf, data = df)

stargazer(
  Food, Durables, Construction,
  type = "text",
  title = "Table 2.3 CAPM regressions (without intercept)",
  column.labels = c("Food", "Durables", "Construction"),
  keep.stat = c("N", "rsq", "ser"),
  no.space = TRUE
)

## 
## Table 2.3 CAPM regressions (without intercept)
## =============================================================
##                                     Dependent variable:      
##                                ------------------------------
##                                 foodrf  durblrf    cnstrrf   
##                                  Food   Durables Construction
##                                  (1)      (2)        (3)     
## -------------------------------------------------------------
## rmrf                           0.755*** 1.066***   1.174***  
##                                (0.025)  (0.027)    (0.025)   
## -------------------------------------------------------------
## Observations                     660      660        660     
## R2                              0.590    0.706      0.774    
## Residual Std. Error (df = 659)  2.812    3.072      2.831    
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01

2.8 Table 2.4

CAPM regression (with intercept)

# Table 2.4: the same three CAPM regressions, now including an intercept.
# The three formulas spell the intercept differently (`1 +`, `+ 1`, implicit).
df <- read_dta("Data/Capm5.dta")

Food <- lm(foodrf ~ 1 + rmrf, data = df)
Durables <- lm(durblrf ~ rmrf + 1, data = df)
Construction <- lm(cnstrrf ~ rmrf, data = df)

stargazer(
  Food, Durables, Construction,
  type = "text",
  title = "Table 2.4 CAPM regressions (with intercept)",
  column.labels = c("Food", "Durables", "Construction"),
  keep.stat = c("N", "rsq", "ser"),
  no.space = TRUE
)

## 
## Table 2.4 CAPM regressions (with intercept)
## =============================================================
##                                     Dependent variable:      
##                                ------------------------------
##                                 foodrf  durblrf    cnstrrf   
##                                  Food   Durables Construction
##                                  (1)      (2)        (3)     
## -------------------------------------------------------------
## rmrf                           0.747*** 1.069***   1.174***  
##                                (0.025)  (0.027)    (0.025)   
## Constant                       0.320***  -0.120     -0.027   
##                                (0.110)  (0.120)    (0.111)   
## -------------------------------------------------------------
## Observations                     660      660        660     
## R2                              0.585    0.705      0.772    
## Residual Std. Error (df = 658)  2.796    3.072      2.833    
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01

2.9 Table 2.5

CAPM regressions (with intercept and January dummy)

# Table 2.5: CAPM regressions with an intercept and the `jan` regressor,
# which the table labels as the January dummy.
df <- read_dta("Data/Capm5.dta")

Food <- lm(foodrf ~ 1 + rmrf + jan, data = df)
Durables <- lm(durblrf ~ rmrf + jan + 1, data = df)
Construction <- lm(cnstrrf ~ rmrf + jan, data = df)

stargazer(
  Food, Durables, Construction,
  type = "text",
  title = "Table 2.5 CAPM regressions (with intercept and January dummy)",
  column.labels = c("Food", "Durables", "Construction"),
  covariate.labels = c("excess market return", "January dummy"),
  keep.stat = c("N", "rsq", "ser"),
  no.space = TRUE
)

## 
## Table 2.5 CAPM regressions (with intercept and January dummy)
## =============================================================
##                                     Dependent variable:      
##                                ------------------------------
##                                 foodrf  durblrf    cnstrrf   
##                                  Food   Durables Construction
##                                  (1)      (2)        (3)     
## -------------------------------------------------------------
## excess market return           0.749*** 1.069***   1.173***  
##                                (0.024)  (0.027)    (0.025)   
## January dummy                  -0.971**  0.081      0.605    
##                                (0.393)  (0.433)    (0.399)   
## Constant                       0.400***  -0.126     -0.077   
##                                (0.114)  (0.126)    (0.116)   
## -------------------------------------------------------------
## Observations                     660      660        660     
## R2                              0.589    0.705      0.773    
## Residual Std. Error (df = 657)  2.786    3.074      2.831    
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01

2.10 Table 2.6

CAPM regression (with intercept): Madoff’s returns

# Table 2.6: regress the excess return series `fslrf` on the market excess
# return `rmrf`, intercept included, using the Madoff data file.
df <- read_dta("Data/madoff.dta")
table2.6 <- lm(fslrf ~ rmrf, data = df)
# Print the coefficient table and fit statistics.
summary(table2.6)

## 
## Call:
## lm(formula = fslrf ~ rmrf, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34773 -0.48005 -0.08337  0.38865  2.97276 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.50495    0.04570  11.049  < 2e-16 ***
## rmrf         0.04089    0.01072   3.813  0.00018 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6658 on 213 degrees of freedom
## Multiple R-squared:  0.06388,    Adjusted R-squared:  0.05949 
## F-statistic: 14.54 on 1 and 213 DF,  p-value: 0.0001801

2.11 Table 2.7

Alternative specifications with dummy variables

# Table 2.7: three equivalent ways of encoding the gender dummy in the
# wage equation - male dummy, female dummy, or both dummies without intercept.
df <- read_dta("Data/Wages1.dta")
# female is the complement of the male indicator (1 when male == 0).
df$female <- ifelse(df$male == 0, 1, 0)

OLSm <- lm(wage ~ male, data = df)
OLSf <- lm(wage ~ female, data = df)
# `0 +` drops the intercept so that both dummies can enter together.
# NOTE: for a model without intercept, summary.lm() computes R2 without
# centring the dependent variable, so the R2 of this column is not comparable
# with the other two.
OLS <- lm(wage ~ 0 + male + female, data = df)
# Title fixed: the original string ended in a stray ")".
stargazer(
  OLSm, OLSf, OLS,
  type = "text",
  keep.stat = c("rsq"),
  column.labels = c("OLSm", "OLSf", "OLS"),
  title = "Table 2.7 Alternative specifications with dummy variables"
)

## 
## Table 2.7 Alternative specifications with dummy variables)
## ======================================
##               Dependent variable:     
##          -----------------------------
##                      wage             
##            OLSm      OLSf       OLS   
##             (1)       (2)       (3)   
## --------------------------------------
## male     1.166***            6.313*** 
##           (0.112)             (0.077) 
##                                       
## female             -1.166*** 5.147*** 
##                     (0.112)   (0.081) 
##                                       
## Constant 5.147***  6.313***           
##           (0.081)   (0.077)           
##                                       
## --------------------------------------
## R2         0.032     0.032     0.764  
## ======================================
## Note:      *p<0.1; **p<0.05; ***p<0.01

2.12 Figure 2.3

The impact of estimating with and without an outlying observation

# Figure 2.3: compare the OLS line fitted to the simulated data with the line
# fitted after the response of every observation with x >= 5.9 is replaced
# by 0.5 (the outlying observation).
set.seed(123)
x <- runif(n = 50, min = 0, max = 6)
eps <- rnorm(length(x), mean = 0, sd = 1)
y <- 0.5 + x + eps
# Contaminated copy of the response: overwrite the far-right point(s).
y1 <- y
y1[x >= 5.9] <- 0.5
fm <- lm(y ~ x)
fm1 <- lm(y1 ~ x)
plot(
  x, y1,
  pch = 16,
  cex = 0.4,
  ylab = "",
  cex.main = 0.8,
  main = "Figure 2.3 The impact of estimating with and without an outlying observation"
)
# Red: fit on the clean response; blue: fit on the contaminated response.
abline(fm, col = "red")
abline(fm1, col = "blue")