# 2 An Introduction to Linear Regression

## 2.1 Figure 2.1

Simple linear regression: fitted line and observation points, hypothetical data

``````set.seed(123)
# Simulate a simple linear model: y = 0.5 + 10 x + eps, eps ~ N(0, 1),
# then plot the observations with the fitted OLS line.
x <- rnorm(100) / 10
# rnorm(length(x)) draws one disturbance per observation; the original
# rnorm(x, ...) relied on the obscure "length(n) > 1" behavior of rnorm().
eps <- rnorm(length(x), mean = 0, sd = 1.0)
y <- 10 * x + 0.5 + eps
fm <- lm(y ~ x)
plot(x, y, pch = 16, cex = 0.4,
     main = "Figure 2.1 Simple linear regression: fitted line and observation points",  cex.main = 0.8)
abline(fm, col = "red")``````

## 2.2 2.1.3 Example

``````library(haven)
library(stargazer)
library(AER)``````

Individual Wages

``````# Individual wages data (Verbeek, Wages1): regress hourly wage on the
# male dummy.  Use `<-` for assignment, not `=`.
df <- read_dta("Data/Wages1.dta")
OLS1 <- lm(wage ~ male, data = df)
summary(OLS1)``````
``````##
## Call:
## lm(formula = wage ~ male, data = df)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -6.160 -2.102 -0.554  1.487 33.496
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  5.14692    0.08122   63.37   <2e-16 ***
## male         1.16610    0.11224   10.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.217 on 3292 degrees of freedom
## Multiple R-squared:  0.03175,    Adjusted R-squared:  0.03145
## F-statistic: 107.9 on 1 and 3292 DF,  p-value: < 2.2e-16``````

## 2.3 Table 2.1

Also covers Section 2.3.3 Example: Individual Wages (Continued)

``````# Table 2.1: compact text report of the wage equation
stargazer(OLS1,
          type = "text", no.space = TRUE, single.row = TRUE,
          title = "Table 2.1 OLS results wage equation")``````
``````##
## Table 2.1 OLS results wage equation
## ===============================================
##                         Dependent variable:
##                     ---------------------------
##                                wage
## -----------------------------------------------
## male                     1.166*** (0.112)
## Constant                 5.147*** (0.081)
## -----------------------------------------------
## Observations                   3,294
## R2                             0.032
## Residual Std. Error      3.217 (df = 3292)
## F Statistic          107.934*** (df = 1; 3292)
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01``````

## 2.4 2.5.2 Example

Individual Wages (Continued); Confidence interval

``````# Same wage equation, now reporting 95% confidence intervals
# instead of standard errors.
stargazer(OLS1,
          type = "text", no.space = TRUE, single.row = TRUE,
          ci = TRUE, ci.level = 0.95,
          keep.stat = c("n", "rsq", "ser"))``````
``````##
## ===============================================
##                         Dependent variable:
##                     ---------------------------
##                                wage
## -----------------------------------------------
## male                  1.166*** (0.946, 1.386)
## Constant              5.147*** (4.988, 5.306)
## -----------------------------------------------
## Observations                   3,294
## R2                             0.032
## Residual Std. Error      3.217 (df = 3292)
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01``````

## 2.5 Table 2.2

``````# Fit the extended wage equation, then summarise it.  The assignment is
# done on its own line: hiding `OLS2 <-` inside the summary() call is an
# easy-to-miss side effect (OLS2 is reused by the F-test below).
OLS2 <- lm(wage ~ male + school + exper, data = df)
summary(OLS2)``````
``````##
## Call:
## lm(formula = wage ~ male + school + exper, data = df)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -7.654 -1.967 -0.457  1.444 34.194
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.38002    0.46498  -7.269 4.50e-13 ***
## male         1.34437    0.10768  12.485  < 2e-16 ***
## school       0.63880    0.03280  19.478  < 2e-16 ***
## exper        0.12483    0.02376   5.253 1.59e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 3290 degrees of freedom
## Multiple R-squared:  0.1326, Adjusted R-squared:  0.1318
## F-statistic: 167.6 on 3 and 3290 DF,  p-value: < 2.2e-16``````

F-test

``linearHypothesis(OLS2, c("school = 0", "exper = 0"))``
``````## Linear hypothesis test
##
## Hypothesis:
## school = 0
## exper = 0
##
## Model 1: restricted model
## Model 2: wage ~ male + school + exper
##
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)
## 1   3292 34077
## 2   3290 30528  2      3549 191.24 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1``````

## 2.6 Figure 2.2

Monte Carlo Simulation

``````set.seed(123)
# Data-generating process: y = 0 + 1 * x + eps, with eps = 0.5 * w.
w <- rnorm(100)
x <- rnorm(100)
eps <- 0.5 * w
y <- 0 + 1 * x + eps
# Bootstrap the OLS slope: resample observation PAIRS with replacement.
# The original drew `ix` but never used it, and sampled without
# replacement (a permutation), so every replication refit the identical
# regression and all 1000 estimates were the same number.
b <- numeric(1000)
for (i in seq_len(1000)) {
  ix <- sample(1:100, 100, replace = TRUE)
  b[i] <- coef(lm(y[ix] ~ x[ix]))[2]
}

plot(density(b),
main="Figure 2.2 Histogram of 1000 OLS estimates with normal density",  cex.main=0.8, ylab = "", xlab="b")``````

## 2.7 Table 2.3

CAPM regression (without intercept)

``````df <- read_dta("Data/Capm5.dta")

# Industry excess-return regressions through the origin (no intercept).
Food         <- lm(foodrf  ~ 0 + rmrf, data = df)
Durables     <- lm(durblrf ~ 0 + rmrf, data = df)
Construction <- lm(cnstrrf ~ 0 + rmrf, data = df)

stargazer(Food, Durables, Construction,
          column.labels = c("Food", "Durables", "Construction"),
          keep.stat = c("N", "rsq", "ser"),
          no.space = TRUE, type = "text",
          title = "Table 2.3 CAPM regressions (without intercept)")``````
``````##
## Table 2.3 CAPM regressions (without intercept)
## =============================================================
##                                     Dependent variable:
##                                ------------------------------
##                                 foodrf  durblrf    cnstrrf
##                                  Food   Durables Construction
##                                  (1)      (2)        (3)
## -------------------------------------------------------------
## rmrf                           0.755*** 1.066***   1.174***
##                                (0.025)  (0.027)    (0.025)
## -------------------------------------------------------------
## Observations                     660      660        660
## R2                              0.590    0.706      0.774
## Residual Std. Error (df = 659)  2.812    3.072      2.831
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01``````

## 2.8 Table 2.4

CAPM regression (with intercept)

``````df <- read_dta("Data/Capm5.dta")

# lm() includes an intercept by default, so all three formulas are
# written identically.  (The original mixed the equivalent spellings
# "1 + rmrf", "rmrf + 1" and "rmrf", which obscured that the models
# share one specification.)
Food         <- lm(foodrf  ~ rmrf, data = df)
Durables     <- lm(durblrf ~ rmrf, data = df)
Construction <- lm(cnstrrf ~ rmrf, data = df)

stargazer(Food, Durables, Construction,
          column.labels = c("Food", "Durables", "Construction"),
          keep.stat = c("N", "rsq", "ser"),
          no.space = TRUE, type = "text",
          title = "Table 2.4 CAPM regressions (with intercept)")``````
``````##
## Table 2.4 CAPM regressions (with intercept)
## =============================================================
##                                     Dependent variable:
##                                ------------------------------
##                                 foodrf  durblrf    cnstrrf
##                                  Food   Durables Construction
##                                  (1)      (2)        (3)
## -------------------------------------------------------------
## rmrf                           0.747*** 1.069***   1.174***
##                                (0.025)  (0.027)    (0.025)
## Constant                       0.320***  -0.120     -0.027
##                                (0.110)  (0.120)    (0.111)
## -------------------------------------------------------------
## Observations                     660      660        660
## R2                              0.585    0.705      0.772
## Residual Std. Error (df = 658)  2.796    3.072      2.833
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01``````

## 2.9 Table 2.5

CAPM regressions (with intercept and January dummy)

``````df <- read_dta("Data/Capm5.dta")

# Intercept is implicit in lm(); the three formulas are written
# identically (the original mixed "1 + rmrf + jan", "rmrf + jan + 1"
# and "rmrf + jan", which are all the same model).
Food         <- lm(foodrf  ~ rmrf + jan, data = df)
Durables     <- lm(durblrf ~ rmrf + jan, data = df)
Construction <- lm(cnstrrf ~ rmrf + jan, data = df)

stargazer(Food, Durables, Construction,
          column.labels = c("Food", "Durables", "Construction"),
          keep.stat = c("N", "rsq", "ser"),
          no.space = TRUE, type = "text",
          covariate.labels = c("excess market return", "January dummy"),
          title = "Table 2.5 CAPM regressions (with intercept and January dummy)")``````
``````##
## Table 2.5 CAPM regressions (with intercept and January dummy)
## =============================================================
##                                     Dependent variable:
##                                ------------------------------
##                                 foodrf  durblrf    cnstrrf
##                                  Food   Durables Construction
##                                  (1)      (2)        (3)
## -------------------------------------------------------------
## excess market return           0.749*** 1.069***   1.173***
##                                (0.024)  (0.027)    (0.025)
## January dummy                  -0.971**  0.081      0.605
##                                (0.393)  (0.433)    (0.399)
## Constant                       0.400***  -0.126     -0.077
##                                (0.114)  (0.126)    (0.116)
## -------------------------------------------------------------
## Observations                     660      660        660
## R2                              0.589    0.705      0.773
## Residual Std. Error (df = 657)  2.786    3.074      2.831
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01``````

## 2.10 Table 2.6

CAPM regression (with intercept) Madoff’s returns

``````df <- read_dta("Data/madoff.dta")
# Market-model regression of Madoff feeder-fund excess returns on the
# market excess return.  (Local name avoids a dot, which is reserved by
# convention for S3 method dispatch.)
capm_madoff <- lm(fslrf ~ rmrf, data = df)
summary(capm_madoff)``````
``````##
## Call:
## lm(formula = fslrf ~ rmrf, data = df)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -1.34773 -0.48005 -0.08337  0.38865  2.97276
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.50495    0.04570  11.049  < 2e-16 ***
## rmrf         0.04089    0.01072   3.813  0.00018 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6658 on 213 degrees of freedom
## Multiple R-squared:  0.06388,    Adjusted R-squared:  0.05949
## F-statistic: 14.54 on 1 and 213 DF,  p-value: 0.0001801``````

## 2.11 Table 2.7

Alternative specifications with dummy variables

``````df <- read_dta("Data/Wages1.dta")
# female is the complement of the male dummy.  as.numeric(male == 0)
# matches ifelse(male == 0, 1, 0) exactly (including NA propagation).
# The original lines contained "\$" -- a markdown-escape artifact that
# is not valid R -- in place of "$".
df$female <- as.numeric(df$male == 0)

OLSm <- lm(wage ~ male, data = df)
OLSf <- lm(wage ~ female, data = df)
OLS  <- lm(wage ~ 0 + male + female, data = df)
# Note: the original title string ended with a stray ")".
stargazer(OLSm, OLSf, OLS, type = "text", keep.stat = c("rsq"),
          column.labels = c("OLSm", "OLSf", "OLS"),
          title = "Table 2.7 Alternative specifications with dummy variables")``````
``````##
## Table 2.7 Alternative specifications with dummy variables
## ======================================
##               Dependent variable:
##          -----------------------------
##                      wage
##            OLSm      OLSf       OLS
##             (1)       (2)       (3)
## --------------------------------------
## male     1.166***            6.313***
##           (0.112)             (0.077)
##
## female             -1.166*** 5.147***
##                     (0.112)   (0.081)
##
## Constant 5.147***  6.313***
##           (0.081)   (0.077)
##
## --------------------------------------
## R2         0.032     0.032     0.764
## ======================================
## Note:      *p<0.1; **p<0.05; ***p<0.01``````

## 2.12 Figure 2.3

The impact of estimating with and without an outlying observation

``````set.seed(123)
# 50 observations from y = 0.5 + x + eps; any point with x >= 5.9 is
# forced down to 0.5 to act as an outlier.  fm is fit on the clean y,
# fm1 on the contaminated y1.
x <- runif(n = 50, min = 0, max = 6)
# One disturbance per observation; the original rnorm(x, ...) relied on
# the obscure "length(n) > 1" behavior of rnorm().
eps <- rnorm(length(x), mean = 0, sd = 1.0)
y <- x + 0.5 + eps
y1 <- ifelse(x < 5.9, y, 0.5)
fm <- lm(y ~ x)
fm1 <- lm(y1 ~ x)
plot(x, y1, pch = 16, cex = 0.4, ylab = "",
     main = "Figure 2.3 The impact of estimating with and without an outlying observation",  cex.main = 0.8)
abline(fm, col = "red")
abline(fm1, col = "blue")``````