6. CHAPTER 6. THE LEAST SQUARE ESTIMATOR#

SET UP

library(foreign) # to open stata.dta files
library(psych) # for better sammary of descriptive statistics
library(repr) # to combine graphs with adjustable plot dimensions
options(repr.plot.width = 12, repr.plot.height = 6) # Plot dimensions (in inches)
options(width = 150) # To increase character width of printed output

6.1. 6.2 EXAMPLES OF SAMPLING FROM A POPULATION#

rm(list=ls())  
df = read.dta(file = "Dataset/AED_GENERATEDDATA.DTA")
attach(df)
df_desc<-describe(df)
print(df_desc)
         vars n  mean   sd median trimmed  mad   min   max range skew kurtosis   se
x           1 5  3.00 1.58   3.00    3.00 1.48  1.00  5.00  4.00 0.00    -1.91 0.71
Eygivenx    2 5  7.00 3.16   7.00    7.00 2.97  3.00 11.00  8.00 0.00    -1.91 1.41
u           3 5 -1.03 1.75  -1.63   -1.03 1.29 -2.51  1.69  4.20 0.54    -1.66 0.78
y           4 5  5.97 1.90   4.69    5.97 0.29  4.49  8.61  4.12 0.40    -2.03 0.85

6.1.1. Table 6.1#

print(df.head<-head(df))
  x Eygivenx          u        y
1 1        3  1.6898891 4.689889
2 2        5 -0.3187171 4.681283
3 3        7 -2.5066674 4.493333
4 4        9 -1.6332803 7.366720
5 5       11 -2.3907640 8.609236

6.1.2. Figure 6.2: panel A#

summary(olsEyx <- lm(Eygivenx ~ x))

plot(x, y, xlab="Regressor x", ylab="Dependent variable y", pch=19, main="Population line E[y|x]=1+2x")
abline(olsEyx)
legend(1.5, 8, c("Actual",  "Fitted"), lty=c(-1,1), pch=c(19,-1), bty="o")
Warning message in summary.lm(olsEyx <- lm(Eygivenx ~ x)):
"essentially perfect fit: summary may be unreliable"
Call:
lm(formula = Eygivenx ~ x)

Residuals:
         1          2          3          4          5 
 7.134e-19  6.054e-16 -7.714e-16 -2.760e-16  4.414e-16 

Coefficients:
             Estimate Std. Error   t value Pr(>|t|)    
(Intercept) 1.000e+00  6.723e-16 1.487e+15   <2e-16 ***
x           2.000e+00  2.027e-16 9.867e+15   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.41e-16 on 3 degrees of freedom
Multiple R-squared:      1,	Adjusted R-squared:      1 
F-statistic: 9.736e+31 on 1 and 3 DF,  p-value: < 2.2e-16
_images/30061c749de1eb23cd7c325500f5540a09d124f8bd7ffc34f2a651df680871fe.png

6.1.3. Figure 6.2: panel B#

olsyx <- lm(y ~ x)
summary(olsyx)
plot(x, y, xlab="Regressor x", ylab="Dependent variable y",pch=19, main = "Regression line yhat=2.81+1.05x" )
abline(olsyx)
legend(1.5, 8, c("Actual",  "Fitted"), lty=c(-1,1), pch=c(19,-1), bty="o")
Call:
lm(formula = y ~ x)

Residuals:
      1       2       3       4       5 
 0.8266 -0.2344 -1.4748  0.3462  0.5363 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)   2.8109     1.1034   2.547   0.0841 .
x             1.0524     0.3327   3.163   0.0507 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.052 on 3 degrees of freedom
Multiple R-squared:  0.7693,	Adjusted R-squared:  0.6925 
F-statistic: 10.01 on 1 and 3 DF,  p-value: 0.05074
_images/8a2a62cc76d66a31f5ba7549dc2fcabbc3e8570bf47ee676462a0d380b69c07f.png

Three regressions from the same model

rm(list=ls())
set.seed(12345)
u = runif(30,min=0,max=1)
x1 = rnorm(30,3,1)
u1 = rnorm(30,0,2)
y1 = 1 + 2*x1 + u1
x2 = rnorm(30,3,1)
u2 = rnorm(30,0,2)
y2 = 1 + 2*x2 + u2
x3 = rnorm(30,3,1)
u3 = rnorm(30,0,2)
y3 = 1 + 2*x3 + u3
ols1 <- lm(y1 ~ x1)
summary(ols1)
ols2 <- lm(y2 ~ x2)
summary(ols2)
ols3 <- lm(y3 ~ x3)
summary(ols3)
Call:
lm(formula = y1 ~ x1)

Residuals:
   Min     1Q Median     3Q    Max 
-4.783 -1.612 -0.090  1.693  4.003 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.3789     1.3276  -0.285    0.777    
x1            2.5173     0.3806   6.614 3.56e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.442 on 28 degrees of freedom
Multiple R-squared:  0.6097,	Adjusted R-squared:  0.5958 
F-statistic: 43.75 on 1 and 28 DF,  p-value: 3.565e-07
Call:
lm(formula = y2 ~ x2)

Residuals:
   Min     1Q Median     3Q    Max 
-4.623 -1.528 -0.207  2.122  4.896 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.2469     1.5805   0.789 0.436775    
x2            2.0538     0.4584   4.480 0.000115 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.528 on 28 degrees of freedom
Multiple R-squared:  0.4176,	Adjusted R-squared:  0.3968 
F-statistic: 20.07 on 1 and 28 DF,  p-value: 0.0001146
Call:
lm(formula = y3 ~ x3)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.9398 -0.9904  0.1098  1.0261  3.8738 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)   2.7696     1.2548   2.207  0.03567 * 
x3            1.5345     0.4433   3.461  0.00174 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.927 on 28 degrees of freedom
Multiple R-squared:  0.2997,	Adjusted R-squared:  0.2747 
F-statistic: 11.98 on 1 and 28 DF,  p-value: 0.001742

6.1.4. Figure 6.3 - three panels#

plot(x1, y1, pch=19, main = "Sample 1" )
abline(ols1)
plot(x2, y2, pch=19, main = "Sample 1" )
abline(ols2)
plot(x3, y3, pch=19, main = "Sample 1" )
abline(ols3)
_images/403ae468c27b792d390ff2604ac4943f8d7436bfd4e430364c6b32738c843037.png _images/e95edf9d33cea17a8237f988ada4ee30ff903327dcd635ca65f5d4eee5847e94.png _images/794754ffeb9b916b5e84bb842a4f7837131ae4416090fb08550e379951778ece.png

Generated data: results of regression on 400 samples of size 30

df = read.dta(file = "Dataset/AED_GENERATEDREGRESSIONS.DTA")
attach(df)
df_desc<-describe(df)
print(df_desc)
head(df)
            vars   n  mean   sd median trimmed  mad   min   max range  skew kurtosis   se
slope          1 400  1.99 0.38   1.96    1.98 0.38  0.59  3.17  2.58  0.04     0.19 0.02
seslope        2 400  0.38 0.07   0.37    0.37 0.07  0.19  0.65  0.46  0.60     0.42 0.00
intercept      3 400  1.02 1.19   1.06    1.03 1.13 -2.46  5.67  8.13 -0.05     0.16 0.06
seintercept    4 400  1.19 0.23   1.17    1.18 0.22  0.63  2.00  1.37  0.60     0.47 0.01
rsquared       5 400  0.50 0.12   0.51    0.50 0.12  0.09  0.82  0.73 -0.40     0.17 0.01
numobs         6 400 30.00 0.00  30.00   30.00 0.00 30.00 30.00  0.00   NaN      NaN 0.00
A data.frame: 6 × 6
slope seslope intercept seintercept rsquared numobs
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2.011422 0.2921605 0.9837164 0.9583587 0.6286392 30
2 1.634170 0.3506421 1.7954644 1.1950867 0.4368498 30
3 2.094961 0.4156610 -0.0209372 1.3890935 0.4756781 30
4 1.972498 0.2285200 0.6164197 0.8136204 0.7268423 30
5 2.548882 0.6503775 -0.6771045 1.9628325 0.3542315 30
6 2.064110 0.3353717 1.2542309 0.9950362 0.5749864 30

6.1.5. Figure 6.4#

par(mfrow=c(1,2))

# Panel A
hist(slope, freq=FALSE, xlab="slope", main="Generated data: 400 slopes")
lines(density(slope)) 

# Panel B
hist(intercept, prob=TRUE, xlab="intercept", main="Generated data: 400 intercepts")
lines(density(intercept))

par(mfrow = c(1,1))
_images/bde5f1b409186499cfed0f6eb96a80a1ed1ffbeb25ae9599f26ba6383fc69c2d.png

Census data: results of regression on 400 samples of size 120

df = read.dta(file = "Dataset/AED_CENSUSREGRESSIONS.DTA")
attach(df)
df_desc<-describe(df)
print(df_desc)
head(df)
The following objects are masked from df (pos = 3):

    intercept, numobs, seintercept, seslope, slope
            vars   n   mean   sd median trimmed  mad    min    max range  skew kurtosis   se
slope          1 400  -0.01 0.01  -0.01   -0.01 0.01  -0.03   0.01  0.04 -0.18    -0.15 0.00
seslope        2 400   0.01 0.00   0.01    0.01 0.00   0.00   0.01  0.00 -0.11    -0.02 0.00
intercept      3 400   1.56 0.45   1.55    1.55 0.43   0.45   2.80  2.36  0.17    -0.14 0.02
seintercept    4 400   0.41 0.04   0.42    0.41 0.04   0.31   0.53  0.23 -0.12    -0.03 0.00
numobs         5 400 200.00 0.00 200.00  200.00 0.00 200.00 200.00  0.00   NaN      NaN 0.00
A data.frame: 6 × 5
slope seslope intercept seintercept numobs
<dbl> <dbl> <dbl> <dbl> <dbl>
1 -0.01326593 0.007451856 1.707933 0.4797598 200
2 -0.01021786 0.006931265 1.534607 0.4446451 200
3 -0.00404300 0.006063364 1.174318 0.3894077 200
4 -0.01864705 0.005981046 2.090928 0.3841836 200
5 -0.01285031 0.006403644 1.704155 0.4113336 200
6 -0.01622651 0.006049114 1.949338 0.3898587 200

6.1.6. Figure 6.5#

par(mfrow = c(1,2))

# Panel A
hist(slope, freq=FALSE, xlab="slope", main="Census data: 400 slopes")
lines(density(slope)) 

# Panel B
hist(intercept, prob=TRUE, xlab="intercept", main="Census data: 400 intercepts")
lines(density(intercept))
_images/ca748ef0164ba12e16311b01d6e02f53dec36b9096b0430c473332e6b4b8372b.png