13. CHAPTER 13. CASE STUDIES FOR MULTIPLE REGRESSION#
SET UP
library(foreign) # to open stata.dta files
library(psych) # for better sammary of descriptive statistics
library(repr) # to combine graphs with adjustable plot dimensions
options(repr.plot.width = 12, repr.plot.height = 6) # Plot dimensions (in inches)
options(width = 150) # To increase character width of printed output
13.1. 13.1 Case Study 1: School Performance Index#
13.1.1. California Academic Performance Index#
df <- read.dta("Dataset/AED_API99.DTA")
attach(df)
df_desc<-describe(df)
print(df_desc)
vars n mean sd median trimmed mad min max range skew kurtosis se
sch_code 1 807 2878752.53 1428937.22 3033578.00 2871983.47 1629414.46 130054.00 5838305.0 5708251.00 0.02 -0.80 50300.97
api99 2 807 620.94 107.44 620.00 619.67 112.68 355.00 966.0 611.00 0.12 -0.40 3.78
edparent 3 807 12.84 1.23 12.88 12.85 1.22 9.62 16.0 6.38 -0.09 -0.33 0.04
meals 4 807 21.92 23.67 14.00 18.48 20.76 0.00 98.0 98.00 0.96 -0.06 0.83
englearn 5 807 14.00 12.79 10.00 12.24 11.86 0.00 66.0 66.00 1.17 1.06 0.45
yearround 6 807 0.02 0.15 0.00 0.00 0.00 0.00 1.0 1.00 6.27 37.40 0.01
credteach 7 807 89.84 8.44 92.00 90.80 7.41 33.00 100.0 67.00 -1.38 3.60 0.30
emerteach 8 807 10.46 8.21 9.00 9.59 7.41 0.00 56.0 56.00 1.45 3.78 0.29
avg_ed_raw 9 807 2.92 0.62 2.94 2.93 0.61 1.31 4.5 3.19 -0.09 -0.33 0.02
pct_af_am 10 807 7.24 11.11 3.00 4.83 2.97 0.00 79.0 79.00 3.26 13.45 0.39
pct_am_ind 11 807 1.15 3.21 1.00 0.64 1.48 0.00 63.0 63.00 12.60 212.38 0.11
pct_asian 12 807 9.27 12.20 5.00 6.61 5.93 0.00 70.0 70.00 2.32 5.90 0.43
pct_fil 13 807 2.59 4.68 1.00 1.58 1.48 0.00 44.0 44.00 4.62 28.86 0.16
pct_hisp 14 807 33.27 24.87 27.00 30.42 23.72 1.00 99.0 98.00 0.84 -0.23 0.88
pct_pac 15 807 0.52 0.96 0.00 0.32 0.00 0.00 10.0 10.00 3.49 20.02 0.03
pct_white 16 807 45.71 27.90 46.00 45.88 37.06 0.00 97.0 97.00 -0.04 -1.26 0.98
mobility 17 757 13.78 18.21 9.00 9.71 5.93 0.00 100.0 100.00 3.28 11.38 0.66
13.1.2. Univariate Analysis#
par(mfrow=c(1,2))
hist(api99, prob = TRUE, xlim=c(200,1000), ylim = c(0, 0.004),
breaks=c(24*0:42), main="Histogram and Density Estimate", xlab="Academic Performance Index" )
lines(density(api99), col = 4, lwd = 2)
plot(density(api99), main="Density Estimate and Normal Density", xlab="Academic Performance Index",
ylim = c(0, 0.004), xlim=c(200,1000), type="l", lwd=2)
x<-seq(0, 1, by=0.02) # need x here and not some other name
curve(dnorm(x,mean(api99),sd(api99)), add=TRUE)
13.1.3. Bivariate Analysis#
library(jtools)
summ(ols_api<- lm(api99 ~ edparent), digits=4)
MODEL INFO:
Observations: 807
Dependent Variable: api99
Type: OLS linear regression
MODEL FIT:
F(1,805) = 4072.8500, p = 0.0000
R² = 0.8350
Adj. R² = 0.8348
Standard errors:OLS
-----------------------------------------------------------
Est. S.E. t val. p
----------------- ----------- --------- ---------- --------
(Intercept) -400.3137 16.0761 -24.9011 0.0000
edparent 79.5292 1.2462 63.8189 0.0000
-----------------------------------------------------------
plot(edparent, api99,
xlab = "Parents' years of schooling",
ylab = "Academic Performance Index",
pch = 19,
main = "School Performance Index")
abline(ols_api, lwd = 3, col = 4)
lines(lowess(edparent, api99), lwd = 3, lty = 2, col = 2)
legend("topleft",
legend = c("Actual value", "OLS regression", "Nonparametric fit"),
lty = c(NA, 1, 2),
pch = c(19, NA, NA),
col = c("black", 4, 2),
lwd = c(NA, 3, 3),
bty = "o")
13.1.4. Correlation#
print(cor(cbind(api99, edparent, meals,englearn,yearround,credteach,emerteach)))
api99 edparent meals englearn yearround credteach emerteach
api99 1.0000000 0.9137660 -0.5419036 -0.6561381 -0.19355145 0.4592823 -0.45331322
edparent 0.9137660 1.0000000 -0.6046625 -0.7071808 -0.24685800 0.3967105 -0.37020234
meals -0.5419036 -0.6046625 1.0000000 0.5598149 0.29042515 -0.2729162 0.21499672
englearn -0.6561381 -0.7071808 0.5598149 1.0000000 0.22315910 -0.2606307 0.20265116
yearround -0.1935514 -0.2468580 0.2904251 0.2231591 1.00000000 -0.1811366 0.09374574
credteach 0.4592823 0.3967105 -0.2729162 -0.2606307 -0.18113659 1.0000000 -0.81944305
emerteach -0.4533132 -0.3702023 0.2149967 0.2026512 0.09374574 -0.8194430 1.00000000
13.1.5. Multiple regression#
summ(ols_api_multi<-lm(api99~edparent + meals+englearn+yearround+credteach+emerteach), digits=3)
MODEL INFO:
Observations: 807
Dependent Variable: api99
Type: OLS linear regression
MODEL FIT:
F(6,800) = 771.411, p = 0.000
R² = 0.853
Adj. R² = 0.852
Standard errors:OLS
------------------------------------------------------
Est. S.E. t val. p
----------------- ---------- -------- -------- -------
(Intercept) -345.328 39.954 -8.643 0.000
edparent 73.942 1.883 39.273 0.000
meals 0.079 0.081 0.980 0.327
englearn -0.358 0.167 -2.145 0.032
yearround 25.956 10.185 2.548 0.011
credteach 0.387 0.311 1.247 0.213
emerteach -1.470 0.315 -4.672 0.000
------------------------------------------------------
13.2. 13.2 Cobb-Douglas Production Function#
df <- read.dta("Dataset/AED_COBBDOUGLAS.DTA")
attach(df)
print(df)
year q k l lnq lnk lnl
1 1899 100 100 100 4.605170 4.605170 4.605170
2 1900 101 107 105 4.615120 4.672829 4.653960
3 1901 112 114 110 4.718499 4.736198 4.700480
4 1902 122 122 118 4.804021 4.804021 4.770685
5 1903 124 131 123 4.820282 4.875197 4.812184
6 1904 122 138 116 4.804021 4.927254 4.753590
7 1905 143 149 125 4.962845 5.003946 4.828314
8 1906 152 163 133 5.023880 5.093750 4.890349
9 1907 151 176 138 5.017280 5.170484 4.927254
10 1908 126 185 121 4.836282 5.220356 4.795791
11 1909 155 198 140 5.043425 5.288267 4.941642
12 1910 159 208 144 5.068904 5.337538 4.969813
13 1911 153 216 145 5.030438 5.375278 4.976734
14 1912 177 226 152 5.176150 5.420535 5.023880
15 1913 184 236 154 5.214936 5.463832 5.036952
16 1914 169 244 149 5.129899 5.497168 5.003946
17 1915 189 266 154 5.241747 5.583496 5.036952
18 1916 225 298 182 5.416101 5.697093 5.204007
19 1917 227 335 196 5.424950 5.814130 5.278115
20 1918 223 366 200 5.407172 5.902633 5.298317
21 1919 218 387 193 5.384495 5.958425 5.262690
22 1920 231 407 193 5.442418 6.008813 5.262690
23 1921 179 417 147 5.187386 6.033086 4.990433
24 1922 240 431 161 5.480639 6.066108 5.081404
print(describe(df))
vars n mean sd median trimmed mad min max range skew kurtosis se
year 1 24 1910.50 7.07 1910.50 1910.50 8.90 1899.00 1922.00 23.00 0.00 -1.35 1.44
q 2 24 165.92 43.75 157.00 165.50 48.18 100.00 240.00 140.00 0.21 -1.28 8.93
k 3 24 234.17 106.27 212.00 228.25 114.90 100.00 431.00 331.00 0.52 -1.13 21.69
l 4 24 145.79 29.62 144.50 144.90 30.39 100.00 200.00 100.00 0.40 -0.95 6.05
lnq 5 24 5.08 0.27 5.06 5.09 0.34 4.61 5.48 0.88 -0.12 -1.22 0.05
lnk 6 24 5.36 0.46 5.36 5.36 0.58 4.61 6.07 1.46 0.03 -1.30 0.09
lnl 7 24 4.96 0.20 4.97 4.96 0.23 4.61 5.30 0.69 0.10 -1.02 0.04
summ(ols_cd<-lm(lnq~lnk+lnl),digits=3)
MODEL INFO:
Observations: 24
Dependent Variable: lnq
Type: OLS linear regression
MODEL FIT:
F(2,21) = 236.122, p = 0.000
R² = 0.957
Adj. R² = 0.953
Standard errors:OLS
---------------------------------------------------
Est. S.E. t val. p
----------------- -------- ------- -------- -------
(Intercept) -0.177 0.434 -0.408 0.687
lnk 0.233 0.064 3.668 0.001
lnl 0.807 0.145 5.565 0.000
---------------------------------------------------
suppressPackageStartupMessages(library(car))
print(linearHypothesis(ols_cd,c("lnk=0.25","lnl=0.75")))
Linear hypothesis test:
lnk = 0.25
lnl = 0.75
Model 1: restricted model
Model 2: lnq ~ lnk + lnl
Res.Df RSS Df Sum of Sq F Pr(>F)
1 23 0.071676
2 21 0.070982 2 0.00069384 0.1026 0.9029
#Test Constant Returns to Scale
print(linearHypothesis(ols_cd,c("lnl+lnk=1")))
Linear hypothesis test:
lnk + lnl = 1
Model 1: restricted model
Model 2: lnq ~ lnk + lnl
Res.Df RSS Df Sum of Sq F Pr(>F)
1 22 0.071643
2 21 0.070982 1 0.00066109 0.1956 0.6628