Chapter 17. Nonparametric Density Estimation#

Figure 17.1. Histogram Estimate of Wage Density for Asian Women#

dat <- read.table("cps09mar.txt")
wage <- as.matrix(dat[,5]/(dat[,6]*dat[,7]))
asianfemale <- (dat[,11]==4)*(dat[,2]==1)
w <- wage[asianfemale==1]

m <- max(w)
br1 <- seq(0,m+10,10)
br2 <- seq(0,m+10,1)

# BinWidth = 10
hist(w,breaks=br1,xlim=c(0,57.7),ylim=c(0,.06),freq=FALSE,main="",xlab="wage",ylab="",cex.lab=1,cex.axis=1)
# BinWidth = 1
hist(w,breaks=br2,xlim=c(0,57.7),ylim=c(0,.06),freq=FALSE,main="",xlab="wage",ylab="",cex.lab=1,cex.axis=1)

Figure 17.2. Kernel Density Functions#

x <- seq(0,3,by=0.01)
x2 <- x^2
k1 <- (1/2/sqrt(3))*(x2 < 3)
k2 <- dnorm(x)
k3 <- (3/4/sqrt(5))*(1-x2/5)*(x2 < 5)
k4 <- (1/sqrt(6)-abs(x)/6)*(x2 < 6)
k5 <- (15/16/sqrt(7))*((1-x2/7)^2)*(x2<7)

plot(x,k1,type="l",lty=4,xaxs="i",yaxs="i",xlim=c(-.01,3),ylim=c(0,.41),ylab="",xlab="",bty="n",yaxt="n",cex.axis=.75)
lines(x,k2,type="l",lty=1,ylab="")
lines(x,k3,type="l",lty=2,ylab="")
lines(x,k4,type="l",lty=5,ylab="")
lines(x,k5,type="l",lty=6,ylab="")
abline(v=0)
legend("topright",inset=0,legend=c("Rectangular","Gaussian","Epanechnikov","Triangular","Biweight"),lty=c(4,1,2,5,6),cex=.75,bty="n")

Figure 17.3. Kernel Density Estimator of Wage Density for Asian Women#

dat <- read.table("cps09mar.txt")
wage <- as.matrix(dat[,5]/(dat[,6]*dat[,7]))
asianfemale <- (dat[,11]==4)*(dat[,2]==1)
w <- wage[asianfemale==1]

m <- max(w)
br1 <- seq(0,m+10,10)
b <- bw.SJ(w)
den <- density(w,bw=b)

hist(w,breaks=br1,xlim=c(0,57.7),ylim=c(0,.04),freq=FALSE,main="",xlab="wage",ylab="",cex.lab=.75,cex.axis=.75)
lines(den$x,den$y,type="l") 

Figure 17.4. Smoothing Bias#

w1 <- .75
mu1 <- 4
sig1 <- 1
mu2 <- 7
sig2 <- .75
n <- 200

w2 <- 1-w1
sd <- sqrt(w1*(mu1^2+sig1^2)+w2*(mu2^2+sig2^2)-(w1*mu1+w2*mu2)^2)
h <- .9*sd/(n^.2)

x <- seq(0,10,by=0.01)
f <- dnorm((x-mu1)/sig1)*w1/sig1 + dnorm((x-mu2)/sig2)*w2/sig2
f2 <- dnorm((x-mu1)/sig1)*(((x-mu1)/sig1)^2-1)*w1/(sig1^3) + dnorm((x-mu2)/sig2)*(((x-mu2)/sig2)^2-1)*w2/(sig2^3)

sig1h <- sqrt(sig1^2 + h^2)
sig2h <- sqrt(sig2^2 + h^2)
m <- dnorm((x-mu1)/sig1h)*w1/sig1h + dnorm((x-mu2)/sig2h)*w2/sig2h

b <- f + f2*(h^2)/2
leg1 <- expression(f(x))
leg2 <- expression(paste(E,"[",widehat(f)(x),"]"))
leg3 <- expression(f(x)+h^"2"*f^"(2)"*(x)/2)

plot(x,f,type="l",lty=1,xaxs="i",yaxs="i",ylab="",xlab="",xlim=c(0,10),ylim=c(0,.32),xaxt="n", yaxt="n",bty="n")
lines(x,m,type="l",lty=5,ylab="")
lines(x,b,type="l",lty=2,ylab="")
axis(side=1,seq(0,10,2),cex.axis=.75)
legend("topright",inset=0,legend=c(leg1,leg2,leg3),lty=c(1,5,2),cex=.75,bty="n") 

Figure 17.5. Choice of Bandwidth#

dat <- read.table("cps09mar.txt")
wage <- as.matrix(dat[,5]/(dat[,6]*dat[,7]))
asianfemale <- (dat[,11]==4)*(dat[,2]==1)
w <- wage[asianfemale==1]

# Bandwidths
n15 <- length(w)^(1/5)
s1 <- sd(w)
iqr <- IQR(w)
s2 <- iqr/1.34
s <- min(s1,s2)
b1 <- 1.06*s1/n15
b2 <- 0.9*s/n15
b3 <- bw.SJ(w)
cat("n^1/5")
print(n15)
cat("standard deviation")
print(s1)
cat("interquartile range")
print(iqr)
cat("min(sd,IQR/1.34)")
print(s)
cat("Gaussian Rule bandwidth")
print(b1)
cat("Rule of Thumb bandwidth")
print(b2)
cat("SJ bandwidth")
print(b3)
den1 <- density(w,bw=b1)		
den2 <- density(w,bw=b2)		
den3 <- density(w,bw=b3)		

plot(den1$x, den1$y, type="l",lty=1,xlab="",ylab="",xaxt="n", yaxt="n",title=NULL,xaxs="i",yaxs="i",xlim=c(0,60),ylim=c(0,.04),bty="n")
lines(den2,lty=5)
lines(den3,lty=2)
axis(side=1,seq(0,60,10),cex.axis=.75)
legend("topright",c("Gaussian Optimal", "Rule of Thumb", "Sheather-Jones"), lty=c(1,5,2),cex=.75,bty="n")	 
n^1/5
[1] 4.09321
standard deviation
[1] 20.61713
interquartile range
[1] 18.77778
min(sd,IQR/1.34)
[1] 14.01327
Gaussian Rule bandwidth
[1] 5.339125
Rule of Thumb bandwidth
[1] 3.081186
SJ bandwidth
[1] 2.142972
_images/cc3be9cc0f2210d1ddb43790c46f5792cb516e2cab14adf45dffef39958aa203.png

Figure 17.6. Choice of Kernel#

dat <- read.table("cps09mar.txt")
wage <- as.matrix(dat[,5]/(dat[,6]*dat[,7]))
asianfemale <- (dat[,11]==4)*(dat[,2]==1)
w <- wage[asianfemale==1]

# Bandwidths
b <- bw.SJ(w)
den1 <- density(w,kernel="rectangular",bw=b)		
den2 <- density(w,kernel="gaussian",bw=b)		
den3 <- density(w,kernel="epanechnikov",bw=b)		

plot(den1$x, den1$y, type="l",lty=1,xlab="",ylab="",xaxt="n", yaxt="n",title=NULL,xaxs="i",yaxs="i",xlim=c(0,60),ylim=c(0,.04),bty="n")
lines(den2,lty=5)
lines(den3,lty=2)
axis(side=1,seq(0,60,10),cex.axis=.75)
legend("topright",c("Rectangular", "Gaussian", "Epanechnikov"),lty=c(1,5,2),cex=.75,bty="n")