Chapter 12. Numerical Optimization#
Figure 12.1. Root Finding#
fx <- function(x) {
y <- exp(x)/sqrt(x) - exp(2)/sqrt(2)
return(y)
}
dx <- function(x) {
d <- exp(x)/sqrt(x) - exp(x)/(2*(x^1.5))
return(d)
}
x <- seq(1,4,by=0.05)
f <- fx(x)
x0 <- x[which.min(abs(f))]
### Newton Search
R <- 5
xi <- matrix(0,R,1)
fi <- matrix(0,R,1)
wd <- 1.4
plot(x,f,type="l",lty=1,xaxs="i",yaxs="i",ylab="",xlab="",bty="n",yaxt="n",xaxt="n",ylim=c(-3,23),xlim=c(1,4.04),lwd=wd)
abline(h=0,lty=3,lwd=wd)
points(x0,0,pch=19,col="black",cex=.8)
points(x0,0,pch=1,col="black",cex=1.5)
axis(side=1,seq(0,5,.5),lwd=wd)
axis(side=2,seq(-5,25,5),lwd=wd)
xi[1] <- xx <- 4
fi[1] <- ff <- fx(xx)
points(xx,ff,pch=19,col="black")
text(xx-.1,ff+.5,"1")
for (i in 2:R){
d <- dx(xx)
xi[i] <- xx <- xx - ff/d
fi[i] <- ff <- fx(xx)
x1 <- xi[i-1] - (fi[i-1]-ff)/d
if (i<5) points(xx,ff,pch=19,col="black",cex=.8)
if (i<5) text(xx,ff+1.5,i)
if (i==5) text(xx-.05,ff+1.5,i)
arrows(xi[i-1],fi[i-1],x1,ff,angle=20,length=.1)
}
cat("Newton Search \n")
print(cbind(xi,fi))
### Bisection Search
R <- 10
plot(x,f,type="l",lty=1,xaxs="i",yaxs="i",ylab="",xlab="",bty="n",yaxt="n",xaxt="n",ylim=c(-3,23),xlim=c(.96,4.04),lwd=wd)
axis(side=1,seq(0,5,.5),lwd=wd)
axis(side=2,seq(-5,25,5),lwd=wd)
abline(h=0,lty=3)
a <- min(x)
b <- max(x)
fa <- fx(a)
fb <- fx(b)
points(a,fa,pch=19,col="black",cex=.8)
text(a+.05,fa+1,"a")
points(b,fb,pch=19,col="black",cex=.8)
text(b-.1,fb,"b")
c <- (a+b)/2
fc <- fx(c)
lines(c(a,b),c(fa,fb),lty=2,lwd=wd)
ff <- (fa+fb)/2
points(c,ff,pch=19,col="black",cex=.8)
arrows(c,ff,c,fc,angle=20,length=.1)
text(c,ff+1,"c")
lines(c(a,c),c(fa,fc),lty=2,lwd=wd)
d <- (a+c)/2
fd <- fx(d)
points(d,(fa+fc)/2,pch=19,col="black",cex=.8)
lines(c(d,d),c((fa+fc)/2,fd),lty=2,lwd=wd)
lines(c(d,c),c(fd,fc),lty=2,lwd=wd)
text(c,fc-1.5,"f(c)")
text(d,(fa+fc)/2+1,expression(d))
text(d,fd-1,expression(f(d)))
points(x0,0,pch=19,col="black",cex=.8)
points(x0,0,pch=1,col="black",cex=1.5)
cat("Bisection Search \n")
print(cbind(a,b))
for (i in 3:R){
c <- (a+b)/2
fc <- fx(c)
if (fc*fa > 0) {
a <- c
fa <- fc
} else {
b <- c
fb <- fc
}
if (i<7) points(c,fc,pch=19,col="black",cex=.8)
if (i==5) text(c,fc-1.5,expression(e))
print(cbind(a,b))
}
Figure 12.2. Minimization in One Dimension#
fx <- function(x) {
y <- -log(x)+x/2
return(y)
}
dx <- function(x) {
d <- -1/x+1/2
return(d)
}
hx <- function(x) {
d <- 1/x^2
return(d)
}
x <- seq(.1,4,by=0.05)
f <- fx(x)
x0 <- x[which.min(abs(f))]
f0 = fx(x0)
## Newton Search
wd <- 1.4
plot(x,f,type="l",lty=1,xaxs="i",yaxs="i",ylab="",xlab="",ylim=c(0,2.5),xlim=c(0,4),xaxt="n",bty="n",lwd=wd)
axis(side=1,seq(0,4,1),lwd=wd)
axis(side=2,lwd=wd)
points(x0,f0,pch=1,col="black",cex=1.4)
R <- 8
xi <- matrix(0,R,1)
fi <- matrix(0,R,1)
xi[1] <- xx <- .1
fi[1] <- ff <- fx(xx)
points(xx,ff,pch=19,col="black",cex=.8)
text(xx+.15,ff,1)
points(x0,f0,pch=19,col="black",cex=.8)
for (i in 2:R){
xi[i] <- xx <- xx - dx(xx)/hx(xx)
fi[i] <- ff <- fx(xx)
points(xx,ff,pch=19,col="black",cex=.8)
if (i<6) text(xx+.3/i,ff+.02*i,i)
if (i==6) text(xx,ff+.125,i)
if (i==7) text(xx-.01,ff+.125,i)
if (i==8) text(xx+.05,ff+.125,i)
arrows(xi[i-1],fi[i-1],xx,ff,angle=20,length=.15,lwd=wd)
}
cat("Newton Search \n")
print(cbind(xi,fi))
## Golden Section Search
plot(x,f,type="l",lty=1,xaxs="i",yaxs="i",ylab="",xlab="",ylim=c(0,2.5),xlim=c(0,4.1),xaxt="n",bty="n",lwd=wd)
axis(side=1,seq(0,4,1),lwd=wd)
axis(side=2,lwd=wd)
points(x0,f0,pch=1,col="black",cex=1.4)
points(x0,f0,pch=19,col="black",cex=.8)
R <- 16
gi <- matrix(0,R,1)
yi <- matrix(0,R,1)
phi <- (1+sqrt(5))/2
yi[1] <- a <- 0.1
yi[2] <- b <- 4
yi[3] <- c <- b-(b-a)/phi
yi[4] <- d <- a+(b-a)/phi
gi[1] <- fa <- fx(a)
gi[2] <- fb <- fx(b)
gi[3] <- fc <- fx(c)
gi[4] <- fd <- fx(d)
hc <- fb-(fb-fa)/phi
hd <- fa+(fb-fa)/phi
lines(c(a,b),c(fa,fb),lty=2,lwd=wd)
points(a,fa,pch=19,col="black",cex=.8)
points(b,fb,pch=19,col="black",cex=.8)
points(c,hc,pch=19,col="black",cex=.8)
points(d,hd,pch=19,col="black",cex=.8)
text(.2,2.45,"a")
text(3.95,fb+.15,"b")
text(c,hc+.15,"c")
text(d,hd+.15,"d")
points(c,fc,pch=19,col="black",cex=.8)
points(d,fd,pch=19,col="black",cex=.8)
text(c,fc-.15,expression(f(c)),cex=.8)
text(d,fd-.15,expression(f(d)),cex=.8)
arrows(c,hc,c,fc,angle=20,length=.1,lwd=wd)
arrows(d,hd,d,fd,angle=20,length=.1,lwd=wd)
print(cbind(4,a,c,d,b))
e <- d-(d-a)/phi
fe <- fx(e)
he <- fd-(fd-fa)/phi
lines(c(a,d),c(fa,fd),lty=2,lwd=wd)
points(e,fe,pch=19,col="black",cex=.8)
points(e,he,pch=19,col="black",cex=.8)
text(e,he+.15,"e")
text(e,fe-.15,expression(f(e)))
arrows(e,he,e,fe,angle=20,length=.1,lwd=wd)
lines(c(e,d),c(fe,fd),lty=2,lwd=wd)
fnew <- e+(d-e)/phi
ff <- fx(fnew)
hf <- fe+(fd-fe)/phi
points(fnew,ff,pch=19,col="black",cex=.8)
points(fnew,hf,pch=19,col="black",cex=.8)
text(fnew,hf+.1,"f")
text(x0+.02,f0-.15,expression(x[0]))
lines(c(fnew,fnew),c(hf,ff),lwd=wd)
for (i in 5:R){
if (fc<fd) {
b <- d
d <- c
fd <- fc
yi[i] <- c <- b-(b-a)/phi
gi[i] <- fc <- fx(c)
} else {
a <- c
c <- d
fc <- fd
yi[i] <- d <- a+(b-a)/phi
gi[i] <- fd <- fx(d)
}
print(cbind(i,a,c,d,b))
}
a c d b
[1,] 4 0.1 1.589667 2.510333 4
i a c d b
[1,] 5 0.1 1.020665 1.589667 2.510333
i a c d b
[1,] 6 1.020665 1.589667 1.94133 2.510333
i a c d b
[1,] 7 1.589667 1.94133 2.15867 2.510333
i a c d b
[1,] 8 1.589667 1.807007 1.94133 2.15867
i a c d b
[1,] 9 1.807007 1.94133 2.024347 2.15867
i a c d b
[1,] 10 1.94133 2.024347 2.075653 2.15867
i a c d b
[1,] 11 1.94133 1.992637 2.024347 2.075653
i a c d b
[1,] 12 1.94133 1.97304 1.992637 2.024347
i a c d b
[1,] 13 1.97304 1.992637 2.004749 2.024347
i a c d b
[1,] 14 1.992637 2.004749 2.012235 2.024347
i a c d b
[1,] 15 1.992637 2.000123 2.004749 2.012235
i a c d b
[1,] 16 1.992637 1.997263 2.000123 2.004749

Figure 12.3. Multiple local minima#
x <- seq(-1.5,1.25,by=0.01)
f <- - dnorm(x+1/2) - dnorm(6*x-6)
i0 <- which.min(f)
x0 <- x[i0]
f0 <- f[i0]
i1 <- which.min(f[1:200])
x1 <- x[i1]
f1 <- f[i1]
plot(x,f,type="l",lty=1,xaxs="i",yaxs="i",ylim=c(-.57,-.2),xlim=c(-1.5,1.25),ylab="",xlab="",yaxt="n",xaxt="n",bty="n")
axis(side=1,seq(-2,1.5,.5),cex.axis=.75)
points(x0,f0,pch=19,col="black",cex=1)
points(x1,f1,pch=18,col="black",cex=1.5)
text(x0,f0-.025,expression(x[0]),cex=.75)
text(x1,f1-.025,expression(x[1]),cex=.75)
Figures 12.4. Grid Search and Steepest Descent#
suppressPackageStartupMessages(library(spatstat))
dat <- read.table("cps09mar.txt")
experience <- dat[,1]-dat[,4]-6
hisp <- (dat[,3]==1)&(dat[,2]==1) &(experience<=15)&(experience>=10)
dat1 <- dat[hisp,]
wage <- as.matrix(dat1[,5]/(dat1[,6]*dat1[,7]))
n <- length(wage)
lwage <- log(wage)
mwage <- mean(lwage)
y_dat <- lwage - mwage
# moment estimator
mu4 <- mean(y_dat^4)
v <- var(y_dat)
v4 <- v^2
thetat <- mwage
v2t <- mu4*v/(2*mu4-3*v4)
r <- (4*mu4-6*v4)/(mu4-3*v4)
beta <- c(r,v2t)
like <- function(beta) {
r <- beta[1]
v <- beta[2]
f <- (n/2)*log(pi*v*r) - n*lgamma((1+r)/2) + n*lgamma(r/2) + ((1+r)/2)*sum(log(1+(y_dat^2)/(v*r)))
return(f)
}
grad <- function(beta) {
r <- beta[1]
v <- beta[2]
y2 <- y_dat^2
gv <- n/(2*v) - ((r+1)/(2*r*(v^2)))*sum(y2/(1+y2/(r*v)))
gr <- (n/2)*(1/r - digamma((r+1)/2) + digamma(r/2)) + (1/2)*sum(log(1+y2/(r*v))) - ((r+1)/(2*v*(r^2)))*sum(y2/(1+y2/(r*v)))
g <- rbind(gr,gv)
return(g)
}
hess <- function(beta) {
r <- beta[1]
v <- beta[2]
y2 <- y_dat^2
H <- matrix(0,2,2)
X2 <- sum(y2/(1+y2/(r*v)))
X4 <- sum((y2/(1+y2/(r*v)))^2)
H[1,1] <- - X4*(1+1/r)/(r*(r*v)^2)/2 + X2/v/(r^3) - n/2/(r^2) - n*trigamma((r+1)/2)/4 + n*trigamma(r/2)/4
H[2,1] <- H[1,2] <- X2/2/((r*v)^2) - X4*(1/(r^2)+1/(r^3))/2/(v^3)
H[2,2] <- -n/(2*v*v) + X2*(1+1/r)/(v^3) - X4*(1/r+1/(r^2))/(v^4)/2
return(H)
}
newt <- function(beta) return(beta-solve(hess(beta),grad(beta)))
vs <- seq(.1,.5,.001)
nv <- length(vs)
rs <- seq(2.2,20,.2)
nr <- length(rs)
ff <- matrix(0,nr,nv)
for (i in 1:nv){
for (j in 1:nr){
bj <- c(rs[j],vs[i])
ff[j,i] <- like(bj)
}}
r0 <- rs[which.min(apply(ff,1,min))]
v0 <- vs[which.min(apply(t(ff),1,min))]
x0 <- c(r0,v0)
x0 <- newt(newt(newt(x0)))
wd <- 1.4
# Grid Search
contour(rs,vs,ff,nlevels=45, drawlabels=FALSE,xlab="r",ylab=expression(nu),xaxs="i",yaxs="i",xaxt="n",yaxt="n",cex.lab=1,lwd=wd)
axis(side=1,seq(2,20,2),lwd=wd)
axis(side=2,seq(.1,.7,.1),lwd=wd)
points(x0[1],x0[2],pch=19,col="black",cex=1)
for (i in 1:nv){
for (j in 1:nr){
if (((i/10)==round(i/10))*((j/2)==round(j/2))) points(rs[j],vs[i],pch=19,col="black",cex=.15)
}}
f1 <- vs[apply(ff,1,which.min)]
# Steepest Descent
contour(rs,vs,ff,nlevels=45, drawlabels=FALSE,xlab="r",ylab=expression(nu),xaxs="i",yaxs="i",xaxt="n",yaxt="n",cex.lab=1,lwd=wd)
axis(side=1,seq(2,20,2),lwd=wd)
axis(side=2,seq(.1,.7,.1),lwd=wd)
points(x0[1],x0[2],pch=19,col="black",cex=1)
x1 <- c(4,.25)
x2 <- x1 - grad(x1)/2500
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(6,.18)
x2 <- x1 - grad(x1)/5000
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(8,.35)
x2 <- x1 - grad(x1)/1650
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(10,.15)
x2 <- x1 - grad(x1)/7500
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(12,.3)
x2 <- x1 - grad(x1)/2200
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(14,.2)
x2 <- x1 - grad(x1)/5000
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(16,.45)
x2 <- x1 - grad(x1)/1100
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(18,.15)
x2 <- x1 - grad(x1)/8500
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(7,.45)
x2 <- x1 - grad(x1)/1000
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
x1 <- c(3,.4)
x2 <- x1 - grad(x1)/1100
d <- grad(x1)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lty=1,lwd=wd)
Figure 12.5. Newton Search and Trust RegionMethod#
# Newton
contour(rs,vs,ff,nlevels=45, drawlabels=FALSE,xlab="r",ylab=expression(nu),xaxs="i",yaxs="i",xaxt="n",yaxt="n",cex.lab=1.0,lwd=wd)
axis(side=1,seq(2,20,2),lwd=wd)
axis(side=2,seq(.1,.7,.1),lwd=wd)
points(x0[1],x0[2],pch=19,col="black",cex=1.0)
x1 <- c(4,.25)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(6,.18)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(8,.35)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(10,.15)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(12,.3)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(14,.2)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(16,.45)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=1)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(18,.15)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(7,.45)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
x1 <- c(3,.4)
H <- hess(x1)
g <- grad(x1)
d <- solve(H,g)
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
ev <- eigen(H)
L <- ev$values
if (min(L) < 0){
V <- ev$vectors
d <- V%*%diag(1/abs(L))%*%t(V)%*%g
d <- d/sqrt((d[1]/17)^2+(d[2]/.4)^2)
x2 <- x1 - d/10
points(x1[1],x1[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=2*wd)
}
# Trust Region
contour(rs,vs,ff,nlevels=45, drawlabels=FALSE,xlab="r",ylab=expression(nu),xaxs="i",yaxs="i",xaxt="n",yaxt="n",cex.lab=1.0,lwd=wd)
axis(side=1,seq(2,20,2),lwd=wd)
axis(side=2,seq(.1,.7,.1),lwd=wd)
points(x0[1],x0[2],pch=19,col="black",cex=1)
da <- 1
db <- .025
D <- diag(c(1/da^2,1/db^2))
L <- 100
delta <- 1
x1 <- c(18,.15)
points(x1[1],x1[2],pch=19,col="black",cex=.75)
g <- grad(x1)
H <- hess(x1)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x1,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
rootf <- function(x) t(g)%*%solve(H+D*x)%*%D%*%solve(H+D*x)%*%g-delta
lambda <- uniroot(rootf,c(0,L))
x2 <- x1 - solve(H + D*lambda$root,g)
points(x2[1],x2[2],pch=19,col="black",cex=.75)
arrows(x1[1],x1[2],x2[1],x2[2],angle=20,length=.1,lwd=wd)
dx <- x2-x1
rho <- -(like(x1)-like(x2))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x2)
H <- hess(x2)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x2,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
lambda <- uniroot(rootf,c(0,L))
x3 <- x2 - solve(H + D*lambda$root,g)
points(x3[1],x3[2],pch=19,col="black",cex=.75)
arrows(x2[1],x2[2],x3[1],x3[2],angle=20,length=.1,lwd=wd)
dx <- x3-x2
rho <- -(like(x2)-like(x3))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x3)
H <- hess(x3)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x3,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
lambda <- uniroot(rootf,c(0,L))
x4 <- x3 - solve(H + D*lambda$root,g)
points(x4[1],x4[2],pch=19,col="black",cex=.75)
arrows(x3[1],x3[2],x4[1],x4[2],angle=20,length=.1,lwd=wd)
dx <- x4-x3
rho <- -(like(x3)-like(x4))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x4)
H <- hess(x4)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x4,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
lambda <- uniroot(rootf,c(0,L))
x5 <- x4 - solve(H + D*lambda$root,g)
points(x5[1],x5[2],pch=19,col="black",cex=.75)
arrows(x4[1],x4[2],x5[1],x5[2],angle=20,length=.1,lwd=wd)
dx <- x5-x4
rho <- -(like(x4)-like(x5))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x5)
H <- hess(x5)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x5,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
lambda <- uniroot(rootf,c(0,L))
x6 <- x5 - solve(H + D*lambda$root,g)
points(x6[1],x6[2],pch=19,col="black",cex=.75)
arrows(x5[1],x5[2],x6[1],x6[2],angle=20,length=.1,lwd=wd)
dx <- x6-x5
rho <- -(like(x5)-like(x6))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x6)
H <- hess(x6)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x6,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
x7 <- x6 - solve(H,g)/8
points(x7[1],x7[2],pch=19,col="black",cex=.75)
arrows(x6[1],x6[2],x7[1],x7[2],angle=20,length=.1,lwd=wd)
dx <- x7-x6
rho <- -(like(x6)-like(x7))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x7)
H <- hess(x7)
W1 <- ellipse(a=da*sqrt(delta),b=db*sqrt(delta),centre=x7,phi=0,npoly=1024)
plot(W1,add=TRUE,lwd=wd)
x8 <- x7 - solve(H,g)
arrows(x7[1],x7[2],x8[1],x8[2],angle=20,length=.1,lwd=wd)
dx <- x8-x7
rho <- -(like(x7)-like(x8))/(t(g)%*%dx+t(dx)%*%H%*%dx/2)
if (rho>=0.9) delta <- delta*2
if (rho<0.1) delta <- delta/2
g <- grad(x8)
H <- hess(x8)
x9 <- x8 - solve(H,g)
Figure 12.6. Nelder-Mead Search#
dat <- read.table("cps09mar.txt")
experience <- dat[,1]-dat[,4]-6
hisp <- (dat[,3]==1)&(dat[,2]==1) &(experience<=15)&(experience>=10)
dat1 <- dat[hisp,]
wage <- as.matrix(dat1[,5]/(dat1[,6]*dat1[,7]))
n <- length(wage)
lwage <- log(wage)
mwage <- mean(lwage)
y_dat <- lwage - mwage
like <- function(beta) {
r <- beta[1]
v <- beta[2]
f <- (n/2)*log(pi*v*r) - n*lgamma((1+r)/2) + n*lgamma(r/2) + ((1+r)/2)*sum(log(1+(y_dat^2)/(v*r)))
return(f)
}
vs <- seq(.1,.5,.001)
nv <- length(vs)
rs <- seq(2.2,20,.2)
nr <- length(rs)
ff <- matrix(0,nr,nv)
for (i in 1:nv){
for (j in 1:nr){
bj <- c(rs[j],vs[i])
ff[j,i] <- like(bj)
}}
r0 <- rs[which.min(apply(ff,1,min))]
v0 <- vs[which.min(apply(t(ff),1,min))]
x0 <- c(r0,v0)
wd <- 1.4
# Small Initial Simplex
contour(rs,vs,ff,nlevels=45, drawlabels=FALSE,xlab="r",ylab=expression(nu),xaxs="i",yaxs="i",xaxt="n",yaxt="n",lwd=wd)
axis(side=1,seq(2,20,2),lwd=wd)
axis(side=2,seq(.1,.7,.1),lwd=wd)
points(x0[1],x0[2],pch=19,col="black",cex=1.0)
X <- matrix(0,3,2)
f <- matrix(0,3,1)
X[,1] <- c(17,18.5,20)
X[,2] <- c(.1,.15,.1)
points(X[1,1],X[1,2],pch=19,col="black",cex=.75)
points(X[2,1],X[2,2],pch=19,col="black",cex=.75)
points(X[3,1],X[3,2],pch=19,col="black",cex=.75)
f[1] <- like(X[1,])
f[2] <- like(X[2,])
f[3] <- like(X[3,])
i <- order(f)
f <- f[i]
X <- X[i,]
text(18.5,.134,"1",cex=.8)
polygon(X,lwd=wd)
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
xr <- xr*2 - c
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(15,.155,"2",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
xr <- xr*2 - c
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(14.5,.212,"3",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(12.8,.23,"4",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(10,.29,"5",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"6",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),.27,"7",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(11.0,mean(X[,2]),"8",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"9",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"10",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"11",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(6.9,mean(X[,2]),"12",cex=.6)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"13",cex=.6)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(5.1,.205,"14",cex=.6)
polygon(X,lwd=wd)
# Large Initial Simplex
contour(rs,vs,ff,nlevels=45, drawlabels=FALSE,xlab="r",ylab=expression(nu),xaxs="i",yaxs="i",xaxt="n",yaxt="n",cex.lab=1,lwd=wd)
axis(side=1,seq(2,20,2),lwd=wd)
axis(side=2,seq(.1,.7,.1),lwd=wd)
points(x0[1],x0[2],pch=19,col="black",cex=1)
X <- matrix(0,3,2)
f <- matrix(0,3,1)
X[,1] <- c(3.2,20,3.2)
X[,2] <- c(.1,.1,.5)
points(X[1,1],X[1,2],pch=19,col="black",cex=.75)
points(X[2,1],X[2,2],pch=19,col="black",cex=.75)
points(X[3,1],X[3,2],pch=19,col="black",cex=.75)
f[1] <- like(X[1,])
f[2] <- like(X[2,])
f[3] <- like(X[3,])
i <- order(f)
f <- f[i]
X <- X[i,]
text(14,.15,"1",cex=1)
polygon(X,lwd=wd)
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(5,.35,"2",cex=1)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(5,.15,"3",cex=1)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(6,.27,"4",cex=1)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(9.7,mean(X[,2]),"5",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(8.3,mean(X[,2]),"6",cex=.8)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(6.4,.2,"7",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),.24,"8",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- (c + X[3,])/2
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"9",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"10",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"11",cex=.7)
polygon(X,lwd=wd)
i <- order(f)
f <- f[i]
X <- X[i,]
c <- colMeans(X[1:2,])
xr <- c*2 - X[3,]
X[3,] <- xr
f[3] <- like(xr)
points(xr[1],xr[2],pch=19,col="black",cex=.75)
text(mean(X[,1]),mean(X[,2]),"12",cex=.6)
polygon(X,lwd=wd)