Toydata I: AdaBoost

Load the following two functions I wrote for AdaBoost. myboosting carries out one boosting step with a randomly chosen stump; myplot displays the point weights together with the data labels, the current stump's predictions, and the overall committee's predictions.

myboosting=function(G, w, x, y){
# G: previous committee (weighted sum of the stumps added so far)
# w: previous weights on the n training points
# x: covariates, assumed sorted in increasing order
# y: response, coded as -1/+1
# g: current stump, predicting +1 if x is below the split and -1 otherwise

 n = length(x); pos=sample(1:(n-1),1);            # pick a random split position
 g=c(rep(1, pos),rep(-1, n-pos));                 # stump predictions at the training points
 err = sum((1-y*g)*w)/2;                          # weighted 0/1 error of g
 alpha=(1/2)*log((1-err)/(err));                  # AdaBoost step size; negative when err > 1/2
 G=G+alpha*g                                      # update the committee
 w1 = w*exp(-alpha*y*g); w1 = w1/sum(w1);         # reweight: upweight the misclassified points
 list(G=G, w=w, w1=w1, g=sign(alpha)*g,           # report the stump flipped so its error is <= 1/2
      err=min(err,1-err),
      split=(x[pos]+x[pos+1])/2, a=alpha)
}


myplot=function(myout, x, y){
 n=length(x); m = sum(y>0);
 # interleave the old weights (w) and updated weights (w1) so they are plotted side by side
 tmp1=as.vector(t(matrix(c(myout$w,myout$w1), ncol=2)))
 tmp2=as.vector(t(matrix(c(x-0.04,x), ncol=2)))
 tmp3=as.vector(t(matrix(c(rep(5, n), rep(6, n)), ncol=2)));
 mymax=max(myout$w1, myout$w);

 plot(tmp2, tmp1, type="h", lwd=9, col=tmp3, xlim=c(0,2.2),
      ylim=c(-mymax/2, mymax+0.05), axes=TRUE,
      ylab="weights", xlab="");

 # row 1: the observed labels
 points(x[y>0], rep(-mymax/6, m), pch=3)
 points(x[y<0], rep(-mymax/6, (n-m)), pch=0)

 # row 2: predictions of the current stump g
 tmp1=sum(myout$g>0);
 points(x[myout$g>0], rep(-mymax/3, tmp1), pch=3)
 points(x[myout$g<0], rep(-mymax/3, (n-tmp1)), pch=0)

 # dashed vertical line at the current split point
 lines(c(myout$split, myout$split), c(-mymax/2, mymax+0.03), lty=2)

 # row 3: predictions of the overall committee G
 tmp1=sum(myout$G>0);
 points(x[myout$G>0], rep(-mymax/2, tmp1), pch=3)
 points(x[myout$G<0], rep(-mymax/2, (n-tmp1)), pch=0)

 text(2.15, -mymax/6, "data")
 text(2.15, -mymax/3, "current")
 text(2.15, -mymax/2, "overall")
}

Generate the toy data

set.seed(100)
x = (1:20)/10
n = length(x); 
y=2*as.numeric(runif(n)>0.5)-1

AdaBoost: t=1

G=rep(0,n); w=rep(1,n)/n;
myout=myboosting(G, w, x, y);
myplot(myout, x,y);

print(paste("error of g_t = ", round(myout$err, dig=3),
            ", error of G_t = ", round(sum(sign(myout$G)!=y)/n, dig=3)))
## [1] "error of g_t =  0.45 , error of G_t =  0.45"

AdaBoost: t=2

myout=myboosting(myout$G, myout$w1, x, y)
myplot(myout, x,y)

print(paste("error of g_t = ", round(myout$err, dig=3),
            ", error of G_t = ", round(sum(sign(myout$G)!=y)/n, dig=3)))
## [1] "error of g_t =  0.434 , error of G_t =  0.4"

AdaBoost: t=3

myout=myboosting(myout$G, myout$w1, x, y)
myplot(myout, x,y)

print(paste("error of g_t = ", round(myout$err, dig=3),
            ", error of G_t = ", round(sum(sign(myout$G)!=y)/n, dig=3)))
## [1] "error of g_t =  0.454 , error of G_t =  0.4"

Plot the 0/1 loss and exponential loss from T=200 boosting steps.

T=200; err=rep(1,T); exploss=rep(0,T);
n = length(x); 
G=rep(0,n); w=rep(1,n)/n; 
myout=myboosting(G, w, x, y);
err[1] = sum(sign(myout$G)!=y)/n; 
exploss[1] = sum(exp(-y*myout$G))/n;
for (i in 2:T) {
    myout=myboosting(myout$G, myout$w1, x, y);
    err[i] = sum(sign(myout$G)!=y)/n;      ## 0/1 training error of the committee
    exploss[i] = sum(exp(-y*myout$G))/n;   ## average exponential loss
}

plot(c(1,T), c(0,max(err, exploss)), type="n", xlab="iterations",
ylab="Training Error");
points(err, col="red");
points(exploss, col="blue");
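
One relationship to read off this plot (a brief check, not in the original code): the exponential loss is an upper bound on the 0/1 training error, because exp(-y*G) >= 1 whenever sign(G) differs from y, so the 0/1 curve never rises above the exponential-loss curve.

## Sketch: the exponential loss upper-bounds the 0/1 training error at every step
all(err <= exploss)   ## should be TRUE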

Toydata II: Early Stopping & Shrinkage

Data Generation

The true conditional probability p(x) = P(Y=1 | X=x) is plotted in black, and the ideal classification (blue for class 0 and red for class 1) is overlaid in the same figure. We generate 100 training samples from this distribution.

d=2;
test.x=1:100/100; 
dx=d*test.x-floor(d*test.x);
ptest.x=2*dx*as.numeric(dx<=0.5)+2*(1-dx)*as.numeric(dx>0.5);
plot(c(0,1),c(0,1), type="n", xlab="x", ylab="P(Y=1 | X=x)");
lines(test.x, ptest.x);
lines(c(0,1),c(0.5, .5),lwd=2, col="red");
lines( c(0, 0.125), rep(0.5,2), lwd=2,col="blue");
lines( c(0.375, 0.625), rep(0.5, 2),lwd=2,col="blue");
lines( c(0.875,1), rep(0.5,2), lwd=2,col="blue");

n=100;
x=runif(n); x=sort(x);
dx=d*x-floor(d*x);
px=2*dx*as.numeric(dx<=0.5)+2*(1-dx)*as.numeric(dx>0.5);
y=ifelse(px-runif(n)>0, 1, 0); 
points(x[y==1], y[y==1], col="red"); ## add on sample points
points(x[y==0], y[y==0], col="blue");
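
For reference (a quick numeric sketch, not part of the original code), the error of the ideal classifier above is the average of min(p(x), 1-p(x)) over x; with X uniform on (0,1) and this triangular p(x), that Bayes error is 0.25, a floor that no amount of boosting can beat on new data.

## Sketch: approximate the Bayes error on the plotting grid
mean(pmin(ptest.x, 1 - ptest.x))   ## roughly 0.25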

Use Package gbm

  • Y should be coded as a numeric 0/1 variable, not a factor.

  • distribution: commonly used loss functions are adaboost and bernoulli for classification, gaussian (squared error) and laplace (absolute loss) for regression, and pairwise for ranking.

  • n.trees: number of trees (default 100)

  • shrinkage: shrinkage factor for the step size (default 0.001)

  • bag.fraction: subsampling rate (default 0.5)

  • cv.folds: number of cross-validation folds; if greater than 1, the CV error is computed and returned (default 0, i.e., no CV error returned)

  • interaction.depth: depth of trees (default 1)

library(gbm)

The current version of "gbm" has a bug in cross-validation when the feature x is one-dimensional. Check the CV output (in PDF) posted on Piazza, which was produced a couple of years ago using an older version of "gbm".

toydata = data.frame(x=x, y=y)
toy.gbm1=gbm(y~x, data=toydata, distribution="adaboost", 
             n.trees=500, shrinkage=1, bag.fraction=0.5)

par(mfrow=c(2,2)); size=c(2,50,100,500);
for(i in 1:4){
    plot(c(0,1),c(0,1),type="n");
    points(x,as.numeric(y), col=8); ## add sample points
    y.pred=predict(toy.gbm1, data.frame(x=test.x), n.trees=size[i])
    lines(test.x, 1/(1+exp(-2*y.pred)))  ## convert the adaboost score F(x) into P(Y=1|X=x)
    lines(x, px, col=8);                 ## true p(x) in gray
    title(paste("# of Iterations = ", size[i]))
    }
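
Rather than eyeballing the four panels, one common way to pick the stopping iteration is gbm.perf; below is a minimal sketch using the out-of-bag estimate, which is available for toy.gbm1 because bag.fraction = 0.5.

## Sketch: estimate a reasonable number of trees from the out-of-bag improvement
best.iter = gbm.perf(toy.gbm1, method="OOB", plot.it=FALSE)
print(best.iter)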

toy.gbm2=gbm(y~x, data=toydata, distribution="adaboost", 
             n.trees=1000, shrinkage=0.05, bag.fraction=1)
par(mfrow=c(2,2)); size=c(100,203,400,800);
for(i in 1:4){
    plot(c(0,1),c(0,1),type="n");
    points(x,as.numeric(y), col=8); ## add sample points
    y.pred=predict(toy.gbm2, data.frame(x=test.x), n.trees=size[i])
    lines(test.x, 1/(1+exp(-2*y.pred)))
    lines(x, px, col=8);
    title(paste("# of Iterations = ", size[i]))
    }
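
To see the early-stopping effect under shrinkage more directly, one option (a sketch, not part of the original analysis; the test-set size and seed below are arbitrary) is to draw an independent test set from the same mechanism and track the test misclassification error as the number of trees grows:

## Sketch: test misclassification error of toy.gbm2 as a function of the number of trees
set.seed(200)                                ## arbitrary seed, for reproducibility only
xt = runif(1000); dxt = d*xt - floor(d*xt)
pxt = 2*dxt*as.numeric(dxt<=0.5) + 2*(1-dxt)*as.numeric(dxt>0.5)
yt = ifelse(pxt - runif(1000) > 0, 1, 0)
ntree.grid = seq(50, 1000, by=50)
test.err = sapply(ntree.grid, function(m){
    Ft = predict(toy.gbm2, data.frame(x=xt), n.trees=m)   ## adaboost score F(x)
    mean(as.numeric(Ft > 0) != yt)                        ## classify by the sign of F
})
plot(ntree.grid, test.err, type="b", xlab="number of trees", ylab="test error")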