library(splines);
library(ggplot2)
help(bs)
help(ns)
x = (1:199)/100;
n = length(x)
m = 5;
myknots = 2*(1:m)/(m+1)
myknots
## [1] 0.3333 0.6667 1.0000 1.3333 1.6667
X = cbind(1, x, x^2, x^3);
for(i in 1:m){
tmp = (x-myknots[i])^3;
tmp[tmp<0] = 0;
X = cbind(X, tmp);
}
plot(c(0,2), range(X), type="n", xlab="", ylab="")
title("Truncated Power Basis")
for(i in 1:(m+4)){
tmp = X[,i];
if (i<=4) mylty=1 else mylty=2;
lines(x[tmp!=0], tmp[tmp!=0], col=i, lty=mylty, lwd=2)
}
for(i in 1:m){
points(myknots[i], 0, pty="m", pch=19, cex=2)
}
F = bs(x,knots = myknots, intercept = TRUE)
dim(F)
## [1] 199 9
mydf = m+4;
tmpdata = data.frame(t = rep(1:n, mydf),
basisfunc=as.vector(F),
type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
geom_path()
If we do not set intercept = TRUE
, then bs
will return 9-1 = 8 columns.
F = bs(x, knots = myknots)
dim(F)
mydf = m+3;
tmpdata = data.frame(t = rep(1:n, mydf),
basisfunc=as.vector(F),
type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
geom_path()
F = ns(x, knots=myknots, Boundary.knots=c(0,2), intercept=TRUE)
dim(F)
## [1] 199 7
mydf = 7
tmpdata = data.frame(t = rep(1:n, mydf),
basisfunc=as.vector(F),
type=as.factor(rep(1:mydf, each=n)))
ggplot(tmpdata, aes(x=t, y=basisfunc, color=type)) +
geom_path()
This dataset lists the number of live births per 10,000 23-year-old women in the United States between 1917 and 2003.
source("birthrates.txt");
birthrates = as.data.frame(birthrates)
names(birthrates) = c("year", "rate")
ggplot(birthrates, aes(x=year, y=rate)) +
geom_point() + geom_smooth(method="lm", se=FALSE)
fit1 = lm(rate~bs(year, knots=quantile(year, c(1/3, 2/3))),
data=birthrates);
fit2 = lm(rate~bs(year, df=5), data=birthrates);
fit3 = lm(rate~bs(year, df=6, intercept=TRUE), data=birthrates)
fit4 = lm(rate~bs(year, df=5, intercept=TRUE), data=birthrates)
plot(birthrates$year, birthrates$rate, ylim=c(90,280))
lines(spline(birthrates$year, predict(fit1)), col="red", lty=1)
lines(spline(birthrates$year, predict(fit2)), col="blue", lty=2)
lines(spline(birthrates$year, predict(fit3)), col="green", lty=3)
lines(spline(birthrates$year, predict(fit4)), lty=2, lwd=2)
# Alternatively, you can predict the spline fit on a fine grid, and then connect them
# plot(birthrates$year, birthrates$rate, ylim=c(90,280))
# year.grid = seq(from=min(birthrates$year), to=max(birthrates$year), length=200)
# ypred = predict(fit1, data.frame(year=year.grid))
# lines(year.grid, ypred, col="blue", lwd=2)
fit1=lm(rate~ns(year, knots=quantile(year, (1:4)/5)), data=birthrates);
fit2=lm(rate~ns(year, df=5), data=birthrates);
fit3=lm(rate~ns(year, df=6, intercept=TRUE), data=birthrates)
plot(birthrates$year, birthrates$rate, ylim=c(90,280))
lines(spline(birthrates$year, predict(fit1)), col="red", lty=1)
lines(spline(birthrates$year, predict(fit2)), col="blue", lty=2)
lines(spline(birthrates$year, predict(fit3)), col="green", lty=3)
plot(birthrates$year, birthrates$rate, ylim=c(90,280));
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=7), data=birthrates))), col="blue");
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=14), data=birthrates))), col="red");
lines(spline(birthrates$year, predict(lm(rate~bs(year, df=19), data=birthrates))), col="black");
legend("topright", lty=rep(1,3), col=c("blue", "red", "black"), legend=c("df=8", "df=15", "df=20"))
new = data.frame(year=1905:2015);
fit1=lm(rate~bs(year, df=7), data=birthrates);
pred1=predict(fit1, new);
## Warning in bs(year, degree = 3L, knots = structure(c(1934.2, 1951.4,
## 1968.6, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
fit2=lm(rate~ns(year, df=7), data=birthrates);
pred2=predict(fit2, new);
plot(birthrates$year,birthrates$rate, xlim=c(1905,2015),
ylim=c(min(pred1,pred2), max(pred1,pred2)),
ylab="Birth Rate", xlab="Year")
lines(new$year, pred1, col="red")
lines(new$year, pred2, col="blue")
legend("bottomleft", lty=rep(1,2), col=c("red", "blue" ), legend=c("CS with df=7", "NCS with df=7"))
The location of knots will affect the performance of a spline model. But selecting the location of knots is computationally too expensive. Instead, we place knots equally at quantiles of x, and then select just the number of knots, or equivalently, the df. Can we use F-test to select the number of knots?
For each df, we use 10-fold CV to calculate the CV error. When doing 10-fold CV, each time, based on 90% of the data, we place the (df-4) knots at the corresponding quantiles and then fit a regression spline,
First, we need to divide the data into K
folds.
K=10
n = nrow(birthrates)
fold.size = c(rep(9, 7), rep(8, 3))
fold.id = rep(1:K, fold.size)
fold.id
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3
## [24] 3 3 3 3 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 6
## [47] 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8
## [70] 8 8 9 9 9 9 9 9 9 9 10 10 10 10 10 10 10 10
fold.id = fold.id[sample(1:n, n)]
fold.id
## [1] 2 3 6 1 9 6 7 4 1 5 2 5 3 5 1 3 5 9 8 2 1 7 8
## [24] 10 10 7 9 5 1 10 8 4 10 10 2 3 8 1 9 8 6 3 8 2 6 1
## [47] 4 2 5 2 9 3 6 10 1 2 2 4 5 4 3 3 5 9 6 7 5 4 4
## [70] 4 7 9 8 6 7 7 6 8 9 6 7 4 3 10 10 7 1
mydf = 10:30
mycv = rep(0, length(mydf))
for(i in 1:length(mydf)){
m = mydf[i]-4;
for(k in 1:K){
id = which(fold.id == k);
myknots = quantile(birthrates$year[-id], (1:m)/(m+1))
myfit = lm(rate ~ bs(year, knots=myknots),
data=birthrates[-id,])
ypred = predict(myfit, newdata=birthrates[id,])
mycv[i]=mycv[i] + sum((birthrates$rate[id] - ypred)^2)
}
}
plot(mydf, mycv)
Re-run the 10-fold CV. The plot of mydf
versus mycv
may vary, but shouldn’t be too different.
fold.id = rep(1:K, fold.size)
fold.id = fold.id[sample(1:n, n)]
mydf = 10:30
mycv = rep(0, length(mydf))
for(i in 1:length(mydf)){
m = mydf[i]-4;
for(k in 1:K){
id = which(fold.id == k);
myknots = quantile(birthrates$year[-id], (1:m)/(m+1))
myfit = lm(rate ~ bs(year, knots=myknots),
data=birthrates[-id,])
ypred = predict(myfit, newdata=birthrates[id,])
mycv[i]=mycv[i] + sum((birthrates$rate[id] - ypred)^2)
}
}
plot(mydf, mycv)