#install.packages("ElemStatLearn")
library(ElemStatLearn)
library(splines)
data(phoneme)
mydata = phoneme[phoneme$g %in% c("aa", "ao"), ]
# mydata[1, ]
mydata[1, c(1:5, 250:258)]
## x.1 x.2 x.3 x.4 x.5 x.250 x.251 x.252
## 5 12.96705 13.69454 14.91182 18.22292 18.4539 7.58624 6.65202 7.69109
## x.253 x.254 x.255 x.256 g speaker
## 5 6.93683 7.036 7.01278 8.52197 aa train.dr1.mcpm0.sa1
X = data.matrix(mydata[, -c(257, 258)])
Y = mydata[, 257]
table(Y)
## Y
## aa ao dcl iy sh
## 695 1022 0 0 0
Y = as.numeric(Y)-1
table(Y) # 1: ao
## Y
## 0 1
## 695 1022
logfit1 = glm(Y ~ X, family="binomial")
coef1 = logfit1$coef[-1]
H = ns(1:256, df = 12)
B = X %*% H
logfit2 = glm(Y ~ B, family="binomial")
coef2 = H %*% logfit2$coef[-1]
plot(1:256, coef1, type="n",
xlab="Frequency", ylab="Logistic Regression Coefficients")
lines(1:256, coef1, col="gray")
lines(1:256, coef2, col="red")
T = 5
n = dim(X)[1]
Err1 = matrix(0, T, 2) # col 1: training; col 2: test
Err2 = matrix(0, T, 2) # col 1: training; col 2: test
set.seed(123)
for(t in 1:T){
testid = sample(1:n, round(n*0.2))
logfit1 = glm(Y[-testid] ~ X[-testid, ], family="binomial")
yhat1 = as.numeric(logfit1$fitted.values>0.5)
Err1[t, 1] = mean(yhat1 != Y[-testid])
yhat1 = X[testid, ] %*% logfit1$coef[-1] + logfit1$coef[1]
yhat1 = as.numeric(yhat1>0.5)
Err1[t, 2] = mean(yhat1 != Y[testid])
logfit2 = glm(Y[-testid] ~ B[-testid, ], family="binomial")
yhat2 = as.numeric(logfit2$fitted.values>0.5)
Err2[t, 1] = mean(yhat2 != Y[-testid])
yhat2 = B[testid, ] %*% logfit2$coef[-1] + logfit2$coef[1]
yhat2 = as.numeric(yhat2>0.5)
Err2[t, 2] = mean(yhat2 != Y[testid])
}
round(cbind(Err1, Err2), dig=3)
## [,1] [,2] [,3] [,4]
## [1,] 0.114 0.207 0.179 0.181
## [2,] 0.111 0.239 0.183 0.166
## [3,] 0.101 0.245 0.176 0.204
## [4,] 0.100 0.274 0.170 0.213
## [5,] 0.108 0.227 0.168 0.213