Load the spam data

library(klaR)
## Loading required package: MASS
library("ElemStatLearn")
names(spam)[ncol(spam)] = "Y"             # rename the response column to Y
spam$Y = ifelse(spam$Y == "spam", 1, 0)   # spam -> 1, non-spam -> 0
spam$Y = as.factor(spam$Y)                # Y is now a factor with levels "0" and "1"

Try NaiveBayes in package klaR on the Spam data. Use the whole data as training and compute the training error.

NBfit = NaiveBayes(Y ~ ., data = spam)

names(NBfit)   # check the output from NaiveBayes

# Estimated parameters:
# 1) the class prior, one probability per class
NBfit$apriori
# 2) the class-specific mean and sd for each of the 57 features
NBfit$tables

NBpred = predict(NBfit, newdata = spam)
y.class = NBpred$class
y.prob = NBpred$posterior[, 1]   # posterior prob of the first class (Y = 0)
table(spam$Y, y.prob > 0.5)
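
The training error requested above is simply the misclassification rate of y.class on the full data:

mean(y.class != spam$Y)   # training error of NaiveBayes on the spam data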

There are 50 warning messages. By default R stores only the first 50 warnings (controlled by the nwarnings option), so there may well be more than 50. They all appear to carry the same message, "Numerical 0 probability for all classes with observation ...", differing only in the observation index.

warnings()
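
To check how many observations actually trigger the warning, you can raise the limit on stored warnings and repeat the prediction (assuming the default options(warn = 0), so warnings are collected rather than printed immediately):

options(nwarnings = 5000)                  # keep up to 5000 warnings instead of 50
invisible(predict(NBfit, newdata = spam))  # repeat the prediction
length(warnings())                         # number of observations that triggered the warning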

Let’s find out why there are warning messages. You can use getAnywhere to display the source code of predict.NaiveBayes; it is a registered S3 method that klaR does not export, so typing its name alone would not find it.

getAnywhere(predict.NaiveBayes)
## A single object matching 'predict.NaiveBayes' was found
## It was found in the following places
##   registered S3 method for predict from namespace klaR
##   namespace:klaR
## with value
## 
## function (object, newdata, threshold = 0.001, ...) 
## {
##     if (missing(newdata)) 
##         newdata <- object$x
##     if ((!any(is.null(colnames(newdata)), is.null(object$varnames))) && 
##         all(is.element(object$varnames, colnames(newdata)))) 
##         newdata <- data.frame(newdata[, object$varnames])
##     nattribs <- ncol(newdata)
##     isnumeric <- sapply(newdata, is.numeric)
##     newdata <- data.matrix(newdata)
##     Lfoo <- function(i) {
##         tempfoo <- function(v) {
##             nd <- ndata[v]
##             if (is.na(nd)) 
##                 return(rep(1, length(object$apriori)))
##             prob <- if (isnumeric[v]) {
##                 msd <- object$tables[[v]]
##                 if (object$usekernel) 
##                   sapply(msd, FUN = function(y) dkernel(x = nd, 
##                     kernel = y, ...))
##                 else dnorm(nd, msd[, 1], msd[, 2])
##             }
##             else {
##                 object$tables[[v]][, nd]
##             }
##             prob[prob == 0] <- threshold
##             return(prob)
##         }
##         ndata <- newdata[i, ]
##         tempres <- log(sapply(1:nattribs, tempfoo))
##         L <- log(object$apriori) + rowSums(tempres)
##         if (isTRUE(all.equal(sum(exp(L)), 0))) 
##             warning("Numerical 0 probability for all classes with observation ", 
##                 i)
##         L
##     }
##     L <- sapply(1:nrow(newdata), Lfoo)
##     classdach <- factor(object$levels[apply(L, 2, which.max)], 
##         levels = object$levels)
##     posterior <- t(apply(exp(L), 2, function(x) x/sum(x)))
##     colnames(posterior) <- object$levels
##     rownames(posterior) <- names(classdach) <- rownames(newdata)
##     return(list(class = classdach, posterior = posterior))
## }
## <environment: namespace:klaR>

Check where the warning message is issued inside the function: it is printed when, for observation i, R finds sum(exp(L)) to be numerically indistinguishable from 0.

L <- log(object$apriori) + rowSums(tempres)
if (isTRUE(all.equal(sum(exp(L)), 0))) 
    warning("Numerical 0 probability for all classes with observation ", i)
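
To see the numbers concretely, here is a minimal sketch (not part of klaR) that rebuilds L for a single observation from NBfit$apriori and NBfit$tables, following the same steps as the function above:

x1 = unlist(spam[1, -58])            # the 57 features of observation 1
L  = as.numeric(log(NBfit$apriori))  # start from the log prior term, one entry per class
for(v in 1:57){
  msd  = NBfit$tables[[v]]           # one row per class; column 1 = mean, column 2 = sd
  prob = dnorm(x1[v], msd[, 1], msd[, 2])
  prob[prob == 0] = 0.001            # the same threshold used by predict.NaiveBayes
  L = L + log(prob)
}
L                                    # typically far below -20 for both classes
sum(exp(L))                          # essentially 0, which is what triggers the warning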

What’s L? If we have two classes, labeled 1 and 2, then for each test sample L has two elements,

L_k = log P(Y = k) + sum_j log f(x_j | Y = k),   k = 1, 2,

that is, the log prior plus the sum of the feature-wise log densities (exactly log(object$apriori) + rowSums(tempres) in the code). The posterior probability is then P(Y = 1 | x) = exp(L1)/(exp(L1) + exp(L2)).

Recall that P(x|Y) is a product of one-dimensional normal densities; in predict.NaiveBayes each of them is evaluated with the R function dnorm. For most dimensions the density value is less than 1, so its log is negative. When the dimension p is large, L1 or L2, each a sum of many negative terms, can easily fall below -20. Now exp(-20) is about 2e-9, which is not exactly zero, but it is below the default tolerance of all.equal (about 1.5e-8), so all.equal(sum(exp(L)), 0) returns TRUE and the warning is issued.

isTRUE(all.equal(exp(-20), 0))
## [1] TRUE
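
The warning is therefore harmless, but a generic way to avoid the underflow is to normalize on the log scale: subtract max(L) before exponentiating (the log-sum-exp trick). A minimal sketch with a hypothetical helper (stable_posterior is not a klaR function):

# convert a vector of log-scores L into posterior probabilities without underflow
stable_posterior = function(L) {
  w = exp(L - max(L))   # the largest entry becomes exp(0) = 1, so no underflow
  w / sum(w)
}
stable_posterior(c(-1000, -1001))   # about (0.73, 0.27); exp(L)/sum(exp(L)) would give 0/0

For two classes this reduces to 1/(1 + exp(L2 - L1)), which is exactly the form used in the re-implementation below.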

Next let’s code the prediction differently to avoid the warning messages.

trainX = spam[, -58]; trainY = spam$Y;
p=dim(trainX)[2]; # p: number of predictors
y.levels = levels(trainY)
K= length(y.levels) # number of groups
mymean = matrix(0, K, p)  # K x p matrix of class-specific means
mysd = matrix(0, K, p)    # K x p matrix of class-specific sds

for(k in 1:K){
  mymean[k,] = apply(trainX[trainY == y.levels[k],], 2, mean)
  mysd[k,] = apply(trainX[trainY == y.levels[k],], 2, sd)
}
w = mean(trainY == y.levels[1])   # prior prob of the first class (Y = 0)
ntrain = length(trainY)
tmp1 = rep(0, ntrain); tmp2 = rep(0, ntrain)   # accumulated log-densities for class 1 / class 2
for(i in 1:p){
  prob = dnorm(trainX[,i], mymean[1,i], mysd[1,i])
  prob[prob==0] = 0.001
  tmp1 = tmp1 + log(prob)
  prob = dnorm(trainX[,i], mymean[2,i], mysd[2,i])
  prob[prob==0] = 0.001
  tmp2 = tmp2 + log(prob)
}
tmp1 = tmp1 + log(w)
tmp2 = tmp2 + log(1-w)
my.prob = 1/(1 + exp(tmp2-tmp1))   # posterior prob of the first class (Y = 0)

my.prob should be the same as y.prob:

which(abs(my.prob - y.prob) > 0.001)  
table(my.prob>0.5, y.prob>0.5)  

Next, evaluate the normal log-densities directly, without dnorm and without the 0.001 threshold. The constant -0.5*log(2*pi) is dropped because it is the same for both classes and cancels in the difference.

newtmp1 = rep(0, ntrain); newtmp2 = rep(0, ntrain)
for(i in 1:p){
  newtmp1 = newtmp1 - log(mysd[1,i]) - (trainX[,i] - mymean[1,i])^2/(2*mysd[1,i]^2)
  newtmp2 = newtmp2 - log(mysd[2,i]) - (trainX[,i] - mymean[2,i])^2/(2*mysd[2,i]^2)
}
newtmp1 = newtmp1 + log(w)
newtmp2 = newtmp2 + log(1-w)
diff = newtmp2-newtmp1
my.newprob = 1/(1 + exp(diff)) 
table(my.newprob>0.5, my.prob>0.5)  
##        
##         FALSE TRUE
##   FALSE  2471   21
##   TRUE    472 1637
id = which((my.newprob>0.5) != (my.prob>0.5))
j=id[1];  # the 177th observation

# different prediction
my.newprob[177]
## [1] 0
my.prob[177]
## [1] 1

Evaluate the log-density for class 1 (the first class, Y = 0). You’ll find that at dimension 20 this particular example lies far in the tail of that normal, so its density is extremely small (equivalently, its negative log-density is huge), but predict.NaiveBayes truncates the density at threshold = 0.001, capping the log contribution at log(0.001), about -6.91. That is why the thresholded computation (my.prob, which matches NaiveBayes) assigns this observation to the first class (Y = 0), while the direct computation (my.newprob) assigns it to the second class (Y = 1).

# junk[,1]: dnorm, truncate at the 0.001 threshold, then take the log (as in predict.NaiveBayes)
# junk[,2]: the exact normal log-density
k = 1   # the first class (Y = 0)
junk = matrix(0, p, 2)
for(i in 1:p){
  prob = dnorm(trainX[j,i], mymean[k,i], mysd[k,i])
  if (prob == 0) {prob = 0.001}
  junk[i,1] = log(prob)
  junk[i,2] = -0.5*log(2*pi) - log(mysd[k,i]) - (trainX[j,i]-mymean[k,i])^2/(2*mysd[k,i]^2)
}
round(junk, dig=2)[1:25,]
##        [,1]      [,2]
##  [1,]  0.26      0.26
##  [2,] -1.42     -1.42
##  [3,] -0.31     -0.31
##  [4,]  2.93      2.93
##  [5,] -0.48     -0.48
##  [6,]  0.56      0.56
##  [7,]  1.28      1.28
##  [8,]  0.47      0.47
##  [9,]  0.68      0.68
## [10,] -0.51     -0.51
## [11,]  0.97      0.97
## [12,] -1.05     -1.05
## [13,]  0.40      0.40
## [14,]  0.14      0.14
## [15,]  1.44      1.44
## [16,] -0.44     -0.44
## [17,]  0.58      0.58
## [18,] -0.03     -0.03
## [19,] -1.75     -1.75
## [20,] -6.91 -17662.17
## [21,] -1.04     -1.04
## [22,] -0.43     -0.43
## [23,]  1.77      1.77
## [24,]  0.38      0.38
## [25,] -1.74     -1.74
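
The truncated value at dimension 20 is exactly the log of the default threshold, and the gap between the truncated and exact log-densities there is enormous; a quick check using the junk matrix computed above:

log(0.001)                  # -6.907755, the value used after truncation
junk[20, 1] - junk[20, 2]   # roughly 1.7e4: how much the truncation inflates the
                            # class-1 log-score, more than enough to flip the prediction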