Load the spam data

library(klaR)
## Loading required package: MASS
library("ElemStatLearn")
names(spam)[ncol(spam)] = "Y"             # rename the response column to Y
spam$Y = ifelse(spam$Y == "spam", 1, 0)   # spam -> 1, non-spam -> 0
spam$Y = as.factor(spam$Y)                # Y is now a factor with levels "0" and "1"

Try NaiveBayes in package klaR on the Spam data. Use the whole data as training and compute the training error.

NBfit = NaiveBayes(Y ~ ., data = spam)

names(NBfit)   # check the output from NaiveBayes

# Estimated parameters:
# 1) the class prior, one probability per class
NBfit$apriori
# 2) the class-specific mean and sd for each of the 57 features
NBfit$tables

NBpred = predict(NBfit, newdata = spam)
y.class = NBpred$class
y.prob = NBpred$posterior[, 1]   # posterior prob of the first class (Y = 0)
table(spam$Y, y.prob > 0.5)
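
The training error requested above is simply the misclassification rate of y.class on the full data:

mean(y.class != spam$Y)   # training error of NaiveBayes on the spam data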

There are 50 warning messages. By default R stores only the first 50 warnings (controlled by the nwarnings option), so there may well be more than 50. They all appear to carry the same message, "Numerical 0 probability for all classes with observation ...", differing only in the observation index.

warnings()
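
To check how many observations actually trigger the warning, you can raise the limit on stored warnings and repeat the prediction (assuming the default options(warn = 0), so warnings are collected rather than printed immediately):

options(nwarnings = 5000)                  # keep up to 5000 warnings instead of 50
invisible(predict(NBfit, newdata = spam))  # repeat the prediction
length(warnings())                         # number of observations that triggered the warning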

Let’s find out why there are warning messages. You can use getAnywhere to display the source code of predict.NaiveBayes; it is a registered S3 method that klaR does not export, so typing its name alone would not find it.

getAnywhere(predict.NaiveBayes)
## A single object matching 'predict.NaiveBayes' was found
## It was found in the following places
##   registered S3 method for predict from namespace klaR
##   namespace:klaR
## with value
## 
## function (object, newdata, threshold = 0.001, ...) 
## {
##     if (missing(newdata)) 
##         newdata <- object$x
##     if ((!any(is.null(colnames(newdata)), is.null(object$varnames))) && 
##         all(is.element(object$varnames, colnames(newdata)))) 
##         newdata <- data.frame(newdata[, object$varnames])
##     nattribs <- ncol(newdata)
##     isnumeric <- sapply(newdata, is.numeric)
##     newdata <- data.matrix(newdata)
##     Lfoo <- function(i) {
##         tempfoo <- function(v) {
##             nd <- ndata[v]
##             if (is.na(nd)) 
##                 return(rep(1, length(object$apriori)))
##             prob <- if (isnumeric[v]) {
##                 msd <- object$tables[[v]]
##                 if (object$usekernel) 
##                   sapply(msd, FUN = function(y) dkernel(x = nd, 
##                     kernel = y, ...))
##                 else dnorm(nd, msd[, 1], msd[, 2])
##             }
##             else {
##                 object$tables[[v]][, nd]
##             }
##             prob[prob == 0] <- threshold
##             return(prob)
##         }
##         ndata <- newdata[i, ]
##         tempres <- log(sapply(1:nattribs, tempfoo))
##         L <- log(object$apriori) + rowSums(tempres)
##         if (isTRUE(all.equal(sum(exp(L)), 0))) 
##             warning("Numerical 0 probability for all classes with observation ", 
##                 i)
##         L
##     }
##     L <- sapply(1:nrow(newdata), Lfoo)
##     classdach <- factor(object$levels[apply(L, 2, which.max)], 
##         levels = object$levels)
##     posterior <- t(apply(exp(L), 2, function(x) x/sum(x)))
##     colnames(posterior) <- object$levels
##     rownames(posterior) <- names(classdach) <- rownames(newdata)
##     return(list(class = classdach, posterior = posterior))
## }
## <environment: namespace:klaR>

Check where the warning message is issued inside the function: it is printed when, for observation i, R finds sum(exp(L)) to be numerically indistinguishable from 0.

L <- log(object$apriori) + rowSums(tempres)
if (isTRUE(all.equal(sum(exp(L)), 0))) 
    warning("Numerical 0 probability for all classes with observation ", i)
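
To see the numbers concretely, here is a minimal sketch (not part of klaR) that rebuilds L for a single observation from NBfit$apriori and NBfit$tables, following the same steps as the function above:

x1 = unlist(spam[1, -58])            # the 57 features of observation 1
L  = as.numeric(log(NBfit$apriori))  # start from the log prior term, one entry per class
for(v in 1:57){
  msd  = NBfit$tables[[v]]           # one row per class; column 1 = mean, column 2 = sd
  prob = dnorm(x1[v], msd[, 1], msd[, 2])
  prob[prob == 0] = 0.001            # the same threshold used by predict.NaiveBayes
  L = L + log(prob)
}
L                                    # typically far below -20 for both classes
sum(exp(L))                          # essentially 0, which is what triggers the warning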

What’s L? If we have two classes, labeled 1 and 2, then for each test sample L has two elements,

L_k = log P(Y = k) + sum_j log f(x_j | Y = k),   k = 1, 2,

that is, the log prior plus the sum of the feature-wise log densities (exactly log(object$apriori) + rowSums(tempres) in the code). The posterior probability is then P(Y = 1 | x) = exp(L1)/(exp(L1) + exp(L2)).

Recall that P(x|Y) is a product of one-dimensional normal densities; in predict.NaiveBayes each of them is evaluated with the R function dnorm. For most dimensions the density value is less than 1, so its log is negative. When the dimension p is large, L1 or L2, each a sum of many negative terms, can easily fall below -20. Now exp(-20) is about 2e-9, which is not exactly zero, but it is below the default tolerance of all.equal (about 1.5e-8), so all.equal(sum(exp(L)), 0) returns TRUE and the warning is issued.

isTRUE(all.equal(exp(-20), 0))
## [1] TRUE
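
The warning is therefore harmless, but a generic way to avoid the underflow is to normalize on the log scale: subtract max(L) before exponentiating (the log-sum-exp trick). A minimal sketch with a hypothetical helper (stable_posterior is not a klaR function):

# convert a vector of log-scores L into posterior probabilities without underflow
stable_posterior = function(L) {
  w = exp(L - max(L))   # the largest entry becomes exp(0) = 1, so no underflow
  w / sum(w)
}
stable_posterior(c(-1000, -1001))   # about (0.73, 0.27); exp(L)/sum(exp(L)) would give 0/0

For two classes this reduces to 1/(1 + exp(L2 - L1)), which is exactly the form used in the re-implementation below.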

Next let’s code the prediction differently to avoid the warning messages.

trainX = spam[, -58]; trainY = spam$Y;
p=dim(trainX)[2]; # p: number of predictors
y.levels = levels(trainY)
K= length(y.levels) # number of groups
mymean = matrix(0, K, p)  # K x p matrix of class-specific means
mysd = matrix(0, K, p)    # K x p matrix of class-specific sds

for(k in 1:K){
  mymean[k,] = apply(trainX[trainY == y.levels[k],], 2, mean)
  mysd[k,] = apply(trainX[trainY == y.levels[k],], 2, sd)
}
w = mean(trainY == y.levels[1])   # prior prob of the first class (Y = 0)
ntrain = length(trainY)
tmp1 = rep(0, ntrain); tmp2 = rep(0, ntrain)   # accumulated log-densities for class 1 / class 2
for(i in 1:p){
  prob = dnorm(trainX[,i], mymean[1,i], mysd[1,i])
  prob[prob==0] = 0.001
  tmp1 = tmp1 + log(prob)
  prob = dnorm(trainX[,i], mymean[2,i], mysd[2,i])
  prob[prob==0] = 0.001
  tmp2 = tmp2 + log(prob)
}
tmp1 = tmp1 + log(w)
tmp2 = tmp2 + log(1-w)
my.prob = 1/(1 + exp(tmp2-tmp1))   # posterior prob of the first class (Y = 0)

my.prob should be the same as y.prob:

which(abs(my.prob - y.prob) > 0.001)  
table(my.prob>0.5, y.prob>0.5)  

Next, evaluate the normal log-densities directly, without dnorm and without the 0.001 threshold. The constant -0.5*log(2*pi) is dropped because it is the same for both classes and cancels in the difference.

newtmp1 = rep(0, ntrain); newtmp2 = rep(0, ntrain)
for(i in 1:p){
  newtmp1 = newtmp1 - log(mysd[1,i]) - (trainX[,i] - mymean[1,i])^2/(2*mysd[1,i]^2)
  newtmp2 = newtmp2 - log(mysd[2,i]) - (trainX[,i] - mymean[2,i])^2/(2*mysd[2,i]^2)
}
newtmp1 = newtmp1 + log(w)
newtmp2 = newtmp2 + log(1-w)
diff = newtmp2-newtmp1
my.newprob = 1/(1 + exp(diff)) 
table(my.newprob>0.5, my.prob>0.5)  
##        
##         FALSE TRUE
##   FALSE  2471   21
##   TRUE    472 1637
id = which((my.newprob>0.5) != (my.prob>0.5))
j=id[1];  # the 177th observation

# different prediction
my.newprob[177]
## [1] 0
my.prob[177]
## [1] 1

Evaluate the log-density for class 1 (the first class, Y = 0). You’ll find that at dimension 20 this particular example lies far in the tail of that normal, so its density is extremely small (equivalently, its negative log-density is huge), but predict.NaiveBayes truncates the density at threshold = 0.001, capping the log contribution at log(0.001), about -6.91. That is why the thresholded computation (my.prob, which matches NaiveBayes) assigns this observation to the first class (Y = 0), while the direct computation (my.newprob) assigns it to the second class (Y = 1).

# junk[,1]: dnorm, truncate at the 0.001 threshold, then take the log (as in predict.NaiveBayes)
# junk[,2]: the exact normal log-density
k = 1   # the first class (Y = 0)
junk = matrix(0, p, 2)
for(i in 1:p){
  prob = dnorm(trainX[j,i], mymean[k,i], mysd[k,i])
  if (prob == 0) {prob = 0.001}
  junk[i,1] = log(prob)
  junk[i,2] = -0.5*log(2*pi) - log(mysd[k,i]) - (trainX[j,i]-mymean[k,i])^2/(2*mysd[k,i]^2)
}
round(junk, dig=2)[1:25,]
##        [,1]      [,2]
##  [1,]  0.26      0.26
##  [2,] -1.42     -1.42
##  [3,] -0.31     -0.31
##  [4,]  2.93      2.93
##  [5,] -0.48     -0.48
##  [6,]  0.56      0.56
##  [7,]  1.28      1.28
##  [8,]  0.47      0.47
##  [9,]  0.68      0.68
## [10,] -0.51     -0.51
## [11,]  0.97      0.97
## [12,] -1.05     -1.05
## [13,]  0.40      0.40
## [14,]  0.14      0.14
## [15,]  1.44      1.44
## [16,] -0.44     -0.44
## [17,]  0.58      0.58
## [18,] -0.03     -0.03
## [19,] -1.75     -1.75
## [20,] -6.91 -17662.17
## [21,] -1.04     -1.04
## [22,] -0.43     -0.43
## [23,]  1.77      1.77
## [24,]  0.38      0.38
## [25,] -1.74     -1.74
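
The truncated value at dimension 20 is exactly the log of the default threshold, and the gap between the truncated and exact log-densities there is enormous; a quick check using the junk matrix computed above:

log(0.001)                  # -6.907755, the value used after truncation
junk[20, 1] - junk[20, 2]   # roughly 1.7e4: how much the truncation inflates the
                            # class-1 log-score, more than enough to flip the prediction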