textmodel Performance Comparisons

Kenneth Benoit

library("quanteda.textmodels")
library("quanteda")
## Package version: 2.9.9000
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:quanteda.textmodels':
## 
##     data_dfm_lbgexample
## The following object is masked from 'package:utils':
## 
##     View

Naive Bayes

quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)

Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.

For these tests, we will choose the dataset of 50,000 movie reviews from Maas et. al. (2011). We will use their partition into test and training sets for training and fitting our models.

# large movie review database of 50,000 movie reviews
load(url("https://www.dropbox.com/s/sjdfmx8ggwfda5o/data_corpus_LMRD.rda?dl=1"))

dfmat <- dfm(data_corpus_LMRD)
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")

Comparing the performance of fitting the model:

library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 50
)
## Unit: milliseconds
##   expr       min        lq     mean   median       uq      max neval
##  multi  91.06298  96.99505 115.0232 111.1489 122.0688 177.6943    50
##   bern 104.46766 115.19890 150.0707 136.2682 166.3753 378.4081    50

And for prediction:

microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 50
)
## Unit: milliseconds
##   expr      min       lq     mean   median       uq      max neval
##  multi 111.6401 115.9561 135.7320 125.7128 146.0875 204.3883    50
##   bern 161.6112 168.5677 201.5291 185.3464 225.6300 355.9197    50

Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:

library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
      tmod = multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 50
)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq      max neval
##      textmodels 112.5101 118.2039 136.7075 122.5853 138.1479 210.0075    50
##  fastNaiveBayes 203.0412 211.7781 240.2213 231.0921 251.0667 462.9836    50
##      naivebayes 157.1311 161.3571 192.3014 175.7320 193.7351 585.6871    50

And Bernoulli. Note here that while we are supplying the boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance - it’s done here just for comparison.

dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
      tmod = bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 50
)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq      max neval
##      textmodels 165.5035 187.6931 215.6976 215.7263 231.7469 410.6544    50
##  fastNaiveBayes 224.3868 251.6130 276.6522 264.4959 302.3617 413.6238    50
##      naivebayes 178.2366 196.8630 211.0112 203.7715 211.5721 276.8402    50

😎

References

Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.

Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.

Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.