library("quanteda.textmodels")
library("quanteda")
## Package version: 2.9.9000
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:quanteda.textmodels':
##
## data_dfm_lbgexample
## The following object is masked from 'package:utils':
##
## View
quanteda.textmodels implements fast methods for fitting and predicting from Naive Bayes text models, built especially for the sparse document-feature matrices created from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
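As a minimal sketch of the interface (not part of the benchmark), both distributions are fitted in the same way. The toy data below are adapted from the worked example in Manning, Raghavan, and Schütze (2008, ch. 13) that also appears in the textmodel_nb() documentation; the object names (txt, dfmat_toy, y) are chosen here just for illustration.
# toy illustration: fit both Naive Bayes distributions on the worked
# example from Manning, Raghavan, and Schütze (2008, ch. 13)
txt <- c(d1 = "Chinese Beijing Chinese",
         d2 = "Chinese Chinese Shanghai",
         d3 = "Chinese Macao",
         d4 = "Tokyo Japan Chinese",
         d5 = "Chinese Chinese Chinese Tokyo Japan")
dfmat_toy <- dfm(tokens(txt), tolower = FALSE)
y <- factor(c("Y", "Y", "Y", "N", NA))  # NA marks the document to be classified
tmod_multi <- textmodel_nb(dfmat_toy, y, distribution = "multinomial")
tmod_bern <- textmodel_nb(dfmat_toy, y, distribution = "Bernoulli")
predict(tmod_multi)  # predicted classes for all documents, including d5
predict(tmod_bern)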
Here, we compare the performance of the two models, and then compare both to equivalent implementations from two other packages.
For these tests, we use the dataset of 50,000 movie reviews from Maas et al. (2011), keeping their partition into training and test sets for fitting and evaluating our models.
# large movie review database of 50,000 movie reviews
load(url("https://www.dropbox.com/s/sjdfmx8ggwfda5o/data_corpus_LMRD.rda?dl=1"))
dfmat <- dfm(data_corpus_LMRD)
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
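A quick look at the split (output not shown), using the set and polarity docvars that come with the corpus:
# sanity check on the split and the class balance
ndoc(dfmat_train)            # number of training documents
ndoc(dfmat_test)             # number of test documents
table(dfmat_train$polarity)  # class balance in the training set
nfeat(dfmat)                 # feature set shared by both subsets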
Comparing the performance of fitting the model:
library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 50
)
## Unit: milliseconds
##  expr       min        lq     mean   median       uq      max neval
## multi  91.06298  96.99505 115.0232 111.1489 122.0688 177.6943    50
##  bern 104.46766 115.19890 150.0707 136.2682 166.3753 378.4081    50
And for prediction:
microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 50
)
## Unit: milliseconds
##  expr      min       lq     mean   median       uq      max neval
## multi 111.6401 115.9561 135.7320 125.7128 146.0875 204.3883    50
##  bern 161.6112 168.5677 201.5291 185.3464 225.6300 355.9197    50
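Timing is the focus here, but for reference, a hedged sketch of how test-set accuracy could be checked with the same objects (results not reported here):
# not part of the timing comparison: accuracy of the multinomial model
pred_multi <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                   distribution = "multinomial"),
                      newdata = dfmat_test)
mean(pred_multi == dfmat_test$polarity)  # proportion of test reviews classified correctly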
Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded
microbenchmark(
    textmodels = {
        tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
        pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = {
        tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
        pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
        tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
        pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 50
)
## Unit: milliseconds
##           expr      min       lq     mean   median       uq      max neval
##     textmodels 112.5101 118.2039 136.7075 122.5853 138.1479 210.0075    50
## fastNaiveBayes 203.0412 211.7781 240.2213 231.0921 251.0667 462.9836    50
##     naivebayes 157.1311 161.3571 192.3014 175.7320 193.7351 585.6871    50
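Speed aside, a quick hedged sanity check (not part of the benchmark) that the implementations agree on the predicted classes, reusing only calls that appear above:
# refit outside microbenchmark() and cross-tabulate the predictions;
# off-diagonal counts would indicate disagreement between implementations
pred_qtd <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                 smooth = 1, distribution = "multinomial"),
                    newdata = dfmat_test)
pred_nb <- predict(multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"),
                                           dfmat_train$polarity, laplace = 1),
                   newdata = as(dfmat_test, "dgCMatrix"))
table(quanteda = as.character(pred_qtd), naivebayes = as.character(pred_nb))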
And Bernoulli. Note that while we supply the boolean matrix to textmodel_nb() here, this re-weighting from the count matrix would have been performed automatically within the function had we not done it in advance; we pre-weight here only so that all three packages receive the same input. (A quick check of this equivalence follows the benchmark results below.)
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
microbenchmark(
    textmodels = {
        tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
        pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = {
        tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
        pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
        tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
        pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 50
)
## Unit: milliseconds
##           expr      min       lq     mean   median       uq      max neval
##     textmodels 165.5035 187.6931 215.6976 215.7263 231.7469 410.6544    50
## fastNaiveBayes 224.3868 251.6130 276.6522 264.4959 302.3617 413.6238    50
##     naivebayes 178.2366 196.8630 211.0112 203.7715 211.5721 276.8402    50
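As noted above, the pre-weighting is only for comparability across packages; a quick hedged check that textmodel_nb() yields the same Bernoulli predictions whether it is given the count matrix or the boolean matrix:
# the internal boolean re-weighting should make these two fits equivalent
tmod_counts <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                            smooth = 1, distribution = "Bernoulli")
tmod_bool <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity,
                          smooth = 1, distribution = "Bernoulli")
identical(predict(tmod_counts, newdata = dfmat_test),
          predict(tmod_bool, newdata = dfmat_test))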
😎
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). "Learning Word Vectors for Sentiment Analysis." In Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka, Michal (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7. https://CRAN.R-project.org/package=naivebayes. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.