There are multiple R packages that can transform text data into a matrix of term frequency counts. This document benchmarks five of them: corpus, quanteda, text2vec, tidytext, and tm.
There are four benchmarks: two for unigrams only, and two for unigrams and bigrams. In each benchmark, we perform the following sequence of operations:
- case fold the text
- tokenize into words
- remove punctuation
- remove numbers
- remove stop words
- stem
- compute bigrams (bigram benchmarks only)
- compute term frequencies
- remove terms that appear fewer than five times in the corpus
- compute a term frequency matrix (text by term)
There are some subtle and not-so-subtle differences in how the five packages implement these operations, so this is not really an apples-to-apples comparison, and the outputs are different. Keep that in mind.
We will load the following packages.
library("Matrix")
library("dplyr", warn.conflicts = FALSE)
library("ggplot2")
library("magrittr")
library("methods")
library("stringr")
The remaining packages need to be installed, but we will not load their namespaces:
# Not run:
# install.packages(c("microbenchmark", "corpus", "quanteda", "text2vec", "tidytext", "tm"))
For the first test corpus, we use the text of Pride and Prejudice, provided by the janeaustenr package, split into 62 segments (the front matter plus the 61 chapters):
lines <- (data_frame(text = janeaustenr::prideprejudice)
          %>% mutate(linenumber = row_number(),
                     chapter = cumsum(str_detect(text,
                         regex("^chapter [\\divxlc]", ignore_case = TRUE)))))
text_novel <- c(tapply(lines$text, lines$chapter, paste, collapse = "\n"))
For the second test corpus, we use the 5000 movie reviews provided by the text2vec package:
text_reviews <- text2vec::movie_review$review
names(text_reviews) <- text2vec::movie_review$id
We will use the Snowball English stop word list:
stop_words <- corpus::stopwords_en
As a baseline, we will include a basic implementation, written from scratch by Dmitriy Selivanov (the text2vec author), that can handle unigrams but not bigrams:
# helper function for normalizing text, also used by text2vec below
preprocess <- function(x)
{
    # Note: this works fine for ASCII but not for general Unicode.
    # For Unicode, do the following instead:
    #
    # (stringi::stri_trans_nfkc_casefold(x)
    #  %>% stringi::stri_replace_all_regex("[^\\p{Letter}\\s]", ""))
    str_to_lower(x) %>% str_replace_all("[^[:alpha:]\\s]", "")
}
# helper function for tokenizing and stemming, also used by text2vec below
stem_tokenizer <- function(x)
{
    str_split(x, boundary("word")) %>% lapply(SnowballC::wordStem, "english")
}
matrix_basic <- function(text, min_count = 5)
{
    # normalize and tokenize the text
    toks <- text %>% preprocess %>% stem_tokenizer
    toks_flat <- unlist(toks, recursive = FALSE, use.names = FALSE)

    # compute the text lengths
    ntok <- vapply(toks, length, 0L)

    # compute the types, remove stop words
    types <- unique(toks_flat) %>% setdiff(stop_words)

    # construct the term matrix
    i <- rep.int(seq_along(text), ntok)
    j <- match(toks_flat, types)
    drop <- is.na(j)
    x <- sparseMatrix(i = i[!drop], j = j[!drop], x = 1,
                      dims = c(length(text), length(types)),
                      dimnames = list(names(text), types),
                      check = FALSE)

    # drop terms below the minimum count
    x <- x[, colSums(x) >= min_count, drop = FALSE]
    x
}
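To see what this returns, here is a toy call (a hypothetical example, with the expected result shown as comments; note that stemming happens before stop word removal here, so "above" survives as "abov"):
matrix_basic(c(a = "Above ground.", b = "Above ground."), min_count = 2)
# expected output:
# 2 x 2 sparse Matrix of class "dgCMatrix"
#   abov ground
# a    1      1
# b    1      1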
matrix_corpus <- function(text, bigrams = FALSE, min_count = 5)
{
    if (bigrams) {
        ngrams <- 1:2
    } else {
        ngrams <- 1
    }
    f <- corpus::text_filter(stemmer = "en", drop_punct = TRUE,
                             drop_number = TRUE, drop = stop_words)
    stats <- corpus::term_stats(text, f, ngrams = ngrams,
                                min_count = min_count)
    x <- corpus::term_matrix(text, f, select = stats$term)
    x
}
matrix_quanteda <- function(text, bigrams = FALSE, min_count = 5)
{
    if (bigrams) {
        ngrams <- 1:2
    } else {
        ngrams <- 1
    }
    x <- quanteda::dfm(text, stem = TRUE, remove_punct = TRUE,
                       remove_numbers = TRUE, remove = stop_words,
                       ngrams = ngrams, verbose = FALSE)
    x <- quanteda::dfm_trim(x, min_count = min_count, verbose = FALSE)
    x
}
# Written by Dmitriy Selivanov
matrix_text2vec <- function(text, bigrams = FALSE, min_count = 5)
{
    if (bigrams) {
        ngram <- c(1, 2)
    } else {
        ngram <- c(1, 1)
    }
    # since we don't care about RAM usage we will tokenize everything only
    # once and do it with a single call to preprocess and tokenizer
    tokens <- preprocess(text) %>% stem_tokenizer
    it_train <- text2vec::itoken(tokens, n_chunks = 1, progressbar = FALSE)
    vocab <- text2vec::create_vocabulary(it_train, ngram = ngram,
                                         stopwords = stop_words)
    pruned_vocab <- text2vec::prune_vocabulary(vocab,
                                               term_count_min = min_count)
    vectorizer <- text2vec::vocab_vectorizer(pruned_vocab)
    x <- text2vec::create_dtm(it_train, vectorizer)
    x
}
# Note: this filters punctuation but keeps numbers
matrix_tidytext <- function(text, bigrams = FALSE, min_count = 5)
{
    data <- tibble::tibble(text_id = seq_along(text), text = text)
    stops <- tibble::tibble(word = stop_words)

    # unigrams
    freqs <- (data
              %>% tidytext::unnest_tokens(word, text)
              %>% anti_join(stops, by = "word")
              %>% mutate(term = SnowballC::wordStem(word, "english"))
              %>% count(text_id, term)
              %>% ungroup())

    # bigrams
    if (bigrams) {
        freqs2 <- (data
                   %>% tidytext::unnest_tokens(bigram, text,
                                               token = "ngrams", n = 2)
                   %>% tidyr::separate(bigram, c("type1", "type2"), sep = " ")
                   %>% filter(!type1 %in% stop_words,
                              !type2 %in% stop_words)
                   %>% mutate(type1 = SnowballC::wordStem(type1, "english"),
                              type2 = SnowballC::wordStem(type2, "english"))
                   %>% mutate(term = paste(type1, type2))
                   %>% count(text_id, term)
                   %>% ungroup())
        freqs <- rbind(freqs, freqs2)
    }

    # form matrix in slam format
    x <- freqs %>% tidytext::cast_dtm(text_id, term, n)

    # remove rare terms
    x <- x[, slam::col_sums(x) >= min_count, drop = FALSE]

    # cast to dgCMatrix format
    x <- sparseMatrix(i = x$i, j = x$j, x = x$v, dims = dim(x),
                      dimnames = dimnames(x), check = FALSE)
    x
}
# from http://tm.r-forge.r-project.org/faq.html#Bigrams
BigramTokenizer <- function(x)
{
    unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "),
           use.names = FALSE)
}
matrix_tm <- function(text, bigrams = FALSE, min_count = 5)
{
    corpus <- (tm::VCorpus(tm::VectorSource(text))
               %>% tm::tm_map(tm::content_transformer(tolower))
               %>% tm::tm_map(tm::removeWords, stop_words)
               %>% tm::tm_map(tm::removePunctuation)
               %>% tm::tm_map(tm::removeNumbers)
               %>% tm::tm_map(tm::stemDocument, language = "english"))
    control <- list(wordLengths = c(1, Inf),
                    bounds = list(global = c(min_count, Inf)))
    x <- tm::DocumentTermMatrix(corpus, control = control)
    if (bigrams) {
        control$tokenize <- BigramTokenizer
        x2 <- tm::DocumentTermMatrix(corpus, control = control)
        x <- cbind(x, x2)
    }
    x <- sparseMatrix(i = x$i, j = x$j, x = x$v, dims = dim(x),
                      dimnames = dimnames(x), check = FALSE)
    x
}
These implementations all give different results. See, for example, the results on the following sample text:
sample <- "Above ground. Another sentence. Others..."
# compute term matrices using five implementations
xs <- list(corpus = matrix_corpus(sample, bigrams = TRUE, min_count = 1),
           quanteda = matrix_quanteda(sample, bigrams = TRUE, min_count = 1),
           text2vec = matrix_text2vec(sample, bigrams = TRUE, min_count = 1),
           tidytext = matrix_tidytext(sample, bigrams = TRUE, min_count = 1),
           tm = matrix_tm(sample, bigrams = TRUE, min_count = 1))
# normalize the names (some use '_' to join bigrams, others use ' ')
for (i in seq_along(xs)) {
    colnames(xs[[i]]) <- str_replace_all(colnames(xs[[i]]), " ", "_")
}
# get the unique terms
terms <- unique(c(sapply(xs, colnames), recursive = TRUE))
# put unigrams before bigrams, then order lexicographically
terms <- terms[order(str_count(terms, "_"), terms)]
# combine everything into a single matrix
x <- matrix(0, length(xs), length(terms), dimnames = list(names(xs), terms))
for (i in seq_along(xs)) {
    xi <- xs[[i]]
    x[i, colnames(xi)] <- as.numeric(xi[1, ])
}
print(as(x, "dgCMatrix"))
5 x 9 sparse Matrix of class "dgCMatrix"
abov anoth ground other sentenc abov_ground anoth_sentenc ground_anoth sentenc_other
corpus . 1 1 . 1 . 1 . .
quanteda . 1 1 1 1 1 1 1 1
text2vec 1 1 1 . 1 1 1 1 .
tidytext . 1 1 1 1 . 1 1 1
tm . 1 1 1 1 . 1 1 1
print(sample)
[1] "Above ground. Another sentence. Others..."
Some major differences between the implementations:
- With the quanteda, tidytext, and tm implementations, we remove stop words first and then stem. With text2vec, we stem and then remove stop words. Corpus removes stop words after stemming, and by default it does not stem any words on the drop list. The word "other" is a stop word, but "others" is not; however, "others" stems to "other". Corpus and text2vec remove "others"; quanteda, tidytext, and tm replace "others" with a non-dropped "other" token. Another example: "above" is a stop word that stems to "abov". Text2vec replaces "above" with "abov" and keeps the token; the other packages drop "above". (See the short demonstration after this list.)
- By design, corpus does not form bigrams across dropped tokens, in particular across dropped punctuation. The other packages form bigrams from "ground. Another" and "sentence. Others"; corpus does not.
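The stemming and stop word interaction is easy to verify directly, using the SnowballC stemmer and the stop word list loaded above (expected output shown as comments):
SnowballC::wordStem(c("others", "above"), "english")
# [1] "other" "abov"
c("other", "others", "above", "abov") %in% stop_words
# [1]  TRUE FALSE  TRUE FALSE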
There are also differences in how the packages handle numbers and
punctuation. Beyond that, there are differences in the default output
formats, but we have converted everything to the Matrix "dgCMatrix"
format to make the outputs comparable. (By default, corpus,
quanteda, and text2vec return Matrix objects, but tidytext and
tm return slam objects.)
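For reference, the slam-to-Matrix conversion used inside matrix_tidytext and matrix_tm above can be wrapped as a small standalone helper (the function name here is our own):
# convert a slam "simple_triplet_matrix" to a Matrix "dgCMatrix" using
# its triplet components, as done in matrix_tidytext and matrix_tm
as_dgCMatrix <- function(x)
{
    sparseMatrix(i = x$i, j = x$j, x = x$v, dims = dim(x),
                 dimnames = dimnames(x))
}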
First, we define a function to run the benchmarks and another to plot the results:
make_bench <- function(name, text, bigrams)
{
    if (!bigrams) {
        results <- microbenchmark::microbenchmark(
            basic = matrix_basic(text),
            corpus = matrix_corpus(text, bigrams = FALSE),
            quanteda = matrix_quanteda(text, bigrams = FALSE),
            text2vec = matrix_text2vec(text, bigrams = FALSE),
            tidytext = matrix_tidytext(text, bigrams = FALSE),
            tm = matrix_tm(text, bigrams = FALSE),
            times = 5)
    } else {
        results <- microbenchmark::microbenchmark(
            corpus = matrix_corpus(text, bigrams = TRUE),
            quanteda = matrix_quanteda(text, bigrams = TRUE),
            text2vec = matrix_text2vec(text, bigrams = TRUE),
            tidytext = matrix_tidytext(text, bigrams = TRUE),
            tm = matrix_tm(text, bigrams = TRUE),
            times = 5)
    }
    list(name = name, results = results)
}

plot_bench <- function(bench)
{
    (ggplot(summary(bench$results),
            aes(x = expr, fill = expr, y = median, ymin = lq, ymax = uq))
     + geom_bar(color = "white", stat = "identity")
     + geom_errorbar()
     + scale_fill_discrete(name = "Implementation")
     + ggtitle(bench$name)
     + xlab("")
     + ylab("Computation time (less is better)"))
}
Next, we present the results for the four benchmarks.
bench1 <- make_bench("Unigram, Novel", text_novel, bigrams = FALSE)
plot_bench(bench1)
print(bench1$results)
Unit: milliseconds
expr min lq mean median uq max neval
basic 88.48920 89.05394 96.37584 92.10699 103.3259 108.90318 5
corpus 72.55908 73.68373 79.02957 74.27550 82.2842 92.34531 5
quanteda 178.18855 192.49325 197.66854 194.64166 206.5873 216.43199 5
text2vec 131.45966 133.51842 140.94905 144.34047 147.4138 148.01290 5
tidytext 183.18050 184.20259 198.73485 189.34038 208.6867 228.26406 5
tm 699.21757 723.36367 743.64822 758.79199 768.1538 768.71406 5
bench2 <- make_bench("Unigram, Reviews", text_reviews, bigrams = FALSE)
plot_bench(bench2)
print(bench2$results)
Unit: milliseconds
expr min lq mean median uq max neval
basic 1114.2377 1173.7770 1251.7259 1193.6571 1193.7397 1583.2179 5
corpus 796.6223 827.9568 868.8841 856.4446 883.8539 979.5432 5
quanteda 2657.2460 2906.8692 2958.8925 2994.7007 3011.7327 3223.9141 5
text2vec 1454.0189 1524.6996 1643.7814 1575.0321 1641.7728 2023.3836 5
tidytext 2883.3277 3085.5559 3127.4713 3086.8100 3222.3369 3359.3260 5
tm 10733.1389 10928.9481 11294.6036 11203.5126 11299.2535 12308.1646 5
bench3 <- make_bench("Bigram, Novel", text_novel, bigrams = TRUE)
plot_bench(bench3)
print(bench3$results)
Unit: milliseconds
expr min lq mean median uq max neval
corpus 81.8879 87.02909 87.46721 87.96738 88.66089 91.79078 5
quanteda 2097.5938 2130.44951 2253.24351 2135.06500 2219.83249 2683.27677 5
text2vec 213.3557 215.30105 235.06664 220.82011 248.88193 276.97439 5
tidytext 669.0296 720.40514 750.92072 755.79841 773.24436 836.12608 5
tm 1351.4030 1359.86280 1393.70624 1403.16597 1425.30467 1428.79475 5
bench4 <- make_bench("Bigram, Reviews", text_reviews, bigrams = TRUE)
plot_bench(bench4)
print(bench4$results)
Unit: seconds
expr min lq mean median uq max neval
corpus 1.141393 1.152793 1.298710 1.155420 1.327140 1.716806 5
quanteda 23.969605 25.460181 26.070375 26.198521 26.770043 27.953526 5
text2vec 2.804406 2.840682 3.213001 3.011508 3.062619 4.345788 5
tidytext 12.643300 12.819985 13.333948 13.082897 13.962451 14.161108 5
tm 21.835697 22.255695 22.458758 22.649470 22.710763 22.842165 5
Corpus is faster than the other four packages: by roughly a factor of two against its closest competitor (text2vec) and by a factor of ten or more against the slowest, and it even edges out the stripped-down basic implementation. What's going on here? The other packages tokenize the text into a list of character vectors, then process the token lists to form the term matrices. Corpus bypasses this intermediate step, going directly from the text to the term matrix without constructing an intermediate "tokens" object. This is only possible because all of the corpus normalization and tokenization is implemented directly in C.
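Schematically, the two approaches look as follows (a sketch using the functions defined above, not part of the benchmark):
# token-list approach (basic, quanteda, text2vec, tidytext, tm):
# materialize a tokens object in R, then count
toks <- text_novel %>% preprocess %>% stem_tokenizer
# ... counting then operates on `toks`

# corpus approach: straight from text to matrix, tokenizing in C
f <- corpus::text_filter(stemmer = "en", drop_punct = TRUE,
                         drop_number = TRUE, drop = stop_words)
x <- corpus::term_matrix(text_novel, f)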
The downside of the corpus approach is flexibility: if you're using corpus, you can't swap out the normalizer or tokenizer for something custom. With varying degrees of ease, the other packages let you replace these steps with your own functions, as the sketch below illustrates for text2vec.
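For example, matrix_text2vec above already plugs our custom stem_tokenizer into the text2vec pipeline; swapping in a different tokenizer is a one-line change. A hypothetical variant that skips stemming:
# same pipeline as matrix_text2vec, but without the stemming step
tokens_raw <- preprocess(text_reviews) %>% str_split(boundary("word"))
it <- text2vec::itoken(tokens_raw, n_chunks = 1, progressbar = FALSE)
# ... then proceed with create_vocabulary / create_dtm as before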
Of course, there's more to text mining than term matrices, so if you need more, then corpus alone probably won't be sufficient for you. The other packages have different strengths: quanteda and text2vec provide a host of models and metrics; tidytext fits in well with "tidy data" pipelines built on dplyr and related tools; tm has many extension packages for data input and modeling. Choose the package that best meets your needs.
sessionInfo()
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin16.7.0 (64-bit)
Running under: macOS Sierra 10.12.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] methods stats graphics grDevices utils datasets base
other attached packages:
[1] bindrcpp_0.2 stringr_1.2.0 magrittr_1.5 dplyr_0.7.2 Matrix_1.2-10 quanteda_0.99 ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] slam_0.1-40 NLP_0.1-10 reshape2_1.4.2 purrr_0.2.3 lattice_0.20-35
[6] colorspace_1.3-2 htmltools_0.3.6 SnowballC_0.5.1 tidytext_0.1.3 rlang_0.1.2.9000
[11] foreign_0.8-69 glue_1.1.1 lambda.r_1.1.9 text2vec_0.5.0 foreach_1.4.3
[16] bindr_0.1 plyr_1.8.4 munsell_0.4.3 gtable_0.2.0 futile.logger_1.4.3
[21] codetools_0.2-15 psych_1.7.5 evaluate_0.10.1 labeling_0.3 knitr_1.17
[26] tm_0.7-1 parallel_3.4.1 broom_0.4.2 tokenizers_0.1.4 Rcpp_0.12.12
[31] scales_0.4.1 backports_1.1.0 corpus_0.9.2 RcppParallel_4.3.20 microbenchmark_1.4-2.1
[36] fastmatch_1.1-0 mnormt_1.5-5 digest_0.6.12 stringi_1.1.5 grid_3.4.1
[41] rprojroot_1.2 tools_3.4.1 lazyeval_0.2.0 tibble_1.3.4 janeaustenr_0.1.4
[46] futile.options_1.0.0 tidyr_0.6.3 pkgconfig_2.0.1 data.table_1.10.4 lubridate_1.6.0
[51] assertthat_0.2.0 rmarkdown_1.6 iterators_1.0.8 R6_2.2.2 nlme_3.1-131
[56] compiler_3.4.1