There are multiple R packages that can transform text data into a matrix of term frequency counts. This document benchmarks five of them: corpus, quanteda, text2vec, tidytext, and tm.
There are four benchmarks: two for unigrams only, and two for unigrams and bigrams. In each benchmark, we perform the following sequence of operations:
- case fold the text
- tokenize into words
- remove punctuation
- remove numbers
- remove stop words
- stem
- compute bigrams (bigram benchmarks only)
- compute term frequencies
- remove terms that appear fewer than five times in the corpus
- compute a term frequency matrix (text by term)
There are some subtle and not-so-subtle differences in how the five packages implement these operations, so this is not really an apples-to-apples comparison, and the outputs are different. Keep that in mind.
We will load the following packages.
library("Matrix")
library("dplyr", warn.conflicts = FALSE)
library("ggplot2")
library("magrittr")
library("methods")
library("stringr")
The remaining packages need to be installed, but we will not load their namespaces:
# Not run:
# install.packages(c("microbenchmark", "corpus", "quanteda", "text2vec", "tidytext", "tm"))
For the first test corpus, we use the text of Pride and Prejudice, provided by the janeaustenr package, split into 62 segments (the front matter plus the 61 chapters):
lines <- (data_frame(text = janeaustenr::prideprejudice)
          %>% mutate(linenumber = row_number(),
                     chapter = cumsum(str_detect(text,
                         regex("^chapter [\\divxlc]", ignore_case = TRUE)))))
text_novel <- c(tapply(lines$text, lines$chapter, paste, collapse = "\n"))
For the second test corpus, we use the 5000 movie reviews provided by the text2vec package:
text_reviews <- text2vec::movie_review$review
names(text_reviews) <- text2vec::movie_review$id
We will use the Snowball English stop word list:
stop_words <- corpus::stopwords_en
As a baseline, we will include a basic implementation, written from scratch by Dmitriy Selivanov (the text2vec author), that can handle unigrams but not bigrams:
# helper function for normalizing text, also used by text2vec below
preprocess <- function(x)
{
    # Note: this works fine for ASCII but not for general Unicode.
    # For Unicode, do the following instead:
    #
    # (stringi::stri_trans_nfkc_casefold(x)
    #  %>% stringi::stri_replace_all_regex("[^\\p{Letter}\\s]", ""))
    str_to_lower(x) %>% str_replace_all("[^[:alpha:]\\s]", "")
}
# helper function for tokenizing and stemming, also used by text2vec below
stem_tokenizer <- function(x)
{
    str_split(x, boundary("word")) %>% lapply(SnowballC::wordStem, "english")
}
matrix_basic <- function(text, min_count = 5)
{
    # normalize and tokenize the text
    toks <- text %>% preprocess %>% stem_tokenizer
    toks_flat <- unlist(toks, recursive = FALSE, use.names = FALSE)

    # compute the text lengths
    ntok <- vapply(toks, length, 0L)

    # compute the types, remove stop words
    types <- unique(toks_flat) %>% setdiff(stop_words)

    # construct the term matrix
    i <- rep.int(seq_along(text), ntok)
    j <- match(toks_flat, types)
    drop <- is.na(j)
    x <- sparseMatrix(i = i[!drop], j = j[!drop], x = 1,
                      dims = c(length(text), length(types)),
                      dimnames = list(names(text), types),
                      check = FALSE)

    # drop terms below the minimum count
    x <- x[, colSums(x) >= min_count, drop = FALSE]
    x
}
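To see what this returns, here is a toy call (a hypothetical example, with the expected result shown as comments; note that stemming happens before stop word removal here, so "above" survives as "abov"):
matrix_basic(c(a = "Above ground.", b = "Above ground."), min_count = 2)
# expected output:
# 2 x 2 sparse Matrix of class "dgCMatrix"
#   abov ground
# a    1      1
# b    1      1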
matrix_corpus <- function(text, bigrams = FALSE, min_count = 5)
{
    if (bigrams) {
        ngrams <- 1:2
    } else {
        ngrams <- 1
    }
    f <- corpus::text_filter(stemmer = "en", drop_punct = TRUE,
                             drop_number = TRUE, drop = stop_words)
    stats <- corpus::term_stats(text, f, ngrams = ngrams,
                                min_count = min_count)
    x <- corpus::term_matrix(text, f, select = stats$term)
    x
}
matrix_quanteda <- function(text, bigrams = FALSE, min_count = 5)
{
    if (bigrams) {
        ngrams <- 1:2
    } else {
        ngrams <- 1
    }
    x <- quanteda::dfm(text, stem = TRUE, remove_punct = TRUE,
                       remove_numbers = TRUE, remove = stop_words,
                       ngrams = ngrams, verbose = FALSE)
    x <- quanteda::dfm_trim(x, min_count = min_count, verbose = FALSE)
    x
}
# Written by Dmitriy Selivanov
matrix_text2vec <- function(text, bigrams = FALSE, min_count = 5)
{
    if (bigrams) {
        ngram <- c(1, 2)
    } else {
        ngram <- c(1, 1)
    }
    # since we don't care about RAM usage we will tokenize everything only
    # once and do it with a single call to preprocess and tokenizer
    tokens <- preprocess(text) %>% stem_tokenizer
    it_train <- text2vec::itoken(tokens, n_chunks = 1, progressbar = FALSE)
    vocab <- text2vec::create_vocabulary(it_train, ngram = ngram,
                                         stopwords = stop_words)
    pruned_vocab <- text2vec::prune_vocabulary(vocab,
                                               term_count_min = min_count)
    vectorizer <- text2vec::vocab_vectorizer(pruned_vocab)
    x <- text2vec::create_dtm(it_train, vectorizer)
    x
}
# Note: this filters punctuation but keeps numbers
matrix_tidytext <- function(text, bigrams = FALSE, min_count = 5)
{
    data <- tibble::tibble(text_id = seq_along(text), text = text)
    stops <- tibble::tibble(word = stop_words)

    # unigrams
    freqs <- (data
              %>% tidytext::unnest_tokens(word, text)
              %>% anti_join(stops, by = "word")
              %>% mutate(term = SnowballC::wordStem(word, "english"))
              %>% count(text_id, term)
              %>% ungroup())

    # bigrams
    if (bigrams) {
        freqs2 <- (data
                   %>% tidytext::unnest_tokens(bigram, text,
                                               token = "ngrams", n = 2)
                   %>% tidyr::separate(bigram, c("type1", "type2"), sep = " ")
                   %>% filter(!type1 %in% stop_words,
                              !type2 %in% stop_words)
                   %>% mutate(type1 = SnowballC::wordStem(type1, "english"),
                              type2 = SnowballC::wordStem(type2, "english"))
                   %>% mutate(term = paste(type1, type2))
                   %>% count(text_id, term)
                   %>% ungroup())
        freqs <- rbind(freqs, freqs2)
    }

    # form matrix in slam format
    x <- freqs %>% tidytext::cast_dtm(text_id, term, n)

    # remove rare terms
    x <- x[, slam::col_sums(x) >= min_count, drop = FALSE]

    # cast to dgCMatrix format
    x <- sparseMatrix(i = x$i, j = x$j, x = x$v, dims = dim(x),
                      dimnames = dimnames(x), check = FALSE)
    x
}
# from http://tm.r-forge.r-project.org/faq.html#Bigrams
BigramTokenizer <- function(x)
{
    unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "),
           use.names = FALSE)
}
matrix_tm <- function(text, bigrams = FALSE, min_count = 5)
{
    corpus <- (tm::VCorpus(tm::VectorSource(text))
               %>% tm::tm_map(tm::content_transformer(tolower))
               %>% tm::tm_map(tm::removeWords, stop_words)
               %>% tm::tm_map(tm::removePunctuation)
               %>% tm::tm_map(tm::removeNumbers)
               %>% tm::tm_map(tm::stemDocument, language = "english"))
    control <- list(wordLengths = c(1, Inf),
                    bounds = list(global = c(min_count, Inf)))
    x <- tm::DocumentTermMatrix(corpus, control = control)
    if (bigrams) {
        control$tokenize <- BigramTokenizer
        x2 <- tm::DocumentTermMatrix(corpus, control = control)
        x <- cbind(x, x2)
    }
    x <- sparseMatrix(i = x$i, j = x$j, x = x$v, dims = dim(x),
                      dimnames = dimnames(x), check = FALSE)
    x
}
These implementations all give different results. See, for example, the results on the following sample text:
sample <- "Above ground. Another sentence. Others..."
# compute term matrices using five implementations
xs <- list(corpus = matrix_corpus(sample, bigrams = TRUE, min_count = 1),
           quanteda = matrix_quanteda(sample, bigrams = TRUE, min_count = 1),
           text2vec = matrix_text2vec(sample, bigrams = TRUE, min_count = 1),
           tidytext = matrix_tidytext(sample, bigrams = TRUE, min_count = 1),
           tm = matrix_tm(sample, bigrams = TRUE, min_count = 1))
# normalize the names (some use '_' to join bigrams, others use ' ')
for (i in seq_along(xs)) {
    colnames(xs[[i]]) <- str_replace_all(colnames(xs[[i]]), " ", "_")
}
# get the unique terms
terms <- unique(c(sapply(xs, colnames), recursive = TRUE))
# put unigrams before bigrams, then order lexicographically
terms <- terms[order(str_count(terms, "_"), terms)]
# combine everything into a single matrix
x <- matrix(0, length(xs), length(terms), dimnames = list(names(xs), terms))
for (i in seq_along(xs)) {
    xi <- xs[[i]]
    x[i, colnames(xi)] <- as.numeric(xi[1, ])
}
print(as(x, "dgCMatrix"))
5 x 9 sparse Matrix of class "dgCMatrix"
abov anoth ground other sentenc abov_ground anoth_sentenc ground_anoth sentenc_other
corpus . 1 1 . 1 . 1 . .
quanteda . 1 1 1 1 1 1 1 1
text2vec 1 1 1 . 1 1 1 1 .
tidytext . 1 1 1 1 . 1 1 1
tm . 1 1 1 1 . 1 1 1
print(sample)
[1] "Above ground. Another sentence. Others..."
Some major differences between the implementations:
- With the quanteda, tidytext, and tm implementations, we remove stop words first and then stem. With text2vec, we stem and then remove stop words. Corpus removes stop words after stemming, and by default it does not stem any words on the drop list. The word "other" is a stop word, but "others" is not; however, "others" stems to "other". Corpus and text2vec remove "others"; quanteda, tidytext, and tm replace "others" with a non-dropped "other" token. Another example: "above" is a stop word that stems to "abov". Text2vec replaces "above" with "abov" and keeps the token; the other packages drop "above". (See the short demonstration after this list.)
- By design, corpus does not form bigrams across dropped tokens, in particular across dropped punctuation. The other packages form bigrams from "ground. Another" and "sentence. Others"; corpus does not.
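The stemming and stop word interaction is easy to verify directly, using the SnowballC stemmer and the stop word list loaded above (expected output shown as comments):
SnowballC::wordStem(c("others", "above"), "english")
# [1] "other" "abov"
c("other", "others", "above", "abov") %in% stop_words
# [1]  TRUE FALSE  TRUE FALSE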
There are also differences in how the packages handle numbers and
punctuation. Beyond that, there are differences in the default output
formats, but we have converted everything to the Matrix "dgCMatrix"
format to make the outputs comparable. (By default, corpus,
quanteda, and text2vec return Matrix objects, but tidytext and
tm return slam objects.)
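For reference, the slam-to-Matrix conversion used inside matrix_tidytext and matrix_tm above can be wrapped as a small standalone helper (the function name here is our own):
# convert a slam "simple_triplet_matrix" to a Matrix "dgCMatrix" using
# its triplet components, as done in matrix_tidytext and matrix_tm
as_dgCMatrix <- function(x)
{
    sparseMatrix(i = x$i, j = x$j, x = x$v, dims = dim(x),
                 dimnames = dimnames(x))
}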
First, we define a function to run the benchmarks and another to plot the results:
make_bench <- function(name, text, bigrams)
{
    if (!bigrams) {
        results <- microbenchmark::microbenchmark(
            basic = matrix_basic(text),
            corpus = matrix_corpus(text, bigrams = FALSE),
            quanteda = matrix_quanteda(text, bigrams = FALSE),
            text2vec = matrix_text2vec(text, bigrams = FALSE),
            tidytext = matrix_tidytext(text, bigrams = FALSE),
            tm = matrix_tm(text, bigrams = FALSE),
            times = 5)
    } else {
        results <- microbenchmark::microbenchmark(
            corpus = matrix_corpus(text, bigrams = TRUE),
            quanteda = matrix_quanteda(text, bigrams = TRUE),
            text2vec = matrix_text2vec(text, bigrams = TRUE),
            tidytext = matrix_tidytext(text, bigrams = TRUE),
            tm = matrix_tm(text, bigrams = TRUE),
            times = 5)
    }
    list(name = name, results = results)
}

plot_bench <- function(bench)
{
    (ggplot(summary(bench$results),
            aes(x = expr, fill = expr, y = median, ymin = lq, ymax = uq))
     + geom_bar(color = "white", stat = "identity")
     + geom_errorbar()
     + scale_fill_discrete(name = "Implementation")
     + ggtitle(bench$name)
     + xlab("")
     + ylab("Computation time (less is better)"))
}
Next, we present the results for the four benchmarks.
bench1 <- make_bench("Unigram, Novel", text_novel, bigrams = FALSE)
plot_bench(bench1)
print(bench1$results)
Unit: milliseconds
expr min lq mean median uq max neval
basic 88.48920 89.05394 96.37584 92.10699 103.3259 108.90318 5
corpus 72.55908 73.68373 79.02957 74.27550 82.2842 92.34531 5
quanteda 178.18855 192.49325 197.66854 194.64166 206.5873 216.43199 5
text2vec 131.45966 133.51842 140.94905 144.34047 147.4138 148.01290 5
tidytext 183.18050 184.20259 198.73485 189.34038 208.6867 228.26406 5
tm 699.21757 723.36367 743.64822 758.79199 768.1538 768.71406 5
bench2 <- make_bench("Unigram, Reviews", text_reviews, bigrams = FALSE)
plot_bench(bench2)
print(bench2$results)
Unit: milliseconds
expr min lq mean median uq max neval
basic 1114.2377 1173.7770 1251.7259 1193.6571 1193.7397 1583.2179 5
corpus 796.6223 827.9568 868.8841 856.4446 883.8539 979.5432 5
quanteda 2657.2460 2906.8692 2958.8925 2994.7007 3011.7327 3223.9141 5
text2vec 1454.0189 1524.6996 1643.7814 1575.0321 1641.7728 2023.3836 5
tidytext 2883.3277 3085.5559 3127.4713 3086.8100 3222.3369 3359.3260 5
tm 10733.1389 10928.9481 11294.6036 11203.5126 11299.2535 12308.1646 5
bench3 <- make_bench("Bigram, Novel", text_novel, bigrams = TRUE)
plot_bench(bench3)
print(bench3$results)
Unit: milliseconds
expr min lq mean median uq max neval
corpus 81.8879 87.02909 87.46721 87.96738 88.66089 91.79078 5
quanteda 2097.5938 2130.44951 2253.24351 2135.06500 2219.83249 2683.27677 5
text2vec 213.3557 215.30105 235.06664 220.82011 248.88193 276.97439 5
tidytext 669.0296 720.40514 750.92072 755.79841 773.24436 836.12608 5
tm 1351.4030 1359.86280 1393.70624 1403.16597 1425.30467 1428.79475 5
bench4 <- make_bench("Bigram, Reviews", text_reviews, bigrams = TRUE)
plot_bench(bench4)
print(bench4$results)
Unit: seconds
expr min lq mean median uq max neval
corpus 1.141393 1.152793 1.298710 1.155420 1.327140 1.716806 5
quanteda 23.969605 25.460181 26.070375 26.198521 26.770043 27.953526 5
text2vec 2.804406 2.840682 3.213001 3.011508 3.062619 4.345788 5
tidytext 12.643300 12.819985 13.333948 13.082897 13.962451 14.161108 5
tm 21.835697 22.255695 22.458758 22.649470 22.710763 22.842165 5
Corpus is faster than the other four packages: by roughly a factor of two against its closest competitor (text2vec) and by a factor of ten or more against the slowest, and it even edges out the stripped-down basic implementation. What's going on here? The other packages tokenize the text into a list of character vectors, then process the token lists to form the term matrices. Corpus bypasses this intermediate step, going directly from the text to the term matrix without constructing an intermediate "tokens" object. This is only possible because all of the corpus normalization and tokenization is implemented directly in C.
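Schematically, the two approaches look as follows (a sketch using the functions defined above, not part of the benchmark):
# token-list approach (basic, quanteda, text2vec, tidytext, tm):
# materialize a tokens object in R, then count
toks <- text_novel %>% preprocess %>% stem_tokenizer
# ... counting then operates on `toks`

# corpus approach: straight from text to matrix, tokenizing in C
f <- corpus::text_filter(stemmer = "en", drop_punct = TRUE,
                         drop_number = TRUE, drop = stop_words)
x <- corpus::term_matrix(text_novel, f)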
The downside of the corpus approach is flexibility: if you're using corpus, you can't swap out the normalizer or tokenizer for something custom. With varying degrees of ease, the other packages let you replace these steps with your own functions, as the sketch below illustrates for text2vec.
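For example, matrix_text2vec above already plugs our custom stem_tokenizer into the text2vec pipeline; swapping in a different tokenizer is a one-line change. A hypothetical variant that skips stemming:
# same pipeline as matrix_text2vec, but without the stemming step
tokens_raw <- preprocess(text_reviews) %>% str_split(boundary("word"))
it <- text2vec::itoken(tokens_raw, n_chunks = 1, progressbar = FALSE)
# ... then proceed with create_vocabulary / create_dtm as before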
Of course, there's more to text mining than term matrices, so if you need more, then corpus alone probably won't be sufficient for you. The other packages have different strengths: quanteda and text2vec provide a host of models and metrics; tidytext fits in well with "tidy data" pipelines built on dplyr and related tools; tm has many extension packages for data input and modeling. Choose the package that best meets your needs.
sessionInfo()
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin16.7.0 (64-bit)
Running under: macOS Sierra 10.12.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] methods stats graphics grDevices utils datasets base
other attached packages:
[1] bindrcpp_0.2 stringr_1.2.0 magrittr_1.5 dplyr_0.7.2 Matrix_1.2-10 quanteda_0.99 ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] slam_0.1-40 NLP_0.1-10 reshape2_1.4.2 purrr_0.2.3 lattice_0.20-35
[6] colorspace_1.3-2 htmltools_0.3.6 SnowballC_0.5.1 tidytext_0.1.3 rlang_0.1.2.9000
[11] foreign_0.8-69 glue_1.1.1 lambda.r_1.1.9 text2vec_0.5.0 foreach_1.4.3
[16] bindr_0.1 plyr_1.8.4 munsell_0.4.3 gtable_0.2.0 futile.logger_1.4.3
[21] codetools_0.2-15 psych_1.7.5 evaluate_0.10.1 labeling_0.3 knitr_1.17
[26] tm_0.7-1 parallel_3.4.1 broom_0.4.2 tokenizers_0.1.4 Rcpp_0.12.12
[31] scales_0.4.1 backports_1.1.0 corpus_0.9.2 RcppParallel_4.3.20 microbenchmark_1.4-2.1
[36] fastmatch_1.1-0 mnormt_1.5-5 digest_0.6.12 stringi_1.1.5 grid_3.4.1
[41] rprojroot_1.2 tools_3.4.1 lazyeval_0.2.0 tibble_1.3.4 janeaustenr_0.1.4
[46] futile.options_1.0.0 tidyr_0.6.3 pkgconfig_2.0.1 data.table_1.10.4 lubridate_1.6.0
[51] assertthat_0.2.0 rmarkdown_1.6 iterators_1.0.8 R6_2.2.2 nlme_3.1-131
[56] compiler_3.4.1