This package is designed to compute the Jaccard similarity for binary matrices in parallel using Rcpp and RcppParallel.
You can install the development version of fastJaccard from GitHub with:
# If devtools is not installed yet, uncomment the next line first:
# install.packages("devtools")
# Install the development version of fastJaccard straight from GitHub.
devtools::install_github(repo = "alrobles/fastJaccard")
We can create a binary matrix as an example and compare the package against a basic implementation written in plain R.
As a baseline we implement the Jaccard distance in plain R
# Baseline Jaccard distance between every pair of rows of a binary matrix.
#
# This is intentionally a plain, loop-based R implementation: it serves as
# the slow reference that the compiled version is checked and timed against.
#
# Arguments:
#   mat  A matrix of 0/1 values; each row is treated as one binary vector.
#
# Returns:
#   A symmetric nrow(mat) x nrow(mat) numeric matrix with a zero diagonal.
#   Entry [i, j] is 1 - shared / either, where `shared` counts the columns in
#   which both rows are 1 and `either` counts the columns in which at least
#   one row is 1. A pair of all-zero rows yields NaN (0 / 0).
#   Matrices with fewer than two rows have no pairs to compare, so the
#   all-zero matrix of the matching size (0 x 0 or 1 x 1) is returned.
jaccard_distance <- function(mat) {
  # Number of positions where both vectors are 1.
  count_shared <- function(p, q) {
    sum(p + q == 2)
  }
  # Number of positions where at least one vector is 1 (inclusion-exclusion).
  # Named so that it does not mask base::union.
  count_either <- function(p, q, shared) {
    sum(p) + sum(q) - shared
  }

  n <- nrow(mat)
  res <- matrix(0, n, n)
  # Guard: 1:(n - 1) would count backwards for n < 2 and index missing rows.
  if (n < 2L) {
    return(res)
  }

  for (i in seq_len(n - 1L)) {
    for (j in (i + 1L):n) {
      shared <- count_shared(mat[i, ], mat[j, ])
      either <- count_either(mat[i, ], mat[j, ], shared)
      # Compute once and mirror it: the distance is symmetric.
      d <- 1 - shared / either
      res[j, i] <- d
      res[i, j] <- d
    }
  }
  res
}
We now create a random binary matrix and run both implementations:
library(fastJaccard)
## basic example code
# Fix the RNG seed so the example matrix (and the check below) is reproducible.
set.seed(1235)
# create a random n x k binary matrix: each row is one binary vector
n <- 1000
k <- 2000
m <- matrix(as.numeric(runif(n * k) > 0.5), ncol = k)
# ensure that serial and parallel versions give the same result
r_res <- jaccard_distance(m)
rcpp_parallel_res <- fastJaccard::jaccard_fast_matrix(m)
# Compare absolute differences: without abs() a large negative discrepancy
# would slip through. The tolerance only absorbs floating-point noise.
stopifnot(all(abs(rcpp_parallel_res - r_res) < 1e-10))
# compare performance
library(rbenchmark)
res <- benchmark(
  jaccard_distance(m),
  jaccard_fast_matrix(m),
  replications = 30,
  order = "relative"
)
# show only the first four columns of the benchmark table
res[, 1:4]
We can also compute the Jaccard similarity between two binary vectors:
# Seed the RNG so the two example vectors are reproducible.
set.seed(1235)
# Two random binary vectors of one million Bernoulli(0.5) draws each.
x <- rbinom(n = 1e6, size = 1, prob = 0.5)
y <- rbinom(n = 1e6, size = 1, prob = 0.5)
# Jaccard computation for a single pair of vectors.
fastJaccard::jaccard_fast(x, y)