library("tidyverse")
library("FactoMineR")
# get a data set
d <- read_csv("test_data.csv.gz")
summary(d)
# get two subsamples from the dataset
d1 <- d[1:500,]
d2 <- d[501:1000,]
## 1. Data transformation ----
# Fit lambda values for each column of a data.frame for Box Cox transformation
#
# @param x a data.frame
# @param lambdas the values of lambda to try
boxcox <- function(x, lambdas=seq(-5, 5, 1/10)) {
# find the lambda for each variable
lambdas <- sapply(x, function(y) {
# fit Box-Cox parameter
bc_estim <- MASS::boxcox(y ~ 1, lambda=lambdas, plotit=F)
# get the optimal parameter
lambda <- bc_estim$x[which.max(bc_estim$y)]
return(lambda)
})
# return the original data and the lambdas
res <- list(data=x, lambda=lambdas)
class(res) <- c("boxcox", class(res))
return(res)
}
# Transform the variables in a data.frame using the fitted lambdas values
#
# @param object an object output by boxcox()
# @param newdata a new data.frame on which to apply the transformation; if this is NULL (the default) the transformation is applied on the dataset on which the lambdas were fitted
predict.boxcox <- function(object, newdata=NULL) {
# get data form the object of missing
if (is.null(newdata)) {
newdata <- object$data
}
# check consistency
data_names <- names(newdata)
lambda_names <- names(object$lambda)
missing_lambdas <- setdiff(data_names, lambda_names)
if (length(missing_lambdas) > 0) {
warning("Lambdas have not been fitted for columns:\n ",
paste(missing_lambdas, collapse=", "),
"\n they will not be transformed")
}
extra_lambdas <- setdiff(lambda_names, data_names)
# if (length(extra_lambdas) > 0) {
# warning("Lambdas have been defined for columns:\n ",
# paste(extra_lambdas, collapse=", "),
# "\n but those are missing in the data")
# }
# transform the relevant columns
for (n in intersect(data_names, lambda_names)) {
newdata[[n]] <- bc_power_transform(newdata[[n]], object$lambda[n])
}
return(newdata)
}
# Compute the Box Cox power transform
#
# @param x a vector a numeric values
# @param l the lambda value
bc_power_transform <- function(x, l) {
if (l == 0) {
xt <- x # = do not transform
} else if (abs(l) <= 10^-6) {
xt <- log(x)
} else {
xt <- (x^(l)-1)/l
}
return(xt)
}
# Box Cox transform only works with strictly positive values
# => we need to shift the data above 0; and that shift needs to be the same for both data sets
# compute the min per column across both data sets
mins <- apply(bind_rows(d1, d2), 2, min)
# shift if the min is <=0
shifts <- ifelse(mins <= 0, abs(mins) + 10^-26, 0)
# shift each data set (with the same shifts)
d1_shifted <- sweep(d1, 2, shifts, `+`)
d2_shifted <- sweep(d2, 2, shifts, `+`)
bc1 <- boxcox(d1_shifted)
d1_bc <- predict(bc1)
d2_bc <- predict(bc1, newdata=d2_shifted)
ggplot(gather(d1)) + geom_histogram(aes(x=value), bins=30) + facet_wrap(~key, scale="free")
ggplot(gather(d1_bc)) + geom_histogram(aes(x=value), bins=30) + facet_wrap(~key, scale="free")
ggplot(gather(d2)) + geom_histogram(aes(x=value), bins=30) + facet_wrap(~key, scale="free")
ggplot(gather(d2_bc)) + geom_histogram(aes(x=value), bins=30) + facet_wrap(~key, scale="free")
## 2. PCA ----
# perform PCA on d1
pca1 <- PCA(d1_bc, ncp=4, graph=F, scale.unit=TRUE)
# and get coordinates of objects
coord_d1_pca1 <- pca1$ind$coord %>% as_tibble()
# NB: we just kept the first 4 principal components
# project d2 in the same PCA space
# i.e. that means re-scaling it the same way d1 was scaled and then reprojecting it in the same space
d2_in_pca1 <- predict(pca1, d2_bc)
# and get the coordinates
coord_d2_pca1 <- d2_in_pca1$coord %>% as_tibble()
## 3. Clustering ----
# define clusters on d1 only, in the pca space of d1
k_clust <- kmeans(coord_d1_pca1, centers=4, nstart=10)
# and get cluster numbers
coord_d1_pca1$cluster <- k_clust$cluster
# define a function to assign clusters in a new data set, based on an existing kmeans clustering
# this is done by assigning each point to the closest cluster center
predict.kmeans <- predict.kmeans <- function(object, newdata, ...) {
# get centers
centers <- object$centers
# compute (squared) Euclidean distance from each center to each new point
ss_by_center <- apply(centers, 1, function(x) {
colSums((t(newdata) - x) ^ 2)
})
# find the closest center and consider that this is the appropriate cluster
best_clusters <- apply(ss_by_center, 1, which.min)
return(best_clusters)
}
# use it to predict the clusters for d2 projected in the space of d1
coord_d2_pca1$cluster <- predict(k_clust, newdata=coord_d2_pca1)
## 4. Plot and verify ----
# identify each dataset
coord_d1_pca1$dataset <- "source"
coord_d2_pca1$dataset <- "projected"
# combine them
coords <- bind_rows(coord_d1_pca1, coord_d2_pca1)
# plot them
ggplot(coords) +
coord_fixed() +
geom_point(aes(x=Dim.1, y=Dim.2, shape=dataset, colour=factor(cluster))) +
scale_shape_manual(values=c(1, 3))
# -> OK, the objects are well reprojected the same way and the clusters are consistent across the two datasets