Hi, Thank you for this very useful package. As many of the CCLE line

See also this <a href="https://github.com/UCLouvain-CBIO/depmap/issues/69#issuecomment

I have created a to convert most of the DepMap datasets to a <code class="notra

serve data as MultiAssayExperiment? about depmap HOT 4 OPEN

allisonvuong commented on August 16, 2024

serve data as MultiAssayExperiment?

from depmap.

Comments (4)

lgatto commented on August 16, 2024

Thank you for your interest, Allison. An MAE would be very much appreciated indeed. If you have time to send a PR, that would be great. Note that the data are disseminated through ExperimentHub, so I assume we would use the same mechanism for the MAE.

from depmap.

allisonvuong commented on August 16, 2024

Hi,

Okay great, I will do that!

Best,
Allison

from depmap.

lgatto commented on August 16, 2024

See also this comment

from depmap.

tfkillian commented on August 16, 2024

I have created a script to convert most of the DepMap datasets to a MultiAssayExperiment object. However, I'm a bit stuck coercing the mutationCalls dataset to a RangedSummarizedExperiment Does anyone have any suggested solutions?

## Create MultiAssayExperiment object out of the most current Depmap datasets

# TODO:
# 1) convert the mutationCalls to a RangedSummarizedExperiment
# 2) make MAE out of all datesets

## load libraries
# BiocManager::install(
#   c("MultiAssayExperiment", "SummarizedExperiment", "S4Vectors", "dplyr",
#     "tidyr", "tibble", "readr", "ExperimentHub", "depmap"))

library("MultiAssayExperiment")
library("SummarizedExperiment")
library("S4Vectors")
library("dplyr")
library("tidyr")
library("tibble")
library("readr")
library("ExperimentHub")
library("depmap")

Sys.setenv(VROOM_CONNECTION_SIZE = 500072)

## output file path
file_path <- "media/seq-srv-05/vrc/Project/Project_Theo" ## server
#file_path <- "~/tmp/multiomics/data/" ## local

#################### depmap `metadata_21Q4` dataset ############################
## "metadata" dataset will serve as MAE colData
readr::read_csv("https://ndownloader.figshare.com/files/31316011") %>% 
  as.data.frame() -> metadata_21Q4
names(metadata_21Q4)[1:26] <- c(
    "depmap_id", "cell_line_name", "stripped_cell_line_name", "cell_line",
    "aliases", "cosmic_id", "sex", "source", "Achilles_n_replicates",
    "cell_line_NNMD", "culture_type", "culture_medium", "cas9_activity",
    "RRID", "WTSI_master_cell_ID", "sample_collection_site",
    "primary_or_metastasis", "primary_disease", "subtype_disease", "age",
    "sanger_id", "additional_info", "lineage", "lineage_subtype",
    "lineage_sub_subtype", "lineage_molecular_subtype")
rownames(metadata_21Q4) <- metadata_21Q4$depmap_id

######################## depmap dep_2_name  ###################################
### `dep_2_name` to add `depmap_id` or `cell_line` to other datasets
metadata_21Q4 %>% dplyr::select(depmap_id, cell_line) -> dep_2_name_21Q4

###################### depmap `mutationCalls_21Q4` dataset #####################
mutationCalls_21Q4 <- readr::read_csv("https://ndownloader.figshare.com/files/31315930")
names(mutationCalls_21Q4)[1:32] <- c(
    "gene_name", "entrez_id", "ncbi_build", "chromosome", "start_pos",
    "end_pos", "strand", "var_class", "var_type", "ref_allele",
    "tumor_seq_allele1", "dbSNP_RS", "dbSNP_val_status", "genome_change",
    "annotation_trans", "depmap_id", "cDNA_change", "codon_change",
    "protein_change", "is_deleterious", "is_tcga_hotspot", "tcga_hsCnt",
    "is_cosmic_hotspot", "cosmic_hsCnt", "ExAC_AF", "var_annotation",
    "CGA_WES_AC", "HC_AC", "RD_AC", "RNAseq_AC", "sanger_WES_AC", "WGS_AC")
mutationCalls_21Q4 %>% dplyr::select(depmap_id, everything()) -> mutationCalls_21Q4

######################## depmap `copyNumber_21Q4` dataset ######################
readr::read_csv("https://ndownloader.figshare.com/files/31315924") %>% 
  as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> copyNumber_21Q4
names(copyNumber_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(copyNumber_21Q4)))
copyNumber_21Q4 %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> copyNumber_21Q4_mat

## build SE object
data.frame(Samples = names(copyNumber_21Q4_mat)) -> copyNumber_colData
rownames(copyNumber_colData) <- copyNumber_colData$Samples
SummarizedExperiment(assays = copyNumber_21Q4_mat,
                     colData = copyNumber_colData) -> copyNumber_se

######################## depmap `crispr_21Q4` dataset ##########################
readr::read_csv("https://ndownloader.figshare.com/files/31315996") %>%
    as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> crispr_21Q4
names(crispr_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(crispr_21Q4)))
crispr_21Q4 %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> crispr_21Q4_mat

## build SE object
data.frame(Samples = names(crispr_21Q4_mat)) -> crispr_colData
rownames(crispr_colData) <- crispr_colData$Samples
SummarizedExperiment(assays = crispr_21Q4_mat,
                     colData = crispr_colData) -> crispr_se

######################## depmap `TPM_21Q4` dataset #############################
readr::read_csv("https://ndownloader.figshare.com/files/31315882") %>%
  as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) -> TPM_21Q4
names(TPM_21Q4) <- gsub("&", ";", sub(" \\(.+\\)$", "", names(TPM_21Q4)))
TPM_21Q4 %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> TPM_21Q4_mat

## build SE object
data.frame(Samples = names(TPM_21Q4_mat)) -> TPM_colData
rownames(TPM_colData) <- TPM_colData$Samples
SummarizedExperiment(assays = TPM_21Q4_mat,
                     colData = TPM_colData) -> TPM_se

##################### depmap `RPPA_19Q3` dataset ###############################
## download the 19Q3 metadata
eh <- ExperimentHub()
query(eh, "depmap")
eh[["EH3086"]] -> metadata_19Q3
metadata_19Q3 %>% dplyr::select(depmap_id, cell_line) -> dep_2_name_19Q3

### loading data (downloading .csv file from online source)
readr::read_csv(
  paste0("https://depmap.org/portal/download/api/download?file_name=ccle%2Fccle",
         "_2019%2FCCLE_RPPA_20181003.csv&bucket=depmap-external-downloads")) %>% 
  dplyr::rename(cell_line = names(.)[1]) %>% 
  dplyr::left_join(dep_2_name_19Q3, by = "cell_line") %>% 
  dplyr::select(depmap_id, cell_line, everything()) -> RPPA_19Q3

## fill in NA depmap ID value (source):
## https://web.expasy.org/cellosaurus/CVCL_9980 # NCIH684_LIVER
## https://web.expasy.org/cellosaurus/CVCL_3386 # KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
RPPA_19Q3$depmap_id[RPPA_19Q3$cell_line == "NCIH684_LIVER"] <- "ACH-000089"
RPPA_19Q3$depmap_id[RPPA_19Q3$cell_line == "KE97_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"] <- "ACH-000167"
RPPA_19Q3 %>% dplyr::select(-cell_line) %>% 
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> RPPA_19Q3_mat

## build SE object
data.frame(Samples = names(RPPA_19Q3_mat)) -> RPPA_colData
rownames(RPPA_colData) <- RPPA_colData$Samples
SummarizedExperiment(assays = RPPA_19Q3_mat,
                     colData = RPPA_colData) -> RPPA_se

##################### depmap `proteomic_20Q2` dataset ##########################
read_csv("https://gygi.hms.harvard.edu/data/ccle/protein_quant_current_normalized.csv.gz") %>% 
  dplyr::select(Gene_Symbol, Uniprot_Acc, MDAMB468_BREAST_TenPx01:last_col()) -> proteomic_20Q2

## manually add gene names that are NA (these were taken from Uniprot)
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H7BZ55"] <- "CROCC2" # 270
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "A6NL28"] <- "LOC101929943" # 576
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q9H552"] <- "KRT8P11" # 1054
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "S4R362"] <- "SEPTIN1" #3233
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q96FF7"] <- "MISP3" # 4833
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q2M2H8"] <- "MGAM2" # 7933
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01614"] <- "IGKV2D-40" # 8523
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01619"] <- "IGKV3-20" # 8623
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q69YL0"] <- "NCBP2AS2" # 10005
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "I3L1I5"] <- "ARHGEF18" # 10487
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "E9PS84"] <- "AP000783.1" # 10676
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P60507"] <- "ERVFC1" # 10822
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H3BQF6"] <- "RP11-12J10.3" # 11313
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "C9J7I0"] <- "UMAD1" # 11632
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "Q902F9"] <- "HERVK_113" # 11691
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P63135"] <- "ERVK-7" # 11692
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "H3BMH7"] <- "RP11-178L8.4" # 11735
proteomic_20Q2$Gene_Symbol[proteomic_20Q2$Uniprot_Acc == "P01780"] <- "IGHV3-7" # 12070

proteomic_20Q2 %>%
  dplyr::filter(!is.na(Gene_Symbol)) %>%
  dplyr::filter(!duplicated(Gene_Symbol)) %>%
  tibble::column_to_rownames(var = "Gene_Symbol") %>% 
  dplyr::select(-Uniprot_Acc) -> proteomic_20Q2
names(proteomic_20Q2) <- gsub('(.*)_\\w+', '\\1', names(proteomic_20Q2))

proteomic_20Q2 %>% 
  t() %>% as.data.frame() %>%
  tibble::rownames_to_column() %>%
  dplyr::rename(cell_line = rowname) %>% 
  dplyr::left_join(dep_2_name_21Q4, by = "cell_line") %>% 
  dplyr:: select(cell_line, depmap_id, everything()) -> proteomic_20Q2

## manually add depmap_ids that are NA (these were taken from CCLE)
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "SW948_LARGE_INTESTINE.1"] <- "ACH-000680" 
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "CAL120_BREAST.1"] <- "ACH-000212" 
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "X8505C_THYROID"] <- "ACH-001307" 
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "HCT15_LARGE_INTESTINE.1"] <- "ACH-000997"
proteomic_20Q2$depmap_id[proteomic_20Q2$cell_line == "SW948_LARGE_INTESTINE"] <- "ACH-000680"

proteomic_20Q2 %>%
  dplyr::filter(!is.na(depmap_id)) %>%
  dplyr::filter(!duplicated(depmap_id)) %>%
  dplyr::select(-cell_line) %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  t() %>% as.data.frame() -> proteomic_20Q2_mat

data.frame(Samples = names(proteomic_20Q2_mat),
           row.names = proteomic_colData$Samples) -> proteomic_colData

## build SE object
SummarizedExperiment(assays = proteomic_20Q2_mat,
                     colData = proteomic_colData) -> proteomic_se

#################### depmap `drug_dependency_21Q4` dataset #####################
# read_csv("https://ndownloader.figshare.com/files/17741420") %>% 
#   as.data.frame() %>% dplyr::rename(depmap_id = names(.)[1]) %>% 
#   dplyr::mutate(depmap_id = gsub("_FAILED_STR", "", depmap_id)) %>% 
#   tibble::column_to_rownames(var = "depmap_id") %>% 
#   as.matrix() %>% t() -> drug_dep_19Q3_w

#################### depmap `drug_dependency_21Q4` metadata ####################
## data cleaning of `drug_sensativity` dataset and screen info
# read_csv("https://ndownloader.figshare.com/files/20237715") %>% 
#   rename(compound = column_name) -> screen_info
# eh[["EH3087"]] -> drug_dep_19Q3

############################# build depmap MAE #################################
## Combine to a named list and call the ExperimentList constructor function
list(crispr_21Q4 = crispr_21Q4_se,
     copyNumber_21Q4 = copyNumber_21Q4_se,
     TPM_21Q4 = TPM_21Q4_se,
     proteomic_20Q2 = proteomic_20Q2_se,
     # mutationCalls_21Q4 = mutationCalls_21Q4_mat,
     # drug_dep_19Q3 = drug_dep_19Q3,
     RPPA_19Q3 = RPPA_19Q3_se) -> assayList

## Use the ExperimentList constructor
exp_list <- ExperimentList(assayList)

## build colData, a DataFrame describing the characteristics of biological units
mae_coldata <- DataFrame(metadata_21Q4)

## build MAE
MultiAssayExperiment::MultiAssayExperiment(
    experiments = exp_list,
    colData = mae_coldata #,
    #sampleMap = sample_map
    ) -> depmap_mae

# saveRDS(depmap_mae, file = paste0(file_path, "depmap_mae.rds"))

from depmap.

serve data as MultiAssayExperiment? about depmap HOT 4 OPEN

Comments (4)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent