diff --git a/single_cell/dge/STUB_put_dge_functions_here.txt b/single_cell/dge/STUB_put_dge_functions_here.txt new file mode 100644 index 0000000..e69de29 diff --git a/single_cell/pseudobulk_function.R b/single_cell/pseudobulk_function.R new file mode 100644 index 0000000..45274dc --- /dev/null +++ b/single_cell/pseudobulk_function.R @@ -0,0 +1,188 @@ +### This script provides a function for pseudobulking single-cell RNAseq data, with a focus towards DGE functions also being planned +# +#' Standardized Pseudobulk Implementations +#' @param object A Seurat object +#' @param sample.by String or string vector naming metadata columns of \code{object} to use for assigning sample identities of cells. +#' A biospecimen, timepoint, or subject name is the common target here, and a batch identifier may be desired as well. +#' @param cell.by String naming the metadata column of \code{object} to use for assigning annotation or cluster identities of cells. +#' @param min.cells Number, 10 by default, which sets the minimum number of cells that a pseudobulk should represent in order to be retained. +#' @param cell.targets Optionally, a string vector naming particular \code{cell.by} values to target. Only these cell identities will be retained and pseudobulked. +#' @param features Optionally, a string vector naming a subset of genes to target. Only counts of these genes will be retained in pseudobulking. +#' @param assay String naming the assay of \code{object} to target. Default is "RNA". +#' @param meta.targets Optionally, a string vector naming particular metadata columns of \code{object} to target for retention. By default, as much a possible will be retained. +#' @param meta.num.summary.method Function like \code{\link[base]{mean}} or \code{\link[base]{median}} (with an \code{na.rm = TRUE} option) describing how to summarize numeric metadata from all cells of each pseudobulk +#' @param method "Seurat", "scuttle", or "dreamlet". Determines which pseudobulking tool to rely on. \itemize{ +#' \item "Seurat" = \code{\link[Seurat]{AggregateExpression}} +#' \item Others planned but not yet implemented +#' } +#' @param output.style "Seurat" or "raw". Determines how data should be returned. Either as a Seurat object, or as a raw list of the 'counts' (matrix) and 'metadata' (data.frame) +#' @param output.metadata.cell.count String denoting the metadata column name added to hold the number of original cells going into each pseudobulk +#' @param verbose Logical which controls whether timestamped log messages should be shown +#' @return A Seurat object or, if \code{output.style = 'raw'} a named list with elements 'counts' and 'metadata'. +#' @details +#' The primary role of this function is standardizing the pseudobulking process. +#' +#' \emph{It is under construction and subject to change!} +#' +#' Currently, a Seurat object is expected and pseudobulking is performed per the groupings given by \code{cell.by} and any number of \code{sample.by} metadata. +#' Pseudobulk counts reflect the sum of counts across all cells that they represent. +#' +#' For the \code{method = "Seurat"} (the only method currently implemented) this is done using \code{Seurat::\link[Seurat]{AggregateExpression}} function with parameters: +#' \code{return.seurat = TRUE, group.by = c(sample.by, cell.by), features = features, assays = assay}. +#' +#' The number of cells that each pseudbulk represents are added to a metadata named 'cells_in_pseudobulk' by default. +#' Use \code{output.metadata.cell.count} to use a different column name. +#' \emph{Pseudobulks representing fewer than \code{min.cells} cells are removed.} +#' +#' \code{meta.targets}, or all metadata or \code{object} are then extracted for each pseudobulk. +#' For discrete metadata, the column will be ignored if data are not consistent within ALL pseudobulks. +#' For numeric metadata, values from each cell in the pseudobulks are summarized by the \code{meta.num.summary.method} function (\code{\link[base]{median}} by default). +#' +#' Finally, the output structure is determined by the \code{output.style} input. +#' +#' Prior to pseudobulking, particular cell identities or genes can be targeted using the \code{cell.targets} and \code{features} inputs, respectively. +#' +#' @author Daniel Bunis +#' @examples +#' library(Seurat) +#' dsco_pseudobulk( +#' object = pbmc_small, +#' sample.by = "groups", +#' cell.by = "RNA_snn_res.0.8" +#' ) +#' +dsco_pseudobulk <- function( + object, sample.by, cell.by, + min.cells = 10, + cell.targets = NULL, features = NULL, + meta.targets = NULL, + assay = "RNA", + method = c('Seurat', 'scuttle', 'dreamlet'), + meta.num.summary.method = median, + output.style = c('Seurat', 'raw'), + output.metadata.cell.count = 'cells_in_pseudobulk', + verbose = TRUE +) { + method <- match.arg(method) + output.style <- match.arg(output.style) + + orig_metas <- colnames(object@meta.data) + + # Ensure ts_log exists, or source it. + if (verbose) { + if (!exists('ts_log')) { + stop("'ts_log' function required when 'verbose = TRUE'. Set 'verbose = FALSE' or run 'source('/R_utils/ts_log.R', chdir = TRUE, local = TRUE)' and then try again.") + } + msg_if <- ts_log + } else { + msg_if <- function(...) {} + } + + # Ensure all of cell.by and sample.by exist as metadata + if (!all(c(cell.by, sample.by) %in% orig_metas)) { + stop("A 'cell.by' or 'sample.by' element does not exist as a column the 'object' metadata.") + } + # Ensure assay exists + if (!assay %in% Seurat::Assays(object)) { + stop("'assay' is not a valid assay of the 'object'.") + } + + # Trim to 'cell.targets' + if (!is.null(cell.targets)) { + object <- object[,object@meta.data[,cell.by] %in% cell.targets] + retained_cell_num <- ncol(object) + retained_cell_idents <- unique(object@meta.data[,cell.by]) + msg_if( + "Trimming to 'cell.targets' retained ", retained_cell_num, + " cells of identities:\n", paste0(retained_cell_idents, collapse=", ") + ) + } + + ### Pseudobulk + # (Trim to features internally) + if (method == 'Seurat') { + msg_if("Initiating pseudobulking with Seurat's AggregateExpression...") + group.metas <- c(sample.by, cell.by) + psobject <- Seurat::AggregateExpression( + object = object, return.seurat = TRUE, group.by = group.metas, + features = features, + assays = assay) + + ### Add cell counts and Trim too small pseudobulks + msg_if("Adding cell counts as metadata '", output.metadata.cell.count, "'.") + psobject@meta.data[,output.metadata.cell.count] <- vapply( + seq_len(ncol(psobject)), + function(i) { + # Using unlist for ignorance of rownames (cell vs pseudobulk won't match!) + targ <- unlist(psobject@meta.data[i,group.metas]) + sum( + apply(object@meta.data[,group.metas], 1, function(x) { + identical(unlist(x), targ) + }) + ) + }, + numeric(1) + ) + too_small <- psobject@meta.data[,output.metadata.cell.count] < min.cells + if (too_small == ncol(psobject)) { + warning(paste0("Skipping triming pseudobulks smaller than 'min_cells' as NONE were built from more than ", min_cells, " cells.")) + } else if (too_small > 0) { + msg_if("\tTrimming ", too_small, " pseudobulks built from fewer than ", min_cells, " cells.") + psobject <- psobject[,psobject@meta.data[,output.metadata.cell.count] >= min.cells] + } + + ### Ensure meta.targets, or as many metadata as possible, are retained. + meta_ignored <- c() + meta_kept <- colnames(psobject@meta.data) + if (is.null(meta.targets)) { + meta.targets <- orig_metas + } + meta.targets <- meta.targets[!meta.targets %in% meta_kept] + msg_if("Grabbing additional metadata '", paste0(meta.targets, collapse = "', '"), "'.") + meta_collapse <- function(df_col, name) { + if (is.numeric(df_col)) { + meta.num.summary.method(df_col, na.rm = TRUE) + } else { + if (name %in% meta_ignored) { + NA + } else { + if (length(unique(df_col))>1) { + msg_if("\tignoring discrete metadata column '", name, "' because it is not consistent within all pseudobulks") + assign('meta_ignored', c(meta_ignored, name), inherits = TRUE) + NA + } else { + df_col[1] + } + } + } + } + for (i in seq_len(ncol(psobject))) { + targ <- unlist(psobject@meta.data[i,group.metas]) + matches <- apply(object@meta.data[,group.metas], 1, function(x) { + identical(unlist(x), targ) + }) + for (meta in meta.targets) { + if (i == 1) { + psobject@meta.data[,meta] <- NA + } + psobject@meta.data[i,meta] <- meta_collapse(object@meta.data[matches,meta], meta) + } + } + psobject@meta.data <- psobject@meta.data[,!colnames(psobject@meta.data)%in% meta_ignored] + } else { + stop('Alternative Pseudobulking methods not yet implemented.') + } + + # Output in requested style + msg_if("Pseudobulk function COMPLETE.") + if (output.style == 'Seurat') { + psobject + } else { + exp <- if (packageVersion("Seurat")>='5.0') { + Seurat::GetAssayData(psobject, layer = 'counts') + } else { + Seurat::GetAssayData(psobject, slot = 'counts') + } + list(counts = exp, metadata = psobject@meta.data) + } +} diff --git a/single_cell/pseudobulk_function.py b/single_cell/pseudobulk_function.py new file mode 100644 index 0000000..c230335 --- /dev/null +++ b/single_cell/pseudobulk_function.py @@ -0,0 +1,183 @@ +from ..python_utils.ts_log import ts_log +from scanpy.get import aggregate +import anndata as ad +from pandas.api.types import is_numeric_dtype +from warnings import warn + +def dsco_pseudobulk( + object, sample_by, cell_by, + layer = None, use_raw = True, + modality = "rna", + min_cells = 10, + cell_targets = None, + # features = None, + meta_targets = None, + meta_num_summary_method = 'median', + output_style = 'adata', # 'adata' or 'raw' + output_metadata_cell_count = 'cells_in_pseudobulk', + verbose = True): + """ +The primary role of this function is standardizing the pseudobulking process. +Note that it is under construction still and subject to change!. + +Pseudobulking is performed per the groupings given by cell_by and sample_by metadata using the scanpy.get.aggregate +method to sum counts across cells. Beforehand, for MuData 'object's, the targeted modality will be extracted. + +Raw counts data is normally the desired data for pseudobulking. If 'layer' is given, the matrix used for pseudobulking +will be the 'object'.layers['layer']. +If no 'layer' is given and (default) 'use_raw' = True, the 'object'.raw.X slot element is used for pseudobulking. +Otherwise, 'object'.X is used in pseudobulking. + +The number of cells that each pseudbulk represents are added to a metadata named 'cells_in_pseudobulk' by default, but +this column name is adjustable with the 'output_metadata_cell_count' argument. + +Pseudobulks representing fewer than 'min.cells' cells are removed. + +'meta_targets', or all metadata of the target 'object' (or the targeted 'modality' for a muon 'object') are then +extracted for each pseudobulk. +For discrete metadata, the column will be ignored if data are not consistent within ALL pseudobulks. +For numeric metadata, values from each cell in the pseudobulks are summarized by the 'meta_num_summary_method' method, +("median" by default). + +Finally, the output structure is determined by the output_style argument, either as an AnnData ('adata') or as a dict +('raw') with keys ['counts', 'feature_names', 'metadata'] + +Prior to pseudobulking, particular cell identities or genes can be targeted using the 'cell_targets' and 'features' +inputs, respectively. + +Arguments: + object An AnnData or MuData. For MuData objects, only elements inside the internal AnnData of + the target 'modality' will remain accessible to the function. + samply_by A string or list of strings naming columns of 'object'.obs to use for assigning sample + identity of cells. A biospecimen, timepoint, or subject name is the common target here, + and a batch identifier may be desired as well. + cell_by String naming the metadata column of 'object'.obs to use for assigning annotation or + cluster identities of cells. + layer Optionally, a string naming as 'object'.layers key holding the count data. + use_raw Boolean, True by default, denoting whether to use the object's .raw.X matrix instead of + its .X matrix unless a 'layer' is given. + modality String, "rna" by default, giving the modality name to use when 'object' is a MuData. + min_cells Number, 10 by default, which sets the minimum number of cells that a pseudobulk should + represent in order to be retained. + cell_targets Optionally, a string list naming particular 'cell_by' data values to target. Only these + cell identities will be retained and pseudobulked. + meta_targets Optionally, a string list naming particular columns of 'object'.obs to target for + retention. By default, as much a possible will be retained. + meta_num_summary_method String like "median" (default) or "mean" naming how to summarize numeric metadata + describing how to summarize numeric metadata from all cells of each pseudobulk. + output_style 'adata' (default) or 'raw'. Determines how data should be returned. Either as an AnnData + object, or as a 'raw' dict of the 'counts', 'feature_names', and 'metadata'. + output_metadata_cell_count String ('cells_in_pseudobulk' by default) denoting the metadata column name added to + hold the number of original cells going into each pseudobulk. + verbose Logical which controls whether timestamped log messages should be shown + +Example call: + dsco_pseudobulk( + object = adata, + sample_by = ['biospecimen_id', 'orig.ident'], + cell_by = 'leiden_res1', + layer = 'counts' + ) + + """ + if verbose: + msg_if = ts_log + else: + def msg_if(*args, **kwargs): + return + + # To anndata with target data in 'layer' or .X + if not isinstance(object, ad.AnnData): + if modality in object.mod_names: + object = object[modality] + else: + raise Exception("'object' is not an AnnData or not a MuData with 'modality' in its mod_names.") + if layer is None and use_raw: + object.X = object.raw.X + + # Ensure all of cell_by and sample_by exist as metadata + orig_metas = object.obs.columns + group_metas = list(sample_by) + [cell_by] + if not all([i in orig_metas for i in group_metas]): + raise Exception("A 'cell_by' or 'sample_by' element does not exist as a column of 'object.obs'.") + + # Trim to 'cell_targets' + if not cell_targets is None: + object = object[object.obs[cell_by] in cell_targets].copy() + msg_if( + "Trimming to 'cell_targets' retained ", object.n_obs, + " cells of identities:\n", object.obs[cell_by].unique.join(", ") + ) + + ### Pseudobulk + # (Trim to features internally) + msg_if("Initiating pseudobulking with scanpy.get.aggregate...") + psobject = aggregate( + adata = object, + by = group_metas, + func = 'sum', + layer = layer) + msg_if("completed, moving pseudobulk sum counts matrix to .X") + psobject.X = psobject.layers['sum'].copy() + del psobject.layers['sum'] + + ### Add cell counts and Trim too small pseudobulks + msg_if("Adding cell counts as metadata '", output_metadata_cell_count, "'.") + counts = object.obs.value_counts(subset=group_metas) + pseudo_vals = counts.index.copy() + counts.index = ['_'.join(map(str, x)) for x in counts.index] + psobject.obs[output_metadata_cell_count] = counts + psobject.obs[output_metadata_cell_count] = psobject.obs[output_metadata_cell_count].fillna(0).astype('int') + zero_cells = sum(psobject.obs[output_metadata_cell_count] == 0) + if zero_cells > 0: + msg_if(f"\tTrimming fake pseudobulks matching zero original cells.") + psobject = psobject[psobject.obs[output_metadata_cell_count] > 0].copy() + if min_cells > 0: + too_small = sum(psobject.obs[output_metadata_cell_count] < min_cells) + if too_small == psobject.obs.shape[0]: + warn(f"Skipping triming pseudobulks smaller than 'min_cells' as NONE were built from more than {min_cells} cells.") + elif too_small > 0: + msg_if(f"\tTrimming {too_small} pseudobulks built from fewer than {min_cells} cells.") + psobject = psobject[psobject.obs[output_metadata_cell_count] >= min_cells].copy() + + ### Ensure meta_targets, or as many metadata as possible, are retained. + meta_ignored = [] + meta_kept = psobject.obs.columns + if meta_targets is None: + meta_targets = orig_metas + meta_targets = [i for i in meta_targets if not i in meta_kept] + if len(meta_targets) > 0: + msg_if(f"Pulling in additional metadata targets: '{', '.join(meta_targets)}'.") + numeric = [i for i in meta_targets if is_numeric_dtype(object.obs[i])] + discrete = [i for i in meta_targets if not is_numeric_dtype(object.obs[i])] + if len(discrete)>0: + for col in discrete: + if len(object.obs.value_counts(subset=group_metas+[col]).index) > len(pseudo_vals): + meta_ignored.append(col) + meta_targets.remove(col) + if len(meta_ignored)>0: + msg_if(f"\tignoring discrete metadata column(s) with inconsistency within some pseudobulks: '{', '.join(meta_ignored)}'.") + if len(meta_targets) > 0: + dtypes = object.obs.dtypes.astype(str).to_dict() + if len(meta_targets) > 0: + for col in meta_targets: + method = 'first' if not col in numeric else meta_num_summary_method + new_dat = object.obs.groupby(group_metas, observed = True).agg(x=(col, method)).loc[pseudo_vals,:] + new_dat.index = counts.index + psobject.obs[col] = new_dat + try: + psobject.obs[col] = psobject.obs[col].astype(dtypes[col]) + except (KeyError, ValueError): + pass + else: + msg_if('\tZero non-ignored additional metadata targets.') + + # Output in requested style + msg_if("Pseudobulk function COMPLETE.") + if (output_style == 'adata'): + return psobject + return { + 'counts': psobject.X, + 'feature_names': psobject.var_names, + 'metadata': psobject.obs + } diff --git a/single_cell/pseudobulk_function.txt b/single_cell/pseudobulk_function.txt new file mode 100644 index 0000000..f5f0fe9 --- /dev/null +++ b/single_cell/pseudobulk_function.txt @@ -0,0 +1,129 @@ +Standardized Pseudobulk Implementations + +Description: + + Standardized Pseudobulk Implementations + +Usage: + + dsco_pseudobulk( + object, + sample.by, + cell.by, + min.cells = 10, + cell.targets = NULL, + features = NULL, + meta.targets = NULL, + assay = "RNA", + method = c("Seurat", "scuttle", "dreamlet"), + meta.num.summary.method = median, + output_style = c("Seurat", "raw"), + output_metadata_cell_count = "cells_in_pseudobulk", + verbose = TRUE + ) + +Arguments: + + object: A Seurat object + +sample.by: String or string vector naming metadata columns of ‘object’ + to use for assigning sample identities of cells. A + biospecimen, timepoint, or subject name is the common target + here, and a batch identifier may be desired as well. + + cell.by: String naming the metadata column of ‘object’ to use for + assigning annotation or cluster identities of cells. + +min.cells: Number, 10 by default, which sets the minimum number of + cells that a pseudobulk should represent in order to be + retained. + +cell.targets: Optionally, a string vector naming particular ‘cell.by’ + values to target. Only these cell identities will be retained + and pseudobulked. + +features: Optionally, a string vector naming a subset of genes to + target. Only counts of these genes will be retained in + pseudobulking. + +meta.targets: Optionally, a string vector naming particular metadata + columns of ‘object’ to target for retention. + + assay: String naming the assay of ‘object’ to target. Default is + "RNA". + + method: "Seurat", "scuttle", or "dreamlet". Determines which + pseudobulking tool to rely on. + + • "Seurat" = ‘AggregateExpression’ + + • Others planned but not yet implemented + +meta.num.summary.method: Function like ‘mean’ or ‘median’ (with an + ‘na.rm = TRUE’ option) describing how to summarize numeric + metadata from all cells of each pseudobulk + +output_style: "Seurat" or "raw". Determines how data should be + returned. Either as a Seurat object, or as a raw list of the + 'counts' (matrix) and 'meta' (data.frame) + +output_metadata_cell_count: String denoting the metadata name to use + for adding the number of original cells going into each + pseudobulk + + verbose: Logical which controls whether log messages should be output + +Details: + + The primary role of this function is standardizing the + pseudobulking process. + + _It is under construction and subject to change!_ + + Currently, a Seurat object is expected and pseudobulking is + performed per the groupings given by ‘cell.by’ and any number of + ‘sample.by’ metadata. Pseudobulk counts reflect the sum of counts + across all cells that they represent. + + For the ‘method = "Seurat"’ (the only method currently + implemented) this is done using ‘Seurat::AggregateExpression’ + function with parameters: ‘return.seurat = TRUE, group.by = + c(sample.by, cell.by), features = features, assays = assay’. + + The number of cells that each pseudbulk represents are added to a + metadata named 'cells_in_pseudobulk' by default. Use + ‘output_metadata_cell_count’ to use a different column name. + _Pseudobulks representing fewer than ‘min.cells’ cells are + removed._ + + ‘meta.targets’, or all metadata or ‘object’ are then extracted for + each pseudobulk. For discrete metadata, the column will be ignored + if data are not consistent within ALL pseudobulks. For numeric + metadata, values from each cell in the pseudobulks are summarized + by the ‘meta.num.summary.method’ function (‘median’ by default). + + Finally, the output structure is determined by the ‘output_style’ + input. + + Prior to pseudobulking, particular cell identities or genes can be + targeted using the ‘cell.targets’ and ‘features’ inputs, + respectively. + +Value: + + A Seurat object or, if ‘output_style = 'raw'’ a named list with + elements 'counts' and 'meta'. + +Author(s): + + Daniel Bunis + +Examples: + + library(Seurat) + dsco_pseudobulk( + object = pbmc_small, + sample.by = "groups", + cell.by = "RNA_snn_res.0.8" + ) +