From 71645692e12063cbc2718e4340ded539a4cea214 Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Thu, 4 Dec 2025 23:52:33 -0500 Subject: [PATCH 1/7] update perturbo --- bin/perturbo_inference.py | 61 ++++---- bin/tf_enrichment.py | 284 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+), 26 deletions(-) create mode 100644 bin/tf_enrichment.py diff --git a/bin/perturbo_inference.py b/bin/perturbo_inference.py index e0e6cb6..82d6f54 100755 --- a/bin/perturbo_inference.py +++ b/bin/perturbo_inference.py @@ -23,6 +23,7 @@ def run_perturbo( guide_modality_name="guide", # name of the guide modality in the MuData test_all_pairs=False, # whether to test all pairs or only those in pairs_to_test inference_type="element", # can be per-guide or per-element + drop_ntc_guides=False, # whether to drop non-targeting control guides in low_MOI analysis num_workers=0, # number of worker processes for data loading ): scvi.settings.seed = 0 @@ -31,6 +32,13 @@ def run_perturbo( mdata = md.read(mdata_input_fp) + if inference_type == "guide": + element_key = "guide_id" + elif inference_type == "element": + element_key = "intended_target_name" + else: + raise ValueError("inference_type must be 'guide' or 'element'") + control_guide_filter = ( (~mdata["guide"].var["targeting"]) | (mdata["guide"].var["type"] == "non-targeting") @@ -38,7 +46,7 @@ def run_perturbo( ) if np.any(control_guide_filter): - control_guides = mdata["guide"].var_names[control_guide_filter].tolist() + control_guides = control_guide_filter else: control_guides = None @@ -50,39 +58,40 @@ def run_perturbo( - mdata[gene_modality_name].obs["log1p_total_guide_umis"].mean() ) - efficiency_mode = {"undecided": "auto", "low": "mixture", "high": "scaled"}[ - efficiency_mode - ] + # efficiency_mode = {"undecided": "auto", "low": "mixture", "high": "scaled"}[ + # efficiency_mode + # ] + efficiency_mode = "auto" if efficiency_mode == "auto": - max_guides_per_cell = mdata[guide_modality_name].X.sum(axis=1).max() - if max_guides_per_cell > 1: + avg_guides_per_element = ( + mdata[guide_modality_name].var[element_key].value_counts().mean() + ) + + # max_guides_per_cell = mdata[guide_modality_name].X.sum(axis=1).max() + if avg_guides_per_element > 2: efficiency_mode = "scaled" - print( - "Using 'scaled' efficiency mode due to high MOI (max guides per cell > 1)." - ) + print("Using 'scaled' efficiency mode due to avg guides per element > 2.") else: - efficiency_mode = "mixture" + efficiency_mode = None print( - "Using 'mixture' efficiency mode due to low MOI (max guides per cell <= 1)." + "Not fitting guide efficiency (efficiency_mode=None) due to avg guides per element <= 2." ) - if inference_type == "guide": - element_key = "guide_id" - elif inference_type == "element": - element_key = "intended_target_name" - else: - raise ValueError("inference_type must be 'guide' or 'element'") - intended_targets_df = pd.get_dummies( mdata[guide_modality_name].var[element_key] ).astype(float) if efficiency_mode == "mixture": # don't test for control guides in low_MOI analysis (slightly more robust alternative to just dropping "non-targeting") - control_elements_idx = intended_targets_df.loc[control_guides].sum(axis=0) > 0 - control_elements = intended_targets_df.columns[control_elements_idx].tolist() - intended_targets_df = intended_targets_df.drop(control_elements, axis=1) + if drop_ntc_guides: + control_elements_idx = ( + intended_targets_df.loc[control_guides].sum(axis=0) > 0 + ) + control_elements = intended_targets_df.columns[ + control_elements_idx + ].tolist() + intended_targets_df = intended_targets_df.drop(control_elements, axis=1) # filter any cells with >1 guide multi_guide_cells = ( @@ -142,17 +151,17 @@ def run_perturbo( model = perturbo.PERTURBO( mdata, - # control_guides=control_guides, # broken in current PerTurbo version, fix when we update image + # control_guides=control_guides, # broken in current PerTurbo version, fix when we update image likelihood="nb", efficiency_mode=efficiency_mode, fit_guide_efficacy=fit_guide_efficacy, ) model.view_anndata_setup(mdata, hide_state_registries=True) - if batch_size is None: - # batch_size = int(np.clip(len(mdata) // 20, 128, 4096)) - batch_size = int(np.clip(len(mdata) // 20, 512, 10_000)) - print(f"Training using batch size of {batch_size}") + # if batch_size is None: + batch_size = int(np.clip(len(mdata) // 20, 128, 1024)) + # batch_size = int(np.clip(len(mdata) // 20, 512, 10_000)) + print(f"Training using batch size of {batch_size}") model.train( num_epochs, # max number of epochs lr=lr, diff --git a/bin/tf_enrichment.py b/bin/tf_enrichment.py new file mode 100644 index 0000000..f2e031f --- /dev/null +++ b/bin/tf_enrichment.py @@ -0,0 +1,284 @@ +import pandas as pd +import numpy as np +import pyranges as pr +from pybiomart import Dataset +from statsmodels.stats.multitest import multipletests +from scipy.stats import fisher_exact + + +def run_tf_enrichment( + results_df, + peak_file_paths_df, + genes=None, + element_col="element", + pval_col="p_value", + method_col="method", + gene_col="gene", + use_gene_ids="auto", + fdr_corrected=False, + promoter_window_width=5000, + chipseq_threshold=0.75, + fdr_cutoff=0.1, +): + """ + Run TF target enrichment analysis. + + Parameters + ---------- + results_df : pd.DataFrame + DataFrame with method results. Required columns: gene_col, method_col, element_col, pval_col. + peak_file_paths_df : dict + Dictionary mapping TF names to their ChIP-seq peak file paths. + genes : list, optional + List of gene names to consider as potential targets. If None, all genes are used. + element_col : str, optional + Column name in results_df that contains TF names. Default is "element". + pval_col : str, optional + Column name in results_df that contains p-values. Default is "p_value". + fdr_corrected : bool, optional + Whether the p-values in results_df are already FDR corrected. Default is False. + fdr_cutoff : float, optional + FDR cutoff for significance. Default is 0.1. + """ + if use_gene_ids == "auto": + use_gene_ids = any(results_df[gene_col].str.startswith("ENSG")) + + tfs = list(peak_file_paths_df.keys()) + for tf in tfs: + if tf not in results_df[element_col].values: + raise ValueError(f"TF {tf} not found in results DataFrame.") + + chipseq_gr = process_chipseq_data( + peak_file_paths_df, + element_col=element_col, + threshold=chipseq_threshold, + ) + + for tf in tfs: + if chipseq_gr[chipseq_gr.as_df()[element_col] == tf].empty: + raise ValueError(f"No peaks found for TF {tf} in ChIP-seq data.") + + promoter_gr = get_tss_info( + gene_names=genes, + use_ids=use_gene_ids, + promoter_window_width=promoter_window_width, + gene_col=gene_col, + ) + # print(promoter_gr) + # print(chipseq_gr.as_df().head()) + target_matrix = find_tf_targets( + chipseq_gr, promoter_gr, tfs, element_col=element_col, gene_col=gene_col + ) + tf_targets = target_matrix.reset_index(names=gene_col).melt( + id_vars=gene_col, var_name=element_col, value_name="target" + ) + + df_final_input = results_df[results_df[element_col].isin(tfs)].merge( + tf_targets, how="inner", on=[element_col, gene_col] + ) + final_summarized_df = summarize_results( + df_final_input, + element_col=element_col, + method_col=method_col, + fdr_corrected=fdr_corrected, + pval_col=pval_col, + q=fdr_cutoff, + ) + return final_summarized_df + + +# def compute_gene_by_element_counts(mdata): +# """Compute gene x element counts.""" +# return pd.DataFrame( +# mdata["rna"].X.T @ mdata["grna"].X @ mdata["grna"].varm["guide_targets"].values, +# index=mdata["rna"].var_names, +# columns=mdata["grna"].varm["guide_targets"].columns, +# ) + + +# def filter_test_pairs(gene_by_element_counts, mdata, threshold=7, gene_col="gene"): +# """Filter test pairs based on effective sample size threshold.""" +# control_counts = ( +# mdata["rna"].X.T.astype(bool) +# @ mdata["grna"][:, mdata["grna"].var_names.str.startswith("NT")].X +# ).sum(axis=1) +# passing_genes = mdata["rna"].var_names[control_counts >= threshold] +# filtered_test_pairs = ( +# gene_by_element_counts.loc[passing_genes, :] +# .melt(var_name="element", value_name="count", ignore_index=False) +# .reset_index() +# .rename(columns={"gene_symbol": gene_col}) +# .query("count >= @threshold") +# ) +# return filtered_test_pairs + + +def process_chipseq_data(file_paths, element_col="TF", threshold=0.75): + """Process ChIP-seq data for transcription factors.""" + col_names = [ + "Chromosome", + "Start", + "End", + "pval", + "score", + "pos_max_peak", + "max_peak_height", + "rel_pos_max_peak_height", + "peak_size", + "mid_point", + "peak_to_mid_dist", + ] + if threshold is None: + threshold = 0.0 + dfs = [] + for tf, file_path in file_paths.items(): + if file_path.endswith("bed.gz"): + score_col = "Score" + df = pr.read_bed(file_path).as_df() + assert score_col in df.columns, ( + f"Score column {score_col} not found in BED file for TF {tf} among {df.columns}" + ) + else: + score_col = "score" + df = pd.read_csv(file_path, sep="\t", names=col_names, comment="#") + print(f"Processing TF {tf} with {len(df)} peaks.") + # df["pval"] = df["pval"].fillna(1) + # print(df[score_col].quantile(threshold)) + df = df[df[score_col] >= df[score_col].quantile(threshold)] + print(f"Retained {len(df)} peaks after thresholding for TF {tf}.") + df[element_col] = tf + dfs.append(df) + chipseq_df = pd.concat(dfs, ignore_index=True) + return pr.PyRanges(chipseq_df) + + +def get_tss_info( + gene_names=None, + use_ids=False, + promoter_window_width=5000, + gene_col="gene", + reference="hg19", +): + """Query Ensembl for TSS positions and define promoter windows.""" + if reference != "hg19": + raise NotImplementedError("Only hg19 reference is currently supported.") + + dataset = Dataset(name="hsapiens_gene_ensembl", host="http://grch37.ensembl.org") + attrs = [ + "ensembl_gene_id", + "external_gene_name", + "chromosome_name", + "start_position", + "end_position", + "strand", + ] + if use_ids: + ensembl_gene_key = "ensembl_gene_id" + else: + ensembl_gene_key = "external_gene_name" + + tss_info = dataset.query(attributes=attrs, use_attr_names=True) + if gene_names is not None: + tss_info = tss_info[tss_info[ensembl_gene_key].isin(gene_names)] + + valid_chr = [str(i) for i in range(1, 23)] + ["X", "Y"] + tss_info = tss_info[tss_info["chromosome_name"].isin(valid_chr)] + tss_info["TSS"] = np.where( + tss_info["strand"] == 1, tss_info["start_position"], tss_info["end_position"] + ) + tss_info["chr"] = "chr" + tss_info["chromosome_name"] + tss_df = tss_info[[ensembl_gene_key, "chr", "TSS"]].rename( + columns={ensembl_gene_key: gene_col} + ) + keep_genes = tss_df.groupby(gene_col)["chr"].nunique() == 1 + tss_df = tss_df[tss_df[gene_col].isin(keep_genes[keep_genes].index)] + tss_summary = ( + tss_df.groupby([gene_col, "chr"], as_index=False)["TSS"] + .mean() + .round() + .astype({"TSS": int}) + ) + tss_summary["Start"] = tss_summary["TSS"] - promoter_window_width + tss_summary["End"] = tss_summary["TSS"] + promoter_window_width + return pr.PyRanges(tss_summary.rename(columns={"chr": "Chromosome"})) + + +def find_tf_targets( + chipseq_gr, promoter_gr, tf_list, element_col="TF", gene_col="gene" +): + """Find transcription factor targets by overlap.""" + target_matrix = pd.DataFrame( + False, index=promoter_gr.df[gene_col].unique(), columns=tf_list + ) + assert element_col in chipseq_gr.as_df().columns, ( + f"Element column {element_col} not found in ChIP-seq data." + ) + for TF in tf_list: + print(f"Processing TF: {TF}") + tf_sites = chipseq_gr[chipseq_gr.as_df()[element_col] == TF] + if tf_sites.empty: + raise ValueError(f"No targets found for TF {TF}") + overlaps = promoter_gr.join(tf_sites) + print(f"Found {len(overlaps)} target genes for TF {TF}.") + genes = overlaps.df[gene_col].unique() + target_matrix.loc[genes, TF] = True + return target_matrix + + +def summarize_results( + df, + pval_col="q_value", + method_col="method", + element_col="TF", + q=0.1, + fdr_corrected=True, +): + """Summarize results with multiple testing correction.""" + if not fdr_corrected: + df["significant"] = df.groupby([element_col, method_col])[pval_col].transform( + lambda pvals: multipletests(pvals, alpha=q, method="fdr_bh")[0] + ) + else: + df["significant"] = df[pval_col] <= q + rows = [] + for (tf, method), sub in df.groupby([element_col, method_col]): + significant = sub["significant"] + target_flags = sub["target"] + if target_flags.nunique() < 2 or significant.nunique() < 2: + or_val, pval = np.nan, np.nan + else: + or_val, pval = fisher_exact( + [ + [sum(target_flags & significant), sum(~target_flags & significant)], + [ + sum(target_flags & ~significant), + sum(~target_flags & ~significant), + ], + ], + alternative="greater", + ) + num_rej = int(significant.sum()) + power = int((significant & target_flags).sum()) / int(target_flags.sum()) + rows.append( + { + "TF": tf, + "Method": method, + "num_rejections": num_rej, + "odds_ratio": or_val, + "pvalue": pval, + "recall": power, + } + ) + # for tf, grp in tf_targets.groupby("TF"): + # rows.append( + # { + # "TF": tf, + # "Method": "ChIP-seq", + # "num_rejections": int(grp["target"].sum()), + # "odds_ratio": np.nan, + # "pvalue": np.nan, + # "fdp": np.nan, + # "power": np.nan, + # } + # ) + return pd.DataFrame(rows) From fdc3a57e4ec20e7e62d18468898e3332c65bc7bd Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Fri, 5 Dec 2025 00:02:33 -0500 Subject: [PATCH 2/7] add calibration check results --- bin/inference_sceptre.R | 68 +++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/bin/inference_sceptre.R b/bin/inference_sceptre.R index c8a88d1..9ea4195 100755 --- a/bin/inference_sceptre.R +++ b/bin/inference_sceptre.R @@ -101,7 +101,7 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { # extract set of discovery pairs to test (if available) moi <- MultiAssayExperiment::metadata(mudata[["guide"]])$moi - + # Check if pairs_to_test exists in metadata if (!is.null(MultiAssayExperiment::metadata(mudata)$pairs_to_test)) { pairs_to_test <- MultiAssayExperiment::metadata(mudata)$pairs_to_test |> @@ -115,20 +115,20 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { # assemble base arguments to set_analysis_parameters() args_list <- list(...) - discovery_pairs <- discovery_pairs |> - dplyr::mutate(grna_target = replace(grna_target, tolower(trimws(as.character(grna_target))) == "nan", NA_character_)) |> - dplyr::filter(!is.na(grna_target)) + discovery_pairs <- discovery_pairs |> + dplyr::mutate(grna_target = replace(grna_target, tolower(trimws(as.character(grna_target))) == "nan", NA_character_)) |> + dplyr::filter(!is.na(grna_target)) + + tmp_allowed <- unique(sceptre_object@grna_target_data_frame$grna_target) + tmp_bad <- setdiff(unique(discovery_pairs$grna_target), tmp_allowed) - tmp_allowed <- unique(sceptre_object@grna_target_data_frame$grna_target) - tmp_bad <- setdiff(unique(discovery_pairs$grna_target), tmp_allowed) + print("Diferences:") + print(tmp_bad) + + print(setdiff(unique(discovery_pairs$grna_target), unique(sceptre_object@grna_target_data_frame$grna_target))) - print('Diferences:') - print (tmp_bad) - - print (setdiff(unique(discovery_pairs$grna_target), unique(sceptre_object@grna_target_data_frame$grna_target))) - if ("discovery_pairs" %in% names(args_list)) { warning("The `discovery_pairs` argument is ignored. The `discovery_pairs` are set from the `pairs_to_test` metadata.") } @@ -140,7 +140,7 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { discovery_pairs <- sceptre::construct_trans_pairs(sceptre_object) args_list[["discovery_pairs"]] <- discovery_pairs } - #print (discovery_pairs) + # print (discovery_pairs) # construct formula excluding gRNA covariates to avoid multicollinearity @@ -193,8 +193,28 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { intended_target_name = grna_target, log2_fc = log_2_fold_change ) - # use union_results directly (no extra distinct/left_join) - union_test_results <- union_results + + # run calibration check + sceptre_object <- sceptre_object |> + sceptre::run_calibration_check( + parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), + n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL + ) + + + # get calibration results + calibration_results <- sceptre_object |> + sceptre::get_result(analysis = "run_calibration_check") |> + dplyr::select(response_id, grna_target, p_value, log_2_fold_change) |> + dplyr::rename( + gene_id = response_id, + intended_target_name = grna_target, + log2_fc = log_2_fold_change + ) + + union_test_results <- rbind(union_results, calibration_results) + + # store union results in mudata metadata as requested MultiAssayExperiment::metadata(mudata)$per_element_results <- union_test_results @@ -230,11 +250,16 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { n_nonzero_cntrl_thresh = 0L, p_mito_threshold = 1 ) |> - sceptre::run_discovery_analysis( + sceptre::run_calibration_check( parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL ) + sceptre::run_discovery_analysis( + parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), + n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL + ) + # extract singleton (per-guide) results, preserve grna_id and rename to guide_id singleton_results <- sceptre_object |> sceptre::get_result(analysis = "run_discovery_analysis") |> @@ -245,8 +270,19 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { log2_fc = log_2_fold_change ) + + # extract singleton calibration results + singleton_calibration_results <- sceptre_object |> + sceptre::get_result(analysis = "run_calibration_check") |> + dplyr::select(response_id, grna_id, p_value, log_2_fold_change) |> + dplyr::rename( + gene_id = response_id, + guide_id = grna_id, + log2_fc = log_2_fold_change + ) + # use singleton_results directly - singleton_test_results <- singleton_results + singleton_test_results <- rbind(singleton_results, singleton_calibration_results) MultiAssayExperiment::metadata(mudata)$per_guide_results <- singleton_test_results try( From ac85bed661946339b59554a4a625d8772f7e85d2 Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Fri, 5 Dec 2025 15:13:02 -0500 Subject: [PATCH 3/7] remove un-needed steps from SCEPTRE inference --- bin/inference_sceptre.R | 76 +++++++++-------------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) diff --git a/bin/inference_sceptre.R b/bin/inference_sceptre.R index 9ea4195..01ffea5 100755 --- a/bin/inference_sceptre.R +++ b/bin/inference_sceptre.R @@ -61,13 +61,14 @@ convert_mudata_to_sceptre_object_v1 <- function(mudata, remove_collinear_covaria extra_covariates <- data.frame() } - # if guide assignments not present, then extract guide counts - if (length(guides_data@assays@data@listData) == 1) { - grna_matrix <- guides_data@assays@data@listData[["counts"]] - # otherwise, extract guide assignments - } else { - grna_matrix <- guides_data@assays@data@listData[["guide_assignment"]] - } + # # if guide assignments not present, then extract guide counts + # if (length(guides_data@assays@data@listData) == 1) { + # grna_matrix <- guides_data@assays@data@listData[["counts"]] + # # otherwise, extract guide assignments + # } else { + + # } + grna_matrix <- guides_data@assays@data@listData[["guide_assignment"]] grna_ids <- rownames(SingleCellExperiment::rowData(mudata[["guide"]])) rownames(grna_matrix) <- grna_ids @@ -96,6 +97,8 @@ convert_mudata_to_sceptre_object_v1 <- function(mudata, remove_collinear_covaria inference_sceptre_m <- function(mudata, n_processors = NA, ...) { + use_parallel <- !is.na(n_processors) && as.integer(n_processors) > 1 + n_processors <- if (use_parallel) as.integer(n_processors) else NULL # convert MuData object to sceptre object sceptre_object <- convert_mudata_to_sceptre_object_v1(mudata, remove_collinear_covariates = TRUE) @@ -163,26 +166,11 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { args_union$grna_integration_strategy <- "union" # set analysis parameters for union sceptre_object <- do.call(sceptre::set_analysis_parameters, args_union) - # assign grnas and run QC (relaxed thresholds to keep all cells; mirror prior behaviour) - sceptre_object <- sceptre_object |> - sceptre::assign_grnas( - method = "thresholding", - threshold = 1, - parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), - n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL - ) |> - sceptre::run_qc( - n_nonzero_trt_thresh = 0L, - n_nonzero_cntrl_thresh = 0L, - p_mito_threshold = 1 - ) # run discovery analysis (grouped) sceptre_object <- sceptre_object |> - sceptre::run_discovery_analysis( - parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), - n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL - ) + sceptre::run_discovery_analysis(parallel = use_parallel, n_processors = n_processors) |> + sceptre::run_calibration_check(parallel = use_parallel, n_processors = n_processors) # get union (per-element) results union_results <- sceptre_object |> @@ -194,14 +182,6 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { log2_fc = log_2_fold_change ) - # run calibration check - sceptre_object <- sceptre_object |> - sceptre::run_calibration_check( - parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), - n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL - ) - - # get calibration results calibration_results <- sceptre_object |> sceptre::get_result(analysis = "run_calibration_check") |> @@ -213,10 +193,6 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { ) union_test_results <- rbind(union_results, calibration_results) - - - - # store union results in mudata metadata as requested MultiAssayExperiment::metadata(mudata)$per_element_results <- union_test_results # also write the per-element (union) results to file @@ -231,34 +207,16 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { silent = TRUE ) - ## 2) Singleton (per-guide) analysis — reuse sceptre_object to exploit caching + ## 2) Singleton (per-guide) analysis args_singleton <- args_list args_singleton$grna_integration_strategy <- "singleton" + # set analysis parameters for singleton (reuse same sceptre_object reference) sceptre_object <- do.call(sceptre::set_analysis_parameters, args_singleton) - # run assignment, qc and discovery for singleton sceptre_object <- sceptre_object |> - sceptre::assign_grnas( - method = "thresholding", - threshold = 1, - parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), - n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL - ) |> - sceptre::run_qc( - n_nonzero_trt_thresh = 0L, - n_nonzero_cntrl_thresh = 0L, - p_mito_threshold = 1 - ) |> - sceptre::run_calibration_check( - parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), - n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL - ) - - sceptre::run_discovery_analysis( - parallel = (!is.na(n_processors) && as.integer(n_processors) > 1), - n_processors = if (!is.na(n_processors) && as.integer(n_processors) > 1) as.integer(n_processors) else NULL - ) + sceptre::run_discovery_analysis(parallel = use_parallel, n_processors = n_processors) |> + sceptre::run_calibration_check(parallel = use_parallel, n_processors = n_processors) # extract singleton (per-guide) results, preserve grna_id and rename to guide_id singleton_results <- sceptre_object |> @@ -270,7 +228,6 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { log2_fc = log_2_fold_change ) - # extract singleton calibration results singleton_calibration_results <- sceptre_object |> sceptre::get_result(analysis = "run_calibration_check") |> @@ -281,7 +238,6 @@ inference_sceptre_m <- function(mudata, n_processors = NA, ...) { log2_fc = log_2_fold_change ) - # use singleton_results directly singleton_test_results <- rbind(singleton_results, singleton_calibration_results) MultiAssayExperiment::metadata(mudata)$per_guide_results <- singleton_test_results From f8cbd50ce02a62d4a63a609f3ceae46c5ecb1ad9 Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Fri, 5 Dec 2025 15:16:58 -0500 Subject: [PATCH 4/7] output guide assignment directly to MuData --- bin/assign_grnas_sceptre.R | 84 +++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/bin/assign_grnas_sceptre.R b/bin/assign_grnas_sceptre.R index 95b6dbc..5e3cec3 100755 --- a/bin/assign_grnas_sceptre.R +++ b/bin/assign_grnas_sceptre.R @@ -19,7 +19,7 @@ assign_grnas_sceptre_v1 <- function(mudata, probability_threshold = "default", n # assign gRNAs assign_grnas_args <- list(sceptre_object, method = "mixture") - + # Add optional parameters if they are not "default" if (probability_threshold != "default") { assign_grnas_args$probability_threshold <- as.numeric(probability_threshold) @@ -32,40 +32,40 @@ assign_grnas_sceptre_v1 <- function(mudata, probability_threshold = "default", n assign_grnas_args$parallel <- TRUE assign_grnas_args$n_processors <- as.integer(n_processors) } - + sceptre_object <- do.call(sceptre::assign_grnas, assign_grnas_args) # extract sparse logical matrix of gRNA assignments grna_assignment_matrix <- sceptre_object |> sceptre::get_grna_assignments() |> methods::as("dsparseMatrix") - colnames(grna_assignment_matrix) <- colnames(MultiAssayExperiment::assay(mudata[['guide']])) + colnames(grna_assignment_matrix) <- colnames(MultiAssayExperiment::assay(mudata[["guide"]])) # add gRNA assignment matrix to MuData - # SummarizedExperiment::assays(mudata[['guide']])[['guide_assignment']] <- grna_assignment_matrix + SummarizedExperiment::assays(mudata[["guide"]])[["guide_assignment"]] <- grna_assignment_matrix return(list(mudata = mudata, guide_assignment = grna_assignment_matrix)) } -convert_mudata_to_sceptre_object_v1 <- function(mudata, remove_collinear_covariates = FALSE){ +convert_mudata_to_sceptre_object_v1 <- function(mudata, remove_collinear_covariates = FALSE) { # extract information from MuData - moi <- MultiAssayExperiment::metadata(mudata[['guide']])$moi - if(is.null(SummarizedExperiment::assayNames(mudata[['gene']]))){ - SummarizedExperiment::assayNames(mudata[['gene']]) <- 'counts' - } else{ - SummarizedExperiment::assayNames(mudata[['gene']])[[1]] <- 'counts' + moi <- MultiAssayExperiment::metadata(mudata[["guide"]])$moi + if (is.null(SummarizedExperiment::assayNames(mudata[["gene"]]))) { + SummarizedExperiment::assayNames(mudata[["gene"]]) <- "counts" + } else { + SummarizedExperiment::assayNames(mudata[["gene"]])[[1]] <- "counts" } - if(is.null(SummarizedExperiment::assayNames(mudata[['guide']]))){ - SummarizedExperiment::assayNames(mudata[['guide']]) <- 'counts' - } else{ - SummarizedExperiment::assayNames(mudata[['guide']])[[1]] <- 'counts' + if (is.null(SummarizedExperiment::assayNames(mudata[["guide"]]))) { + SummarizedExperiment::assayNames(mudata[["guide"]]) <- "counts" + } else { + SummarizedExperiment::assayNames(mudata[["guide"]])[[1]] <- "counts" } scRNA_data <- mudata@ExperimentList$gene guides_data <- mudata@ExperimentList$guide response_matrix <- scRNA_data@assays@data@listData[["counts"]] - if(!is.null(SummarizedExperiment::colData(mudata))){ + if (!is.null(SummarizedExperiment::colData(mudata))) { covariates <- SummarizedExperiment::colData(mudata) |> as.data.frame() covariates[] <- lapply(covariates, as.factor) # Check and remove factor variables with fewer than two levels @@ -73,39 +73,39 @@ convert_mudata_to_sceptre_object_v1 <- function(mudata, remove_collinear_covaria multi_level_factors <- number_of_levels > 1 covariates_clean <- covariates[, multi_level_factors, drop = FALSE] - if(ncol(covariates_clean) == 0){ + if (ncol(covariates_clean) == 0) { remove_collinear_covariates <- FALSE } - - if(remove_collinear_covariates){ - model_matrix <- stats::model.matrix(object = ~ ., data = covariates_clean) + + if (remove_collinear_covariates) { + model_matrix <- stats::model.matrix(object = ~., data = covariates_clean) multicollinear <- Matrix::rankMatrix(model_matrix) < ncol(model_matrix) - if(multicollinear){ + if (multicollinear) { extra_covariates <- data.frame() - } else{ + } else { extra_covariates <- covariates_clean } - } else{ + } else { extra_covariates <- covariates_clean } - } else{ + } else { extra_covariates <- data.frame() } # if guide assignments not present, then extract guide counts - if(length(guides_data@assays@data@listData) == 1){ + if (length(guides_data@assays@data@listData) == 1) { grna_matrix <- guides_data@assays@data@listData[["counts"]] # otherwise, extract guide assignments - } else{ + } else { grna_matrix <- guides_data@assays@data@listData[["guide_assignment"]] } - grna_ids <- rownames(SingleCellExperiment::rowData(mudata[['guide']])) + grna_ids <- rownames(SingleCellExperiment::rowData(mudata[["guide"]])) rownames(grna_matrix) <- grna_ids - gene_ids <- rownames(SingleCellExperiment::rowData(mudata[['gene']])) + gene_ids <- rownames(SingleCellExperiment::rowData(mudata[["gene"]])) rownames(response_matrix) <- gene_ids - grna_target_data_frame <- SingleCellExperiment::rowData(mudata[['guide']]) |> + grna_target_data_frame <- SingleCellExperiment::rowData(mudata[["guide"]]) |> as.data.frame() |> tibble::rownames_to_column(var = "grna_id") |> dplyr::rename(grna_target = intended_target_name) |> @@ -126,20 +126,22 @@ convert_mudata_to_sceptre_object_v1 <- function(mudata, remove_collinear_covaria # Run Command (only execute when script is run directly, not when sourced) if (!exists(".sourced_from_test")) { - args <- commandArgs(trailingOnly = TRUE) - + mudata_input <- args[1] - probability_threshold <- if(length(args) >= 2) args[2] else "default" - n_em_rep <- if(length(args) >= 3) args[3] else "default" - n_processors <- if(length(args) >= 4) suppressWarnings(as.integer(args[4])) else NA - + probability_threshold <- if (length(args) >= 2) args[2] else "default" + n_em_rep <- if (length(args) >= 3) args[3] else "default" + n_processors <- if (length(args) >= 4) suppressWarnings(as.integer(args[4])) else NA + mudata_in <- MuData::readH5MU(mudata_input) - results <- assign_grnas_sceptre_v1(mudata = mudata_in, - probability_threshold = probability_threshold, - n_em_rep = n_em_rep, - n_processors = n_processors) - guide_assignment <- results$guide_assignment - - writeMM(guide_assignment, "guide_assignment.mtx") + results <- assign_grnas_sceptre_v1( + mudata = mudata_in, + probability_threshold = probability_threshold, + n_em_rep = n_em_rep, + n_processors = n_processors + ) + MuData::writeH5MU(results$mudata, "mudata_out.h5mu") + # guide_assignment <- results$guide_assignment + + # writeMM(guide_assignment, "guide_assignment.mtx") } From bbb5401ae30565c4ee909572f198cff807e131af Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Fri, 5 Dec 2025 15:25:45 -0500 Subject: [PATCH 5/7] skip redundant steps in guide assignment module --- modules/local/guide_assignment_sceptre/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/local/guide_assignment_sceptre/main.nf b/modules/local/guide_assignment_sceptre/main.nf index 45914b6..93f908a 100644 --- a/modules/local/guide_assignment_sceptre/main.nf +++ b/modules/local/guide_assignment_sceptre/main.nf @@ -8,13 +8,13 @@ process guide_assignment_sceptre { val SCEPTRE_n_em_rep output: - path "${mudata_input.simpleName}_output.h5mu", emit: guide_assignment_mudata_output + path "mudata_out.h5mu", emit: guide_assignment_mudata_output script: """ export NUMBA_CACHE_DIR=/tmp assign_grnas_sceptre.R ${mudata_input} ${probability_threshold} ${SCEPTRE_n_em_rep} ${task.cpus} - add_guide_assignment.py --guide_assignment guide_assignment.mtx --mudata ${mudata_input} - mv sceptre_assignment_mudata.h5mu ${mudata_input.simpleName}_output.h5mu + // add_guide_assignment.py --guide_assignment guide_assignment.mtx --mudata ${mudata_input} + // mv sceptre_assignment_mudata.h5mu ${mudata_input.simpleName}_output.h5mu """ } From 1997556b1b7cc824cc4881a46c3ef492b531b399 Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Mon, 8 Dec 2025 14:05:07 -0500 Subject: [PATCH 6/7] fix for bug with parallel guide assignment --- bin/assign_grnas_sceptre.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bin/assign_grnas_sceptre.R b/bin/assign_grnas_sceptre.R index 5e3cec3..0abba33 100755 --- a/bin/assign_grnas_sceptre.R +++ b/bin/assign_grnas_sceptre.R @@ -39,10 +39,16 @@ assign_grnas_sceptre_v1 <- function(mudata, probability_threshold = "default", n grna_assignment_matrix <- sceptre_object |> sceptre::get_grna_assignments() |> methods::as("dsparseMatrix") - colnames(grna_assignment_matrix) <- colnames(MultiAssayExperiment::assay(mudata[["guide"]])) + + # reorder columns to match original mudata order + original_colnames <- colnames( + MultiAssayExperiment::assay(mudata[["guide"]]) + ) + grna_assignment_matrix <- grna_assignment_matrix[, original_colnames] # add gRNA assignment matrix to MuData - SummarizedExperiment::assays(mudata[["guide"]])[["guide_assignment"]] <- grna_assignment_matrix + assays(mudata[["guide"]])$guide_assignment <- + grna_assignment_matrix return(list(mudata = mudata, guide_assignment = grna_assignment_matrix)) } From dd12449741fc9846cbcaa80fda4b25100a861d0b Mon Sep 17 00:00:00 2001 From: Logan Blaine Date: Mon, 8 Dec 2025 14:21:27 -0500 Subject: [PATCH 7/7] create unique filenames from guide assignment --- bin/assign_grnas_sceptre.R | 5 ++++- modules/local/guide_assignment_sceptre/main.nf | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/assign_grnas_sceptre.R b/bin/assign_grnas_sceptre.R index 0abba33..7ee9b52 100755 --- a/bin/assign_grnas_sceptre.R +++ b/bin/assign_grnas_sceptre.R @@ -146,7 +146,10 @@ if (!exists(".sourced_from_test")) { n_em_rep = n_em_rep, n_processors = n_processors ) - MuData::writeH5MU(results$mudata, "mudata_out.h5mu") + + output_file <- sub("(\\.h5mu)$", "_out\\1", mudata_input) + + MuData::writeH5MU(results$mudata, output_file) # guide_assignment <- results$guide_assignment # writeMM(guide_assignment, "guide_assignment.mtx") diff --git a/modules/local/guide_assignment_sceptre/main.nf b/modules/local/guide_assignment_sceptre/main.nf index 93f908a..74c8b24 100644 --- a/modules/local/guide_assignment_sceptre/main.nf +++ b/modules/local/guide_assignment_sceptre/main.nf @@ -8,13 +8,11 @@ process guide_assignment_sceptre { val SCEPTRE_n_em_rep output: - path "mudata_out.h5mu", emit: guide_assignment_mudata_output + path "${mudata_input.simpleName}_out.h5mu", emit: guide_assignment_mudata_output script: """ export NUMBA_CACHE_DIR=/tmp assign_grnas_sceptre.R ${mudata_input} ${probability_threshold} ${SCEPTRE_n_em_rep} ${task.cpus} - // add_guide_assignment.py --guide_assignment guide_assignment.mtx --mudata ${mudata_input} - // mv sceptre_assignment_mudata.h5mu ${mudata_input.simpleName}_output.h5mu """ }