add method

stemangiola · Dec 13, 2023 · a22afaa · a22afaa
1 parent 915117b
commit a22afaa
Show file tree

Hide file tree

Showing 6 changed files with 255 additions and 3 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -53,6 +53,7 @@ export(quantile_normalise_abundance)
 export(reduce_dimensions)
 export(remove_redundancy)
 export(rename)
+export(resolve_complete_confounders_of_non_interest)
 export(right_join)
 export(rotate_dimensions)
 export(rowwise)
@@ -141,9 +142,11 @@ importFrom(purrr,map2_dfc)
 importFrom(purrr,map2_dfr)
 importFrom(purrr,map_chr)
 importFrom(purrr,map_dfr)
+importFrom(purrr,map_int)
 importFrom(purrr,map_lgl)
 importFrom(purrr,reduce)
 importFrom(purrr,when)
+importFrom(rlang,"!!")
 importFrom(rlang,":=")
 importFrom(rlang,dots_list)
 importFrom(rlang,dots_values)
@@ -159,6 +162,7 @@ importFrom(rlang,quo_is_symbol)
 importFrom(rlang,quo_is_symbolic)
 importFrom(rlang,quo_name)
 importFrom(rlang,quo_squash)
+importFrom(rlang,set_names)
 importFrom(scales,extended_breaks)
 importFrom(scales,label_scientific)
 importFrom(scales,log_breaks)

diff --git a/R/functions_SE.R b/R/functions_SE.R
@@ -1139,7 +1139,7 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
       object = .formula |> lme4::nobars(),
       data = metadata
     )
-  
+
   if(quo_is_symbolic(.dispersion))
     dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe()
   else
@@ -1156,8 +1156,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data,
 
   # Scaling
   sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)
-  
-  
+
+
   glmmSeq_object =
     glmmSeq( .formula,
                       countdata = counts ,
@@ -1490,3 +1490,86 @@ univariable_differential_tissue_composition_SE = function(
 
 		unnest(surv_test, keep_empty = TRUE)
 }
+
+
+#' Resolve Complete Confounders of Non-Interest
+#'
+#' This function processes a SummarizedExperiment object to handle confounders
+#' that are not of interest in the analysis. It deals with two factors, adjusting
+#' the data by nesting and summarizing over these factors.
+#'
+#' @importFrom rlang enquo
+#' @importFrom rlang quo_name
+#' @importFrom rlang !!
+#' @importFrom tidyr unnest
+#' @importFrom tidyr nest
+#' @importFrom dplyr mutate
+#' @importFrom dplyr arrange
+#' @importFrom dplyr slice
+#' @importFrom dplyr pull
+#' @importFrom dplyr select
+#' @importFrom purrr map_int
+#' @importFrom dplyr if_else
+#'
+#' @param se A SummarizedExperiment object that contains the data to be processed.
+#' @param .factor_1 A symbol or quosure representing the first factor variable in `se`.
+#' @param .factor_2 A symbol or quosure representing the second factor variable in `se`.
+#' @return A modified SummarizedExperiment object with confounders resolved.
+#' @examples
+#' # Not run:
+#' # se is a SummarizedExperiment object
+#' resolve_complete_confounders_of_non_interest(se, .factor_1 = factor1, .factor_2 = factor2)
+#' @noRd
+resolve_complete_confounders_of_non_interest_pair_SE <- function(se, .factor_1, .factor_2){
+
+  .factor_1 <- enquo(.factor_1)
+  .factor_2 <- enquo(.factor_2)
+
+  cd =
+    colData(se) |>
+    as_tibble() |>
+    rowid_to_column() |>
+    distinct(rowid, !!.factor_1, !!.factor_2) |>
+
+    nest(se_data = -c(!!.factor_1, !!.factor_2)) |>
+    nest(data = -!!.factor_1) |>
+    mutate(n1 = map_int(data, ~ .x |> distinct(!!.factor_2) |> nrow())) |>
+    unnest(data) |>
+
+    # Nest data excluding .factor_2 and count distinct .factor_1 values
+    nest(data = -!!.factor_2) |>
+    mutate(n2 = map_int(data, ~ .x |> distinct(!!.factor_1) |> nrow())) |>
+    unnest(data)
+
+  # Choose a dummy value for .factor_2 based on sorting by n1 + n2
+  dummy_factor_2 <- cd |> arrange(desc(n1 + n2)) |> slice(1) |> pull(!!.factor_2)
+
+  # Messages if I have confounders
+  if(cd |> filter(n1 + n2 < 3) |> nrow() > 0){
+
+    message(sprintf("tidybulk says: IMPORTANT! the columns %s and %s, have been corrected for complete confounders and now are NOT interpretable, and cannot be used in hypothesis testing.", quo_name(.factor_1), quo_name(.factor_2)))
+
+    message(sprintf(
+      "tidybulk says: The value(s) %s in column %s from sample(s) %s, has been changed to %s.",
+      cd |> filter(n1 + n2 < 3) |> pull(!!.factor_2),
+      quo_name(.factor_2),
+      cd |> filter(n1 + n2 < 3) |> pull(se_data) |> map(colnames) |> unlist(),
+      dummy_factor_2
+    ))
+
+    # Replace .factor_2 with dummy_factor_2 where n1 + n2 is less than 3 and unnest
+    cd = cd |>
+      mutate(!!.factor_2 := if_else(n1 + n2 < 3, dummy_factor_2, !!.factor_2))
+  }
+
+  colData(se)[,c(quo_name(.factor_1), quo_name(.factor_2))] =
+    cd |>
+    unnest(se_data) |>
+    arrange(rowid) |>
+    select(!!.factor_1, !!.factor_2) |>
+    DataFrame()
+
+  se
+
+}
+
diff --git a/R/methods.R b/R/methods.R
@@ -4956,3 +4956,30 @@ as_matrix <- function(tbl,
 }
 
 
+#' Resolve Complete Confounders of Non-Interest
+#'
+#' This generic function processes a SummarizedExperiment object to handle confounders
+#' that are not of interest in the analysis. It dynamically handles combinations
+#' of provided factors, adjusting the data by nesting and summarizing over these factors.
+#'
+#'
+#' @param se A SummarizedExperiment object that contains the data to be processed.
+#' @param ... Arbitrary number of factor variables represented as symbols or quosures
+#'            to be considered for resolving confounders. These factors are processed
+#'            in combinations of two.
+#'
+#' @rdname resolve_complete_confounders_of_non_interest-methods
+#'
+#' @return A modified SummarizedExperiment object with confounders resolved.
+#'
+#' @examples
+#' # Not run:
+#' # se is a SummarizedExperiment object
+#' # resolve_complete_confounders_of_non_interest(se, factor1, factor2, factor3)
+#' @export
+setGeneric("resolve_complete_confounders_of_non_interest", function(se, ...) {
+  standardGeneric("resolve_complete_confounders_of_non_interest")
+})
+
+
+
diff --git a/R/methods_SE.R b/R/methods_SE.R
@@ -2884,3 +2884,56 @@ setMethod("describe_transcript", "SummarizedExperiment", .describe_transcript_SE
 #'
 #' @return A consistent object (to the input) including additional columns for transcript symbol
 setMethod("describe_transcript", "RangedSummarizedExperiment", .describe_transcript_SE)
+
+
+#' @importFrom dplyr select
+#' @importFrom rlang set_names
+#' @importFrom tibble as_tibble
+.resolve_complete_confounders_of_non_interest <- function(se, ...){
+
+  combination_of_factors_of_NON_interest =
+    # Factors
+    se[1,1, drop=FALSE] |>
+    select(...) |>
+    suppressWarnings() |>
+    colnames() |>
+
+    # Combinations
+    combn(2) |>
+    t() |>
+    as_tibble() |>
+    set_names(c("factor_1", "factor_2"))
+
+  for(i in combination_of_factors_of_NON_interest |> nrow() |> seq_len()){
+    se =
+      se |>
+      resolve_complete_confounders_of_non_interest_pair_SE(
+        !!as.symbol(combination_of_factors_of_NON_interest[i,]$factor_1),
+        !!as.symbol(combination_of_factors_of_NON_interest[i,]$factor_2)
+      )
+  }
+
+  se
+}
+
+#' resolve_complete_confounders_of_non_interest
+#' @inheritParams resolve_complete_confounders_of_non_interest
+#'
+#' @docType methods
+#' @rdname resolve_complete_confounders_of_non_interest-methods
+#'
+#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g.,  log fold change, p-value and false discovery rate).
+setMethod("resolve_complete_confounders_of_non_interest",
+          "SummarizedExperiment",
+          .resolve_complete_confounders_of_non_interest)
+
+#' resolve_complete_confounders_of_non_interest
+#' @inheritParams resolve_complete_confounders_of_non_interest
+#'
+#' @docType methods
+#' @rdname resolve_complete_confounders_of_non_interest-methods
+#'
+#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g.,  log fold change, p-value and false discovery rate).
+setMethod("resolve_complete_confounders_of_non_interest",
+          "RangedSummarizedExperiment",
+          .resolve_complete_confounders_of_non_interest)
diff --git a/man/resolve_complete_confounders_of_non_interest-methods.Rd b/man/resolve_complete_confounders_of_non_interest-methods.Rd
diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R
@@ -825,3 +825,49 @@ test_that("Only reduced dimensions UMAP - no object",{
   expect_equal(	class(attr(res, "internals")$UMAP[[1]])[1], 	"numeric"  )
 
 })
+
+
+test_that("resolve_complete_confounders_of_non_interest",{
+
+
+  library(tidySummarizedExperiment)
+  library(tidybulk)
+
+  # Sample annotations
+  sample_annotations <- data.frame(
+    sample_id = paste0("Sample", seq(1, 9)),
+    factor_of_interest = c(rep("treated", 4), rep("untreated", 5)),
+    A = c("a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2", "a3"),
+    B = c("b1", "b1", "b2", "b1", "b1", "b1", "b2", "b1", "b3"),
+    C = c("c1", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c3"),
+    stringsAsFactors = FALSE
+  )
+
+  # Simulated assay data (e.g., gene expression data)
+  # Let's assume we have 100 genes (rows) and 9 samples (columns)
+  assay_data <- matrix(rnorm(100 * 9), nrow = 100, ncol = 9)
+
+  # Row data (e.g., gene annotations)
+  # For simplicity, we'll just use a sequence of gene IDs
+  row_data <- data.frame(gene_id = paste0("Gene", seq_len(100)))
+
+  # Create SummarizedExperiment object
+  se <- SummarizedExperiment(assays = list(counts = assay_data),
+                             rowData = row_data,
+                             colData = DataFrame(sample_annotations))
+
+  expected_tibble =
+    tibble(
+      .sample = c("1", "5", "2", "4", "6", "8", "9", "3", "7"),
+      A = c("a1", "a1", "a2", "a2", "a2", "a2", "a3", "a1", "a1"),
+      B = c("b1", "b1", "b1", "b1", "b1", "b1", "b1", "b2", "b2"),
+      C = rep("c1", 9)
+    ) |> arrange(.sample)
+
+  se |>
+    resolve_complete_confounders_of_non_interest(A, B, C) |>
+    distinct(.sample, A, B, C) |>
+    expect_identical(expected_tibble )
+
+
+})