From 7aad89f77cbf5171c961d8ea4f749be6ee9bf42f Mon Sep 17 00:00:00 2001
From: stemangiola <mangiolastefano@gmail.com>
Date: Thu, 16 May 2024 00:43:46 +1000
Subject: [PATCH] more improvements

---
 R/functions.R                    | 22 +++++++++---------
 R/functions_SE.R                 |  2 +-
 R/methods.R                      | 38 ++++++++++++++++----------------
 dev/solve_complete_confounders.R | 38 ++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 31 deletions(-)
 create mode 100644 dev/solve_complete_confounders.R

diff --git a/R/functions.R b/R/functions.R
index ed5d0087..3f9ce957 100755
--- a/R/functions.R
+++ b/R/functions.R
@@ -465,20 +465,20 @@ get_differential_transcript_abundance_bulk <- function(.data,
 			data = df_for_edgeR %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample)
 		)
 
-	# Print the design column names in case I want contrasts
-	message(
-		sprintf(
-			"tidybulk says: The design column names are \"%s\"",
-			design %>% colnames %>% paste(collapse = ", ")
-		)
-	)
+	# # Print the design column names in case I want contrasts
+	# message(
+	# 	sprintf(
+	# 		"tidybulk says: The design column names are \"%s\"",
+	# 		design %>% colnames %>% paste(collapse = ", ")
+	# 	)
+	# )
 
 	# Specify the design column tested
 	if(is.null(.contrasts))
 	  message(
 	    sprintf(
 	      "tidybulk says: The design column being tested is %s",
-	      design %>% colnames %>% .[1]
+	      design %>% colnames %>% .[2]
 	    )
 	  )
 
@@ -764,7 +764,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,
       object = .formula |> eliminate_random_effects(),
       data = metadata
     )
-  
+
   if(quo_is_symbolic(.dispersion))
     dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe()
   else
@@ -781,8 +781,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data,
 
   # Scaling
   sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method)
-  
-  
+
+
   glmmSeq_object =
     glmmSeq( .formula,
           countdata = counts ,
diff --git a/R/functions_SE.R b/R/functions_SE.R
index 0b596319..893a5851 100755
--- a/R/functions_SE.R
+++ b/R/functions_SE.R
@@ -408,7 +408,7 @@ get_reduced_dimensions_TSNE_bulk_SE <-
 #' @param of_samples A boolean
 #' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
 #' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered
-#' @param ... Further parameters passed to the function uwot
+#' @param ... Further parameters passed to the function uwot::tumap
 #'
 #' @return A tibble with additional columns
 #'
diff --git a/R/methods.R b/R/methods.R
index c43a2817..d16e355f 100755
--- a/R/methods.R
+++ b/R/methods.R
@@ -1014,7 +1014,7 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements)
 #' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity
 #' @param scale A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE.
 #' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).
-#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE"
+#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE", or uwot::tumap if you choose method="umap"
 #'
 #' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)
 #'
@@ -1143,14 +1143,14 @@ setGeneric("reduce_dimensions", function(.data,
 	# adjust top for the max number of features I have
 	if(top > .data |> distinct(!!.feature) |> nrow()){
 	  warning(sprintf(
-	    "tidybulk says: the \"top\" argument %s is higher than the number of features %s", 
-	    top, 
+	    "tidybulk says: the \"top\" argument %s is higher than the number of features %s",
+	    top,
 	    .data |> distinct(!!.feature) |> nrow()
 	  ))
-	  
+
 	  top = min(top, .data |> distinct(!!.feature) |> nrow())
 	}
-	
+
 	# Validate data frame
 	if(do_validate()) {
 	validation(.data, !!.element, !!.feature, !!.abundance)
@@ -2600,7 +2600,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula.
 #'
 #' Underlying method for edgeR framework:
-#' 
+#'
 #' 	.data |>
 #'
 #' 	# Filter
@@ -2627,7 +2627,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #'
 #'
 #'	Underlying method for DESeq2 framework:
-#'	
+#'
 #'	keep_abundant(
 #'			factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]),
 #'			minimum_counts = minimum_counts,
@@ -2646,16 +2646,16 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #' counts =
 #' .data %>%
 #'   assay(my_assay)
-#' 
+#'
 #' # Create design matrix for dispersion, removing random effects
 #' design =
 #'   model.matrix(
 #'     object = .formula |> eliminate_random_effects(),
 #'     data = metadata
 #'   )
-#' 
+#'
 #' dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts))
-#' 
+#'
 #'   glmmSeq( .formula,
 #'            countdata = counts ,
 #'            metadata =   metadata |> as.data.frame(),
@@ -2663,8 +2663,8 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol)
 #'            progress = TRUE,
 #'            method = method |> str_remove("(?i)^glmmSeq_" ),
 #'   )
-#'   
-#' 
+#'
+#'
 #' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g.,  log fold change, p-value and false discovery rate).
 #'
 #'
@@ -3969,12 +3969,12 @@ setGeneric("test_gene_rank", function(.data,
 
 	# DEPRECATION OF reference function
 	if (is_present(.sample) & !is.null(.sample)) {
-	  
+
 	  # Signal the deprecation to the user
 	  deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.")
 
 	}
-	
+
 	# Get column names
 	.arrange_desc = enquo(.arrange_desc)
 	.entrez = enquo(.entrez)
@@ -4012,14 +4012,14 @@ setGeneric("test_gene_rank", function(.data,
 
 	.data |>
 		select(!!.entrez, !!.arrange_desc) |>
-	  distinct() |> 
-	  
+	  distinct() |>
+
 	  # Select one entrez - NEEDED?
-	  with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |> 
+	  with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |>
 
-	  # arrange 
+	  # arrange
 	  arrange(desc(!!.arrange_desc)) |>
-	  
+
 	  # Format
 		deframe() |>
 		entrez_rank_to_gsea(species, gene_collections  = gene_sets ) |>
diff --git a/dev/solve_complete_confounders.R b/dev/solve_complete_confounders.R
new file mode 100644
index 00000000..6f5bf253
--- /dev/null
+++ b/dev/solve_complete_confounders.R
@@ -0,0 +1,38 @@
+
+
+
+
+
+
+library(tidySummarizedExperiment)
+library(tidybulk)
+
+# Sample annotations
+sample_annotations <- data.frame(
+	sample_id = paste0("Sample", seq(1, 9)),
+	factor_of_interest = c(rep("treated", 4), rep("untreated", 5)),
+	A = c("a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2", "a3"),
+	B = c("b1", "b1", "b2", "b1", "b1", "b1", "b2", "b1", "b3"),
+	C = c("c1", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c3"),
+	stringsAsFactors = FALSE
+)
+
+# Simulated assay data (e.g., gene expression data)
+# Let's assume we have 100 genes (rows) and 9 samples (columns)
+assay_data <- matrix(rnorm(100 * 9), nrow = 100, ncol = 9)
+
+# Row data (e.g., gene annotations)
+# For simplicity, we'll just use a sequence of gene IDs
+row_data <- data.frame(gene_id = paste0("Gene", seq_len(100)))
+
+# Create SummarizedExperiment object
+se <- SummarizedExperiment(assays = list(counts = assay_data),
+													 rowData = row_data,
+													 colData = DataFrame(sample_annotations))
+
+se |>
+  resolve_complete_confounders_of_non_interest(A, B, C) |>
+  distinct(.sample, A, B, C)
+
+
+