From 7aad89f77cbf5171c961d8ea4f749be6ee9bf42f Mon Sep 17 00:00:00 2001 From: stemangiola Date: Thu, 16 May 2024 00:43:46 +1000 Subject: [PATCH] more improvements --- R/functions.R | 22 +++++++++--------- R/functions_SE.R | 2 +- R/methods.R | 38 ++++++++++++++++---------------- dev/solve_complete_confounders.R | 38 ++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 31 deletions(-) create mode 100644 dev/solve_complete_confounders.R diff --git a/R/functions.R b/R/functions.R index ed5d0087..3f9ce957 100755 --- a/R/functions.R +++ b/R/functions.R @@ -465,20 +465,20 @@ get_differential_transcript_abundance_bulk <- function(.data, data = df_for_edgeR %>% select(!!.sample, one_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample) ) - # Print the design column names in case I want contrasts - message( - sprintf( - "tidybulk says: The design column names are \"%s\"", - design %>% colnames %>% paste(collapse = ", ") - ) - ) + # # Print the design column names in case I want contrasts + # message( + # sprintf( + # "tidybulk says: The design column names are \"%s\"", + # design %>% colnames %>% paste(collapse = ", ") + # ) + # ) # Specify the design column tested if(is.null(.contrasts)) message( sprintf( "tidybulk says: The design column being tested is %s", - design %>% colnames %>% .[1] + design %>% colnames %>% .[2] ) ) @@ -764,7 +764,7 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, object = .formula |> eliminate_random_effects(), data = metadata ) - + if(quo_is_symbolic(.dispersion)) dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe() else @@ -781,8 +781,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # Scaling sizeFactors <- counts |> edgeR::calcNormFactors(method = scaling_method) - - + + glmmSeq_object = glmmSeq( .formula, countdata = counts , diff --git a/R/functions_SE.R b/R/functions_SE.R index 0b596319..893a5851 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -408,7 +408,7 @@ get_reduced_dimensions_TSNE_bulk_SE <- #' @param of_samples A boolean #' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered -#' @param ... Further parameters passed to the function uwot +#' @param ... Further parameters passed to the function uwot::tumap #' #' @return A tibble with additional columns #' diff --git a/R/methods.R b/R/methods.R index c43a2817..d16e355f 100755 --- a/R/methods.R +++ b/R/methods.R @@ -1014,7 +1014,7 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements) #' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' @param scale A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE. #' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). -#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE" +#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE", or uwot::tumap if you choose method="umap" #' #' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) #' @@ -1143,14 +1143,14 @@ setGeneric("reduce_dimensions", function(.data, # adjust top for the max number of features I have if(top > .data |> distinct(!!.feature) |> nrow()){ warning(sprintf( - "tidybulk says: the \"top\" argument %s is higher than the number of features %s", - top, + "tidybulk says: the \"top\" argument %s is higher than the number of features %s", + top, .data |> distinct(!!.feature) |> nrow() )) - + top = min(top, .data |> distinct(!!.feature) |> nrow()) } - + # Validate data frame if(do_validate()) { validation(.data, !!.element, !!.feature, !!.abundance) @@ -2600,7 +2600,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula. #' #' Underlying method for edgeR framework: -#' +#' #' .data |> #' #' # Filter @@ -2627,7 +2627,7 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' #' #' Underlying method for DESeq2 framework: -#' +#' #' keep_abundant( #' factor_of_interest = !!as.symbol(parse_formula(.formula)[[1]]), #' minimum_counts = minimum_counts, @@ -2646,16 +2646,16 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' counts = #' .data %>% #' assay(my_assay) -#' +#' #' # Create design matrix for dispersion, removing random effects #' design = #' model.matrix( #' object = .formula |> eliminate_random_effects(), #' data = metadata #' ) -#' +#' #' dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts)) -#' +#' #' glmmSeq( .formula, #' countdata = counts , #' metadata = metadata |> as.data.frame(), @@ -2663,8 +2663,8 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' progress = TRUE, #' method = method |> str_remove("(?i)^glmmSeq_" ), #' ) -#' -#' +#' +#' #' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate). #' #' @@ -3969,12 +3969,12 @@ setGeneric("test_gene_rank", function(.data, # DEPRECATION OF reference function if (is_present(.sample) & !is.null(.sample)) { - + # Signal the deprecation to the user deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.") } - + # Get column names .arrange_desc = enquo(.arrange_desc) .entrez = enquo(.entrez) @@ -4012,14 +4012,14 @@ setGeneric("test_gene_rank", function(.data, .data |> select(!!.entrez, !!.arrange_desc) |> - distinct() |> - + distinct() |> + # Select one entrez - NEEDED? - with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |> + with_groups(c(!!.entrez,!!.arrange_desc ), slice, 1) |> - # arrange + # arrange arrange(desc(!!.arrange_desc)) |> - + # Format deframe() |> entrez_rank_to_gsea(species, gene_collections = gene_sets ) |> diff --git a/dev/solve_complete_confounders.R b/dev/solve_complete_confounders.R new file mode 100644 index 00000000..6f5bf253 --- /dev/null +++ b/dev/solve_complete_confounders.R @@ -0,0 +1,38 @@ + + + + + + +library(tidySummarizedExperiment) +library(tidybulk) + +# Sample annotations +sample_annotations <- data.frame( + sample_id = paste0("Sample", seq(1, 9)), + factor_of_interest = c(rep("treated", 4), rep("untreated", 5)), + A = c("a1", "a2", "a1", "a2", "a1", "a2", "a1", "a2", "a3"), + B = c("b1", "b1", "b2", "b1", "b1", "b1", "b2", "b1", "b3"), + C = c("c1", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c3"), + stringsAsFactors = FALSE +) + +# Simulated assay data (e.g., gene expression data) +# Let's assume we have 100 genes (rows) and 9 samples (columns) +assay_data <- matrix(rnorm(100 * 9), nrow = 100, ncol = 9) + +# Row data (e.g., gene annotations) +# For simplicity, we'll just use a sequence of gene IDs +row_data <- data.frame(gene_id = paste0("Gene", seq_len(100))) + +# Create SummarizedExperiment object +se <- SummarizedExperiment(assays = list(counts = assay_data), + rowData = row_data, + colData = DataFrame(sample_annotations)) + +se |> + resolve_complete_confounders_of_non_interest(A, B, C) |> + distinct(.sample, A, B, C) + + +