Merge pull request #277 from stemangiola/quantile-normalisation

added quantile normalisation
stemangiola · Jul 6, 2023 · 8dad2db · 8dad2db
2 parents e97612a + 0b91f71
commit 8dad2db
Show file tree

Hide file tree

Showing 8 changed files with 435 additions and 3 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,7 +17,7 @@ Depends:
 Imports:
     tibble,
     readr,
-    dplyr,
+    dplyr (>= 1.1.0),
     magrittr,
     tidyr,
     stringi,

diff --git a/NAMESPACE b/NAMESPACE
@@ -49,6 +49,7 @@ export(mutate)
 export(nest)
 export(pivot_sample)
 export(pivot_transcript)
+export(quantile_normalise_abundance)
 export(reduce_dimensions)
 export(remove_redundancy)
 export(rename)
@@ -70,6 +71,7 @@ export(tidybulk)
 export(tidybulk_SAM_BAM)
 export(unnest)
 exportMethods(as_SummarizedExperiment)
+exportMethods(quantile_normalise_abundance)
 exportMethods(scale_abundance)
 exportMethods(tidybulk)
 exportMethods(tidybulk_SAM_BAM)
@@ -103,6 +105,7 @@ importFrom(dplyr,group_by_drop_default)
 importFrom(dplyr,group_cols)
 importFrom(dplyr,if_else)
 importFrom(dplyr,inner_join)
+importFrom(dplyr,join_by)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_if)

diff --git a/R/functions.R b/R/functions.R
@@ -355,8 +355,6 @@ get_scaled_counts_bulk <- function(.data,
 
 }
 
-
-
 #' Get differential transcription information to a tibble using edgeR.
 #'
 #' @keywords internal

diff --git a/R/methods.R b/R/methods.R
@@ -527,6 +527,7 @@ setGeneric("scale_abundance", function(.data,
 		)
 }
 
+
 #' scale_abundance
 #' 
 #' @export
@@ -567,6 +568,182 @@ setMethod("scale_abundance", "tbl_df", .scale_abundance)
 setMethod("scale_abundance", "tidybulk", .scale_abundance)
 
 
+
+#' Normalise by quantiles the counts of transcripts/genes
+#'
+#' `r lifecycle::badge("maturing")`
+#'
+#' @description quantile_normalise_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25).
+#'
+#' @importFrom rlang enquo
+#' @importFrom magrittr "%>%"
+#' @importFrom stats median
+#' @importFrom dplyr join_by
+#'
+#' @name quantile_normalise_abundance
+#'
+#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))
+#' @param .sample The name of the sample column
+#' @param .transcript The name of the transcript/gene column
+#' @param .abundance The name of the transcript/gene abundance column
+#' @param .subset_for_scaling A gene-wise quosure condition. This will be used to filter rows (features/genes) of the dataset. For example
+#' @param action A character string between "add" (default) and "only". "add" joins the new information to the input tbl (default), "only" return a non-redundant tbl with the just new information.
+#'
+#'
+#' @details Scales transcript abundance compensating for sequencing depth
+#' (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25).
+#' Lowly transcribed transcripts/genes (defined with minimum_counts and minimum_proportion parameters)
+#' are filtered out from the scaling procedure.
+#' The scaling inference is then applied back to all unfiltered data.
+#'
+#' Underlying method
+#' edgeR::calcNormFactors(.data, method = c("TMM","TMMwsp","RLE","upperquartile"))
+#'
+#'
+#'
+#' @return A tbl object with additional columns with scaled data as `<NAME OF COUNT COLUMN>_scaled`
+#'
+#'
+#' @examples
+#'
+#'
+#'  tidybulk::se_mini |>
+#'    quantile_normalise_abundance()
+#'
+#'
+#'
+#' @docType methods
+#' @rdname quantile_normalise_abundance-methods
+#' @export
+
+setGeneric("quantile_normalise_abundance", function(.data,
+                                                    .sample = NULL,
+                                                    .transcript = NULL,
+                                                    .abundance = NULL,
+                                                    .subset_for_scaling = NULL,
+                                                    action = "add")
+  standardGeneric("quantile_normalise_abundance"))
+
+# Set internal
+.quantile_normalise_abundance = 	function(.data,
+                                          .sample = NULL,
+                                          .transcript = NULL,
+                                          .abundance = NULL,
+                                          .subset_for_scaling = NULL,
+                                          action = "add")
+{
+
+  # Fix NOTEs
+  . = NULL
+
+  # Get column names
+  .sample = enquo(.sample)
+  .transcript = enquo(.transcript)
+  .abundance = enquo(.abundance)
+  col_names = get_sample_transcript_counts(.data, .sample, .transcript, .abundance)
+  .sample = col_names$.sample
+  .transcript = col_names$.transcript
+  .abundance = col_names$.abundance
+
+  .subset_for_scaling = enquo(.subset_for_scaling)
+
+  # Set column name for value scaled
+  value_scaled = as.symbol(sprintf("%s%s",  quo_name(.abundance), scaled_string))
+
+
+  # Check if package is installed, otherwise install
+  if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
+    message("tidybulk says: Installing limma needed for analyses")
+    if (!requireNamespace("BiocManager", quietly = TRUE))
+      install.packages("BiocManager", repos = "https://cloud.r-project.org")
+    BiocManager::install("limma", ask = FALSE)
+  }
+
+  # Reformat input data set
+  .data_norm <-
+    .data %>%
+
+    # Rename
+    dplyr::select(!!.sample,!!.transcript,!!.abundance) %>%
+
+    # Set samples and genes as factors
+    dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript))  |> 
+    pivot_wider(names_from = !!.sample, values_from = !!.abundance) |> 
+    as_matrix(rownames=!!.transcript) |> 
+    limma::normalizeQuantiles() |> 
+    as_tibble(rownames = quo_name(.transcript)) |> 
+    pivot_longer(-!!.transcript, names_to = quo_name(.sample), values_to = quo_names(value_scaled)) |> 
+
+
+    # Attach column internals
+    add_tt_columns(
+      !!.sample,
+      !!.transcript,
+      !!.abundance,
+      !!(function(x, v)	enquo(v))(x,!!value_scaled)
+    )
+
+
+  if (action %in% c( "add", "get")){
+
+    .data %>%
+
+      left_join(.data_norm, by= join_by(!!.sample, !!.transcript)) %>%
+
+      # Attach attributes
+      reattach_internals(.data_norm) |> 
+
+      # Add methods
+      memorise_methods_used(c("quantile")) 
+
+  }
+  else if (action == "only") .data_norm
+  else
+    stop(
+      "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information"
+    )
+}
+
+#' quantile_normalise_abundance
+#' 
+#' @export
+#' 
+#' @inheritParams quantile_normalise_abundance
+#'
+#' @docType methods
+#' @rdname quantile_normalise_abundance-methods
+#'
+#' @return A tbl object with additional columns with scaled data as `<NAME OF COUNT COLUMN>_scaled`
+#'
+setMethod("quantile_normalise_abundance", "spec_tbl_df", .quantile_normalise_abundance)
+
+#' quantile_normalise_abundance
+#' 
+#' @export
+#' 
+#' @inheritParams quantile_normalise_abundance
+#'
+#' @docType methods
+#' @rdname quantile_normalise_abundance-methods
+#'
+#' @return A tbl object with additional columns with scaled data as `<NAME OF COUNT COLUMN>_scaled`
+#'
+setMethod("quantile_normalise_abundance", "tbl_df", .quantile_normalise_abundance)
+
+#' quantile_normalise_abundance
+#' 
+#' @export
+#' 
+#' @inheritParams quantile_normalise_abundance
+#'
+#' @docType methods
+#' @rdname quantile_normalise_abundance-methods
+#'
+#' @return A tbl object with additional columns with scaled data as `<NAME OF COUNT COLUMN>_scaled`
+#'
+setMethod("quantile_normalise_abundance", "tidybulk", .quantile_normalise_abundance)
+
+
 #' Get clusters of elements (e.g., samples or transcripts)
 #'
 #' `r lifecycle::badge("maturing")`

diff --git a/R/methods_SE.R b/R/methods_SE.R
@@ -235,6 +235,97 @@ setMethod("scale_abundance",
 					.scale_abundance_se)
 
 
+
+#' @importFrom magrittr multiply_by
+#' @importFrom magrittr divide_by
+#' @importFrom SummarizedExperiment assays
+#' @importFrom SummarizedExperiment colData
+#' @importFrom utils tail
+#' @importFrom stats na.omit
+#'
+.quantile_normalise_abundance_se = function(.data,
+                               .sample = NULL,
+                               .transcript = NULL,
+                               .abundance = NULL,
+                               .subset_for_scaling = NULL,
+                               action = NULL) {
+
+
+  # Fix NOTEs
+  . = NULL
+
+  .abundance = enquo(.abundance)
+  .subset_for_scaling = enquo(.subset_for_scaling)
+
+  # Set column name for value scaled
+
+  # If no assay is specified take first
+  my_assay = ifelse(
+    quo_is_symbol(.abundance), 
+    quo_name(.abundance), 
+    .data |>
+      assayNames() |>
+      extract2(1)
+  )
+
+  # Set column name for value scaled
+  value_scaled = my_assay %>% paste0(scaled_string)
+
+  # Check if package is installed, otherwise install
+  if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
+    message("tidybulk says: Installing limma needed for analyses")
+    if (!requireNamespace("BiocManager", quietly = TRUE))
+      install.packages("BiocManager", repos = "https://cloud.r-project.org")
+    BiocManager::install("limma", ask = FALSE)
+  }
+
+  # Reformat input data set
+  .data_norm <-
+    .data %>%
+    assay(my_assay) |> 
+    limma::normalizeQuantiles() |> 
+    list() |> 
+    setNames(value_scaled)
+
+  # Add the assay
+  assays(.data) =  assays(.data) %>% c(.data_norm)
+
+  .data %>%
+
+    # Add methods
+    memorise_methods_used(c("quantile")) %>%
+
+    # Attach column internals
+    add_tt_columns(.abundance_scaled = !!(function(x, v)	enquo(v))(x,!!as.symbol(value_scaled)))
+
+}
+
+#' quantile_normalise_abundance
+#' @inheritParams quantile_normalise_abundance
+#'
+#' @docType methods
+#' @rdname quantile_normalise_abundance-methods
+#'
+#' @return A `SummarizedExperiment` object
+#'
+setMethod("quantile_normalise_abundance",
+          "SummarizedExperiment",
+          .quantile_normalise_abundance_se)
+
+#' quantile_normalise_abundance
+#' @inheritParams quantile_normalise_abundance
+#'
+#' @docType methods
+#' @rdname quantile_normalise_abundance-methods
+#'
+#' @return A `SummarizedExperiment` object
+#'
+setMethod("quantile_normalise_abundance",
+          "RangedSummarizedExperiment",
+          .quantile_normalise_abundance_se)
+
+
+
 .cluster_elements_se = function(.data,
 																method ,
 																of_samples = TRUE,