From a38897869fe2a02dd08085c96c7bd23964148f73 Mon Sep 17 00:00:00 2001 From: Sean Hackett Date: Mon, 9 Sep 2024 12:59:20 -0700 Subject: [PATCH] added sample mahalanobis distance fxn --- NAMESPACE | 1 + R/dim_reduction.R | 74 +++++++++++++++++++ R/mutates.R | 8 +- man/calculate_sample_mahalanobis_distances.Rd | 41 ++++++++++ 4 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 man/calculate_sample_mahalanobis_distances.Rd diff --git a/NAMESPACE b/NAMESPACE index 8a83565..5e8e3e2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(add_pcs) export(app_flow) export(app_heatmap) export(app_pcs) +export(calculate_sample_mahalanobis_distances) export(center_tomic) export(check_design) export(check_tomic) diff --git a/R/dim_reduction.R b/R/dim_reduction.R index 878a3bd..a59f6d6 100644 --- a/R/dim_reduction.R +++ b/R/dim_reduction.R @@ -456,3 +456,77 @@ find_triple_omic_missing_values <- function( return(output) } + +#' Calculate Sample Mahalanobis Distances +#' +#' Determine each samples distance from the center of the data using Mahalanobis distance. +#' +#' @inheritParams tomic_to +#' @param value_var the measurement variable to use for calculating distances +#' @param max_pcs the maximum number of principal components to used for +#' representing the covariance matrix. +#' @param scale if TRUE then the data will be scaled before calculating distances +#' +#' @returns The samples tibble with a new column `pc_distance` which contains the +#' Mahalanobis distances of individual samples from the PC elipsoid +#' +#' @details +#' Since `romic` is built around using tall data where there are more features than +#' samples calculating Mahalanobis distance off of the covariance matrix is not +#' possible. Instead, we use SVD to create a low-dimensional representation of the +#' covariance matrix and calculate distances from the center of the data in this +#' space. This essentially involves weighting the principal components by their +#' loadings. +#' +#' @examples +#' calculate_sample_mahalanobis_distances(brauer_2008_tidy) +#' @export +calculate_sample_mahalanobis_distances <- function ( + tomic, + value_var = NULL, + max_pcs = 10, + scale = FALSE + ) { + + checkmate::assertClass(tomic, "tomic") + value_var <- romic:::value_var_handler(value_var, tomic$design) + checkmate::assertIntegerish(max_pcs, len = 1) + + n_features <- nrow(romic::get_tomic_table(tomic, "features")) + n_samples <- nrow(romic::get_tomic_table(tomic, "samples")) + npcs <- pmin(max_pcs, n_features, n_samples) + + X_std <- tomic %>% + # remove features with missing values + romic::remove_missing_values(value_var, "drop_features", verbose = FALSE) %>% + # convert to a matrix + romic::tomic_to_matrix(value_var) %>% + {scale(t(.), scale = scale) %>% t()} + + # if X is constant for a feature (such all reads are 0) then + # scaling will put it to NaN. We need to remove these features + valid_X_std <- X_std[rowSums(is.nan(X_std)) == 0,] + + # perform a singular value decomposition + svd_results <- svd(valid_X_std, nv = npcs) + + # re-weight principal components by eigenvalues + X_std_v <- svd_results$v * matrix(svd_results$d[1:npcs], ncol = npcs, nrow = ncol(valid_X_std), byrow = TRUE) + rownames(X_std_v) <- colnames(valid_X_std) + + pc_distances_by_sample <- tibble::tibble( + !!rlang::sym(tomic$design$sample_pk) := rownames(X_std_v), + pc_distance = rowSums(X_std_v^2) + ) %>% + arrange(desc(pc_distance)) + + # validate calculation + varex_leading_components <- sum(svd_results$d^2) * sum((svd_results$d^2 / sum(svd_results$d^2))[1:npcs]) + + stopifnot((sum(pc_distances_by_sample$pc_distance) - varex_leading_components)/varex_leading_components < 0.01) + + samples_w_pc_distances <- romic::get_tomic_table(tomic, "samples") %>% + dplyr::left_join(pc_distances_by_sample, by = tomic$design$sample_pk) + + return(samples_w_pc_distances) +} diff --git a/R/mutates.R b/R/mutates.R index 0bdf04f..54b0195 100644 --- a/R/mutates.R +++ b/R/mutates.R @@ -151,9 +151,11 @@ center <- function(x) { #' select(-DR) #' new_variable_tables <- c("new_sample_var" = "samples") #' @export -update_tidy_omic <- function(tidy_omic, - updated_tidy_data, - new_variable_tables = c()) { +update_tidy_omic <- function( + tidy_omic, + updated_tidy_data, + new_variable_tables = c() + ) { checkmate::assertClass(tidy_omic, "tomic") checkmate::assertClass(tidy_omic, "tidy_omic") checkmate::assertDataFrame(updated_tidy_data) diff --git a/man/calculate_sample_mahalanobis_distances.Rd b/man/calculate_sample_mahalanobis_distances.Rd new file mode 100644 index 0000000..dd11675 --- /dev/null +++ b/man/calculate_sample_mahalanobis_distances.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dim_reduction.R +\name{calculate_sample_mahalanobis_distances} +\alias{calculate_sample_mahalanobis_distances} +\title{Calculate Sample Mahalanobis Distances} +\usage{ +calculate_sample_mahalanobis_distances( + tomic, + value_var = NULL, + max_pcs = 10, + scale = FALSE +) +} +\arguments{ +\item{tomic}{Either a \code{tidy_omic} or \code{triple_omic} object} + +\item{value_var}{the measurement variable to use for calculating distances} + +\item{max_pcs}{the maximum number of principal components to used for +representing the covariance matrix.} + +\item{scale}{if TRUE then the data will be scaled before calculating distances} +} +\value{ +The samples tibble with a new column `pc_distance` which contains the + Mahalanobis distances of individual samples from the PC elipsoid +} +\description{ +Determine each samples distance from the center of the data using Mahalanobis distance. +} +\details{ +Since `romic` is built around using tall data where there are more features than +samples calculating Mahalanobis distance off of the covariance matrix is not +possible. Instead, we use SVD to create a low-dimensional representation of the +covariance matrix and calculate distances from the center of the data in this +space. This essentially involves weighting the principal components by their +loadings. +} +\examples{ +calculate_sample_mahalanobis_distances(brauer_2008_tidy) +}