Merge pull request #48 from VEuPathDB/soften-correlations-errors

Emit a warning rather than an error for non-continuous variables in correlation inputs
VEuPathDB · Apr 30, 2024 · 0767353 · 0767353
2 parents e49db12 + 35b6ab7
commit 0767353
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 3 deletions.
diff --git a/R/method-correlation.R b/R/method-correlation.R
@@ -111,8 +111,22 @@ function(
   }
 
   # Check that all values are numeric
-  if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { stop("All columns in data1 must be numeric.")}
-  if (!identical(veupathUtils::findNumericCols(data2), names(data2))) { stop("All columns in data2 must be numeric.")}
+  if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { 
+    warning("All columns in data1 are not numeric. Only numeric columns will be used.")
+    keepCols <- veupathUtils::findNumericCols(data1)
+    if (length(keepCols) == 0) {
+      stop("No numeric columns found in data1.")
+    }
+    data1 <- data1[, ..keepCols]
+  }
+  if (!identical(veupathUtils::findNumericCols(data2), names(data2))) { 
+    warning("All columns in data2 are not numeric. Only numeric columns will be used.")
+    keepCols <- veupathUtils::findNumericCols(data2)
+    if (length(keepCols) == 0) {
+      stop("No numeric columns found in data2.")
+    }
+    data2 <- data2[, ..keepCols]
+  }
 
 
   ## Compute correlation
@@ -175,7 +189,14 @@ function(
   verbose <- veupathUtils::matchArg(verbose)
 
   # Check that all values are numeric
-  if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { stop("All columns in data1 must be numeric.")}
+  if (!identical(veupathUtils::findNumericCols(data1), names(data1))) { 
+    warning("All columns in data1 are not numeric. Only numeric columns will be used.")
+    keepCols <- veupathUtils::findNumericCols(data1)
+    if (length(keepCols) == 0) {
+      stop("No numeric columns found in data1.")
+    }
+    data1 <- data1[, ..keepCols]
+  }
 
   ## Compute correlation
   # rownames and colnames should be the same in this case

diff --git a/tests/testthat/test-correlation.R b/tests/testthat/test-correlation.R
@@ -429,6 +429,49 @@ test_that("correlation fails with improper inputs", {
   expect_error(corrleation(data, verbose=F))
 })
 
+# Checks that correlation() returns a complete statistics table when the sample
+# metadata contains both character (categorical) and numeric (continuous)
+# columns alongside the record id column.
+test_that("correlation succeeds w a mix of cat and cont metadata", {
+  nSamples <- 200
+
+  # Three continuous variables keyed by a sample id.
+  df <- data.table::data.table(
+    "entity.SampleID" = 1:nSamples,
+    "entity.cont1" = rnorm(nSamples),
+    "entity.cont2" = rnorm(nSamples),
+    "entity.cont3" = rnorm(nSamples)
+  )
+
+  # Scale and round the continuous values so they look like "counts", then
+  # re-attach the id column that was dropped for the arithmetic.
+  counts <- round(df[, -c("entity.SampleID")] * 1000)
+  counts[, entity.SampleID := df$entity.SampleID]
+
+  # Metadata with four character columns and three numeric columns.
+  # Use TRUE rather than T: T is an ordinary variable that can be reassigned.
+  sampleMetadata <- SampleMetadata(
+    data = data.frame(list(
+      "entity.SampleID" = df[["entity.SampleID"]],
+      "entity.binA" = sample(c("binA_a", "binA_b"), nSamples, replace = TRUE),
+      "entity.cat2" = sample(c("cat2_a", "cat2_b"), nSamples, replace = TRUE),
+      "entity.cat3" = sample(paste0("cat3_", letters[1:3]), nSamples, replace = TRUE),
+      "entity.cat4" = sample(paste0("cat4_", letters[1:4]), nSamples, replace = TRUE),
+      "entity.cont1" = rnorm(nSamples),
+      "entity.cont2" = rnorm(nSamples),
+      "entity.cont3" = rnorm(nSamples)
+    )),
+    recordIdColumn = "entity.SampleID"
+  )
+
+  data <- CollectionWithMetadata(
+    name = "testing",
+    data = counts,
+    sampleMetadata = sampleMetadata,
+    recordIdColumn = "entity.SampleID"
+  )
+
+  # Re-assigns the same metadata object that was passed to the constructor.
+  data@sampleMetadata <- sampleMetadata
+
+  # NOTE(review): presumably this call emits the "Only numeric columns will be
+  # used." warning for the character metadata columns; consider asserting it
+  # with expect_warning() once the exact warning path is confirmed.
+  result <- correlation(
+    data,
+    method = "pearson",
+    proportionNonZeroThreshold = 0,
+    verbose = FALSE
+  )
+
+  # Check stats (all correlation outputs)
+  statsData <- result@statistics@statistics
+  expect_s3_class(statsData, "data.frame")
+  expect_equal(names(statsData), c("data1", "data2", "correlationCoef", "pValue"))
+  # Should be number of variables in df1 * number of variables in df2
+  # (3 continuous count columns x 3 continuous metadata columns).
+  expect_equal(nrow(statsData), 9)
+  expect_true(all(!is.na(statsData)))
+})
+
 test_that("toJSON works as expected for the CorrelationResult class", {
 
   nSamples <- 200