Merge pull request #44 from VEuPathDB/impute-zero-fix

Impute zero fix
VEuPathDB · Apr 18, 2024 · 64602ae · 64602ae
2 parents 6e94c07 + 6c61e4c
commit 64602ae
Show file tree

Hide file tree

Showing 4 changed files with 149 additions and 11 deletions.
diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R
@@ -74,6 +74,13 @@ check_megastudy <- function(object) {
     errors <- c(errors, msg)
   }
 
+  if (!!length(object@collectionsDT)) {
+    if (!all(ancestor_id_cols[1:length(ancestor_id_cols)-1] %in% names(object@collectionsDT))) {
+      msg <- paste("Not all ancestor ID columns are present in collection data.frame")
+      errors <- c(errors, msg)
+    }
+  }
+
   return(if (length(errors) == 0) TRUE else errors)
 }
 
@@ -85,13 +92,17 @@ check_megastudy <- function(object) {
 #' that data.
 #' 
 #' @slot data A data.table
+#' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn.
 #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList
+#' @slot collectionIds A data.table including collection ids and any variables of interest for the collection entity.
+#' If none provided, the collection ids will be inferred from those present in `data`.
 #' 
 #' @name Megastudy-class
 #' @rdname Megastudy-class
 #' @export 
 Megastudy <- setClass("Megastudy", representation(
     data = 'data.table',
     ancestorIdColumns = 'character',
-    studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList'
+    studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList',
+    collectionsDT = 'data.frame'
 ), validity = check_megastudy)
diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
@@ -204,10 +204,12 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   veupathUtils::logWithTime(paste0("Imputing zeroes for data.table with ", ncol(.dt), " columns and ", nrow(.dt), " rows"), verbose)
   allEntityIdColumns <- object@ancestorIdColumns
   vocabs <- object@studySpecificVocabularies
+  collectionsDT <- object@collectionsDT
 
   # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot?
   # plus going that route means using this class in plot.data means an api change for plot.data
   # that api change might be worth making in any case, but not doing it now
+  ## TODO validate that any collections variables are present in collectionsDT
   variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables)
   variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never")
   variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always")
@@ -251,23 +253,41 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
     veupathUtils::logWithTime("Downstream entities present. No imputation will be done (for now... mwahahaha).", verbose)
     return(.dt)
   }
+  studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered
+
+  # variables that are from the upstream entities need to be in collectionsDT
+  # otherwise we erroneously try to impute values for those variables too, rather than only the weighting variable
+  upstreamEntities <- veupathUtils::strSplit(upstreamEntityIdColNames, ".", 2, 1)
+  if (!!length(collectionsDT)) {
+    upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntities })
+    if (!all(upstreamEntityVariableColNames %in% names(collectionsDT))) {
+      stop("All variables from the upstream entities must be in collectionsDT.")
+    }
+  }
 
   # for upstream entities data
-  upstreamEntityVariables.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE])
+  upstreamEntityVariables.dt <- .dt[, -c(weightingVarColName, varSpecColNames), with=FALSE]
   upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL
   upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
   veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
+  if (!!length(collectionsDT)) {
+    upstreamEntityVariables.dt <- collectionsDT
+  }
   entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])
 
   # make all possible variable value combinations table
-  #vocabDTs <- lapply(variableSpecsToImputeZeroesFor, makeVocabDT)
   vocabDTs <- lapply(vocabs, function(x) {x@studyVocab})
-  allCombinations.dt <- purrr::reduce(vocabDTs, merge, allow.cartesian=TRUE, all=TRUE)
+  if (!!length(collectionsDT)) {
+    vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT[, upstreamEntityIdColNames], by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) })
+  }
+  mergeBy <- studyEntityIdColName
+  if (!!length(collectionsDT)) mergeBy <- upstreamEntityIdColNames
+  allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = mergeBy, allow.cartesian=TRUE, all=TRUE)
 
   # find which ones we need to add
   presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE])
   # need upstream entity ids for all combinations in order to properly find and merge missing values
-  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, allow.cartesian=TRUE)
+  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = mergeBy, all = TRUE, allow.cartesian=TRUE)
   # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab
   addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)]
 
@@ -280,10 +300,14 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
 
   # go ahead and add them, first filling in values for all columns
   addCombinations.dt[[weightingVarColName]] <- 0  
-  addCombinations.dt[[varSpecEntityIdColName]] <- apply(addCombinations.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo="md5")
+  addCombinations.dt[[varSpecEntityIdColName]] <- stringi::stri_rand_strings(nrow(addCombinations.dt), 10)
   # bind them to the existing rows
+  upstreamVariablesInCollectionsDT <- names(collectionsDT)[!names(collectionsDT) %in% upstreamEntityIdColNames]
+  if (!!length(collectionsDT) & !all(upstreamVariablesInCollectionsDT %in% names(.dt))) {
+    .dt <- merge(.dt, upstreamEntityVariables.dt)
+  }
   .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE)
   veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose)
- 
+
   return(.dt)
 })
diff --git a/man/Megastudy-class.Rd b/man/Megastudy-class.Rd
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
@@ -13,6 +13,15 @@ megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'),
                           'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'),
                           'assay.weighting_variable'=c(5,10,15,20,25,30))
 
+## the collectionsDT needs to include all collections, unless explicitly filtered against
+## it also needs to tell us values for any collection variables that are in the plot
+## were trying to impute samples, based on collections that dont have samples.
+## so the only way to get collection variables values for those samples, is to get them from the collectionsDT
+collectionsDT <- data.table(
+  'study.id' = c('a', 'a', 'a','b', 'b', 'b', 'b'),
+  'collection.id' = c(1, 2, 3, 1, 2, 3, 4),
+  'collection.attractant' = c('A', 'B', 'C', 'C', 'D', 'D', 'E'))
+
 sexVocabs.dt <- data.table::data.table(
   'study.id' = c('a', 'a', 'a', 'a', 'a', 'b', 'b'),
   'sample.sex' = c('female', 'male', 'non-binary', 'other', 'do not wish to specify', 'male', 'female'))
@@ -79,7 +88,7 @@ test_that("Megastudy and associated validation works", {
                  ancestorIdColumns=c('study.id', 'collection.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs)))
 
-  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies'))
+  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT'))
   expect_equal(length(m@studySpecificVocabularies), 1)
   expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[1]]@studyVocab[,1]), 2)
   expect_equal(slotNames(m@studySpecificVocabularies[[1]]), c("studyIdColumnName","variableSpec","studyVocab"))    
@@ -89,7 +98,7 @@ test_that("Megastudy and associated validation works", {
                  ancestorIdColumns=c('study.id', 'collection.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))
 
-  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies'))
+  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT'))
   expect_equal(length(m@studySpecificVocabularies), 2)
   expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[2]]@studyVocab[,1]), 2)
   expect_equal(slotNames(m@studySpecificVocabularies[[2]]), c("studyIdColumnName","variableSpec","studyVocab"))              
@@ -188,6 +197,90 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT), 42)
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
 
+  # case where some collection ids are missing
+  # in real life, we have some collections where all samples are 0 and so not loaded
+  # in this case, the collection ids are missing from the data table R gets handed. 
+  # we want to impute zeroes for their samples anyhow.
+
+  # in this version, some collection level variables are in both data and collectionsDT
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant'), with=FALSE],
+                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
+                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)),
+                  collectionsDT=collectionsDT)
+
+  variables <- new("VariableMetadataList", SimpleList(
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'xAxis'),
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'yAxis'),
+      dataType = new("DataType", value = 'NUMBER'),
+      dataShape = new("DataShape", value = 'CONTINUOUS')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
+  ))
+
+  imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
+  # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
+  expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
+  # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69
+  expect_equal(nrow(imputedDT), 69) ## TODO check these numbers
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63)
+
+  # in this version, collection level variables are only in collectionsDT
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species'), with=FALSE],
+                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
+                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)),
+                  collectionsDT=collectionsDT)
+
+  variables <- new("VariableMetadataList", SimpleList(
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'xAxis'),
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'yAxis'),
+      dataType = new("DataType", value = 'NUMBER'),
+      dataShape = new("DataShape", value = 'CONTINUOUS')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
+  ))
+
+  imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
+  # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
+  expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
+  # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69
+  expect_equal(nrow(imputedDT), 69) ## TODO check these numbers
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63)
+
+  #################################################################################################################
+
+
   # case where one study vocab is missing a study
   mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE]
   mDTSexSingleStudy$sample.sex[mDTSexSingleStudy$study.id == 'b'] <- NA_character_
@@ -204,6 +297,10 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT), 36)
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 30)
 
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
+                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
+                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))
+
   # collection entity var is present
   variables <- new("VariableMetadataList", SimpleList(
     new("VariableMetadata",
@@ -760,6 +857,7 @@ test_that("we have reasonable perf w a real-ish use case", {
   )
 
   benchmark <- microbenchmark::microbenchmark(getDTWithImputedZeroes(megastudyReal, megastudyVariablesReal, verbose = FALSE))
-  expect_true(mean(benchmark$time)/1000000 < 50) ## this is in milliseconds
-  expect_true(median(benchmark$time)/1000000 < 50)
+  print(benchmark)
+  expect_true(mean(benchmark$time)/1000000 < 55) ## this is in milliseconds
+  expect_true(median(benchmark$time)/1000000 < 55)
 })