From 27d2c6f0de757a873cbe8957b33a6a1038b8da4e Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 10 Apr 2024 21:04:43 -0400
Subject: [PATCH 01/12] draft update to megastudy class to specify collection
 ids

---
 R/class-Megastudy.R | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R
index 4c97a54..40e9cd7 100644
--- a/R/class-Megastudy.R
+++ b/R/class-Megastudy.R
@@ -74,6 +74,26 @@ check_megastudy <- function(object) {
     errors <- c(errors, msg)
   }
 
+  if (!!length(object@collectionIdColumn)) {
+    
+    if (length(object@collectionIdColumn) != 1) {
+      msg <- paste("Collection ID column must have a single value.")
+      errors <- c(errors, msg)
+    }
+
+    if (!object@collectionIdColumn %in% ancestor_id_cols) {
+      msg <- paste("Collection ID column must be an ancestor ID column")
+      errors <- c(errors, msg)
+    }
+
+    if (!!length(object@collectionsDT)) {
+      if (!any(object@collectionsDT[[object@collectionIdColumn]] %in% df[[object@collectionIdColumn]])) {
+        msg <- paste("Collection IDs not found in data.frame. At least some of the provided collection IDs should be present in the data.frame.")
+        errors <- c(errors, msg)
+      }
+    }
+  }
+
   return(if (length(errors) == 0) TRUE else errors)
 }
 
@@ -85,7 +105,11 @@ check_megastudy <- function(object) {
 #' that data.
 #' 
 #' @slot data A data.table
+#' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn.
 #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList
+#' @slot collectionIdColumn The name of the column in the data.frame that contains the collection ids. 
+#' The collectionId should also be a member of ancestorIdColumns.
+#' @slot collectionIds A character vector of collection ids we expect. If none provided, they will be inferred from those present.
 #' 
 #' @name Megastudy-class
 #' @rdname Megastudy-class
@@ -93,5 +117,7 @@ check_megastudy <- function(object) {
 Megastudy <- setClass("Megastudy", representation(
     data = 'data.table',
     ancestorIdColumns = 'character',
-    studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList'
+    studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList',
+    collectionIdColumn = 'character',
+    collectionsDT = 'data.frame'
 ), validity = check_megastudy)
\ No newline at end of file

From dba1e40b7d6bd1a85cb9bc1b7a8bf9e08054a718 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 10 Apr 2024 21:05:10 -0400
Subject: [PATCH 02/12] draft using collection ids to inform impute zero on
 megastudies

---
 R/methods-Megastudy.R | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index 63ae5e4..998841a 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -260,14 +260,16 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])
 
   # make all possible variable value combinations table
-  #vocabDTs <- lapply(variableSpecsToImputeZeroesFor, makeVocabDT)
+  studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered
   vocabDTs <- lapply(vocabs, function(x) {x@studyVocab})
-  allCombinations.dt <- purrr::reduce(vocabDTs, merge, allow.cartesian=TRUE, all=TRUE)
+  vocabDTs <- lapply(vocabDTs, function(x) { merge(x, object@collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) })
+  allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = upstreamEntityIdColNames, allow.cartesian=TRUE, all=TRUE)
 
   # find which ones we need to add
   presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE])
   # need upstream entity ids for all combinations in order to properly find and merge missing values
-  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, allow.cartesian=TRUE)
+  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = upstreamEntityIdColNames, all = TRUE, allow.cartesian=TRUE)
+  ## TODO figure how to populate study and collection entity variable values based on ids
   # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab
   addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)]
 
@@ -284,6 +286,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   # bind them to the existing rows
   .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE)
   veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose)
- 
+
   return(.dt)
 })

From acdf56fac0d781d43471caafcbd5476383862e01 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 10 Apr 2024 21:05:28 -0400
Subject: [PATCH 03/12] draft test for collection id aware megastudy imputing
 zeroes

---
 tests/testthat/test-class-Megastudy.R | 46 +++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index b224c4f..dcddf5b 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -13,6 +13,10 @@ megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'),
                           'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'),
                           'assay.weighting_variable'=c(5,10,15,20,25,30))
 
+collectionsDT <- data.table(
+  'study.id' = c('a', 'a', 'a','b', 'b', 'b', 'b'),
+  'collection.id' = c(1, 2, 3, 1, 2, 3, 4))
+
 sexVocabs.dt <- data.table::data.table(
   'study.id' = c('a', 'a', 'a', 'a', 'a', 'b', 'b'),
   'sample.sex' = c('female', 'male', 'non-binary', 'other', 'do not wish to specify', 'male', 'female'))
@@ -188,6 +192,48 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT), 42)
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
 
+  # case where some collection ids are missing
+  # in real life, we have some collections where all samples are 0 and so not loaded
+  # in this case, the collection ids are missing from the data table R gets handed. 
+  # we want to impute zeroes for their samples anyhow.
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
+                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
+                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)),
+                  collectionIdColumn='collection.id',
+                  collectionsDT=collectionsDT)
+
+  variables <- new("VariableMetadataList", SimpleList(
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'xAxis'),
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'yAxis'),
+      dataType = new("DataType", value = 'NUMBER'),
+      dataShape = new("DataShape", value = 'CONTINUOUS')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
+  ))
+
+  imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
+  # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
+  expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
+  # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42
+  expect_equal(nrow(imputedDT), 42)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
+
   # case where one study vocab is missing a study
   mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE]
   mDTSexSingleStudy$sample.sex[mDTSexSingleStudy$study.id == 'b'] <- NA_character_

From 842c03d2ad6ea8c902a91013720f63ce684fcb11 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 10 Apr 2024 21:30:04 -0400
Subject: [PATCH 04/12] remove unused collections entity id slot from megastudy
 class

---
 R/class-Megastudy.R | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R
index 40e9cd7..85f21c7 100644
--- a/R/class-Megastudy.R
+++ b/R/class-Megastudy.R
@@ -74,24 +74,11 @@ check_megastudy <- function(object) {
     errors <- c(errors, msg)
   }
 
-  if (!!length(object@collectionIdColumn)) {
-    
-    if (length(object@collectionIdColumn) != 1) {
-      msg <- paste("Collection ID column must have a single value.")
+  if (!!length(object@collectionsDT)) {
+    if (!all(ancestor_id_cols[1:length(ancestor_id_cols)-1] %in% names(object@collectionsDT))) {
+      msg <- paste("Not all ancestor ID columns are present in collection data.frame")
       errors <- c(errors, msg)
     }
-
-    if (!object@collectionIdColumn %in% ancestor_id_cols) {
-      msg <- paste("Collection ID column must be an ancestor ID column")
-      errors <- c(errors, msg)
-    }
-
-    if (!!length(object@collectionsDT)) {
-      if (!any(object@collectionsDT[[object@collectionIdColumn]] %in% df[[object@collectionIdColumn]])) {
-        msg <- paste("Collection IDs not found in data.frame. At least some of the provided collection IDs should be present in the data.frame.")
-        errors <- c(errors, msg)
-      }
-    }
   }
 
   return(if (length(errors) == 0) TRUE else errors)
@@ -107,8 +94,6 @@ check_megastudy <- function(object) {
 #' @slot data A data.table
 #' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn.
 #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList
-#' @slot collectionIdColumn The name of the column in the data.frame that contains the collection ids. 
-#' The collectionId should also be a member of ancestorIdColumns.
 #' @slot collectionIds A character vector of collection ids we expect. If none provided, they will be inferred from those present.
 #' 
 #' @name Megastudy-class
@@ -118,6 +103,5 @@ Megastudy <- setClass("Megastudy", representation(
     data = 'data.table',
     ancestorIdColumns = 'character',
     studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList',
-    collectionIdColumn = 'character',
     collectionsDT = 'data.frame'
 ), validity = check_megastudy)
\ No newline at end of file

From 968be24303784fff0880b0075daddb96306d7ba9 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 10 Apr 2024 22:24:09 -0400
Subject: [PATCH 05/12] fix some bugs imputing zeroes for megastudies w
 explicit collection ids

---
 R/methods-Megastudy.R | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index 998841a..d4ac92d 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -204,6 +204,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   veupathUtils::logWithTime(paste0("Imputing zeroes for data.table with ", ncol(.dt), " columns and ", nrow(.dt), " rows"), verbose)
   allEntityIdColumns <- object@ancestorIdColumns
   vocabs <- object@studySpecificVocabularies
+  collectionsDT <- object@collectionsDT
 
   # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot?
   # plus going that route means using this class in plot.data means an api change for plot.data
@@ -251,24 +252,34 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
     veupathUtils::logWithTime("Downstream entities present. No imputation will be done (for now... mwahahaha).", verbose)
     return(.dt)
   }
+  studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered
 
   # for upstream entities data
-  upstreamEntityVariables.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE])
+  upstreamEntityVariables.dt <- .dt[, -c(weightingVarColName, varSpecColNames), with=FALSE]
   upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL
   upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
   veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
+  if (!!length(collectionsDT)) {
+    upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE)
+    upstreamEntityVariables.dt[[which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T))]] <- NULL
+    collectionEntityColumnIndex <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T))
+    names(upstreamEntityVariables.dt)[collectionEntityColumnIndex] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndex], fixed=T)
+  }
   entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])
 
   # make all possible variable value combinations table
-  studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered
   vocabDTs <- lapply(vocabs, function(x) {x@studyVocab})
-  vocabDTs <- lapply(vocabDTs, function(x) { merge(x, object@collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) })
-  allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = upstreamEntityIdColNames, allow.cartesian=TRUE, all=TRUE)
+  if (!!length(collectionsDT)) {
+    vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) })
+  }
+  mergeBy <- studyEntityIdColName
+  if (!!length(collectionsDT)) mergeBy <- upstreamEntityIdColNames
+  allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = mergeBy, allow.cartesian=TRUE, all=TRUE)
 
   # find which ones we need to add
   presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE])
   # need upstream entity ids for all combinations in order to properly find and merge missing values
-  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = upstreamEntityIdColNames, all = TRUE, allow.cartesian=TRUE)
+  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = mergeBy, all = TRUE, allow.cartesian=TRUE)
   ## TODO figure how to populate study and collection entity variable values based on ids
   # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab
   addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)]

From 0b7c01b1a06626accb81f36f9c888805ee0de4b5 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Wed, 10 Apr 2024 22:24:40 -0400
Subject: [PATCH 06/12] fix up some tests

---
 tests/testthat/test-class-Megastudy.R | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index dcddf5b..b82e42c 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -83,7 +83,7 @@ test_that("Megastudy and associated validation works", {
                  ancestorIdColumns=c('study.id', 'collection.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs)))
 
-  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies'))
+  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT'))
   expect_equal(length(m@studySpecificVocabularies), 1)
   expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[1]]@studyVocab[,1]), 2)
   expect_equal(slotNames(m@studySpecificVocabularies[[1]]), c("studyIdColumnName","variableSpec","studyVocab"))    
@@ -93,7 +93,7 @@ test_that("Megastudy and associated validation works", {
                  ancestorIdColumns=c('study.id', 'collection.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))
 
-  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies'))
+  expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT'))
   expect_equal(length(m@studySpecificVocabularies), 2)
   expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[2]]@studyVocab[,1]), 2)
   expect_equal(slotNames(m@studySpecificVocabularies[[2]]), c("studyIdColumnName","variableSpec","studyVocab"))              
@@ -199,7 +199,6 @@ test_that("imputeZeroes method is sane", {
   m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
                   ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
                   studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)),
-                  collectionIdColumn='collection.id',
                   collectionsDT=collectionsDT)
 
   variables <- new("VariableMetadataList", SimpleList(

From 40200dc9b715791bb5043deba033c43dba5d4c36 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Thu, 11 Apr 2024 12:45:42 -0400
Subject: [PATCH 07/12] fix a bug where we were trying to impute values of
 collections variables too, oops

---
 R/methods-Megastudy.R                 | 10 ++++++----
 tests/testthat/test-class-Megastudy.R | 17 +++++++++++++----
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index d4ac92d..b530a78 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -209,6 +209,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot?
   # plus going that route means using this class in plot.data means an api change for plot.data
   # that api change might be worth making in any case, but not doing it now
+  ## TODO validate that any collections variables are present in collectionsDT
   variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables)
   variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never")
   variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always")
@@ -261,16 +262,17 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
   if (!!length(collectionsDT)) {
     upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE)
-    upstreamEntityVariables.dt[[which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T))]] <- NULL
-    collectionEntityColumnIndex <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T))
-    names(upstreamEntityVariables.dt)[collectionEntityColumnIndex] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndex], fixed=T)
+  upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE]
+    collectionEntityColumnIndices <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T))
+    names(upstreamEntityVariables.dt)[collectionEntityColumnIndices] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndices], fixed=T)
+    upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
   }
   entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])
 
   # make all possible variable value combinations table
   vocabDTs <- lapply(vocabs, function(x) {x@studyVocab})
   if (!!length(collectionsDT)) {
-    vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) })
+    vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT[, upstreamEntityIdColNames], by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) })
   }
   mergeBy <- studyEntityIdColName
   if (!!length(collectionsDT)) mergeBy <- upstreamEntityIdColNames
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index b82e42c..c9148a8 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -13,9 +13,14 @@ megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'),
                           'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'),
                           'assay.weighting_variable'=c(5,10,15,20,25,30))
 
+## the collectionsDT needs to include all collections, unless explicitly filtered against
+## it also needs to tell us values for any collection variables that are in the plot
+## were trying to impute samples, based on collections that dont have samples.
+## so the only way to get collection variables values for those samples, is to get them from the collectionsDT
 collectionsDT <- data.table(
   'study.id' = c('a', 'a', 'a','b', 'b', 'b', 'b'),
-  'collection.id' = c(1, 2, 3, 1, 2, 3, 4))
+  'collection.id' = c(1, 2, 3, 1, 2, 3, 4),
+  'collection.attractant' = c('A', 'B', 'C', 'C', 'D', 'D', 'E'))
 
 sexVocabs.dt <- data.table::data.table(
   'study.id' = c('a', 'a', 'a', 'a', 'a', 'b', 'b'),
@@ -229,9 +234,9 @@ test_that("imputeZeroes method is sane", {
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
   expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
-  # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42
-  expect_equal(nrow(imputedDT), 42)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
+  # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69
+  expect_equal(nrow(imputedDT), 69) ## TODO check these numbers
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63)
 
   # case where one study vocab is missing a study
   mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE]
@@ -249,6 +254,10 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT), 36)
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 30)
 
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
+                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
+                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))
+
   # collection entity var is present
   variables <- new("VariableMetadataList", SimpleList(
     new("VariableMetadata",

From 691543dc93957650f68c9cac649eb2b89b905060 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Thu, 11 Apr 2024 13:30:42 -0400
Subject: [PATCH 08/12] a perf improvement

---
 R/methods-Megastudy.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index b530a78..c89d791 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -295,7 +295,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
 
   # go ahead and add them, first filling in values for all columns
   addCombinations.dt[[weightingVarColName]] <- 0  
-  addCombinations.dt[[varSpecEntityIdColName]] <- apply(addCombinations.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo="md5")
+  addCombinations.dt[[varSpecEntityIdColName]] <- stringi::stri_rand_strings(nrow(addCombinations.dt), 10)
   # bind them to the existing rows
   .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE)
   veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose)

From 48ae23fce55d0a9eef23672e70cd8c61c177b579 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Thu, 11 Apr 2024 14:04:36 -0400
Subject: [PATCH 09/12] add some validation for collectionsDT

---
 R/methods-Megastudy.R                 | 11 ++++++++++-
 tests/testthat/test-class-Megastudy.R |  5 +++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index c89d791..d650b39 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -255,6 +255,16 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   }
   studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered
 
+  # variables that are from the upstream entities need to be in collectionsDT
+  # otherwise we erroneously try to impute values for those variables too, rather than only the weighting variable
+  
+  if (!!length(collectionsDT)) {
+    upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntityIdColNames })
+    if (!all(upstreamEntityVariableColNames %in% names(collectionsDT))) {
+      stop("All variables from the upstream entities must be in collectionsDT.")
+    }
+  }
+
   # for upstream entities data
   upstreamEntityVariables.dt <- .dt[, -c(weightingVarColName, varSpecColNames), with=FALSE]
   upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL
@@ -282,7 +292,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE])
   # need upstream entity ids for all combinations in order to properly find and merge missing values
   allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = mergeBy, all = TRUE, allow.cartesian=TRUE)
-  ## TODO figure how to populate study and collection entity variable values based on ids
   # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab
   addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)]
 
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index c9148a8..969fed1 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -814,6 +814,7 @@ test_that("we have reasonable perf w a real-ish use case", {
   )
 
   benchmark <- microbenchmark::microbenchmark(getDTWithImputedZeroes(megastudyReal, megastudyVariablesReal, verbose = FALSE))
-  expect_true(mean(benchmark$time)/1000000 < 50) ## this is in milliseconds
-  expect_true(median(benchmark$time)/1000000 < 50)
+  print(benchmark)
+  expect_true(mean(benchmark$time)/1000000 < 55) ## this is in milliseconds
+  expect_true(median(benchmark$time)/1000000 < 55)
 })
\ No newline at end of file

From ad7efdf33082e8c1041ef7d30ba549f5be77bdf9 Mon Sep 17 00:00:00 2001
From: Danielle Callan <53306535+d-callan@users.noreply.github.com>
Date: Sat, 13 Apr 2024 07:01:43 -0400
Subject: [PATCH 10/12] Update R/methods-Megastudy.R

Co-authored-by: bobular <uncoolbob@gmail.com>
---
 R/methods-Megastudy.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index d650b39..cf0715f 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -272,7 +272,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
   if (!!length(collectionsDT)) {
     upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE)
-  upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE]
+    upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE]
     collectionEntityColumnIndices <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T))
     names(upstreamEntityVariables.dt)[collectionEntityColumnIndices] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndices], fixed=T)
     upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)

From 7ab9529242bc8e074421506d12f00ef6e9780ceb Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 15 Apr 2024 11:07:37 -0400
Subject: [PATCH 11/12] handle case where upstream variables are only in
 collectionsDT

---
 R/methods-Megastudy.R                 | 14 ++++-----
 tests/testthat/test-class-Megastudy.R | 45 ++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index d650b39..06e80c0 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -257,9 +257,9 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
 
   # variables that are from the upstream entities need to be in collectionsDT
   # otherwise we erroneously try to impute values for those variables too, rather than only the weighting variable
-  
+  upstreamEntities <- veupathUtils::strSplit(upstreamEntityIdColNames, ".", 2, 1)
   if (!!length(collectionsDT)) {
-    upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntityIdColNames })
+    upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntities })
     if (!all(upstreamEntityVariableColNames %in% names(collectionsDT))) {
       stop("All variables from the upstream entities must be in collectionsDT.")
     }
@@ -271,11 +271,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
   veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
   if (!!length(collectionsDT)) {
-    upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE)
-  upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE]
-    collectionEntityColumnIndices <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T))
-    names(upstreamEntityVariables.dt)[collectionEntityColumnIndices] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndices], fixed=T)
-    upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
+    upstreamEntityVariables.dt <- collectionsDT
   }
   entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])
 
@@ -306,6 +302,10 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   addCombinations.dt[[weightingVarColName]] <- 0  
   addCombinations.dt[[varSpecEntityIdColName]] <- stringi::stri_rand_strings(nrow(addCombinations.dt), 10)
   # bind them to the existing rows
+  upstreamVariablesInCollectionsDT <- names(collectionsDT)[!names(collectionsDT) %in% upstreamEntityIdColNames]
+  if (!!length(collectionsDT) & !all(upstreamVariablesInCollectionsDT %in% names(.dt))) {
+    .dt <- merge(.dt, upstreamEntityVariables.dt)
+  }
   .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE)
   veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose)
 
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index 969fed1..ffffead 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -201,7 +201,47 @@ test_that("imputeZeroes method is sane", {
   # in real life, we have some collections where all samples are 0 and so not loaded
   # in this case, the collection ids are missing from the data table R gets handed. 
   # we want to impute zeroes for their samples anyhow.
-  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
+
+  # in this version, some collection level variables are in both data and collectionsDT
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant'), with=FALSE],
+                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
+                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)),
+                  collectionsDT=collectionsDT)
+
+  variables <- new("VariableMetadataList", SimpleList(
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'xAxis'),
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'),
+      plotReference = new("PlotReference", value = 'yAxis'),
+      dataType = new("DataType", value = 'NUMBER'),
+      dataShape = new("DataShape", value = 'CONTINUOUS')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
+  ))
+
+  imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
+  # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
+  expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
+  # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69
+  expect_equal(nrow(imputedDT), 69) ## TODO check these numbers
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63)
+
+  # in this version, collection level variables are only in collectionsDT
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species'), with=FALSE],
                   ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
                   studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)),
                   collectionsDT=collectionsDT)
@@ -238,6 +278,9 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT), 69) ## TODO check these numbers
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63)
 
+  #################################################################################################################
+
+
   # case where one study vocab is missing a study
   mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE]
   mDTSexSingleStudy$sample.sex[mDTSexSingleStudy$study.id == 'b'] <- NA_character_

From 6c61e4cc08d7d0803a480dee0483a740fad43b29 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Tue, 16 Apr 2024 14:44:35 -0400
Subject: [PATCH 12/12] update docs

---
 R/class-Megastudy.R    | 3 ++-
 man/Megastudy-class.Rd | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R
index 85f21c7..4ff2df8 100644
--- a/R/class-Megastudy.R
+++ b/R/class-Megastudy.R
@@ -94,7 +94,8 @@ check_megastudy <- function(object) {
 #' @slot data A data.table
 #' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn.
 #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList
-#' @slot collectionIds A character vector of collection ids we expect. If none provided, they will be inferred from those present.
+#' @slot collectionIds A data.table including collection ids and any variables of interest for the collection entity.
+#' If none provided, the collection ids will be inferred from those present in `data`.
 #' 
 #' @name Megastudy-class
 #' @rdname Megastudy-class
diff --git a/man/Megastudy-class.Rd b/man/Megastudy-class.Rd
index 6dbb37f..1527fa1 100644
--- a/man/Megastudy-class.Rd
+++ b/man/Megastudy-class.Rd
@@ -16,6 +16,11 @@ that data.
 \describe{
 \item{\code{data}}{A data.table}
 
+\item{\code{ancestorIdColumns}}{A character vector of column names representing parent entities of the recordIdColumn.}
+
 \item{\code{studySpecificVocabularies}}{veupathUtils::StudySpecificVocabulariesByVariableList}
+
+\item{\code{collectionIds}}{A data.table including collection ids and any variables of interest for the collection entity.
+If none provided, the collection ids will be inferred from those present in \code{data}.}
 }}