From c3fa47f373d8afdfabe707b78aae4109b5d7af55 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 18 Dec 2023 09:21:34 -0500
Subject: [PATCH 1/8] allow impute zeroes to work with columns that arent in
 the final plot

---
 R/methods-Megastudy.R                 | 2 +-
 tests/testthat/test-class-Megastudy.R | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index b41244e..82adafa 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -227,7 +227,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   variableColumnNames <- unlist(lapply(as.list(variables), getVariableColumnNames))
   allEntityIdColumns <- object@ancestorIdColumns
   # drop things that arent in the plot, except ids
-  .dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE]
+  #.dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE]
   vocabs <- object@studySpecificVocabularies
 
   # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot?
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index cdcc281..b3edaff 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -115,7 +115,7 @@ test_that("Megastudy and associated validation works", {
 
 # TODO this could go in its own file maybe
 test_that("imputeZeroes method is sane", {
-  m <- Megastudy(data=megastudyDT,
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs)))
 
@@ -259,7 +259,7 @@ test_that("imputeZeroes method is sane", {
   ))
 
   imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE)
-  expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE)
+  expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE)
   expect_equal(nrow(imputedDT), nrow(mCOMPLETE@data))
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0)
 
@@ -368,7 +368,7 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0)
 
   # multiple special vocabs in same plot, w one shared weighting var
-  m <- Megastudy(data=megastudyDT,
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'sample.sex', 'collection.attractant'), with=FALSE],
                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs,sexVocabs)))
 

From 373429d8ab21640e6505683f03e5b1616aba4f6f Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 18 Dec 2023 11:35:31 -0500
Subject: [PATCH 2/8] proper null handling for variable collections when
 imputing zeroes

---
 R/methods-Megastudy.R | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index 82adafa..c041da9 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -28,8 +28,11 @@ setMethod('getVariableSpec', signature('VariableMetadata'), function(object, get
   getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)
   varSpecs <- list(object@variableSpec)
 
+  #if the variable is a collection, then we want to return the member variable specs
   if (object@isCollection && getCollectionMemberVarSpecs) {
     varSpecs <- as.list(object@members)
+  } else if (!object@isCollection && getCollectionMemberVarSpecs) {
+    varSpecs <- NULL
   }
 
   return(varSpecs)
@@ -38,8 +41,13 @@ setMethod('getVariableSpec', signature('VariableMetadata'), function(object, get
 #' @export 
 setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c(TRUE, FALSE)) {
   getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)
+  
+  varSpecs <- unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs))
+  if (all(unlist(lapply(varSpecs, is.null)))) {
+    return(NULL)
+  }
 
-  return(unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs)))
+  return(varSpecs)
 })
 
 #' StuydIdColName as String
@@ -176,7 +184,11 @@ findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectio
 
   if (getCollectionMemberVarSpecs) {
     varMetadataWithVocabs <- findVariableMetadataFromVariableSpec(variables, varSpecsWithVocabs)
-    varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs)))
+    varSpecsWithVocabs <- getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs)
+    if (is.null(varSpecsWithVocabs)) {
+      return(NULL)
+    }
+    varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(varSpecsWithVocabs))
   }
 
   return(varSpecsWithVocabs)
@@ -222,28 +234,23 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
 
   .dt <- object@data
   veupathUtils::logWithTime(paste0("Imputing zeroes for data.table with ", ncol(.dt), " columns and ", nrow(.dt), " rows"), verbose)
-  # TODO feel like im doing this operation a lot.. maybe another method/ helper?
-  # also, try to figure a way we dont have to do this.. i dont remember why i did this and its inconsistent behavior
-  variableColumnNames <- unlist(lapply(as.list(variables), getVariableColumnNames))
   allEntityIdColumns <- object@ancestorIdColumns
-  # drop things that arent in the plot, except ids
-  #.dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE]
   vocabs <- object@studySpecificVocabularies
 
   # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot?
   # plus going that route means using this class in plot.data means an api change for plot.data
   # that api change might be worth making in any case, but not doing it now
   variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables)
-  variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE)
-  variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE)
+  variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE)
+  variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE)
   if (is.null(variableCollectionSpecsWithStudyVocabs)) {
     variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs)
   } else {
     variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableCollectionSpecsWithStudyVocabs)
   }
-  if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) {
-    warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.")
-  }
+  #if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) {
+  #  warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.")
+  #}
   if (length(variableMetadataForStudyVocabVariables) < length(variableMetadataNeedingStudyVocabularies)) {
     stop("Some provided variables require study vocabularies but dont have one.")
   }

From 4bf593628bc19f28b2967ead7dfbbd91147f4bab Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 18 Dec 2023 13:36:02 -0500
Subject: [PATCH 3/8] keep improving getCollectionMemberVariableSpecs arg

---
 R/methods-Megastudy.R        | 21 +++++++++------------
 R/methods-VariableMetadata.R |  7 ++++---
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index c041da9..88cc4d1 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -22,16 +22,15 @@ setMethod('getVariableSpec', signature('ANY'), function(object) {
   return(object@variableSpec)
 })
 
-# this might be unexpected behavior. should there be a param to choose between the collection and its member specs?
 #' @export
-setMethod('getVariableSpec', signature('VariableMetadata'), function(object, getCollectionMemberVarSpecs = c(FALSE, TRUE)) {
+setMethod('getVariableSpec', signature('VariableMetadata'), function(object, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) {
   getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)
   varSpecs <- list(object@variableSpec)
 
   #if the variable is a collection, then we want to return the member variable specs
-  if (object@isCollection && getCollectionMemberVarSpecs) {
+  if (object@isCollection && getCollectionMemberVarSpecs %in% c("Dynamic", "Always")) {
     varSpecs <- as.list(object@members)
-  } else if (!object@isCollection && getCollectionMemberVarSpecs) {
+  } else if (!object@isCollection && getCollectionMemberVarSpecs == "Always") {
     varSpecs <- NULL
   }
 
@@ -39,7 +38,7 @@ setMethod('getVariableSpec', signature('VariableMetadata'), function(object, get
 })
 
 #' @export 
-setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c(TRUE, FALSE)) {
+setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) {
   getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)
   
   varSpecs <- unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs))
@@ -157,7 +156,7 @@ findStudyVocabularyByVariableSpec <- function(vocabs, variables, variableSpec) {
 
   vocabVariableSpecs <- lapply(as.list(vocabs), veupathUtils::getVariableSpec)
   vocabVariableMetadata <- veupathUtils::findVariableMetadataFromVariableSpec(variables, veupathUtils::VariableSpecList(S4Vectors::SimpleList(vocabVariableSpecs)))
-  vocabVariableSpecsAdjustedForVariableCollectionMembers <- veupathUtils::getVariableSpec(vocabVariableMetadata, TRUE)
+  vocabVariableSpecsAdjustedForVariableCollectionMembers <- veupathUtils::getVariableSpec(vocabVariableMetadata, "Dynamic")
   
   # if we have found variable collection members in the VariableMetadata, need to check if the passed varspec was a member
   # look through the list that includes the members, and if we match one, get the varspec of the parent/ collection
@@ -175,14 +174,14 @@ findStudyVocabularyByVariableSpec <- function(vocabs, variables, variableSpec) {
 }
 
 
-findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectionMemberVarSpecs = c(TRUE, FALSE)) {
+findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) {
   if (!inherits(vocabs, 'StudySpecificVocabulariesByVariableList')) stop("The first argument must be of the S4 class `StudySpecificVocabulariesByVariableList`.")
   if (!inherits(variables, 'VariableMetadataList')) stop("The second argument must be of the S4 class `VariableMetadataList`.")
   getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)
 
   varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(lapply(as.list(vocabs), getVariableSpec)))
 
-  if (getCollectionMemberVarSpecs) {
+  if (getCollectionMemberVarSpecs != "Never") {
     varMetadataWithVocabs <- findVariableMetadataFromVariableSpec(variables, varSpecsWithVocabs)
     varSpecsWithVocabs <- getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs)
     if (is.null(varSpecsWithVocabs)) {
@@ -241,8 +240,8 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   # plus going that route means using this class in plot.data means an api change for plot.data
   # that api change might be worth making in any case, but not doing it now
   variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables)
-  variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE)
-  variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE)
+  variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never")
+  variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always")
   if (is.null(variableCollectionSpecsWithStudyVocabs)) {
     variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs)
   } else {
@@ -326,8 +325,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   }
   .dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues)
   veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose)
-  message(colnames(.dt2))
-  message(head(.dt2))
 
   #make impossibly unique ids
   .dt2[[varSpecEntityIdColName]] <- apply(.dt2[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo='md5')
diff --git a/R/methods-VariableMetadata.R b/R/methods-VariableMetadata.R
index fda8759..8bbac93 100644
--- a/R/methods-VariableMetadata.R
+++ b/R/methods-VariableMetadata.R
@@ -639,18 +639,19 @@ setGeneric("findVariableMetadataFromVariableSpec",
 
 #' @export
 setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpecList"), function(variables, object) {
-  variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, FALSE))
+  variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, "Never"))
   colNamesToMatch <- unlist(lapply(as.list(object), veupathUtils::getColName))
  
   index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) %in% colNamesToMatch)
+  
   if (!length(index)) return(NULL)
-
+  
   return(variables[index]) 
 })
 
 #' @export
 setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpec"), function(variables, object) {
-  variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, FALSE))
+  variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, "Never"))
  
   index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(object))
   if (!length(index)) return(NULL)

From 164e28fdf883310194b00a3014bbb4253d0c6659 Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Mon, 18 Dec 2023 13:36:40 -0500
Subject: [PATCH 4/8] update tests to better reflect real world data

---
 tests/testthat/test-class-Megastudy.R | 102 +++++++++++++++++++++-----
 1 file changed, 83 insertions(+), 19 deletions(-)

diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index b3edaff..bc7bb33 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -115,9 +115,9 @@ test_that("Megastudy and associated validation works", {
 
 # TODO this could go in its own file maybe
 test_that("imputeZeroes method is sane", {
-  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE],
                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
-                 studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs)))
+                 studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))
 
   # case where neither study nor collection vars in the plot
   variables <- new("VariableMetadataList", SimpleList(
@@ -134,15 +134,23 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'),
       plotReference = new("PlotReference", value = 'yAxis'),
       dataType = new("DataType", value = 'NUMBER'),
-      dataShape = new("DataShape", value = 'CONTINUOUS'))
+      dataShape = new("DataShape", value = 'CONTINUOUS')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
-  # TODO lol its just possible this fxn shouldnt remove cols but my brain hurts enough already 
   expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 12)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
+  # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) = 21
+  expect_equal(nrow(imputedDT), 21)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
 
   # collection entity var is present
   variables <- new("VariableMetadataList", SimpleList(
@@ -165,13 +173,21 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'),
       plotReference = new("PlotReference", value = 'overlay'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 12)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
+  expect_equal(nrow(imputedDT), 21)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
 
   # both collection and study entity vars are present
   variables <- new("VariableMetadataList", SimpleList(
@@ -200,13 +216,21 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant","study.author") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 12)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
+  expect_equal(nrow(imputedDT), 21)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
 
   # all values in vocab already present
   megastudyDTSMALL <- rbind(megastudyDT, 
@@ -255,7 +279,15 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE)
@@ -284,7 +316,15 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE)
@@ -323,7 +363,15 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
@@ -359,7 +407,15 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'pathogen_presence', entityId = 'assay'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
@@ -405,8 +461,8 @@ test_that("imputeZeroes method is sane", {
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   expect_equal(all(c("sample.species","sample.specimen_count","sample.sex","collection.attractant") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 20)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 14)
+  expect_equal(nrow(imputedDT), 21)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
 
   # special vocab on sample, regular weighting var on assay
   # phase this in
@@ -544,7 +600,15 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'))
+      dataShape = new("DataShape", value = 'CATEGORICAL')),
+    new("VariableMetadata",
+      variableClass = new("VariableClass", value = 'native'),
+      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
+      # empty plotReference means that it is not plotted
+      dataType = new("DataType", value = 'STRING'),
+      dataShape = new("DataShape", value = 'CATEGORICAL'),
+      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
+      hasStudyDependentVocabulary = TRUE)
   ))
 
   expect_error(getDTWithImputedZeroes(m, variables, FALSE))

From d21bb0d7f7d3c46076ea9d61234b1fd1c63f546d Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Tue, 19 Dec 2023 10:28:40 -0500
Subject: [PATCH 5/8] make sure we dont lose any combinations when merging
 imputed zeroes

---
 R/methods-Megastudy.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index 88cc4d1..f5b9c2d 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -321,7 +321,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   
   dataTablesOfImputedValues <- lapply(variableSpecsToImputeZeroesFor, makeImputedZeroesDT)
   mergeDTsOfImputedValues <- function(x,y) {
-    merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE)
+    merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE, all=TRUE)
   }
   .dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues)
   veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose)

From 07855d9ed5bb254034bd3d0efa498253b67a569a Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Tue, 19 Dec 2023 11:47:03 -0500
Subject: [PATCH 6/8] update tests

---
 R/methods-Megastudy.R                 |  9 +--------
 tests/testthat/test-class-Megastudy.R | 17 ++++++++---------
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index f5b9c2d..03838a3 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -242,14 +242,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables)
   variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never")
   variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always")
-  if (is.null(variableCollectionSpecsWithStudyVocabs)) {
-    variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs)
-  } else {
-    variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableCollectionSpecsWithStudyVocabs)
-  }
-  #if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) {
-  #  warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.")
-  #}
+  variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs)
   if (length(variableMetadataForStudyVocabVariables) < length(variableMetadataNeedingStudyVocabularies)) {
     stop("Some provided variables require study vocabularies but dont have one.")
   }
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index bc7bb33..0677aa5 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -249,9 +249,9 @@ test_that("imputeZeroes method is sane", {
                                                    assay.pathogen3_presence=c('No','Yes'),
                                                    assay.weighting_variable=c(35,40)))
   
-  mCOMPLETE <- Megastudy(data=megastudyDTSMALL,
+  mCOMPLETE <- Megastudy(data=megastudyDTSMALL[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'sample.sex', 'collection.attractant', 'study.author'), with=FALSE],
                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'),
-                 studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabsSMALL)))
+                 studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabsSMALL, sexVocabsSMALL)))
   
   variables <- new("VariableMetadataList", SimpleList(
     new("VariableMetadata",
@@ -292,8 +292,9 @@ test_that("imputeZeroes method is sane", {
 
   imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE)
   expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE)
-  expect_equal(nrow(imputedDT), nrow(mCOMPLETE@data))
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0)
+  # 2 species * 2 sexes * 2 collections * 2 studies = 16
+  expect_equal(nrow(imputedDT), 16)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
 
   # no weighting var in plot
   variables <- new("VariableMetadataList", SimpleList(
@@ -333,9 +334,9 @@ test_that("imputeZeroes method is sane", {
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0)
 
   # an assay var is present 
-  m <- Megastudy(data=megastudyDT,
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id','assay.id', 'assay.pathogen_prevalence', 'sample.species', 'sample.sex', 'collection.attractant', 'study.author')],
                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id'),
-                 studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs)))
+                 studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)))
 
   variables <- new("VariableMetadataList", SimpleList(
     new("VariableMetadata",
@@ -375,8 +376,6 @@ test_that("imputeZeroes method is sane", {
   ))
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
-  # are we ok that it leaves all cols if it decides nothing needs doing??
-  # it wont hurt anything, its just inconsistent behavior
   expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE)
   expect_equal(nrow(imputedDT), nrow(m@data))
   expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0)
@@ -618,7 +617,7 @@ test_that("imputeZeroes method is sane", {
                                                                                                 StudySpecificVocabulary(studyIdColumnName='study.id', study='b', variableSpec=VariableSpec(entityId='assay',variableId='pathogen_presence_variable_collection'), vocabulary=c('Yes','No'))))
 
 
-  m <- Megastudy(data=megastudyDT,
+  m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'assay.id', 'sample.specimen_count', 'collection.attractant', 'study.author', 'assay.pathogen_presence', 'assay.pathogen2_presence', 'assay.pathogen3_presence'), with=FALSE],
                  ancestorIdColumns=c('study.id', 'collection.id', 'sample.id', 'assay.id'),
                  studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(pathogenVariableCollectionVocabs)))
 

From 1e7f4a94a07ca3d074d07fb24c6a3d8ed60b812d Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Tue, 19 Dec 2023 13:11:54 -0500
Subject: [PATCH 7/8] clean up code, fix a bug, update some tests, etc

---
 R/methods-Megastudy.R                 | 58 +++++++++++++--------------
 tests/testthat/test-class-Megastudy.R | 28 +++++++------
 2 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R
index 03838a3..ad4837d 100644
--- a/R/methods-Megastudy.R
+++ b/R/methods-Megastudy.R
@@ -284,45 +284,45 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
   }
 
   # for upstream entities data
-  combinations.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE])
-  combinations.dt[[varSpecEntityIdColName]] <- NULL
-  combinations.dt <- unique(combinations.dt)
-  veupathUtils::logWithTime(paste("Found", nrow(combinations.dt), "existing variable value combinations."), verbose)
+  upstreamEntityVariables.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE])
+  upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL
+  upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
+  veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
   entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])
   
-
-  # impute zeroes for each study vocab iteratively
-  makeImputedZeroesDT <- function(variableSpec) {
-    veupathUtils::logWithTime(paste("Imputing zeroes for", veupathUtils::getColName(variableSpec)), verbose)
+  # make vocab table for a single variable
+  makeVocabDT <- function(variableSpec) {
+    veupathUtils::logWithTime(paste("Finding vocab for", veupathUtils::getColName(variableSpec)), verbose)
     varSpecColName <- veupathUtils::getColName(variableSpec)
     vocab <- findStudyVocabularyByVariableSpec(vocabs, variables, variableSpec)
     vocabs.dt <- veupathUtils::as.data.table(vocab)
     names(vocabs.dt)[2] <- varSpecColName
-    vocabs.dt <- merge(entityIds.dt, vocabs.dt, by=studyIdColName, allow.cartesian=TRUE)
-    present.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColName), with=FALSE])
-    # assume if a value was explicitly filtered against that its not in the vocab
-    add.dt <- vocabs.dt[!present.dt, on=c(upstreamEntityIdColNames, varSpecColName)]
-    if (nrow(add.dt) > 0) {
-      add.dt[[weightingVarColName]] <- 0
-    } else {
-      add.dt[[weightingVarColName]] <- numeric()
-    }
-   
-    veupathUtils::logWithTime(paste("Found", nrow(add.dt), "new combinations of values for", veupathUtils::getColName(variableSpec)), verbose)
-    return(unique(add.dt))
+    return(vocabs.dt)
   }
+
+  # make all possible variable value combinations table
+  vocabDTs <- lapply(variableSpecsToImputeZeroesFor, makeVocabDT)
+  allCombinations.dt <- purrr::reduce(vocabDTs, merge, allow.cartesian=TRUE)
   
-  dataTablesOfImputedValues <- lapply(variableSpecsToImputeZeroesFor, makeImputedZeroesDT)
-  mergeDTsOfImputedValues <- function(x,y) {
-    merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE, all=TRUE)
+  # find which ones we need to add
+  presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE])
+  # need upstream entity ids for all combinations in order to properly find and merge missing values
+  allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, allow.cartesian=TRUE)
+  # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab
+  addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)]
+
+  if (nrow(addCombinations.dt) == 0) {
+    veupathUtils::logWithTime("No new combinations to add. Returning existing table.", verbose)
+    return(.dt)
+  } else {
+    veupathUtils::logWithTime(paste("Adding", nrow(addCombinations.dt), "new combinations."), verbose)
   }
-  .dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues)
-  veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose)
 
-  #make impossibly unique ids
-  .dt2[[varSpecEntityIdColName]] <- apply(.dt2[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo='md5')
-  .dt2 <- unique(merge(.dt2, combinations.dt, by=upstreamEntityIdColNames))
-  .dt <- rbind(.dt, .dt2)
+  # go ahead and add them, first filling in values for all columns
+  addCombinations.dt[[weightingVarColName]] <- 0  
+  addCombinations.dt[[varSpecEntityIdColName]] <- apply(addCombinations.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo="md5")
+  # bind them to the existing rows
+  .dt <- rbind(.dt, addCombinations.dt)
   veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose)
  
   return(.dt)
diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index 0677aa5..ef3607b 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -148,9 +148,9 @@ test_that("imputeZeroes method is sane", {
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
   expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
-  # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) = 21
-  expect_equal(nrow(imputedDT), 21)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
+  # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42
+  expect_equal(nrow(imputedDT), 42)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
 
   # collection entity var is present
   variables <- new("VariableMetadataList", SimpleList(
@@ -186,8 +186,8 @@ test_that("imputeZeroes method is sane", {
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 21)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
+  expect_equal(nrow(imputedDT), 42)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
 
   # both collection and study entity vars are present
   variables <- new("VariableMetadataList", SimpleList(
@@ -229,8 +229,8 @@ test_that("imputeZeroes method is sane", {
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant","study.author") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 21)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
+  expect_equal(nrow(imputedDT), 42)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
 
   # all values in vocab already present
   megastudyDTSMALL <- rbind(megastudyDT, 
@@ -294,7 +294,7 @@ test_that("imputeZeroes method is sane", {
   expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE)
   # 2 species * 2 sexes * 2 collections * 2 studies = 16
   expect_equal(nrow(imputedDT), 16)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 8)
 
   # no weighting var in plot
   variables <- new("VariableMetadataList", SimpleList(
@@ -460,8 +460,8 @@ test_that("imputeZeroes method is sane", {
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   expect_equal(all(c("sample.species","sample.specimen_count","sample.sex","collection.attractant") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 21)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
+  expect_equal(nrow(imputedDT), 42)
+  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)
 
   # special vocab on sample, regular weighting var on assay
   # phase this in
@@ -645,8 +645,10 @@ test_that("imputeZeroes method is sane", {
 
   imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
   # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
-  # TODO lol its just possible this fxn shouldnt remove cols but my brain hurts enough already 
   expect_equal(all(c("assay.pathogen_presence","assay.pathogen2_presence","assay.pathogen3_presence","sample.specimen_count") %in% names(imputedDT)), TRUE)
-  expect_equal(nrow(imputedDT), 12)
-  expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
+  # 2 studies * 2 collections per study * 2 values for each of 3 pathogen variables = 32?
+  # im not sure this test makes any sense, bc were imputing 0 on a sample for a collection on assay
+  # im going to comment until we see a real use case
+  #expect_equal(nrow(imputedDT), 32)
+  #expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 26)
 })
\ No newline at end of file

From c0c47c0929a9910dfcafb6f665b248a263755d8a Mon Sep 17 00:00:00 2001
From: Danielle Callan <dcallan@upenn.edu>
Date: Thu, 21 Dec 2023 09:30:03 -0500
Subject: [PATCH 8/8] remove duplicate entry from variables in the megastudy
 tests

---
 tests/testthat/test-class-Megastudy.R | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R
index ef3607b..f0bc89c 100644
--- a/tests/testthat/test-class-Megastudy.R
+++ b/tests/testthat/test-class-Megastudy.R
@@ -599,15 +599,7 @@ test_that("imputeZeroes method is sane", {
       variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'),
       plotReference = new("PlotReference", value = 'facet1'),
       dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL')),
-    new("VariableMetadata",
-      variableClass = new("VariableClass", value = 'native'),
-      variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'),
-      # empty plotReference means that it is not plotted
-      dataType = new("DataType", value = 'STRING'),
-      dataShape = new("DataShape", value = 'CATEGORICAL'),
-      weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
-      hasStudyDependentVocabulary = TRUE)
+      dataShape = new("DataShape", value = 'CATEGORICAL'))
   ))
 
   expect_error(getDTWithImputedZeroes(m, variables, FALSE))