From c3fa47f373d8afdfabe707b78aae4109b5d7af55 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 18 Dec 2023 09:21:34 -0500 Subject: [PATCH 1/8] allow impute zeroes to work with columns that arent in the final plot --- R/methods-Megastudy.R | 2 +- tests/testthat/test-class-Megastudy.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index b41244e..82adafa 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -227,7 +227,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata variableColumnNames <- unlist(lapply(as.list(variables), getVariableColumnNames)) allEntityIdColumns <- object@ancestorIdColumns # drop things that arent in the plot, except ids - .dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE] + #.dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE] vocabs <- object@studySpecificVocabularies # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot? diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index cdcc281..b3edaff 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -115,7 +115,7 @@ test_that("Megastudy and associated validation works", { # TODO this could go in its own file maybe test_that("imputeZeroes method is sane", { - m <- Megastudy(data=megastudyDT, + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) @@ -259,7 +259,7 @@ test_that("imputeZeroes method is sane", { )) imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE) - expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE) + expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE) expect_equal(nrow(imputedDT), nrow(mCOMPLETE@data)) expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) @@ -368,7 +368,7 @@ test_that("imputeZeroes method is sane", { expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) # multiple special vocabs in same plot, w one shared weighting var - m <- Megastudy(data=megastudyDT, + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'sample.sex', 'collection.attractant'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs,sexVocabs))) From 373429d8ab21640e6505683f03e5b1616aba4f6f Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 18 Dec 2023 11:35:31 -0500 Subject: [PATCH 2/8] proper null handling for variable collections when imputing zeroes --- R/methods-Megastudy.R | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index 82adafa..c041da9 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -28,8 +28,11 @@ setMethod('getVariableSpec', signature('VariableMetadata'), function(object, get getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) varSpecs <- list(object@variableSpec) + #if the variable is a collection, then we want to return the member variable specs if (object@isCollection && getCollectionMemberVarSpecs) { varSpecs <- as.list(object@members) + } else if (!object@isCollection && getCollectionMemberVarSpecs) { + varSpecs <- NULL } return(varSpecs) @@ -38,8 +41,13 @@ setMethod('getVariableSpec', signature('VariableMetadata'), function(object, get #' @export setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c(TRUE, FALSE)) { getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) + + varSpecs <- unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs)) + if (all(unlist(lapply(varSpecs, is.null)))) { + return(NULL) + } - return(unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs))) + return(varSpecs) }) #' StuydIdColName as String @@ -176,7 +184,11 @@ findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectio if (getCollectionMemberVarSpecs) { varMetadataWithVocabs <- findVariableMetadataFromVariableSpec(variables, varSpecsWithVocabs) - varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs))) + varSpecsWithVocabs <- getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs) + if (is.null(varSpecsWithVocabs)) { + return(NULL) + } + varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(varSpecsWithVocabs)) } return(varSpecsWithVocabs) @@ -222,28 +234,23 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata .dt <- object@data veupathUtils::logWithTime(paste0("Imputing zeroes for data.table with ", ncol(.dt), " columns and ", nrow(.dt), " rows"), verbose) - # TODO feel like im doing this operation a lot.. maybe another method/ helper? - # also, try to figure a way we dont have to do this.. i dont remember why i did this and its inconsistent behavior - variableColumnNames <- unlist(lapply(as.list(variables), getVariableColumnNames)) allEntityIdColumns <- object@ancestorIdColumns - # drop things that arent in the plot, except ids - #.dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE] vocabs <- object@studySpecificVocabularies # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot? # plus going that route means using this class in plot.data means an api change for plot.data # that api change might be worth making in any case, but not doing it now variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables) - variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE) - variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE) + variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE) + variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE) if (is.null(variableCollectionSpecsWithStudyVocabs)) { variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs) } else { variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableCollectionSpecsWithStudyVocabs) } - if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) { - warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.") - } + #if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) { + # warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.") + #} if (length(variableMetadataForStudyVocabVariables) < length(variableMetadataNeedingStudyVocabularies)) { stop("Some provided variables require study vocabularies but dont have one.") } From 4bf593628bc19f28b2967ead7dfbbd91147f4bab Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 18 Dec 2023 13:36:02 -0500 Subject: [PATCH 3/8] keep improving getCollectionMemberVariableSpecs arg --- R/methods-Megastudy.R | 21 +++++++++------------ R/methods-VariableMetadata.R | 7 ++++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index c041da9..88cc4d1 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -22,16 +22,15 @@ setMethod('getVariableSpec', signature('ANY'), function(object) { return(object@variableSpec) }) -# this might be unexpected behavior. should there be a param to choose between the collection and its member specs? #' @export -setMethod('getVariableSpec', signature('VariableMetadata'), function(object, getCollectionMemberVarSpecs = c(FALSE, TRUE)) { +setMethod('getVariableSpec', signature('VariableMetadata'), function(object, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) { getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) varSpecs <- list(object@variableSpec) #if the variable is a collection, then we want to return the member variable specs - if (object@isCollection && getCollectionMemberVarSpecs) { + if (object@isCollection && getCollectionMemberVarSpecs %in% c("Dynamic", "Always")) { varSpecs <- as.list(object@members) - } else if (!object@isCollection && getCollectionMemberVarSpecs) { + } else if (!object@isCollection && getCollectionMemberVarSpecs == "Always") { varSpecs <- NULL } @@ -39,7 +38,7 @@ setMethod('getVariableSpec', signature('VariableMetadata'), function(object, get }) #' @export -setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c(TRUE, FALSE)) { +setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) { getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) varSpecs <- unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs)) @@ -157,7 +156,7 @@ findStudyVocabularyByVariableSpec <- function(vocabs, variables, variableSpec) { vocabVariableSpecs <- lapply(as.list(vocabs), veupathUtils::getVariableSpec) vocabVariableMetadata <- veupathUtils::findVariableMetadataFromVariableSpec(variables, veupathUtils::VariableSpecList(S4Vectors::SimpleList(vocabVariableSpecs))) - vocabVariableSpecsAdjustedForVariableCollectionMembers <- veupathUtils::getVariableSpec(vocabVariableMetadata, TRUE) + vocabVariableSpecsAdjustedForVariableCollectionMembers <- veupathUtils::getVariableSpec(vocabVariableMetadata, "Dynamic") # if we have found variable collection members in the VariableMetadata, need to check if the passed varspec was a member # look through the list that includes the members, and if we match one, get the varspec of the parent/ collection @@ -175,14 +174,14 @@ findStudyVocabularyByVariableSpec <- function(vocabs, variables, variableSpec) { } -findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectionMemberVarSpecs = c(TRUE, FALSE)) { +findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectionMemberVarSpecs = c("Dynamic", "Never", "Always")) { if (!inherits(vocabs, 'StudySpecificVocabulariesByVariableList')) stop("The first argument must be of the S4 class `StudySpecificVocabulariesByVariableList`.") if (!inherits(variables, 'VariableMetadataList')) stop("The second argument must be of the S4 class `VariableMetadataList`.") getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs) varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(lapply(as.list(vocabs), getVariableSpec))) - if (getCollectionMemberVarSpecs) { + if (getCollectionMemberVarSpecs != "Never") { varMetadataWithVocabs <- findVariableMetadataFromVariableSpec(variables, varSpecsWithVocabs) varSpecsWithVocabs <- getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs) if (is.null(varSpecsWithVocabs)) { @@ -241,8 +240,8 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata # plus going that route means using this class in plot.data means an api change for plot.data # that api change might be worth making in any case, but not doing it now variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables) - variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE) - variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE) + variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never") + variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always") if (is.null(variableCollectionSpecsWithStudyVocabs)) { variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs) } else { @@ -326,8 +325,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata } .dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues) veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose) - message(colnames(.dt2)) - message(head(.dt2)) #make impossibly unique ids .dt2[[varSpecEntityIdColName]] <- apply(.dt2[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo='md5') diff --git a/R/methods-VariableMetadata.R b/R/methods-VariableMetadata.R index fda8759..8bbac93 100644 --- a/R/methods-VariableMetadata.R +++ b/R/methods-VariableMetadata.R @@ -639,18 +639,19 @@ setGeneric("findVariableMetadataFromVariableSpec", #' @export setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpecList"), function(variables, object) { - variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, FALSE)) + variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, "Never")) colNamesToMatch <- unlist(lapply(as.list(object), veupathUtils::getColName)) index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) %in% colNamesToMatch) + if (!length(index)) return(NULL) - + return(variables[index]) }) #' @export setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpec"), function(variables, object) { - variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, FALSE)) + variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, "Never")) index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(object)) if (!length(index)) return(NULL) From 164e28fdf883310194b00a3014bbb4253d0c6659 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 18 Dec 2023 13:36:40 -0500 Subject: [PATCH 4/8] update tests to better reflect real world data --- tests/testthat/test-class-Megastudy.R | 102 +++++++++++++++++++++----- 1 file changed, 83 insertions(+), 19 deletions(-) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index b3edaff..bc7bb33 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -115,9 +115,9 @@ test_that("Megastudy and associated validation works", { # TODO this could go in its own file maybe test_that("imputeZeroes method is sane", { - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) + studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) # case where neither study nor collection vars in the plot variables <- new("VariableMetadataList", SimpleList( @@ -134,15 +134,23 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), plotReference = new("PlotReference", value = 'yAxis'), dataType = new("DataType", value = 'NUMBER'), - dataShape = new("DataShape", value = 'CONTINUOUS')) + dataShape = new("DataShape", value = 'CONTINUOUS')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - # TODO lol its just possible this fxn shouldnt remove cols but my brain hurts enough already expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 12) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) + # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) = 21 + expect_equal(nrow(imputedDT), 21) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) # collection entity var is present variables <- new("VariableMetadataList", SimpleList( @@ -165,13 +173,21 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), plotReference = new("PlotReference", value = 'overlay'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 12) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) + expect_equal(nrow(imputedDT), 21) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) # both collection and study entity vars are present variables <- new("VariableMetadataList", SimpleList( @@ -200,13 +216,21 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant","study.author") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 12) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) + expect_equal(nrow(imputedDT), 21) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) # all values in vocab already present megastudyDTSMALL <- rbind(megastudyDT, @@ -255,7 +279,15 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE) @@ -284,7 +316,15 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE) @@ -323,7 +363,15 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'author', entityId = 'study'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) @@ -359,7 +407,15 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'pathogen_presence', entityId = 'assay'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) @@ -405,8 +461,8 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) expect_equal(all(c("sample.species","sample.specimen_count","sample.sex","collection.attractant") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 20) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 14) + expect_equal(nrow(imputedDT), 21) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) # special vocab on sample, regular weighting var on assay # phase this in @@ -544,7 +600,15 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')) + dataShape = new("DataShape", value = 'CATEGORICAL')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) )) expect_error(getDTWithImputedZeroes(m, variables, FALSE)) From d21bb0d7f7d3c46076ea9d61234b1fd1c63f546d Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Tue, 19 Dec 2023 10:28:40 -0500 Subject: [PATCH 5/8] make sure we dont lose any combinations when merging imputed zeroes --- R/methods-Megastudy.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index 88cc4d1..f5b9c2d 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -321,7 +321,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata dataTablesOfImputedValues <- lapply(variableSpecsToImputeZeroesFor, makeImputedZeroesDT) mergeDTsOfImputedValues <- function(x,y) { - merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE) + merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE, all=TRUE) } .dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues) veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose) From 07855d9ed5bb254034bd3d0efa498253b67a569a Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Tue, 19 Dec 2023 11:47:03 -0500 Subject: [PATCH 6/8] update tests --- R/methods-Megastudy.R | 9 +-------- tests/testthat/test-class-Megastudy.R | 17 ++++++++--------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index f5b9c2d..03838a3 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -242,14 +242,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables) variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never") variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always") - if (is.null(variableCollectionSpecsWithStudyVocabs)) { - variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs) - } else { - variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableCollectionSpecsWithStudyVocabs) - } - #if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) { - # warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.") - #} + variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs) if (length(variableMetadataForStudyVocabVariables) < length(variableMetadataNeedingStudyVocabularies)) { stop("Some provided variables require study vocabularies but dont have one.") } diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index bc7bb33..0677aa5 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -249,9 +249,9 @@ test_that("imputeZeroes method is sane", { assay.pathogen3_presence=c('No','Yes'), assay.weighting_variable=c(35,40))) - mCOMPLETE <- Megastudy(data=megastudyDTSMALL, + mCOMPLETE <- Megastudy(data=megastudyDTSMALL[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.species', 'sample.sex', 'collection.attractant', 'study.author'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabsSMALL))) + studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabsSMALL, sexVocabsSMALL))) variables <- new("VariableMetadataList", SimpleList( new("VariableMetadata", @@ -292,8 +292,9 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(mCOMPLETE, variables, FALSE) expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE) - expect_equal(nrow(imputedDT), nrow(mCOMPLETE@data)) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) + # 2 species * 2 sexes * 2 collections * 2 studies = 16 + expect_equal(nrow(imputedDT), 16) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) # no weighting var in plot variables <- new("VariableMetadataList", SimpleList( @@ -333,9 +334,9 @@ test_that("imputeZeroes method is sane", { expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) # an assay var is present - m <- Megastudy(data=megastudyDT, + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id','assay.id', 'assay.pathogen_prevalence', 'sample.species', 'sample.sex', 'collection.attractant', 'study.author')], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id','assay.id'), - studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) + studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) variables <- new("VariableMetadataList", SimpleList( new("VariableMetadata", @@ -375,8 +376,6 @@ test_that("imputeZeroes method is sane", { )) imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) - # are we ok that it leaves all cols if it decides nothing needs doing?? - # it wont hurt anything, its just inconsistent behavior expect_equal(all(names(imputedDT) %in% names(m@data)), TRUE) expect_equal(nrow(imputedDT), nrow(m@data)) expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 0) @@ -618,7 +617,7 @@ test_that("imputeZeroes method is sane", { StudySpecificVocabulary(studyIdColumnName='study.id', study='b', variableSpec=VariableSpec(entityId='assay',variableId='pathogen_presence_variable_collection'), vocabulary=c('Yes','No')))) - m <- Megastudy(data=megastudyDT, + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'assay.id', 'sample.specimen_count', 'collection.attractant', 'study.author', 'assay.pathogen_presence', 'assay.pathogen2_presence', 'assay.pathogen3_presence'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id', 'assay.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(pathogenVariableCollectionVocabs))) From 1e7f4a94a07ca3d074d07fb24c6a3d8ed60b812d Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Tue, 19 Dec 2023 13:11:54 -0500 Subject: [PATCH 7/8] clean up code, fix a bug, update some tests, etc --- R/methods-Megastudy.R | 58 +++++++++++++-------------- tests/testthat/test-class-Megastudy.R | 28 +++++++------ 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index 03838a3..ad4837d 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -284,45 +284,45 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata } # for upstream entities data - combinations.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE]) - combinations.dt[[varSpecEntityIdColName]] <- NULL - combinations.dt <- unique(combinations.dt) - veupathUtils::logWithTime(paste("Found", nrow(combinations.dt), "existing variable value combinations."), verbose) + upstreamEntityVariables.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE]) + upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL + upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) + veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose) entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE]) - - # impute zeroes for each study vocab iteratively - makeImputedZeroesDT <- function(variableSpec) { - veupathUtils::logWithTime(paste("Imputing zeroes for", veupathUtils::getColName(variableSpec)), verbose) + # make vocab table for a single variable + makeVocabDT <- function(variableSpec) { + veupathUtils::logWithTime(paste("Finding vocab for", veupathUtils::getColName(variableSpec)), verbose) varSpecColName <- veupathUtils::getColName(variableSpec) vocab <- findStudyVocabularyByVariableSpec(vocabs, variables, variableSpec) vocabs.dt <- veupathUtils::as.data.table(vocab) names(vocabs.dt)[2] <- varSpecColName - vocabs.dt <- merge(entityIds.dt, vocabs.dt, by=studyIdColName, allow.cartesian=TRUE) - present.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColName), with=FALSE]) - # assume if a value was explicitly filtered against that its not in the vocab - add.dt <- vocabs.dt[!present.dt, on=c(upstreamEntityIdColNames, varSpecColName)] - if (nrow(add.dt) > 0) { - add.dt[[weightingVarColName]] <- 0 - } else { - add.dt[[weightingVarColName]] <- numeric() - } - - veupathUtils::logWithTime(paste("Found", nrow(add.dt), "new combinations of values for", veupathUtils::getColName(variableSpec)), verbose) - return(unique(add.dt)) + return(vocabs.dt) } + + # make all possible variable value combinations table + vocabDTs <- lapply(variableSpecsToImputeZeroesFor, makeVocabDT) + allCombinations.dt <- purrr::reduce(vocabDTs, merge, allow.cartesian=TRUE) - dataTablesOfImputedValues <- lapply(variableSpecsToImputeZeroesFor, makeImputedZeroesDT) - mergeDTsOfImputedValues <- function(x,y) { - merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE, all=TRUE) + # find which ones we need to add + presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE]) + # need upstream entity ids for all combinations in order to properly find and merge missing values + allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, allow.cartesian=TRUE) + # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab + addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)] + + if (nrow(addCombinations.dt) == 0) { + veupathUtils::logWithTime("No new combinations to add. Returning existing table.", verbose) + return(.dt) + } else { + veupathUtils::logWithTime(paste("Adding", nrow(addCombinations.dt), "new combinations."), verbose) } - .dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues) - veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose) - #make impossibly unique ids - .dt2[[varSpecEntityIdColName]] <- apply(.dt2[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo='md5') - .dt2 <- unique(merge(.dt2, combinations.dt, by=upstreamEntityIdColNames)) - .dt <- rbind(.dt, .dt2) + # go ahead and add them, first filling in values for all columns + addCombinations.dt[[weightingVarColName]] <- 0 + addCombinations.dt[[varSpecEntityIdColName]] <- apply(addCombinations.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo="md5") + # bind them to the existing rows + .dt <- rbind(.dt, addCombinations.dt) veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose) return(.dt) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index 0677aa5..ef3607b 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -148,9 +148,9 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) = 21 - expect_equal(nrow(imputedDT), 21) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) + # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42 + expect_equal(nrow(imputedDT), 42) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) # collection entity var is present variables <- new("VariableMetadataList", SimpleList( @@ -186,8 +186,8 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 21) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) + expect_equal(nrow(imputedDT), 42) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) # both collection and study entity vars are present variables <- new("VariableMetadataList", SimpleList( @@ -229,8 +229,8 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant","study.author") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 21) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) + expect_equal(nrow(imputedDT), 42) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) # all values in vocab already present megastudyDTSMALL <- rbind(megastudyDT, @@ -294,7 +294,7 @@ test_that("imputeZeroes method is sane", { expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE) # 2 species * 2 sexes * 2 collections * 2 studies = 16 expect_equal(nrow(imputedDT), 16) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 8) # no weighting var in plot variables <- new("VariableMetadataList", SimpleList( @@ -460,8 +460,8 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) expect_equal(all(c("sample.species","sample.specimen_count","sample.sex","collection.attractant") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 21) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15) + expect_equal(nrow(imputedDT), 42) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) # special vocab on sample, regular weighting var on assay # phase this in @@ -645,8 +645,10 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes - # TODO lol its just possible this fxn shouldnt remove cols but my brain hurts enough already expect_equal(all(c("assay.pathogen_presence","assay.pathogen2_presence","assay.pathogen3_presence","sample.specimen_count") %in% names(imputedDT)), TRUE) - expect_equal(nrow(imputedDT), 12) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6) + # 2 studies * 2 collections per study * 2 values for each of 3 pathogen variables = 32? + # im not sure this test makes any sense, bc were imputing 0 on a sample for a collection on assay + # im going to comment until we see a real use case + #expect_equal(nrow(imputedDT), 32) + #expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 26) }) \ No newline at end of file From c0c47c0929a9910dfcafb6f665b248a263755d8a Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Thu, 21 Dec 2023 09:30:03 -0500 Subject: [PATCH 8/8] remove duplicate entry from variables in the megastudy tests --- tests/testthat/test-class-Megastudy.R | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index ef3607b..f0bc89c 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -599,15 +599,7 @@ test_that("imputeZeroes method is sane", { variableSpec = new("VariableSpec", variableId = 'attractant', entityId = 'collection'), plotReference = new("PlotReference", value = 'facet1'), dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL')), - new("VariableMetadata", - variableClass = new("VariableClass", value = 'native'), - variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), - # empty plotReference means that it is not plotted - dataType = new("DataType", value = 'STRING'), - dataShape = new("DataShape", value = 'CATEGORICAL'), - weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), - hasStudyDependentVocabulary = TRUE) + dataShape = new("DataShape", value = 'CATEGORICAL')) )) expect_error(getDTWithImputedZeroes(m, variables, FALSE))