From 27d2c6f0de757a873cbe8957b33a6a1038b8da4e Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 10 Apr 2024 21:04:43 -0400 Subject: [PATCH 01/12] draft update to megastudy class to specify collection ids --- R/class-Megastudy.R | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R index 4c97a54..40e9cd7 100644 --- a/R/class-Megastudy.R +++ b/R/class-Megastudy.R @@ -74,6 +74,26 @@ check_megastudy <- function(object) { errors <- c(errors, msg) } + if (!!length(object@collectionIdColumn)) { + + if (length(object@collectionIdColumn) != 1) { + msg <- paste("Collection ID column must have a single value.") + errors <- c(errors, msg) + } + + if (!object@collectionIdColumn %in% ancestor_id_cols) { + msg <- paste("Collection ID column must be an ancestor ID column") + errors <- c(errors, msg) + } + + if (!!length(object@collectionsDT)) { + if (!any(object@collectionsDT[[object@collectionIdColumn]] %in% df[[object@collectionIdColumn]])) { + msg <- paste("Collection IDs not found in data.frame. At least some of the provided collection IDs should be present in the data.frame.") + errors <- c(errors, msg) + } + } + } + return(if (length(errors) == 0) TRUE else errors) } @@ -85,7 +105,11 @@ check_megastudy <- function(object) { #' that data. #' #' @slot data A data.table +#' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn. #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList +#' @slot collectionIdColumn The name of the column in the data.frame that contains the collection ids. +#' The collectionId should also be a member of ancestorIdColumns. +#' @slot collectionIds A character vector of collection ids we expect. If none provided, they will be inferred from those present. #' #' @name Megastudy-class #' @rdname Megastudy-class @@ -93,5 +117,7 @@ check_megastudy <- function(object) { Megastudy <- setClass("Megastudy", representation( data = 'data.table', ancestorIdColumns = 'character', - studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList' + studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList', + collectionIdColumn = 'character', + collectionsDT = 'data.frame' ), validity = check_megastudy) \ No newline at end of file From dba1e40b7d6bd1a85cb9bc1b7a8bf9e08054a718 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 10 Apr 2024 21:05:10 -0400 Subject: [PATCH 02/12] draft using collection ids to inform impute zero on megastudies --- R/methods-Megastudy.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index 63ae5e4..998841a 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -260,14 +260,16 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE]) # make all possible variable value combinations table - #vocabDTs <- lapply(variableSpecsToImputeZeroesFor, makeVocabDT) + studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered vocabDTs <- lapply(vocabs, function(x) {x@studyVocab}) - allCombinations.dt <- purrr::reduce(vocabDTs, merge, allow.cartesian=TRUE, all=TRUE) + vocabDTs <- lapply(vocabDTs, function(x) { merge(x, object@collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) }) + allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = upstreamEntityIdColNames, allow.cartesian=TRUE, all=TRUE) # find which ones we need to add presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE]) # need upstream entity ids for all combinations in order to properly find and merge missing values - allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, allow.cartesian=TRUE) + allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = upstreamEntityIdColNames, all = TRUE, allow.cartesian=TRUE) + ## TODO figure how to populate study and collection entity variable values based on ids # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)] @@ -284,6 +286,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata # bind them to the existing rows .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE) veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose) - + return(.dt) }) From acdf56fac0d781d43471caafcbd5476383862e01 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 10 Apr 2024 21:05:28 -0400 Subject: [PATCH 03/12] draft test for collection id aware megastudy imputing zeroes --- tests/testthat/test-class-Megastudy.R | 46 +++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index b224c4f..dcddf5b 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -13,6 +13,10 @@ megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'), 'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'), 'assay.weighting_variable'=c(5,10,15,20,25,30)) +collectionsDT <- data.table( + 'study.id' = c('a', 'a', 'a','b', 'b', 'b', 'b'), + 'collection.id' = c(1, 2, 3, 1, 2, 3, 4)) + sexVocabs.dt <- data.table::data.table( 'study.id' = c('a', 'a', 'a', 'a', 'a', 'b', 'b'), 'sample.sex' = c('female', 'male', 'non-binary', 'other', 'do not wish to specify', 'male', 'female')) @@ -188,6 +192,48 @@ test_that("imputeZeroes method is sane", { expect_equal(nrow(imputedDT), 42) expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) + # case where some collection ids are missing + # in real life, we have some collections where all samples are 0 and so not loaded + # in this case, the collection ids are missing from the data table R gets handed. + # we want to impute zeroes for their samples anyhow. + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], + ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), + studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)), + collectionIdColumn='collection.id', + collectionsDT=collectionsDT) + + variables <- new("VariableMetadataList", SimpleList( + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), + plotReference = new("PlotReference", value = 'xAxis'), + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), + plotReference = new("PlotReference", value = 'yAxis'), + dataType = new("DataType", value = 'NUMBER'), + dataShape = new("DataShape", value = 'CONTINUOUS')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) + )) + + imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) + # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes + expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) + # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42 + expect_equal(nrow(imputedDT), 42) + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) + # case where one study vocab is missing a study mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE] mDTSexSingleStudy$sample.sex[mDTSexSingleStudy$study.id == 'b'] <- NA_character_ From 842c03d2ad6ea8c902a91013720f63ce684fcb11 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 10 Apr 2024 21:30:04 -0400 Subject: [PATCH 04/12] remove unused collections entity id slot from megastudy class --- R/class-Megastudy.R | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R index 40e9cd7..85f21c7 100644 --- a/R/class-Megastudy.R +++ b/R/class-Megastudy.R @@ -74,24 +74,11 @@ check_megastudy <- function(object) { errors <- c(errors, msg) } - if (!!length(object@collectionIdColumn)) { - - if (length(object@collectionIdColumn) != 1) { - msg <- paste("Collection ID column must have a single value.") + if (!!length(object@collectionsDT)) { + if (!all(ancestor_id_cols[1:length(ancestor_id_cols)-1] %in% names(object@collectionsDT))) { + msg <- paste("Not all ancestor ID columns are present in collection data.frame") errors <- c(errors, msg) } - - if (!object@collectionIdColumn %in% ancestor_id_cols) { - msg <- paste("Collection ID column must be an ancestor ID column") - errors <- c(errors, msg) - } - - if (!!length(object@collectionsDT)) { - if (!any(object@collectionsDT[[object@collectionIdColumn]] %in% df[[object@collectionIdColumn]])) { - msg <- paste("Collection IDs not found in data.frame. At least some of the provided collection IDs should be present in the data.frame.") - errors <- c(errors, msg) - } - } } return(if (length(errors) == 0) TRUE else errors) @@ -107,8 +94,6 @@ check_megastudy <- function(object) { #' @slot data A data.table #' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn. #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList -#' @slot collectionIdColumn The name of the column in the data.frame that contains the collection ids. -#' The collectionId should also be a member of ancestorIdColumns. #' @slot collectionIds A character vector of collection ids we expect. If none provided, they will be inferred from those present. #' #' @name Megastudy-class @@ -118,6 +103,5 @@ Megastudy <- setClass("Megastudy", representation( data = 'data.table', ancestorIdColumns = 'character', studySpecificVocabularies = 'StudySpecificVocabulariesByVariableList', - collectionIdColumn = 'character', collectionsDT = 'data.frame' ), validity = check_megastudy) \ No newline at end of file From 968be24303784fff0880b0075daddb96306d7ba9 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 10 Apr 2024 22:24:09 -0400 Subject: [PATCH 05/12] fix some bugs imputing zeroes for megastudies w explicit collection ids --- R/methods-Megastudy.R | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index 998841a..d4ac92d 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -204,6 +204,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata veupathUtils::logWithTime(paste0("Imputing zeroes for data.table with ", ncol(.dt), " columns and ", nrow(.dt), " rows"), verbose) allEntityIdColumns <- object@ancestorIdColumns vocabs <- object@studySpecificVocabularies + collectionsDT <- object@collectionsDT # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot? # plus going that route means using this class in plot.data means an api change for plot.data @@ -251,24 +252,34 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata veupathUtils::logWithTime("Downstream entities present. No imputation will be done (for now... mwahahaha).", verbose) return(.dt) } + studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered # for upstream entities data - upstreamEntityVariables.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE]) + upstreamEntityVariables.dt <- .dt[, -c(weightingVarColName, varSpecColNames), with=FALSE] upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose) + if (!!length(collectionsDT)) { + upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) + upstreamEntityVariables.dt[[which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T))]] <- NULL + collectionEntityColumnIndex <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T)) + names(upstreamEntityVariables.dt)[collectionEntityColumnIndex] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndex], fixed=T) + } entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE]) # make all possible variable value combinations table - studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered vocabDTs <- lapply(vocabs, function(x) {x@studyVocab}) - vocabDTs <- lapply(vocabDTs, function(x) { merge(x, object@collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) }) - allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = upstreamEntityIdColNames, allow.cartesian=TRUE, all=TRUE) + if (!!length(collectionsDT)) { + vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) }) + } + mergeBy <- studyEntityIdColName + if (!!length(collectionsDT)) mergeBy <- upstreamEntityIdColNames + allCombinations.dt <- purrr::reduce(vocabDTs, merge, by = mergeBy, allow.cartesian=TRUE, all=TRUE) # find which ones we need to add presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE]) # need upstream entity ids for all combinations in order to properly find and merge missing values - allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = upstreamEntityIdColNames, all = TRUE, allow.cartesian=TRUE) + allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = mergeBy, all = TRUE, allow.cartesian=TRUE) ## TODO figure how to populate study and collection entity variable values based on ids # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)] From 0b7c01b1a06626accb81f36f9c888805ee0de4b5 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 10 Apr 2024 22:24:40 -0400 Subject: [PATCH 06/12] fix up some tests --- tests/testthat/test-class-Megastudy.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index dcddf5b..b82e42c 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -83,7 +83,7 @@ test_that("Megastudy and associated validation works", { ancestorIdColumns=c('study.id', 'collection.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs))) - expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies')) + expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT')) expect_equal(length(m@studySpecificVocabularies), 1) expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[1]]@studyVocab[,1]), 2) expect_equal(slotNames(m@studySpecificVocabularies[[1]]), c("studyIdColumnName","variableSpec","studyVocab")) @@ -93,7 +93,7 @@ test_that("Megastudy and associated validation works", { ancestorIdColumns=c('study.id', 'collection.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) - expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies')) + expect_equal(slotNames(m), c('data','ancestorIdColumns','studySpecificVocabularies','collectionsDT')) expect_equal(length(m@studySpecificVocabularies), 2) expect_equal(data.table::uniqueN(m@studySpecificVocabularies[[2]]@studyVocab[,1]), 2) expect_equal(slotNames(m@studySpecificVocabularies[[2]]), c("studyIdColumnName","variableSpec","studyVocab")) @@ -199,7 +199,6 @@ test_that("imputeZeroes method is sane", { m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)), - collectionIdColumn='collection.id', collectionsDT=collectionsDT) variables <- new("VariableMetadataList", SimpleList( From 40200dc9b715791bb5043deba033c43dba5d4c36 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Thu, 11 Apr 2024 12:45:42 -0400 Subject: [PATCH 07/12] fix a bug where we were trying to impute values of collections variables too, oops --- R/methods-Megastudy.R | 10 ++++++---- tests/testthat/test-class-Megastudy.R | 17 +++++++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index d4ac92d..b530a78 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -209,6 +209,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata # it seems a lot of this validation could belong to some custom obj w both a megastudy and vm slot.. but what is that? a MegastudyPlot? # plus going that route means using this class in plot.data means an api change for plot.data # that api change might be worth making in any case, but not doing it now + ## TODO validate that any collections variables are present in collectionsDT variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables) variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Never") variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, "Always") @@ -261,16 +262,17 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose) if (!!length(collectionsDT)) { upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) - upstreamEntityVariables.dt[[which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T))]] <- NULL - collectionEntityColumnIndex <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T)) - names(upstreamEntityVariables.dt)[collectionEntityColumnIndex] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndex], fixed=T) + upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE] + collectionEntityColumnIndices <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T)) + names(upstreamEntityVariables.dt)[collectionEntityColumnIndices] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndices], fixed=T) + upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) } entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE]) # make all possible variable value combinations table vocabDTs <- lapply(vocabs, function(x) {x@studyVocab}) if (!!length(collectionsDT)) { - vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) }) + vocabDTs <- lapply(vocabDTs, function(x) { merge(x, collectionsDT[, upstreamEntityIdColNames], by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) }) } mergeBy <- studyEntityIdColName if (!!length(collectionsDT)) mergeBy <- upstreamEntityIdColNames diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index b82e42c..c9148a8 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -13,9 +13,14 @@ megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'), 'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'), 'assay.weighting_variable'=c(5,10,15,20,25,30)) +## the collectionsDT needs to include all collections, unless explicitly filtered against +## it also needs to tell us values for any collection variables that are in the plot +## were trying to impute samples, based on collections that dont have samples. +## so the only way to get collection variables values for those samples, is to get them from the collectionsDT collectionsDT <- data.table( 'study.id' = c('a', 'a', 'a','b', 'b', 'b', 'b'), - 'collection.id' = c(1, 2, 3, 1, 2, 3, 4)) + 'collection.id' = c(1, 2, 3, 1, 2, 3, 4), + 'collection.attractant' = c('A', 'B', 'C', 'C', 'D', 'D', 'E')) sexVocabs.dt <- data.table::data.table( 'study.id' = c('a', 'a', 'a', 'a', 'a', 'b', 'b'), @@ -229,9 +234,9 @@ test_that("imputeZeroes method is sane", { imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) - # 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42 - expect_equal(nrow(imputedDT), 42) - expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36) + # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69 + expect_equal(nrow(imputedDT), 69) ## TODO check these numbers + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63) # case where one study vocab is missing a study mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE] @@ -249,6 +254,10 @@ test_that("imputeZeroes method is sane", { expect_equal(nrow(imputedDT), 36) expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 30) + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], + ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), + studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs))) + # collection entity var is present variables <- new("VariableMetadataList", SimpleList( new("VariableMetadata", From 691543dc93957650f68c9cac649eb2b89b905060 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Thu, 11 Apr 2024 13:30:42 -0400 Subject: [PATCH 08/12] a perf improvement --- R/methods-Megastudy.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index b530a78..c89d791 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -295,7 +295,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata # go ahead and add them, first filling in values for all columns addCombinations.dt[[weightingVarColName]] <- 0 - addCombinations.dt[[varSpecEntityIdColName]] <- apply(addCombinations.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo="md5") + addCombinations.dt[[varSpecEntityIdColName]] <- stringi::stri_rand_strings(nrow(addCombinations.dt), 10) # bind them to the existing rows .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE) veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose) From 48ae23fce55d0a9eef23672e70cd8c61c177b579 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Thu, 11 Apr 2024 14:04:36 -0400 Subject: [PATCH 09/12] add some validation for collectionsDT --- R/methods-Megastudy.R | 11 ++++++++++- tests/testthat/test-class-Megastudy.R | 5 +++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index c89d791..d650b39 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -255,6 +255,16 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata } studyEntityIdColName <- upstreamEntityIdColNames[1] # still working off the assumption theyre ordered + # variables that are from the upstream entities need to be in collectionsDT + # otherwise we erroneously try to impute values for those variables too, rather than only the weighting variable + + if (!!length(collectionsDT)) { + upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntityIdColNames }) + if (!all(upstreamEntityVariableColNames %in% names(collectionsDT))) { + stop("All variables from the upstream entities must be in collectionsDT.") + } + } + # for upstream entities data upstreamEntityVariables.dt <- .dt[, -c(weightingVarColName, varSpecColNames), with=FALSE] upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL @@ -282,7 +292,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE]) # need upstream entity ids for all combinations in order to properly find and merge missing values allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, by = mergeBy, all = TRUE, allow.cartesian=TRUE) - ## TODO figure how to populate study and collection entity variable values based on ids # NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)] diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index c9148a8..969fed1 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -814,6 +814,7 @@ test_that("we have reasonable perf w a real-ish use case", { ) benchmark <- microbenchmark::microbenchmark(getDTWithImputedZeroes(megastudyReal, megastudyVariablesReal, verbose = FALSE)) - expect_true(mean(benchmark$time)/1000000 < 50) ## this is in milliseconds - expect_true(median(benchmark$time)/1000000 < 50) + print(benchmark) + expect_true(mean(benchmark$time)/1000000 < 55) ## this is in milliseconds + expect_true(median(benchmark$time)/1000000 < 55) }) \ No newline at end of file From ad7efdf33082e8c1041ef7d30ba549f5be77bdf9 Mon Sep 17 00:00:00 2001 From: Danielle Callan <53306535+d-callan@users.noreply.github.com> Date: Sat, 13 Apr 2024 07:01:43 -0400 Subject: [PATCH 10/12] Update R/methods-Megastudy.R Co-authored-by: bobular --- R/methods-Megastudy.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index d650b39..cf0715f 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -272,7 +272,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose) if (!!length(collectionsDT)) { upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) - upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE] + upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE] collectionEntityColumnIndices <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T)) names(upstreamEntityVariables.dt)[collectionEntityColumnIndices] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndices], fixed=T) upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) From 7ab9529242bc8e074421506d12f00ef6e9780ceb Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Mon, 15 Apr 2024 11:07:37 -0400 Subject: [PATCH 11/12] handle case where upstream variables are only in collectionsDT --- R/methods-Megastudy.R | 14 ++++----- tests/testthat/test-class-Megastudy.R | 45 ++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/R/methods-Megastudy.R b/R/methods-Megastudy.R index d650b39..06e80c0 100644 --- a/R/methods-Megastudy.R +++ b/R/methods-Megastudy.R @@ -257,9 +257,9 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata # variables that are from the upstream entities need to be in collectionsDT # otherwise we erroneously try to impute values for those variables too, rather than only the weighting variable - + upstreamEntities <- veupathUtils::strSplit(upstreamEntityIdColNames, ".", 2, 1) if (!!length(collectionsDT)) { - upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntityIdColNames }) + upstreamEntityVariableColNames <- findColNamesByPredicate(variables, function(x) { x@variableSpec@entityId %in% upstreamEntities }) if (!all(upstreamEntityVariableColNames %in% names(collectionsDT))) { stop("All variables from the upstream entities must be in collectionsDT.") } @@ -271,11 +271,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose) if (!!length(collectionsDT)) { - upstreamEntityVariables.dt <- merge(upstreamEntityVariables.dt, collectionsDT, by=studyEntityIdColName, all=TRUE, allow.cartesian=TRUE) - upstreamEntityVariables.dt <- upstreamEntityVariables.dt[, -which(grepl('.x',names(upstreamEntityVariables.dt),fixed=T)), with=FALSE] - collectionEntityColumnIndices <- which(grepl('.y',names(upstreamEntityVariables.dt),fixed=T)) - names(upstreamEntityVariables.dt)[collectionEntityColumnIndices] <- gsub('.y', '', names(upstreamEntityVariables.dt)[collectionEntityColumnIndices], fixed=T) - upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt) + upstreamEntityVariables.dt <- collectionsDT } entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE]) @@ -306,6 +302,10 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata addCombinations.dt[[weightingVarColName]] <- 0 addCombinations.dt[[varSpecEntityIdColName]] <- stringi::stri_rand_strings(nrow(addCombinations.dt), 10) # bind them to the existing rows + upstreamVariablesInCollectionsDT <- names(collectionsDT)[!names(collectionsDT) %in% upstreamEntityIdColNames] + if (!!length(collectionsDT) & !all(upstreamVariablesInCollectionsDT %in% names(.dt))) { + .dt <- merge(.dt, upstreamEntityVariables.dt) + } .dt <- data.table::rbindlist(list(.dt, addCombinations.dt), use.names=TRUE) veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose) diff --git a/tests/testthat/test-class-Megastudy.R b/tests/testthat/test-class-Megastudy.R index 969fed1..ffffead 100644 --- a/tests/testthat/test-class-Megastudy.R +++ b/tests/testthat/test-class-Megastudy.R @@ -201,7 +201,47 @@ test_that("imputeZeroes method is sane", { # in real life, we have some collections where all samples are 0 and so not loaded # in this case, the collection ids are missing from the data table R gets handed. # we want to impute zeroes for their samples anyhow. - m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE], + + # in this version, some collection level variables are in both data and collectionsDT + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant'), with=FALSE], + ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), + studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)), + collectionsDT=collectionsDT) + + variables <- new("VariableMetadataList", SimpleList( + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'species', entityId = 'sample'), + plotReference = new("PlotReference", value = 'xAxis'), + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'), + plotReference = new("PlotReference", value = 'yAxis'), + dataType = new("DataType", value = 'NUMBER'), + dataShape = new("DataShape", value = 'CONTINUOUS')), + new("VariableMetadata", + variableClass = new("VariableClass", value = 'native'), + variableSpec = new("VariableSpec", variableId = 'sex', entityId = 'sample'), + # empty plotReference means that it is not plotted + dataType = new("DataType", value = 'STRING'), + dataShape = new("DataShape", value = 'CATEGORICAL'), + weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'), + hasStudyDependentVocabulary = TRUE) + )) + + imputedDT <- getDTWithImputedZeroes(m, variables, FALSE) + # result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes + expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE) + # 5 sexes * 3 species * 3 collections in study A (45) + 2 sexes * 3 species * 4 collections in study B (30) = 69 + expect_equal(nrow(imputedDT), 69) ## TODO check these numbers + expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63) + + # in this version, collection level variables are only in collectionsDT + m <- Megastudy(data=megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species'), with=FALSE], ancestorIdColumns=c('study.id', 'collection.id', 'sample.id'), studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(speciesVocabs, sexVocabs)), collectionsDT=collectionsDT) @@ -238,6 +278,9 @@ test_that("imputeZeroes method is sane", { expect_equal(nrow(imputedDT), 69) ## TODO check these numbers expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 63) + ################################################################################################################# + + # case where one study vocab is missing a study mDTSexSingleStudy <- megastudyDT[, c('study.id', 'collection.id', 'sample.id', 'sample.specimen_count', 'sample.sex', 'sample.species', 'collection.attractant', 'study.author'), with=FALSE] mDTSexSingleStudy$sample.sex[mDTSexSingleStudy$study.id == 'b'] <- NA_character_ From 6c61e4cc08d7d0803a480dee0483a740fad43b29 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Tue, 16 Apr 2024 14:44:35 -0400 Subject: [PATCH 12/12] update docs --- R/class-Megastudy.R | 3 ++- man/Megastudy-class.Rd | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/R/class-Megastudy.R b/R/class-Megastudy.R index 85f21c7..4ff2df8 100644 --- a/R/class-Megastudy.R +++ b/R/class-Megastudy.R @@ -94,7 +94,8 @@ check_megastudy <- function(object) { #' @slot data A data.table #' @slot ancestorIdColumns A character vector of column names representing parent entities of the recordIdColumn. #' @slot studySpecificVocabularies veupathUtils::StudySpecificVocabulariesByVariableList -#' @slot collectionIds A character vector of collection ids we expect. If none provided, they will be inferred from those present. +#' @slot collectionIds A data.table including collection ids and any variables of interest for the collection entity. +#' If none provided, the collection ids will be inferred from those present in `data`. #' #' @name Megastudy-class #' @rdname Megastudy-class diff --git a/man/Megastudy-class.Rd b/man/Megastudy-class.Rd index 6dbb37f..1527fa1 100644 --- a/man/Megastudy-class.Rd +++ b/man/Megastudy-class.Rd @@ -16,6 +16,11 @@ that data. \describe{ \item{\code{data}}{A data.table} +\item{\code{ancestorIdColumns}}{A character vector of column names representing parent entities of the recordIdColumn.} + \item{\code{studySpecificVocabularies}}{veupathUtils::StudySpecificVocabulariesByVariableList} + +\item{\code{collectionIds}}{A data.table including collection ids and any variables of interest for the collection entity. +If none provided, the collection ids will be inferred from those present in \code{data}.} }}