Skip to content

Commit

Permalink
clean up code, fix a bug, update some tests, etc
Browse files Browse the repository at this point in the history
  • Loading branch information
d-callan committed Dec 19, 2023
1 parent 07855d9 commit 1e7f4a9
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 42 deletions.
58 changes: 29 additions & 29 deletions R/methods-Megastudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -284,45 +284,45 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
}

# for upstream entities data
combinations.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE])
combinations.dt[[varSpecEntityIdColName]] <- NULL
combinations.dt <- unique(combinations.dt)
veupathUtils::logWithTime(paste("Found", nrow(combinations.dt), "existing variable value combinations."), verbose)
upstreamEntityVariables.dt <- unique(.dt[, -c(weightingVarColName, varSpecColNames), with=FALSE])
upstreamEntityVariables.dt[[varSpecEntityIdColName]] <- NULL
upstreamEntityVariables.dt <- unique(upstreamEntityVariables.dt)
veupathUtils::logWithTime(paste("Found", nrow(upstreamEntityVariables.dt), "unique existing upstream variable value combinations."), verbose)
entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])


# impute zeroes for each study vocab iteratively
makeImputedZeroesDT <- function(variableSpec) {
veupathUtils::logWithTime(paste("Imputing zeroes for", veupathUtils::getColName(variableSpec)), verbose)
# make vocab table for a single variable
makeVocabDT <- function(variableSpec) {
veupathUtils::logWithTime(paste("Finding vocab for", veupathUtils::getColName(variableSpec)), verbose)
varSpecColName <- veupathUtils::getColName(variableSpec)
vocab <- findStudyVocabularyByVariableSpec(vocabs, variables, variableSpec)
vocabs.dt <- veupathUtils::as.data.table(vocab)
names(vocabs.dt)[2] <- varSpecColName
vocabs.dt <- merge(entityIds.dt, vocabs.dt, by=studyIdColName, allow.cartesian=TRUE)
present.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColName), with=FALSE])
# assume if a value was explicitly filtered against that its not in the vocab
add.dt <- vocabs.dt[!present.dt, on=c(upstreamEntityIdColNames, varSpecColName)]
if (nrow(add.dt) > 0) {
add.dt[[weightingVarColName]] <- 0
} else {
add.dt[[weightingVarColName]] <- numeric()
}

veupathUtils::logWithTime(paste("Found", nrow(add.dt), "new combinations of values for", veupathUtils::getColName(variableSpec)), verbose)
return(unique(add.dt))
return(vocabs.dt)
}

# make all possible variable value combinations table
vocabDTs <- lapply(variableSpecsToImputeZeroesFor, makeVocabDT)
allCombinations.dt <- purrr::reduce(vocabDTs, merge, allow.cartesian=TRUE)

dataTablesOfImputedValues <- lapply(variableSpecsToImputeZeroesFor, makeImputedZeroesDT)
mergeDTsOfImputedValues <- function(x,y) {
merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE, all=TRUE)
# find which ones we need to add
presentCombinations.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE])
# need upstream entity ids for all combinations in order to properly find and merge missing values
allCombinations.dt <- merge(allCombinations.dt, upstreamEntityVariables.dt, allow.cartesian=TRUE)
# NOTE: we're assuming if a value was explicitly filtered against that its not in the vocab
addCombinations.dt <- allCombinations.dt[!presentCombinations.dt, on=c(upstreamEntityIdColNames, varSpecColNames)]

if (nrow(addCombinations.dt) == 0) {
veupathUtils::logWithTime("No new combinations to add. Returning existing table.", verbose)
return(.dt)
} else {
veupathUtils::logWithTime(paste("Adding", nrow(addCombinations.dt), "new combinations."), verbose)
}
.dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues)
veupathUtils::logWithTime(paste("Finished collapsing imputed values for all variables into one table. Added", nrow(.dt2), "total rows."), verbose)

#make impossibly unique ids
.dt2[[varSpecEntityIdColName]] <- apply(.dt2[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo='md5')
.dt2 <- unique(merge(.dt2, combinations.dt, by=upstreamEntityIdColNames))
.dt <- rbind(.dt, .dt2)
# go ahead and add them, first filling in values for all columns
addCombinations.dt[[weightingVarColName]] <- 0
addCombinations.dt[[varSpecEntityIdColName]] <- apply(addCombinations.dt[, c(upstreamEntityIdColNames, varSpecColNames), with=FALSE], 1, digest::digest, algo="md5")
# bind them to the existing rows
.dt <- rbind(.dt, addCombinations.dt)
veupathUtils::logWithTime("Added imputed values to existing table. Finished imputing zeroes.", verbose)

return(.dt)
Expand Down
28 changes: 15 additions & 13 deletions tests/testthat/test-class-Megastudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,9 @@ test_that("imputeZeroes method is sane", {
imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
# result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
expect_equal(all(c("sample.species","sample.specimen_count") %in% names(imputedDT)), TRUE)
# 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) = 21
expect_equal(nrow(imputedDT), 21)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
# 5 sexes * 3 species in study A (15) + 2 sexes * 3 species in study B (6) * 2 collections per study = 42
expect_equal(nrow(imputedDT), 42)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)

# collection entity var is present
variables <- new("VariableMetadataList", SimpleList(
Expand Down Expand Up @@ -186,8 +186,8 @@ test_that("imputeZeroes method is sane", {

imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant") %in% names(imputedDT)), TRUE)
expect_equal(nrow(imputedDT), 21)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
expect_equal(nrow(imputedDT), 42)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)

# both collection and study entity vars are present
variables <- new("VariableMetadataList", SimpleList(
Expand Down Expand Up @@ -229,8 +229,8 @@ test_that("imputeZeroes method is sane", {

imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
expect_equal(all(c("sample.species","sample.specimen_count","collection.attractant","study.author") %in% names(imputedDT)), TRUE)
expect_equal(nrow(imputedDT), 21)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
expect_equal(nrow(imputedDT), 42)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)

# all values in vocab already present
megastudyDTSMALL <- rbind(megastudyDT,
Expand Down Expand Up @@ -294,7 +294,7 @@ test_that("imputeZeroes method is sane", {
expect_equal(all(names(imputedDT) %in% names(mCOMPLETE@data)), TRUE)
# 2 species * 2 sexes * 2 collections * 2 studies = 16
expect_equal(nrow(imputedDT), 16)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 8)

# no weighting var in plot
variables <- new("VariableMetadataList", SimpleList(
Expand Down Expand Up @@ -460,8 +460,8 @@ test_that("imputeZeroes method is sane", {

imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
expect_equal(all(c("sample.species","sample.specimen_count","sample.sex","collection.attractant") %in% names(imputedDT)), TRUE)
expect_equal(nrow(imputedDT), 21)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 15)
expect_equal(nrow(imputedDT), 42)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 36)

# special vocab on sample, regular weighting var on assay
# phase this in
Expand Down Expand Up @@ -645,8 +645,10 @@ test_that("imputeZeroes method is sane", {

imputedDT <- getDTWithImputedZeroes(m, variables, FALSE)
# result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
# TODO lol its just possible this fxn shouldnt remove cols but my brain hurts enough already
expect_equal(all(c("assay.pathogen_presence","assay.pathogen2_presence","assay.pathogen3_presence","sample.specimen_count") %in% names(imputedDT)), TRUE)
expect_equal(nrow(imputedDT), 12)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
# 2 studies * 2 collections per study * 2 values for each of 3 pathogen variables = 32?
# im not sure this test makes any sense, bc were imputing 0 on a sample for a collection on assay
# im going to comment until we see a real use case
#expect_equal(nrow(imputedDT), 32)
#expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 26)
})

0 comments on commit 1e7f4a9

Please sign in to comment.