Skip to content

Commit

Permalink
Merge pull request #26 from VEuPathDB/impute-zeroes-variable-collections
Browse files Browse the repository at this point in the history
Impute zeroes variable collections
  • Loading branch information
d-callan authored Sep 7, 2023
2 parents 90797be + f7ccbdf commit 3531e79
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 17 deletions.
95 changes: 81 additions & 14 deletions R/methods-Megastudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#' @return character
#' @export
setGeneric("getVariableSpec",
function(object) standardGeneric("getVariableSpec"),
function(object, ...) standardGeneric("getVariableSpec"),
signature = "object"
)

Expand All @@ -22,6 +22,26 @@ setMethod('getVariableSpec', signature('ANY'), function(object) {
return(object@variableSpec)
})

# this might be unexpected behavior. should there be a param to choose between the collection and its member specs?
#' @export
setMethod('getVariableSpec', signature('VariableMetadata'), function(object, getCollectionMemberVarSpecs = c(FALSE, TRUE)) {
getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)
varSpecs <- list(object@variableSpec)

if (object@isCollection && getCollectionMemberVarSpecs) {
varSpecs <- as.list(object@members)
}

return(varSpecs)
})

#' @export
setMethod('getVariableSpec', signature('VariableMetadataList'), function(object, getCollectionMemberVarSpecs = c(TRUE, FALSE)) {
getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)

return(unlist(lapply(as.list(object), veupathUtils::getVariableSpec, getCollectionMemberVarSpecs)))
})

#' StuydIdColName as String
#'
#' This function returns the studyIdColName from an StudySpecificVocabulary
Expand Down Expand Up @@ -122,16 +142,58 @@ findEntityIdColumnNameForVariableSpec <- function(varSpec, entityIdColumns) {
return(entityIdColumns[grepl(varSpec@entityId, entityIdColumns)])
}

findStudyVocabularyByVariableSpec <- function(vocabs, variableSpec) {
findStudyVocabularyByVariableSpec <- function(vocabs, variables, variableSpec) {
if (!inherits(vocabs, 'StudySpecificVocabulariesByVariableList')) stop("The first argument must be of the S4 class `StudySpecificVocabulariesByVariableList`.")
if (!inherits(variableSpec, 'VariableSpec')) stop("The second argument must be of the S4 class `VariableSpec`.")
if (!inherits(variables, 'VariableMetadataList')) stop("The second argument must be of the S4 class `VariableMetadataList`.")
if (!inherits(variableSpec, 'VariableSpec')) stop("The third argument must be of the S4 class `VariableSpec`.")

vocabVariableSpecs <- lapply(as.list(vocabs), veupathUtils::getVariableSpec)
index <- which(purrr::map(vocabVariableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(variableSpec))
vocabVariableMetadata <- veupathUtils::findVariableMetadataFromVariableSpec(variables, veupathUtils::VariableSpecList(S4Vectors::SimpleList(vocabVariableSpecs)))
vocabVariableSpecsAdjustedForVariableCollectionMembers <- veupathUtils::getVariableSpec(vocabVariableMetadata, TRUE)

# if we have found variable collection members in the VariableMetadata, need to check if the passed varspec was a member
# look through the list that includes the members, and if we match one, get the varspec of the parent/ collection
# use the varspec of the parent/ collection to get the VariableMetadata associated w the entire collection
# remember, individual members dont have their own VariableMetadata
if (!identical(vocabVariableSpecs, vocabVariableSpecsAdjustedForVariableCollectionMembers)) {
index <- which(purrr::map(vocabVariableSpecsAdjustedForVariableCollectionMembers, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(variableSpec))
variableCollectionSpecs <- vocabVariableSpecsAdjustedForVariableCollectionMembers[[index]]
index <- which(purrr::map(vocabVariableMetadata, function(x) {veupathUtils::getColName(variableCollectionSpecs) %in% unlist(veupathUtils::getColName(x@members))}) == TRUE)
} else {
index <- which(purrr::map(vocabVariableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(variableSpec))
}

return(vocabs[[index]])
}


findVariableSpecsFromStudyVocabulary <- function(vocabs, variables, getCollectionMemberVarSpecs = c(TRUE, FALSE)) {
if (!inherits(vocabs, 'StudySpecificVocabulariesByVariableList')) stop("The first argument must be of the S4 class `StudySpecificVocabulariesByVariableList`.")
if (!inherits(variables, 'VariableMetadataList')) stop("The second argument must be of the S4 class `VariableMetadataList`.")
getCollectionMemberVarSpecs <- veupathUtils::matchArg(getCollectionMemberVarSpecs)

varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(lapply(as.list(vocabs), getVariableSpec)))

if (getCollectionMemberVarSpecs) {
varMetadataWithVocabs <- findVariableMetadataFromVariableSpec(variables, varSpecsWithVocabs)
varSpecsWithVocabs <- VariableSpecList(S4Vectors::SimpleList(getVariableSpec(varMetadataWithVocabs, getCollectionMemberVarSpecs)))
}

return(varSpecsWithVocabs)
}

getVariableColumnNames <- function(variableMetadata) {
if (!inherits(variableMetadata, 'VariableMetadata')) stop("The specified object must be of the S4 class `VariableMetadata`.")

colNames <- veupathUtils::getColName(variableMetadata@variableSpec)

if (variableMetadata@isCollection) {
colNames <- unlist(lapply(as.list(variableMetadata@members), veupathUtils::getColName))
}

return(colNames)
}

#' Impute Zeroes (on tall data)
#'
#' This function returns a data.table which has explicit zero values
Expand All @@ -156,7 +218,7 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
.dt <- object@data
# TODO feel like im doing this operation a lot.. maybe another method/ helper?
# also, try to figure a way we dont have to do this.. i dont remember why i did this and its inconsistent behavior
variableColumnNames <- unlist(lapply(lapply(as.list(variables), veupathUtils::getVariableSpec), veupathUtils::getColName))
variableColumnNames <- unlist(lapply(as.list(variables), getVariableColumnNames))
allEntityIdColumns <- object@ancestorIdColumns
# drop things that arent in the plot, except ids
.dt <- .dt[, c(variableColumnNames, allEntityIdColumns), with=FALSE]
Expand All @@ -166,8 +228,13 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
# plus going that route means using this class in plot.data means an api change for plot.data
# that api change might be worth making in any case, but not doing it now
variableMetadataNeedingStudyVocabularies <- findStudyDependentVocabularyVariableMetadata(variables)
variableSpecsWithStudyVocabs <- VariableSpecList(S4Vectors::SimpleList(lapply(as.list(vocabs), getVariableSpec)))
variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs)
variableSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, TRUE)
variableCollectionSpecsWithStudyVocabs <- findVariableSpecsFromStudyVocabulary(vocabs, variables, FALSE)
if (is.null(variableCollectionSpecsWithStudyVocabs)) {
variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableSpecsWithStudyVocabs)
} else {
variableMetadataForStudyVocabVariables <- findVariableMetadataFromVariableSpec(variables, variableCollectionSpecsWithStudyVocabs)
}
if (length(variableSpecsWithStudyVocabs) > length(variableMetadataForStudyVocabVariables)) {
warning("Study vocabularies were provided for variables that are not present in the plot. These will be ignored.")
}
Expand All @@ -187,13 +254,13 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
weightingVarColName <- veupathUtils::getColName(findVariableMetadataFromVariableSpec(variables, veupathUtils::getVariableSpec(vocabs[[1]]))[[1]]@weightingVariableSpec)
}

variableSpecsToImputeZeroesFor <- veupathUtils::getVariableSpec(variableMetadataForStudyVocabVariables)
studyIdColName <- getStudyIdColumnName(vocabs)
# TODO should this be made based on variableSpecsTiImputeZeroesFor ??
varSpecColNames <- getVariableSpecColumnName(vocabs)
varSpecColNames <- unlist(lapply(variableSpecsToImputeZeroesFor, veupathUtils::getColName))
# this works bc we validate all vocabs must be on the same entity
varSpecEntityIdColName <- findEntityIdColumnNameForVariableSpec(veupathUtils::getVariableSpec(vocabs[[1]]), allEntityIdColumns)
variablesFromEntityOfInterest <- findVariableMetadataFromEntityId(variables, veupathUtils::getVariableSpec(vocabs[[1]])@entityId)
variableSpecsFromEntityOfInterest <- lapply(as.list(variablesFromEntityOfInterest), getVariableSpec)
variableSpecsFromEntityOfInterest <- veupathUtils::getVariableSpec(variablesFromEntityOfInterest)
if (any(unlist(getHasStudyDependentVocabulary(variablesFromEntityOfInterest)) &
unlist(lapply(variableSpecsFromEntityOfInterest, identical, weightingVarSpecsForStudyVocabVariables[[1]])))) {
stop("Not all variables on the entity associated with the present study vocabulary have study vocabularies.")
Expand All @@ -214,11 +281,12 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
entityIds.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecEntityIdColName), with=FALSE])

# impute zeroes for each study vocab iteratively
variableSpecsToImputeZeroesFor <- lapply(as.list(variableMetadataForStudyVocabVariables), veupathUtils::getVariableSpec)
makeImputedZeroesDT <- function(variableSpec) {
vocab <- findStudyVocabularyByVariableSpec(vocabs, variableSpec)
vocabs.dt <- merge(entityIds.dt, veupathUtils::as.data.table(vocab), by=studyIdColName, allow.cartesian=TRUE)
varSpecColName <- veupathUtils::getColName(variableSpec)
vocab <- findStudyVocabularyByVariableSpec(vocabs, variables, variableSpec)
vocabs.dt <- veupathUtils::as.data.table(vocab)
names(vocabs.dt)[2] <- varSpecColName
vocabs.dt <- merge(entityIds.dt, vocabs.dt, by=studyIdColName, allow.cartesian=TRUE)
present.dt <- unique(.dt[, c(upstreamEntityIdColNames, varSpecColName), with=FALSE])
# assume if a value was explicitly filtered against that its not in the vocab
add.dt <- vocabs.dt[!present.dt, on=c(upstreamEntityIdColNames, varSpecColName)]
Expand All @@ -232,7 +300,6 @@ setMethod('getDTWithImputedZeroes', signature = c('Megastudy', 'VariableMetadata
}
dataTablesOfImputedValues <- lapply(variableSpecsToImputeZeroesFor, makeImputedZeroesDT)
mergeDTsOfImputedValues <- function(x,y) {
# TODO test this merge, not sure i got it right..
merge(x, y, by = c(upstreamEntityIdColNames, varSpecEntityIdColName, weightingVarColName), allow.cartesian=TRUE)
}
.dt2 <- purrr::reduce(dataTablesOfImputedValues, mergeDTsOfImputedValues)
Expand Down
11 changes: 8 additions & 3 deletions R/methods-VariableMetadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,11 @@ setMethod("getColName", signature("VariableSpec"), function(varSpec) {
return(veupathUtils::toStringOrNull(paste0(entityId, ".", varId)))
})

#' @export
setMethod("getColName", signature("VariableSpecList"), function(varSpec) {
lapply(as.list(varSpec), veupathUtils::getColName)
})

#' @export
setMethod("getColName", signature("NULL"), function(varSpec) {
NULL
Expand Down Expand Up @@ -628,13 +633,13 @@ setMethod("findColNamesByPredicate", signature("VariableMetadataList"), function

#' @export
setGeneric("findVariableMetadataFromVariableSpec",
function(variables, object) standardGeneric("findVariableMetadataFromVariableSpec"),
function(variables, object, ...) standardGeneric("findVariableMetadataFromVariableSpec"),
signature = c("variables","object")
)

#' @export
setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpecList"), function(variables, object) {
variableSpecs <- lapply(as.list(variables), veupathUtils::getVariableSpec)
variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, FALSE))
colNamesToMatch <- unlist(lapply(as.list(object), veupathUtils::getColName))

index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) %in% colNamesToMatch)
Expand All @@ -645,7 +650,7 @@ setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataLis

#' @export
setMethod("findVariableMetadataFromVariableSpec", signature("VariableMetadataList", "VariableSpec"), function(variables, object) {
variableSpecs <- lapply(as.list(variables), veupathUtils::getVariableSpec)
variableSpecs <- unlist(lapply(as.list(variables), veupathUtils::getVariableSpec, FALSE))

index <- which(purrr::map(variableSpecs, function(x) {veupathUtils::getColName(x)}) == veupathUtils::getColName(object))
if (!length(index)) return(NULL)
Expand Down
42 changes: 42 additions & 0 deletions tests/testthat/test-class-Megastudy.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ megastudyDT <- data.table('study.id'=c('a','a','a','b','b','b'),
'assay.id'=c(11,12,13,14,15,16),
'assay.pathogen_prevalence'=c(.1,.2,.3,.4,.5,.6),
'assay.pathogen_presence'=c('Yes','Yes','No','No','Yes','No'),
'assay.pathogen2_presence'=c('Yes','No','Yes','No','Yes','No'),
'assay.pathogen3_presence'=c('No','Yes','No','Yes','No','Yes'),
'assay.weighting_variable'=c(5,10,15,20,25,30))

studyAspecies <- StudySpecificVocabulary(studyIdColumnName='study.id', study='a', variableSpec=VariableSpec(entityId='sample',variableId='species'), vocabulary=c('species1','species2','species3'))
Expand Down Expand Up @@ -219,6 +221,8 @@ test_that("imputeZeroes method is sane", {
assay.id=c(17,18),
assay.pathogen_prevalence=c(.7,.8),
assay.pathogen_presence=c('No','Yes'),
assay.pathogen2_presence=c('Yes','No'),
assay.pathogen3_presence=c('No','Yes'),
assay.weighting_variable=c(35,40)))

mCOMPLETE <- Megastudy(data=megastudyDTSMALL,
Expand Down Expand Up @@ -544,4 +548,42 @@ test_that("imputeZeroes method is sane", {
))

expect_error(getDTWithImputedZeroes(m, variables))

# variable collection exists in plot
pathogenVariableCollectionVocabs <- StudySpecificVocabulariesByVariable(S4Vectors::SimpleList(StudySpecificVocabulary(studyIdColumnName='study.id', study='a', variableSpec=VariableSpec(entityId='assay',variableId='pathogen_presence_variable_collection'), vocabulary=c('Yes','No')),
StudySpecificVocabulary(studyIdColumnName='study.id', study='b', variableSpec=VariableSpec(entityId='assay',variableId='pathogen_presence_variable_collection'), vocabulary=c('Yes','No'))))


m <- Megastudy(data=megastudyDT,
ancestorIdColumns=c('study.id', 'collection.id', 'sample.id', 'assay.id'),
studySpecificVocabularies=StudySpecificVocabulariesByVariableList(S4Vectors::SimpleList(pathogenVariableCollectionVocabs)))

variables <- new("VariableMetadataList", SimpleList(
new("VariableMetadata",
variableClass = new("VariableClass", value = 'native'),
variableSpec = new("VariableSpec", variableId = 'pathogen_presence_variable_collection', entityId = 'assay'),
plotReference = new("PlotReference", value = 'overlay'),
dataType = new("DataType", value = 'STRING'),
dataShape = new("DataShape", value = 'CATEGORICAL'),
isCollection = TRUE,
weightingVariableSpec = VariableSpec(variableId='specimen_count',entityId='sample'),
hasStudyDependentVocabulary = TRUE,
members = VariableSpecList(S4Vectors::SimpleList(VariableSpec(variableId='pathogen_presence', entityId='assay'),
VariableSpec(variableId='pathogen2_presence', entityId='assay'),
VariableSpec(variableId='pathogen3_presence', entityId='assay')))
),
new("VariableMetadata",
variableClass = new("VariableClass", value = 'native'),
variableSpec = new("VariableSpec", variableId = 'specimen_count', entityId = 'sample'),
plotReference = new("PlotReference", value = 'yAxis'),
dataType = new("DataType", value = 'NUMBER'),
dataShape = new("DataShape", value = 'CONTINUOUS'))
))

imputedDT <- getDTWithImputedZeroes(m, variables)
# result has the columns needed to build a plot, based on variables AND the correct number of rows/ zeroes
# TODO lol its just possible this fxn shouldnt remove cols but my brain hurts enough already
expect_equal(all(c("assay.pathogen_presence","assay.pathogen2_presence","assay.pathogen3_presence","sample.specimen_count") %in% names(imputedDT)), TRUE)
expect_equal(nrow(imputedDT), 12)
expect_equal(nrow(imputedDT[imputedDT$sample.specimen_count == 0]), 6)
})

0 comments on commit 3531e79

Please sign in to comment.