Skip to content

Commit

Permalink
Merge pull request #326 from EGA-archive/develop
Browse files — browse the repository at this point in the history
Optimising indexing
  • Loading branch information
costero-e authored May 17, 2024
2 parents 196c930 + 1f5a9a6 commit ea9ba3e
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 17 deletions.
36 changes: 29 additions & 7 deletions beacon/db/extract_filtering_terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,31 @@ def __call__(self, block_num: int, block_size: int, total_size: int):


def get_ontology_field_name(ontology_id:str, term_id:str, collection:str):

query = {
'$text': {
'$search': '\"' + ontology_id + ":" + term_id + '\"'
}
}
biosamples=['biosampleStatus.id','diagnosticMarkers.id','histologicalDiagnosis.id','measurements.assayCode.id','measurements.measurementValue.id','measurements.measurementValue.referenceRange.unit.id','measurements.measurementValue.typedQuantities.quantity.unit.id','measurements.measurementValue.unit.id','measurements.observationMoment.id','measurements.procedure.bodySite.id','measurements.procedure.procedureCode.id','pathologicalStage.id','pathologicalTnmFinding.id','phenotypicFeatures.evidence.evidenceCode.id','phenotypicFeatures.evidence.reference.id','phenotypicFeatures.featureType.id','phenotypicFeatures.modifiers.id','phenotypicFeatures.onset.id','phenotypicFeatures.resolution.id','phenotypicFeatures.severity.id','sampleOriginDetail.id','sampleOriginType.id','sampleProcessing.id','sampleStorage.id','tumorGrade.id','tumorProgression.id']
cohorts=['cohortDataTypes.id','cohortDesign.id','exclusionCriteria.diseaseConditions.diseaseCode.id','exclusionCriteria.diseaseConditions.severity.id','exclusionCriteria.diseaseConditions.stage.id','exclusionCriteria.ethnicities.id','exclusionCriteria.genders.id','exclusionCriteria.locations.id','exclusionCriteria.phenotypicConditions.featureType.id','exclusionCriteria.phenotypicConditions.severity.id','inclusionCriteria.diseaseConditions.diseaseCode.id','inclusionCriteria.diseaseConditions.severity.id','inclusionCriteria.diseaseConditions.stage.id','inclusionCriteria.ethnicities.id','inclusionCriteria.genders.id','inclusionCriteria.locations.id','inclusionCriteria.phenotypicConditions.featureType.id','inclusionCriteria.phenotypicConditions.severity.id']
datasets=['dataUseConditions.duoDataUse.id']
genomicVariations=['caseLevelData.alleleOrigin.id','caseLevelData.clinicalInterpretations.category.id','caseLevelData.clinicalInterpretations.effect.id','caseLevelData.clinicalInterpretations.evidenceType.id','caseLevelData.id','caseLevelData.phenotypicEffects.category.id','caseLevelData.phenotypicEffects.effect.id','caseLevelData.phenotypicEffects.evidenceType.id','caseLevelData.zygosity.id','identifiers.variantAlternativeIds.id','molecularAttributes.molecularEffects.id','variantLevelData.clinicalInterpretations.category.id','variantLevelData.clinicalInterpretations.effect.id','variantLevelData.clinicalInterpretations.evidenceType.id','variantLevelData.phenotypicEffects.category.id','variantLevelData.phenotypicEffects.effect.id','variantLevelData.phenotypicEffects.evidenceType.id']
individuals=['diseases.ageOfOnset.id','diseases.diseaseCode.id','diseases.severity.id','diseases.stage.id','ethnicity.id','exposures.exposureCode.id','exposures.unit.id','geographicOrigin.id','interventionsOrProcedures.ageAtProcedure.id','interventionsOrProcedures.bodySite.id','interventionsOrProcedures.procedureCode.id','measures.assayCode.id','measures.measurementValue.id','measures.measurementValue.typedQuantities.quantity.unit.id','measures.measurementValue.unit.id','measures.observationMoment.id','measures.procedure.bodySite.id','measures.procedure.procedureCode.id','pedigrees.disease.diseaseCode.id','pedigrees.disease.severity.id','pedigrees.disease.stage.id','pedigrees.id','pedigrees.members.role.id','phenotypicFeatures.evidence.evidenceCode.id','phenotypicFeatures.evidence.reference.id','phenotypicFeatures.featureType.id','phenotypicFeatures.modifiers.id','phenotypicFeatures.onset.id','phenotypicFeatures.resolution.id','phenotypicFeatures.severity.id','sex.id','treatments.cumulativeDose.referenceRange.id','treatments.doseIntervals.id','treatments.routeOfAdministration.id','treatments.treatmentCode.id']
runs=['librarySource.id','platformModel.id']
array=[]
if collection == 'biosamples':
array=biosamples
elif collection == 'cohorts':
array=cohorts
elif collection == 'datasets':
array=datasets
elif collection == 'genomicVariations':
array=genomicVariations
elif collection == 'individuals':
array=individuals
elif collection == 'runs':
array=runs
query={}
query['$or']=[]
for field in array:
fieldquery={}
fieldquery[field]=ontology_id + ":" + term_id
query['$or'].append(fieldquery)
results = client.beacon.get_collection(collection).find(query).limit(1)
results = list(results)
results = dumps(results)
Expand Down Expand Up @@ -232,6 +251,8 @@ def find_ontology_terms_used(collection_name: str) -> List[Dict]:
if term not in terms_ids:
terms_ids.append(term)
i += 10000
if i > 30000:
break
print(i)
else:
xs = client.beacon.get_collection(collection_name).find().skip(0).limit(10000)
Expand Down Expand Up @@ -382,7 +403,8 @@ def merge_terms():
'scopes': array_of_scopes
})
client.beacon.filtering_terms.delete_many({"id": repeated_id})
client.beacon.filtering_terms.insert_many(new_terms)
if new_terms != []:
client.beacon.filtering_terms.insert_many(new_terms)



Expand Down
20 changes: 10 additions & 10 deletions beacon/reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,19 @@
client.beacon.validate_collection("similarities")
except Exception:
db=client.beacon.create_collection(name="similarities")
client.beacon.analyses.create_index([("$**", "text")])
client.beacon.biosamples.create_index([("$**", "text")])
client.beacon.cohorts.create_index([("$**", "text")])
client.beacon.datasets.create_index([("$**", "text")])
client.beacon.genomicVariations.create_index([("$**", "text")])
client.beacon.genomicVariations.create_index([("caseLevelData.biosampleId", 1)])
client.beacon.genomicVariations.create_index([("variation.location.interval.end.value", -1), ("variation.location.interval.start.value", 1)])
#client.beacon.analyses.create_index([("$**", "text")])
#client.beacon.biosamples.create_index([("$**", "text")])
#client.beacon.cohorts.create_index([("$**", "text")])
#client.beacon.datasets.create_index([("$**", "text")])
#client.beacon.genomicVariations.create_index([("$**", "text")])
#client.beacon.genomicVariations.create_index([("caseLevelData.biosampleId", 1)])
#client.beacon.genomicVariations.create_index([("variation.location.interval.end.value", -1), ("variation.location.interval.start.value", 1)])
client.beacon.genomicVariations.create_index([("variantInternalId", 1), ("caseLevelData.biosampleId", 1)])
client.beacon.genomicVariations.create_index([("identifiers.genomicHGVSId", 1), ("variation.location.interval.start.value", 1), ("caseLevelData.biosampleId", 1), ("variation.referenceBases", 1), ("variation.alternateBases", 1)])
#client.beacon.genomicVariations.create_index([("identifiers.genomicHGVSId", 1), ("variation.location.interval.start.value", 1), ("caseLevelData.biosampleId", 1), ("variation.referenceBases", 1), ("variation.alternateBases", 1)])
client.beacon.genomicVariations.create_index([("variation.location.interval.end.value", -1), ("variation.location.interval.start.value", 1), ("variation.referenceBases", 1), ("variation.alternateBases", 1)])
client.beacon.genomicVariations.create_index([("molecularAttributes.geneIds", 1), ("variantInternalId", 1), ("variation.variantType", 1)])
client.beacon.individuals.create_index([("$**", "text")])
client.beacon.runs.create_index([("$**", "text")])
#client.beacon.individuals.create_index([("$**", "text")])
#client.beacon.runs.create_index([("$**", "text")])
#collection_name = client.beacon.analyses
#print(collection_name.index_information())

0 comments on commit ea9ba3e

Please sign in to comment.