diff --git a/beacon/db/extract_filtering_terms.py b/beacon/db/extract_filtering_terms.py index 7fbeb334..adcc8347 100644 --- a/beacon/db/extract_filtering_terms.py +++ b/beacon/db/extract_filtering_terms.py @@ -66,12 +66,31 @@ def __call__(self, block_num: int, block_size: int, total_size: int): def get_ontology_field_name(ontology_id:str, term_id:str, collection:str): - - query = { - '$text': { - '$search': '\"' + ontology_id + ":" + term_id + '\"' - } - } + biosamples=['biosampleStatus.id','diagnosticMarkers.id','histologicalDiagnosis.id','measurements.assayCode.id','measurements.measurementValue.id','measurements.measurementValue.referenceRange.unit.id','measurements.measurementValue.typedQuantities.quantity.unit.id','measurements.measurementValue.unit.id','measurements.observationMoment.id','measurements.procedure.bodySite.id','measurements.procedure.procedureCode.id','pathologicalStage.id','pathologicalTnmFinding.id','phenotypicFeatures.evidence.evidenceCode.id','phenotypicFeatures.evidence.reference.id','phenotypicFeatures.featureType.id','phenotypicFeatures.modifiers.id','phenotypicFeatures.onset.id','phenotypicFeatures.resolution.id','phenotypicFeatures.severity.id','sampleOriginDetail.id','sampleOriginType.id','sampleProcessing.id','sampleStorage.id','tumorGrade.id','tumorProgression.id'] + cohorts=['cohortDataTypes.id','cohortDesign.id','exclusionCriteria.diseaseConditions.diseaseCode.id','exclusionCriteria.diseaseConditions.severity.id','exclusionCriteria.diseaseConditions.stage.id','exclusionCriteria.ethnicities.id','exclusionCriteria.genders.id','exclusionCriteria.locations.id','exclusionCriteria.phenotypicConditions.featureType.id','exclusionCriteria.phenotypicConditions.severity.id','inclusionCriteria.diseaseConditions.diseaseCode.id','inclusionCriteria.diseaseConditions.severity.id','inclusionCriteria.diseaseConditions.stage.id','inclusionCriteria.ethnicities.id','inclusionCriteria.genders.id','inclusionCriteria.locations.id','inclusionCriteria.phenotypicConditions.featureType.id','inclusionCriteria.phenotypicConditions.severity.id'] + datasets=['dataUseConditions.duoDataUse.id'] + genomicVariations=['caseLevelData.alleleOrigin.id','caseLevelData.clinicalInterpretations.category.id','caseLevelData.clinicalInterpretations.effect.id','caseLevelData.clinicalInterpretations.evidenceType.id','caseLevelData.id','caseLevelData.phenotypicEffects.category.id','caseLevelData.phenotypicEffects.effect.id','caseLevelData.phenotypicEffects.evidenceType.id','caseLevelData.zygosity.id','identifiers.variantAlternativeIds.id','molecularAttributes.molecularEffects.id','variantLevelData.clinicalInterpretations.category.id','variantLevelData.clinicalInterpretations.effect.id','variantLevelData.clinicalInterpretations.evidenceType.id','variantLevelData.phenotypicEffects.category.id','variantLevelData.phenotypicEffects.effect.id','variantLevelData.phenotypicEffects.evidenceType.id'] + individuals=['diseases.ageOfOnset.id','diseases.diseaseCode.id','diseases.severity.id','diseases.stage.id','ethnicity.id','exposures.exposureCode.id','exposures.unit.id','geographicOrigin.id','interventionsOrProcedures.ageAtProcedure.id','interventionsOrProcedures.bodySite.id','interventionsOrProcedures.procedureCode.id','measures.assayCode.id','measures.measurementValue.id','measures.measurementValue.typedQuantities.quantity.unit.id','measures.measurementValue.unit.id','measures.observationMoment.id','measures.procedure.bodySite.id','measures.procedure.procedureCode.id','pedigrees.disease.diseaseCode.id','pedigrees.disease.severity.id','pedigrees.disease.stage.id','pedigrees.id','pedigrees.members.role.id','phenotypicFeatures.evidence.evidenceCode.id','phenotypicFeatures.evidence.reference.id','phenotypicFeatures.featureType.id','phenotypicFeatures.modifiers.id','phenotypicFeatures.onset.id','phenotypicFeatures.resolution.id','phenotypicFeatures.severity.id','sex.id','treatments.cumulativeDose.referenceRange.id','treatments.doseIntervals.id','treatments.routeOfAdministration.id','treatments.treatmentCode.id'] + runs=['librarySource.id','platformModel.id'] + array=[] + if collection == 'biosamples': + array=biosamples + elif collection == 'cohorts': + array=cohorts + elif collection == 'datasets': + array=datasets + elif collection == 'genomicVariations': + array=genomicVariations + elif collection == 'individuals': + array=individuals + elif collection == 'runs': + array=runs + query={} + query['$or']=[] + for field in array: + fieldquery={} + fieldquery[field]=ontology_id + ":" + term_id + query['$or'].append(fieldquery) results = client.beacon.get_collection(collection).find(query).limit(1) results = list(results) results = dumps(results) @@ -232,6 +251,8 @@ def find_ontology_terms_used(collection_name: str) -> List[Dict]: if term not in terms_ids: terms_ids.append(term) i += 10000 + if i > 30000: + break print(i) else: xs = client.beacon.get_collection(collection_name).find().skip(0).limit(10000) @@ -382,7 +403,8 @@ def merge_terms(): 'scopes': array_of_scopes }) client.beacon.filtering_terms.delete_many({"id": repeated_id}) - client.beacon.filtering_terms.insert_many(new_terms) + if new_terms != []: + client.beacon.filtering_terms.insert_many(new_terms) diff --git a/beacon/reindex.py b/beacon/reindex.py index 5e335813..396f5d80 100644 --- a/beacon/reindex.py +++ b/beacon/reindex.py @@ -36,19 +36,19 @@ client.beacon.validate_collection("similarities") except Exception: db=client.beacon.create_collection(name="similarities") -client.beacon.analyses.create_index([("$**", "text")]) -client.beacon.biosamples.create_index([("$**", "text")]) -client.beacon.cohorts.create_index([("$**", "text")]) -client.beacon.datasets.create_index([("$**", "text")]) -client.beacon.genomicVariations.create_index([("$**", "text")]) -client.beacon.genomicVariations.create_index([("caseLevelData.biosampleId", 1)]) -client.beacon.genomicVariations.create_index([("variation.location.interval.end.value", -1), ("variation.location.interval.start.value", 1)]) +#client.beacon.analyses.create_index([("$**", "text")]) +#client.beacon.biosamples.create_index([("$**", "text")]) +#client.beacon.cohorts.create_index([("$**", "text")]) +#client.beacon.datasets.create_index([("$**", "text")]) +#client.beacon.genomicVariations.create_index([("$**", "text")]) +#client.beacon.genomicVariations.create_index([("caseLevelData.biosampleId", 1)]) +#client.beacon.genomicVariations.create_index([("variation.location.interval.end.value", -1), ("variation.location.interval.start.value", 1)]) client.beacon.genomicVariations.create_index([("variantInternalId", 1), ("caseLevelData.biosampleId", 1)]) -client.beacon.genomicVariations.create_index([("identifiers.genomicHGVSId", 1), ("variation.location.interval.start.value", 1), ("caseLevelData.biosampleId", 1), ("variation.referenceBases", 1), ("variation.alternateBases", 1)]) +#client.beacon.genomicVariations.create_index([("identifiers.genomicHGVSId", 1), ("variation.location.interval.start.value", 1), ("caseLevelData.biosampleId", 1), ("variation.referenceBases", 1), ("variation.alternateBases", 1)]) client.beacon.genomicVariations.create_index([("variation.location.interval.end.value", -1), ("variation.location.interval.start.value", 1), ("variation.referenceBases", 1), ("variation.alternateBases", 1)]) client.beacon.genomicVariations.create_index([("molecularAttributes.geneIds", 1), ("variantInternalId", 1), ("variation.variantType", 1)]) -client.beacon.individuals.create_index([("$**", "text")]) -client.beacon.runs.create_index([("$**", "text")]) +#client.beacon.individuals.create_index([("$**", "text")]) +#client.beacon.runs.create_index([("$**", "text")]) #collection_name = client.beacon.analyses #print(collection_name.index_information())