
Commit

Merge pull request #56 from elixir-europe/merge-faidare-dd
Merge faidare dd
cpommier committed Jul 20, 2022
2 parents 717ab74 + c805040 commit c179abb
Showing 13 changed files with 189 additions and 47 deletions.
config/transform-elasticsearch/documents/datadiscovery_germplasm.json
@@ -3,10 +3,9 @@
"source-entity": "germplasm",

"document-transform": {
"@type": [ "Germplasm" ],
"entryType": "Germplasm",
"@id": "{.germplasmPUI}",
"identifier": "{.germplasmPUI}",
"identifier": "{.germplasmDbId}",
"name": "{.germplasmName}",

"schema:includedInDataCatalog": "{.source}",
@@ -47,15 +46,10 @@
" {.comment}"
]
},

"species": {
"{or}": [
"{.genusSpecies}",
"{.genus}",
"{.species}"
]
"{join}": ["{.genus + .species }"],
"{separator}": " "
},

"germplasm": {
"cropName": {
"{list}": [
@@ -84,19 +78,29 @@
]
}
},
"trait": {
"observationVariableIds": {
"{flatten_distinct}": [
"{.studyURIs => .observationVariableDbIds}"
]
}
},
"node": "{.source}",
"databaseName": {
"{join}": [
"brapi@",
"{.source}"
]
},
"holdingInstitute": {
"{join}": [
"{.holdingInstitute.organisation} {.instituteName}"
]
},
"biologicalStatus": "{.biologicalStatusOfAccessionCode}",
"geneticNature": "{.geneticNature}",
"countryOfOrigin": "{.countryOfOriginCode}",
"taxonGroup": "{.genus}",
"germplasmList": {
"{flatten_distinct}": [
"{.panel.name}",
"{.collection.name}",
"{.population.name}",
"{.holdingGenbank.instituteName}"
]
}
}
}
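Note on the template operators used above: {join} concatenates the resolved values of its operands with the given {separator}, and {flatten_distinct} flattens nested lists while dropping duplicates. A minimal Python sketch of these semantics, inferred from the templates in this file (the real resolvers live in etl/common/templating.py; the function names below are illustrative, not the project's API):

def join_values(values, separator=" "):
    # {join}: concatenate resolved, non-empty values with a separator.
    return separator.join(str(v) for v in values if v)

def flatten_distinct(values):
    # {flatten_distinct}: flatten nested lists, keeping the first
    # occurrence of each value.
    seen, result = set(), []
    def walk(value):
        if isinstance(value, list):
            for item in value:
                walk(item)
        elif value is not None and value not in seen:
            seen.add(value)
            result.append(value)
    walk(values)
    return result

join_values(["Zea", "mays"])                    # "Zea mays", as in the new "species" rule
flatten_distinct([["p1"], "p1", ["c1", None]])  # ["p1", "c1"], as in "germplasmList"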
40 changes: 33 additions & 7 deletions config/transform-elasticsearch/documents/datadiscovery_study.json
@@ -54,8 +54,13 @@
"Study"
]
},
"accessionNumber": {
"{flatten_distinct}": [
"{.germplasmURIs => .accessionNumber}"
]
},
"@id": "{.studyPUI}",
"identifier": "{.studyPUI}",
"identifier": "{.studyDbId}",
"schema:includedInDataCatalog": "{.source}",
"schema:identifier": "{.studyDbId}",
"schema:name": {
@@ -129,17 +134,14 @@
" in {.locationURI => .countryName}"
]
},
". this study is part of the {.programName} program",
". This study is part of the {.programName} program",
".",
" {.studyDescription}"
]
},
"species": {
"{or}": [
"{.germplasmURIs => .genus + .species + .subtaxa}",
"{.germplasmURIs => .genus + .species}",
"{.germplasmURIs => .genus}"
]
"{join}" : ["{.germplasmURIs => .genus + .species}"],
"{separator}": " "
},
"germplasm": {
"cropName": {
@@ -187,6 +189,30 @@
"brapi@",
"{.source}"
]
},
"holdingInstitute": {
"{join}": [
"{.holdingInstitute.organisation} {.instituteName}"
]
},
"biologicalStatus": "{.biologicalStatusOfAccessionCode}",
"geneticNature": "{.geneticNature}",
"countryOfOrigin": "{.countryOfOriginCode}",
"taxonGroup": {
"{flatten_distinct}": "{.germplasmURIs => .genus}"
},
"observationVariableIds": {
"{flatten_distinct}": [
"{.observationVariableDbIds}"
]
},
"germplasmList": {
"{flatten_distinct}": [
"{.germplasmURIs => .panel.name}",
"{.germplasmURIs => .collection.name}",
"{.germplasmURIs => .population.name}",
"{.germplasmURIs => .holdingGenbank.instituteName}"
]
}
}
}
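Note on the "=>" operator used in several rules above (for example "{.germplasmURIs => .accessionNumber}"): it follows each URI held by the current document to the linked document in the data index built during extraction, then reads the target field there. A hedged sketch of that dereference, assuming a dict-like index keyed by URI (all names below are illustrative):

def dereference(document, link_field, target_field, data_index):
    # Follow each linked URI and collect the target field from the linked document.
    values = []
    for uri in document.get(link_field) or []:
        linked = data_index.get(uri)
        if linked is not None and target_field in linked:
            values.append(linked[target_field])
    return values

study = {"germplasmURIs": ["urn:g1", "urn:g2"]}
index = {"urn:g1": {"accessionNumber": "A1"}, "urn:g2": {"accessionNumber": "A2"}}
dereference(study, "germplasmURIs", "accessionNumber", index)  # ["A1", "A2"]

Wrapped in {flatten_distinct}, this is how a study aggregates accession numbers, genera and list names across all of its germplasm.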
8 changes: 7 additions & 1 deletion etl/common/templating.py
@@ -177,6 +177,11 @@ def resolve_or_template(template, data, data_index):
initial
)

def resolve_equals_template(template, data, data_index):
# note: the two operands are compared as parsed, without resolving field
# references against the document first (see test_resolve_if7)
elements = template.get('{equals}')
return elements[0] == elements[1]


def resolve_map_template(template, data, data_index):
elements = template.get('{map}')
@@ -218,7 +223,8 @@ def resolve(parsed_template, data, data_index):
'{lark}': resolve_field_value_template,
'{map}': resolve_map_template,
'{merge}': resolve_merge_template,
'{replace}': resolve_replace_with_template
'{replace}': resolve_replace_with_template,
'{equals}': resolve_equals_template
}
for key, evaluator in evaluable_templates.items():
if key in parsed_template:
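Note: {equals} plugs into the existing {if}/{then}/{else} machinery, as exercised by test_resolve_if7 further down. As committed, resolve_equals_template appears to compare its operands without resolving them against the document first, so a field reference such as "{.source}" is compared literally and the {else} branch fires even when data["source"] is "URGI". A sketch of a value-resolving variant, reusing the module's own resolve dispatcher (a suggestion, not the committed behavior):

def resolve_equals_template(template, data, data_index):
    # Resolve both operands before comparing, so that
    # {"{equals}": ["{.source}", "URGI"]} is True when data["source"] == "URGI".
    left, right = template.get('{equals}')
    return resolve(left, data, data_index) == resolve(right, data, data_index)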
48 changes: 38 additions & 10 deletions etl/extract/brapi.py
@@ -6,7 +6,9 @@
from copy import deepcopy

import urllib3
import urllib.request
from multiprocessing.pool import ThreadPool
import json

from etl.common.brapi import BreedingAPIIterator, get_implemented_calls, get_implemented_call
from etl.common.brapi import get_identifier
@@ -274,6 +276,31 @@ def extract_source(source, entities, config, output_dir):
entity['store'].clear()


def extract_statics_files(source, output_dir, entities, config):

source_name = source['schema:identifier']
action = 'extract-' + source_name
log_file = get_file_path([config['log-dir'], action], ext='.log', recreate=True)
logger = create_logger(action, log_file, config['options']['verbose'])

logger.info("Downloading files from {}...".format(source_name))
for document_type in entities:
try:
urllib.request.urlretrieve(source["brapi:static-file-repository-url"] + "/" + document_type + ".json", output_dir + "/" + document_type + ".json")

with open(output_dir + "/" + document_type + ".json", 'r') as f:
json_data = json.load(f)
with open(output_dir + "/" + document_type + ".json", 'w') as outfile:
for entry in json_data:
json.dump(entry, outfile)
outfile.write('\n')

logger.info("Extracting BrAPI {}.json".format(document_type))

except Exception:
# not every source publishes every entity as a static file; skip on any failure
continue


def main(config):
entities = config["extract-brapi"]["entities"]
for (entity_name, entity) in entities.items():
@@ -284,23 +311,24 @@ def main(config):

threads = list()
for source_name in sources:
if source_name == 'EVA':
print("# INFO: EVA data can't be extracted, EVA Skipped ..")
continue

source_json_dir = get_folder_path([json_dir, source_name], recreate=True)
source_json_dir_failed = source_json_dir + '-failed'
if os.path.exists(source_json_dir_failed):
shutil.rmtree(source_json_dir_failed)

source = deepcopy(sources[source_name])
entities_copy = deepcopy(entities)

thread = threading.Thread(target=extract_source,
args=(source, entities_copy, config, source_json_dir))
thread.daemon = True
thread.start()
threads.append(thread)

if "brapi:endpointUrl" in sources[source_name]:
thread = threading.Thread(target=extract_source,
args=(source, entities_copy, config, source_json_dir))
thread.daemon = True
thread.start()
threads.append(thread)

elif "brapi:static-file-repository-url" in sources[source_name]:
extract_statics_files(sources[source_name], source_json_dir, entities, config)

for thread in threads:
while thread.isAlive():
thread.join(500)
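Note: extract_statics_files downloads <brapi:static-file-repository-url>/<entity>.json for each configured entity and rewrites the downloaded JSON array as one document per line, the layout the downstream transform consumes; entities whose file cannot be fetched are skipped. A hedged sketch of the same download-and-rewrite step with the failure logged instead of silently swallowed (the function name and log wording are illustrative):

import json
import urllib.request

def fetch_entity_as_ndjson(repo_url, document_type, output_dir, logger):
    # Download one static JSON file and rewrite its top-level array
    # as newline-delimited JSON (one document per line).
    url = repo_url + "/" + document_type + ".json"
    target = output_dir + "/" + document_type + ".json"
    try:
        urllib.request.urlretrieve(url, target)
        with open(target, 'r') as f:
            entries = json.load(f)
        with open(target, 'w') as out:
            for entry in entries:
                json.dump(entry, out)
                out.write('\n')
    except Exception as e:
        logger.warning("Skipping %s: %s", url, e)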
24 changes: 18 additions & 6 deletions etl/transform/elasticsearch.py
@@ -3,6 +3,8 @@
from logging import Logger
import json
import glob
import gzip
import shutil
from xml.sax import saxutils as su

import jsonschema
@@ -132,7 +134,7 @@ def validate_documents(document_tuples, validation_schemas, logger):
logger.debug(f"Validated {document_count} documents.")


def dump_clean_in_json_files(source_dir, logger, documents_tuples):
def dump_clean_in_json_files(source_dir, source_name, logger, documents_tuples):
"""
Consumes an iterable of document tuples and cleans email addresses
"""
@@ -144,18 +146,25 @@ def dump_clean_in_json_files(source_dir, logger, documents_tuples):

# Hide email
if ("email" in document):
document["email"]= document["email"].replace('@', '_')
document["email"] = document["email"].replace('@', '_')

if ("contacts" in document):
for contact in document["contacts"]:
if "email" in contact :
contact["email"]= contact["email"].replace('@', '_')
if "email" in contact:
contact["email"] = contact["email"].replace('@', '_')

if document_header not in json_dict:
json_dict[document_header] = []

json_dict[document_header].append(document)

if ("node" not in document):
document["node"] = source_name
document["databaseName"] = "brapi@" + source_name

if ("source" not in document):
document["source"] = source_name

document_count += 1
if is_checkpoint(document_count):
logger.debug(f"checkpoint: {document_count} documents saved")
@@ -172,7 +181,10 @@ def save_json(source_dir, json_dict):
while saved_documents < len(document):
with open(source_dir + "/" + type + '-' + str(file_number) + '.json', 'w') as f:
json.dump(document[saved_documents:file_number*10000], f, ensure_ascii=False)
f.close()
with open(source_dir + "/" + type + '-' + str(file_number) + '.json', 'rb') as f:
with gzip.open(source_dir + "/" + type + '-' + str(file_number) + '.json.gz', 'wb') as f_out:
shutil.copyfileobj(f, f_out)
os.remove(source_dir + "/" + type + '-' + str(file_number) + '.json')
file_number += 1
saved_documents += 10000

@@ -280,7 +292,7 @@ def transform_source(source, transform_config, source_json_dir, source_bulk_dir,
validated_documents = validate_documents(documents, validation_schemas, logger)

# Write the documents in jsonfiles
dump_clean_in_json_files(source_bulk_dir, logger, validated_documents)
dump_clean_in_json_files(source_bulk_dir, source_name, logger, validated_documents)
# shutil.rmtree(tmp_index_dir, ignore_errors=True)

logger.info(f"SUCCEEDED Transforming BrAPI {source_name}.")
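Note: save_json now gzips each chunk of up to 10000 documents and deletes the plain .json file, so anything reading the bulk directory must open the .json.gz files instead. A minimal sketch of reading one chunk back (the path is a placeholder; each chunk holds a single JSON array):

import gzip
import json

with gzip.open("bulk/germplasm-1.json.gz", "rt", encoding="utf-8") as f:
    documents = json.load(f)  # one JSON array of up to 10000 documents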
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,11 +1,12 @@
requests==2.22.0
elasticsearch==5.4.0
rdflib==4.2.2
rdflib-jsonld==0.4.0
rdflib==6.0.0
rdflib-jsonld==0.5.0
lark-parser==0.6.3
rfc3987==1.3.8
jsonschema==2.6.0
urllib3==1.25.2
chardet==3.0.3
pyhashxx==0.1.3
unqlite==0.7.1
deepdiff==5.5.0
12 changes: 12 additions & 0 deletions sources/FZH.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"@context": {
"schema": "http://schema.org/",
"brapi": "https://brapi.org/rdf/"
},
"@type": "schema:DataCatalog",
"@id": "http://apps.fz-juelich.de",
"schema:identifier": "FZH",
"schema:name": "FZH",
"brapi:static-file-repository-url": "http://apps.fz-juelich.de/faidare/",
"brapi:studyType": "Phenotyping"
}
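Note: this descriptor carries brapi:static-file-repository-url instead of brapi:endpointUrl, so main() in etl/extract/brapi.py routes FZH to extract_statics_files rather than spawning a BrAPI harvesting thread. A runnable sketch of that dispatch against the descriptor above (the print statements stand in for the real extraction calls):

import json

with open("sources/FZH.json") as f:
    source = json.load(f)

if "brapi:endpointUrl" in source:
    # harvested over the BrAPI endpoint in a worker thread (see main() above)
    print("BrAPI endpoint source:", source["schema:identifier"])
elif "brapi:static-file-repository-url" in source:
    # downloaded as static JSON files by extract_statics_files
    print("static-file source:", source["schema:identifier"])  # -> FZH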
22 changes: 20 additions & 2 deletions tests/common/test_templating.py
@@ -3,7 +3,7 @@
from etl.common.templating import resolve, parse_template

data_0 = {"refURIs": [1, 2, 3, '4', 5], "foo": [1, 2, 3], "genus": "Zea", "species": "mays", "falseField": False, "studyTypeName": "gnomic"}
data_1 = {"a": "a", "genus": "Zea", "species": "Zea mays"}
data_1 = {"a": "a", "genus": "Zea", "species": "Zea mays", "source": "URGI"}
data_2 = {"a": "b", "g": {"genus": "Populus"}}
data_3 = {"a": "b", "g": {"genus": "Triticum", "species": "Triticum aestivum"}}
data_4 = {"g": {"genus": "Triticum", "species": "aestivum"}}
@@ -129,7 +129,7 @@ def test_resolve_join3(self):
expected = "foo123foo123"
self.assertEqual(actual, expected)

def test_resolve_if1(self):
def test_resolve_if1(self):  # this test is suspicious: it passes whatever the value of "foo" is, probably because the non-empty string "foo" is truthy
template = parse_template({"{if}": "foo", "{then}": "then"})
actual = resolve(template, data_0, data_index)
expected = "then"
@@ -159,6 +159,24 @@ def test_resolve_if5(self):
expected = "else"
self.assertEqual(actual, expected)

def test_resolve_if6(self):
template = parse_template({"{if}": "{.studyTypeName}", "{then}": "{.studyTypeName}"})
actual = resolve(template, data_0, data_index)
expected = "gnomic"
self.assertEqual(actual, expected)

def test_resolve_if6_ko(self):
template = parse_template({"{if}": "{.studyTypeName}", "{then}": "{.studyTypeName}"})
actual = resolve(template, data_0, data_index)
expected = "gnomicus"
self.assertNotEqual(actual, expected)

def test_resolve_if7(self):
template = parse_template({"{if}": {"{equals}":["{.source}", "URGI"]}, "{then}": "gotcha", "{else}": "{.source}"})
actual = resolve(template, data_1, data_index)
expected = "URGI"
self.assertEqual(actual, expected)

def test_resolve_replace_with(self):
template = parse_template({
"{replace}": {
Empty file added tests/extract/__init__.py
Empty file.