
Commit

Merge pull request #56 from elixir-europe/merge-faidare-dd
Merge faidare dd
cpommier committed Jul 20, 2022
2 parents 717ab74 + c805040 commit c179abb
Showing 13 changed files with 189 additions and 47 deletions.
config/transform-elasticsearch/documents/datadiscovery_germplasm.json
@@ -3,10 +3,9 @@
"source-entity": "germplasm",

"document-transform": {
"@type": [ "Germplasm" ],
"entryType": "Germplasm",
"@id": "{.germplasmPUI}",
"identifier": "{.germplasmPUI}",
"identifier": "{.germplasmDbId}",
"name": "{.germplasmName}",

"schema:includedInDataCatalog": "{.source}",
@@ -47,15 +46,10 @@
" {.comment}"
]
},

"species": {
"{or}": [
"{.genusSpecies}",
"{.genus}",
"{.species}"
]
"{join}": ["{.genus + .species }"],
"{separator}": " "
},

"germplasm": {
"cropName": {
"{list}": [
@@ -84,19 +78,29 @@
]
}
},
"trait": {
"observationVariableIds": {
"{flatten_distinct}": [
"{.studyURIs => .observationVariableDbIds}"
]
}
},
"node": "{.source}",
"databaseName": {
"{join}": [
"brapi@",
"{.source}"
]
},
"holdingInstitute": {
"{join}": [
"{.holdingInstitute.organisation} {.instituteName}"
]
},
"biologicalStatus": "{.biologicalStatusOfAccessionCode}",
"geneticNature": "{.geneticNature}",
"countryOfOrigin": "{.countryOfOriginCode}",
"taxonGroup": "{.genus}",
"germplasmList": {
"{flatten_distinct}": [
"{.panel.name}",
"{.collection.name}",
"{.population.name}",
"{.holdingGenbank.instituteName}"
]
}
}
}
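Note on the template operators used above: {join} concatenates the resolved values of its operands with the given {separator}, and {flatten_distinct} flattens nested lists while dropping duplicates. A minimal Python sketch of these semantics, inferred from the templates in this file (the real resolvers live in etl/common/templating.py; the function names below are illustrative, not the project's API):

def join_values(values, separator=" "):
    # {join}: concatenate resolved, non-empty values with a separator.
    return separator.join(str(v) for v in values if v)

def flatten_distinct(values):
    # {flatten_distinct}: flatten nested lists, keeping the first
    # occurrence of each value.
    seen, result = set(), []
    def walk(value):
        if isinstance(value, list):
            for item in value:
                walk(item)
        elif value is not None and value not in seen:
            seen.add(value)
            result.append(value)
    walk(values)
    return result

join_values(["Zea", "mays"])                    # "Zea mays", as in the new "species" rule
flatten_distinct([["p1"], "p1", ["c1", None]])  # ["p1", "c1"], as in "germplasmList"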
40 changes: 33 additions & 7 deletions config/transform-elasticsearch/documents/datadiscovery_study.json
@@ -54,8 +54,13 @@
"Study"
]
},
"accessionNumber": {
"{flatten_distinct}": [
"{.germplasmURIs => .accessionNumber}"
]
},
"@id": "{.studyPUI}",
"identifier": "{.studyPUI}",
"identifier": "{.studyDbId}",
"schema:includedInDataCatalog": "{.source}",
"schema:identifier": "{.studyDbId}",
"schema:name": {
@@ -129,17 +134,14 @@
" in {.locationURI => .countryName}"
]
},
". this study is part of the {.programName} program",
". This study is part of the {.programName} program",
".",
" {.studyDescription}"
]
},
"species": {
"{or}": [
"{.germplasmURIs => .genus + .species + .subtaxa}",
"{.germplasmURIs => .genus + .species}",
"{.germplasmURIs => .genus}"
]
"{join}" : ["{.germplasmURIs => .genus + .species}"],
"{separator}": " "
},
"germplasm": {
"cropName": {
@@ -187,6 +189,30 @@
"brapi@",
"{.source}"
]
},
"holdingInstitute": {
"{join}": [
"{.holdingInstitute.organisation} {.instituteName}"
]
},
"biologicalStatus": "{.biologicalStatusOfAccessionCode}",
"geneticNature": "{.geneticNature}",
"countryOfOrigin": "{.countryOfOriginCode}",
"taxonGroup": {
"{flatten_distinct}": "{.germplasmURIs => .genus}"
},
"observationVariableIds": {
"{flatten_distinct}": [
"{.observationVariableDbIds}"
]
},
"germplasmList": {
"{flatten_distinct}": [
"{.germplasmURIs => .panel.name}",
"{.germplasmURIs => .collection.name}",
"{.germplasmURIs => .population.name}",
"{.germplasmURIs => .holdingGenbank.instituteName}"
]
}
}
}
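Note on the "=>" operator used in several rules above (for example "{.germplasmURIs => .accessionNumber}"): it follows each URI held by the current document to the linked document in the data index built during extraction, then reads the target field there. A hedged sketch of that dereference, assuming a dict-like index keyed by URI (all names below are illustrative):

def dereference(document, link_field, target_field, data_index):
    # Follow each linked URI and collect the target field from the linked document.
    values = []
    for uri in document.get(link_field) or []:
        linked = data_index.get(uri)
        if linked is not None and target_field in linked:
            values.append(linked[target_field])
    return values

study = {"germplasmURIs": ["urn:g1", "urn:g2"]}
index = {"urn:g1": {"accessionNumber": "A1"}, "urn:g2": {"accessionNumber": "A2"}}
dereference(study, "germplasmURIs", "accessionNumber", index)  # ["A1", "A2"]

Wrapped in {flatten_distinct}, this is how a study aggregates accession numbers, genera and list names across all of its germplasm.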
8 changes: 7 additions & 1 deletion etl/common/templating.py
@@ -177,6 +177,11 @@ def resolve_or_template(template, data, data_index):
initial
)

def resolve_equals_template(template, data, data_index):
# note: the two operands are compared as parsed, without resolving field
# references against the document first (see test_resolve_if7)
elements = template.get('{equals}')
return elements[0] == elements[1]


def resolve_map_template(template, data, data_index):
elements = template.get('{map}')
@@ -218,7 +223,8 @@ def resolve(parsed_template, data, data_index):
'{lark}': resolve_field_value_template,
'{map}': resolve_map_template,
'{merge}': resolve_merge_template,
'{replace}': resolve_replace_with_template
'{replace}': resolve_replace_with_template,
'{equals}': resolve_equals_template
}
for key, evaluator in evaluable_templates.items():
if key in parsed_template:
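Note: {equals} plugs into the existing {if}/{then}/{else} machinery, as exercised by test_resolve_if7 further down. As committed, resolve_equals_template appears to compare its operands without resolving them against the document first, so a field reference such as "{.source}" is compared literally and the {else} branch fires even when data["source"] is "URGI". A sketch of a value-resolving variant, reusing the module's own resolve dispatcher (a suggestion, not the committed behavior):

def resolve_equals_template(template, data, data_index):
    # Resolve both operands before comparing, so that
    # {"{equals}": ["{.source}", "URGI"]} is True when data["source"] == "URGI".
    left, right = template.get('{equals}')
    return resolve(left, data, data_index) == resolve(right, data, data_index)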
48 changes: 38 additions & 10 deletions etl/extract/brapi.py
@@ -6,7 +6,9 @@
from copy import deepcopy

import urllib3
import urllib.request
from multiprocessing.pool import ThreadPool
import json

from etl.common.brapi import BreedingAPIIterator, get_implemented_calls, get_implemented_call
from etl.common.brapi import get_identifier
@@ -274,6 +276,31 @@ def extract_source(source, entities, config, output_dir):
entity['store'].clear()


def extract_statics_files(source, output_dir, entities, config):

source_name = source['schema:identifier']
action = 'extract-' + source_name
log_file = get_file_path([config['log-dir'], action], ext='.log', recreate=True)
logger = create_logger(action, log_file, config['options']['verbose'])

logger.info("Downloading files from {}...".format(source_name))
for document_type in entities:
try:
urllib.request.urlretrieve(source["brapi:static-file-repository-url"] + "/" + document_type + ".json", output_dir + "/" + document_type + ".json")

with open(output_dir + "/" + document_type + ".json", 'r') as f:
json_data = json.load(f)
with open(output_dir + "/" + document_type + ".json", 'w') as outfile:
for entry in json_data:
json.dump(entry, outfile)
outfile.write('\n')

logger.info("Extracting BrAPI {}.json".format(document_type))

except Exception:
# not every source publishes every entity as a static file; skip on any failure
continue


def main(config):
entities = config["extract-brapi"]["entities"]
for (entity_name, entity) in entities.items():
@@ -284,23 +311,24 @@ def main(config):

threads = list()
for source_name in sources:
if source_name == 'EVA':
print("# INFO: EVA data can't be extracted, EVA Skipped ..")
continue

source_json_dir = get_folder_path([json_dir, source_name], recreate=True)
source_json_dir_failed = source_json_dir + '-failed'
if os.path.exists(source_json_dir_failed):
shutil.rmtree(source_json_dir_failed)

source = deepcopy(sources[source_name])
entities_copy = deepcopy(entities)

thread = threading.Thread(target=extract_source,
args=(source, entities_copy, config, source_json_dir))
thread.daemon = True
thread.start()
threads.append(thread)

if "brapi:endpointUrl" in sources[source_name]:
thread = threading.Thread(target=extract_source,
args=(source, entities_copy, config, source_json_dir))
thread.daemon = True
thread.start()
threads.append(thread)

elif "brapi:static-file-repository-url" in sources[source_name]:
extract_statics_files(sources[source_name], source_json_dir, entities, config)

for thread in threads:
while thread.isAlive():
thread.join(500)
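Note: extract_statics_files downloads <brapi:static-file-repository-url>/<entity>.json for each configured entity and rewrites the downloaded JSON array as one document per line, the layout the downstream transform consumes; entities whose file cannot be fetched are skipped. A hedged sketch of the same download-and-rewrite step with the failure logged instead of silently swallowed (the function name and log wording are illustrative):

import json
import urllib.request

def fetch_entity_as_ndjson(repo_url, document_type, output_dir, logger):
    # Download one static JSON file and rewrite its top-level array
    # as newline-delimited JSON (one document per line).
    url = repo_url + "/" + document_type + ".json"
    target = output_dir + "/" + document_type + ".json"
    try:
        urllib.request.urlretrieve(url, target)
        with open(target, 'r') as f:
            entries = json.load(f)
        with open(target, 'w') as out:
            for entry in entries:
                json.dump(entry, out)
                out.write('\n')
    except Exception as e:
        logger.warning("Skipping %s: %s", url, e)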
24 changes: 18 additions & 6 deletions etl/transform/elasticsearch.py
@@ -3,6 +3,8 @@
from logging import Logger
import json
import glob
import gzip
import shutil
from xml.sax import saxutils as su

import jsonschema
@@ -132,7 +134,7 @@ def validate_documents(document_tuples, validation_schemas, logger):
logger.debug(f"Validated {document_count} documents.")


def dump_clean_in_json_files(source_dir, logger, documents_tuples):
def dump_clean_in_json_files(source_dir, source_name, logger, documents_tuples):
"""
Consumes an iterable of document tuples and cleans email addresses
"""
@@ -144,18 +146,25 @@ def dump_clean_in_json_files(source_dir, logger, documents_tuples):

# Hide email
if ("email" in document):
document["email"]= document["email"].replace('@', '_')
document["email"] = document["email"].replace('@', '_')

if ("contacts" in document):
for contact in document["contacts"]:
if "email" in contact :
contact["email"]= contact["email"].replace('@', '_')
if "email" in contact:
contact["email"] = contact["email"].replace('@', '_')

if document_header not in json_dict:
json_dict[document_header] = []

json_dict[document_header].append(document)

if ("node" not in document):
document["node"] = source_name
document["databaseName"] = "brapi@" + source_name

if ("source" not in document):
document["source"] = source_name

document_count += 1
if is_checkpoint(document_count):
logger.debug(f"checkpoint: {document_count} documents saved")
@@ -172,7 +181,10 @@ def save_json(source_dir, json_dict):
while saved_documents < len(document):
with open(source_dir + "/" + type + '-' + str(file_number) + '.json', 'w') as f:
json.dump(document[saved_documents:file_number*10000], f, ensure_ascii=False)
f.close()
with open(source_dir + "/" + type + '-' + str(file_number) + '.json', 'rb') as f:
with gzip.open(source_dir + "/" + type + '-' + str(file_number) + '.json.gz', 'wb') as f_out:
shutil.copyfileobj(f, f_out)
os.remove(source_dir + "/" + type + '-' + str(file_number) + '.json')
file_number += 1
saved_documents += 10000

@@ -280,7 +292,7 @@ def transform_source(source, transform_config, source_json_dir, source_bulk_dir,
validated_documents = validate_documents(documents, validation_schemas, logger)

# Write the documents in jsonfiles
dump_clean_in_json_files(source_bulk_dir, logger, validated_documents)
dump_clean_in_json_files(source_bulk_dir, source_name, logger, validated_documents)
# shutil.rmtree(tmp_index_dir, ignore_errors=True)

logger.info(f"SUCCEEDED Transforming BrAPI {source_name}.")
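Note: save_json now gzips each chunk of up to 10000 documents and deletes the plain .json file, so anything reading the bulk directory must open the .json.gz files instead. A minimal sketch of reading one chunk back (the path is a placeholder; each chunk holds a single JSON array):

import gzip
import json

with gzip.open("bulk/germplasm-1.json.gz", "rt", encoding="utf-8") as f:
    documents = json.load(f)  # one JSON array of up to 10000 documents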
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,11 +1,12 @@
requests==2.22.0
elasticsearch==5.4.0
rdflib==4.2.2
rdflib-jsonld==0.4.0
rdflib==6.0.0
rdflib-jsonld==0.5.0
lark-parser==0.6.3
rfc3987==1.3.8
jsonschema==2.6.0
urllib3==1.25.2
chardet==3.0.3
pyhashxx==0.1.3
unqlite==0.7.1
deepdiff==5.5.0
12 changes: 12 additions & 0 deletions sources/FZH.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"@context": {
"schema": "http://schema.org/",
"brapi": "https://brapi.org/rdf/"
},
"@type": "schema:DataCatalog",
"@id": "http://apps.fz-juelich.de",
"schema:identifier": "FZH",
"schema:name": "FZH",
"brapi:static-file-repository-url": "http://apps.fz-juelich.de/faidare/",
"brapi:studyType": "Phenotyping"
}
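Note: this descriptor carries brapi:static-file-repository-url instead of brapi:endpointUrl, so main() in etl/extract/brapi.py routes FZH to extract_statics_files rather than spawning a BrAPI harvesting thread. A runnable sketch of that dispatch against the descriptor above (the print statements stand in for the real extraction calls):

import json

with open("sources/FZH.json") as f:
    source = json.load(f)

if "brapi:endpointUrl" in source:
    # harvested over the BrAPI endpoint in a worker thread (see main() above)
    print("BrAPI endpoint source:", source["schema:identifier"])
elif "brapi:static-file-repository-url" in source:
    # downloaded as static JSON files by extract_statics_files
    print("static-file source:", source["schema:identifier"])  # -> FZH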
22 changes: 20 additions & 2 deletions tests/common/test_templating.py
@@ -3,7 +3,7 @@
from etl.common.templating import resolve, parse_template

data_0 = {"refURIs": [1, 2, 3, '4', 5], "foo": [1, 2, 3], "genus": "Zea", "species": "mays", "falseField": False, "studyTypeName": "gnomic"}
data_1 = {"a": "a", "genus": "Zea", "species": "Zea mays"}
data_1 = {"a": "a", "genus": "Zea", "species": "Zea mays", "source": "URGI"}
data_2 = {"a": "b", "g": {"genus": "Populus"}}
data_3 = {"a": "b", "g": {"genus": "Triticum", "species": "Triticum aestivum"}}
data_4 = {"g": {"genus": "Triticum", "species": "aestivum"}}
@@ -129,7 +129,7 @@ def test_resolve_join3(self):
expected = "foo123foo123"
self.assertEqual(actual, expected)

def test_resolve_if1(self):
def test_resolve_if1(self):  # this test is suspicious: it passes whatever the value of "foo" is, probably because the non-empty string "foo" is truthy
template = parse_template({"{if}": "foo", "{then}": "then"})
actual = resolve(template, data_0, data_index)
expected = "then"
@@ -159,6 +159,24 @@ def test_resolve_if5(self):
expected = "else"
self.assertEqual(actual, expected)

def test_resolve_if6(self):
template = parse_template({"{if}": "{.studyTypeName}", "{then}": "{.studyTypeName}"})
actual = resolve(template, data_0, data_index)
expected = "gnomic"
self.assertEqual(actual, expected)

def test_resolve_if6_ko(self):
template = parse_template({"{if}": "{.studyTypeName}", "{then}": "{.studyTypeName}"})
actual = resolve(template, data_0, data_index)
expected = "gnomicus"
self.assertNotEqual(actual, expected)

def test_resolve_if7(self):
template = parse_template({"{if}": {"{equals}":["{.source}", "URGI"]}, "{then}": "gotcha", "{else}": "{.source}"})
actual = resolve(template, data_1, data_index)
expected = "URGI"
self.assertEqual(actual, expected)

def test_resolve_replace_with(self):
template = parse_template({
"{replace}": {
Empty file added tests/extract/__init__.py
Empty file.