[267] Pool ES mappings across datasets (#280)

* changed branch name
* mappings build
* updated docs
* updated docs
* updated docs
* added docstrings
* added dynamic strict to settings
* removed index.json in favour of a single defaults file
* using soft alias until a future PR to minimise changes
* cleaned and sorted json
* [267] Tidy & slim schema transformations (#281)
* pruned deprecated schema transformations
* updated fos fieldname on arxlive
* unified data set schema transformations
* restructured directory
* refactored references to schema_transformation
* refactored references to schema_transformation
* slimmed down transformations, and included entity_type
* pruned ontology
* tidied schemas
* consistency tests
* reverted unrelated json file
* harmonised name fieldsofstudy across arxiv
* added novelty back in
* sorted json
* sorted json
* sorted json

Co-authored-by: Joel Klinger <[email protected]>
Co-authored-by: Joel Klinger <[email protected]>
nestauk · Jun 9, 2020 · aac29f1 · aac29f1
1 parent ac88d8f
commit aac29f1
Show file tree

Hide file tree

Showing 71 changed files with 1,042 additions and 1,939 deletions.
diff --git a/docs/source/nesta.core.schemas.rst b/docs/source/nesta.core.schemas.rst
@@ -1 +1,2 @@
 .. include:: ../../nesta/core/schemas/README.rst
+.. include:: ../../nesta/core/schemas/tier_1/mappings/README.rst
diff --git a/docs/source/nesta.core.scripts.rst b/docs/source/nesta.core.scripts.rst
@@ -1,4 +1 @@
-Scripts
-=======
-
 .. include:: ../../nesta/core/scripts/README.rst
diff --git a/nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py b/nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py
@@ -20,7 +20,6 @@
 from datetime import datetime as dt
 
 from nesta.core.orms.orm_utils import db_session, get_mysql_engine
-from nesta.core.orms.orm_utils import load_json_from_pathstub
 from nesta.core.orms.orm_utils import object_to_dict
 from nesta.core.orms.arxiv_orm import Article
 from nesta.core.orms.grid_orm import Institute
@@ -76,10 +75,7 @@ def run():
     ngrammer = Ngrammer(database="production")
 
     # es setup
-    strans_kwargs={'filename':'arxiv.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs={'filename':'arxiv.json', 'ignore':['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -164,9 +160,9 @@ def run():
             countries = set(grid_countries[inst_id]
                             for inst_id in good_institutes
                             if inst_id in grid_countries)
-            row['categories'], _, _ = hierarchy_field(cats)
-            row['fos'], _, _ = hierarchy_field(fos)
-            row['countries'], _, _ = hierarchy_field(countries)
+            row['nested_categories'], _, _ = hierarchy_field(cats)
+            row['fields_of_study'], _, _ = hierarchy_field(fos)
+            row['nested_location'], _, _ = hierarchy_field(countries)
 
             # Pull out international institute info
             has_mn = any(is_multinational(inst,

diff --git a/nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py b/nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py
@@ -58,12 +58,8 @@ def run():
     continent_lookup[None] = None
 
     # es setup
-    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
-                                                 "health_scanner.json")
-    strans_kwargs={'filename':'crunchbase_organisation_members.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['id']}
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
+    strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/batchables/eurito/arxiv_eu/run.py b/nesta/core/batchables/eurito/arxiv_eu/run.py
@@ -54,9 +54,7 @@ def run():
 
     # es setup
     logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/arxiv-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'arxiv.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/batchables/eurito/crunchbase_eu/run.py b/nesta/core/batchables/eurito/companies_eu/run.py
@@ -61,10 +61,7 @@ def run():
     eu_countries = get_eu_countries()
 
     # es setup
-    strans_kwargs={'filename':'eurito/crunchbase-eu.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/batchables/eurito/cordis_eu/run.py b/nesta/core/batchables/eurito/cordis_eu/run.py
@@ -88,9 +88,7 @@ def run():
 
     # es setup
     logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/cordis-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'cordis.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/batchables/eurito/patstat-eu/run.py b/nesta/core/batchables/eurito/patstat-eu/run.py
diff --git a/nesta/core/batchables/eurito/patstat_eu/run.py b/nesta/core/batchables/eurito/patstat_eu/run.py
@@ -63,9 +63,7 @@ def run():
 
     # es setup
     logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/patstat-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'patstat.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py b/nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py
@@ -68,9 +68,7 @@ def run():
     dupes = format_duplicate_map(dupes)
 
     # Set up elastic search connection
-    field_null_mapping = load_json_from_pathstub("tier_1/"
-                                                 "field_null_mappings/",
-                                                 "health_scanner.json")
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
     es = ElasticsearchPlus(hosts=es_config['host'],
                            port=es_config['port'],
                            aws_auth_region=es_config['region'],

diff --git a/nesta/core/batchables/health_data/nih_dedupe/run.py b/nesta/core/batchables/health_data/nih_dedupe/run.py
@@ -61,9 +61,7 @@ def run():
     art_ids = json.loads(ids_obj.get()['Body']._raw_stream.read())
     logging.info(f'Processing {len(art_ids)} article ids')
 
-    field_null_mapping = load_json_from_pathstub(("tier_1/"
-                                                  "field_null_mappings/"),
-                                                 "health_scanner.json")
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/batchables/health_data/nih_process_data/run.py b/nesta/core/batchables/health_data/nih_process_data/run.py
@@ -25,7 +25,6 @@
 def run():
     start_index = os.environ["BATCHPAR_start_index"]
     end_index = os.environ["BATCHPAR_end_index"]
-    #mysqldb_config = os.environ["BATCHPAR_config"]
     es_host = os.environ["BATCHPAR_outinfo"]
     es_port = os.environ["BATCHPAR_out_port"]
     es_index = os.environ["BATCHPAR_out_index"]
@@ -87,13 +86,8 @@ def run():
     df['total_cost_currency'] = 'USD'
 
     # output to elasticsearch
-    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
-                                                 "health_scanner.json")
-    strans_kwargs={'filename':'nih.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['application_id']}
-
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
+    strans_kwargs = {'filename': 'nih.json', 'ignore': ['application_id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -143,15 +137,15 @@ def run():
         pars = {'start_index': '2001360',
                 'end_index': '2003940',
                 'db': 'dev',
+                'done': 'False',
                 'config': (f'{os.environ["HOME"]}/nesta/nesta/'
                            'core/config/mysqldb.config'),
-                'done': 'False',
                 'outinfo': ('https://search-health-scanner-'
                             '5cs7g52446h7qscocqmiky5dn4.'
                             'eu-west-2.es.amazonaws.com'),
                 'out_index': 'nih_dev',
                 'out_type': '_doc',
-                'out_port': '_doc',
+                'out_port': '_443',
                 'aws_auth_region': 'eu-west-2',
                 'entity_type': 'paper',
                 'test': 'False'}

diff --git a/nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py b/nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py
@@ -71,13 +71,8 @@ def run():
     mesh_terms = format_mesh_terms(df_mesh)
 
     # Setup ES+
-    field_null_mapping = load_json_from_pathstub(("tier_1/"
-                                                  "field_null_mappings/"),
-                                                 "health_scanner.json")
-    strans_kwargs={'filename':'meetup.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':[]}
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
+    strans_kwargs = {'filename': 'meetup.json'}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,

diff --git a/nesta/core/config/elasticsearch.yaml b/nesta/core/config/elasticsearch.yaml