From f98955aab98e0f5a2ee30c424fa303b3cbc63c95 Mon Sep 17 00:00:00 2001
From: Joel Klinger
Date: Tue, 9 Jun 2020 17:44:07 +0100
Subject: [PATCH] [266] Refactor and simplify ES configuration (#275)

* make sure conf dir is empty
* simplified es config
* added orm es config reader
* modified setup_es to pick up new es config
* swapped es_mode for boolean
* aliases now consistent with config
* aliases now automatically located
* added endpoint field to estasks
* added endpoint field to sql2estasks
* [267] Pool ES mappings across datasets (#280)
* changed branch name
* mappings build
* updated docs
* updated docs
* updated docs
* added docstrings
* added dynamic strict to settings
* removed index.json in favour of a single defaults file
* using soft alias until a future PR to minimise changes
* cleaned and sorted json
* [267] Tidy & slim schema transformations (#281)
* pruned deprecated schema transformations
* updated fos fieldname on arxlive
* unified data set schema transformations
* restructured directory
* refactored references to schema_transformation
* refactored references to schema_transformation
* slimmed down transformations, and included entity_type
* pruned ontology
* tidied schemas
* consistency tests
* reverted unrelated json file
* harmonised name fieldsofstudy across arxiv
* added novelty back in
* sorted json
* sorted json
* sorted json

Co-authored-by: Joel Klinger
Co-authored-by: Joel Klinger

* patched out es config setup from tests
* removed redundant tests
* fixed json formatting
* none included for testing
* picked up bug in test

Co-authored-by: Joel Klinger
---
 docs/source/nesta.core.schemas.rst            |   1 +
 docs/source/nesta.core.scripts.rst            |   3 -
 .../arxiv/arxiv_elasticsearch/run.py          |  16 +-
 .../crunchbase_elasticsearch/run.py           |  14 +-
 nesta/core/batchables/eurito/arxiv_eu/run.py  |   8 +-
 .../{crunchbase_eu => companies_eu}/run.py    |   5 +-
 nesta/core/batchables/eurito/cordis_eu/run.py |   8 +-
 .../core/batchables/eurito/patstat-eu/run.py  | 128 ---------
 .../core/batchables/eurito/patstat_eu/run.py  |   4 +-
 .../health_data/nih_abstract_mesh_data/run.py |   4 +-
 .../batchables/health_data/nih_dedupe/run.py  |   4 +-
 .../health_data/nih_process_data/run.py       |  14 +-
 .../meetup/topic_tag_elasticsearch/run.py     |   9 +-
 nesta/core/config/elasticsearch.config        | Bin 2633 -> 0 bytes
 nesta/core/config/elasticsearch.yaml          | Bin 0 -> 1086 bytes
 nesta/core/luigihacks/estask.py               |   8 +-
 nesta/core/luigihacks/sql2estask.py           |  12 +-
 nesta/core/orms/arxiv_es_config.json          | 153 ----------
 nesta/core/orms/crunchbase-eu_es_config.json  | 262 ------------------
 nesta/core/orms/orm_utils.py                  | 261 ++++++++++++-----
 nesta/core/orms/tests/test_orm_utils.py       | 151 ++++------
 nesta/core/routines/arxiv/arxiv_es_tokens.py  |   7 +-
 nesta/core/routines/arxiv/arxiv_lolvelty.py   |  11 +-
 nesta/core/routines/arxiv/arxiv_root_task.py  |  13 +-
 .../crunchbase_elasticsearch_task.py          |   8 +-
 .../crunchbase/crunchbase_lolvelty.py         |   3 +-
 .../crunchbase/crunchbase_root_task.py        |   2 +-
 nesta/core/routines/eurito_es/es_root.py      |   7 +-
 .../nih_data/nih_abstracts_mesh_task.py       |   7 +-
 .../health_data/nih_data/nih_dedupe_task.py   |  25 +-
 .../health_data/nih_data/nih_lolvelty.py      |   1 +
 .../health_data/nih_data/nih_process_task.py  |   6 +-
 .../health_tagging/health_meetup_es_task.py   |   4 +-
 .../meetup/health_tagging/meetup_lolvelty.py  |   1 +
 nesta/core/schemas/README.rst                 |   1 +
 nesta/core/schemas/tier_1/datasets/arxiv.json |  30 ++
 .../schemas/tier_1/datasets/companies.json    |  49 ++++
 .../core/schemas/tier_1/datasets/cordis.json  |  16 ++
 .../core/schemas/tier_1/datasets/meetup.json  |  29 ++
 nesta/core/schemas/tier_1/datasets/nih.json   |  35 +++
 .../core/schemas/tier_1/datasets/patstat.json |  18 ++
 nesta/core/schemas/tier_1/mappings/README.rst | 148 ++++++++++
 .../mappings/datasets/arxiv_mapping.json      |  65 +++++
 .../mappings/datasets/companies_mapping.json} |  37 +--
 .../mappings/datasets/cordis_mapping.json     |  68 +++++
 .../mappings/datasets/meetup_mapping.json}    |  27 +-
 .../mappings/datasets/nih_mapping.json}       |  25 +-
 .../mappings/datasets/patstat_mapping.json}   |  24 +-
 .../tier_1/mappings/defaults/defaults.json    |  24 ++
 .../endpoints/arxlive/arxiv_mapping.json      |  76 +++++
 .../endpoints/eurito-dev/arxiv_mapping.json}  |  79 +-----
 .../eurito-dev/companies_mapping.json         |  14 +
 .../endpoints/eurito-dev/cordis_mapping.json} |  23 +-
 .../endpoints/eurito-dev/patstat_mapping.json |  12 +
 .../endpoints/eurito/arxiv_mapping.json       |  97 +++++++
 .../endpoints/eurito/companies_mapping.json   |  14 +
 .../endpoints/eurito/patstat_mapping.json     |  12 +
 .../endpoints/health-scanner/aliases.json}    |  51 ++--
 .../endpoints/health-scanner/config.yaml      |   3 +
 .../endpoints/health-scanner/nulls.json}      |   0
 .../tier_1/{tier_1.json => ontology.json}     |  49 +---
 .../tier_1/schema_transformations/arxiv.json  |  67 -----
 .../crunchbase_organisation.json              |  11 -
 .../crunchbase_organisation_members.json      | 158 -----------
 .../eurito/arxiv-eu.json                      |  87 ------
 .../eurito/cordis-eu.json                     |  47 ----
 .../eurito/crunchbase-eu.json                 | 162 -----------
 .../eurito/patstat-eu.json                    |  55 ----
 .../tier_1/schema_transformations/github.json |  72 -----
 .../tier_1/schema_transformations/meetup.json |  79 ------
 .../meetup_members.json                       |  12 -
 .../tier_1/schema_transformations/nih.json    | 103 -------
 .../schema_transformations/worldbank.json     |  83 ------
 .../core/schemas/tier_1/tests/test_aliases.py |  35 +++
 .../core/schemas/tier_1/tests/test_format.py  |  44 +++
 .../schemas/tier_1/tests/test_ontology.py     |  48 ++++
 .../schemas/tier_1/tests/test_validate.py     |  87 ------
 .../schemas/tier_1/{ => tests}/tidy_schema.py |   2 +-
 nesta/packages/biorxiv/collect_biorxiv.py     |  35 ---
 .../packages/biorxiv/test_collect_biorxiv.py  |  25 --
 nesta/packages/decorators/schema_transform.py |  30 +-
 .../decorators/tests/test_schema_transform.py |   9 +-
 nesta/packages/geo_utils/country_iso_code.py  |  15 +-
 requirements.txt                              |   1 +
 84 files changed, 1238 insertions(+), 2215 deletions(-)
 rename nesta/core/batchables/eurito/{crunchbase_eu => companies_eu}/run.py (97%)
 delete mode 100644 nesta/core/batchables/eurito/patstat-eu/run.py
 delete mode 100644 nesta/core/config/elasticsearch.config
 create mode 100644 nesta/core/config/elasticsearch.yaml
 delete mode 100644 nesta/core/orms/arxiv_es_config.json
 delete mode 100644 nesta/core/orms/crunchbase-eu_es_config.json
 create mode 100644 nesta/core/schemas/tier_1/datasets/arxiv.json
 create mode 100644 nesta/core/schemas/tier_1/datasets/companies.json
 create mode 100644 nesta/core/schemas/tier_1/datasets/cordis.json
 create mode 100644 nesta/core/schemas/tier_1/datasets/meetup.json
 create mode 100644 nesta/core/schemas/tier_1/datasets/nih.json
 create mode 100644 nesta/core/schemas/tier_1/datasets/patstat.json
 create mode 100644 nesta/core/schemas/tier_1/mappings/README.rst
 create mode 100644 nesta/core/schemas/tier_1/mappings/datasets/arxiv_mapping.json
 rename nesta/core/{orms/crunchbase_es_config.json => schemas/tier_1/mappings/datasets/companies_mapping.json} (89%)
 create mode 100644 nesta/core/schemas/tier_1/mappings/datasets/cordis_mapping.json
 rename nesta/core/{orms/meetup_es_config.json => schemas/tier_1/mappings/datasets/meetup_mapping.json} (87%)
 rename nesta/core/{orms/nih_es_config.json => schemas/tier_1/mappings/datasets/nih_mapping.json} (90%)
 rename nesta/core/{orms/patstat-eu_es_config.json => schemas/tier_1/mappings/datasets/patstat_mapping.json} (85%)
 create mode 100644 nesta/core/schemas/tier_1/mappings/defaults/defaults.json
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/arxlive/arxiv_mapping.json
 rename nesta/core/{orms/arxiv-eu_es_config.json => schemas/tier_1/mappings/endpoints/eurito-dev/arxiv_mapping.json} (56%)
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/companies_mapping.json
 rename nesta/core/{orms/cordis-eu_es_config.json => schemas/tier_1/mappings/endpoints/eurito-dev/cordis_mapping.json} (81%)
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/patstat_mapping.json
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/eurito/arxiv_mapping.json
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/eurito/companies_mapping.json
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/eurito/patstat_mapping.json
 rename nesta/core/schemas/tier_1/{aliases/health_scanner.json => mappings/endpoints/health-scanner/aliases.json} (63%)
 create mode 100644 nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/config.yaml
 rename nesta/core/schemas/tier_1/{field_null_mappings/health_scanner.json => mappings/endpoints/health-scanner/nulls.json} (100%)
 rename nesta/core/schemas/tier_1/{tier_1.json => ontology.json} (68%)
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/arxiv.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation_members.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/eurito/arxiv-eu.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/eurito/cordis-eu.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/eurito/crunchbase-eu.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/eurito/patstat-eu.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/github.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/meetup.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/meetup_members.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/nih.json
 delete mode 100644 nesta/core/schemas/tier_1/schema_transformations/worldbank.json
 create mode 100644 nesta/core/schemas/tier_1/tests/test_aliases.py
 create mode 100644 nesta/core/schemas/tier_1/tests/test_format.py
 create mode 100644 nesta/core/schemas/tier_1/tests/test_ontology.py
 delete mode 100644 nesta/core/schemas/tier_1/tests/test_validate.py
 rename nesta/core/schemas/tier_1/{ => tests}/tidy_schema.py (91%)
 delete mode 100644 nesta/packages/biorxiv/collect_biorxiv.py
 delete mode 100644 nesta/packages/biorxiv/test_collect_biorxiv.py
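The new nesta/core/config/elasticsearch.yaml is committed encrypted (hence the binary patch further below), so its exact contents are not visible in this diff. The following is a minimal, hypothetical sketch of the shape that the new parse_es_config() in orm_utils.py appears to expect; the endpoint id and version numbers are invented for illustration:

    import textwrap
    import yaml

    # Assumed config shape, inferred from parse_es_config(): per-endpoint
    # settings override the defaults, and 'indexes' maps dataset -> version.
    RAW = textwrap.dedent("""\
        defaults:
          scheme: https
          port: 443
          region: eu-west-2
        endpoints:
          arxlive:
            id: abc123        # fabricated AWS ES domain id
            indexes:
              arxiv: 4        # production index 'arxiv_v4'; dev index 'arxiv_dev'
        """)

    raw_config = yaml.safe_load(RAW)
    endpoint, endpoint_config = next(iter(raw_config['endpoints'].items()))
    indexes = endpoint_config.pop('indexes')
    base_config = {**raw_config['defaults'], **endpoint_config}
    scheme, _id = base_config.pop('scheme'), base_config.pop('id')
    host = f"{scheme}://search-{endpoint}-{_id}.{base_config['region']}.es.amazonaws.com"
    print(host)     # https://search-arxlive-abc123.eu-west-2.es.amazonaws.com
    print(indexes)  # {'arxiv': 4}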
diff --git a/docs/source/nesta.core.schemas.rst b/docs/source/nesta.core.schemas.rst
index d6cda5a6..ae43472c 100644
--- a/docs/source/nesta.core.schemas.rst
+++ b/docs/source/nesta.core.schemas.rst
@@ -1 +1,2 @@
 .. include:: ../../nesta/core/schemas/README.rst
+.. include:: ../../nesta/core/schemas/tier_1/mappings/README.rst
diff --git a/docs/source/nesta.core.scripts.rst b/docs/source/nesta.core.scripts.rst
index 3522e96d..cb91def7 100644
--- a/docs/source/nesta.core.scripts.rst
+++ b/docs/source/nesta.core.scripts.rst
@@ -1,4 +1 @@
-Scripts
-=======
-
 .. include:: ../../nesta/core/scripts/README.rst
diff --git a/nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py b/nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py
index 19106279..40bb4010 100644
--- a/nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py
+++ b/nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py
@@ -20,7 +20,6 @@
 from datetime import datetime as dt
 
 from nesta.core.orms.orm_utils import db_session, get_mysql_engine
-from nesta.core.orms.orm_utils import load_json_from_pathstub
 from nesta.core.orms.orm_utils import object_to_dict
 from nesta.core.orms.arxiv_orm import Article
 from nesta.core.orms.grid_orm import Institute
@@ -76,10 +75,7 @@ def run():
     ngrammer = Ngrammer(database="production")
 
     # es setup
-    strans_kwargs={'filename':'arxiv.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs={'filename':'arxiv.json', 'ignore':['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -164,9 +160,9 @@ def run():
         countries = set(grid_countries[inst_id]
                         for inst_id in good_institutes
                         if inst_id in grid_countries)
-        row['categories'], _, _ = hierarchy_field(cats)
-        row['fos'], _, _ = hierarchy_field(fos)
-        row['countries'], _, _ = hierarchy_field(countries)
+        row['nested_categories'], _, _ = hierarchy_field(cats)
+        row['fields_of_study'], _, _ = hierarchy_field(fos)
+        row['nested_location'], _, _ = hierarchy_field(countries)
 
         # Pull out international institute info
         has_mn = any(is_multinational(inst,
@@ -216,8 +212,8 @@ def run():
 
     if 'BATCHPAR_outinfo' not in os.environ:
         from nesta.core.orms.orm_utils import setup_es
-        es, es_config = setup_es('dev', True, True,
-                                 dataset='arxiv')
+        es, es_config = setup_es(endpoint='arxlive', dataset='arxiv',
+                                 production=False, drop_and_recreate=True)
         environ = {'batch_file': ('ArxivESTask-2019-09-19-'
                                   'False-1568888970724721.json'),
                    'config': ('/home/ec2-user/nesta-eu/nesta/'
diff --git a/nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py b/nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py
index 6a1a5646..c25d6fc9 100644
--- a/nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py
+++ b/nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py
@@ -58,12 +58,8 @@ def run():
     continent_lookup[None] = None
 
     # es setup
-    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
-                                                 "health_scanner.json")
-    strans_kwargs={'filename':'crunchbase_organisation_members.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['id']}
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
+    strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -162,9 +158,9 @@ def run():
 
     if 'BATCHPAR_outinfo' not in os.environ:
         from nesta.core.orms.orm_utils import setup_es
-        es, es_config = setup_es('dev', True, True,
-                                 dataset='crunchbase',
-                                 aliases='health_scanner')
+        es, es_config = setup_es(production=False, endpoint='health-scanner',
+                                 dataset='companies',
+                                 drop_and_recreate=True)
         environ = {"AWSBATCHTEST": "",
                    'BATCHPAR_batch_file': 'crunchbase_to_es-15597291977144725.json',
diff --git a/nesta/core/batchables/eurito/arxiv_eu/run.py b/nesta/core/batchables/eurito/arxiv_eu/run.py
index 19be1961..643c3595 100644
--- a/nesta/core/batchables/eurito/arxiv_eu/run.py
+++ b/nesta/core/batchables/eurito/arxiv_eu/run.py
@@ -54,9 +54,7 @@ def run():
 
     # es setup
     logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/arxiv-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'arxiv.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -202,8 +200,8 @@ def run():
     set_log_level()
     if 'BATCHPAR_outinfo' not in os.environ:
         from nesta.core.orms.orm_utils import setup_es
-        es, es_config = setup_es('dev', True, True,
-                                 dataset='arxiv-eu')
+        es, es_config = setup_es(production=False, endpoint='eurito',
+                                 dataset='arxiv', drop_and_recreate=True)
         environ = {'config': ('/home/ec2-user/nesta-eu/nesta/'
                               'core/config/mysqldb.config'),
                    'batch_file' : ('arxiv-eu_EURITO-ElasticsearchTask-'
diff --git a/nesta/core/batchables/eurito/crunchbase_eu/run.py b/nesta/core/batchables/eurito/companies_eu/run.py
similarity index 97%
rename from nesta/core/batchables/eurito/crunchbase_eu/run.py
rename to nesta/core/batchables/eurito/companies_eu/run.py
index 652f3752..5c63436c 100644
--- a/nesta/core/batchables/eurito/crunchbase_eu/run.py
+++ b/nesta/core/batchables/eurito/companies_eu/run.py
@@ -61,10 +61,7 @@ def run():
     eu_countries = get_eu_countries()
 
     # es setup
-    strans_kwargs={'filename':'eurito/crunchbase-eu.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
diff --git a/nesta/core/batchables/eurito/cordis_eu/run.py b/nesta/core/batchables/eurito/cordis_eu/run.py
index 01b2b948..d64fa1d2 100644
--- a/nesta/core/batchables/eurito/cordis_eu/run.py
+++ b/nesta/core/batchables/eurito/cordis_eu/run.py
@@ -88,9 +88,7 @@ def run():
 
     # es setup
     logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/cordis-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'cordis.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -132,8 +130,8 @@ def run():
     if 'BATCHPAR_outinfo' not in os.environ:
         from nesta.core.orms.orm_utils import setup_es
         from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
-        es, es_config = setup_es('dev', True, True,
-                                 dataset='cordis-eu')
+        es, es_config = setup_es(production=False, endpoint='eurito',
+                                 dataset='cordis', drop_and_recreate=True)
         environ = {'config': find_filepath_from_pathstub('mysqldb.config'),
                    'batch_file' : ('cordis-eu_EURITO-ElasticsearchTask-'
                                    '2020-04-10-True-15865345336407135.json'),
diff --git a/nesta/core/batchables/eurito/patstat-eu/run.py b/nesta/core/batchables/eurito/patstat-eu/run.py
deleted file mode 100644
index f45d7953..00000000
--- a/nesta/core/batchables/eurito/patstat-eu/run.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from ast import literal_eval
-import boto3
-import json
-import logging
-import os
-
-from nesta.core.luigihacks.elasticsearchplus import ElasticsearchPlus
-from nesta.core.luigihacks.luigi_logging import set_log_level
-from nesta.core.orms.orm_utils import db_session, get_mysql_engine
-from nesta.core.orms.orm_utils import load_json_from_pathstub
-from nesta.core.orms.orm_utils import object_to_dict
-from nesta.core.orms.patstat_eu_orm import ApplnFamily
-from nesta.core.orms.patstat_2019_05_13 import *
-from nesta.packages.geo_utils.lookup import get_eu_countries
-
-
-def select_text(objs, lang_field, text_field):
-    if len(objs) == 0:
-        return None
-    _objs = [t for t in objs if t[lang_field] == 'en']
-    if len(_objs) == 0:
-        _objs = objs
-    obj = sorted(_objs, key=lambda x: len(x), reverse=True)[0]
-    return obj[text_field]
-
-
-def metadata(orm, session, appln_ids, field_selector=None):
-    if field_selector is None:
-        field_selector = orm.appln_id
-    _filter = field_selector.in_(appln_ids)
-    return [object_to_dict(_obj) for _obj in
-            session.query(orm).filter(_filter).all()]
-
-
-def run():
-    test = literal_eval(os.environ["BATCHPAR_test"])
-    bucket = os.environ['BATCHPAR_bucket']
-    batch_file = os.environ['BATCHPAR_batch_file']
-
-    db_name = os.environ["BATCHPAR_db_name"]
-    es_host = os.environ['BATCHPAR_outinfo']
-    es_port = int(os.environ['BATCHPAR_out_port'])
-    es_index = os.environ['BATCHPAR_out_index']
-    es_type = os.environ['BATCHPAR_out_type']
-    entity_type = os.environ["BATCHPAR_entity_type"]
-    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]
-
-    # database setup
-    logging.info('Retrieving engine connection')
-    engine = get_mysql_engine("BATCHPAR_config", "mysqldb",
-                              db_name)
-    _engine = get_mysql_engine("BATCHPAR_config", "readonly",
-                               "patstat_2019_05_13")
-
-    # es setup
-    logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/patstat-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
-    es = ElasticsearchPlus(hosts=es_host,
-                           port=es_port,
-                           aws_auth_region=aws_auth_region,
-                           no_commit=("AWSBATCHTEST" in
-                                      os.environ),
-                           entity_type=entity_type,
-                           strans_kwargs=strans_kwargs,
-                           auto_translate=True,
-                           auto_translate_kwargs={'min_len':20},
-                           null_empty_str=True,
-                           coordinates_as_floats=True,
-                           do_sort=True,
-                           ngram_fields=['textBody_abstract_patent'])
-
-    # collect file
-    logging.info('Retrieving patent family ids')
-    nrows = 20 if test else None
-    s3 = boto3.resource('s3')
-    obj = s3.Object(bucket, batch_file)
-    docdb_fam_ids = json.loads(obj.get()['Body']._raw_stream.read())
-    logging.info(f"{len(docdb_fam_ids)} patent family IDs "
-                 "retrieved from s3")
-
-    eu_countries = get_eu_countries()
-
-    logging.info('Processing rows')
-    _filter = ApplnFamily.docdb_family_id.in_(docdb_fam_ids)
-    with db_session(engine) as session:
-        for obj in session.query(ApplnFamily).filter(_filter).all():
-            row = object_to_dict(obj)
-            appln_ids = row.pop('appln_id')
-            with db_session(_engine) as _session:
-                _titles = metadata(Tls202ApplnTitle, _session, appln_ids)
-                _abstrs = metadata(Tls203ApplnAbstr, _session, appln_ids)
-                ipcs = metadata(Tls209ApplnIpc, _session, appln_ids)
-                nace2s = metadata(Tls229ApplnNace2, _session, appln_ids)
-                techs = metadata(Tls230ApplnTechnField, _session, appln_ids)
-                # Get persons
-                _pers_applns = metadata(Tls207PersAppln, _session, appln_ids)
-                pers_ids = set(pa['person_id'] for pa in _pers_applns)
-                persons = metadata(Tls906Person, _session, pers_ids,
-                                   field_selector=Tls906Person.person_id)
-
-            title = select_text(_titles, 'appln_title_lg', 'appln_title')
-            abstr = select_text(_abstrs, 'appln_abstract_lg', 'appln_abstract')
-
-            # Get names from lookups
-            ipcs = list(set(i['ipc_class_symbol'].split()[0] for i in ipcs))
-            nace2s = list(set(n['nace2_code'] for n in nace2s))
-            techs = list(set(t['techn_field_nr'] for t in techs))
-            ctrys = list(set(p['person_ctry_code'] for p in persons))
-            nuts = list(set(p['nuts'] for p in persons))
-            is_eu = any(c in eu_countries for c in ctrys)
-
-            # Index the data
-            row = dict(title=title, abstract=abstr, ipc=ipcs, nace2=nace2s,
-                       tech=techs, ctry=ctrys, nuts=nuts, is_eu=is_eu, **row)
-            uid = row.pop('docdb_family_id')
-            _row = es.index(index=es_index, doc_type=es_type,
-                            id=uid, body=row)
-
-
-    logging.warning("Batch job complete.")
-
-
-if __name__ == "__main__":
-    set_log_level()
-    logging.info('Starting...')
-    run()
diff --git a/nesta/core/batchables/eurito/patstat_eu/run.py b/nesta/core/batchables/eurito/patstat_eu/run.py
index 8871850f..3638501f 100644
--- a/nesta/core/batchables/eurito/patstat_eu/run.py
+++ b/nesta/core/batchables/eurito/patstat_eu/run.py
@@ -63,9 +63,7 @@ def run():
 
     # es setup
     logging.info('Connecting to ES')
-    strans_kwargs={'filename':'eurito/patstat-eu.json',
-                   'from_key':'tier_0', 'to_key':'tier_1',
-                   'ignore':['id']}
+    strans_kwargs = {'filename': 'patstat.json', 'ignore': ['id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
diff --git a/nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py b/nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py
index 93be1e88..4c3decea 100644
--- a/nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py
+++ b/nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py
@@ -68,9 +68,7 @@ def run():
     dupes = format_duplicate_map(dupes)
 
     # Set up elastic search connection
-    field_null_mapping = load_json_from_pathstub("tier_1/"
-                                                 "field_null_mappings/",
-                                                 "health_scanner.json")
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
     es = ElasticsearchPlus(hosts=es_config['host'],
                            port=es_config['port'],
                            aws_auth_region=es_config['region'],
diff --git a/nesta/core/batchables/health_data/nih_dedupe/run.py b/nesta/core/batchables/health_data/nih_dedupe/run.py
index 3bd14f61..b9c4c3eb 100644
--- a/nesta/core/batchables/health_data/nih_dedupe/run.py
+++ b/nesta/core/batchables/health_data/nih_dedupe/run.py
@@ -61,9 +61,7 @@ def run():
     art_ids = json.loads(ids_obj.get()['Body']._raw_stream.read())
     logging.info(f'Processing {len(art_ids)} article ids')
 
-    field_null_mapping = load_json_from_pathstub(("tier_1/"
-                                                  "field_null_mappings/"),
-                                                 "health_scanner.json")
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
diff --git a/nesta/core/batchables/health_data/nih_process_data/run.py b/nesta/core/batchables/health_data/nih_process_data/run.py
index e8f0a905..dc1d9c78 100644
--- a/nesta/core/batchables/health_data/nih_process_data/run.py
+++ b/nesta/core/batchables/health_data/nih_process_data/run.py
@@ -25,7 +25,6 @@ def run():
     start_index = os.environ["BATCHPAR_start_index"]
     end_index = os.environ["BATCHPAR_end_index"]
 
-    #mysqldb_config = os.environ["BATCHPAR_config"]
     es_host = os.environ["BATCHPAR_outinfo"]
     es_port = os.environ["BATCHPAR_out_port"]
     es_index = os.environ["BATCHPAR_out_index"]
@@ -87,13 +86,8 @@ def run():
     df['total_cost_currency'] = 'USD'
 
     # output to elasticsearch
-    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
-                                                 "health_scanner.json")
-    strans_kwargs={'filename':'nih.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':['application_id']}
-
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
+    strans_kwargs = {'filename': 'nih.json', 'ignore': ['application_id']}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
@@ -143,15 +137,15 @@ def run():
     pars = {'start_index': '2001360',
             'end_index': '2003940',
             'db': 'dev',
+            'done': 'False',
             'config': (f'{os.environ["HOME"]}/nesta/nesta/'
                        'core/config/mysqldb.config'),
-            'done': 'False',
             'outinfo': ('https://search-health-scanner-'
                         '5cs7g52446h7qscocqmiky5dn4.'
                         'eu-west-2.es.amazonaws.com'),
             'out_index': 'nih_dev',
             'out_type': '_doc',
-            'out_port': '_doc',
+            'out_port': '443',
             'aws_auth_region': 'eu-west-2',
             'entity_type': 'paper',
             'test': 'False'}
diff --git a/nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py b/nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py
index a6a80ccc..e60eda35 100644
--- a/nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py
+++ b/nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py
@@ -71,13 +71,8 @@ def run():
     mesh_terms = format_mesh_terms(df_mesh)
 
     # Setup ES+
-    field_null_mapping = load_json_from_pathstub(("tier_1/"
-                                                  "field_null_mappings/"),
-                                                 "health_scanner.json")
-    strans_kwargs={'filename':'meetup.json',
-                   'from_key':'tier_0',
-                   'to_key':'tier_1',
-                   'ignore':[]}
+    field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
+    strans_kwargs = {'filename': 'meetup.json'}
     es = ElasticsearchPlus(hosts=es_host,
                            port=es_port,
                            aws_auth_region=aws_auth_region,
diff --git a/nesta/core/config/elasticsearch.config b/nesta/core/config/elasticsearch.config
deleted file mode 100644
index d1ae5c3c48022f0d9b8c3f549f3e7027c7ecce2b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2633
[base85-encoded payload omitted: the file is stored encrypted, so the binary patch body is unreadable]

diff --git a/nesta/core/config/elasticsearch.yaml b/nesta/core/config/elasticsearch.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..deae86aed21c60d04431b501b899e15b99bfac53
GIT binary patch
literal 1086
[base85-encoded payload omitted: the file is stored encrypted, so the binary patch body is unreadable]

literal 0
HcmV?d00001

diff --git a/nesta/core/luigihacks/estask.py b/nesta/core/luigihacks/estask.py
index d6c40a38..a316f3bd 100644
--- a/nesta/core/luigihacks/estask.py
+++ b/nesta/core/luigihacks/estask.py
@@ -17,6 +17,7 @@ class ElasticsearchTask(AutoBatchTask):
     Args:
         routine_id (str): Label for this routine.
        db_config_path (str): Database config path.
+        endpoint (str): AWS domain name of the ES endpoint.
         dataset (str): Name of the ES dataset.
         entity_type (str): Entity type, for :obj:`ElasticsearchPlus`.
         kwargs (dict): Any extra parameters to pass to the batchables.
@@ -27,6 +28,7 @@ class ElasticsearchTask(AutoBatchTask):
     '''
     routine_id = luigi.Parameter()
     db_config_path = luigi.Parameter('mysqldb.config')
+    endpoint = luigi.Parameter()
     dataset = luigi.Parameter()
     entity_type = luigi.Parameter()
     kwargs = luigi.DictParameter(default={})
@@ -72,10 +74,10 @@ def prepare(self):
                             " while in test mode")
 
         # Setup elasticsearch and extract all ids
-        es_mode = 'dev' if self.test else 'prod'
-        es, es_config = setup_es(es_mode, self.test,
-                                 drop_and_recreate=False,
+        es, es_config = setup_es(endpoint=self.endpoint,
                                  dataset=self.dataset,
+                                 production=not self.test,
+                                 drop_and_recreate=False,
                                  increment_version=False)
         ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
         ids = ids - self._done_ids  # Don't repeat done ids
diff --git a/nesta/core/luigihacks/sql2estask.py b/nesta/core/luigihacks/sql2estask.py
index a2f37f29..82ff11c6 100644
--- a/nesta/core/luigihacks/sql2estask.py
+++ b/nesta/core/luigihacks/sql2estask.py
@@ -31,6 +31,7 @@ class Sql2EsTask(autobatch.AutoBatchTask):
         process_batch_size (int): Number of rows to process in a batch.
         drop_and_recreate (bool): If in test mode, drop and recreate the ES index?
         dataset (str): Name of the elasticsearch dataset.
+        endpoint (str): Name of the AWS ES domain endpoint.
         id_field (SqlAlchemy selectable attribute): The ID field attribute.
         filter (SqlAlchemy conditional statement): A conditional statement,
             to be passed to query.filter(). This allows for
@@ -45,8 +46,8 @@ class Sql2EsTask(autobatch.AutoBatchTask):
     db_section = luigi.Parameter(default="mysqldb")
     process_batch_size = luigi.IntParameter(default=10000)
     drop_and_recreate = luigi.BoolParameter(default=False)
-    aliases = luigi.Parameter(default=None)
     dataset = luigi.Parameter()
+    endpoint = luigi.Parameter()
     id_field = luigi.Parameter()
     filter = luigi.Parameter(default=None)
     entity_type = luigi.Parameter()
@@ -75,11 +76,10 @@ def prepare(self):
                                        database)
 
         # Elasticsearch setup
-        es_mode = 'dev' if self.test else 'prod'
-        es, es_config = setup_es(es_mode, self.test,
-                                 self.drop_and_recreate,
+        es, es_config = setup_es(endpoint=self.endpoint,
                                  dataset=self.dataset,
-                                 aliases=self.aliases)
+                                 production=not self.test,
+                                 drop_and_recreate=self.drop_and_recreate)
 
         # Get set of existing ids from elasticsearch via scroll
         existing_ids = get_es_ids(es, es_config)
@@ -122,7 +122,7 @@ def prepare(self):
                       'routine_id': self.routine_id
                       }
             params.update(self.kwargs)
-
+            logging.info(params)
             job_params.append(params)
 
             if self.test and count > 1:
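Taken together, the estask.py and sql2estask.py changes replace the positional setup_es('dev'/'prod', test_mode, ...) call style with keyword arguments, and drop the aliases parameter (aliases are now located automatically from the endpoint's mappings directory). A minimal before/after sketch of a call site, using values taken from this patch (actually running it requires the nesta package, its config files and AWS credentials):

    from nesta.core.orms.orm_utils import setup_es

    # Before this patch: mode string, test flag and aliases passed explicitly
    #   es, es_config = setup_es('dev', True, True,
    #                            dataset='crunchbase',
    #                            aliases='health_scanner')

    # After this patch: endpoint + dataset + production flag
    es, es_config = setup_es(endpoint='health-scanner',
                             dataset='companies',
                             production=False,
                             drop_and_recreate=True)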
"number_of_shards": "5" - } - } -} diff --git a/nesta/core/orms/crunchbase-eu_es_config.json b/nesta/core/orms/crunchbase-eu_es_config.json deleted file mode 100644 index f42aef6a..00000000 --- a/nesta/core/orms/crunchbase-eu_es_config.json +++ /dev/null @@ -1,262 +0,0 @@ -{ - "mappings": { - "_doc": { - "dynamic": "strict", - "properties": { - "_cost_usd2018_organisation": { - "type": "float" - }, - "_terms_sdg_summary": { - "type": "keyword" - }, - "address_of_organisation": { - "type": "keyword" - }, - "booleanFlag_eu_organisation": { - "type": "boolean" - }, - "booleanFlag_health_organisation": { - "type": "boolean" - }, - "coordinate_of_city": { - "type": "geo_point" - }, - "cost_of_funding": { - "type": "long" - }, - "count_employee_organisation": { - "type": "keyword" - }, - "count_rounds_funding": { - "type": "integer" - }, - "currency_of_funding": { - "type": "keyword" - }, - "date_birth_organisation": { - "format": "yyyy-MM-dd", - "type": "date" - }, - "date_death_organisation": { - "format": "yyyy-MM-dd", - "type": "date" - }, - "date_last_funding": { - "format": "yyyy-MM-dd", - "type": "date" - }, - "date_updated_organisation": { - "format": "yyyy-MM-dd", - "type": "date" - }, - "id_continent_organisation": { - "type": "keyword" - }, - "id_iso2_country": { - "type": "keyword" - }, - "id_iso3_country": { - "type": "keyword" - }, - "id_isoNumeric_country": { - "type": "integer" - }, - "id_of_continent": { - "type": "keyword" - }, - "id_parent_organisation": { - "type": "keyword" - }, - "id_state_organisation": { - "type": "keyword" - }, - "metric_novelty_organisation": { - "type": "float" - }, - "name_of_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "placeName_city_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "placeName_continent_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "placeName_country_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "placeName_region_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "placeName_state_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "rank_rhodonite_organisation": { - "type": "float" - }, - "status_of_organisation": { - "type": "keyword" - }, - "terms_alias_organisation": { - "type": "keyword" - }, - "terms_category_organisation": { - "type": "keyword" - }, - "terms_mesh_description": { - "analyzer": "mesh_terms_analyzer", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "terms_of_countryTags": { - "type": "keyword" - }, - "terms_of_funders": { - "analyzer": "mesh_terms_analyzer", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "terms_roles_organisation": { - "analyzer": "mesh_terms_analyzer", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "terms_subcategory_organisation": { - "analyzer": "mesh_terms_analyzer", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "terms_tokens_entity": { - "type": "keyword" - }, - "textBody_descriptive_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "textBody_summary_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "type_of_entity": { - "type": "keyword" - }, - 
"type_of_organisation": { - "type": "keyword" - }, - "url_crunchBase_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "url_facebook_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "url_linkedIn_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "url_of_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "url_twitter_organisation": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - } - } - } - }, - "settings": { - "analysis": { - "analyzer": { - "mesh_terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" - } - } - }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" - } - } -} diff --git a/nesta/core/orms/orm_utils.py b/nesta/core/orms/orm_utils.py index 5b732d3f..555ce841 100644 --- a/nesta/core/orms/orm_utils.py +++ b/nesta/core/orms/orm_utils.py @@ -18,8 +18,11 @@ import pymysql import os import json +import yaml import logging import time +from collections import defaultdict +from collections.abc import Mapping def _get_key_value(obj, key): @@ -93,68 +96,86 @@ def assert_correct_config(test, config, key): raise ValueError(f"In test mode the index '{key}' " "must end with '_dev'") +def default_to_regular(d): + """Convert nested defaultdicts to nested dicts. + This is useful when you want to throw KeyErrors, which + would be dynamically accepted otherwise. + + Args: + d (nested defaultdict): A nested defaultdict object. + Returns: + _d (nested dict): A nested dict object. + """ + if isinstance(d, defaultdict): + d = {k: default_to_regular(v) for k, v in d.items()} + return d -def setup_es(es_mode, test_mode, drop_and_recreate, - dataset, aliases=None, increment_version=False): + +def parse_es_config(increment_version): + """Retrieve the ES config for all endpoints and indexes, + including auto-version-incrementing if required. + + Args: + increment_version (bool): Move one version up? (NB: no changes to config file on disk) + Returns: + config: Elasticsearch config dict, for all endpoints and indexes. + """ + raw_config = load_yaml_from_pathstub('config', 'elasticsearch.yaml') + config = defaultdict(lambda: defaultdict(dict)) + for endpoint, endpoint_config in raw_config['endpoints'].items(): + # Build the base configuration for this endpoint + indexes = endpoint_config.pop('indexes') + base_config = raw_config['defaults'].copy() # use defaults as the base... + base_config.update(endpoint_config) # then override with endpoint settings + # Add the host to the config + scheme = base_config.pop('scheme') + _id = base_config.pop('id') + rgn = base_config['region'] + base_config['host'] = f'{scheme}://search-{endpoint}-{_id}.{rgn}.es.amazonaws.com' + for dataset, version in indexes.items(): + prod_idx = f'{dataset}_v' + str(version + increment_version) # e.g. arxiv_v1 / v2 + dev_idx = f'{dataset}_dev' + ('0' if increment_version else '') # e.g. arxiv_dev / dev0 + config[endpoint][dataset][True] = {'index': prod_idx, **base_config} # production mode + config[endpoint][dataset][False] = {'index': dev_idx, **base_config} # dev mode + return default_to_regular(config) + + +def setup_es(endpoint, dataset, production, + drop_and_recreate=False, increment_version=False): """Retrieve the ES connection, ES config and setup the index if required. Args: - es_mode (str): One of "prod" or "dev". 
diff --git a/nesta/core/orms/orm_utils.py b/nesta/core/orms/orm_utils.py
index 5b732d3f..555ce841 100644
--- a/nesta/core/orms/orm_utils.py
+++ b/nesta/core/orms/orm_utils.py
@@ -18,8 +18,11 @@
 import pymysql
 import os
 import json
+import yaml
 import logging
 import time
+from collections import defaultdict
+from collections.abc import Mapping
 
 
 def _get_key_value(obj, key):
@@ -93,68 +96,86 @@ def assert_correct_config(test, config, key):
             raise ValueError(f"In test mode the index '{key}' "
                              "must end with '_dev'")
 
+def default_to_regular(d):
+    """Convert nested defaultdicts to nested dicts.
+    This is useful when you want to throw KeyErrors, which
+    would be dynamically accepted otherwise.
+
+    Args:
+        d (nested defaultdict): A nested defaultdict object.
+    Returns:
+        _d (nested dict): A nested dict object.
+    """
+    if isinstance(d, defaultdict):
+        d = {k: default_to_regular(v) for k, v in d.items()}
+    return d
 
-def setup_es(es_mode, test_mode, drop_and_recreate,
-             dataset, aliases=None, increment_version=False):
+
+def parse_es_config(increment_version):
+    """Retrieve the ES config for all endpoints and indexes,
+    including auto-version-incrementing if required.
+
+    Args:
+        increment_version (bool): Move one version up? (NB: no changes to config file on disk)
+    Returns:
+        config: Elasticsearch config dict, for all endpoints and indexes.
+    """
+    raw_config = load_yaml_from_pathstub('config', 'elasticsearch.yaml')
+    config = defaultdict(lambda: defaultdict(dict))
+    for endpoint, endpoint_config in raw_config['endpoints'].items():
+        # Build the base configuration for this endpoint
+        indexes = endpoint_config.pop('indexes')
+        base_config = raw_config['defaults'].copy()  # use defaults as the base...
+        base_config.update(endpoint_config)  # then override with endpoint settings
+        # Add the host to the config
+        scheme = base_config.pop('scheme')
+        _id = base_config.pop('id')
+        rgn = base_config['region']
+        base_config['host'] = f'{scheme}://search-{endpoint}-{_id}.{rgn}.es.amazonaws.com'
+        for dataset, version in indexes.items():
+            prod_idx = f'{dataset}_v' + str(version + increment_version)  # e.g. arxiv_v1 / v2
+            dev_idx = f'{dataset}_dev' + ('0' if increment_version else '')  # e.g. arxiv_dev / dev0
+            config[endpoint][dataset][True] = {'index': prod_idx, **base_config}  # production mode
+            config[endpoint][dataset][False] = {'index': dev_idx, **base_config}  # dev mode
+    return default_to_regular(config)
+
+
+def setup_es(endpoint, dataset, production,
+             drop_and_recreate=False, increment_version=False):
     """Retrieve the ES connection, ES config and setup the index if required.
 
     Args:
-        es_mode (str): One of "prod" or "dev".
-        test_mode (bool): Running in test mode?
-        drop_and_recreate (bool): Drop and recreate ES index?
+        endpoint (str): Name of the AWS ES endpoint.
         dataset (str): Name of the dataset for the ES mapping.
-        aliases (str): Name of the aliases for the ES mapping.
+        production (bool): Running in production mode?
+        drop_and_recreate (bool): Drop and recreate ES index?
         increment_version (bool): Move one version up?
     Returns:
        {es, es_config}: Elasticsearch connection and config dict.
     """
-    if es_mode not in ("prod", "dev"):
-        raise ValueError("es_mode required to be one of "
-                         f"'prod' or 'dev', but '{es_mode}' provided.")
-
-    # Get and check the config
-    key = f"{dataset}_{es_mode}"
-    es_config = get_config('elasticsearch.config', key)
-    assert_correct_config(test_mode, es_config, key)
-
-    # If required, create new index from the old one
-    if increment_version:
-        old_index = es_config['index']
-        if es_mode == 'prod':
-            tag, version = re.findall(r'(\w+)(\d+)', old_index)[0]
-            new_index = f'{tag}{int(version)+1}'
-        else:
-            tag = old_index
-            new_index = f'{old_index}0'
-        es_config['index'] = new_index
-        es_config['old_index'] = old_index
-        if any((new_index == old_index,
-                not old_index.startswith(tag),
-                not new_index.startswith(tag),
-                len(new_index) - len(old_index) > 1)):
-            raise ValueError('Could not create a new valid '
-                             f'index from {old_index}. Tried, '
-                             f'but got {new_index}.')
-
+    es_master_config = parse_es_config(increment_version)
+    es_config = es_master_config[endpoint][dataset][production]
     # Make the ES connection
     es = Elasticsearch(es_config['host'],
                        port=es_config['port'],
                        use_ssl=True,
                        send_get_body_as='POST')
-    # Drop the index if required (must be in test mode to do this)
-    _index = es_config['index']
-    exists = es.indices.exists(index=_index)
-    if drop_and_recreate and test_mode and exists:
-        es.indices.delete(index=_index)
+    # Does the index already exist?
+    index = es_config['index']
+    exists = es.indices.exists(index=index)
+    # Drop index for fresh recreation (if in test mode)
+    if drop_and_recreate and (not production) and exists:
+        es.indices.delete(index=index)
         exists = False
     # Create the index if required
     if not exists:
-        mapping = get_es_mapping(dataset, aliases=aliases)
-        es.indices.create(index=_index, body=mapping)
+        mapping = get_es_mapping(dataset, endpoint)
+        es.indices.create(index=index, body=mapping)
     return es, es_config
 
+
 def get_es_ids(es, es_config, size=1000, query={}):
     '''Get all existing ES document ids for a given config
-
+
     Args:
         es: Elasticsearch connection.
         es_config (dict): Elasticsearch configuration.
@@ -188,45 +209,137 @@ def load_json_from_pathstub(pathstub, filename, sort_on_load=True):
     return js
 
 
-def get_es_mapping(dataset, aliases):
-    '''Get the configuration from a file in the luigi config path
-    directory, and convert the key-value pairs under the config :code:`header`
-    into a `dict`.
+def load_yaml_from_pathstub(pathstub, filename):
+    """Basic wrapper around :obj:`find_filepath_from_pathstub`
+    which also opens the file (assumed to be yaml).
+
+    Args:
+        pathstub (str): Stub of filepath where the file should be found.
+        filename (str): The filename.
+    Returns:
+        The file contents as a json-like object.
+    """
+    _path = find_filepath_from_pathstub(pathstub)
+    _path = os.path.join(_path, filename)
+    with open(_path) as f:
+        return yaml.safe_load(f)
 
-    Parameters:
-        file_name (str): The configuation file name.
-        header (str): The header key in the config file.
 
+def update_nested(original_dict, update_dict):
+    """Update a nested dictionary with another nested dictionary.
+    Has equivalent behaviour to :obj:`dict.update(self, update_dict)`.
+
+    Args:
+        original_dict (dict): The original dictionary to update.
+        update_dict (dict): The dictionary from which to extract updates.
     Returns:
-        :obj:`dict`
-    '''
-    # Get the mapping and lookup
-    mapping = load_json_from_pathstub("core/orms/",
-                                      f"{dataset}_es_config.json")
-    alias_lookup = {}
-    if aliases is not None:
-        alias_lookup = load_json_from_pathstub("tier_1/aliases/",
-                                               f"{aliases}.json")
-    # Get a list of valid fields for verification
-    fields = mapping["mappings"]["_doc"]["properties"].keys()
-    # Add any aliases to the mapping
+        original_dict (dict): The original dictionary after updates.
+    """
+    for k, v in update_dict.items():
+        if isinstance(v, Mapping):  # Mapping ~= any dict-like object
+            original_dict[k] = update_nested(original_dict.get(k, {}), v)
+        else:
+            original_dict[k] = v
+    return original_dict
+
+
+def _get_es_mapping(dataset, endpoint):
+    """Sequentially apply the mappings from the defaults, then the
+    dataset and finally the endpoint. None of these files is strictly
+    required to exist, so an endpoint could conceivably have a dataset
+    unique to itself.
+
+    Args:
+        dataset (str): Name of the dataset for the ES mapping.
+        endpoint (str): Name of the AWS ES endpoint.
+    Returns:
+        :obj:`dict`: The constructed mapping.
+    """
+    mapping = {}
+    for _path, _prefix in [('defaults', 'defaults'),
+                           ('datasets', f'{dataset}_mapping'),
+                           (f'endpoints/{endpoint}', f'{dataset}_mapping')]:
+        try:
+            _mapping = load_json_from_pathstub(f"mappings/{_path}", f"{_prefix}.json")
+        except json.JSONDecodeError as exc:
+            raise ValueError(f'Could not decode "mappings/{_path}/{_prefix}.json"') from exc
+        except FileNotFoundError:
+            continue
+        update_nested(mapping, _mapping)
+    return mapping
+
+
+def _apply_alias(mapping, dataset, endpoint):
+    """Dynamically apply aliases to an Elasticsearch mapping. Note that
+    the mapping is changed in-place.
+
+    Args:
+        mapping (dict): An ES mapping.
+        dataset (str): Name of the dataset for this ES mapping.
+        endpoint (str): Name of the AWS ES endpoint.
+    """
+    ep_path = f"mappings/endpoints/{endpoint}"
+    # Load an alias, if it exists
+    try:
+        alias_lookup = load_json_from_pathstub(ep_path, "aliases.json")
+    except FileNotFoundError:
+        return
+    # Check whether this is a soft or hard alias
+    try:
+        config = load_yaml_from_pathstub(ep_path, "config.yaml")
+        hard_alias = config['hard-alias']
+    except (FileNotFoundError, KeyError):
+        hard_alias = False
+    # Apply the aliases to the mapping properties
+    propts = mapping["mappings"]["_doc"]["properties"]
+    _fields = set()
     for alias, lookup in alias_lookup.items():
         if dataset not in lookup:
             continue
-        # Validate the field
         field = lookup[dataset]
-        if field not in fields:
-            raise ValueError(f"Alias '{alias}' to '{field}' but '{field}'"
-                             "does not exist in the mapping.")
-        # Add the alias to the mapping
-        value = {"type": "alias", "path": lookup[dataset]}
-        mapping["mappings"]["_doc"]["properties"][alias] = value
+        propts[alias] = (propts[field] if hard_alias  # New field same as old for 'hard-alias'
+                         else {"type": "alias", "path": field})  # Otherwise use an ES alias
+        _fields.add(field)
+    # Remove old fields if 'hard alias'
+    if hard_alias:
+        for f in _fields:
+            propts.pop(f)
+
+
+def _prune_nested(mapping):
+    """Recursively remove any fields with null values from
+    a nested dictionary. The input is changed in-place.
+
+    Args:
+        mapping (dict): The dictionary to prune.
+    """
+    for k in list(mapping.keys()):
+        v = mapping[k]
+        if isinstance(v, Mapping):  # Mapping ~= any dict-like
+            _prune_nested(v)
+        elif v is None:
+            mapping.pop(k)
+
+
+def get_es_mapping(dataset, endpoint):
+    '''Load the ES mapping for this dataset and endpoint,
+    including aliases.
+
+    Args:
+        dataset (str): Name of the dataset for the ES mapping.
+        endpoint (str): Name of the AWS ES endpoint.
+    Returns:
+        :obj:`dict`
+    '''
+    mapping = _get_es_mapping(dataset, endpoint)
+    _apply_alias(mapping, dataset, endpoint)
+    _prune_nested(mapping)  # prunes any nested keys with null values
     return mapping
 
 
 def cast_as_sql_python_type(field, data):
     """Cast the data to ensure that it is the python type expected by SQL
-
+
     Args:
         field (SqlAlchemy field): SqlAlchemy field, to cast the data
         data: A data field to be cast
@@ -241,7 +354,7 @@ def cast_as_sql_python_type(field, data):
     return _data
 
 
-def filter_out_duplicates(db_env, section, database,
+def filter_out_duplicates(db_env, section, database,
                           Base, _class, data,
                           low_memory=False):
     """Produce a filtered list of data, exluding duplicates and entries that
@@ -303,7 +416,7 @@ def _filter_out_duplicates(session, Base, _class, data,
 
     # Read all pks if in low_memory mode
     if low_memory and not is_auto_pkey:
-        fields = [getattr(_class, pkey.name)
+        fields = [getattr(_class, pkey.name)
                   for pkey in pkey_cols]
         all_pks = set(session.query(*fields).all())
 
@@ -359,11 +472,11 @@ def insert_data(db_env, section, database, Base,
         :obj:`list` of :obj:`dict` data which could not be imported (optional)
 
     """
-    response = filter_out_duplicates(db_env=db_env,
+    response = filter_out_duplicates(db_env=db_env,
                                      section=section,
                                      database=database,
-                                     Base=Base,
-                                     _class=_class,
+                                     Base=Base,
+                                     _class=_class,
                                      data=data,
                                      low_memory=low_memory)
     objs, existing_objs, failed_objs = response
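The helpers above (update_nested, _get_es_mapping, _prune_nested) give each endpoint a layered mapping: the defaults are applied first, then the dataset mapping, then the endpoint override, and null values prune inherited fields. A self-contained toy walk-through of those merge semantics; the field names are illustrative and the helper is re-implemented here rather than imported from orm_utils:

    from collections.abc import Mapping

    def update_nested(original, update):
        # Recursive dict.update, as in orm_utils.update_nested
        for k, v in update.items():
            if isinstance(v, Mapping):
                original[k] = update_nested(original.get(k, {}), v)
            else:
                original[k] = v
        return original

    defaults = {'mappings': {'_doc': {'dynamic': 'strict', 'properties': {}}}}
    dataset = {'mappings': {'_doc': {'properties': {
        'year_of_article': {'type': 'integer'},
        'fos': {'type': 'keyword'}}}}}
    endpoint = {'mappings': {'_doc': {'properties': {'fos': None}}}}  # null removes 'fos'

    mapping = {}
    for _mapping in (defaults, dataset, endpoint):
        update_nested(mapping, _mapping)

    # Prune null values, mirroring _prune_nested
    props = mapping['mappings']['_doc']['properties']
    mapping['mappings']['_doc']['properties'] = {k: v for k, v in props.items()
                                                 if v is not None}
    print(mapping)
    # {'mappings': {'_doc': {'dynamic': 'strict',
    #                        'properties': {'year_of_article': {'type': 'integer'}}}}}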
diff --git a/nesta/core/orms/tests/test_orm_utils.py b/nesta/core/orms/tests/test_orm_utils.py
index 4defe898..d3130c3c 100644
--- a/nesta/core/orms/tests/test_orm_utils.py
+++ b/nesta/core/orms/tests/test_orm_utils.py
@@ -26,36 +26,8 @@
 from nesta.core.orms.orm_utils import db_session_query
 from nesta.core.orms.orm_utils import cast_as_sql_python_type
 
-@pytest.fixture
-def alias_lookup():
-    return {
-        "alias1": {
-            "dataset1": "field1a",
-            "dataset2": "field1b"
-        },
-        "alias2": {
-            "dataset1": "field2a",
-            "dataset2": "field2b"
-        }
-    }
-
-@pytest.fixture
-def mapping():
-    return {
-        'mappings': {
-            '_doc': {
-                'properties': {
-                    'field1a': {'type': 'keyword'},
-                    'field2a': {'type': 'text'},
-                }
-            }
-        }
-    }
-
 Base = declarative_base()
 
-
 class DummyModel(Base):
     __tablename__ = 'dummy_model'
 
@@ -229,109 +201,89 @@ def test_db_session_query(self):
         assert n_rows == len(parents) == 1000
 
 def test_load_json_from_pathstub():
-    for ds in ["nih", "crunchbase"]:
-        js = load_json_from_pathstub("core/orms/",
-                                     f"{ds}_es_config.json")
+    for ds in ["nih", "companies"]:
+        js = load_json_from_pathstub("datasets/",
+                                     f"{ds}.json")
         assert len(js) > 0
 
-@mock.patch("nesta.core.orms.orm_utils.load_json_from_pathstub")
-def test_get_es_mapping(mocked_load_json_from_pathstub, alias_lookup,
-                        mapping):
-    mocked_load_json_from_pathstub.side_effect = (mapping,
-                                                  alias_lookup)
-    _mapping = get_es_mapping("dataset1", "blah")
-    alias1 = _mapping["mappings"]["_doc"]["properties"].pop("alias1")
-    alias2 = _mapping["mappings"]["_doc"]["properties"].pop("alias2")
-    assert mapping == _mapping
-    assert alias1 == {'type': 'alias', 'path': 'field1a'}
-    assert alias2 == {'type': 'alias', 'path': 'field2a'}
-
-@mock.patch("nesta.core.orms.orm_utils.load_json_from_pathstub")
-def test_get_es_mapping_bad_alias(mocked_load_json_from_pathstub,
-                                  alias_lookup, mapping):
-    mocked_load_json_from_pathstub.side_effect = (mapping,
-                                                  alias_lookup)
-    with pytest.raises(ValueError):
-        get_es_mapping("dataset2", "blah")
-
-@mock.patch("nesta.core.orms.orm_utils.get_config")
-@mock.patch("nesta.core.orms.orm_utils.assert_correct_config")
-@mock.patch("nesta.core.orms.orm_utils.Elasticsearch")
-@mock.patch("nesta.core.orms.orm_utils.get_es_mapping")
-def test_setup_es_bad_es_mode(mock_get_es_mapping, mock_Elasticsearch,
-                              mock_assert_correct_config, mock_get_config):
-    with pytest.raises(ValueError):
-        setup_es(es_mode="dave", test_mode=False, drop_and_recreate=False,
-                 dataset=None, aliases=None)
-
-
-@mock.patch("nesta.core.orms.orm_utils.get_config")
-@mock.patch("nesta.core.orms.orm_utils.assert_correct_config")
-@mock.patch("nesta.core.orms.orm_utils.Elasticsearch")
-@mock.patch("nesta.core.orms.orm_utils.get_es_mapping")
-def test_setup_es_true_test_delete_called(mock_get_es_mapping,
+PATH = "nesta.core.orms.orm_utils.{}"
+@mock.patch(PATH.format("get_config"))
+@mock.patch(PATH.format("assert_correct_config"))
+@mock.patch(PATH.format("Elasticsearch"))
+@mock.patch(PATH.format("get_es_mapping"))
+@mock.patch(PATH.format("parse_es_config"))
+def test_setup_es_true_test_delete_called(mock_parse_es_config,
+                                          mock_get_es_mapping,
                                           mock_Elasticsearch,
                                           mock_assert_correct_config,
                                           mock_get_config):
     mock_Elasticsearch.return_value.indices.exists.return_value = True
-    setup_es(es_mode="dev", test_mode=True, drop_and_recreate=True,
-             dataset=None, aliases=None)
+    setup_es(endpoint='arxlive', dataset='arxiv', production=False,
+             drop_and_recreate=True)
     assert mock_Elasticsearch.return_value.indices.delete.call_count == 1
     assert mock_Elasticsearch.return_value.indices.create.call_count == 1
 
-@mock.patch("nesta.core.orms.orm_utils.get_config")
-@mock.patch("nesta.core.orms.orm_utils.assert_correct_config")
-@mock.patch("nesta.core.orms.orm_utils.Elasticsearch")
-@mock.patch("nesta.core.orms.orm_utils.get_es_mapping")
-def test_setup_es_true_test_delete_not_called_not_exists(mock_get_es_mapping,
+@mock.patch(PATH.format("get_config"))
+@mock.patch(PATH.format("assert_correct_config"))
+@mock.patch(PATH.format("Elasticsearch"))
+@mock.patch(PATH.format("get_es_mapping"))
+@mock.patch(PATH.format("parse_es_config"))
+def test_setup_es_true_test_delete_not_called_not_exists(mock_parse_es_config,
+                                                         mock_get_es_mapping,
                                                          mock_Elasticsearch,
                                                          mock_assert_correct_config,
                                                          mock_get_config):
     mock_Elasticsearch.return_value.indices.exists.return_value = False
-    setup_es(es_mode="dev", test_mode=True, drop_and_recreate=True,
-             dataset=None, aliases=None)
+    setup_es(drop_and_recreate=True, production=False,
+             endpoint='arxlive', dataset='arxiv')
     assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
     assert mock_Elasticsearch.return_value.indices.create.call_count == 1
 
-@mock.patch("nesta.core.orms.orm_utils.get_config")
-@mock.patch("nesta.core.orms.orm_utils.assert_correct_config")
-@mock.patch("nesta.core.orms.orm_utils.Elasticsearch")
-@mock.patch("nesta.core.orms.orm_utils.get_es_mapping")
-def test_setup_es_false_test_delete_not_called(mock_get_es_mapping,
+@mock.patch(PATH.format("get_config"))
+@mock.patch(PATH.format("assert_correct_config"))
+@mock.patch(PATH.format("Elasticsearch"))
+@mock.patch(PATH.format("get_es_mapping"))
+@mock.patch(PATH.format("parse_es_config"))
+def test_setup_es_false_test_delete_not_called(mock_parse_es_config,
+                                               mock_get_es_mapping,
                                                mock_Elasticsearch,
                                                mock_assert_correct_config,
                                                mock_get_config):
     mock_Elasticsearch.return_value.indices.exists.return_value = False
-    setup_es(es_mode="dev", test_mode=False, drop_and_recreate=True,
-             dataset=None, aliases=None)
+    setup_es(drop_and_recreate=True, production=False,
+             endpoint='arxlive', dataset='arxiv')
     assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
     assert mock_Elasticsearch.return_value.indices.create.call_count == 1
 
-@mock.patch("nesta.core.orms.orm_utils.get_config")
-@mock.patch("nesta.core.orms.orm_utils.assert_correct_config")
-@mock.patch("nesta.core.orms.orm_utils.Elasticsearch")
-@mock.patch("nesta.core.orms.orm_utils.get_es_mapping")
-def test_setup_es_false_reindex_delete_not_called(mock_get_es_mapping,
+@mock.patch(PATH.format("get_config"))
+@mock.patch(PATH.format("assert_correct_config"))
+@mock.patch(PATH.format("Elasticsearch"))
+@mock.patch(PATH.format("get_es_mapping"))
+@mock.patch(PATH.format("parse_es_config"))
+def test_setup_es_false_reindex_delete_not_called(mock_parse_es_config,
+                                                  mock_get_es_mapping,
                                                   mock_Elasticsearch,
                                                   mock_assert_correct_config,
                                                   mock_get_config):
     mock_Elasticsearch.return_value.indices.exists.return_value = False
-    setup_es(es_mode="dev", test_mode=True, drop_and_recreate=False,
-             dataset=None, aliases=None)
+    setup_es(drop_and_recreate=False, production=False,
+             endpoint='arxlive', dataset='arxiv')
     assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
     assert mock_Elasticsearch.return_value.indices.create.call_count == 1
 
-@mock.patch("nesta.core.orms.orm_utils.get_config")
-@mock.patch("nesta.core.orms.orm_utils.assert_correct_config")
-@mock.patch("nesta.core.orms.orm_utils.Elasticsearch")
-@mock.patch("nesta.core.orms.orm_utils.get_es_mapping")
-def test_setup_es_no_create_if_exists(mock_get_es_mapping,
+@mock.patch(PATH.format("get_config"))
+@mock.patch(PATH.format("assert_correct_config"))
+@mock.patch(PATH.format("Elasticsearch"))
+@mock.patch(PATH.format("get_es_mapping"))
+@mock.patch(PATH.format("parse_es_config"))
+def test_setup_es_no_create_if_exists(mock_parse_es_config,
+                                      mock_get_es_mapping,
                                       mock_Elasticsearch,
                                       mock_assert_correct_config,
                                       mock_get_config):
     mock_Elasticsearch.return_value.indices.exists.return_value = True
-    setup_es(es_mode="dev", test_mode=True, drop_and_recreate=False,
-             dataset=None, aliases=None)
+    setup_es(drop_and_recreate=False, production=False,
+             endpoint='arxlive', dataset='arxiv')
     assert mock_Elasticsearch.return_value.indices.delete.call_count == 0
     assert mock_Elasticsearch.return_value.indices.create.call_count == 0
 
@@ -392,9 +344,8 @@ def test_merge_metadata_with_three_bases(primary_base, secondary_base, tertiary_
                                 'second_table', 'third_table']
 
-@mock.patch("nesta.core.orms.orm_utils.scan",
-            return_value=[{'_id':1},{'_id':1},
-                          {'_id':22.3},{'_id':3.3}]*134)
+@mock.patch(PATH.format("scan"), return_value=[{'_id':1},{'_id':1},
+                                               {'_id':22.3},{'_id':3.3}]*134)
 def test_get_es_ids(mocked_scan):
     ids = get_es_ids(mock.MagicMock(), mock.MagicMock())
     assert ids == {1, 22.3, 3.3}
setup_es(es_mode, self.test, - drop_and_recreate=False, + es, es_config = setup_es(endpoint=self.dataset, dataset=self.dataset, + production=not self.test, + drop_and_recreate=False, increment_version=False) field = "terms_tokens_article" ids = get_es_ids(es, es_config, size=10000, diff --git a/nesta/core/routines/arxiv/arxiv_lolvelty.py b/nesta/core/routines/arxiv/arxiv_lolvelty.py index b432d9c7..7579b17e 100644 --- a/nesta/core/routines/arxiv/arxiv_lolvelty.py +++ b/nesta/core/routines/arxiv/arxiv_lolvelty.py @@ -25,10 +25,10 @@ class ArxivElasticsearchTask(ElasticsearchTask): grid_task_kwargs = DictParameterPlus(default={}) def done_ids(self): - es_mode = 'dev' if self.test else 'prod' - es, es_config = setup_es(es_mode, self.test, - drop_and_recreate=False, + es, es_config = setup_es(endpoint=self.endpoint, dataset=self.dataset, + production=not self.test, + drop_and_recreate=False, increment_version=False) field = "metric_novelty_article" ids = get_es_ids(es, es_config, size=10000, @@ -42,6 +42,7 @@ def requires(self): process_batch_size=10000, drop_and_recreate=self.drop_and_recreate, dataset='arxiv', + endpoint='arxlive', id_field=Article.id, filter=Article.article_source == 'arxiv', entity_type='article', @@ -54,8 +55,7 @@ def requires(self): env_files=[f3p('nesta/'), f3p('config/' 'mysqldb.config'), - f3p('schema_transformations/' 'arxiv.json'), + f3p('datasets/arxiv.json'), f3p('config/' 'elasticsearch.config')], job_def='py36_amzn1_image', @@ -86,6 +86,7 @@ def requires(self): test=test, index=index, dataset='arxiv', + endpoint='arxlive', entity_type='article', kwargs=kwargs, batchable=f3p("batchables/novelty" diff --git a/nesta/core/routines/arxiv/arxiv_root_task.py b/nesta/core/routines/arxiv/arxiv_root_task.py index 73c2f0dd..e15328f3 100644 --- a/nesta/core/routines/arxiv/arxiv_root_task.py +++ b/nesta/core/routines/arxiv/arxiv_root_task.py @@ -66,19 +66,14 @@ def requires(self): test = not self.production routine_id = f"ArxivLolveltyTask-{self.date}-{test}" - # Elasticsearch setup - dataset = 'arxiv' - _, es_config = setup_es('prod' if self.production else 'dev', - not self.production, - self.drop_and_recreate, - dataset=dataset) yield ArxivElasticsearchTask(date=self.date, process_batch_size=1000, routine_id=routine_id, grid_task_kwargs=grid_task_kwargs, test=not self.production, - index=es_config['index'], + drop_and_recreate=self.drop_and_recreate, dataset='arxiv', + endpoint='arxlive', entity_type='article', kwargs=kwargs, batchable=f3p("batchables/novelty" @@ -134,6 +129,7 @@ def requires(self): process_batch_size=10000, drop_and_recreate=self.drop_and_recreate, dataset='arxiv', + endpoint='arxlive', id_field=Article.id, entity_type='article', db_config_env='MYSQLDB', @@ -145,8 +141,7 @@ def requires(self): env_files=[f3p('nesta/'), f3p('config/' 'mysqldb.config'), - f3p('schema_transformations/' 'arxiv.json'), + f3p('datasets/arxiv.json'), f3p('config/' 'elasticsearch.config')], job_def='py36_amzn1_image', diff --git a/nesta/core/routines/crunchbase/crunchbase_elasticsearch_task.py b/nesta/core/routines/crunchbase/crunchbase_elasticsearch_task.py index df9d9af6..6ecebac3 100644 --- a/nesta/core/routines/crunchbase/crunchbase_elasticsearch_task.py +++ b/nesta/core/routines/crunchbase/crunchbase_elasticsearch_task.py @@ -78,10 +78,10 @@ def prepare(self): self.database) # Elasticsearch setup - es_mode = 'dev' if self.test else 'prod' - es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate, - dataset='crunchbase', -
aliases='health_scanner') + es, es_config = setup_es(endpoint='health-scanner', + dataset='companies', + production=not self.test, + drop_and_recreate=self.drop_and_recreate) # Get set of existing ids from elasticsearch via scroll scanner = scan(es, query={"_source": False}, diff --git a/nesta/core/routines/crunchbase/crunchbase_lolvelty.py b/nesta/core/routines/crunchbase/crunchbase_lolvelty.py index 236e2e5e..621c569b 100644 --- a/nesta/core/routines/crunchbase/crunchbase_lolvelty.py +++ b/nesta/core/routines/crunchbase/crunchbase_lolvelty.py @@ -37,7 +37,8 @@ def requires(self): return LazyElasticsearchTask(routine_id=routine_id, test=test, index=index, - dataset='crunchbase', + dataset='companies', + endpoint='health-scanner', entity_type='company', kwargs=kwargs, batchable=f3p("batchables/novelty/lolvelty"), diff --git a/nesta/core/routines/crunchbase/crunchbase_root_task.py b/nesta/core/routines/crunchbase/crunchbase_root_task.py index 7eb54db5..d46c864b 100644 --- a/nesta/core/routines/crunchbase/crunchbase_root_task.py +++ b/nesta/core/routines/crunchbase/crunchbase_root_task.py @@ -45,7 +45,7 @@ def requires(self): batchable=f3p("core/batchables/crunchbase/crunchbase_elasticsearch"), env_files=[f3p("nesta/"), f3p("config/mysqldb.config"), - f3p("schema_transformations/crunchbase_organisation_members.json"), + f3p("datasets/companies.json"), f3p("config/elasticsearch.config")], job_def="py36_amzn1_image", job_name=f"CrunchBaseElasticsearchTask-{_routine_id}", diff --git a/nesta/core/routines/eurito_es/es_root.py b/nesta/core/routines/eurito_es/es_root.py index f4c2bd69..90ee0de4 100644 --- a/nesta/core/routines/eurito_es/es_root.py +++ b/nesta/core/routines/eurito_es/es_root.py @@ -22,10 +22,11 @@ def kwarg_maker(dataset, routine_id): env_files=[f3p('config/mysqldb.config'), f3p('config/elasticsearch.config'), - f3p('schema_transformations/eurito/'), + f3p(f'tier_1/datasets/{dataset}.json'), f3p('nesta')] batchable=f3p(f'batchables/eurito/{dataset}_eu') - return dict(dataset=f'{dataset}-eu', + return dict(dataset=dataset, + endpoint='eurito-dev', routine_id=f'{dataset}-eu_{routine_id}', env_files=env_files, batchable=batchable) @@ -56,7 +57,7 @@ def requires(self): intermediate_bucket=S3_BUCKET) params = (('arxiv', 'article', Article.id), - ('crunchbase', 'company', Organization.id), + ('companies', 'company', Organization.id), ('patstat', 'patent', ApplnFamily.docdb_family_id), ('cordis', 'project', Project.rcn),) for dataset, entity_type, id_field in params: diff --git a/nesta/core/routines/health_data/nih_data/nih_abstracts_mesh_task.py b/nesta/core/routines/health_data/nih_data/nih_abstracts_mesh_task.py index 40046baf..0da21aad 100644 --- a/nesta/core/routines/health_data/nih_data/nih_abstracts_mesh_task.py +++ b/nesta/core/routines/health_data/nih_data/nih_abstracts_mesh_task.py @@ -134,11 +134,10 @@ def prepare(self): db = 'production' if not self.test else 'dev' # elasticsearch setup - es_mode = 'dev' if self.test else 'prod' - es, es_config = setup_es(es_mode, self.test, - drop_and_recreate=False, + es, es_config = setup_es(endpoint='health-scanner', dataset='nih', - aliases='health_scanner') + production=not self.test, + drop_and_recreate=False) # s3 setup and file key collection bucket = 'innovation-mapping-general' diff --git a/nesta/core/routines/health_data/nih_data/nih_dedupe_task.py b/nesta/core/routines/health_data/nih_data/nih_dedupe_task.py index cd1038b3..f2e53a8a 100644 --- a/nesta/core/routines/health_data/nih_data/nih_dedupe_task.py +++ 
b/nesta/core/routines/health_data/nih_data/nih_dedupe_task.py @@ -32,7 +32,7 @@ class DedupeTask(autobatch.AutoBatchTask): def output(self): '''Points to the output database engine''' - db_config = get_config(self.db_config_path, + db_config = get_config(self.db_config_path, "mysqldb") db_config["database"] = ('dev' if self.test else 'production') @@ -68,33 +68,30 @@ def prepare(self): f"{self.process_batch_size}" " while in test mode") - es_mode = 'dev' if self.test else 'prod' - es, es_config = setup_es(es_mode, self.test, - self.drop_and_recreate, - dataset='nih', - aliases='health_scanner', - increment_version=True) + es_kwargs = dict(endpoint='health-scanner', + dataset='nih', production=not self.test) + _, _old_config = setup_es(**es_kwargs) + es, es_config = setup_es(drop_and_recreate=self.drop_and_recreate, + increment_version=True, **es_kwargs) # Count articles from the old index - _old_config = es_config.copy() - _old_config['index'] = es_config['old_index'] logging.info(f"Collected article IDs...") _ids = get_es_ids(es, _old_config, size=10000) logging.info(f"Collected {len(_ids)} IDs") done_ids = get_es_ids(es, es_config, size=10000) # Generate the job params - job_params = [] + job_params = [] batches = split_batches(_ids, self.process_batch_size) for count, batch in enumerate(batches, 1): # Magical '0.3' is the lower end of the deduplication # fraction found by inspection - done = sum(_id in done_ids + done = sum(_id in done_ids for _id in batch) / len(batch) > 0.3 # write batch of ids to s3 batch_file = '' if not done: - batch_file = put_s3_batch(batch, + batch_file = put_s3_batch(batch, self.intermediate_bucket, self.routine_id) params = { @@ -105,13 +102,13 @@ def prepare(self): 'outinfo': es_config['host'], 'out_port': es_config['port'], 'out_index': es_config['index'], - 'in_index': es_config['old_index'], + 'in_index': _old_config['index'], 'out_type': es_config['type'], 'aws_auth_region': es_config['region'], 'entity_type': 'paper', 'test': self.test, 'routine_id': self.routine_id - } + } job_params.append(params) if self.test and count > 1: diff --git a/nesta/core/routines/health_data/nih_data/nih_lolvelty.py b/nesta/core/routines/health_data/nih_data/nih_lolvelty.py index edb4a98c..f42555a5 100644 --- a/nesta/core/routines/health_data/nih_data/nih_lolvelty.py +++ b/nesta/core/routines/health_data/nih_data/nih_lolvelty.py @@ -24,6 +24,7 @@ def requires(self): test=test, index=index, dataset='nih', + endpoint='health-scanner', entity_type='paper', kwargs=kwargs, batchable=f3p("batchables/novelty/lolvelty"), diff --git a/nesta/core/routines/health_data/nih_data/nih_process_task.py b/nesta/core/routines/health_data/nih_data/nih_process_task.py index 7e3cf64a..6b58cf1e 100644 --- a/nesta/core/routines/health_data/nih_data/nih_process_task.py +++ b/nesta/core/routines/health_data/nih_data/nih_process_task.py @@ -102,10 +102,10 @@ def prepare(self): project_query = session.query(Projects) # elasticsearch setup - es_mode = 'dev' if self.test else 'prod' - es, es_config = setup_es(es_mode, self.test, self.drop_and_recreate, + es, es_config = setup_es(endpoint='health-scanner', dataset='nih', - aliases='health_scanner') + production=not self.test, + drop_and_recreate=self.drop_and_recreate) batches = self.batch_limits(project_query, BATCH_SIZE) job_params = [] diff --git a/nesta/core/routines/meetup/health_tagging/health_meetup_es_task.py b/nesta/core/routines/meetup/health_tagging/health_meetup_es_task.py index 2a60d862..315bb7e4 100644 --- 
a/nesta/core/routines/meetup/health_tagging/health_meetup_es_task.py +++ b/nesta/core/routines/meetup/health_tagging/health_meetup_es_task.py @@ -72,8 +72,8 @@ def requires(self): date=self.date, process_batch_size=100, drop_and_recreate=self.drop_and_recreate, - aliases='health_scanner', dataset='meetup', + endpoint='health-scanner', id_field=Group.id, entity_type='meetup', core_categories=self.core_categories, @@ -85,7 +85,7 @@ def requires(self): batchable=f3p("batchables/meetup/topic_tag_elasticsearch"), env_files=[f3p("nesta/"), f3p("config/mysqldb.config"), - f3p("schema_transformations/meetup.json"), + f3p("datasets/meetup.json"), f3p("config/elasticsearch.config")], job_def="py36_amzn1_image", job_name=f"MeetupHealthSql2EsTask-{routine_id}", diff --git a/nesta/core/routines/meetup/health_tagging/meetup_lolvelty.py b/nesta/core/routines/meetup/health_tagging/meetup_lolvelty.py index 7fd57c08..10528b5e 100644 --- a/nesta/core/routines/meetup/health_tagging/meetup_lolvelty.py +++ b/nesta/core/routines/meetup/health_tagging/meetup_lolvelty.py @@ -35,6 +35,7 @@ def requires(self): test=test, index=index, dataset='meetup', + endpoint='health-scanner', entity_type='meetup', kwargs=kwargs, batchable=f3p("batchables/novelty/lolvelty"), diff --git a/nesta/core/schemas/README.rst b/nesta/core/schemas/README.rst index f61af8e6..1498c7a3 100644 --- a/nesta/core/schemas/README.rst +++ b/nesta/core/schemas/README.rst @@ -21,6 +21,7 @@ Valid examples are :code:`date_start_project` and :code:`title_of_project`. Tier 0 fields are implicitly excluded from tier 1 if they are missing from the :code:`schema_transformation` file. Tier 1 schema field names are applied via `nesta.packages.decorators.schema_transform` + Tier 2 ------ diff --git a/nesta/core/schemas/tier_1/datasets/arxiv.json b/nesta/core/schemas/tier_1/datasets/arxiv.json new file mode 100644 index 00000000..e3952667 --- /dev/null +++ b/nesta/core/schemas/tier_1/datasets/arxiv.json @@ -0,0 +1,30 @@ +{ + "entity_type": "article", + "tier0_to_tier1": { + "_fields_of_study": "terms_fieldsOfStudy_article", + "abstract": "textBody_abstract_article", + "authors": "terms_authors_article", + "categories": "terms_category_article", + "citation_count": "count_citations_article", + "countries": "terms_countries_article", + "created": "date_created_article", + "doi": "id_digitalObjectIdentifier_article", + "fields_of_study": "json_fieldsOfStudy_article", + "has_multinational": "booleanFlag_multinational_article", + "id": "id_of_article", + "institutes": "terms_institutes_article", + "is_eu": "booleanFlag_eu_article", + "nested_categories": "json_category_article", + "nested_location": "json_location_article", + "normalised_citation": "metric_citations_article", + "novelty_of_article": "metric_novelty_article", + "nuts_0": "terms_nuts0_article", + "nuts_1": "terms_nuts1_article", + "nuts_2": "terms_nuts2_article", + "nuts_3": "terms_nuts3_article", + "regions": "terms_regions_article", + "title": "title_of_article", + "tokens": "terms_tokens_article", + "year": "year_of_article" + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/datasets/companies.json b/nesta/core/schemas/tier_1/datasets/companies.json new file mode 100644 index 00000000..7374c4fa --- /dev/null +++ b/nesta/core/schemas/tier_1/datasets/companies.json @@ -0,0 +1,49 @@ +{ + "entity_type": "company", + "tier0_to_tier1": { + "_booleanFlag_autotranslated_entity": "booleanFlag_autotranslated_entity", + "_rank_rhodonite_organisation": "rank_rhodonite_organisation", +
"_terms_iso2lang_entity": "terms_iso2lang_entity", + "_terms_of_countryTags": "terms_of_countryTags", + "_total_cost_usd2018": "_cost_usd2018_organisation", + "address": "address_of_organisation", + "aliases": "terms_alias_organisation", + "category_group_list": "terms_category_organisation", + "category_list": "terms_subcategory_organisation", + "cb_url": "url_crunchBase_organisation", + "city": "placeName_city_organisation", + "closed_on": "date_death_organisation", + "company_name": "name_of_organisation", + "continent": "id_of_continent", + "coordinates": "coordinate_of_city", + "country": "placeName_country_organisation", + "country_alpha_2": "id_iso2_country", + "country_alpha_3": "id_iso3_country", + "country_numeric": "id_isoNumeric_country", + "currency_of_funding": "currency_of_funding", + "employee_count": "count_employee_organisation", + "facebook_url": "url_facebook_organisation", + "founded_on": "date_birth_organisation", + "funding_rounds": "count_rounds_funding", + "funding_total_usd": "cost_of_funding", + "homepage_url": "url_of_organisation", + "investor_names": "terms_of_funders", + "is_eu": "booleanFlag_eu_organisation", + "is_health": "booleanFlag_health_organisation", + "last_funding_on": "date_last_funding", + "linkedin_url": "url_linkedIn_organisation", + "long_description": "textBody_descriptive_organisation", + "mesh_terms": "terms_mesh_description", + "parent_id": "id_parent_organisation", + "placeName_continent_organisation": "placeName_continent_organisation", + "placeName_state_organisation": "placeName_state_organisation", + "primary_role": "type_of_organisation", + "region": "placeName_region_organisation", + "roles": "terms_roles_organisation", + "short_description": "textBody_summary_organisation", + "state_code": "id_state_organisation", + "status": "status_of_organisation", + "twitter_url": "url_twitter_organisation", + "updated_at": "date_updated_organisation" + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/datasets/cordis.json b/nesta/core/schemas/tier_1/datasets/cordis.json new file mode 100644 index 00000000..b27e608c --- /dev/null +++ b/nesta/core/schemas/tier_1/datasets/cordis.json @@ -0,0 +1,16 @@ +{ + "entity_type": "project", + "tier0_to_tier1": { + "description": "textBody_description_project", + "ec_contribution": "cost_ecFunding_project", + "end_date_code": "date_ended_project", + "framework": "name_framework_project", + "link": "url_of_project", + "rcn": "id_of_project", + "start_date_code": "date_started_project", + "status": "status_of_project", + "title": "title_of_project", + "total_cost": "cost_total_project", + "year": "year_of_project" + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/datasets/meetup.json b/nesta/core/schemas/tier_1/datasets/meetup.json new file mode 100644 index 00000000..6b269458 --- /dev/null +++ b/nesta/core/schemas/tier_1/datasets/meetup.json @@ -0,0 +1,29 @@ +{ + "entity_type": "meetup", + "tier0_to_tier1": { + "_booleanFlag_autotranslated_entity": "booleanFlag_autotranslated_entity", + "_placeName_state_group": "_placeName_state_group", + "_rank_rhodonite_group": "rank_rhodonite_group", + "_terms_iso2lang_entity": "terms_iso2lang_entity", + "_terms_of_countryTags": "terms_of_countryTags", + "category_name": "name_of_category", + "city": "placeName_city_group", + "continent": "placeName_continent_group", + "continent_id": "id_continent_group", + "coordinate": "coordinate_of_group", + "country": "id_iso2_country", + "country_id": "id_country_group", + 
"country_name": "placeName_country_group", + "created": "date_start_group", + "description": "textBody_descriptive_group", + "id": "id_of_group", + "iso3": "id_iso3_country", + "isoNumeric": "id_isoNumeric_country", + "member_origins": "terms_memberOrigin_group", + "members": "count_member_group", + "mesh_terms": "terms_mesh_group", + "name": "name_of_group", + "topics": "terms_topics_group", + "urlname": "url_of_group" + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/datasets/nih.json b/nesta/core/schemas/tier_1/datasets/nih.json new file mode 100644 index 00000000..bfda383b --- /dev/null +++ b/nesta/core/schemas/tier_1/datasets/nih.json @@ -0,0 +1,35 @@ +{ + "entity_type": "paper", + "tier0_to_tier1": { + "_booleanFlag_autotranslated_entity": "booleanFlag_autotranslated_entity", + "_rank_rhodonite_abstract": "rank_rhodonite_abstract", + "_terms_iso2lang_entity": "terms_iso2lang_entity", + "_terms_of_countryTags": "terms_of_countryTags", + "_terms_of_funders": "terms_of_funders", + "_total_cost_usd2018": "_cost_usd2018_project", + "abstract_text": "textBody_abstract_project", + "city": "placeName_city_organisation", + "continent": "id_of_continent", + "coordinates": "coordinate_of_organisation", + "country": "placeName_country_organisation", + "country_alpha_2": "id_iso2_country", + "country_alpha_3": "id_iso3_country", + "country_numeric": "id_isoNumeric_country", + "duplicate_abstract": "booleanFlag_duplicate_abstract", + "full_project_num": "id_of_project", + "fy": "year_fiscal_funding", + "mesh_terms": "terms_mesh_abstract", + "org_name": "title_of_organisation", + "org_state": "id_state_organisation", + "org_zipcode": "placeName_zipcode_organisation", + "phr": "textBody_descriptive_project", + "placeName_continent_organisation": "placeName_continent_organisation", + "placeName_state_organisation": "placeName_state_organisation", + "project_end": "date_end_project", + "project_start": "date_start_project", + "project_terms": "terms_descriptive_project", + "project_title": "title_of_project", + "total_cost": "cost_total_project", + "total_cost_currency": "currency_total_cost" + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/datasets/patstat.json b/nesta/core/schemas/tier_1/datasets/patstat.json new file mode 100644 index 00000000..96bfbfc3 --- /dev/null +++ b/nesta/core/schemas/tier_1/datasets/patstat.json @@ -0,0 +1,18 @@ +{ + "entity_type": "patent", + "tier0_to_tier1": { + "abstract": "textBody_abstract_patent", + "appln_auth": "terms_authCountry_patent", + "ctry": "terms_personCountry_patent", + "earliest_filing_date": "date_of_patent", + "earliest_filing_year": "year_of_patent", + "id": "id_family_patent", + "ipc": "terms_ipc_patent", + "is_eu": "booleanFlag_eu_patent", + "nace2": "terms_nace2_patent", + "nb_citing_docdb_fam": "count_citations_patent", + "nuts": "terms_personNuts_patent", + "tech": "terms_techFieldNumber_patent", + "title": "title_of_patent" + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/README.rst b/nesta/core/schemas/tier_1/mappings/README.rst new file mode 100644 index 00000000..d9b98a11 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/README.rst @@ -0,0 +1,148 @@ +Elasticsearch mappings +====================== + +Our methodology for constructing Elasticsearch mappings is described here. It is intended to minimise duplication of efforts and enforce standardisation when referring to a common dataset whilst being flexible to individual project needs. 
It is implied in our framework that a single :code:`dataset` can be used across many projects, and each project is mapped to a single :code:`endpoint`. It is useful to start by looking at the structure of the :code:`nesta/core/schemas/tier_1/mappings/` directory: + +.. code-block:: bash + + . + ├── datasets + │ ├── arxiv_mapping.json + │ ├── companies_mapping.json + │ ├── cordis_mapping.json + │ ├── gtr_mapping.json + │ ├── meetup_mapping.json + │ ├── nih_mapping.json + │ └── patstat_mapping.json + ├── defaults + │ └── defaults.json + └── endpoints + ├── arxlive + │ └── arxiv_mapping.json + ├── eurito + │ ├── arxiv_mapping.json + │ ├── companies_mapping.json + │ └── patstat_mapping.json + └── health-scanner + ├── aliases.json + ├── config.yaml + ├── nih_mapping.json + └── nulls.json + +Firstly we consider :code:`defaults/defaults.json` which should contain all default fields for all mappings - for example standard analyzers and dynamic strictness. We might also consider putting global fields there. + +Next consider the :code:`datasets` subdirectory. Each mapping file in here should contain the complete :code:`mappings` field for the respective dataset. The naming convention :code:`<dataset>_mapping.json` is a hard requirement, as :code:`<dataset>` will map to the index for this :code:`dataset` at any given :code:`endpoint`. + +Finally consider the :code:`endpoints` subdirectory. Each sub-subdirectory here should map to any :code:`endpoint` which requires changes beyond the :code:`defaults` and :code:`datasets` mappings. Each mapping file within each :code:`endpoint` sub-subdirectory (e.g. :code:`arxlive` or :code:`health-scanner`) should satisfy the same naming convention (:code:`<dataset>_mapping.json`). All conventions here are also consistent with the :code:`elasticsearch.yaml` configuration file (to see this configuration, you will need to clone the repo and follow `these steps `_ to decrypt the config), which looks a little like this: + + +.. code-block:: yaml + + ## The following assumes the AWS host endpoint naming convention: + ## {scheme}://search-{endpoint}-{id}.{region}.es.amazonaws.com + defaults: + scheme: https + port: 443 + region: eu-west-2 + type: _doc + endpoints: + # ------------------------------- + # <endpoint name>: + # id: <endpoint id> + # <default override>: <value> ## e.g.: scheme, port, region, _type + # indexes: + # <dataset>: <version> ## Note: defaults to <dataset>_dev in testing mode + # ------------------------------- + arxlive: + id: <endpoint id> + indexes: + arxiv: 4 + # ------------------------------- + health-scanner: + id: <endpoint id> + indexes: + nih: 6 + companies: 5 + meetup: 4 + ... etc ... + +Note that for the :code:`health-scanner` endpoint, :code:`companies` and :code:`meetup` will be generated from the :code:`datasets` mappings, as they are not specified under the :code:`endpoints/health-scanner` subdirectory. Also note that :code:`endpoints` sub-directories do not need to exist for each :code:`endpoint` to be generated: the mappings will simply be generated from the dataset defaults. For example, a new endpoint :code:`general` can be generated from the DAPS codebase using the above, even though there is no :code:`endpoints/general` sub-subdirectory. + +Individual :code:`endpoints` can also specify :code:`aliases.json` which harmonises field names across datasets for specific endpoints. This uses a convention as follows: + +.. code-block:: python + + { + #...the convention is... + "<alias>": { + "<dataset 1>": "<field name in dataset 1>", + "<dataset 2>": "<field name in dataset 2>", + "<dataset 3>": "<field name in dataset 3>" + }, + #...an example is...
+ "city": { + "companies": "placeName_city_organisation", + "meetup": "placeName_city_group", + "nih": "placeName_city_organisation" + }, + #...etc...# + } + +By default, this applies (what Joel calls) a "soft" alias, which is an `Elasticsearch alias `_, however by specifying :code:`hard-alias=true` in :code:`config.yaml` (see :code:`health-scanner` above), the alias is instead applied directly (i.e. field names are physically replaced, not aliased). + +You will also notice the :code:`nulls.json` file in the :code:`health-scanner` endpoint. This is a relatively experimental feature for automatically nullifying values on ingestion through ElasticsearchPlus, in lieu of proper exploratory data analysis. The logic and format for this `is documented here `_. + +Mapping construction hierarchy +------------------------------ + +Each mapping is constructed by overriding nested fields using the :code:`defaults` :code:`datasets` and :code:`endpoints`, in that order (i.e. :code:`endpoints` override nested fields in :code:`datasets`, and :code:`datasets` override those in :code:`defaults`). If you would like to "switch off" a field from the :code:`defaults` or :code:`datasets` mappings, you should set the value of the nested field to :code:`null`. For example: + +.. code-block:: javascript + + { + "mappings": { + "_doc": { + "dynamic": "strict", + "properties": { + "placeName_zipcode_organisation": null + } + } + } + } + +will simply "switch off" the field :code:`placeName_zipcode_organisation`, which was specified in :code:`datasets`. + +The logic for the mapping construction hierarchy is demonstrated in the respective :code:`orms.orm_utils.get_es_mapping` function: + + +.. code-block:: python + + def get_es_mapping(dataset, endpoint): + '''Load the ES mapping for this dataset and endpoint, + including aliases. + + Args: + dataset (str): Name of the dataset for the ES mapping. + endpoint (str): Name of the AWS ES endpoint. + Returns: + :obj:`dict` + ''' + mapping = _get_es_mapping(dataset, endpoint) + _apply_alias(mapping, dataset, endpoint) + _prune_nested(mapping) # prunes any nested keys with null values + return mapping + +Integrated tests +---------------- + +The following :code:`pytest` tests are made (and triggered on PR via travis): + +- :code:`aliases.json` files are checked for consistency with available :code:`datasets`. +- All mappings for each in :code:`datasets` and :code:`endpoints` are fully generated, and tested for compatibility with the schema transformations (which are, in turn, checked against the valid ontology in :code:`ontology.json`). + +Features in DAPS2 +----------------- + +- The index version (e.g. :code:`'arxiv': 4` in :code:`elasticsearch.yaml`) will be automatically generated from semantic versioning and the git hash in DAPS2, therefore the :code:`indexes` field will consolidate to an itemised list of indexes. +- The mappings under :code:`datasets` will be automatically generated from the open ontology which will be baked into the tier-0 schemas. This will render :code:`schema_transformations` redundant. +- Elasticsearch components will be factored out of :code:`orm_utils`. 
diff --git a/nesta/core/schemas/tier_1/mappings/datasets/arxiv_mapping.json b/nesta/core/schemas/tier_1/mappings/datasets/arxiv_mapping.json new file mode 100644 index 00000000..401d1bf3 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/datasets/arxiv_mapping.json @@ -0,0 +1,65 @@ +{ + "mappings": { + "_doc": { + "properties": { + "booleanFlag_multinational_article": { + "type": "boolean" + }, + "count_citations_article": { + "type": "integer" + }, + "id_digitalObjectIdentifier_article": { + "type": "keyword" + }, + "metric_novelty_article": { + "type": "float" + }, + "terms_authors_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_institutes_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "textBody_abstract_article": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "title_of_article": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "url_of_article": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "year_of_article": { + "type": "integer" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/orms/crunchbase_es_config.json b/nesta/core/schemas/tier_1/mappings/datasets/companies_mapping.json similarity index 89% rename from nesta/core/orms/crunchbase_es_config.json rename to nesta/core/schemas/tier_1/mappings/datasets/companies_mapping.json index 0d3362f1..ccf22c71 100644 --- a/nesta/core/orms/crunchbase_es_config.json +++ b/nesta/core/schemas/tier_1/mappings/datasets/companies_mapping.json @@ -1,7 +1,6 @@ { "mappings": { "_doc": { - "dynamic": "strict", "properties": { "_cost_usd2018_organisation": { "type": "float" @@ -42,8 +41,8 @@ "format": "yyyy-MM-dd", "type": "date" }, - "datetime_updated_organisation": { - "format": "yyyy-MM-dd HH:mm:ss", + "date_updated_organisation": { + "format": "yyyy-MM-dd", "type": "date" }, "id_continent_organisation": { @@ -128,7 +127,7 @@ "type": "keyword" }, "terms_mesh_description": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -140,7 +139,7 @@ "type": "keyword" }, "terms_of_funders": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -149,7 +148,7 @@ "type": "text" }, "terms_roles_organisation": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -158,7 +157,7 @@ "type": "text" }, "terms_subcategory_organisation": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -166,6 +165,9 @@ }, "type": "text" }, + "terms_tokens_entity": { + "type": "keyword" + }, "textBody_descriptive_organisation": { "fields": { "keyword": { @@ -230,24 +232,5 @@ } } } - }, - "settings": { - "analysis": { - "analyzer": { - "mesh_terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" - } - } - }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" - } } -} +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/datasets/cordis_mapping.json b/nesta/core/schemas/tier_1/mappings/datasets/cordis_mapping.json new file mode 100644 index 00000000..e0d6685f --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/datasets/cordis_mapping.json 
@@ -0,0 +1,68 @@ +{ + "mappings": { + "dynamic": "strict", + "properties": { + "cost_ecFunding_project": { + "type": "integer" + }, + "cost_total_project": { + "type": "integer" + }, + "date_ended_project": { + "format": "yyyy-MM-dd", + "type": "date" + }, + "date_started_project": { + "format": "yyyy-MM-dd", + "type": "date" + }, + "metric_novelty_project": { + "type": "float" + }, + "name_framework_project": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "status_of_project": { + "type": "keyword" + }, + "terms_tokens_entity": { + "type": "keyword" + }, + "textBody_description_project": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "title_of_project": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "type_of_entity": { + "type": "keyword" + }, + "url_of_project": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "year_of_project": { + "type": "integer" + } + } + } +} \ No newline at end of file diff --git a/nesta/core/orms/meetup_es_config.json b/nesta/core/schemas/tier_1/mappings/datasets/meetup_mapping.json similarity index 87% rename from nesta/core/orms/meetup_es_config.json rename to nesta/core/schemas/tier_1/mappings/datasets/meetup_mapping.json index f1d14cb6..49558368 100644 --- a/nesta/core/orms/meetup_es_config.json +++ b/nesta/core/schemas/tier_1/mappings/datasets/meetup_mapping.json @@ -15,7 +15,7 @@ "type": "text" }, "_terms_memberOrigin_group": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -104,7 +104,7 @@ "type": "keyword" }, "terms_mesh_group": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -116,7 +116,7 @@ "type": "keyword" }, "terms_topics_group": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -145,24 +145,5 @@ } } } - }, - "settings": { - "analysis": { - "analyzer": { - "mesh_terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" - } - } - }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" - } } -} +} \ No newline at end of file diff --git a/nesta/core/orms/nih_es_config.json b/nesta/core/schemas/tier_1/mappings/datasets/nih_mapping.json similarity index 90% rename from nesta/core/orms/nih_es_config.json rename to nesta/core/schemas/tier_1/mappings/datasets/nih_mapping.json index d4901b66..864e8dc4 100644 --- a/nesta/core/orms/nih_es_config.json +++ b/nesta/core/schemas/tier_1/mappings/datasets/nih_mapping.json @@ -108,7 +108,7 @@ "type": "text" }, "terms_mesh_abstract": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -120,7 +120,7 @@ "type": "keyword" }, "terms_of_funders": { - "analyzer": "mesh_terms_analyzer", + "analyzer": "terms_analyzer", "fields": { "keyword": { "type": "keyword" @@ -179,24 +179,5 @@ } } } - }, - "settings": { - "analysis": { - "analyzer": { - "mesh_terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" - } - } - }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" - } } -} +} \ No newline at end of file diff --git a/nesta/core/orms/patstat-eu_es_config.json b/nesta/core/schemas/tier_1/mappings/datasets/patstat_mapping.json similarity index 85% rename from 
nesta/core/orms/patstat-eu_es_config.json rename to nesta/core/schemas/tier_1/mappings/datasets/patstat_mapping.json index 43dea5a6..2b93cb51 100644 --- a/nesta/core/orms/patstat-eu_es_config.json +++ b/nesta/core/schemas/tier_1/mappings/datasets/patstat_mapping.json @@ -6,9 +6,6 @@ "booleanFlag_autotranslated_entity": { "type": "boolean" }, - "booleanFlag_eu_patent": { - "type": "boolean" - }, "count_citations_patent": { "type": "integer" }, @@ -109,24 +106,5 @@ } } } - }, - "settings": { - "analysis": { - "analyzer": { - "terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" - } - } - }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" - } } -} +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/defaults/defaults.json b/nesta/core/schemas/tier_1/mappings/defaults/defaults.json new file mode 100644 index 00000000..6a853756 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/defaults/defaults.json @@ -0,0 +1,24 @@ +{ + "mappings": { + "_doc": { + "dynamic": "strict" + } + }, + "settings": { + "index": { + "analysis": { + "analyzer": { + "terms_analyzer": { + "filter": [ + "standard", + "lowercase", + "stop" + ], + "tokenizer": "standard", + "type": "custom" + } + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/arxlive/arxiv_mapping.json b/nesta/core/schemas/tier_1/mappings/endpoints/arxlive/arxiv_mapping.json new file mode 100644 index 00000000..0b114844 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/endpoints/arxlive/arxiv_mapping.json @@ -0,0 +1,76 @@ +{ + "mappings": { + "_doc": { + "properties": { + "date_created_article": { + "type": "date" + }, + "json_category_article": { + "properties": { + "ancestors": { + "type": "keyword" + }, + "level": { + "type": "integer" + }, + "order": { + "type": "integer" + }, + "value": { + "type": "keyword" + } + }, + "type": "nested" + }, + "json_fieldsOfStudy_article": { + "properties": { + "ancestors": { + "type": "keyword" + }, + "level": { + "type": "integer" + }, + "order": { + "type": "integer" + }, + "value": { + "type": "keyword" + } + }, + "type": "nested" + }, + "json_location_article": { + "properties": { + "ancestors": { + "type": "keyword" + }, + "level": { + "type": "integer" + }, + "order": { + "type": "integer" + }, + "value": { + "type": "keyword" + } + }, + "type": "nested" + }, + "metric_citations_article": { + "type": "float" + }, + "terms_tokens_article": { + "type": "keyword" + }, + "type_of_entity": { + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/orms/arxiv-eu_es_config.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/arxiv_mapping.json similarity index 56% rename from nesta/core/orms/arxiv-eu_es_config.json rename to nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/arxiv_mapping.json index 6b3cf02a..2823ea20 100644 --- a/nesta/core/orms/arxiv-eu_es_config.json +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/arxiv_mapping.json @@ -1,40 +1,18 @@ { "mappings": { "_doc": { - "dynamic": "strict", "properties": { "booleanFlag_eu_article": { "type": "boolean" }, - "booleanFlag_multinational_article": { - "type": "boolean" - }, - "count_citations_article": { - "type": "integer" - }, "date_created_article": { "format": "yyyy-MM-dd", "type": "date" }, - "id_digitalObjectIdentifier_article": { - "type": "keyword" - }, 
"json_fieldsOfStudy_article": { "dynamic": true, "properties": {} }, - "metric_novelty_article": { - "type": "float" - }, - "terms_authors_article": { - "analyzer": "terms_analyzer", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, "terms_category_article": { "analyzer": "terms_analyzer", "fields": { @@ -62,15 +40,6 @@ }, "type": "text" }, - "terms_institutes_article": { - "analyzer": "terms_analyzer", - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, "terms_nuts0_article": { "analyzer": "terms_analyzer", "fields": { @@ -119,56 +88,10 @@ "terms_tokens_entity": { "type": "keyword" }, - "textBody_abstract_article": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "title_of_article": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, "type_of_entity": { "type": "keyword" - }, - "url_of_article": { - "fields": { - "keyword": { - "type": "keyword" - } - }, - "type": "text" - }, - "year_of_article": { - "type": "integer" - } - } - } - }, - "settings": { - "analysis": { - "analyzer": { - "terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" } } - }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" } } -} +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/companies_mapping.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/companies_mapping.json new file mode 100644 index 00000000..a2496aa3 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/companies_mapping.json @@ -0,0 +1,14 @@ +{ + "mappings": { + "_doc": { + "properties": { + "booleanFlag_eu_organisation": { + "type": "boolean" + }, + "metric_novelty_organisation": { + "type": "float" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/orms/cordis-eu_es_config.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/cordis_mapping.json similarity index 81% rename from nesta/core/orms/cordis-eu_es_config.json rename to nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/cordis_mapping.json index 0e5cae2d..115f354f 100644 --- a/nesta/core/orms/cordis-eu_es_config.json +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/cordis_mapping.json @@ -65,25 +65,8 @@ "type": "integer" } } - } - }, - "settings": { - "analysis": { - "analyzer": { - "terms_analyzer": { - "filter": [ - "standard", - "lowercase", - "stop" - ], - "tokenizer": "standard", - "type": "custom" - } - } }, - "index": { - "number_of_replicas": "1", - "number_of_shards": "5" - } + "dynamic": null, + "properties": null } -} +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/patstat_mapping.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/patstat_mapping.json new file mode 100644 index 00000000..65a50f06 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito-dev/patstat_mapping.json @@ -0,0 +1,12 @@ +{ + "mappings": { + "_doc": { + "dynamic": "strict", + "properties": { + "booleanFlag_eu_patent": { + "type": "boolean" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/eurito/arxiv_mapping.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito/arxiv_mapping.json new file mode 100644 index 00000000..2823ea20 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito/arxiv_mapping.json @@ -0,0 +1,97 @@ +{ + 
"mappings": { + "_doc": { + "properties": { + "booleanFlag_eu_article": { + "type": "boolean" + }, + "date_created_article": { + "format": "yyyy-MM-dd", + "type": "date" + }, + "json_fieldsOfStudy_article": { + "dynamic": true, + "properties": {} + }, + "terms_category_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_countries_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_fieldsOfStudy_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_nuts0_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_nuts1_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_nuts2_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_nuts3_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_regions_article": { + "analyzer": "terms_analyzer", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "type": "text" + }, + "terms_tokens_entity": { + "type": "keyword" + }, + "type_of_entity": { + "type": "keyword" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/eurito/companies_mapping.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito/companies_mapping.json new file mode 100644 index 00000000..a2496aa3 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito/companies_mapping.json @@ -0,0 +1,14 @@ +{ + "mappings": { + "_doc": { + "properties": { + "booleanFlag_eu_organisation": { + "type": "boolean" + }, + "metric_novelty_organisation": { + "type": "float" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/eurito/patstat_mapping.json b/nesta/core/schemas/tier_1/mappings/endpoints/eurito/patstat_mapping.json new file mode 100644 index 00000000..65a50f06 --- /dev/null +++ b/nesta/core/schemas/tier_1/mappings/endpoints/eurito/patstat_mapping.json @@ -0,0 +1,12 @@ +{ + "mappings": { + "_doc": { + "dynamic": "strict", + "properties": { + "booleanFlag_eu_patent": { + "type": "boolean" + } + } + } + } +} \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/aliases/health_scanner.json b/nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/aliases.json similarity index 63% rename from nesta/core/schemas/tier_1/aliases/health_scanner.json rename to nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/aliases.json index d03fce7b..f1431691 100644 --- a/nesta/core/schemas/tier_1/aliases/health_scanner.json +++ b/nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/aliases.json @@ -1,64 +1,64 @@ { "body": { - "crunchbase": "textBody_descriptive_organisation", + "companies": "textBody_descriptive_organisation", "meetup": "textBody_descriptive_group", "nih": "textBody_descriptive_project" }, "city": { - "crunchbase": "placeName_city_organisation", + "companies": "placeName_city_organisation", "meetup": "placeName_city_group", "nih": "placeName_city_organisation" }, "continent": { - "crunchbase": "placeName_continent_organisation", + "companies": "placeName_continent_organisation", "meetup": 
"placeName_continent_group", "nih": "placeName_continent_organisation" }, "continent_id": { - "crunchbase": "id_continent_organisation", + "companies": "id_of_continent", "meetup": "id_continent_group", "nih": "id_of_continent" }, "cost": { - "crunchbase": "cost_of_funding", + "companies": "cost_of_funding", "nih": "cost_total_project" }, "cost_ref": { - "crunchbase": "cost_of_funding", + "companies": "cost_of_funding", "nih": "cost_total_project" }, "countries_ids": { - "crunchbase": "terms_of_countryTags", + "companies": "terms_of_countryTags", "meetup": "terms_of_countryTags", "nih": "terms_of_countryTags" }, "country": { - "crunchbase": "placeName_country_organisation", + "companies": "placeName_country_organisation", "meetup": "placeName_country_group", "nih": "placeName_country_organisation" }, "country_id": { - "crunchbase": "id_iso2_country", + "companies": "id_iso2_country", "meetup": "id_iso2_country", "nih": "id_iso2_country" }, "currency": { - "crunchbase": "currency_of_funding", + "companies": "currency_of_funding", "nih": "currency_total_cost" }, "end": { - "crunchbase": "date_death_organisation", + "companies": "date_death_organisation", "nih": "date_end_project" }, "funders": { - "crunchbase": "terms_of_funders", + "companies": "terms_of_funders", "nih": "terms_of_funders" }, "is_duplicate": { "nih": "booleanFlag_duplicate_abstract" }, "is_health_related": { - "crunchbase": "booleanFlag_health_organisation" + "companies": "booleanFlag_health_organisation" }, "is_translated": { "meetup": "booleanFlag_autotranslated_entity" @@ -67,48 +67,43 @@ "meetup": "terms_iso2lang_entity" }, "location": { - "crunchbase": "coordinate_of_city", + "companies": "coordinate_of_city", "meetup": "coordinate_of_group", "nih": "coordinate_of_organisation" }, "name": { - "crunchbase": "name_of_organisation", + "companies": "name_of_organisation", "meetup": "name_of_group", "nih": "title_of_organisation" }, "novelty": { - "crunchbase": "rank_rhodonite_organisation", + "companies": "rank_rhodonite_organisation", "meetup": "rank_rhodonite_group", "nih": "rank_rhodonite_abstract" }, "region": { - "crunchbase": "placeName_region_organisation" - }, - "sdg_labels": { - "crunchbase": "_terms_sdg_summary", - "meetup": "_terms_sdg_description", - "nih": "terms_sdg_abstract" + "companies": "placeName_region_organisation" }, "start": { - "crunchbase": "date_birth_organisation", + "companies": "date_birth_organisation", "meetup": "date_start_group", "nih": "date_start_project" }, "state": { - "crunchbase": "placeName_state_organisation", + "companies": "placeName_state_organisation", "meetup": "_placeName_state_group", "nih": "placeName_state_organisation" }, "state_id": { - "crunchbase": "id_state_organisation", + "companies": "id_state_organisation", "nih": "id_state_organisation" }, "summary": { - "crunchbase": "textBody_summary_organisation", + "companies": "textBody_summary_organisation", "nih": "textBody_abstract_project" }, "terms": { - "crunchbase": "terms_mesh_description", + "companies": "terms_mesh_description", "meetup": "terms_mesh_group", "nih": "terms_mesh_abstract" }, @@ -116,7 +111,7 @@ "nih": "title_of_project" }, "url": { - "crunchbase": "url_of_organisation", + "companies": "url_of_organisation", "meetup": "url_of_group" } } \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/config.yaml b/nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/config.yaml new file mode 100644 index 00000000..fc7c9834 --- /dev/null +++ 
b/nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/config.yaml @@ -0,0 +1,3 @@ +# if hard-alias is true, actually change the names of the fields +# rather than performing an elasticsearch alias +hard-alias: false diff --git a/nesta/core/schemas/tier_1/field_null_mappings/health_scanner.json b/nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/nulls.json similarity index 100% rename from nesta/core/schemas/tier_1/field_null_mappings/health_scanner.json rename to nesta/core/schemas/tier_1/mappings/endpoints/health-scanner/nulls.json diff --git a/nesta/core/schemas/tier_1/tier_1.json b/nesta/core/schemas/tier_1/ontology.json similarity index 68% rename from nesta/core/schemas/tier_1/tier_1.json rename to nesta/core/schemas/tier_1/ontology.json index 9fc32c95..7f1915b2 100644 --- a/nesta/core/schemas/tier_1/tier_1.json +++ b/nesta/core/schemas/tier_1/ontology.json @@ -9,15 +9,12 @@ "count", "currency", "date", - "datetime", "id", "json", - "language", - "level", "metric", "name", - "personName", "placeName", + "rank", "status", "terms", "textBody", @@ -31,14 +28,11 @@ "term": "middleName", "values": [ "abstract", - "adminRegion", - "ageDepRatioPercWorkAgePop", "alias", - "arxivSubjectCategory", "authCountry", "authors", + "autotranslated", "birth", - "capitalCity", "category", "citations", "city", @@ -47,12 +41,10 @@ "country", "created", "crunchBase", - "datestamp", "death", "description", "descriptive", "digitalObjectIdentifier", - "duns", "duplicate", "ecFunding", "employee", @@ -61,34 +53,19 @@ "eu", "facebook", "family", - "fieldOfStudy", - "fieldOfStudy1", - "fieldOfStudy2", - "fieldOfStudy3", - "fieldOfStudy4", "fieldsOfStudy", "fiscal", - "forked", - "forkedFrom", "framework", - "gini", "health", - "income", - "infMortPer1000LiveBirths", "institutes", "ipc", "iso2", + "iso2lang", "iso3", "isoNumeric", - "journalReference", "last", - "latitude", - "lifeExpAtBirth", "linkedIn", "location", - "login", - "longitude", - "mathSubjectClassification", "member", "memberOrigin", "mesh", @@ -100,37 +77,26 @@ "nuts2", "nuts3", "of", - "official", "parent", - "percPopBelowPovertyLine", - "percPopOver25NoEduc", - "percPopOver25TertiaryEduc", - "percPopRural", - "percPopUrban", "personCountry", "personNuts", - "population", - "programming", "region", "regions", + "rhodonite", "roles", "rounds", - "short", "start", "started", "state", "subcategory", "summary", - "tech", "techFieldNumber", "tokens", "topics", "total", "twitter", - "update", "updated", "usd2018", - "worldRegion", "zipcode" ] }, @@ -139,21 +105,20 @@ "values": [ "abstract", "article", - "author", "category", "city", "continent", + "countryTags", "cost", "country", "description", + "entity", "funders", "funding", "group", - "member", "organisation", "patent", - "project", - "user" + "project" ] } ] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/arxiv.json b/nesta/core/schemas/tier_1/schema_transformations/arxiv.json deleted file mode 100644 index a480fe30..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/arxiv.json +++ /dev/null @@ -1,67 +0,0 @@ -[ - { - "tier_0": "id", - "tier_1": "id_of_article", - "unique": true - }, - { - "tier_0": "created", - "tier_1": "date_created_article" - }, - { - "tier_0": "title", - "tier_1": "title_of_article" - }, - { - "tier_0": "doi", - "tier_1": "id_digitalObjectIdentifier_article" - }, - { - "tier_0": "abstract", - "tier_1": "textBody_abstract_article" - }, - { - "tier_0": "authors", - "tier_1": "terms_authors_article" - }, - 
{ - "tier_0": "citation_count", - "tier_1": "count_citations_article" - }, - { - "tier_0": "normalised_citation", - "tier_1": "metric_citations_article" - }, - { - "tier_0": "fos", - "tier_1": "json_fieldOfStudy_article" - }, - { - "tier_0": "categories", - "tier_1": "json_category_article" - }, - { - "tier_0": "has_multinational", - "tier_1": "booleanFlag_multinational_article" - }, - { - "tier_0": "institutes", - "tier_1": "terms_institutes_article" - }, - { - "tier_0": "tokens", - "tier_1": "terms_tokens_article" - }, - { - "tier_0": "novelty_of_article", - "tier_1": "metric_novelty_article" - }, - { - "tier_0": "countries", - "tier_1": "json_location_article" - }, - { - "tier_0": "year", - "tier_1": "year_of_article" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation.json b/nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation.json deleted file mode 100644 index 5fbcf86a..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation.json +++ /dev/null @@ -1,11 +0,0 @@ -[ - { - "tier_0": "uuid", - "tier_1": "id_of_organisation", - "unique": true - }, - { - "tier_0": "description", - "tier_1": "textBody_descriptive_organisation" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation_members.json b/nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation_members.json deleted file mode 100644 index 18ca49da..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/crunchbase_organisation_members.json +++ /dev/null @@ -1,158 +0,0 @@ -[ - { - "tier_0": "company_name", - "tier_1": "name_of_organisation" - }, - { - "tier_0": "roles", - "tier_1": "terms_roles_organisation" - }, - { - "tier_0": "homepage_url", - "tier_1": "url_of_organisation" - }, - { - "tier_0": "country", - "tier_1": "placeName_country_organisation" - }, - { - "tier_0": "country_alpha_2", - "tier_1": "id_iso2_country" - }, - { - "tier_0": "country_alpha_3", - "tier_1": "id_iso3_country" - }, - { - "tier_0": "country_numeric", - "tier_1": "id_isoNumeric_country" - }, - { - "tier_0": "continent", - "tier_1": "id_of_continent" - }, - { - "tier_0": "coordinates", - "tier_1": "coordinate_of_city" - }, - { - "tier_0": "state_code", - "tier_1": "id_state_organisation" - }, - { - "tier_0": "region", - "tier_1": "placeName_region_organisation" - }, - { - "tier_0": "city", - "tier_1": "placeName_city_organisation" - }, - { - "tier_0": "address", - "tier_1": "address_of_organisation" - }, - { - "tier_0": "status", - "tier_1": "status_of_organisation" - }, - { - "tier_0": "short_description", - "tier_1": "textBody_summary_organisation" - }, - { - "tier_0": "long_description", - "tier_1": "textBody_descriptive_organisation" - }, - { - "tier_0": "category_list", - "tier_1": "terms_subcategory_organisation" - }, - { - "tier_0": "category_group_list", - "tier_1": "terms_category_organisation" - }, - { - "tier_0": "funding_rounds", - "tier_1": "count_rounds_funding" - }, - { - "tier_0": "funding_total_usd", - "tier_1": "cost_of_funding" - }, - { - "tier_0": "currency_of_funding", - "tier_1": "currency_of_funding" - }, - { - "tier_0": "_total_cost_usd2018", - "tier_1": "_cost_usd2018_organisation" - }, - { - "tier_0": "founded_on", - "tier_1": "date_birth_organisation" - }, - { - "tier_0": "last_funding_on", - "tier_1": "date_last_funding" - }, - { - "tier_0": "closed_on", - "tier_1": "date_death_organisation" - }, - { - "tier_0": 
"employee_count", - "tier_1": "count_employee_organisation" - }, - { - "tier_0": "facebook_url", - "tier_1": "url_facebook_organisation" - }, - { - "tier_0": "linkedin_url", - "tier_1": "url_linkedIn_organisation" - }, - { - "tier_0": "cb_url", - "tier_1": "url_crunchBase_organisation" - }, - { - "tier_0": "twitter_url", - "tier_1": "url_twitter_organisation" - }, - { - "tier_0": "aliases", - "tier_1": "terms_alias_organisation" - }, - { - "tier_0": "updated_at", - "tier_1": "datetime_updated_organisation" - }, - { - "tier_0": "primary_role", - "tier_1": "type_of_organisation" - }, - { - "tier_0": "parent_id", - "tier_1": "id_parent_organisation" - }, - { - "tier_0": "is_health", - "tier_1": "booleanFlag_health_organisation" - }, - { - "tier_0": "mesh_terms", - "tier_1": "terms_mesh_description" - }, - { - "tier_0": "investor_names", - "tier_1": "terms_of_funders" - }, - { - "tier_0": "placeName_state_organisation", - "tier_1": "placeName_state_organisation" - }, - { - "tier_0": "placeName_continent_organisation", - "tier_1": "placeName_continent_organisation" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/eurito/arxiv-eu.json b/nesta/core/schemas/tier_1/schema_transformations/eurito/arxiv-eu.json deleted file mode 100644 index 03004a5b..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/eurito/arxiv-eu.json +++ /dev/null @@ -1,87 +0,0 @@ -[ - { - "tier_0": "id", - "tier_1": "id_of_article", - "unique": true - }, - { - "tier_0": "created", - "tier_1": "date_created_article" - }, - { - "tier_0": "title", - "tier_1": "title_of_article" - }, - { - "tier_0": "doi", - "tier_1": "id_digitalObjectIdentifier_article" - }, - { - "tier_0": "abstract", - "tier_1": "textBody_abstract_article" - }, - { - "tier_0": "authors", - "tier_1": "terms_authors_article" - }, - { - "tier_0": "citation_count", - "tier_1": "count_citations_article" - }, - { - "tier_0": "fields_of_study", - "tier_1": "json_fieldsOfStudy_article" - }, - { - "tier_0": "_fields_of_study", - "tier_1": "terms_fieldsOfStudy_article" - }, - { - "tier_0": "categories", - "tier_1": "terms_category_article" - }, - { - "tier_0": "has_multinational", - "tier_1": "booleanFlag_multinational_article" - }, - { - "tier_0": "institutes", - "tier_1": "terms_institutes_article" - }, - { - "tier_0": "is_eu", - "tier_1": "booleanFlag_eu_article" - }, - { - "tier_0": "novelty_of_article", - "tier_1": "metric_novelty_article" - }, - { - "tier_0": "nuts_0", - "tier_1": "terms_nuts0_article" - }, - { - "tier_0": "nuts_1", - "tier_1": "terms_nuts1_article" - }, - { - "tier_0": "nuts_2", - "tier_1": "terms_nuts2_article" - }, - { - "tier_0": "nuts_3", - "tier_1": "terms_nuts3_article" - }, - { - "tier_0": "countries", - "tier_1": "terms_countries_article" - }, - { - "tier_0": "regions", - "tier_1": "terms_regions_article" - }, - { - "tier_0": "year", - "tier_1": "year_of_article" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/eurito/cordis-eu.json b/nesta/core/schemas/tier_1/schema_transformations/eurito/cordis-eu.json deleted file mode 100644 index 4c5a1815..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/eurito/cordis-eu.json +++ /dev/null @@ -1,47 +0,0 @@ -[ - { - "tier_0": "rcn", - "tier_1": "id_of_project", - "unique": true - }, - { - "tier_0": "start_date_code", - "tier_1": "date_started_project" - }, - { - "tier_0": "end_date_code", - "tier_1": "date_ended_project" - }, - { - "tier_0": "title", - "tier_1": "title_of_project" - 
}, - { - "tier_0": "description", - "tier_1": "textBody_description_project" - }, - { - "tier_0": "ec_contribution", - "tier_1": "cost_ecFunding_project" - }, - { - "tier_0": "framework", - "tier_1": "name_framework_project" - }, - { - "tier_0": "status", - "tier_1": "status_of_project" - }, - { - "tier_0": "total_cost", - "tier_1": "cost_total_project" - }, - { - "tier_0": "year", - "tier_1": "year_of_project" - }, - { - "tier_0": "link", - "tier_1": "url_of_project" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/eurito/crunchbase-eu.json b/nesta/core/schemas/tier_1/schema_transformations/eurito/crunchbase-eu.json deleted file mode 100644 index 1f3146a5..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/eurito/crunchbase-eu.json +++ /dev/null @@ -1,162 +0,0 @@ -[ - { - "tier_0": "company_name", - "tier_1": "name_of_organisation" - }, - { - "tier_0": "is_eu", - "tier_1": "booleanFlag_eu_organisation" - }, - { - "tier_0": "roles", - "tier_1": "terms_roles_organisation" - }, - { - "tier_0": "homepage_url", - "tier_1": "url_of_organisation" - }, - { - "tier_0": "country", - "tier_1": "placeName_country_organisation" - }, - { - "tier_0": "country_alpha_2", - "tier_1": "id_iso2_country" - }, - { - "tier_0": "country_alpha_3", - "tier_1": "id_iso3_country" - }, - { - "tier_0": "country_numeric", - "tier_1": "id_isoNumeric_country" - }, - { - "tier_0": "continent", - "tier_1": "id_of_continent" - }, - { - "tier_0": "coordinates", - "tier_1": "coordinate_of_city" - }, - { - "tier_0": "state_code", - "tier_1": "id_state_organisation" - }, - { - "tier_0": "region", - "tier_1": "placeName_region_organisation" - }, - { - "tier_0": "city", - "tier_1": "placeName_city_organisation" - }, - { - "tier_0": "address", - "tier_1": "address_of_organisation" - }, - { - "tier_0": "status", - "tier_1": "status_of_organisation" - }, - { - "tier_0": "short_description", - "tier_1": "textBody_summary_organisation" - }, - { - "tier_0": "long_description", - "tier_1": "textBody_descriptive_organisation" - }, - { - "tier_0": "category_list", - "tier_1": "terms_subcategory_organisation" - }, - { - "tier_0": "category_group_list", - "tier_1": "terms_category_organisation" - }, - { - "tier_0": "funding_rounds", - "tier_1": "count_rounds_funding" - }, - { - "tier_0": "funding_total_usd", - "tier_1": "cost_of_funding" - }, - { - "tier_0": "currency_of_funding", - "tier_1": "currency_of_funding" - }, - { - "tier_0": "_total_cost_usd2018", - "tier_1": "_cost_usd2018_organisation" - }, - { - "tier_0": "founded_on", - "tier_1": "date_birth_organisation" - }, - { - "tier_0": "last_funding_on", - "tier_1": "date_last_funding" - }, - { - "tier_0": "closed_on", - "tier_1": "date_death_organisation" - }, - { - "tier_0": "employee_count", - "tier_1": "count_employee_organisation" - }, - { - "tier_0": "facebook_url", - "tier_1": "url_facebook_organisation" - }, - { - "tier_0": "linkedin_url", - "tier_1": "url_linkedIn_organisation" - }, - { - "tier_0": "cb_url", - "tier_1": "url_crunchBase_organisation" - }, - { - "tier_0": "twitter_url", - "tier_1": "url_twitter_organisation" - }, - { - "tier_0": "aliases", - "tier_1": "terms_alias_organisation" - }, - { - "tier_0": "updated_at", - "tier_1": "date_updated_organisation" - }, - { - "tier_0": "primary_role", - "tier_1": "type_of_organisation" - }, - { - "tier_0": "parent_id", - "tier_1": "id_parent_organisation" - }, - { - "tier_0": "is_health", - "tier_1": "booleanFlag_health_organisation" - }, - { - "tier_0": 
"mesh_terms", - "tier_1": "terms_mesh_description" - }, - { - "tier_0": "investor_names", - "tier_1": "terms_of_funders" - }, - { - "tier_0": "placeName_state_organisation", - "tier_1": "placeName_state_organisation" - }, - { - "tier_0": "placeName_continent_organisation", - "tier_1": "placeName_continent_organisation" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/eurito/patstat-eu.json b/nesta/core/schemas/tier_1/schema_transformations/eurito/patstat-eu.json deleted file mode 100644 index d2e1b4f4..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/eurito/patstat-eu.json +++ /dev/null @@ -1,55 +0,0 @@ -[ - { - "tier_0": "id", - "tier_1": "id_family_patent", - "unique": true - }, - { - "tier_0": "is_eu", - "tier_1": "booleanFlag_eu_patent" - }, - { - "tier_0": "earliest_filing_date", - "tier_1": "date_of_patent" - }, - { - "tier_0": "earliest_filing_year", - "tier_1": "year_of_patent" - }, - { - "tier_0": "title", - "tier_1": "title_of_patent" - }, - { - "tier_0": "abstract", - "tier_1": "textBody_abstract_patent" - }, - { - "tier_0": "nb_citing_docdb_fam", - "tier_1": "count_citations_patent" - }, - { - "tier_0": "ipc", - "tier_1": "terms_ipc_patent" - }, - { - "tier_0": "nace2", - "tier_1": "terms_nace2_patent" - }, - { - "tier_0": "tech", - "tier_1": "terms_techFieldNumber_patent" - }, - { - "tier_0": "ctry", - "tier_1": "terms_personCountry_patent" - }, - { - "tier_0": "nuts", - "tier_1": "terms_personNuts_patent" - }, - { - "tier_0": "appln_auth", - "tier_1": "terms_authCountry_patent" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/github.json b/nesta/core/schemas/tier_1/schema_transformations/github.json deleted file mode 100644 index 3caf1516..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/github.json +++ /dev/null @@ -1,72 +0,0 @@ -[ - { - "tier_0": "project_name", - "tier_1": "name_of_project" - }, - { - "tier_0": "project_description", - "tier_1": "textBody_descriptive_project" - }, - { - "tier_0": "project_url", - "tier_1": "url_of_project" - }, - { - "tier_0": "project_language", - "tier_1": "language_programming_project" - }, - { - "tier_0": "project_creation_date", - "tier_1": "datetime_created_project" - }, - { - "tier_0": "project_forked_from", - "tier_1": "id_forkedFrom_project" - }, - { - "tier_0": "project_id", - "tier_1": "id_of_project", - "unique": true - }, - { - "tier_0": "user_login", - "tier_1": "name_login_user" - }, - { - "tier_0": "user_company", - "tier_1": "name_of_organisation" - }, - { - "tier_0": "user_creation_date", - "tier_1": "datetime_created_user" - }, - { - "tier_0": "user_type", - "tier_1": "type_of_user" - }, - { - "tier_0": "user_longitude", - "tier_1": "coordinate_longitude_user" - }, - { - "tier_0": "user_latitude", - "tier_1": "coordinate_latitude_user" - }, - { - "tier_0": "user_state", - "tier_1": "placeName_state_user" - }, - { - "tier_0": "user_city", - "tier_1": "placeName_city_user" - }, - { - "tier_0": "user_id", - "tier_1": "id_of_user", - "unique": true - }, - { - "tier_0": "user_country_code", - "tier_1": "id_iso2_country" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/meetup.json b/nesta/core/schemas/tier_1/schema_transformations/meetup.json deleted file mode 100644 index c7145b25..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/meetup.json +++ /dev/null @@ -1,79 +0,0 @@ -[ - { - "tier_0": "id", - "tier_1": "id_of_group", - "unique": true 
- }, - { - "tier_0": "name", - "tier_1": "name_of_group" - }, - { - "tier_0": "urlname", - "tier_1": "url_of_group" - }, - { - "tier_0": "category_name", - "tier_1": "name_of_category" - }, - { - "tier_0": "country", - "tier_1": "id_iso2_country" - }, - { - "tier_0": "iso3", - "tier_1": "id_iso3_country" - }, - { - "tier_0": "isoNumeric", - "tier_1": "id_isoNumeric_country" - }, - { - "tier_0": "country_name", - "tier_1": "placeName_country_group" - }, - { - "tier_0": "continent", - "tier_1": "placeName_continent_group" - }, - { - "tier_0": "city", - "tier_1": "placeName_city_group" - }, - { - "tier_0": "created", - "tier_1": "date_start_group" - }, - { - "tier_0": "description", - "tier_1": "textBody_descriptive_group" - }, - { - "tier_0": "coordinate", - "tier_1": "coordinate_of_group" - }, - { - "tier_0": "members", - "tier_1": "count_member_group" - }, - { - "tier_0": "topics", - "tier_1": "terms_topics_group" - }, - { - "tier_0": "mesh_terms", - "tier_1": "terms_mesh_group" - }, - { - "tier_0": "member_origins", - "tier_1": "terms_memberOrigin_group" - }, - { - "tier_0": "continent_id", - "tier_1": "id_continent_group" - }, - { - "tier_0": "country_id", - "tier_1": "id_country_group" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/meetup_members.json b/nesta/core/schemas/tier_1/schema_transformations/meetup_members.json deleted file mode 100644 index cd4fb5a9..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/meetup_members.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "tier_0": "member_id", - "tier_1": "id_of_member", - "unique": true - }, - { - "tier_0": "group_id", - "tier_1": "id_of_group", - "unique": true - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/nih.json b/nesta/core/schemas/tier_1/schema_transformations/nih.json deleted file mode 100644 index f563dc7b..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/nih.json +++ /dev/null @@ -1,103 +0,0 @@ -[ - { - "tier_0": "full_project_num", - "tier_1": "id_of_project", - "unique": true - }, - { - "tier_0": "fy", - "tier_1": "year_fiscal_funding" - }, - { - "tier_0": "city", - "tier_1": "placeName_city_organisation" - }, - { - "tier_0": "country", - "tier_1": "placeName_country_organisation" - }, - { - "tier_0": "org_state", - "tier_1": "id_state_organisation" - }, - { - "tier_0": "org_zipcode", - "tier_1": "placeName_zipcode_organisation" - }, - { - "tier_0": "org_name", - "tier_1": "title_of_organisation" - }, - { - "tier_0": "phr", - "tier_1": "textBody_descriptive_project" - }, - { - "tier_0": "project_start", - "tier_1": "date_start_project" - }, - { - "tier_0": "project_end", - "tier_1": "date_end_project" - }, - { - "tier_0": "project_terms", - "tier_1": "terms_descriptive_project" - }, - { - "tier_0": "project_title", - "tier_1": "title_of_project" - }, - { - "tier_0": "total_cost", - "tier_1": "cost_total_project" - }, - { - "tier_0": "abstract_text", - "tier_1": "textBody_abstract_project" - }, - { - "tier_0": "coordinates", - "tier_1": "coordinate_of_organisation" - }, - { - "tier_0": "country_alpha_2", - "tier_1": "id_iso2_country" - }, - { - "tier_0": "country_alpha_3", - "tier_1": "id_iso3_country" - }, - { - "tier_0": "country_numeric", - "tier_1": "id_isoNumeric_country" - }, - { - "tier_0": "continent", - "tier_1": "id_of_continent" - }, - { - "tier_0": "total_cost_currency", - "tier_1": "currency_total_cost" - }, - { - "tier_0": "_total_cost_usd2018", - "tier_1": "_cost_usd2018_project" - }, 
- { - "tier_0": "mesh_terms", - "tier_1": "terms_mesh_abstract" - }, - { - "tier_0": "duplicate_abstract", - "tier_1": "booleanFlag_duplicate_abstract" - }, - { - "tier_0": "placeName_state_organisation", - "tier_1": "placeName_state_organisation" - }, - { - "tier_0": "placeName_continent_organisation", - "tier_1": "placeName_continent_organisation" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/schema_transformations/worldbank.json b/nesta/core/schemas/tier_1/schema_transformations/worldbank.json deleted file mode 100644 index e208ca3d..00000000 --- a/nesta/core/schemas/tier_1/schema_transformations/worldbank.json +++ /dev/null @@ -1,83 +0,0 @@ -[ - { - "tier_0": "id", - "tier_1": "id_iso3_country", - "unique": true - }, - { - "tier_0": "capitalCity", - "tier_1": "placeName_capitalCity_country" - }, - { - "tier_0": "incomeLevel", - "tier_1": "level_income_country" - }, - { - "tier_0": "iso2Code", - "tier_1": "id_iso2_country" - }, - { - "tier_0": "latitude", - "tier_1": "coordinate_latitude_country" - }, - { - "tier_0": "longitude", - "tier_1": "coordinate_longitude_country" - }, - { - "tier_0": "year", - "tier_1": "year_datestamp_country" - }, - { - "tier_0": "name", - "tier_1": "placeName_of_country" - }, - { - "tier_0": "region", - "tier_1": "placeName_worldRegion_country" - }, - { - "tier_0": "adminregion", - "tier_1": "placeName_adminRegion_country" - }, - { - "tier_0": "gini_index", - "tier_1": "metric_gini_country" - }, - { - "tier_0": "life_expectancy_at_birth_total_years", - "tier_1": "metric_lifeExpAtBirth_country" - }, - { - "tier_0": "population_total", - "tier_1": "count_population_country" - }, - { - "tier_0": "age_dependency_ratio_pc_of_working_age_population", - "tier_1": "metric_ageDepRatioPercWorkAgePop_country" - }, - { - "tier_0": "barro_lee_percentage_of_population_age_25_with_no_education", - "tier_1": "metric_percPopOver25NoEduc_country" - }, - { - "tier_0": "barro_lee_perce_of_popul_age_25_with_tertia_school_comple_tertia", - "tier_1": "metric_percPopOver25TertiaryEduc_country" - }, - { - "tier_0": "poverty_headcount_ratio_at_national_poverty_line_pc_of_populatio", - "tier_1": "metric_percPopBelowPovertyLine_country" - }, - { - "tier_0": "rural_population_pc_of_total_population", - "tier_1": "metric_percPopRural_country" - }, - { - "tier_0": "mortality_rate_infant_per_1_000_live_births", - "tier_1": "metric_infMortPer1000LiveBirths_country" - }, - { - "tier_0": "urban_population_pc_of_total", - "tier_1": "metric_percPopUrban_country" - } -] \ No newline at end of file diff --git a/nesta/core/schemas/tier_1/tests/test_aliases.py b/nesta/core/schemas/tier_1/tests/test_aliases.py new file mode 100644 index 00000000..c8b0e906 --- /dev/null +++ b/nesta/core/schemas/tier_1/tests/test_aliases.py @@ -0,0 +1,35 @@ +import os +import glob +import json +from pathlib import Path +import pytest + +@pytest.fixture +def json_files(): + cwd = os.path.dirname(__file__) + return list(glob.glob(f'{cwd}/../**/*json', recursive=True)) + + +def test_mappings_build(json_files): + + # Test each dataset for valid ontology + dirname = Path(os.path.dirname(__file__)).parent + dataset_dirname = os.path.join(dirname, 'datasets') + ontology = {} + for dataset in os.listdir(dataset_dirname): + filename = os.path.join(dataset_dirname, dataset) + with open(filename) as f: + _ontology = json.load(f) + ontology[dataset.split('.json')[0]] = list(_ontology['tier0_to_tier1'].values()) + + # Test that each alias is valid + for filename in json_files: + _, _filename = 
os.path.split(filename)
+        if _filename != 'aliases.json':
+            continue
+        with open(filename) as f:
+            aliases = json.load(f)
+        for new_name, info in aliases.items():
+            for dataset, old_name in info.items():
+                assert dataset in ontology, f'No such dataset "{dataset}" in {list(ontology.keys())}, referenced by {filename}'
+                assert old_name in ontology[dataset], f'{old_name} not found in {dataset}, referenced by {filename}'
diff --git a/nesta/core/schemas/tier_1/tests/test_format.py b/nesta/core/schemas/tier_1/tests/test_format.py
new file mode 100644
index 00000000..bb157b9c
--- /dev/null
+++ b/nesta/core/schemas/tier_1/tests/test_format.py
@@ -0,0 +1,44 @@
+from nesta.core.orms.orm_utils import get_es_mapping
+import os
+import glob
+import json
+import pytest
+
+@pytest.fixture
+def json_files():
+    cwd = os.path.dirname(__file__)
+    return glob.glob(f'{cwd}/../**/*json', recursive=True)
+
+
+def test_is_tidy(json_files):
+    """Check that all files are valid, tidy json"""
+    for filename in json_files:
+        # ontology.json is tested elsewhere
+        _, _filename = os.path.split(filename)
+        if _filename == 'ontology.json':
+            continue
+        with open(filename) as f:
+            raw = f.read()
+        js = json.loads(raw)
+        assert raw == json.dumps(js, sort_keys=True, indent=4), (f'\n\n{_filename} has not been tidied.\nBe sure to '
+                                                                 'run "python .githooks/hooktools/sort_all_json.py" '
+                                                                 'from the root directory to '
+                                                                 'avoid this test failure.\n\n')
+
+def test_mappings_build(json_files):
+    endpoints, datasets = set(), set()
+    for filename in json_files:
+        if not filename.endswith('mapping.json'):
+            continue
+        if 'datasets' in filename:
+            _, _filename = os.path.split(filename)
+            dataset = _filename.split('_mapping.json')[0]
+            datasets.add(dataset)
+        if 'endpoints' in filename:
+            dirname, _ = os.path.split(filename)
+            _, endpoint = os.path.split(dirname)
+            endpoints.add(endpoint)
+    for endpoint in endpoints:
+        for dataset in datasets:
+            get_es_mapping(dataset, endpoint)
+            get_es_mapping(dataset, 'dummy')  # <--- also test on non-existent endpoint
diff --git a/nesta/core/schemas/tier_1/tests/test_ontology.py b/nesta/core/schemas/tier_1/tests/test_ontology.py
new file mode 100644
index 00000000..e491c50e
--- /dev/null
+++ b/nesta/core/schemas/tier_1/tests/test_ontology.py
@@ -0,0 +1,48 @@
+import os
+from pathlib import Path
+import pytest
+import json
+
+@pytest.fixture
+def ontology():
+    dirname = Path(os.path.dirname(__file__)).parent
+    # Load the ontology
+    filename = os.path.join(dirname, 'ontology.json')
+    with open(filename) as f:
+        ontology = json.load(f)
+    return {row['term']: row['values'] for row in ontology}
+
+
+def test_ontology_uniqueness(ontology):
+    for lvl, values in ontology.items():
+        assert len(values) == len(set(values)), f'{lvl} has duplicate values'
+
+
+def test_validate(ontology):
+    dirname = Path(os.path.dirname(__file__)).parent
+    dataset_dirname = os.path.join(dirname, 'datasets')
+    firsts, middles, lasts = [], [], []
+    # Test each dataset for valid ontology
+    for filename in os.listdir(dataset_dirname):
+        filename = os.path.join(dataset_dirname, filename)
+        with open(filename) as f:
+            dataset = json.load(f)
+        for field_name in dataset['tier0_to_tier1'].values():
+            if field_name.startswith('_'):
+                field_name = field_name[1:]
+            first, middle, last = field_name.split('_')
+            # Test the vocab is valid
+            assert first in ontology['firstName'], f'{dataset} has unexpected field {field_name}'
+            assert middle in ontology['middleName'], f'{dataset} has unexpected field {field_name}'
+            assert last in
ontology['lastName'], f'{dataset} has unexpected field {field_name}' + # Save these for the tests at the end + firsts.append(first) + middles.append(middle) + lasts.append(last) + # Test there is no superfluous vocab in the ontology + for f in ontology['firstName']: + assert f in firsts, f'Unused first name: {f}' + for f in ontology['middleName']: + assert f in middles, f'Unused middle name: {f}' + for f in ontology['lastName']: + assert f in lasts, f'Unused last name: {f}' diff --git a/nesta/core/schemas/tier_1/tests/test_validate.py b/nesta/core/schemas/tier_1/tests/test_validate.py deleted file mode 100644 index 4c70be6b..00000000 --- a/nesta/core/schemas/tier_1/tests/test_validate.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import glob -import json -from collections import Counter -from nesta.core.luigihacks.misctools import find_filepath_from_pathstub - -ES_CONF_SUFFIX = "_es_config.json" - -def alias_info(filepath): - with open(filepath) as f: - data = json.load(f) - for alias, info in data.items(): - for dataset, field in info.items(): - yield (alias, dataset, field) - - -class TestValidate(): - def test_validate(self): - # Load the ontology - cwd = os.path.dirname(__file__) - filename = os.path.join(cwd, '../tier_1.json') - with open(filename) as f: - data = json.load(f) - ontology = {row["term"]: row["values"] for row in data} - # Assert the core structure of the ontology - assert len(ontology) == 3 - for term_type in ["firstName", "middleName", "lastName"]: - assert term_type in ontology - - # Iterate over schema transformations - all_fields = {} - for filename in glob.glob(f'{cwd}/../**/*json', - recursive=True): - # Load the transformation - if 'schema_transformations' not in filename: - continue - print(filename) - with open(filename) as f: - data = json.load(f) - # Assert that the terms are in the ontology - tier_0, tier_1 = [], [] - for row in data: - fieldname = row['tier_1'] - tier_0.append(row['tier_0']) - tier_1.append(fieldname) - if fieldname.startswith("_"): - fieldname = fieldname[1:] - first, middle, last = fieldname.split("_") - assert first in ontology["firstName"] - assert middle in ontology["middleName"] - assert last in ontology["lastName"] - # Record the dataset name for the next tests - dataset_name = filename.replace(".json", "").split("/")[-1] - all_fields[dataset_name] = tier_1 - # Assert no duplicates - _, count = Counter(tier_0).most_common(1)[0] - print(Counter(tier_0).most_common(1)[0]) - assert count == 1 - _, count = Counter(tier_1).most_common(1)[0] - print(Counter(tier_1).most_common(1)[0]) - assert count == 1 - - def test_aliases(self): - """Assert consistency between the aliases and schemas""" - top_dir = find_filepath_from_pathstub("core/orms") - all_fields = {} - for filename in os.listdir(top_dir): - if not filename.endswith(ES_CONF_SUFFIX): - continue - dataset = filename.replace(ES_CONF_SUFFIX, "") - filename = os.path.join(top_dir, filename) - with open(filename) as f: - data = json.load(f) - print(f'Found {filename}') - fields = data["mappings"]["_doc"]["properties"].keys() - all_fields[dataset] = fields - - cwd = os.path.dirname(__file__) - path = os.path.join(cwd, '../aliases/') - for filename in os.listdir(path): - if not filename.endswith(".json"): - continue - filename = os.path.join(path, filename) - for alias, dataset, field in alias_info(filename): - print("\t", alias, dataset, field) - assert dataset in all_fields.keys() - assert field in all_fields[dataset] diff --git a/nesta/core/schemas/tier_1/tidy_schema.py 
b/nesta/core/schemas/tier_1/tests/tidy_schema.py similarity index 91% rename from nesta/core/schemas/tier_1/tidy_schema.py rename to nesta/core/schemas/tier_1/tests/tidy_schema.py index c6cf4f15..22ad96db 100644 --- a/nesta/core/schemas/tier_1/tidy_schema.py +++ b/nesta/core/schemas/tier_1/tests/tidy_schema.py @@ -7,7 +7,7 @@ """ import json -FILENAME="tier_1.json" +FILENAME="../ontology.json" # Load with open(FILENAME) as f: diff --git a/nesta/packages/biorxiv/collect_biorxiv.py b/nesta/packages/biorxiv/collect_biorxiv.py deleted file mode 100644 index 2699d68d..00000000 --- a/nesta/packages/biorxiv/collect_biorxiv.py +++ /dev/null @@ -1,35 +0,0 @@ -from nesta.packages.mag.query_mag_api import get_journal_articles -from nesta.packages.mag.parse_abstract import uninvert_abstract - -""" -Schema transformation of arviv ORM from MAG raw data, -so that biorxiv data from MAG can slot into arxiv pipelines. -""" -ARXIV_MAG = {'id':'DOI', - 'datestamp': 'D', - 'created': 'D', - 'updated': 'D', - 'title': 'DN', - 'doi':'DOI', - 'abstract': 'IA', - 'authors' : 'AA', - 'citation_count': 'CC'} - - -def get_biorxiv_articles(api_key, start_date='1 Jan, 2000'): - """Get all biorxiv articles from the MAG API. - - Args: - api_key (str): MAG API key - start_date (str): Sensibly formatted date string (interpretted by pd) - Yields: - article (dict): article object ready for insertion via nesta's arxiv ORM - """ - for article in get_journal_articles('biorxiv', start_date=start_date, - api_key=api_key): - # Convert to arxiv format for insertion to database - article= {arxiv_field: article[mag_field] - for arxiv_field, mag_field in ARXIV_MAG.items()} - article['abstract'] = uninvert_abstract(article['abstract']) - article['id'] = f"biorxiv-{article['id']}" # just to be sure - yield article diff --git a/nesta/packages/biorxiv/test_collect_biorxiv.py b/nesta/packages/biorxiv/test_collect_biorxiv.py deleted file mode 100644 index e8d0da68..00000000 --- a/nesta/packages/biorxiv/test_collect_biorxiv.py +++ /dev/null @@ -1,25 +0,0 @@ -from nesta.packages.biorxiv.collect_biorxiv import get_biorxiv_articles -from nesta.packages.biorxiv.collect_biorxiv import ARXIV_MAG -from nesta.core.orms.arxiv_orm import Article -from unittest import mock -import pytest - -@pytest.fixture -def dummy_article(): - return {key: f'blah blah{key}' for key in set(ARXIV_MAG.values())} - -def test_all_fields_in_orm(): - orm = dir(Article) - assert all(field in orm for field in ARXIV_MAG.keys()) - -@mock.patch('nesta.packages.biorxiv.collect_biorxiv.get_journal_articles') -@mock.patch('nesta.packages.biorxiv.collect_biorxiv.uninvert_abstract') -def test_get_biorxiv_articles(_, mocked, dummy_article): - n_articles = 3 - mocked.return_value = iter([dummy_article]*n_articles) - for i, article in enumerate(get_biorxiv_articles(api_key='dummy_api_key', - start_date='dummy_date')): - assert type(article) is dict - assert len(article) == len(ARXIV_MAG) - assert i + 1 == n_articles - diff --git a/nesta/packages/decorators/schema_transform.py b/nesta/packages/decorators/schema_transform.py index 45c067c5..10b82035 100644 --- a/nesta/packages/decorators/schema_transform.py +++ b/nesta/packages/decorators/schema_transform.py @@ -6,40 +6,34 @@ such that specified field names are transformed and unspecified fields are dropped. A valid file would be formatted as shown: -[{"tier_0": "bad_col", "tier_1": "good_col"}, -{"tier_0": "another_bad_col", "tier_1": "another_good_col"}, -...] 
-
-where :code:`tier_0` and :code:`tier_1` correspond to :code:`from_key` and :code:`to_key`
-in the below documentation.
+{ "tier0_to_tier1":
+    { "bad_col": "good_col",
+      "another_bad_col": "another_good_col"
+    }
+}
 '''
 
 import pandas
 import json
 
-def load_transformer(filename, from_key, to_key):
+def load_transformer(filename):
     with open(filename) as f:
         _data = json.load(f)
-    transformer = {row[from_key]:row[to_key] for row in _data}
+    transformer = _data['tier0_to_tier1']
     return transformer
 
-def schema_transform(filename, from_key, to_key):
+def schema_transform(filename):
     '''
     Args:
-        filename (str): A record-oriented JSON file path mapping field names
-                        denoted by from :code:`from_key` and :code:`to_key`.
-        from_key (str): The key in file indicated by :code:`filename` which indicates
-                        the field name to transform.
-        to_key (str): The key in file indicated by :code:`filename` which what
-                      the field name indicated by :code:`from_key` will be transformed to.
+        filename (str): Path to a JSON file containing the
+                        :code:`tier0_to_tier1` field-name mapping shown above.
     Returns:
         Data in the format it was originally passed to the wrapper in, with
         specified field names transformed and unspecified fields dropped.
     '''
-    transformer = load_transformer(filename, from_key, to_key)
+    transformer = load_transformer(filename)
     def wrapper(func):
         def transformed(*args, **kwargs):
             data = func(*args,**kwargs)
@@ -64,20 +58,18 @@ def transformed(*args, **kwargs):
     return wrapper
 
-def schema_transformer(data, *, filename, from_key, to_key, ignore=[]):
+def schema_transformer(data, *, filename, ignore=[]):
     '''Function version of the schema_transformer wrapper.
 
     Args:
         data (dataframe OR list of dicts): the data requiring the schema transformation
         filename (str): the path to the schema json file
-        from_key (str): tier level of the data
-        to_key (str): tier level to be applied to the data
         ignore (list): optional list of fields, e.g. ids or keys which shouldn't be dropped
 
     Returns:
         supplied data with schema applied
     '''
     # Accept DataFrames...
- transformer = load_transformer(filename, from_key, to_key) + transformer = load_transformer(filename) if type(data) == pandas.DataFrame: drop_cols = [c for c in data.columns if c not in transformer diff --git a/nesta/packages/decorators/tests/test_schema_transform.py b/nesta/packages/decorators/tests/test_schema_transform.py index aa3c66c0..5994cc35 100644 --- a/nesta/packages/decorators/tests/test_schema_transform.py +++ b/nesta/packages/decorators/tests/test_schema_transform.py @@ -22,7 +22,7 @@ def test_transformer(): def test_dataframe_transform(self, mocked_loader, test_transformer, test_data): mocked_loader.return_value = test_transformer dummy_func = lambda : pd.DataFrame(test_data) - wrapper = schema_transform("dummy", "dummy", "dummy") + wrapper = schema_transform("dummy") wrapped = wrapper(dummy_func) transformed = wrapped() @@ -34,7 +34,7 @@ def test_dataframe_transform(self, mocked_loader, test_transformer, test_data): def test_list_of_dict_transform(self, mocked_loader, test_transformer, test_data): mocked_loader.return_value = test_transformer dummy_func = lambda : test_data - wrapper = schema_transform("dummy", "dummy", "dummy") + wrapper = schema_transform("dummy") wrapped = wrapper(dummy_func) transformed = wrapped() transformed = pd.DataFrame(transformed) @@ -46,7 +46,7 @@ def test_list_of_dict_transform(self, mocked_loader, test_transformer, test_data def test_invalid_type_transform(self, mocked_loader, test_transformer): mocked_loader.return_value = test_transformer dummy_func = lambda : None - wrapper = schema_transform("dummy", "dummy", "dummy") + wrapper = schema_transform("dummy") wrapped = wrapper(dummy_func) with pytest.raises(ValueError) as e: wrapped() @@ -57,6 +57,5 @@ def test_single_dict(self, mocked_loader, test_transformer): mocked_loader.return_value = test_transformer test_data = {'bad_col': 111, 'another_bad_col': 222, 'stuff': 333} - transformed = schema_transformer(test_data, filename='dummy', - from_key='dummy', to_key='dummy') + transformed = schema_transformer(test_data, filename='dummy') assert transformed == {'good_col': 111, 'another_good_col': 222} diff --git a/nesta/packages/geo_utils/country_iso_code.py b/nesta/packages/geo_utils/country_iso_code.py index 9a9ec591..9817594b 100644 --- a/nesta/packages/geo_utils/country_iso_code.py +++ b/nesta/packages/geo_utils/country_iso_code.py @@ -49,19 +49,20 @@ def country_iso_code_dataframe(df, country='country'): df['continent'] = None continents = alpha2_to_continent_mapping() - + country_codes = None for idx, row in df.iterrows(): try: country_codes = country_iso_code(row[country]) except KeyError: # some fallback method could go here - pass + continue else: - df.at[idx, 'country_alpha_2'] = country_codes.alpha_2 - df.at[idx, 'country_alpha_3'] = country_codes.alpha_3 - df.at[idx, 'country_numeric'] = country_codes.numeric - df.at[idx, 'continent'] = continents.get(country_codes.alpha_2) - + if country_codes is None: + continue + df.at[idx, 'country_alpha_2'] = country_codes.alpha_2 + df.at[idx, 'country_alpha_3'] = country_codes.alpha_3 + df.at[idx, 'country_numeric'] = country_codes.numeric + df.at[idx, 'continent'] = continents.get(country_codes.alpha_2) return df diff --git a/requirements.txt b/requirements.txt index 06a159bd..45e64d20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ PyMySQL==0.9.3 pyshp==2.1.0 pytest==4.5.0 PyVirtualDisplay==0.2.3 +pyyaml==5.3.1 requests==2.22.0 requests_aws4auth==0.9 retrying==1.3.3
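For reference, the transformation files consumed by schema_transform.py now use a single
:code:`tier0_to_tier1` object in place of the old record-oriented list, and the
:code:`from_key`/:code:`to_key` arguments are gone from the public helpers. A minimal
sketch of the new call pattern, mirroring test_schema_transform.py (the file name
"example.json" and the column names are illustrative only)::

    import json
    from nesta.packages.decorators.schema_transform import schema_transformer

    # Write a mapping file in the new single-object format
    with open('example.json', 'w') as f:
        json.dump({'tier0_to_tier1': {'bad_col': 'good_col',
                                      'another_bad_col': 'another_good_col'}}, f)

    # Unmapped fields ('stuff') are dropped unless listed in `ignore`
    row = {'bad_col': 111, 'another_bad_col': 222, 'stuff': 333}
    out = schema_transformer(row, filename='example.json')
    assert out == {'good_col': 111, 'another_good_col': 222}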
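test_ontology.py enforces that every tier-1 value in datasets/*.json decomposes as
firstName_middleName_lastName, with each part drawn from the controlled vocabulary in
ontology.json (a single leading underscore, which appears to mark derived fields such as
_cost_usd2018_project, is stripped before validation). A sketch of that rule, using a
made-up ontology fragment rather than the real vocabulary::

    # Illustrative fragment in the {'term': ..., 'values': [...]} shape of ontology.json
    ontology_rows = [{'term': 'firstName', 'values': ['textBody', 'title']},
                     {'term': 'middleName', 'values': ['abstract', 'of']},
                     {'term': 'lastName', 'values': ['article', 'project']}]
    ontology = {row['term']: row['values'] for row in ontology_rows}

    def is_valid(field_name):
        if field_name.startswith('_'):  # one leading underscore is permitted
            field_name = field_name[1:]
        first, middle, last = field_name.split('_')  # exactly three components
        return (first in ontology['firstName'] and
                middle in ontology['middleName'] and
                last in ontology['lastName'])

    assert is_valid('textBody_abstract_article')
    assert is_valid('_title_of_project')
    assert not is_valid('textBody_summary_article')  # 'summary' is not a middleName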
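Similarly, each endpoint's aliases.json maps a public alias onto the tier-1 field it
refers to in each dataset, and test_aliases.py asserts that every referenced dataset and
field actually exist in the tier0_to_tier1 mappings. The expected shape, with
illustrative field names::

    # aliases.json: {alias: {dataset: tier-1 field name}}
    aliases = {'title': {'arxiv': 'title_of_article',
                         'cordis': 'title_of_project'}}

    # As in test_aliases.py, 'ontology' here means {dataset: [tier-1 field names]}
    # collected from datasets/*.json
    ontology = {'arxiv': ['title_of_article', 'textBody_abstract_article'],
                'cordis': ['title_of_project']}

    for new_name, info in aliases.items():
        for dataset, old_name in info.items():
            assert dataset in ontology
            assert old_name in ontology[dataset]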
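Finally, test_format.py exercises get_es_mapping for every dataset/endpoint pair. The
new directory layout (mappings/defaults/defaults.json, mappings/datasets/<dataset>_mapping.json,
mappings/endpoints/<endpoint>/<dataset>_mapping.json) suggests a defaults < dataset <
endpoint override order; the following is a rough sketch of such a merge under that
assumption, not the actual orm_utils implementation::

    import json

    def deep_update(base, override):
        """Recursively overlay `override` onto `base`: nested dicts are
        merged, any other value is replaced outright."""
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_update(base[key], value)
            else:
                base[key] = value
        return base

    def build_mapping(dataset, endpoint,
                      root='nesta/core/schemas/tier_1/mappings'):
        mapping = {}
        # Apply layers from most generic to most specific
        for path in (f'{root}/defaults/defaults.json',
                     f'{root}/datasets/{dataset}_mapping.json',
                     f'{root}/endpoints/{endpoint}/{dataset}_mapping.json'):
            try:
                with open(path) as f:
                    deep_update(mapping, json.load(f))
            except FileNotFoundError:  # e.g. a non-existent endpoint
                pass
        return mapping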