Skip to content
This repository was archived by the owner on Aug 13, 2021. It is now read-only.

Commit 2421024

Browse files
jaklinger and Joel Klinger authored
[266] Refactor and simplify ES configuration (#275)
* make sure conf dir is empty
* simplified es config
* added orm es config reader
* modified setup_es to pick up new es config
* swapped es_mode for boolean
* aliases now consistent with config
* aliases now automatically located
* added endpoint field to estasks
* added endpoint field to sql2estasks
* [267] Pool ES mappings across datasets (#280)
* changed branch name
* mappings build
* updated docs
* updated docs
* updated docs
* added docstrings
* added dynamic strict to settings
* removed index.json in favour of a single defaults file
* using soft alias until a future PR to minimise changes
* cleaned and sorted json
* [267] Tidy & slim schema transformations (#281)
* pruned deprecated schema transformations
* updated fos fieldname on arxlive
* unified data set schema transformations
* restructured directory
* refactored references to schema_transformation
* refactored references to schema_transformation
* slimmed down transformations, and included entity_type
* pruned ontology
* tidied schemas
* consistency tests
* reverted unrelated json file
* harmonised name fieldsofstudy across arxiv
* added novelty back in
* sorted json
* sorted json
* sorted json

Co-authored-by: Joel Klinger <[email protected]>
Co-authored-by: Joel Klinger <[email protected]>

* patched out es config setup from tests
* removed redundant tests
* fixed json formatting
* none included for testing
* picked up bug in test

Co-authored-by: Joel Klinger <[email protected]>
1 parent e095b7e commit 2421024

File tree

84 files changed

+1238
-2215
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

84 files changed

+1238
-2215
lines changed

docs/source/nesta.core.schemas.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
.. include:: ../../nesta/core/schemas/README.rst
2+
.. include:: ../../nesta/core/schemas/tier_1/mappings/README.rst

docs/source/nesta.core.scripts.rst

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1 @@
1-
Scripts
2-
=======
3-
41
.. include:: ../../nesta/core/scripts/README.rst

nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
from datetime import datetime as dt
2121

2222
from nesta.core.orms.orm_utils import db_session, get_mysql_engine
23-
from nesta.core.orms.orm_utils import load_json_from_pathstub
2423
from nesta.core.orms.orm_utils import object_to_dict
2524
from nesta.core.orms.arxiv_orm import Article
2625
from nesta.core.orms.grid_orm import Institute
@@ -76,10 +75,7 @@ def run():
7675
ngrammer = Ngrammer(database="production")
7776

7877
# es setup
79-
strans_kwargs={'filename':'arxiv.json',
80-
'from_key':'tier_0',
81-
'to_key':'tier_1',
82-
'ignore':['id']}
78+
strans_kwargs={'filename':'arxiv.json', 'ignore':['id']}
8379
es = ElasticsearchPlus(hosts=es_host,
8480
port=es_port,
8581
aws_auth_region=aws_auth_region,
@@ -164,9 +160,9 @@ def run():
164160
countries = set(grid_countries[inst_id]
165161
for inst_id in good_institutes
166162
if inst_id in grid_countries)
167-
row['categories'], _, _ = hierarchy_field(cats)
168-
row['fos'], _, _ = hierarchy_field(fos)
169-
row['countries'], _, _ = hierarchy_field(countries)
163+
row['nested_categories'], _, _ = hierarchy_field(cats)
164+
row['fields_of_study'], _, _ = hierarchy_field(fos)
165+
row['nested_location'], _, _ = hierarchy_field(countries)
170166

171167
# Pull out international institute info
172168
has_mn = any(is_multinational(inst,
@@ -216,8 +212,8 @@ def run():
216212

217213
if 'BATCHPAR_outinfo' not in os.environ:
218214
from nesta.core.orms.orm_utils import setup_es
219-
es, es_config = setup_es('dev', True, True,
220-
dataset='arxiv')
215+
es, es_config = setup_es(endpoint='arxlive', dataset='arxiv',
216+
production=False, drop_and_recreate=True)
221217
environ = {'batch_file': ('ArxivESTask-2019-09-19-'
222218
'False-1568888970724721.json'),
223219
'config': ('/home/ec2-user/nesta-eu/nesta/'

nesta/core/batchables/crunchbase/crunchbase_elasticsearch/run.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,8 @@ def run():
5858
continent_lookup[None] = None
5959

6060
# es setup
61-
field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
62-
"health_scanner.json")
63-
strans_kwargs={'filename':'crunchbase_organisation_members.json',
64-
'from_key':'tier_0',
65-
'to_key':'tier_1',
66-
'ignore':['id']}
61+
field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
62+
strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
6763
es = ElasticsearchPlus(hosts=es_host,
6864
port=es_port,
6965
aws_auth_region=aws_auth_region,
@@ -162,9 +158,9 @@ def run():
162158

163159
if 'BATCHPAR_outinfo' not in os.environ:
164160
from nesta.core.orms.orm_utils import setup_es
165-
es, es_config = setup_es('dev', True, True,
166-
dataset='crunchbase',
167-
aliases='health_scanner')
161+
es, es_config = setup_es(production=False, endpoint='health-scanner',
162+
dataset='companies',
163+
drop_and_recreate=True)
168164

169165
environ = {"AWSBATCHTEST": "",
170166
'BATCHPAR_batch_file': 'crunchbase_to_es-15597291977144725.json',

nesta/core/batchables/eurito/arxiv_eu/run.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,7 @@ def run():
5454

5555
# es setup
5656
logging.info('Connecting to ES')
57-
strans_kwargs={'filename':'eurito/arxiv-eu.json',
58-
'from_key':'tier_0', 'to_key':'tier_1',
59-
'ignore':['id']}
57+
strans_kwargs = {'filename': 'arxiv.json', 'ignore': ['id']}
6058
es = ElasticsearchPlus(hosts=es_host,
6159
port=es_port,
6260
aws_auth_region=aws_auth_region,
@@ -202,8 +200,8 @@ def run():
202200
set_log_level()
203201
if 'BATCHPAR_outinfo' not in os.environ:
204202
from nesta.core.orms.orm_utils import setup_es
205-
es, es_config = setup_es('dev', True, True,
206-
dataset='arxiv-eu')
203+
es, es_config = setup_es(production=False, endpoint='eurito',
204+
dataset='arxiv', drop_and_recreate=True)
207205
environ = {'config': ('/home/ec2-user/nesta-eu/nesta/'
208206
'core/config/mysqldb.config'),
209207
'batch_file' : ('arxiv-eu_EURITO-ElasticsearchTask-'

nesta/core/batchables/eurito/crunchbase_eu/run.py renamed to nesta/core/batchables/eurito/companies_eu/run.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,7 @@ def run():
6161
eu_countries = get_eu_countries()
6262

6363
# es setup
64-
strans_kwargs={'filename':'eurito/crunchbase-eu.json',
65-
'from_key':'tier_0',
66-
'to_key':'tier_1',
67-
'ignore':['id']}
64+
strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
6865
es = ElasticsearchPlus(hosts=es_host,
6966
port=es_port,
7067
aws_auth_region=aws_auth_region,

nesta/core/batchables/eurito/cordis_eu/run.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,7 @@ def run():
8888

8989
# es setup
9090
logging.info('Connecting to ES')
91-
strans_kwargs={'filename':'eurito/cordis-eu.json',
92-
'from_key':'tier_0', 'to_key':'tier_1',
93-
'ignore':['id']}
91+
strans_kwargs = {'filename': 'cordis.json', 'ignore': ['id']}
9492
es = ElasticsearchPlus(hosts=es_host,
9593
port=es_port,
9694
aws_auth_region=aws_auth_region,
@@ -132,8 +130,8 @@ def run():
132130
if 'BATCHPAR_outinfo' not in os.environ:
133131
from nesta.core.orms.orm_utils import setup_es
134132
from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
135-
es, es_config = setup_es('dev', True, True,
136-
dataset='cordis-eu')
133+
es, es_config = setup_es(production=False, endpoint='eurito',
134+
dataset='cordis', drop_and_recreate=True)
137135
environ = {'config': find_filepath_from_pathstub('mysqldb.config'),
138136
'batch_file' : ('cordis-eu_EURITO-ElasticsearchTask-'
139137
'2020-04-10-True-15865345336407135.json'),

nesta/core/batchables/eurito/patstat-eu/run.py

Lines changed: 0 additions & 128 deletions
This file was deleted.

nesta/core/batchables/eurito/patstat_eu/run.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,7 @@ def run():
6363

6464
# es setup
6565
logging.info('Connecting to ES')
66-
strans_kwargs={'filename':'eurito/patstat-eu.json',
67-
'from_key':'tier_0', 'to_key':'tier_1',
68-
'ignore':['id']}
66+
strans_kwargs = {'filename': 'patstat.json', 'ignore': ['id']}
6967
es = ElasticsearchPlus(hosts=es_host,
7068
port=es_port,
7169
aws_auth_region=aws_auth_region,

nesta/core/batchables/health_data/nih_abstract_mesh_data/run.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,7 @@ def run():
6868
dupes = format_duplicate_map(dupes)
6969

7070
# Set up elastic search connection
71-
field_null_mapping = load_json_from_pathstub("tier_1/"
72-
"field_null_mappings/",
73-
"health_scanner.json")
71+
field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
7472
es = ElasticsearchPlus(hosts=es_config['host'],
7573
port=es_config['port'],
7674
aws_auth_region=es_config['region'],

0 commit comments

Comments
 (0)