Skip to content
This repository has been archived by the owner on Aug 13, 2021. It is now read-only.

[267] Tidy & slim schema transformations #281

Merged
22 commits merged on Jun 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions nesta/core/batchables/arxiv/arxiv_elasticsearch/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from datetime import datetime as dt

from nesta.core.orms.orm_utils import db_session, get_mysql_engine
from nesta.core.orms.orm_utils import load_json_from_pathstub
from nesta.core.orms.orm_utils import object_to_dict
from nesta.core.orms.arxiv_orm import Article
from nesta.core.orms.grid_orm import Institute
Expand Down Expand Up @@ -76,10 +75,7 @@ def run():
ngrammer = Ngrammer(database="production")

# es setup
strans_kwargs={'filename':'arxiv.json',
'from_key':'tier_0',
'to_key':'tier_1',
'ignore':['id']}
strans_kwargs={'filename':'arxiv.json', 'ignore':['id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down Expand Up @@ -164,9 +160,9 @@ def run():
countries = set(grid_countries[inst_id]
for inst_id in good_institutes
if inst_id in grid_countries)
row['categories'], _, _ = hierarchy_field(cats)
row['fos'], _, _ = hierarchy_field(fos)
row['countries'], _, _ = hierarchy_field(countries)
row['nested_categories'], _, _ = hierarchy_field(cats)
row['fields_of_study'], _, _ = hierarchy_field(fos)
row['nested_location'], _, _ = hierarchy_field(countries)

# Pull out international institute info
has_mn = any(is_multinational(inst,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,8 @@ def run():
continent_lookup[None] = None

# es setup
field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
"health_scanner.json")
strans_kwargs={'filename':'crunchbase_organisation_members.json',
'from_key':'tier_0',
'to_key':'tier_1',
'ignore':['id']}
field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
4 changes: 1 addition & 3 deletions nesta/core/batchables/eurito/arxiv_eu/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,7 @@ def run():

# es setup
logging.info('Connecting to ES')
strans_kwargs={'filename':'eurito/arxiv-eu.json',
'from_key':'tier_0', 'to_key':'tier_1',
'ignore':['id']}
strans_kwargs = {'filename': 'arxiv.json', 'ignore': ['id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,7 @@ def run():
eu_countries = get_eu_countries()

# es setup
strans_kwargs={'filename':'eurito/crunchbase-eu.json',
'from_key':'tier_0',
'to_key':'tier_1',
'ignore':['id']}
strans_kwargs = {'filename': 'companies.json', 'ignore': ['id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
4 changes: 1 addition & 3 deletions nesta/core/batchables/eurito/cordis_eu/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,7 @@ def run():

# es setup
logging.info('Connecting to ES')
strans_kwargs={'filename':'eurito/cordis-eu.json',
'from_key':'tier_0', 'to_key':'tier_1',
'ignore':['id']}
strans_kwargs = {'filename': 'cordis.json', 'ignore': ['id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
128 changes: 0 additions & 128 deletions nesta/core/batchables/eurito/patstat-eu/run.py

This file was deleted.

4 changes: 1 addition & 3 deletions nesta/core/batchables/eurito/patstat_eu/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,7 @@ def run():

# es setup
logging.info('Connecting to ES')
strans_kwargs={'filename':'eurito/patstat-eu.json',
'from_key':'tier_0', 'to_key':'tier_1',
'ignore':['id']}
strans_kwargs = {'filename': 'patstat.json', 'ignore': ['id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,7 @@ def run():
dupes = format_duplicate_map(dupes)

# Set up elastic search connection
field_null_mapping = load_json_from_pathstub("tier_1/"
"field_null_mappings/",
"health_scanner.json")
field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
es = ElasticsearchPlus(hosts=es_config['host'],
port=es_config['port'],
aws_auth_region=es_config['region'],
Expand Down
4 changes: 1 addition & 3 deletions nesta/core/batchables/health_data/nih_dedupe/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ def run():
art_ids = json.loads(ids_obj.get()['Body']._raw_stream.read())
logging.info(f'Processing {len(art_ids)} article ids')

field_null_mapping = load_json_from_pathstub(("tier_1/"
"field_null_mappings/"),
"health_scanner.json")
field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
14 changes: 4 additions & 10 deletions nesta/core/batchables/health_data/nih_process_data/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
def run():
start_index = os.environ["BATCHPAR_start_index"]
end_index = os.environ["BATCHPAR_end_index"]
#mysqldb_config = os.environ["BATCHPAR_config"]
es_host = os.environ["BATCHPAR_outinfo"]
es_port = os.environ["BATCHPAR_out_port"]
es_index = os.environ["BATCHPAR_out_index"]
Expand Down Expand Up @@ -87,13 +86,8 @@ def run():
df['total_cost_currency'] = 'USD'

# output to elasticsearch
field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
"health_scanner.json")
strans_kwargs={'filename':'nih.json',
'from_key':'tier_0',
'to_key':'tier_1',
'ignore':['application_id']}

field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
strans_kwargs = {'filename': 'nih.json', 'ignore': ['application_id']}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down Expand Up @@ -143,15 +137,15 @@ def run():
pars = {'start_index': '2001360',
'end_index': '2003940',
'db': 'dev',
'done': 'False',
'config': (f'{os.environ["HOME"]}/nesta/nesta/'
'core/config/mysqldb.config'),
'done': 'False',
'outinfo': ('https://search-health-scanner-'
'5cs7g52446h7qscocqmiky5dn4.'
'eu-west-2.es.amazonaws.com'),
'out_index': 'nih_dev',
'out_type': '_doc',
'out_port': '_doc',
'out_port': '_443',
'aws_auth_region': 'eu-west-2',
'entity_type': 'paper',
'test': 'False'}
Expand Down
9 changes: 2 additions & 7 deletions nesta/core/batchables/meetup/topic_tag_elasticsearch/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,8 @@ def run():
mesh_terms = format_mesh_terms(df_mesh)

# Setup ES+
field_null_mapping = load_json_from_pathstub(("tier_1/"
"field_null_mappings/"),
"health_scanner.json")
strans_kwargs={'filename':'meetup.json',
'from_key':'tier_0',
'to_key':'tier_1',
'ignore':[]}
field_null_mapping = load_json_from_pathstub("health-scanner", "nulls.json")
strans_kwargs = {'filename': 'meetup.json'}
es = ElasticsearchPlus(hosts=es_host,
port=es_port,
aws_auth_region=aws_auth_region,
Expand Down
Binary file modified nesta/core/config/elasticsearch.yaml
Binary file not shown.
Loading