Skip to content
This repository was archived by the owner on Aug 13, 2021. It is now read-only.

Commit 1de9bd2

Browse files
jaklinger (Joel Klinger)
and authored
[271] GtR general ES pipeline (#286)
* make sure conf dir is empty * simplified es config * added orm es config reader * modified setup_es to pick up new es config * swapped es_mode for boolean * aliases now consistent with config * aliases now automatically located * added endpoint field to estasks * added endpoint field to sql2estasks * changed branch name * mappings build * updated docs * updated docs * updated docs * added docstrings * pruned deprecated schema transformations * updated fos fieldname on arxlive * unified data set schema transformations * restructured directory * refactored references to schema_transformation * refactored references to schema_transformation * slimmed down transformations, and included entity_type * pruned ontology * tidied schemas * consistency tests * reverted unrelated json file * added dynamic strict to settings * removed index.json in favour of a single defaults file * harmonised name fieldsofstudy across arxiv * using soft alias until a future PR to minimise changes * added novelty back in * sorted json * sorted json * sorted json * changed schema_transformor to use new simpler mapping * removed to/from keys * new null syntax mapping implemented * cleaned and sorted json * adding temporary eurito-dev index to avoid conflating es7 compatibility issues * adding temporary eurito-dev index to avoid conflating es7 compatibility issues * testing es7 on cordis only * testing es7 on cordis only * testing es7 on cordis only * changes to make cordis es7 run * eurito-dev iteration * compatibility issues between arxlive and eurito arxiv * sorted json * pycountry change no longer assumes not null country * needed to split pathstub args * removed redundant es mappings * empty gtr transformation * [267] Pool ES mappings across datasets (#280) * changed branch name * mappings build * updated docs * updated docs * updated docs * added docstrings * added dynamic strict to settings * removed index.json in favour of a single defaults file * using soft alias until a future PR to 
minimise changes * cleaned and sorted json * [267] Tidy & slim schema transformations (#281) * pruned deprecated schema transformations * updated fos fieldname on arxlive * unified data set schema transformations * restructured directory * refactored references to schema_transformation * refactored references to schema_transformation * slimmed down transformations, and included entity_type * pruned ontology * tidied schemas * consistency tests * reverted unrelated json file * harmonised name fieldsofstudy across arxiv * added novelty back in * sorted json * sorted json * sorted json Co-authored-by: Joel Klinger <[email protected]> Co-authored-by: Joel Klinger <[email protected]> * patched out es config setup from tests * removed redundant tests * fixed json formatting * fixed bad table name (NB table was empty anyway) * fixed bad table name (NB table was empty anyway) * gtr ontology * none included for testing * added schema transformation * picked up bug in test * gtr ontology is self consistent * added gtr mapping * added gtr to config * fixed merge conflicts * fixed merge conflicts * changed json field names * instiutes are now analyzed and text * sorted and cleaned json * added gtr batchable * empty test commit * couple of tests * tidied json * added schema module to reqs, finished tests * set up root task * moved to es7 image * removed standard token filter, as it is deprecated in es6.5 then removed in es7 * removed start/end dates since theyre empty * misalignment between batchable keys and field names * fixed mapping and removed outcomes due to mapping explosion * removed seconds from fund date fields * tidied json * added none value edgecase to str truncation * Update elasticsearchplus.py Co-authored-by: Joel Klinger <[email protected]>
1 parent 5229238 commit 1de9bd2

File tree

11 files changed

+512
-24
lines changed

11 files changed

+512
-24
lines changed
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
"""
2+
run.py (GtR general)
3+
--------------------
4+
5+
Transfer pre-collected GtR data from MySQL to Elasticsearch.
6+
"""
7+
8+
from ast import literal_eval
9+
import boto3
10+
import json
11+
import logging
12+
import os
13+
from datetime import datetime as dt
14+
15+
from nesta.core.luigihacks.elasticsearchplus import ElasticsearchPlus
16+
from nesta.core.luigihacks.luigi_logging import set_log_level
17+
from nesta.core.orms.orm_utils import db_session, get_mysql_engine
18+
from nesta.core.orms.orm_utils import load_json_from_pathstub
19+
from nesta.core.orms.orm_utils import object_to_dict, get_class_by_tablename
20+
from nesta.core.orms.gtr_orm import Base, Projects, LinkTable, OrganisationLocation
21+
from collections import defaultdict, Counter
22+
23+
24+
def default_pop(dictobj, key, default=None):
    """Pop the key from the dict-like object. If the key doesn't exist, return a default.

    Args:
        dictobj (dict-like): A dict-like object to modify.
        key (hashable): A key to pop from the dict-like object.
        default: Any value to be returned as default, should the key not exist.
            Defaults to a *fresh* empty dict per call.
    Returns:
        value: Either the value stored at the key, or the default value.
    """
    # NB: the default was previously the literal `{}` in the signature, the
    # classic mutable-default-argument pitfall: every caller would receive
    # (and could mutate) the very same shared dict instance.
    if default is None:
        default = {}
    try:
        default = dictobj.pop(key)
    except KeyError:
        pass
    return default
39+
40+
41+
def truncate_if_str(value, n):
    """Truncate a value if it's a string, otherwise return the value itself.

    Args:
        value: Object to truncate, if it's a string.
        n (int): Number of chars after which to truncate.
    Returns:
        truncated: A truncated string, otherwise the original value itself.
    """
    # isinstance (rather than an exact `type(...) is str` check) also
    # covers str subclasses, which slice identically.
    return value[:n] if isinstance(value, str) else value


def extract_funds(gtr_funds):
    """Extract and deduplicate funding information.

    Args:
        gtr_funds (list of dict): Raw GtR funding information for a single project.
    Returns:
        _gtr_funds (list of dict): Deduplicated GtR funding information,
            ready for ingestion to ES.
    """
    key_fields = ('start_date', 'end_date', 'category',
                  'amount', 'currencyCode')
    funds = {}
    for fund in gtr_funds:
        # Drop the PK, and truncate the raw 'start'/'end' values to
        # 10 chars (i.e. an ISO date) under new field names.
        row = {k: v for k, v in fund.items() if k != 'id'}
        row['start_date'] = truncate_if_str(row.pop('start'), 10)
        row['end_date'] = truncate_if_str(row.pop('end'), 10)
        # Rows sharing the composite key overwrite one another: dedupe.
        composite_key = tuple(row[k] for k in key_fields)
        funds[composite_key] = row
    return list(funds.values())
70+
71+
72+
def get_linked_rows(session, links):
    """Pull rows out of the database from various tables,
    as indicated by the link table.

    Args:
        session (SqlAlchemy session): Open session from which to query the database.
        links (dict): Mapping of table name to a list of PKs in that table.
    Returns:
        rows (dict): Mapping of table name to a list of rows of data from that table.
    """
    prefix = 'gtr_outcomes_'
    linked_rows = defaultdict(list)
    for table_name, pks in links.items():
        # GtR outcomes are only *counted* for now, since indexing them in
        # full leads to a mapping explosion in Elasticsearch.
        if table_name.startswith('gtr_outcomes'):
            outcome_type = table_name[len(prefix):]
            linked_rows['gtr_outcomes'].extend([outcome_type] * len(pks))
            continue
        orm = get_class_by_tablename(Base, table_name)
        query = session.query(orm).filter(orm.id.in_(pks))
        linked_rows[table_name].extend(object_to_dict(obj)
                                       for obj in query.all())
    return linked_rows
94+
95+
96+
def reformat_row(row, linked_rows, locations):
    """Prepare raw data for ingestion to ES.

    Args:
        row (dict): Row of data.
        linked_rows (dict): Mapping of table name to a list of rows of data from that table.
        locations (dict): Mapping of organisation id to location data.
    Returns:
        row (dict): Reformatted row of data.
    """
    # General info: funding, outcome counts, topics and institutes
    gtr_funds = default_pop(linked_rows, 'gtr_funds')
    row['_json_funding_project'] = extract_funds(gtr_funds)
    row['_json_outcomes_project'] = dict(Counter(linked_rows['gtr_outcomes']))
    row['_terms_topics_project'] = [topic['text']
                                    for topic in linked_rows['gtr_topic']
                                    if topic['text'] != 'Unclassified']
    organisations = linked_rows['gtr_organisations']
    row['_terms_institutes_project'] = [org['name'] for org in organisations]
    org_ids = [org['id'] for org in organisations]
    row['_terms_instituteIds_project'] = org_ids

    # Geographic info, via the organisation --> location lookup
    matched = [loc for org_id, loc in locations.items() if org_id in org_ids]
    row['_terms_countries_project'] = [loc['country_name'] for loc in matched]
    row['_terms_iso2_project'] = [loc['country_alpha_2'] for loc in matched]
    row['_terms_continent_project'] = [loc['continent'] for loc in matched]

    # Only emit a coordinate when both ordinates are present
    coordinates = []
    for loc in matched:
        lat, lon = loc['latitude'], loc['longitude']
        if lat is not None and lon is not None:
            coordinates.append({'lat': float(lat), 'lon': float(lon)})
    row['_coordinate_institutes_project'] = coordinates
    return row
129+
130+
131+
def get_project_links(session, project_ids):
    """Generate the look-up table of table_name to object ids, by project id,
    as a preparatory stage for retrieving the "rows" by id from each
    table_name, by project id.

    Args:
        session (SqlAlchemy session): Open session from which to query the database.
        project_ids (list-like): List of project ids to extract linked entities from.
    Returns:
        linked_rows (dict): Mapping of project id to {table name: list of row ids}.
    """
    project_links = defaultdict(lambda: defaultdict(list))
    query = session.query(LinkTable).filter(LinkTable.project_id.in_(project_ids))
    for link in map(object_to_dict, query.all()):
        project_links[link['project_id']][link['table_name']].append(link['id'])
    return project_links
147+
148+
149+
def get_org_locations(session):
    """Retrieve look-up of all organisation ids to location metadata.

    Args:
        session (SqlAlchemy session): Open session from which to query the database.
    Returns:
        locations (nested dict): Mapping of organisation id to location metadata.
    """
    locations = {}
    for obj in session.query(OrganisationLocation).all():
        metadata = object_to_dict(obj)
        org_id = metadata.pop('id')  # remaining fields are the location data
        locations[org_id] = metadata
    return locations
162+
163+
164+
def run():
    """Batchable entry point: transfer the pre-collected GtR projects listed
    in the S3 batch file from MySQL to Elasticsearch."""
    test = literal_eval(os.environ["BATCHPAR_test"])  # noqa: F841 (currently unused; kept for parity with other batchables)
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb",
                              db_name)

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {'filename': 'gtr.json', 'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in
                                      os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_abstract_project',
                                         'textBody_potentialImpact_project',
                                         'textBody_techAbstract_project'])

    # collect file
    logging.info('Retrieving project ids')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    # Use the public StreamingBody.read() API rather than reaching into the
    # private `_raw_stream` attribute, which is an implementation detail.
    project_ids = json.loads(obj.get()['Body'].read())
    logging.info(f"{len(project_ids)} project IDs "
                 "retrieved from s3")

    # Pull each project row plus its linked entities, reformat and index
    logging.info('Processing rows')
    with db_session(engine) as session:
        locations = get_org_locations(session)
        project_links = get_project_links(session, project_ids)
        for count, obj in enumerate((session.query(Projects)
                                     .filter(Projects.id.in_(project_ids))
                                     .all())):
            row = object_to_dict(obj)
            links = default_pop(project_links, row['id'])
            linked_rows = get_linked_rows(session, links)
            row = reformat_row(row, linked_rows, locations)
            es.index(index=es_index, id=row.pop('id'), body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to "
                             "elasticsearch")
222+
223+
224+
if __name__ == "__main__":
    set_log_level()
    if 'BATCHPAR_outinfo' not in os.environ:
        # Local/debug mode: fabricate the batch parameters that AWS Batch
        # would otherwise inject via BATCHPAR_* environment variables.
        from nesta.core.orms.orm_utils import setup_es
        from nesta.core.luigihacks.misctools import find_filepath_from_pathstub
        es, es_config = setup_es(production=False, endpoint='general',
                                 dataset='gtr', drop_and_recreate=True)
        environ = {'config': find_filepath_from_pathstub('mysqldb.config'),
                   'batch_file': '',
                   'db_name': 'dev',
                   'bucket': 'nesta-production-intermediate',
                   'outinfo': es_config['host'],
                   'out_port': es_config['port'],
                   'out_index': es_config['index'],
                   'aws_auth_region': 'eu-west-2',
                   'entity_type': 'project',
                   'test': "True"}
        for k, v in environ.items():
            # os.environ values must be strings; es_config values such as
            # the port may be ints, so coerce explicitly to avoid TypeError.
            os.environ[f'BATCHPAR_{k}'] = str(v)

    logging.info('Starting...')
    run()
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
import pytest
2+
from unittest import mock
3+
from schema import Schema
4+
5+
from nesta.core.batchables.general.gtr.run import extract_funds
6+
from nesta.core.batchables.general.gtr.run import get_linked_rows
7+
from nesta.core.batchables.general.gtr.run import reformat_row
8+
from nesta.core.batchables.general.gtr.run import get_project_links
9+
from nesta.core.batchables.general.gtr.run import get_org_locations
10+
11+
# Dotted-path template for mock.patch-ing names inside the module under test
PATH='nesta.core.batchables.general.gtr.run.{}'
12+
13+
14+
15+
@pytest.fixture
def gtr_funds():
    """400 raw funding rows that contain exactly 3 unique funding records."""
    def fund(end, amount):
        return {'start': '1 Dec 2020', 'end': end,
                'category': 'Ingoings', 'amount': amount,
                'currencyCode': '$$$s'}
    return [fund('2 Dec 2020', 10000),
            fund('2 Dec 2020', 10000),  # duplicate of the first row
            fund('2 Dec 2021', 10000),
            fund('2 Dec 2021', 100)] * 100
26+
27+
@pytest.fixture
def links():
    """Link-table mapping: two outcome tables (7 ids in total) interleaved
    with two ordinary tables."""
    return {'gtr_outcomes_outcomeA': [1, 2, 3, 4],
            'gtr_aTable': [2, 3, 4],
            'gtr_outcomes_outcomeB': [1, 3, 4],
            'gtr_anotherTable': [1, 4]}
33+
34+
@pytest.fixture
def link_table_rows():
    """Raw link-table rows, covering two projects (ids 1 and 21)."""
    raw = [(1, 'table_1', 23),
           (21, 'table_2', 432),
           (1, 'table_1', 32),
           (21, 'table_1', 12)]
    return [{'project_id': pid, 'table_name': table, 'id': row_id}
            for pid, table, row_id in raw]
40+
41+
@pytest.fixture
def flattened_link_table_rows():
    """Expected get_project_links output for the link_table_rows fixture:
    grouped first by project id, then by table name."""
    return {1: {'table_1': [23, 32]},
            21: {'table_1': [12], 'table_2': [432]}}
45+
46+
def test_extract_funds(gtr_funds):
    """The 400 raw rows collapse to the 3 unique records, each with 5 fields."""
    deduplicated = extract_funds(gtr_funds)
    assert len(deduplicated) == 3
    for record in deduplicated:
        assert len(record) == 5
50+
51+
52+
@mock.patch(PATH.format('get_class_by_tablename'))
@mock.patch(PATH.format('object_to_dict'))
def test_get_linked_rows(mocked_obj_to_dict, mocked_get_class, links):
    """Outcome tables are pooled under 'gtr_outcomes'; others fetched in full."""
    session = mock.MagicMock()
    # One batch of (mocked) rows per non-outcome table, in iteration order
    session.query().filter().all.side_effect = [
        [None] * len(ids) for table_name, ids in links.items()
        if not table_name.startswith('gtr_outcomes')
    ]
    results = get_linked_rows(session, links)
    assert set(results.keys()) == {'gtr_aTable', 'gtr_anotherTable',
                                   'gtr_outcomes'}
    assert len(results['gtr_aTable']) == len(links['gtr_aTable'])
    assert len(results['gtr_anotherTable']) == len(links['gtr_anotherTable'])
    assert len(results['gtr_outcomes']) == 7  # total outcome ids
    assert type(results['gtr_outcomes']) == list
65+
66+
67+
@mock.patch(PATH.format('extract_funds'), return_value=['the funds!'])
def test_reformat_row(mocked_extract_funds):
    """Every ES-ready field is derived from the linked rows and locations."""
    linked_rows = {'gtr_funds': None,  # consumed by the mocked extract_funds
                   'gtr_topic': [{'text': 'one topic'}, {'text': 'Unclassified'},
                                 {'text': 'a topic'}, {'text': 'another topic'}],
                   'gtr_outcomes': ['some outcomes', 'some outcomes',
                                    'some other outcomes'],
                   'gtr_organisations': [{'id': 'first org',
                                          'name': 'an org name'}]}
    locations = {'first org': {'country_name': 'Japan',
                               'country_alpha_2': 'jp',
                               'continent': 'Asia',
                               'latitude': 1000, 'longitude': 200},
                 'second org': {'country_name': 'Peru',
                                'country_alpha_2': 'pe',
                                'continent': 'South America',
                                'latitude': -1000, 'longitude': -200}}
    expected = {'something': 'value',
                '_json_funding_project': ['the funds!'],
                '_json_outcomes_project': {'some outcomes': 2,
                                           'some other outcomes': 1},
                '_terms_topics_project': ['one topic', 'a topic',
                                          'another topic'],
                '_terms_institutes_project': ['an org name'],
                '_terms_instituteIds_project': ['first org'],
                '_terms_countries_project': ['Japan'],
                '_terms_iso2_project': ['jp'],
                '_terms_continent_project': ['Asia'],
                '_coordinate_institutes_project': [{'lat': 1000, 'lon': 200}]}
    assert reformat_row({'something': 'value'}, linked_rows,
                        locations) == expected
92+
93+
94+
@mock.patch(PATH.format('LinkTable'))
@mock.patch(PATH.format('object_to_dict'))
def test_get_project_links(mocked_otd, mocked_LinkTable, link_table_rows,
                           flattened_link_table_rows):
    """Link-table rows are grouped by project id, then by table name."""
    mocked_otd.side_effect = link_table_rows
    session = mock.MagicMock()
    session.query().filter().all.return_value = [None] * len(link_table_rows)
    project_links = get_project_links(session, None)
    Schema(project_links).validate(flattened_link_table_rows)
102+
103+
104+
@mock.patch(PATH.format('object_to_dict'))
def test_get_org_locations(mocked_otd, link_table_rows):
    """Each row's 'id' becomes the key; the remaining fields the value.

    NB: mock.patch passes the mock into the test, so the signature must
    accept `mocked_otd` (it previously didn't, making the test un-runnable),
    and the mock needs a side_effect so real dicts flow through.
    """
    # Feed copies, since get_org_locations pops 'id' from each row and we
    # compare against the (unmutated) fixture afterwards.
    mocked_otd.side_effect = [dict(row) for row in link_table_rows]
    session = mock.MagicMock()
    session.query().all.return_value = [None] * len(link_table_rows)
    locations = get_org_locations(session)
    for row in link_table_rows:
        row = dict(row)
        assert locations[row.pop('id')] == row
4 Bytes
Binary file not shown.

nesta/core/config/luigi.cfg

0 Bytes
Binary file not shown.

nesta/core/luigihacks/elasticsearchplus.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,18 @@ def _null_empty_str(row):
293293
_row[k] = None
294294
return _row
295295

296+
297+
# Double underscore: a very private method
def __floatify_coord(coord):
    """Return a copy of `coord` with 'lat'/'lon' cast to float,
    or None if either ordinate is missing."""
    floated = deepcopy(coord)
    if floated['lat'] is None or floated['lon'] is None:
        return None
    floated['lat'] = float(floated['lat'])
    floated['lon'] = float(floated['lon'])
    return floated
306+
307+
296308
def _coordinates_as_floats(row):
297309
"""Ensure coordinate data are always floats.
298310
@@ -307,13 +319,13 @@ def _coordinates_as_floats(row):
307319
continue
308320
if v is None:
309321
continue
310-
if v['lat'] is None or v['lon'] is None:
311-
_row[k] = None
312-
continue
313-
_row[k]['lat'] = float(v['lat'])
314-
_row[k]['lon'] = float(v['lon'])
322+
if type(v) is list:
323+
_row[k] = [__floatify_coord(coord) for coord in v]
324+
else:
325+
_row[k] = __floatify_coord(v)
315326
return _row
316327

328+
317329
def _country_lookup():
318330
"""Extract country/nationality --> iso2 code lookup
319331
from a public json file.

0 commit comments

Comments
 (0)