Skip to content
This repository has been archived by the owner on Aug 13, 2021. It is now read-only.

Commit

Permalink
[270] GtR mapping (also table name typo) (#283)
Browse files Browse the repository at this point in the history
* make sure conf dir is empty

* simplified es config

* added orm es config reader

* modified setup_es to pick up new es config

* swapped es_mode for boolean

* aliases now consistent with config

* aliases now automatically located

* added endpoint field to estasks

* added endpoint field to sql2estasks

* changed branch name

* mappings build

* updated docs

* updated docs

* updated docs

* added docstrings

* pruned deprecated schema transformations

* updated fos fieldname on arxlive

* unified data set schema transformations

* restructured directory

* refactored references to schema_transformation

* refactored references to schema_transformation

* slimmed down transformations, and included entity_type

* pruned ontology

* tidied schemas

* consistency tests

* reverted unrelated json file

* added dynamic strict to settings

* removed index.json in favour of a single defaults file

* harmonised name fieldsofstudy across arxiv

* using soft alias until a future PR to minimise changes

* added novelty back in

* sorted json

* sorted json

* sorted json

* changed schema_transformer to use new simpler mapping

* removed to/from keys

* new null syntax mapping implemented

* cleaned and sorted json

* adding temporary eurito-dev index to avoid conflating es7 compatibility issues

* adding temporary eurito-dev index to avoid conflating es7 compatibility issues

* testing es7 on cordis only

* testing es7 on cordis only

* testing es7 on cordis only

* changes to make cordis es7 run

* eurito-dev iteration

* compatibility issues between arxlive and eurito arxiv

* sorted json

* pycountry change no longer assumes a non-null country

* needed to split pathstub args

* removed redundant es mappings

* empty gtr transformation

* [267] Pool ES mappings across datasets (#280)

* changed branch name

* mappings build

* updated docs

* updated docs

* updated docs

* added docstrings

* added dynamic strict to settings

* removed index.json in favour of a single defaults file

* using soft alias until a future PR to minimise changes

* cleaned and sorted json

* [267] Tidy & slim schema transformations (#281)

* pruned deprecated schema transformations

* updated fos fieldname on arxlive

* unified data set schema transformations

* restructured directory

* refactored references to schema_transformation

* refactored references to schema_transformation

* slimmed down transformations, and included entity_type

* pruned ontology

* tidied schemas

* consistency tests

* reverted unrelated json file

* harmonised name fieldsofstudy across arxiv

* added novelty back in

* sorted json

* sorted json

* sorted json

Co-authored-by: Joel Klinger <[email protected]>

Co-authored-by: Joel Klinger <[email protected]>

* patched out es config setup from tests

* removed redundant tests

* fixed json formatting

* fixed bad table name (NB table was empty anyway)

* fixed bad table name (NB table was empty anyway)

* gtr ontology

* none included for testing

* added schema transformation

* picked up bug in test

* gtr ontology is self consistent

* added gtr mapping

* added gtr to config

* fixed merge conflicts

* fixed merge conflicts

* changed json field names

* institutes are now analyzed and text

* sorted and cleaned json

* added geopoint

* fixed bad json

* fixed bad json

Co-authored-by: Joel Klinger <[email protected]>
  • Loading branch information
jaklinger and Joel Klinger authored Jun 26, 2020
1 parent 18aedd7 commit 3ea6562
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 2 deletions.
Binary file modified nesta/core/config/elasticsearch.yaml
Binary file not shown.
4 changes: 2 additions & 2 deletions nesta/core/orms/gtr_orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class Participant(Base):


class OrganisationLocation(Base):
"""This table is not in the orginal data. It contains all organisations and location
"""This table is not in the original data. It contains all organisations and location
details where it has been possible to ascertain them."""
__tablename__ = "gtr_organisations_locations"

Expand Down Expand Up @@ -292,7 +292,7 @@ class SoftwareAndTechnicalProducts(Base):


class DocumentClusters(Base):
__tablename__ = 'grt_doc_clusters'
__tablename__ = 'gtr_doc_clusters'

doc_id = Column(VARCHAR(36), ForeignKey('gtr_projects.id'), primary_key=True)
cluster_id = Column(INT, primary_key=True, index=True)
Expand Down
25 changes: 25 additions & 0 deletions nesta/core/schemas/tier_1/datasets/gtr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"entity_type": "project",
"tier0_to_tier1": {
"_continents": "terms_continent_project",
"_countries": "terms_countries_project",
"_funds": "json_funding_project",
"_institute_ids": "terms_instituteIds_project",
"_institutes": "terms_institutes_project",
"_iso2s": "terms_iso2_project",
"_locations": "coordinate_institutes_project",
"_outcomes": "json_outcomes_project",
"_topics": "terms_topics_project",
"abstractText": "textBody_abstract_project",
"end": "date_end_project",
"grantCategory": "type_category_funding",
"id": "id_of_project",
"leadFunder": "name_of_funder",
"leadOrganisationDepartment": "name_leadOrgDepartment_project",
"potentialImpact": "textBody_potentialImpact_project",
"start": "date_start_project",
"status": "status_of_project",
"techAbstractText": "textBody_techAbstract_project",
"title": "title_of_project"
}
}
124 changes: 124 additions & 0 deletions nesta/core/schemas/tier_1/mappings/datasets/gtr_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
{
"mappings": {
"dynamic": "strict",
"properties": {
"coordinate_institutes_project": {
"type": "geo_point"
},
"date_end_project": {
"type": "date"
},
"date_start_project": {
"type": "date"
},
"json_funding_project": {
"properties": {
"amount": {
"type": "integer"
},
"category": {
"type": "keyword"
},
"currency_code": {
"type": "keyword"
},
"end_date": {
"type": "date"
},
"start_date": {
"type": "date"
}
},
"type": "nested"
},
"json_outcomes_project": {
"dynamic": true,
"type": "nested"
},
"name_leadOrgDepartment_project": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"name_of_funder": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"status_of_project": {
"type": "keyword"
},
"terms_continent_project": {
"type": "keyword"
},
"terms_countries_project": {
"type": "keyword"
},
"terms_instituteIds_project": {
"type": "keyword"
},
"terms_institutes_project": {
"analyzer": "terms_analyzer",
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"terms_iso2_project": {
"type": "keyword"
},
"terms_topics_project": {
"analyzer": "terms_analyzer",
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"textBody_abstract_project": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"textBody_potentialImpact_project": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"textBody_techAbstract_project": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"title_of_project": {
"fields": {
"keyword": {
"type": "keyword"
}
},
"type": "text"
},
"type_category_funding": {
"type": "keyword"
}
}
}
}
7 changes: 7 additions & 0 deletions nesta/core/schemas/tier_1/ontology.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,17 @@
"fieldsOfStudy",
"fiscal",
"framework",
"funding",
"health",
"institutes",
"instituteIds",
"ipc",
"iso2",
"iso2lang",
"iso3",
"isoNumeric",
"last",
"leadOrgDepartment",
"linkedIn",
"location",
"member",
Expand All @@ -77,9 +80,11 @@
"nuts2",
"nuts3",
"of",
"outcomes",
"parent",
"personCountry",
"personNuts",
"potentialImpact",
"region",
"regions",
"rhodonite",
Expand All @@ -90,6 +95,7 @@
"state",
"subcategory",
"summary",
"techAbstract",
"techFieldNumber",
"tokens",
"topics",
Expand All @@ -113,6 +119,7 @@
"country",
"description",
"entity",
"funder",
"funders",
"funding",
"group",
Expand Down

0 comments on commit 3ea6562

Please sign in to comment.