Skip to content
This repository was archived by the owner on Aug 13, 2021. It is now read-only.

Commit 5229238

Browse files
jaklinger and Joel Klinger
authored
[270] GtR mapping (also table name typo) (#283)
* make sure conf dir is empty * simplified es config * added orm es config reader * modified setup_es to pick up new es config * swapped es_mode for boolean * aliases now consistent with config * aliases now automatically located * added endpoint field to estasks * added endpoint field to sql2estasks * changed branch name * mappings build * updated docs * updated docs * updated docs * added docstrings * pruned deprecated schema transformations * updated fos fieldname on arxlive * unified data set schema transformations * restructured directory * refactored references to schema_transformation * refactored references to schema_transformation * slimmed down transformations, and included entity_type * pruned ontology * tidied schemas * consistency tests * reverted unrelated json file * added dynamic strict to settings * removed index.json in favour of a single defaults file * harmonised name fieldsofstudy across arxiv * using soft alias until a future PR to minimise changes * added novelty back in * sorted json * sorted json * sorted json * changed schema_transformor to use new simpler mapping * removed to/from keys * new null syntax mapping implemented * cleaned and sorted json * adding temporary eurito-dev index to avoid conflating es7 compatibility issues * adding temporary eurito-dev index to avoid conflating es7 compatibility issues * testing es7 on cordis only * testing es7 on cordis only * testing es7 on cordis only * changes to make cordis es7 run * eurito-dev iteration * compatibility issues between arxlive and eurito arxiv * sorted json * pycountry change no longer assumes not null country * needed to split pathstub args * removed redundant es mappings * empty gtr transformation * [267] Pool ES mappings across datasets (#280) * changed branch name * mappings build * updated docs * updated docs * updated docs * added docstrings * added dynamic strict to settings * removed index.json in favour of a single defaults file * using soft alias until a future PR to 
minimise changes * cleaned and sorted json * [267] Tidy & slim schema transformations (#281) * pruned deprecated schema transformations * updated fos fieldname on arxlive * unified data set schema transformations * restructured directory * refactored references to schema_transformation * refactored references to schema_transformation * slimmed down transformations, and included entity_type * pruned ontology * tidied schemas * consistency tests * reverted unrelated json file * harmonised name fieldsofstudy across arxiv * added novelty back in * sorted json * sorted json * sorted json Co-authored-by: Joel Klinger <[email protected]> Co-authored-by: Joel Klinger <[email protected]> * patched out es config setup from tests * removed redundant tests * fixed json formatting * fixed bad table name (NB table was empty anyway) * fixed bad table name (NB table was empty anyway) * gtr ontology * none included for testing * added schema transformation * picked up bug in test * gtr ontology is self consistent * added gtr mapping * added gtr to config * fixed merge conflicts * fixed merge conflicts * changed json field names * institutes are now analyzed and text * sorted and cleaned json * added geopoint * fixed bad json * fixed bad json Co-authored-by: Joel Klinger <[email protected]>
1 parent 10b6437 commit 5229238

File tree

5 files changed

+158
-2
lines changed

5 files changed

+158
-2
lines changed
104 Bytes
Binary file not shown.

nesta/core/orms/gtr_orm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ class Participant(Base):
7272

7373

7474
class OrganisationLocation(Base):
75-
"""This table is not in the orginal data. It contains all organisations and location
75+
"""This table is not in the original data. It contains all organisations and location
7676
details where it has been possible to ascertain them."""
7777
__tablename__ = "gtr_organisations_locations"
7878

@@ -292,7 +292,7 @@ class SoftwareAndTechnicalProducts(Base):
292292

293293

294294
class DocumentClusters(Base):
295-
__tablename__ = 'grt_doc_clusters'
295+
__tablename__ = 'gtr_doc_clusters'
296296

297297
doc_id = Column(VARCHAR(36), ForeignKey('gtr_projects.id'), primary_key=True)
298298
cluster_id = Column(INT, primary_key=True, index=True)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"entity_type": "project",
3+
"tier0_to_tier1": {
4+
"_continents": "terms_continent_project",
5+
"_countries": "terms_countries_project",
6+
"_funds": "json_funding_project",
7+
"_institute_ids": "terms_instituteIds_project",
8+
"_institutes": "terms_institutes_project",
9+
"_iso2s": "terms_iso2_project",
10+
"_locations": "coordinate_institutes_project",
11+
"_outcomes": "json_outcomes_project",
12+
"_topics": "terms_topics_project",
13+
"abstractText": "textBody_abstract_project",
14+
"end": "date_end_project",
15+
"grantCategory": "type_category_funding",
16+
"id": "id_of_project",
17+
"leadFunder": "name_of_funder",
18+
"leadOrganisationDepartment": "name_leadOrgDepartment_project",
19+
"potentialImpact": "textBody_potentialImpact_project",
20+
"start": "date_start_project",
21+
"status": "status_of_project",
22+
"techAbstractText": "textBody_techAbstract_project",
23+
"title": "title_of_project"
24+
}
25+
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
{
2+
"mappings": {
3+
"dynamic": "strict",
4+
"properties": {
5+
"coordinate_institutes_project": {
6+
"type": "geo_point"
7+
},
8+
"date_end_project": {
9+
"type": "date"
10+
},
11+
"date_start_project": {
12+
"type": "date"
13+
},
14+
"json_funding_project": {
15+
"properties": {
16+
"amount": {
17+
"type": "integer"
18+
},
19+
"category": {
20+
"type": "keyword"
21+
},
22+
"currency_code": {
23+
"type": "keyword"
24+
},
25+
"end_date": {
26+
"type": "date"
27+
},
28+
"start_date": {
29+
"type": "date"
30+
}
31+
},
32+
"type": "nested"
33+
},
34+
"json_outcomes_project": {
35+
"dynamic": true,
36+
"type": "nested"
37+
},
38+
"name_leadOrgDepartment_project": {
39+
"fields": {
40+
"keyword": {
41+
"type": "keyword"
42+
}
43+
},
44+
"type": "text"
45+
},
46+
"name_of_funder": {
47+
"fields": {
48+
"keyword": {
49+
"type": "keyword"
50+
}
51+
},
52+
"type": "text"
53+
},
54+
"status_of_project": {
55+
"type": "keyword"
56+
},
57+
"terms_continent_project": {
58+
"type": "keyword"
59+
},
60+
"terms_countries_project": {
61+
"type": "keyword"
62+
},
63+
"terms_instituteIds_project": {
64+
"type": "keyword"
65+
},
66+
"terms_institutes_project": {
67+
"analyzer": "terms_analyzer",
68+
"fields": {
69+
"keyword": {
70+
"type": "keyword"
71+
}
72+
},
73+
"type": "text"
74+
},
75+
"terms_iso2_project": {
76+
"type": "keyword"
77+
},
78+
"terms_topics_project": {
79+
"analyzer": "terms_analyzer",
80+
"fields": {
81+
"keyword": {
82+
"type": "keyword"
83+
}
84+
},
85+
"type": "text"
86+
},
87+
"textBody_abstract_project": {
88+
"fields": {
89+
"keyword": {
90+
"type": "keyword"
91+
}
92+
},
93+
"type": "text"
94+
},
95+
"textBody_potentialImpact_project": {
96+
"fields": {
97+
"keyword": {
98+
"type": "keyword"
99+
}
100+
},
101+
"type": "text"
102+
},
103+
"textBody_techAbstract_project": {
104+
"fields": {
105+
"keyword": {
106+
"type": "keyword"
107+
}
108+
},
109+
"type": "text"
110+
},
111+
"title_of_project": {
112+
"fields": {
113+
"keyword": {
114+
"type": "keyword"
115+
}
116+
},
117+
"type": "text"
118+
},
119+
"type_category_funding": {
120+
"type": "keyword"
121+
}
122+
}
123+
}
124+
}

nesta/core/schemas/tier_1/ontology.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,17 @@
5656
"fieldsOfStudy",
5757
"fiscal",
5858
"framework",
59+
"funding",
5960
"health",
6061
"institutes",
62+
"instituteIds",
6163
"ipc",
6264
"iso2",
6365
"iso2lang",
6466
"iso3",
6567
"isoNumeric",
6668
"last",
69+
"leadOrgDepartment",
6770
"linkedIn",
6871
"location",
6972
"member",
@@ -77,9 +80,11 @@
7780
"nuts2",
7881
"nuts3",
7982
"of",
83+
"outcomes",
8084
"parent",
8185
"personCountry",
8286
"personNuts",
87+
"potentialImpact",
8388
"region",
8489
"regions",
8590
"rhodonite",
@@ -90,6 +95,7 @@
9095
"state",
9196
"subcategory",
9297
"summary",
98+
"techAbstract",
9399
"techFieldNumber",
94100
"tokens",
95101
"topics",
@@ -113,6 +119,7 @@
113119
"country",
114120
"description",
115121
"entity",
122+
"funder",
116123
"funders",
117124
"funding",
118125
"group",

0 commit comments

Comments
 (0)