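'''Manage the entities in the Nomenklatura dataset for DGU (data.gov.uk)
publishers. See the end of this file for the available commands.
'''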
import os
import sys
import argparse

import requests
import nomenklatura

from common import *
from running_stats import StatsList

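# The Nomenklatura dataset that holds the DGU publisher entities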
nk_dataset_name = 'uk25k-entities'
# Model examples
# Entity
# {'_dataset': <Dataset(uk25k-entities)>, '__data__': {u'name': u'Therapies', u'creator': {u'updated_at': u'2012-07-18T13:20:27.593385', u'created_at': u'2012-07-18T13:20:27.593374', u'login': u'pudo', u'github_id': 41628, u'id': 1}, u'created_at': u'2012-07-30T12:19:35.227528', u'updated_at': u'2012-07-30T12:19:35.227540', u'dataset': [u'uk25k-entities'], u'data': [{}], u'id': 17920}}
# Alias (unmatched)
# {'_dataset': <Dataset(uk25k-entities)>, '__data__': {u'created_at': u'2012-09-29T23:02:43.650686', u'name': u'IPS - Identity & Passport Service ', u'creator': {u'updated_at': u'2012-07-18T13:20:27.593385', u'created_at': u'2012-07-18T13:20:27.593374', u'login': u'pudo', u'github_id': 41628, u'id': 1}, u'matcher': None, u'is_invalid': False, u'is_matched': False, u'updated_at': u'2012-09-29T23:02:43.650698', u'entity': None, u'dataset': u'uk25k-entities', u'data': {}, u'id': 25712}}
class NomenData(object):
    '''Data from Nomenklatura that has been processed and cached.'''
    def __init__(self):
        nk_dataset = nk_connect(nk_dataset_name)
        # Put all the entities and their aliases into entities_dict,
        # invalid_aliases and unmatched_aliases
        entities = nk_dataset.entities()
        self.entities_dict = {}  # id: entity_dict
        self.entities_dict_by_name = {}  # name: entity_dict
        count = 0
        for entity in entities:
            count += 1
            self.entities_dict[entity.id] = {
                'entity': entity,
                'has_dgu_name': bool(entity.data.get('dgu_name')),
                'aliases': []}
            self.entities_dict_by_name[entity.name] = \
                self.entities_dict[entity.id]
        self.num_entities = count

        count = 0
        aliases = nk_dataset.aliases()
        self.aliases_by_name = {}  # name: alias
        self.invalid_aliases = []
        self.unmatched_aliases = []
        for al in aliases:
            count += 1
            self.aliases_by_name[al.name] = al
            if al.is_matched and not al.is_invalid:
                self.entities_dict[al.entity['id']]['aliases'].append(al)
                # self.entities_dict_by_name has the same value object, so no
                # need to update it separately.
            elif al.is_matched and al.is_invalid:
                self.invalid_aliases.append(al)
            else:
                self.unmatched_aliases.append(al)
        self.num_aliases = count

def show_all():
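    '''Print every entity with its aliases, followed by the invalid and the
    unmatched aliases.'''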
    nomen_data = NomenData()
    print 'Entities: %i' % nomen_data.num_entities
    print 'Aliases: %i' % nomen_data.num_aliases

    def title(text):
        return '\n\n\n%s\n%s\n%s' % ('*' * 70, text, '*' * 70)

    def printable_nkobj(alias_or_entity):
        # Use UTF8 (matching the average shell) so it can be cut/pasted
        return alias_or_entity.name.encode('utf8', 'replace')

    print title('Entity: Alias, Alias, Alias...')
    for entity_dict in sorted(nomen_data.entities_dict.values(),
                              key=lambda entity_dict: (
                                  entity_dict['has_dgu_name'],
                                  entity_dict['entity'].name)):
        entity = entity_dict['entity']
        aliases = entity_dict['aliases']
        print printable_nkobj(entity), \
            ' '.join(['"%s"' % printable_nkobj(alias) for alias in aliases])
    print title('Invalid Aliases')
    for alias in sorted(nomen_data.invalid_aliases,
                        key=lambda alias: alias.name):
        print printable_nkobj(alias)
    print title('Unmatched Aliases')
    for alias in sorted(nomen_data.unmatched_aliases,
                        key=lambda alias: alias.name):
        print printable_nkobj(alias)

def update_entities_from_dgu(publishers=None):
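    '''For each DGU publisher (organization), ensure there is a matching
    Nomen entity with the DGU name stored in its data dict.'''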
    dgu_client = ckan_client()
    if not publishers:
        # Get the list of publishers from DGU
        publishers = dgu_client.action('organization_list')
    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)
        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'],
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None
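        # Record the DGU organization name on the entity's data dict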
        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' %
                                (entity.name == publisher['title']),
                                publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonicalization? Hasn't
                # happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which already has an ID - ignoring', publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # We can't delete the existing alias (that is now the same
                    # as the entity) but we can match the old entity name, as
                    # an alias, to the entity
                    try:
                        nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        # The failed lookup has queued entity.name as a new
                        # unmatched alias - fetch it and link it to the entity
                        new_alias = nk_dataset.lookup_detailed(entity.name,
                                                               readonly=True)
                        nk_dataset.match(alias_id=new_alias.id,
                                         entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - swapped them over', publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity - created the entity', publisher_name)
        else:
            # No match - create a Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()

def reconcile_aliases_that_match_entities_exactly():
    '''When adding entities using this tool, they might also currently be in
    the recon queue. In cases where the alias name exactly matches the entity
    name, link them up.

    (Ideally we'd just delete the alias from the recon queue, but there is no
    delete_alias API.)
    '''
    stats = StatsList()
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    for alias in nomen_data.unmatched_aliases:
        try:
            entity_or_alias = nk_dataset.lookup_detailed(alias.name,
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            try:
                nk_dataset.match(alias_id=alias.id,
                                 entity_id=entity_or_alias.id)
            except requests.exceptions.HTTPError, e:
                # We seem to get occasional 502s due to overloading
                print stats.add('Server error linking the alias to an entity: %s' % e, alias.name)
                continue
            print stats.add('Matched alias to an entity of the same name', alias.name)
        else:
            print stats.add('No matching entity', alias.name)
    print 'Summary'
    print stats.report()

def bulk_action(action=None, filepath=None, entity_or_alias_names=None, entities=True, aliases=True):
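    '''Apply an action (currently only 'invalidate') to each of the entities
    and/or aliases named on the command line and/or listed in a file, one
    name per line.'''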
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []

    def find_name(name, stats):
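        '''Look the name up as an entity or alias (whichever are enabled) and
        append the object found to entities_or_aliases.'''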
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(
                nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)

    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
        print 'File names:'
        print stats.report()

    # Do the action to each entity or alias
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
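                # Nomen marks an alias invalid by matching it to the special
                # entity id 'INVALID'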
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # We seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplementedError(action)
    print 'Bulk %s:' % action
    print stats.report()
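
# Example invocations:
#   python nomen_entities.py show
#   python nomen_entities.py update-entities-from-dgu [publisher ...]
#   python nomen_entities.py reconcile-aliases-that-match-entities-exactly
#   python nomen_entities.py invalidate [-f FILE] [name ...]
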
if __name__ == '__main__':
    parser1 = argparse.ArgumentParser(
        description='Manager of Entities in Nomenklatura.')
    commands = (
        'show',
        'update-entities-from-dgu',
        'reconcile-aliases-that-match-entities-exactly',
        'invalidate',
    )
    parser1.add_argument('command', choices=commands)
    # Split the command-line into the command and any args after it
    # (there may be general options before the command)
    # TODO: use add_subparsers instead
    args = sys.argv[1:]
    args1, args2 = args, []
    for word_index, word in enumerate(args):
        if word in commands:
            args1, args2 = args[:word_index + 1], args[word_index + 1:]
            break
    parsed_args1 = parser1.parse_args(args1)
    if parsed_args1.command == 'show':
        show_all()
    elif parsed_args1.command == 'update-entities-from-dgu':
        parser2 = argparse.ArgumentParser()
        parser2.add_argument('publishers', nargs='*', default=None)
        parsed_args2 = parser2.parse_args(args2)
        update_entities_from_dgu(parsed_args2.publishers)
    elif parsed_args1.command == 'reconcile-aliases-that-match-entities-exactly':
        reconcile_aliases_that_match_entities_exactly()
    elif parsed_args1.command == 'invalidate':
        parser2 = argparse.ArgumentParser()
        parser2.add_argument('-f', metavar='FILE', dest='filepath')
        parser2.add_argument('entities', nargs='*', default=None)
        parsed_args2 = parser2.parse_args(args2)
        bulk_action(action=parsed_args1.command,
                    filepath=parsed_args2.filepath,
                    entity_or_alias_names=parsed_args2.entities)
    else:
        raise NotImplementedError(parsed_args1.command)