Skip to content

Commit bd551c3

Browse files
JakobMiesnerkpsherva
authored andcommitted
importer: add warning for ambiguous eitems during import
* previously called `ambiguous` eitems are now called `duplicates` * ambiguous eitems are ones where the provider is listed as the `source` of the eitem but not under `created_by` * when ambiguous matches are detected, the import still happens as usual
1 parent 84ea8a8 commit bd551c3

File tree

10 files changed

+357
-64
lines changed

10 files changed

+357
-64
lines changed

cds_ils/importer/eitems/api.py

Lines changed: 0 additions & 18 deletions
This file was deleted.

cds_ils/importer/eitems/importer.py

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,16 @@
1111

1212
import click
1313
from flask import current_app
14-
from invenio_app_ils.eitems.api import EItemIdProvider
14+
from invenio_app_ils.eitems.api import (
15+
EItemIdProvider,
16+
get_eitems_for_document_by_creator,
17+
get_eitems_for_document_by_source,
18+
)
1519
from invenio_app_ils.errors import IlsValidationError
1620
from invenio_app_ils.proxies import current_app_ils
1721
from invenio_db import db
1822
from invenio_pidstore.models import PersistentIdentifier
1923

20-
from cds_ils.importer.eitems.api import get_eitems_for_document_by_provider
21-
2224

2325
class EItemImporter(object):
2426
"""EItem importer class."""
@@ -47,6 +49,7 @@ def __init__(
4749
self.output_pid = None
4850
self.action = None
4951
self.eitem_record = None
52+
self.duplicate_list = []
5053
self.ambiguous_list = []
5154
self.deleted_list = []
5255

@@ -125,12 +128,13 @@ def _delete_existing_record(self, existing_eitem):
125128
eitem_indexer.delete(existing_eitem)
126129
return existing_eitem
127130

128-
def _report_ambiguous_records(self, multiple_results):
129-
eitem_cls = current_app_ils.eitem_record_cls
131+
def _report_duplicate_records(self, multiple_results):
132+
for hit in multiple_results:
133+
self.duplicate_list.append(hit["pid"])
130134

135+
def _report_ambiguous_records(self, multiple_results):
131136
for hit in multiple_results:
132-
existing_eitem = eitem_cls.get_record_by_pid(hit["pid"])
133-
self.ambiguous_list.append(existing_eitem)
137+
self.ambiguous_list.append(hit["pid"])
134138

135139
def _get_other_eitems_of_document(self, matched_document):
136140
eitem_search = current_app_ils.eitem_search_cls()
@@ -237,10 +241,25 @@ def eitems_search(self, matched_document):
237241
if self.eitem_json:
238242
# eitem is not always there, sometime we just create a doc
239243
eitem_type = self.eitem_json.get("_type", "E-BOOK").upper()
240-
search = get_eitems_for_document_by_provider(
244+
exact_eitem_search = get_eitems_for_document_by_creator(
241245
document_pid, self.metadata_provider
242246
).filter("term", eitem_type=eitem_type)
243-
return search
247+
248+
# Declare items that are matched by `source``, but not by `created_by`` as ambiguous
249+
# They might have been created manually and the source field was filled in
250+
exact_hits = exact_eitem_search.execute()
251+
exact_hit_ids = [hit.meta.id for hit in exact_hits]
252+
ambiguous_eitem_search = (
253+
get_eitems_for_document_by_source(
254+
document_pid, self.metadata_provider, case_insensitive=True
255+
)
256+
.exclude("ids", values=exact_hit_ids)
257+
.filter("term", eitem_type=eitem_type)
258+
)
259+
260+
self._report_ambiguous_records(ambiguous_eitem_search.scan())
261+
262+
return exact_eitem_search
244263

245264
def import_eitem_action(self, search):
246265
"""Determine import action."""
@@ -280,8 +299,8 @@ def update_eitems(self, matched_document):
280299
self.output_pid = existing_eitem["pid"]
281300
else:
282301
results = search.scan()
283-
self._report_ambiguous_records(results)
284-
# still creates an item, even ambiguous eitems found
302+
self._report_duplicate_records(results)
303+
# still creates an item, even duplicate eitems found
285304
# checks if there are higher priority eitems
286305
if should_eitem_be_imported:
287306
self.eitem_record = self.create_eitem(matched_document)
@@ -291,13 +310,10 @@ def update_eitems(self, matched_document):
291310
def delete_eitems(self, matched_document):
292311
"""Deletes eitems for a given document."""
293312
eitem_cls = current_app_ils.eitem_record_cls
294-
document_pid = matched_document["pid"]
295313
self.action = "delete"
296314

297315
# get eitems for current provider
298-
search = get_eitems_for_document_by_provider(
299-
document_pid, self.metadata_provider
300-
)
316+
search = self.eitems_search(matched_document)
301317
results = search.scan()
302318

303319
for record in results:
@@ -307,11 +323,8 @@ def delete_eitems(self, matched_document):
307323
def preview_delete(self, matched_document):
308324
"""Preview delete action on eitems for given document."""
309325
eitem_cls = current_app_ils.eitem_record_cls
310-
document_pid = matched_document["pid"]
311326
self.action = "delete"
312-
search = get_eitems_for_document_by_provider(
313-
document_pid, self.metadata_provider
314-
)
327+
search = self.eitems_search(matched_document)
315328
results = search.scan()
316329
for record in results:
317330
existing_eitem = eitem_cls.get_record_by_pid(record["pid"])
@@ -350,7 +363,8 @@ def summary(self):
350363
"eitem": self.eitem_record,
351364
"json": self.eitem_json,
352365
"output_pid": self.output_pid,
353-
"duplicates": self.ambiguous_list,
366+
"ambiguous": self.ambiguous_list,
367+
"duplicates": self.duplicate_list,
354368
"action": self.action,
355369
"deleted_eitems": self.deleted_list,
356370
}
@@ -374,8 +388,8 @@ def preview_import(self, matched_document):
374388
return self.summary()
375389
else:
376390
results = search.scan()
377-
self._report_ambiguous_records(results)
378-
# still creates an item, even ambiguous eitems found
391+
self._report_duplicate_records(results)
392+
# still creates an item, even duplicate eitems found
379393
# checks if there are higher priority eitems
380394
if should_eitem_be_imported:
381395
self.action = "create"

cds_ils/importer/serializers/schema.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class ImportedEItemSchema(Schema):
2222
action = fields.String(dump_only=True)
2323
priority_provider = fields.Bool()
2424
duplicates = fields.List(fields.String)
25+
ambiguous = fields.List(fields.String)
2526
deleted_eitems = fields.List(fields.Dict)
2627
eitem = fields.Nested(EItemSchemaV1)
2728
json = fields.Raw()
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[
2+
{
3+
"$schema": "https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json",
4+
"created_by": { "type": "script", "value": "test" },
5+
"pid": "docid-13",
6+
"title": "Metallic materials — Determination of forming-limit curves for sheet and strip (ed. 2nd - 2021)",
7+
"authors": [{ "full_name": "International Organization for Standardization. Geneva" }],
8+
"abstract": "This is an abstract",
9+
"identifiers": [{ "scheme": "STANDARD_NUMBER", "value": "ISO-12004-2" }],
10+
"keywords": [{ "source": "X", "value": "Patata" }],
11+
"document_type": "BOOK",
12+
"publication_year": "1950",
13+
"agency_code": "DE-He213",
14+
"_eitem": {
15+
"pid": "eitemid-12",
16+
"created_by": {"type": "import", "value": "springer"},
17+
"document_pid": "docid-13",
18+
"eitem_type": "E-BOOK",
19+
"internal_notes": "Ambiguous eitem with source field",
20+
"description": "Description of the electronic item",
21+
"open_access": false,
22+
"source": "springer",
23+
"urls": [
24+
{
25+
"description": "Protected URL",
26+
"value": "http://protected-cds-ils.ch/",
27+
"login_required": true
28+
},
29+
{
30+
"description": "Another open URL",
31+
"value": "http://cds-ils.ch/",
32+
"login_required": false
33+
}
34+
]
35+
}
36+
}
37+
]
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
[
2+
{
3+
"$schema": "https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json",
4+
"agency_code": "DE-He213",
5+
"created_by": {"type": "import", "value": "springer"},
6+
"pid": "docid-14",
7+
"title": "Test Document with Multiple Springer EItems",
8+
"document_type": "BOOK",
9+
"authors": [{ "full_name": "Test Author" }],
10+
"abstract": "This is a test document that should have multiple eitems from the same provider",
11+
"edition": "1st",
12+
"publication_year": "2024",
13+
"identifiers": [{ "scheme": "ISBN", "value": "1234567890123" }],
14+
"_eitem": {
15+
"pid": "eitemid-duplicate-test",
16+
"internal_notes": "This should create a duplicate when imported",
17+
"description": "Test eitem that should trigger duplicate detection",
18+
"open_access": false,
19+
"urls": [
20+
{
21+
"description": "Protected URL",
22+
"value": "http://protected-cds-ils.ch/",
23+
"login_required": true
24+
},
25+
{
26+
"description": "Another open URL",
27+
"value": "http://cds-ils.ch/",
28+
"login_required": false
29+
}
30+
]
31+
}
32+
}
33+
]

tests/importer/data/existing_documents.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,5 +222,18 @@
222222
"document_type": "BOOK",
223223
"publication_year": "1950",
224224
"note": "MATCH BY STANDARD NUMBER"
225+
},
226+
{
227+
"$schema": "https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json",
228+
"created_by": { "type": "import", "value": "springer" },
229+
"pid": "docid-14",
230+
"title": "Test Document with Multiple Springer EItems",
231+
"document_type": "BOOK",
232+
"authors": [{ "full_name": "Test Author" }],
233+
"abstract": "This is a test document that should have multiple eitems from the same provider",
234+
"edition": "1st",
235+
"publication_year": "2024",
236+
"identifiers": [{ "scheme": "ISBN", "value": "1234567890123" }],
237+
"note": "DUPLICATE EITEMS TEST"
225238
}
226239
]

tests/importer/data/existing_eitems.json

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,5 +217,83 @@
217217
"login_required": false
218218
}
219219
]
220+
},
221+
{
222+
"pid": "eitemid-12",
223+
"created_by": {"type": "import", "value": "springer"},
224+
"source": "springer",
225+
"document_pid": "docid-13",
226+
"eitem_type": "E-BOOK",
227+
"internal_notes": "Unambiguous eitem with source field",
228+
"description": "Description of the electronic item",
229+
"open_access": false,
230+
"urls": [
231+
{
232+
"description": "Protected URL",
233+
"value": "http://protected-cds-ils.ch/",
234+
"login_required": true
235+
},
236+
{
237+
"description": "Another open URL",
238+
"value": "http://cds-ils.ch/",
239+
"login_required": false
240+
}
241+
]
242+
},
243+
{
244+
"pid": "eitemid-13",
245+
"created_by": {"type": "user_id", "value": "1"},
246+
"source": "springer",
247+
"document_pid": "docid-13",
248+
"eitem_type": "E-BOOK",
249+
"internal_notes": "Ambiguous eitem with source field",
250+
"description": "Description of the electronic item",
251+
"open_access": false,
252+
"urls": [
253+
{
254+
"description": "Protected URL",
255+
"value": "http://protected-cds-ils.ch/",
256+
"login_required": true
257+
},
258+
{
259+
"description": "Another open URL",
260+
"value": "http://cds-ils.ch/",
261+
"login_required": false
262+
}
263+
]
264+
},
265+
{
266+
"pid": "eitemid-14-dup1",
267+
"created_by": {"type": "import", "value": "springer"},
268+
"source": "springer",
269+
"document_pid": "docid-14",
270+
"eitem_type": "E-BOOK",
271+
"internal_notes": "First duplicate eitem from springer",
272+
"description": "Description of the first duplicate electronic item",
273+
"open_access": false,
274+
"urls": [
275+
{
276+
"description": "Protected URL 1",
277+
"value": "http://protected-cds-ils-1.ch/",
278+
"login_required": true
279+
}
280+
]
281+
},
282+
{
283+
"pid": "eitemid-14-dup2",
284+
"created_by": {"type": "import", "value": "springer"},
285+
"source": "springer",
286+
"document_pid": "docid-14",
287+
"eitem_type": "E-BOOK",
288+
"internal_notes": "Second duplicate eitem from springer",
289+
"description": "Description of the second duplicate electronic item",
290+
"open_access": false,
291+
"urls": [
292+
{
293+
"description": "Protected URL 2",
294+
"value": "http://protected-cds-ils-2.ch/",
295+
"login_required": true
296+
}
297+
]
220298
}
221299
]

0 commit comments

Comments
 (0)