diff --git a/.gitignore b/.gitignore index 06abb7185..a75d1b15a 100644 --- a/.gitignore +++ b/.gitignore @@ -61,7 +61,8 @@ target/ # Vim swapfiles .*.sw? -# vscode history +# vscode files and history +.vscode/ **/.history .python-version diff --git a/cds_ils/importer/eitems/api.py b/cds_ils/importer/eitems/api.py deleted file mode 100644 index eb004ddfc..000000000 --- a/cds_ils/importer/eitems/api.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (C) 2020 CERN. -# -# CDS-ILS is free software; you can redistribute it and/or modify it under -# the terms of the MIT License; see LICENSE file for more details. - -"""CDS-ILS EItems Importer api.""" -from invenio_app_ils.proxies import current_app_ils - - -def get_eitems_for_document_by_provider(document_pid, provider): - """Find eitem by document pid and provider.""" - eitem_search = current_app_ils.eitem_search_cls() - search = eitem_search.search_by_document_pid(document_pid=document_pid).filter( - "term", created_by__value=provider - ) - return search diff --git a/cds_ils/importer/eitems/importer.py b/cds_ils/importer/eitems/importer.py index 9c1934cc0..03f6935e5 100644 --- a/cds_ils/importer/eitems/importer.py +++ b/cds_ils/importer/eitems/importer.py @@ -11,14 +11,16 @@ import click from flask import current_app -from invenio_app_ils.eitems.api import EItemIdProvider +from invenio_app_ils.eitems.api import ( + EItemIdProvider, + get_eitems_for_document_by_creator, + get_eitems_for_document_by_source, +) from invenio_app_ils.errors import IlsValidationError from invenio_app_ils.proxies import current_app_ils from invenio_db import db from invenio_pidstore.models import PersistentIdentifier -from cds_ils.importer.eitems.api import get_eitems_for_document_by_provider - class EItemImporter(object): """EItem importer class.""" @@ -47,6 +49,7 @@ def __init__( self.output_pid = None self.action = None self.eitem_record = None + self.duplicate_list = [] self.ambiguous_list = [] self.deleted_list = [] @@ -125,12 +128,13 @@ def _delete_existing_record(self, existing_eitem): eitem_indexer.delete(existing_eitem) return existing_eitem - def _report_ambiguous_records(self, multiple_results): - eitem_cls = current_app_ils.eitem_record_cls + def _report_duplicate_records(self, multiple_results): + for hit in multiple_results: + self.duplicate_list.append(hit["pid"]) + def _report_ambiguous_records(self, multiple_results): for hit in multiple_results: - existing_eitem = eitem_cls.get_record_by_pid(hit["pid"]) - self.ambiguous_list.append(existing_eitem) + self.ambiguous_list.append(hit["pid"]) def _get_other_eitems_of_document(self, matched_document): eitem_search = current_app_ils.eitem_search_cls() @@ -237,10 +241,25 @@ def eitems_search(self, matched_document): if self.eitem_json: # eitem is not always there, sometime we just create a doc eitem_type = self.eitem_json.get("_type", "E-BOOK").upper() - search = get_eitems_for_document_by_provider( + exact_eitem_search = get_eitems_for_document_by_creator( document_pid, self.metadata_provider ).filter("term", eitem_type=eitem_type) - return search + + # Declare items that are matched by `source``, but not by `created_by`` as ambiguous + # They might have been created manually and the source field was filled in + exact_hits = exact_eitem_search.execute() + exact_hit_ids = [hit.meta.id for hit in exact_hits] + ambiguous_eitem_search = ( + get_eitems_for_document_by_source( + document_pid, self.metadata_provider, case_insensitive=True + ) + .exclude("ids", values=exact_hit_ids) + .filter("term", eitem_type=eitem_type) + ) + + self._report_ambiguous_records(ambiguous_eitem_search.scan()) + + return exact_eitem_search def import_eitem_action(self, search): """Determine import action.""" @@ -280,8 +299,8 @@ def update_eitems(self, matched_document): self.output_pid = existing_eitem["pid"] else: results = search.scan() - self._report_ambiguous_records(results) - # still creates an item, even ambiguous eitems found + self._report_duplicate_records(results) + # still creates an item, even duplicate eitems found # checks if there are higher priority eitems if should_eitem_be_imported: self.eitem_record = self.create_eitem(matched_document) @@ -291,13 +310,10 @@ def update_eitems(self, matched_document): def delete_eitems(self, matched_document): """Deletes eitems for a given document.""" eitem_cls = current_app_ils.eitem_record_cls - document_pid = matched_document["pid"] self.action = "delete" # get eitems for current provider - search = get_eitems_for_document_by_provider( - document_pid, self.metadata_provider - ) + search = self.eitems_search(matched_document) results = search.scan() for record in results: @@ -307,11 +323,8 @@ def delete_eitems(self, matched_document): def preview_delete(self, matched_document): """Preview delete action on eitems for given document.""" eitem_cls = current_app_ils.eitem_record_cls - document_pid = matched_document["pid"] self.action = "delete" - search = get_eitems_for_document_by_provider( - document_pid, self.metadata_provider - ) + search = self.eitems_search(matched_document) results = search.scan() for record in results: existing_eitem = eitem_cls.get_record_by_pid(record["pid"]) @@ -350,7 +363,8 @@ def summary(self): "eitem": self.eitem_record, "json": self.eitem_json, "output_pid": self.output_pid, - "duplicates": self.ambiguous_list, + "ambiguous": self.ambiguous_list, + "duplicates": self.duplicate_list, "action": self.action, "deleted_eitems": self.deleted_list, } @@ -374,8 +388,8 @@ def preview_import(self, matched_document): return self.summary() else: results = search.scan() - self._report_ambiguous_records(results) - # still creates an item, even ambiguous eitems found + self._report_duplicate_records(results) + # still creates an item, even duplicate eitems found # checks if there are higher priority eitems if should_eitem_be_imported: self.action = "create" diff --git a/cds_ils/importer/serializers/schema.py b/cds_ils/importer/serializers/schema.py index ca1040f39..c71682f7d 100644 --- a/cds_ils/importer/serializers/schema.py +++ b/cds_ils/importer/serializers/schema.py @@ -22,6 +22,7 @@ class ImportedEItemSchema(Schema): action = fields.String(dump_only=True) priority_provider = fields.Bool() duplicates = fields.List(fields.String) + ambiguous = fields.List(fields.String) deleted_eitems = fields.List(fields.Dict) eitem = fields.Nested(EItemSchemaV1) json = fields.Raw() diff --git a/tests/importer/data/ambiguous_document_data.json b/tests/importer/data/ambiguous_document_data.json new file mode 100644 index 000000000..d9d1b74ff --- /dev/null +++ b/tests/importer/data/ambiguous_document_data.json @@ -0,0 +1,37 @@ +[ + { + "$schema": "https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json", + "created_by": { "type": "script", "value": "test" }, + "pid": "docid-13", + "title": "Metallic materials — Determination of forming-limit curves for sheet and strip (ed. 2nd - 2021)", + "authors": [{ "full_name": "International Organization for Standardization. Geneva" }], + "abstract": "This is an abstract", + "identifiers": [{ "scheme": "STANDARD_NUMBER", "value": "ISO-12004-2" }], + "keywords": [{ "source": "X", "value": "Patata" }], + "document_type": "BOOK", + "publication_year": "1950", + "agency_code": "DE-He213", + "_eitem": { + "pid": "eitemid-12", + "created_by": {"type": "import", "value": "springer"}, + "document_pid": "docid-13", + "eitem_type": "E-BOOK", + "internal_notes": "Ambiguous eitem with source field", + "description": "Description of the electronic item", + "open_access": false, + "source": "springer", + "urls": [ + { + "description": "Protected URL", + "value": "http://protected-cds-ils.ch/", + "login_required": true + }, + { + "description": "Another open URL", + "value": "http://cds-ils.ch/", + "login_required": false + } + ] + } + } +] diff --git a/tests/importer/data/create_duplicate_eitems_document.json b/tests/importer/data/create_duplicate_eitems_document.json new file mode 100644 index 000000000..cf0899636 --- /dev/null +++ b/tests/importer/data/create_duplicate_eitems_document.json @@ -0,0 +1,33 @@ +[ + { + "$schema": "https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json", + "agency_code": "DE-He213", + "created_by": {"type": "import", "value": "springer"}, + "pid": "docid-14", + "title": "Test Document with Multiple Springer EItems", + "document_type": "BOOK", + "authors": [{ "full_name": "Test Author" }], + "abstract": "This is a test document that should have multiple eitems from the same provider", + "edition": "1st", + "publication_year": "2024", + "identifiers": [{ "scheme": "ISBN", "value": "1234567890123" }], + "_eitem": { + "pid": "eitemid-duplicate-test", + "internal_notes": "This should create a duplicate when imported", + "description": "Test eitem that should trigger duplicate detection", + "open_access": false, + "urls": [ + { + "description": "Protected URL", + "value": "http://protected-cds-ils.ch/", + "login_required": true + }, + { + "description": "Another open URL", + "value": "http://cds-ils.ch/", + "login_required": false + } + ] + } + } +] \ No newline at end of file diff --git a/tests/importer/data/existing_documents.json b/tests/importer/data/existing_documents.json index 2f29f5c50..20ff00fd7 100644 --- a/tests/importer/data/existing_documents.json +++ b/tests/importer/data/existing_documents.json @@ -222,5 +222,18 @@ "document_type": "BOOK", "publication_year": "1950", "note": "MATCH BY STANDARD NUMBER" + }, + { + "$schema": "https://127.0.0.1:5000/schemas/documents/document-v1.0.0.json", + "created_by": { "type": "import", "value": "springer" }, + "pid": "docid-14", + "title": "Test Document with Multiple Springer EItems", + "document_type": "BOOK", + "authors": [{ "full_name": "Test Author" }], + "abstract": "This is a test document that should have multiple eitems from the same provider", + "edition": "1st", + "publication_year": "2024", + "identifiers": [{ "scheme": "ISBN", "value": "1234567890123" }], + "note": "DUPLICATE EITEMS TEST" } ] diff --git a/tests/importer/data/existing_eitems.json b/tests/importer/data/existing_eitems.json index 16c2938e7..37544502c 100644 --- a/tests/importer/data/existing_eitems.json +++ b/tests/importer/data/existing_eitems.json @@ -217,5 +217,83 @@ "login_required": false } ] + }, + { + "pid": "eitemid-12", + "created_by": {"type": "import", "value": "springer"}, + "source": "springer", + "document_pid": "docid-13", + "eitem_type": "E-BOOK", + "internal_notes": "Unambiguous eitem with source field", + "description": "Description of the electronic item", + "open_access": false, + "urls": [ + { + "description": "Protected URL", + "value": "http://protected-cds-ils.ch/", + "login_required": true + }, + { + "description": "Another open URL", + "value": "http://cds-ils.ch/", + "login_required": false + } + ] + }, + { + "pid": "eitemid-13", + "created_by": {"type": "user_id", "value": "1"}, + "source": "springer", + "document_pid": "docid-13", + "eitem_type": "E-BOOK", + "internal_notes": "Ambiguous eitem with source field", + "description": "Description of the electronic item", + "open_access": false, + "urls": [ + { + "description": "Protected URL", + "value": "http://protected-cds-ils.ch/", + "login_required": true + }, + { + "description": "Another open URL", + "value": "http://cds-ils.ch/", + "login_required": false + } + ] + }, + { + "pid": "eitemid-14-dup1", + "created_by": {"type": "import", "value": "springer"}, + "source": "springer", + "document_pid": "docid-14", + "eitem_type": "E-BOOK", + "internal_notes": "First duplicate eitem from springer", + "description": "Description of the first duplicate electronic item", + "open_access": false, + "urls": [ + { + "description": "Protected URL 1", + "value": "http://protected-cds-ils-1.ch/", + "login_required": true + } + ] + }, + { + "pid": "eitemid-14-dup2", + "created_by": {"type": "import", "value": "springer"}, + "source": "springer", + "document_pid": "docid-14", + "eitem_type": "E-BOOK", + "internal_notes": "Second duplicate eitem from springer", + "description": "Description of the second duplicate electronic item", + "open_access": false, + "urls": [ + { + "description": "Protected URL 2", + "value": "http://protected-cds-ils-2.ch/", + "login_required": true + } + ] } ] diff --git a/tests/importer/test_importer.py b/tests/importer/test_importer.py index c5db672e5..b7aaf6d9d 100644 --- a/tests/importer/test_importer.py +++ b/tests/importer/test_importer.py @@ -365,3 +365,118 @@ def test_import_video(app, db): assert eitem["created_by"] == {"type": "import", "value": "safari"} assert created_document["created_by"] == {"type": "import", "value": "safari"} + + +def test_report_ambiguous_eitems_by_source_field_during_import(importer_test_data): + """Test that eitems matched on source field are reported as ambiguous during import.""" + + document_cls = current_app_ils.document_record_cls + eitem_cls = current_app_ils.eitem_record_cls + eitem_search_cls = current_app_ils.eitem_search_cls + + document_with_source_eitem = document_cls.get_record_by_pid("docid-13") + + search = eitem_search_cls().search_by_document_pid( + document_pid=document_with_source_eitem["pid"] + ) + results = search.execute() + + json_data = load_json_from_datadir( + "ambiguous_document_data.json", relpath="importer" + )[0] + + importer = Importer(json_data, "springer") + report = importer.import_record() + + assert report["action"] == "update" + + eitem_report = report.get("eitem") + assert eitem_report["action"] == "update" + updated_eitem = eitem_report.get("json") + assert updated_eitem["pid"] == "eitemid-12" + + duplicates = eitem_report.get("duplicates") + assert len(duplicates) == 0 + + ambiguous_list = eitem_report.get("ambiguous") + assert len(ambiguous_list) == 1 + ambiguous_eitem = ambiguous_list[0] + assert ambiguous_eitem == "eitemid-13" + + +def test_report_ambiguous_eitems_by_source_field_during_delete(importer_test_data): + """Test that eitems matched on source field are reported as ambiguous during import.""" + + document_cls = current_app_ils.document_record_cls + eitem_cls = current_app_ils.eitem_record_cls + eitem_search_cls = current_app_ils.eitem_search_cls + + document_with_source_eitem = document_cls.get_record_by_pid("docid-13") + + search = eitem_search_cls().search_by_document_pid( + document_pid=document_with_source_eitem["pid"] + ) + results = search.execute() + + json_data = load_json_from_datadir( + "ambiguous_document_data.json", relpath="importer" + )[0] + + importer = Importer(json_data, "springer") + report = importer.preview_delete() + + assert report["action"] == "delete" + + eitem_report = report.get("eitem") + assert eitem_report["action"] == "delete" + updated_eitem = eitem_report.get("json") + assert updated_eitem["pid"] == "eitemid-12" + + duplicates = eitem_report.get("duplicates") + assert len(duplicates) == 0 + + ambiguous_list = eitem_report.get("ambiguous") + assert len(ambiguous_list) == 1 + ambiguous_eitem = ambiguous_list[0] + assert ambiguous_eitem == "eitemid-13" + + +def test_duplicate_eitems_detection_during_import(importer_test_data): + """Test that duplicate eitems are properly detected in import preview.""" + document_cls = current_app_ils.document_record_cls + eitem_cls = current_app_ils.eitem_record_cls + eitem_search_cls = current_app_ils.eitem_search_cls + + json_data = load_json_from_datadir( + "create_duplicate_eitems_document.json", relpath="importer" + ) + + document_with_duplicates = document_cls.get_record_by_pid("docid-14") + search = eitem_search_cls().search_by_document_pid( + document_pid=document_with_duplicates["pid"] + ) + results = search.execute() + + existing_eitems = [eitem_cls.get_record_by_pid(hit.pid) for hit in results.hits] + springer_eitems = [ + hit + for hit in existing_eitems + if hit.get("created_by").get("value") == "springer" + ] + + assert ( + len(springer_eitems) == 2 + ), f"Expected 2 existing springer eitems, but found {len(springer_eitems)}" + + importer = Importer(json_data[0], "springer") + report = importer.import_record() + + eitem_report = report.get("eitem") + duplicates = eitem_report.get("duplicates") + assert ( + len(duplicates) == 2 + ), f"Expected 2 duplicates to be detected, but found {len(duplicates)}" + + assert ( + eitem_report.get("action") == "create" + ), f"Unexpected action: {eitem_report.get('action')}" diff --git a/ui/src/importer/EitemImportDetailsModal/EitemImportDetailsModal.js b/ui/src/importer/EitemImportDetailsModal/EitemImportDetailsModal.js index df9078214..232c36658 100644 --- a/ui/src/importer/EitemImportDetailsModal/EitemImportDetailsModal.js +++ b/ui/src/importer/EitemImportDetailsModal/EitemImportDetailsModal.js @@ -13,9 +13,21 @@ export default class EitemImportDetails extends Component { EItem - attention required + {!_isEmpty(eitemReport.ambiguous) && ( + <> +
Review Required: Manual e-items found
+ + {eitemReport.ambiguous.map((pid) => ( + + {pid} + + ))} + + + )} {!_isEmpty(eitemReport.duplicates) && ( <> -
Duplicated e-items found
+
Review Required: Duplicated e-items found
{eitemReport.duplicates.map((pid) => ( diff --git a/ui/src/importer/importerTaskDetails/ImporterTaskDetailsComponents/ImportedDocumentReport.js b/ui/src/importer/importerTaskDetails/ImporterTaskDetailsComponents/ImportedDocumentReport.js index e9fb5e94b..3d5d9d922 100644 --- a/ui/src/importer/importerTaskDetails/ImporterTaskDetailsComponents/ImportedDocumentReport.js +++ b/ui/src/importer/importerTaskDetails/ImporterTaskDetailsComponents/ImportedDocumentReport.js @@ -124,29 +124,37 @@ class ImportedDocumentReportComponent extends Component { )} - {(_get(documentReport, "eitem.deleted_eitems", []).length > 1 || - !_isEmpty(_get(documentReport, "eitem.duplicates", []))) && ( -