From 784e57ed5b435389e752cc63805ebfe6f254f4ce Mon Sep 17 00:00:00 2001 From: kyle Date: Tue, 17 Sep 2024 12:38:24 -0400 Subject: [PATCH 1/7] SFR-2171: Fixing OCLC bib author mapping --- CHANGELOG.md | 1 + mappings/oclc_bib.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76299216ba..9976823c1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,7 @@ - Changed HATHI_DATAFILES outdated link in development, example, and local yaml files - Fixed edition API ID param - Fixed usage type bug +- Fixed OCLC bib author mapping ## 2024-08-06 -- v0.13.1 ## Added diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index 76bc564378..b9912f93c0 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -47,7 +47,7 @@ def _get_creators(self, oclc_bib): return list( filter( - lambda creator: creator.get('secondName') and creator.get('firstName'), + lambda creator: creator.get('secondName') or creator.get('firstName'), oclc_bib['contributor'].get('creators', []) ) ) @@ -91,8 +91,26 @@ def _map_subjects(self, oclc_bib) -> list[str]: def _map_authors(self, authors) -> Optional[list[str]]: if not authors: return None + + return [f'{author_name}|||true' for author in authors if (author_name := self._get_author_name(author))] + + def _get_author_name(self, author) -> str: + first_name = self._get_name(author.get('firstName')) + second_name = self._get_name(author.get('secondName')) + + if not first_name and not second_name: + return None + + if first_name and second_name: + return f'{second_name}, {first_name}' + + return f'{first_name or second_name}' + + def _get_name(self, name_data) -> str: + if not name_data: + return None - return [f"{author['secondName']['text']}, {author['firstName']['text']}|||true" for author in authors] + return name_data.get('text') or name_data.get('romanizedText') or None def _map_contributors(self, contributors) -> Optional[list[str]]: if not contributors: From c4f031865b7b3346e19782ca1d28301f0e799e43 Mon Sep 17 00:00:00 2001 From: kyle Date: Tue, 17 Sep 2024 12:42:38 -0400 Subject: [PATCH 2/7] removing fallback --- mappings/oclc_bib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index b9912f93c0..3b757f2cc8 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -110,7 +110,7 @@ def _get_name(self, name_data) -> str: if not name_data: return None - return name_data.get('text') or name_data.get('romanizedText') or None + return name_data.get('text') or name_data.get('romanizedText') def _map_contributors(self, contributors) -> Optional[list[str]]: if not contributors: From 1cfbf6a3f680f1065e801385b7cadd37194cd185 Mon Sep 17 00:00:00 2001 From: kyle Date: Tue, 17 Sep 2024 12:48:37 -0400 Subject: [PATCH 3/7] getting contributor names --- mappings/oclc_bib.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index 3b757f2cc8..01820c6bb2 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -92,11 +92,20 @@ def _map_authors(self, authors) -> Optional[list[str]]: if not authors: return None - return [f'{author_name}|||true' for author in authors if (author_name := self._get_author_name(author))] + return [f'{author_name}|||true' for author in authors if (author_name := self._get_contributor_name(author))] - def _get_author_name(self, author) -> str: - first_name = self._get_name(author.get('firstName')) - second_name = self._get_name(author.get('secondName')) + def _map_contributors(self, contributors) -> Optional[list[str]]: + if not contributors: + return None + + return [ + f"{contributor_name}|||{', '.join(list(map(lambda relator: relator.get('term', ''), contributor.get('relators', []))))}" + for contributor in contributors if (contributor_name := self._get_contributor_name(contributor)) + ] + + def _get_contributor_name(self, contributor) -> str: + first_name = self._get_name(contributor.get('firstName')) + second_name = self._get_name(contributor.get('secondName')) if not first_name and not second_name: return None @@ -111,12 +120,3 @@ def _get_name(self, name_data) -> str: return None return name_data.get('text') or name_data.get('romanizedText') - - def _map_contributors(self, contributors) -> Optional[list[str]]: - if not contributors: - return None - - return [ - f"{contributor['secondName']['text']}, {contributor['firstName']['text']}|||{', '.join(list(map(lambda relator: relator.get('term', ''), contributor.get('relators', []))))}" - for contributor in contributors - ] From 62acca8f663811aba10a0ba1541a318445b3831a Mon Sep 17 00:00:00 2001 From: kyle Date: Tue, 17 Sep 2024 17:13:47 -0400 Subject: [PATCH 4/7] adding test --- mappings/oclc_bib.py | 2 + tests/unit/test_oclc_bib_mapping.py | 67 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tests/unit/test_oclc_bib_mapping.py diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index 01820c6bb2..0a352c8b45 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -13,6 +13,8 @@ def __init__(self, oclc_bib, related_oclc_numbers=[]): authors = self._get_authors(creators) contributors = self._get_contributors(creators) + print(creators) + self.record = Record( uuid=uuid4(), frbr_status='complete', diff --git a/tests/unit/test_oclc_bib_mapping.py b/tests/unit/test_oclc_bib_mapping.py new file mode 100644 index 0000000000..7d014e5c13 --- /dev/null +++ b/tests/unit/test_oclc_bib_mapping.py @@ -0,0 +1,67 @@ +from mappings.oclc_bib import OCLCBibMapping + +base_oclc_bib = { + 'identifier': { + 'oclcNumber': 1234 + }, + 'work': { + 'id': 1 + }, + 'title': { + 'mainTitles': [{ + 'text': 'The Story of DRB' + }] + }, + 'subjects': [{ + 'subjectName': { + 'text': 'Subject' + }, + 'vocabulary': 'fast' + }], + 'contributor': { + 'creators': [{ + 'firstName': { + 'text': 'Hathi' + }, + 'secondName': { + 'text': 'Trust' + }, + 'isPrimary': True + }] + } +} + + +def test_oclc_bib_mapping_full_name(): + oclc_bib_mapping = OCLCBibMapping(base_oclc_bib) + + assert ['Trust, Hathi|||true'] == oclc_bib_mapping.record.authors + + +def test_oclc_bib_mapping_no_first_name(): + base_oclc_bib['contributor'] = { + 'creators': [{ + 'secondName': { + 'text': 'Trust' + }, + 'isPrimary': True + }] + } + + oclc_bib_mapping = OCLCBibMapping(base_oclc_bib) + + assert ['Trust|||true'] == oclc_bib_mapping.record.authors + +def test_oclc_bib_mapping_no_second_name(): + base_oclc_bib['contributor'] = { + 'creators': [{ + 'firstName': { + 'text': 'Hathi' + }, + 'isPrimary': True + }] + } + + oclc_bib_mapping = OCLCBibMapping(base_oclc_bib) + + assert ['Hathi|||true'] == oclc_bib_mapping.record.authors \ No newline at end of file From 067d5b0f00bdbb6844f9d3c75530863d28493c14 Mon Sep 17 00:00:00 2001 From: kyle Date: Tue, 17 Sep 2024 17:14:40 -0400 Subject: [PATCH 5/7] oops --- mappings/oclc_bib.py | 4 +--- tests/unit/test_oclc_bib_mapping.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index 0a352c8b45..f54ef74932 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -12,9 +12,7 @@ def __init__(self, oclc_bib, related_oclc_numbers=[]): creators = self._get_creators(oclc_bib) authors = self._get_authors(creators) contributors = self._get_contributors(creators) - - print(creators) - + self.record = Record( uuid=uuid4(), frbr_status='complete', diff --git a/tests/unit/test_oclc_bib_mapping.py b/tests/unit/test_oclc_bib_mapping.py index 7d014e5c13..2e22f93393 100644 --- a/tests/unit/test_oclc_bib_mapping.py +++ b/tests/unit/test_oclc_bib_mapping.py @@ -64,4 +64,4 @@ def test_oclc_bib_mapping_no_second_name(): oclc_bib_mapping = OCLCBibMapping(base_oclc_bib) - assert ['Hathi|||true'] == oclc_bib_mapping.record.authors \ No newline at end of file + assert ['Hathi|||true'] == oclc_bib_mapping.record.authors From a8b1ad3dd57e56e11ac353347f467620909f3e1b Mon Sep 17 00:00:00 2001 From: kyle Date: Wed, 18 Sep 2024 09:11:39 -0400 Subject: [PATCH 6/7] guarding against mapping --- mappings/oclc_bib.py | 43 +++++++++++++++++------------ tests/unit/test_oclc_bib_mapping.py | 17 ++++++++++++ 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index f54ef74932..76fb4aaa99 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -8,39 +8,33 @@ class OCLCBibMapping(Core): def __init__(self, oclc_bib, related_oclc_numbers=[]): - identifiers = oclc_bib['identifier'] + self.record = self._map_to_record(oclc_bib, related_oclc_numbers) + + def _map_to_record(self, oclc_bib, related_oclc_numbers=[]) -> Record: + identifiers = oclc_bib.get('identifier', {}) creators = self._get_creators(oclc_bib) authors = self._get_authors(creators) contributors = self._get_contributors(creators) - self.record = Record( + return Record( uuid=uuid4(), frbr_status='complete', cluster_status=False, source='oclcClassify', - source_id=f"{oclc_bib['work']['id']}|owi", - title=oclc_bib['title']['mainTitles'][0]['text'], + source_id=f"{oclc_bib.get('work', {}).get('id')}|owi", + title=oclc_bib.get('title', {}).get('mainTitles', [{}])[0].get('text'), subjects=self._map_subjects(oclc_bib), authors=self._map_authors(authors), contributors=self._map_contributors(contributors), identifiers=( - [f"{oclc_bib['work']['id']}|owi"] + - [f"{identifiers['oclcNumber']}|oclc"] + + [f"{oclc_bib.get('work', {}).get('id')}|owi"] + + [f"{identifiers.get('oclcNumber')}|oclc"] + [f"{oclc_number}|oclc" for oclc_number in related_oclc_numbers] ), date_created=datetime.now(timezone.utc).replace(tzinfo=None), date_modified=datetime.now(timezone.utc).replace(tzinfo=None) ) - def createMapping(self): - pass - - def applyFormatting(self): - pass - - def applyMapping(self): - pass - def _get_creators(self, oclc_bib): if not oclc_bib.get('contributor'): return None @@ -48,7 +42,7 @@ def _get_creators(self, oclc_bib): return list( filter( lambda creator: creator.get('secondName') or creator.get('firstName'), - oclc_bib['contributor'].get('creators', []) + oclc_bib.get('contributor', {}).get('creators', []) ) ) @@ -79,14 +73,18 @@ def _get_contributors(self, creators): ) def _is_author(self, creator): - for role in set(map(lambda relator: relator.get('term').lower(), creator.get('relators', []))): + for role in set(map(lambda relator: relator.get('term', '').lower(), creator.get('relators', []))): if 'author' in role.lower() or 'writer' in role.lower(): return True return False def _map_subjects(self, oclc_bib) -> list[str]: - return [f"{subject['subjectName']['text']}||{subject.get('vocabulary', '')}" for subject in oclc_bib.get('subjects', [])] + return [ + f"{subject_name}||{subject.get('vocabulary', '')}" + for subject in oclc_bib.get('subjects', []) + if (subject_name := subject.get('subjectName', {}).get('text')) + ] def _map_authors(self, authors) -> Optional[list[str]]: if not authors: @@ -120,3 +118,12 @@ def _get_name(self, name_data) -> str: return None return name_data.get('text') or name_data.get('romanizedText') + + def createMapping(self): + pass + + def applyFormatting(self): + pass + + def applyMapping(self): + pass diff --git a/tests/unit/test_oclc_bib_mapping.py b/tests/unit/test_oclc_bib_mapping.py index 2e22f93393..8b4b2474f4 100644 --- a/tests/unit/test_oclc_bib_mapping.py +++ b/tests/unit/test_oclc_bib_mapping.py @@ -65,3 +65,20 @@ def test_oclc_bib_mapping_no_second_name(): oclc_bib_mapping = OCLCBibMapping(base_oclc_bib) assert ['Hathi|||true'] == oclc_bib_mapping.record.authors + +def test_oclc_bib_mapping_fallback_to_romanized_text(): + base_oclc_bib['contributor'] = { + 'creators': [{ + 'firstName': { + 'romanizedText': 'Homer' + }, + 'secondName': { + 'romanizedText': 'Simpson' + }, + 'isPrimary': True + }] + } + + oclc_bib_mapping = OCLCBibMapping(base_oclc_bib) + + assert ['Simpson, Homer|||true'] == oclc_bib_mapping.record.authors From f6a7fdbac0815825baa8028a91dec0a630a305e5 Mon Sep 17 00:00:00 2001 From: kyle Date: Wed, 18 Sep 2024 09:25:53 -0400 Subject: [PATCH 7/7] making things readable --- mappings/oclc_bib.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mappings/oclc_bib.py b/mappings/oclc_bib.py index 76fb4aaa99..bac0043cd0 100644 --- a/mappings/oclc_bib.py +++ b/mappings/oclc_bib.py @@ -52,9 +52,7 @@ def _get_authors(self, creators): return list( filter( - lambda creator: - creator.get('isPrimary', False) or - self._is_author(creator), + lambda creator: creator.get('isPrimary', False) or self._is_author(creator), creators ) ) @@ -65,9 +63,7 @@ def _get_contributors(self, creators): return list( filter( - lambda creator: - not creator.get('isPrimary', False) and - not self._is_author(creator), + lambda creator: not creator.get('isPrimary', False) and not self._is_author(creator), creators ) ) @@ -90,7 +86,11 @@ def _map_authors(self, authors) -> Optional[list[str]]: if not authors: return None - return [f'{author_name}|||true' for author in authors if (author_name := self._get_contributor_name(author))] + return [ + f'{author_name}|||true' + for author in authors + if (author_name := self._get_contributor_name(author)) + ] def _map_contributors(self, contributors) -> Optional[list[str]]: if not contributors: @@ -98,10 +98,11 @@ def _map_contributors(self, contributors) -> Optional[list[str]]: return [ f"{contributor_name}|||{', '.join(list(map(lambda relator: relator.get('term', ''), contributor.get('relators', []))))}" - for contributor in contributors if (contributor_name := self._get_contributor_name(contributor)) + for contributor in contributors + if (contributor_name := self._get_contributor_name(contributor)) ] - def _get_contributor_name(self, contributor) -> str: + def _get_contributor_name(self, contributor) -> Optional[str]: first_name = self._get_name(contributor.get('firstName')) second_name = self._get_name(contributor.get('secondName')) @@ -113,7 +114,7 @@ def _get_contributor_name(self, contributor) -> str: return f'{first_name or second_name}' - def _get_name(self, name_data) -> str: + def _get_name(self, name_data) -> Optional[str]: if not name_data: return None