From ab60d1ecd504571866006ae3011c90d7805ed038 Mon Sep 17 00:00:00 2001 From: perrette Date: Fri, 21 Apr 2023 23:42:23 +0200 Subject: [PATCH] update default similarity to PARTIAL - duplicate if either DOI or AuthorTitle match I cannot think of any situation where two legitimately distinct papers will have the exact same Authors and titles, or of course same DOI - tests updated to reflect that, and rewritten for clarity --- papers/bib.py | 17 +++++--- tests/test_papers.py | 94 +++++++++++++++++++++----------------------- 2 files changed, 57 insertions(+), 54 deletions(-) diff --git a/papers/bib.py b/papers/bib.py index bae894a..70963f3 100644 --- a/papers/bib.py +++ b/papers/bib.py @@ -97,9 +97,12 @@ def entry_id(e): FUZZY_RATIO = 80 -# should be conservative (used in papers add) -DEFAULT_SIMILARITY = 'FAIR' -# DEFAULT_SIMILARITY = 'PARTIAL' +# Default similarity is used in papers add +# False positive (too weak a test) and distinct entries will be merged +# False negative and duplicates will be created +# PARTIAL means that if either DOI or author+title agree, that is good enough to be considered a duplicate +# I can't think of any situation where two legitimately distinct papers test True with partial similarity. 
+DEFAULT_SIMILARITY = 'PARTIAL' EXACT_DUPLICATES = 104 GOOD_DUPLICATES = 103 @@ -122,10 +125,14 @@ def compare_entries(e1, e2, fuzzy=False): if id1 == id2: score = GOOD_DUPLICATES - elif all([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # all defined fields agree + elif e1.get('doi') and e2.get('doi') and e1['doi'] == e2['doi']: score = FAIR_DUPLICATES - elif any([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # some of the defined fields agree + # elif all([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # ID and AUTHORTITLE agree + # score = FAIR_DUPLICATES + # COMMENT: same as GOOD_DUPLICATES when all fields are defined, but also returns true when one field is missing in one entry + + elif any([f1==f2 for f1, f2 in zip(id1, id2) if f1 and f2]): # any of ID or AUTHORTITLE agree score = PARTIAL_DUPLICATES elif not fuzzy: diff --git a/tests/test_papers.py b/tests/test_papers.py index 3d15eb7..73871e8 100644 --- a/tests/test_papers.py +++ b/tests/test_papers.py @@ -346,8 +346,9 @@ def assertMultiLineEqual(self, first, second, msg=None): +class SimilarityBase(unittest.TestCase): -class TestDuplicates(unittest.TestCase): + similarity = None reference = """@article{Perrette_2011, author = {M. Perrette and A. Yool and G. D. Quartly and E. E. 
Popova}, @@ -397,29 +398,34 @@ class TestDuplicates(unittest.TestCase): year = {2012} }""" - @staticmethod - def isduplicate(a, b): + + def isduplicate(self, a, b): """test Biblio's eq method for duplicates """ db = bibtexparser.loads(a+'\n'+b) e1, e2 = db.entries - refs = Biblio() + refs = Biblio(similarity=self.similarity) return refs.eq(e1, e2) + +class TestDuplicatesExact(SimilarityBase): + + similarity = 'EXACT' + def test_exactsame(self): self.assertTrue(self.isduplicate(self.reference, self.reference)) def test_anotherkey(self): - self.assertTrue(self.isduplicate(self.reference, self.anotherkey)) + self.assertFalse(self.isduplicate(self.reference, self.anotherkey)) def test_missingfield(self): - self.assertTrue(self.isduplicate(self.reference, self.missingfield)) + self.assertFalse(self.isduplicate(self.reference, self.missingfield)) def test_missingdoi(self): - self.assertTrue(self.isduplicate(self.reference, self.missingdoi)) + self.assertFalse(self.isduplicate(self.reference, self.missingdoi)) def test_missingtitauthor(self): - self.assertTrue(self.isduplicate(self.reference, self.missingtitauthor)) + self.assertFalse(self.isduplicate(self.reference, self.missingtitauthor)) def test_conflictauthor(self): self.assertFalse(self.isduplicate(self.reference, self.conflictauthor)) @@ -428,68 +434,58 @@ def test_conflictdoi(self): self.assertFalse(self.isduplicate(self.reference, self.conflictdoi)) def test_conflictyear(self): - self.assertTrue(self.isduplicate(self.reference, self.conflictyear)) - - - -class SimilarityBase: - - similarity = None - - def isduplicate(self, a, b): - """test Biblio's eq method for duplicates - """ - db = bibtexparser.loads(a+'\n'+b) - e1, e2 = db.entries - refs = Biblio(similarity=self.similarity) - return refs.eq(e1, e2) + self.assertFalse(self.isduplicate(self.reference, self.conflictyear)) -class TestDuplicatesExact(SimilarityBase, TestDuplicates): +class TestDuplicatesGood(TestDuplicatesExact): - similarity = 'EXACT' + 
similarity = 'GOOD' def test_anotherkey(self): - self.assertFalse(self.isduplicate(self.reference, self.anotherkey)) + self.assertTrue(self.isduplicate(self.reference, self.anotherkey)) def test_missingfield(self): - self.assertFalse(self.isduplicate(self.reference, self.missingfield)) - - def test_missingdoi(self): - self.assertFalse(self.isduplicate(self.reference, self.missingdoi)) - - def test_missingtitauthor(self): - self.assertFalse(self.isduplicate(self.reference, self.missingtitauthor)) + self.assertTrue(self.isduplicate(self.reference, self.missingfield)) def test_conflictyear(self): - self.assertFalse(self.isduplicate(self.reference, self.conflictyear)) - - + self.assertTrue(self.isduplicate(self.reference, self.conflictyear)) -class TestDuplicatesGood(SimilarityBase, TestDuplicates): - similarity = 'GOOD' +class TestDuplicatesFair(TestDuplicatesGood): - def test_missingdoi(self): - self.assertFalse(self.isduplicate(self.reference, self.missingdoi)) + similarity = 'FAIR' def test_missingtitauthor(self): - self.assertFalse(self.isduplicate(self.reference, self.missingtitauthor)) + self.assertTrue(self.isduplicate(self.reference, self.missingtitauthor)) + + def test_conflictauthor(self): + self.assertTrue(self.isduplicate(self.reference, self.conflictauthor)) -class TestDuplicatesPartial(SimilarityBase, TestDuplicates): +class TestDuplicatesPartial(TestDuplicatesFair): similarity = 'PARTIAL' - def test_conflictauthor(self): - self.assertTrue(self.isduplicate(self.reference, self.conflictauthor)) + def test_missingdoi(self): + self.assertTrue(self.isduplicate(self.reference, self.missingdoi)) def test_conflictdoi(self): self.assertTrue(self.isduplicate(self.reference, self.conflictdoi)) +class TestDuplicates(TestDuplicatesPartial): -class TestDuplicatesAdd(SimilarityBase, TestDuplicates): + @staticmethod + def isduplicate(a, b): + """test Biblio's eq method for duplicates + """ + db = bibtexparser.loads(a+'\n'+b) + e1, e2 = db.entries + refs = Biblio() + 
return refs.eq(e1, e2) + + +class TestDuplicatesAdd(TestDuplicates): def setUp(self): self.mybib = tempfile.mktemp(prefix='papers.bib') @@ -760,15 +756,15 @@ class TestAddConflict(BibTest): }""" bibtex_conflict_key = """@article{Perrette_2011, - author = {M. Perrette and A. Yool and G. D. Quartly and E. E. Popova}, + author = {M. Perrette and Another author}, doi = {10.5194/bg-8-515-2011XXX}, - title = {Near-ubiquity of ice-edge blooms in the Arctic} + title = {Something else entirely} }""" bibtex_conflict_key_fixed = """@article{Perrette_2011b, - author = {M. Perrette and A. Yool and G. D. Quartly and E. E. Popova}, + author = {M. Perrette and Another author}, doi = {10.5194/bg-8-515-2011XXX}, - title = {Near-ubiquity of ice-edge blooms in the Arctic} + title = {Something else entirely} }""" bibtex_same_doi = """@article{same_doi,