fix: use bibtexparser convert_to_unicode function

perrette · Apr 21, 2023 · commit 4742508
1 parent 635aeed

Showing 3 changed files with 27 additions and 30 deletions.
diff --git a/papers/bib.py b/papers/bib.py
@@ -9,6 +9,7 @@
 import itertools
 
 import bibtexparser
+from bibtexparser.customization import convert_to_unicode
 
 import papers
 from papers import logger
@@ -17,7 +18,7 @@
 from papers.extract import extract_pdf_metadata
 from papers.extract import fetch_bibtex_by_fulltext_crossref, fetch_bibtex_by_doi
 
-from papers.encoding import latex_to_unicode, unicode_to_latex, unicode_to_ascii
+from papers.encoding import unicode_to_latex, unicode_to_ascii
 from papers.encoding import parse_file, format_file, standard_name, family_names, format_entries, update_file_path
 
 from papers.config import config, bcolors, checksum, move, search_config, CONFIG_FILE, DATA_DIR
@@ -368,8 +369,10 @@ def append_abc_to_key(self, entry):
         return append_abc(entry['ID'], keys={self.key(e) for e in self.entries})
 
 
-    def add_bibtex(self, bibtex, relative_to=None, attachments=None, **kw):
+    def add_bibtex(self, bibtex, relative_to=None, attachments=None, convert_to_unicode=False, **kw):
         bib = bibtexparser.loads(bibtex)
+        if convert_to_unicode:
+            bib = bibtexparser.customization.convert_to_unicode(bib)
         for e in bib.entries:
             files = []
             if "file" in e:
@@ -404,7 +407,7 @@ def add_pdf(self, pdf, attachments=None, search_doi=True, search_fulltext=True,
         entry = bib.entries[0]
 
         # convert curly brackets to unicode
-        bibtexparser.customization.convert_to_unicode(entry)
+        entry = convert_to_unicode(entry)
 
         files = [pdf] if pdf else []
         if attachments:
@@ -589,16 +592,19 @@ def fix_entry(self, e, fix_doi=True, fetch=False, fetch_all=False,
             assert encoding in ['unicode','latex'], e.get('ID','')+': unknown encoding: '+repr(encoding)
 
             logger.debug(e.get('ID','')+': update encoding')
-            for k in e:
-                if k == k.lower() and k != 'abstract': # all but ENTRYTYPE, ID, abstract
-                    try:
-                        if encoding == 'unicode':
-                            e[k] = latex_to_unicode(e[k])
-                        elif encoding == 'latex':
-                            e[k] = unicode_to_latex(e[k])
-                    # except KeyError as error:
-                    except (KeyError, ValueError) as error:
-                        logger.warn(e.get('ID','')+': '+k+': failed to encode: '+str(error))
+            if encoding == "unicode":
+                e = convert_to_unicode(e)
+            else:
+                for k in e:
+                    if k == k.lower() and k != 'abstract': # all but ENTRYTYPE, ID, abstract
+                        try:
+                            if encoding == 'unicode':
+                                e[k] = latex_to_unicode(e[k])
+                            elif encoding == 'latex':
+                                e[k] = unicode_to_latex(e[k])
+                        # except KeyError as error:
+                        except (KeyError, ValueError) as error:
+                            logger.warn(e.get('ID','')+': '+k+': failed to encode: '+str(error))
 
         if fix_doi:
             if 'doi' in e and e['doi']:

diff --git a/papers/encoding.py b/papers/encoding.py
@@ -1,7 +1,7 @@
 import os
 import bibtexparser
 from unidecode import unidecode as unicode_to_ascii
-from papers.latexenc import latex_to_unicode, unicode_to_latex
+from papers.latexenc import unicode_to_latex
 from papers import logger
 
 # fix bibtexparser call on empty strings
@@ -129,18 +129,7 @@ def strip_outmost_brackets(family):
 
 
 def standard_name(author):
-    names = []
-    for name in bibtexparser.customization.getnames([strip_outmost_brackets(nm) for nm in author.split(' and ')]):
-        # if 'name' contains more than one author
-        # Example: doi:10.1111/jnc.13687
-        name = name.split(' and ')
-
-        for n in name:
-            family, given = n.split(',')
-            family = strip_outmost_brackets(family.strip())
-            # given = strip_outmost_brackets(given.strip())
-            names.append(', '.join([family.strip(), given.strip()]))
-    return ' and '.join(names)
+    return " and ".join(bibtexparser.customization.author({"author": author}).get("author",[]))
 
 
 def family_names(author_field):

diff --git a/papers/extract.py b/papers/extract.py
@@ -10,7 +10,8 @@
 import papers
 from papers.config import cached
 from papers import logger
-from papers.encoding import family_names, latex_to_unicode
+from papers.encoding import family_names
+from bibtexparser.customization import convert_to_unicode
 
 my_etiquette = Etiquette('papers', papers.__version__, 'https://github.com/perrette/papers', '[email protected]')
 
@@ -369,14 +370,15 @@ def fetch_entry(e):
     if 'doi' in e and isvaliddoi(e['doi']):
         bibtex = fetch_bibtex_by_doi(e['doi'])
     else:
+        e = convert_to_unicode(e)
         kw = {}
         if e.get('author',''):
-            kw['author'] = latex_to_unicode(family_names(e['author']))
+            kw['author'] = family_names(e['author'])
         if e.get('title',''):
-            kw['title'] = latex_to_unicode(family_names(e['title']))
+            kw['title'] = e['title']
         if kw:
             bibtex = fetch_bibtex_by_fulltext_crossref('', **kw)
         else:
-            ValueError('no author not title field')
+            ValueError('no author nor title field')
     db = bibtexparser.loads(bibtex)
     return db.entries[0]