Skip to content

Commit

Permalink
fix: use bibtexparser convert_to_unicode function
Browse files Browse the repository at this point in the history
  • Loading branch information
perrette committed Apr 21, 2023
1 parent 635aeed commit 4742508
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 30 deletions.
32 changes: 19 additions & 13 deletions papers/bib.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import itertools

import bibtexparser
+ from bibtexparser.customization import convert_to_unicode

import papers
from papers import logger
Expand All @@ -17,7 +18,7 @@
from papers.extract import extract_pdf_metadata
from papers.extract import fetch_bibtex_by_fulltext_crossref, fetch_bibtex_by_doi

- from papers.encoding import latex_to_unicode, unicode_to_latex, unicode_to_ascii
+ from papers.encoding import unicode_to_latex, unicode_to_ascii
from papers.encoding import parse_file, format_file, standard_name, family_names, format_entries, update_file_path

from papers.config import config, bcolors, checksum, move, search_config, CONFIG_FILE, DATA_DIR
Expand Down Expand Up @@ -368,8 +369,10 @@ def append_abc_to_key(self, entry):
return append_abc(entry['ID'], keys={self.key(e) for e in self.entries})


- def add_bibtex(self, bibtex, relative_to=None, attachments=None, **kw):
+ def add_bibtex(self, bibtex, relative_to=None, attachments=None, convert_to_unicode=False, **kw):
bib = bibtexparser.loads(bibtex)
+ if convert_to_unicode:
+ bib = bibtexparser.customization.convert_to_unicode(bib)
for e in bib.entries:
files = []
if "file" in e:
Expand Down Expand Up @@ -404,7 +407,7 @@ def add_pdf(self, pdf, attachments=None, search_doi=True, search_fulltext=True,
entry = bib.entries[0]

# convert curly brackets to unicode
- bibtexparser.customization.convert_to_unicode(entry)
+ entry = convert_to_unicode(entry)

files = [pdf] if pdf else []
if attachments:
Expand Down Expand Up @@ -589,16 +592,19 @@ def fix_entry(self, e, fix_doi=True, fetch=False, fetch_all=False,
assert encoding in ['unicode','latex'], e.get('ID','')+': unknown encoding: '+repr(encoding)

logger.debug(e.get('ID','')+': update encoding')
- for k in e:
- if k == k.lower() and k != 'abstract': # all but ENTRYTYPE, ID, abstract
- try:
- if encoding == 'unicode':
- e[k] = latex_to_unicode(e[k])
- elif encoding == 'latex':
- e[k] = unicode_to_latex(e[k])
- # except KeyError as error:
- except (KeyError, ValueError) as error:
- logger.warn(e.get('ID','')+': '+k+': failed to encode: '+str(error))
+ if encoding == "unicode":
+ e = convert_to_unicode(e)
+ else:
+ for k in e:
+ if k == k.lower() and k != 'abstract': # all but ENTRYTYPE, ID, abstract
+ try:
+ if encoding == 'unicode':
+ e[k] = latex_to_unicode(e[k])
+ elif encoding == 'latex':
+ e[k] = unicode_to_latex(e[k])
+ # except KeyError as error:
+ except (KeyError, ValueError) as error:
+ logger.warn(e.get('ID','')+': '+k+': failed to encode: '+str(error))

if fix_doi:
if 'doi' in e and e['doi']:
Expand Down
15 changes: 2 additions & 13 deletions papers/encoding.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import bibtexparser
from unidecode import unidecode as unicode_to_ascii
- from papers.latexenc import latex_to_unicode, unicode_to_latex
+ from papers.latexenc import unicode_to_latex
from papers import logger

# fix bibtexparser call on empty strings
Expand Down Expand Up @@ -129,18 +129,7 @@ def strip_outmost_brackets(family):


def standard_name(author):
- names = []
- for name in bibtexparser.customization.getnames([strip_outmost_brackets(nm) for nm in author.split(' and ')]):
- # if 'name' contains more than one author
- # Example: doi:10.1111/jnc.13687
- name = name.split(' and ')
-
- for n in name:
- family, given = n.split(',')
- family = strip_outmost_brackets(family.strip())
- # given = strip_outmost_brackets(given.strip())
- names.append(', '.join([family.strip(), given.strip()]))
- return ' and '.join(names)
+ return " and ".join(bibtexparser.customization.author({"author": author}).get("author",[]))


def family_names(author_field):
Expand Down
10 changes: 6 additions & 4 deletions papers/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import papers
from papers.config import cached
from papers import logger
- from papers.encoding import family_names, latex_to_unicode
+ from papers.encoding import family_names
+ from bibtexparser.customization import convert_to_unicode

my_etiquette = Etiquette('papers', papers.__version__, 'https://github.com/perrette/papers', '[email protected]')

Expand Down Expand Up @@ -369,14 +370,15 @@ def fetch_entry(e):
if 'doi' in e and isvaliddoi(e['doi']):
bibtex = fetch_bibtex_by_doi(e['doi'])
else:
+ e = convert_to_unicode(e)
kw = {}
if e.get('author',''):
- kw['author'] = latex_to_unicode(family_names(e['author']))
+ kw['author'] = family_names(e['author'])
if e.get('title',''):
- kw['title'] = latex_to_unicode(family_names(e['title']))
+ kw['title'] = e['title']
if kw:
bibtex = fetch_bibtex_by_fulltext_crossref('', **kw)
else:
- ValueError('no author not title field')
+ ValueError('no author nor title field')
db = bibtexparser.loads(bibtex)
return db.entries[0]

0 comments on commit 4742508

Please sign in to comment.