Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Infra: Add pep2bib.py for generating BibTeX entries #2085

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ pep-0000.txt
pep-0000.rst
pep-????.html
peps.rss
peps.bib
__pycache__
*.pyc
*.pyo
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,16 @@ pep-0000.rst: $(wildcard pep-????.txt) $(wildcard pep-????.rst) $(wildcard pep0/
rss:
$(PYTHON) pep2rss.py .

bib: pep-0000.rst
$(PYTHON) pep2bib.py .

install:
echo "Installing is not necessary anymore. It will be done in post-commit."

clean:
-rm pep-0000.rst
-rm *.html
-rm *.bib
-rm -rf build

update:
Expand Down
89 changes: 89 additions & 0 deletions pep2bib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env python3

# usage: python3 pep2bib.py .

import glob
import os
import re
import sys
from pybtex.database import Entry, BibliographyData

from pep_parsing_helpers import pep_number, pep_creation_dt, first_line_starting_with, parse_authors

BIB_PATH = os.path.join(sys.argv[1], 'peps.bib')


name_first_regex = re.compile(r'(.*)<.*>')
mail_first_regex = re.compile(r'.*\((.*)\)')
name_only_regex = re.compile(r'(.*)')


months = {
Comment on lines +16 to +21
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are top-level constants, so I'd think it would make sense to make them UPPER_SNAKE_CASE like the others.

1: 'jan',
2: 'feb',
3: 'mar',
4: 'apr',
5: 'may',
6: 'jun',
7: 'jul',
8: 'aug',
9: 'sep',
10: 'oct',
11: 'nov',
12: 'dec',
}


def authors_to_bib(authors):
    """Build a BibTeX ``author`` field from a list of PEP author entries.

    Each entry is matched against the module-level patterns in priority
    order: ``name_first_regex``, then ``mail_first_regex``, then
    ``name_only_regex``. Capture group 1 of the first pattern that matches
    is taken as the author's name and stripped of surrounding whitespace.
    The names are joined with " and ", the separator BibTeX uses between
    authors.
    """
    patterns = (name_first_regex, mail_first_regex, name_only_regex)
    names = []
    for entry in authors:
        found = None
        # Stop at the first pattern that recognises this entry.
        for pattern in patterns:
            found = pattern.match(entry)
            if found is not None:
                break
        names.append(found.group(1).strip())
    return " and ".join(names)


def main():
# get list of peps with creation time
# (from "Created:" string in pep .rst or .txt)
peps = glob.glob('pep-*.txt')
peps.extend(glob.glob('pep-*.rst'))

peps_with_dt = [(pep_number(full_path), pep_creation_dt(full_path), full_path) for full_path in peps]
# sort peps by number
peps_with_dt.sort()

items = {}
for n, dt, full_path in peps_with_dt:
title = first_line_starting_with(full_path, 'Title:')
author_string = first_line_starting_with(full_path, 'Author:')
authors = parse_authors(author_string)
authors = authors_to_bib(authors)
url = 'https://www.python.org/dev/peps/pep-%0.4d/' % n
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might want to factor this into a top-level constant, so it's easier to update if and when we move to peps.python.org

item = Entry('techreport', [
('author', authors),
('title', 'PEP %d: %s' % (n, title)),
('institution', "Python Software Foundation"),
('year', str(dt.year)),
('month', months[dt.month]),
('type', 'PEP'),
('number', str(n)),
('url', url)
])
items['pep%d' % n] = item

bib = BibliographyData(items)
bib_str = bib.to_string('bibtex')

# pybtex always quotes strings, but we want month strings unquoted, so bib styles can replace it
bib_str = re.sub('month = "(.*)"', r'month = \1', bib_str)

with open(BIB_PATH, 'w', encoding="utf-8") as fp:
fp.write(bib_str)


if __name__ == '__main__':
main()
35 changes: 5 additions & 30 deletions pep2rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import datetime
import glob
import os
import re
import sys
import time
import PyRSS2Gen as rssgen
import docutils.frontend
import docutils.nodes
import docutils.parsers.rst
import docutils.utils

from pep_parsing_helpers import pep_creation_dt, first_line_starting_with, parse_authors

RSS_PATH = os.path.join(sys.argv[1], 'peps.rss')


Expand Down Expand Up @@ -53,38 +53,12 @@ def pep_abstract(full_path: str) -> str:
return abstract


def firstline_startingwith(full_path, text):
for line in open(full_path, encoding="utf-8"):
if line.startswith(text):
return line[len(text):].strip()
return None


# get list of peps with creation time
# (from "Created:" string in pep .rst or .txt)
peps = glob.glob('pep-*.txt')
peps.extend(glob.glob('pep-*.rst'))


def pep_creation_dt(full_path):
created_str = firstline_startingwith(full_path, 'Created:')
# bleh, I was hoping to avoid re but some PEPs editorialize
# on the Created line
m = re.search(r'''(\d+-\w+-\d{4})''', created_str)
if not m:
# some older ones have an empty line, that's okay, if it's old
# we ipso facto don't care about it.
# "return None" would make the most sense but datetime objects
# refuse to compare with that. :-|
return datetime.datetime(*time.localtime(0)[:6])
created_str = m.group(1)
try:
t = time.strptime(created_str, '%d-%b-%Y')
except ValueError:
t = time.strptime(created_str, '%d-%B-%Y')
return datetime.datetime(*t[:6])


peps_with_dt = [(pep_creation_dt(full_path), full_path) for full_path in peps]
# sort peps by date, newest first
peps_with_dt.sort(reverse=True)
Expand All @@ -96,8 +70,9 @@ def pep_creation_dt(full_path):
n = int(full_path.split('-')[-1].split('.')[0])
except ValueError:
pass
title = firstline_startingwith(full_path, 'Title:')
author = firstline_startingwith(full_path, 'Author:')
title = first_line_starting_with(full_path, 'Title:')
authors = first_line_starting_with(full_path, 'Author:')
author = parse_authors(authors)[0] # RSS only supports one author
abstract = pep_abstract(full_path)
url = 'https://www.python.org/dev/peps/pep-%0.4d/' % n
item = rssgen.RSSItem(
Expand Down
56 changes: 56 additions & 0 deletions pep_parsing_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import re
import datetime
import time
Comment on lines +1 to +3
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
import re
import datetime
import time
import datetime
import re
import time



def first_line_starting_with(full_path, text):
    """Return the value of the first line in a file that starts with *text*.

    The file is read as UTF-8. The value is everything after *text* on the
    matching line, stripped of surrounding whitespace. Non-blank lines that
    follow immediately and begin with whitespace are treated as folded
    continuation lines: each is stripped and appended to the value,
    separated by a single space.

    Args:
        full_path: Path of the file to scan.
        text: Prefix to look for at the start of a line, e.g. ``'Title:'``.

    Returns:
        The (unfolded) value as a string, or ``None`` if no line starts
        with *text*.
    """
    value = None
    # Use a context manager so the handle is closed even on early exit.
    with open(full_path, encoding="utf-8") as pep_file:
        for line in pep_file:
            if value is None:
                if line.startswith(text):
                    value = line[len(text):].strip()
                continue
            # We are past the matching line: only indented, non-blank lines
            # continue the value. A blank line or a line starting in
            # column 0 ends it.
            continuation = line.strip()
            if line[0].isspace() and continuation:
                value += " " + continuation
            else:
                break
    # Also reached when the file ends while the value is still being
    # collected, so a header on the last line is not lost.
    return value


def pep_creation_dt(full_path):
    """Return the creation date of a PEP file as a ``datetime.datetime``.

    The date is taken from the file's ``Created:`` line, looked up with
    ``first_line_starting_with``. A regular expression pulls out the first
    ``<digits>-<word>-<4 digits>`` token, because some PEPs carry extra
    text on that line. The token is parsed with an abbreviated month name
    (``%b``) first and a full month name (``%B``) as a fallback.

    Args:
        full_path: Path of the PEP file.

    Returns:
        The parsed creation date. If the file has no ``Created:`` line, or
        the line contains no recognisable date, the epoch in local time is
        returned instead of ``None``, so results stay mutually comparable
        when sorted.

    Raises:
        ValueError: If a date-like token is found but matches neither
            month-name format.
    """
    created_str = first_line_starting_with(full_path, 'Created:')
    match = None
    # The lookup yields None when there is no "Created:" line at all;
    # guard against it so that case takes the fallback instead of crashing.
    if created_str is not None:
        match = re.search(r'''(\d+-\w+-\d{4})''', created_str)
    if match is None:
        # Some older PEPs have an empty line here; they are old enough that
        # the exact date does not matter. datetime objects refuse to compare
        # with None, hence the epoch placeholder.
        return datetime.datetime(*time.localtime(0)[:6])
    date_text = match.group(1)
    try:
        parsed = time.strptime(date_text, '%d-%b-%Y')
    except ValueError:
        parsed = time.strptime(date_text, '%d-%B-%Y')
    return datetime.datetime(*parsed[:6])


def pep_number(full_path):
    """Extract the PEP number from a PEP file path.

    The number is the text between the last ``-`` in the path and the
    first ``.`` after it, e.g. ``'pep-0008.txt'`` gives ``8``.

    Args:
        full_path: Path (or bare file name) of the PEP file.

    Returns:
        The PEP number as an ``int``.

    Raises:
        ValueError: If that part of the path is not an integer. The
            original conversion error is attached as the cause.
    """
    n_str = full_path.split('-')[-1].split('.')[0]
    try:
        return int(n_str)
    except ValueError as err:
        # ValueError is still caught by callers using ``except Exception``,
        # and chaining keeps the underlying int() failure in the traceback.
        raise ValueError("Can't parse pep number %s" % n_str) from err


def parse_authors(authors_str):
    """Split the value of a PEP ``Author:`` header into individual authors.

    The value is split on commas and each piece is stripped of surrounding
    whitespace (including newlines left over from folded header lines).
    Empty pieces, such as the one produced by a trailing comma, are dropped.

    Args:
        authors_str: The raw header value, or ``None`` if the header was
            not found.

    Returns:
        A list of author strings in their original order; an empty list
        when *authors_str* is ``None`` or contains no authors.
    """
    # NOTE(review): a name that itself contains a comma (e.g. "..., Jr.")
    # is still split in two here, as before - confirm whether any PEP
    # needs that handled.
    if not authors_str:
        return []
    pieces = (piece.strip() for piece in authors_str.split(','))
    return [piece for piece in pieces if piece]


Comment on lines +54 to +56
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return authors
return authors

3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ docutils >= 0.16

# For RSS
feedgen >= 0.9.0 # For RSS feed

# For bibliography
pybtex >= 0.24.0
Comment on lines +7 to +9
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# For bibliography
pybtex >= 0.24.0

I would strongly advise against adding this to the requirements file, as this means all CIs and users that set up an env for our repo in a normal fashion will have to install it, when it isn't (yet) actually used anywhere other than a niche manual command. I suggest handling it just like we did the lint command and installing it on demand instead.