-
Notifications
You must be signed in to change notification settings - Fork 2
/
tidy-md-refs.py
executable file
·164 lines (131 loc) · 5.41 KB
/
tidy-md-refs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python3
# pylint: disable=invalid-name
"""Tidy numbering and ordering of all Markdown reference links in FILEs.
Number _all_ reference links in the order they appear in the text and place
them at the end of the file. _Remove_ any unused reference link definitions.
If no files are given, read stdin and print tidied Markdown to stdout.
Titles in reference definitions must be on the same line as the link; if not,
this script may move the link and leave the title behind. Examples:
> OK:
> [21]: https://spec.commonmark.org/0.31.2/ CommonMark
> OK:
> [21]:
> https://spec.commonmark.org/0.31.2/ CommonMark
> NOT OK:
> [21]: https://spec.commonmark.org/0.31.2/
> CommonMark
> NOT OK:
> [21]: https://spec.commonmark.org/0.31.2/ (
> CommonMark )
Original script by Dr. Drang – https://github.com/drdrang/
Posted at:
https://leancrew.com/all-this/2012/09/tidying-markdown-reference-links/ or
https://web.archive.org/web/20120920012828/http://www.leancrew.com/all-this/
WayBack machine archive link.
"""
from __future__ import annotations
import argparse
import re
import sys
import warnings
from typing import Optional
from typing import TextIO
# The regex for finding reference links (``[text][id]``) in the text. The
# ``^`` excluded from the id keeps footnote references (``[^id]``) from
# matching.
link = re.compile(r"\[([^]]+)]\[([^]^]+)]")
# The regex for finding a reference definition (``[id]: destination``).
# As with links, ``^`` is excluded from the id so footnote definitions
# (``[^id]: text``) are never mistaken for reference definitions and removed.
# Allow up to three spaces of indentation, per CommonMark spec. ``\s+`` also
# matches a newline, so the destination may start on the following line.
# The ``(?!\[)`` lookahead avoids eating a reference label on a line
# following an invalid one.
# TODO: Properly handle link titles (hard, probably requires markdown parser)
label = re.compile(r"^ {0,3}\[([^]^]+)]:\s+(?!\[)(.+)$", re.MULTILINE)
# The regex for label-like things that might create confusion with our labels:
# numeric labels still present after every `label` match has been removed.
badlabel = re.compile(r"^ {0,3}\[\d+]:(?:[ \t].*)?$", re.MULTILINE)


def tidy(text: str) -> str:
    """Renumber reference links by first appearance and regroup definitions.

    Every ``[text][id]`` link is rewritten to ``[text][N]``, where N is the
    1-based position at which ``id`` first appears among the links. All
    reference definitions matched by `label` are removed from where they
    were; the ones still in use are appended, renumbered, at the end of the
    text. Unused definitions are dropped. Footnote references and footnote
    definitions (``[^id]``) are left untouched.

    Args:
        text: The Markdown source to tidy.

    Returns:
        The tidied Markdown, with trailing whitespace stripped and a final
        newline appended. Text containing no reference links is returned
        unchanged.

    Warns:
        UserWarning: If a link refers to an id that has no definition, or if
            leftover invalid numeric labels were removed.
    """
    links = link.findall(text)
    if not links:
        return text

    labels = dict(label.findall(text))

    # Map each reference id to its new number. A dict keeps insertion order,
    # so an id used more than once keeps the number of its first appearance,
    # and lookups stay O(1) however many links the document has.
    numbering: dict[str, int] = {}
    for _, ref in links:
        numbering.setdefault(ref, len(numbering) + 1)

    def refrepl(m: re.Match) -> str:
        """Rewrite reference links with the reordered link numbers."""
        return f"[{m.group(1)}][{numbering[m.group(2)]}]"

    # Make a list of the references in order of appearance.
    newlabels = []
    for ref, number in numbering.items():
        if ref in labels:
            newlabels.append(f"[{number}]: {labels[ref]}")
        else:
            # Warn about missing label, but continue processing others.
            missing_ref = (
                f"Missing/empty reference [{number}] (originally [{ref}])"
            )
            warnings.warn(missing_ref)

    # Remove the old reference labels.
    text = label.sub("", text).rstrip()

    # Remove any leftover invalid numeric labels that may conflict or confuse.
    badlabels = [lab.replace("\n", "") for lab in badlabel.findall(text)]
    if badlabels:
        bad_labels = "\n • ".join(["Removed invalid references:", *badlabels])
        warnings.warn(bad_labels)
        text = badlabel.sub("", text).rstrip()

    # Append the new labels at the end of the text. When no definition
    # survived, skip the separator so the output does not end in blank lines.
    if newlabels:
        text += "\n\n" + "\n".join(newlabels)

    # Rewrite the links with the new reference numbers.
    return link.sub(refrepl, text) + "\n"
def tidy_file(input_file: TextIO, output_file: Optional[TextIO]) -> bool:
    """Tidy the contents of one stream and report whether it was already tidy.

    Args:
        input_file: Stream to read the Markdown from. When `output_file` is
            None it is also rewritten in place, so it must then support
            seeking, writing and truncation.
        output_file: Stream that receives the tidied text, or None to
            overwrite `input_file` (only when the text actually changed).

    Returns:
        True if tidying left the text unchanged, False otherwise.
    """
    before = input_file.read()
    after = tidy(before)
    unchanged = before == after

    if output_file is None:
        # In-place mode: leave an already tidy file completely untouched.
        if unchanged:
            return True
        # Rewind so the tidied text overwrites the file from the start.
        input_file.seek(0)
        output_file = input_file

    print(after, end="", file=output_file)
    if output_file == input_file:
        # Drop whatever remains of the old content past the new end.
        output_file.truncate()
    return unchanged
def main() -> int:
    """Tidy Markdown file arguments' reference links, or stdin if no files.

    With no FILE arguments, stdin is tidied to stdout. Otherwise each FILE is
    rewritten in place, except that the argument '-' (which argparse.FileType
    turns into sys.stdin) is tidied to stdout.

    Returns:
        The exit code: 0 if no file needed changes (always 0 when no FILE
        arguments are given), 1 if any file was modified. Bad arguments are
        reported by argparse itself, which exits with status 2.
    """
    docstring = sys.modules[__name__].__doc__ or ""
    # Only the first line of the module docstring is used as the description.
    description = docstring.split("\n", 1)[0]
    epilog = """
Number all reference links in the order they appear in the text and place
them at the end of the file. Remove any unused reference link definitions.
Reference titles must be on the same line as links. See the script source
for details and other limitations of this script.
Returns exit code 1 if any files were modified, 2 for bad arguments, and
zero (success) if all files were already tidy.
If no FILE arguments are present, read stdin and write stdout, always
returning success, even if input was not tidy. To get exit code == 1 for
untidy stdin, provide '-' as the filename."""
    parser = argparse.ArgumentParser(
        description=description, epilog=epilog, allow_abbrev=False
    )
    parser.add_argument(
        "files",
        help="(multiple) Markdown file(s) for reference link tidying",
        metavar="FILE",
        nargs="*",
        type=argparse.FileType("r+"),
    )
    args = parser.parse_args()

    if not args.files:
        tidy_file(sys.stdin, sys.stdout)
        return 0

    modified = False
    try:
        for in_file in args.files:
            # '-' arrives as sys.stdin; it cannot be rewritten in place, so
            # send its tidied text to stdout instead.
            out_file = sys.stdout if in_file is sys.stdin else None
            if not tidy_file(in_file, out_file):
                modified = True
    finally:
        # argparse.FileType opened every FILE during parsing; close them so
        # in-place writes are flushed now and no handle is leaked, even if
        # tidying one of the files raised.
        for handle in args.files:
            if handle is not sys.stdin:
                handle.close()
    return 1 if modified else 0
# Run as a script: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())