Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 175 additions & 67 deletions great_docs/assets/post-render.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import html
import json
import os
import re
Expand Down Expand Up @@ -3922,6 +3923,156 @@ def fix_page_metadata_script_paths():
# widget (copy to clipboard / view as plain Markdown).


def _postprocess_markdown_content(md_content: str, rel: str) -> str:
"""Apply markdown cleanup and link normalization after pandoc conversion."""

# Decode HTML entities that may have been double-encoded or improperly handled.
# This fixes issues like: ’ -> ', “ -> ", etc.
md_content = html.unescape(md_content)

# Fix common UTF-8 mojibake sequences (UTF-8 bytes interpreted as Latin-1).
# When UTF-8 bytes are incorrectly decoded as Latin-1, we get mojibake patterns.
# E.g., U+2019 (') is bytes E2 80 99 in UTF-8, but decoded as Latin-1 becomes
# the three characters U+00E2 U+20AC U+2122, which looks like: ’
mojibake_fixes = {
"\u00e2\u20ac\u2122": "\u2019", # Right single quotation mark
"\u00e2\u20ac\u009c": "\u201c", # Left double quotation mark
"\u00e2\u20ac\u009d": "\u201d", # Right double quotation mark
"\u00e2\u20ac\u0093": "\u2013", # En dash
"\u00e2\u20ac\u0094": "\u2014", # Em dash
# Alternate mojibake form with C1 control characters.
"\u00e2\u0080\u0099": "\u2019",
"\u00e2\u0080\u009c": "\u201c",
"\u00e2\u0080\u009d": "\u201d",
"\u00e2\u0080\u0093": "\u2013",
"\u00e2\u0080\u0094": "\u2014",
}
for mojibake, correct in mojibake_fixes.items():
md_content = md_content.replace(mojibake, correct)

# Normalize typography to plain ASCII for robust display/copy across
# environments that may not preserve UTF-8 metadata for raw .md files.
typography_fixes = {
"\u2018": "'", # Left single quotation mark
"\u2019": "'", # Right single quotation mark
"\u201c": '"', # Left double quotation mark
"\u201d": '"', # Right double quotation mark
"\u2013": "-", # En dash
"\u2014": "--", # Em dash
}
for src, dst in typography_fixes.items():
md_content = md_content.replace(src, dst)

# Remove standalone Source links (HTML and markdown forms).
md_content = re.sub(
r"^\s*<a\s+[^>]*>\s*source\s*</a>\s*\n?",
"",
md_content,
flags=re.IGNORECASE | re.MULTILINE,
)
md_content = re.sub(
r"^\s*\[source\]\([^\n)]+\)\s*\n?",
"",
md_content,
flags=re.IGNORECASE | re.MULTILINE,
)

# Convert simple HTML anchors to markdown links.
# Keep this conservative: only convert when anchor text has no nested tags.
def _anchor_to_md(m):
href = m.group("href").strip()
text = m.group("text")
if "<" in text or ">" in text:
return m.group(0)
text = html.unescape(text).strip()
if not text:
return m.group(0)
return f"[{text}]({href})"

md_content = re.sub(
r'<a\s+[^>]*href="(?P<href>[^"]+)"[^>]*>(?P<text>.*?)</a>',
_anchor_to_md,
md_content,
flags=re.IGNORECASE | re.DOTALL,
)

# Normalize parameter signature artifacts created by adjacent code spans.
# These can appear in various formats depending on how pandoc combined the spans.
# Pattern 1: `name``:`` ``type`` ``=`` ``default`` - full three-part with optional closing backtick
md_content = re.sub(
r"`([^`\n]+)``:``\s*``([^`\n]+)``\s*``=``\s*``([^`]+?)``?",
r"`\1`: `\2` = `\3`",
md_content,
)
# Pattern 2: `name``:`` ``type`` - just name and type
md_content = re.sub(
r"`([^`\n]+)``:``\s*``([^`\n]+)``",
r"`\1`: `\2`",
md_content,
)

# Remove leftover HTML div wrappers that pandoc preserved.
md_content = re.sub(
r"^<div[^>]*>\s*$",
"",
md_content,
flags=re.MULTILINE,
)
md_content = re.sub(
r"^</div>\s*$",
"",
md_content,
flags=re.MULTILINE,
)

# Remove leftover <span> tags with parameter/annotation classes.
md_content = re.sub(
r'<span\s+class="parameter-[^"]*"[^>]*>(.*?)</span>',
r"\1",
md_content,
)

# Rewrite internal .html links to .md (relative paths only).
md_content = re.sub(
r"\]\((\.\./[^)]*?)\.html(\)?)",
r"](\1.md\2",
md_content,
)
# Also in the same directory.
md_content = re.sub(
r"\]\(([A-Za-z0-9_][^):/]*?)\.html(\)?)",
r"](\1.md\2",
md_content,
)

# Simplify redundant ../current_dir/ paths to ./
file_dir = os.path.dirname(rel)
if file_dir:
escaped = re.escape("../" + file_dir + "/")
md_content = re.sub(
r"\]\(" + escaped + r"([^)]+)\)",
r"](\1)",
md_content,
)

# Remove leftover <span> tags (screen-reader, callout-icon, etc.)
md_content = re.sub(
r'<span\s+class="[^"]*">(.*?)</span>',
r"\1",
md_content,
)
# Remove empty <i> tags (callout icons)
md_content = re.sub(r"<i[^>]*></i>", "", md_content)

# Clean up excessive blank lines (3+ → 2)
md_content = re.sub(r"\n{4,}", "\n\n\n", md_content)

# Strip trailing whitespace
md_content = md_content.strip() + "\n"

return md_content


def generate_markdown_pages():
"""
Create a .md companion for every .html page in _site/.
Expand Down Expand Up @@ -4174,9 +4325,9 @@ def _param_dl_to_html(m):
dl_html = m.group(0)
items = []
dt_dd_pattern = re.compile(
r'<dt>.*?<span class="parameter-name">\s*<strong>(.*?)</strong>\s*</span>'
r'(?:.*?<span class="parameter-annotation">(.*?)</span>)?'
r'(?:.*?<span class="parameter-default">(.*?)</span>)?'
r'<dt>.*?<span class="(?:parameter-name|doc-parameter-name)">\s*<strong>(.*?)</strong>\s*</span>'
r'(?:.*?<span class="(?:parameter-annotation|doc-parameter-annotation)">(.*?)</span>)?'
r'(?:.*?<span class="(?:parameter-default|doc-parameter-default)">(.*?)</span>)?'
r".*?</dt>\s*<dd>\s*(.*?)\s*</dd>",
re.DOTALL,
)
Expand Down Expand Up @@ -4232,9 +4383,10 @@ def _param_dl_to_html(m):
main_html,
)

# Remove parameter-* spans (parameter-name, parameter-annotation, etc.)
# Remove parameter-* spans (parameter-name, doc-parameter-name, parameter-annotation, etc.)
# Note: The HTML may have either 'parameter-*' or 'doc-parameter-*' class names
main_html = re.sub(
r'<span\s+class="parameter-[^"]*"[^>]*>(.*?)</span>',
r'<span\s+class="(?:parameter-|doc-parameter-)[^"]*"[^>]*>(.*?)</span>',
r"\1",
main_html,
)
Expand All @@ -4261,68 +4413,24 @@ def _param_dl_to_html(m):
continue

md_content = result.stdout

# ── 4b. Post-pandoc cleanup ──────────────────────────────────
# Remove leftover HTML div wrappers that pandoc preserved
md_content = re.sub(
r"^<div[^>]*>\s*$",
"",
md_content,
flags=re.MULTILINE,
)
md_content = re.sub(
r"^</div>\s*$",
"",
md_content,
flags=re.MULTILINE,
)

# Remove leftover <span> tags with parameter/annotation classes
md_content = re.sub(
r'<span\s+class="parameter-[^"]*"[^>]*>(.*?)</span>',
r"\1",
md_content,
)

# Rewrite internal .html links to .md (relative paths only)
md_content = re.sub(
r"\]\((\.\./[^)]*?)\.html(\)?)",
r"](\1.md\2",
md_content,
)
# Also in the same directory
md_content = re.sub(
r"\]\(([A-Za-z0-9_][^):/]*?)\.html(\)?)",
r"](\1.md\2",
md_content,
)

# Simplify redundant ../current_dir/ paths to ./
file_dir = os.path.dirname(rel)
if file_dir:
# e.g. for user-guide/changelog.md, rewrite
# ../user-guide/foo.md → foo.md
escaped = re.escape("../" + file_dir + "/")
md_content = re.sub(
r"\]\(" + escaped + r"([^)]+)\)",
r"](\1)",
md_content,
)

# Remove leftover <span> tags (screen-reader, callout-icon, etc.)
md_content = re.sub(
r'<span\s+class="[^"]*">(.*?)</span>',
r"\1",
md_content,
)
# Remove empty <i> tags (callout icons)
md_content = re.sub(r"<i[^>]*></i>", "", md_content)

# Clean up excessive blank lines (3+ → 2)
md_content = re.sub(r"\n{4,}", "\n\n\n", md_content)

# Strip trailing whitespace
md_content = md_content.strip() + "\n"
# Debug: Check if post-processing is needed
found_artifact = False
if "``:``" in md_content:
# Find the context around the artifact
lines = md_content.split("\n")
for i, line in enumerate(lines):
if "``:``" in line:
print(f" DEBUG [{rel}:{i}]: Line with artifact = {repr(line[:150])}")
found_artifact = True
break
md_content = _postprocess_markdown_content(md_content, rel)
# Debug: Verify post-processing worked
if "``:``" in md_content and found_artifact:
lines = md_content.split("\n")
for i, line in enumerate(lines):
if "``:``" in line:
print(f" WARNING [{rel}:{i}]: After post-processing = {repr(line[:150])}")
break

# ── 5. Write .md file ────────────────────────────────────────
md_file = html_file.rsplit(".", 1)[0] + ".md"
Expand Down
Loading
Loading