diff --git a/great_docs/assets/post-render.py b/great_docs/assets/post-render.py
index 673bd0a..5a52d1b 100644
--- a/great_docs/assets/post-render.py
+++ b/great_docs/assets/post-render.py
@@ -1,4 +1,5 @@
import glob
+import html
import json
import os
import re
@@ -3922,6 +3923,156 @@ def fix_page_metadata_script_paths():
# widget (copy to clipboard / view as plain Markdown).
+def _postprocess_markdown_content(md_content: str, rel: str) -> str:
+ """Apply markdown cleanup and link normalization after pandoc conversion."""
+
+ # Decode HTML entities that may have been double-encoded or improperly handled.
+ # E.g. &rsquo; decodes to U+2019 and &ldquo; to U+201C; both are folded to ASCII quotes further down.
+ md_content = html.unescape(md_content)
+
+ # Fix common UTF-8 mojibake sequences (UTF-8 bytes decoded with a legacy 8-bit codec).
+ # E.g., U+2019 is bytes E2 80 99 in UTF-8. Decoded as Windows-1252 they become
+ # U+00E2 U+20AC U+2122; decoded as Latin-1 they become U+00E2 U+0080 U+0099.
+ # NOTE(review): under Windows-1252 the bytes 9C/93/94 decode to U+0153/U+201C/U+201D, not C1 controls - confirm the table covers those forms.
+ mojibake_fixes = {
+ "\u00e2\u20ac\u2122": "\u2019", # Right single quotation mark
+ "\u00e2\u20ac\u009c": "\u201c", # Left double quotation mark
+ "\u00e2\u20ac\u009d": "\u201d", # Right double quotation mark
+ "\u00e2\u20ac\u0093": "\u2013", # En dash
+ "\u00e2\u20ac\u0094": "\u2014", # Em dash
+ # Alternate mojibake form with C1 control characters.
+ "\u00e2\u0080\u0099": "\u2019",
+ "\u00e2\u0080\u009c": "\u201c",
+ "\u00e2\u0080\u009d": "\u201d",
+ "\u00e2\u0080\u0093": "\u2013",
+ "\u00e2\u0080\u0094": "\u2014",
+ }
+ for mojibake, correct in mojibake_fixes.items():
+ md_content = md_content.replace(mojibake, correct)
+
+ # Normalize typography to plain ASCII for robust display/copy across
+ # environments that may not preserve UTF-8 metadata for raw .md files.
+ typography_fixes = {
+ "\u2018": "'", # Left single quotation mark
+ "\u2019": "'", # Right single quotation mark
+ "\u201c": '"', # Left double quotation mark
+ "\u201d": '"', # Right double quotation mark
+ "\u2013": "-", # En dash
+ "\u2014": "--", # Em dash
+ }
+ for src, dst in typography_fixes.items():
+ md_content = md_content.replace(src, dst)
+
+ # Remove standalone Source links (HTML and markdown forms).
+ md_content = re.sub(
+ r"^\s*]*>\s*source\s*\s*\n?",
+ "",
+ md_content,
+ flags=re.IGNORECASE | re.MULTILINE,
+ )
+ md_content = re.sub(
+ r"^\s*\[source\]\([^\n)]+\)\s*\n?",
+ "",
+ md_content,
+ flags=re.IGNORECASE | re.MULTILINE,
+ )
+
+ # Convert simple HTML anchors to markdown links.
+ # Keep this conservative: only convert when anchor text has no nested tags.
+ def _anchor_to_md(m):
+ href = m.group("href").strip()
+ text = m.group("text")
+ if "<" in text or ">" in text:
+ return m.group(0)
+ text = html.unescape(text).strip()
+ if not text:
+ return m.group(0)
+ return f"[{text}]({href})"
+
+ md_content = re.sub(
+ r']*href="(?P[^"]+)"[^>]*>(?P.*?)',
+ _anchor_to_md,
+ md_content,
+ flags=re.IGNORECASE | re.DOTALL,
+ )
+
+ # Normalize parameter signature artifacts created by adjacent code spans.
+ # These can appear in various formats depending on how pandoc combined the spans.
+ # Pattern 1: `name``:`` ``type`` ``=`` ``default`` - full three-part with optional closing backtick
+ md_content = re.sub(
+ r"`([^`\n]+)``:``\s*``([^`\n]+)``\s*``=``\s*``([^`]+?)``?",
+ r"`\1`: `\2` = `\3`",
+ md_content,
+ )
+ # Pattern 2: `name``:`` ``type`` - just name and type
+ md_content = re.sub(
+ r"`([^`\n]+)``:``\s*``([^`\n]+)``",
+ r"`\1`: `\2`",
+ md_content,
+ )
+
+ # Remove leftover HTML div wrappers that pandoc preserved.
+ md_content = re.sub(
+ r"^
]*>\s*$",
+ "",
+ md_content,
+ flags=re.MULTILINE,
+ )
+ md_content = re.sub(
+ r"^
\s*$",
+ "",
+ md_content,
+ flags=re.MULTILINE,
+ )
+
+ # Remove leftover <span> tags with parameter/annotation classes.
+ md_content = re.sub(
+ r']*>(.*?)',
+ r"\1",
+ md_content,
+ )
+
+ # Rewrite internal .html links to .md (relative paths only).
+ md_content = re.sub(
+ r"\]\((\.\./[^)]*?)\.html(\)?)",
+ r"](\1.md\2",
+ md_content,
+ )
+ # Also in the same directory.
+ md_content = re.sub(
+ r"\]\(([A-Za-z0-9_][^):/]*?)\.html(\)?)",
+ r"](\1.md\2",
+ md_content,
+ )
+
+ # Simplify redundant ../current_dir/name links to just name (already relative to this directory).
+ file_dir = os.path.dirname(rel)
+ if file_dir:
+ escaped = re.escape("../" + file_dir + "/")
+ md_content = re.sub(
+ r"\]\(" + escaped + r"([^)]+)\)",
+ r"](\1)",
+ md_content,
+ )
+
+ # Remove leftover tags (screen-reader, callout-icon, etc.)
+ md_content = re.sub(
+ r'(.*?)',
+ r"\1",
+ md_content,
+ )
+ # Remove empty tags (callout icons)
+ md_content = re.sub(r"]*>", "", md_content)
+
+ # Clean up excessive blank lines (3+ → 2)
+ md_content = re.sub(r"\n{4,}", "\n\n\n", md_content)
+
+ # Strip leading and trailing whitespace, then end the document with a single newline
+ md_content = md_content.strip() + "\n"
+
+ return md_content
+
+
def generate_markdown_pages():
"""
Create a .md companion for every .html page in _site/.
@@ -4174,9 +4325,9 @@ def _param_dl_to_html(m):
dl_html = m.group(0)
items = []
dt_dd_pattern = re.compile(
- r'.*?\s*(.*?)\s*'
- r'(?:.*?(.*?))?'
- r'(?:.*?(.*?))?'
+ r'.*?\s*(.*?)\s*'
+ r'(?:.*?(.*?))?'
+ r'(?:.*?(.*?))?'
r".*?\s*\s*(.*?)\s*",
re.DOTALL,
)
@@ -4232,9 +4383,10 @@ def _param_dl_to_html(m):
main_html,
)
- # Remove parameter-* spans (parameter-name, parameter-annotation, etc.)
+ # Remove parameter-* spans (parameter-name, doc-parameter-name, parameter-annotation, etc.)
+ # Note: The HTML may have either 'parameter-*' or 'doc-parameter-*' class names
main_html = re.sub(
- r']*>(.*?)',
+ r']*>(.*?)',
r"\1",
main_html,
)
@@ -4261,68 +4413,24 @@ def _param_dl_to_html(m):
continue
md_content = result.stdout
-
- # ── 4b. Post-pandoc cleanup ──────────────────────────────────
- # Remove leftover HTML div wrappers that pandoc preserved
- md_content = re.sub(
- r"^]*>\s*$",
- "",
- md_content,
- flags=re.MULTILINE,
- )
- md_content = re.sub(
- r"^
\s*$",
- "",
- md_content,
- flags=re.MULTILINE,
- )
-
- # Remove leftover tags with parameter/annotation classes
- md_content = re.sub(
- r']*>(.*?)',
- r"\1",
- md_content,
- )
-
- # Rewrite internal .html links to .md (relative paths only)
- md_content = re.sub(
- r"\]\((\.\./[^)]*?)\.html(\)?)",
- r"](\1.md\2",
- md_content,
- )
- # Also in the same directory
- md_content = re.sub(
- r"\]\(([A-Za-z0-9_][^):/]*?)\.html(\)?)",
- r"](\1.md\2",
- md_content,
- )
-
- # Simplify redundant ../current_dir/ paths to ./
- file_dir = os.path.dirname(rel)
- if file_dir:
- # e.g. for user-guide/changelog.md, rewrite
- # ../user-guide/foo.md → foo.md
- escaped = re.escape("../" + file_dir + "/")
- md_content = re.sub(
- r"\]\(" + escaped + r"([^)]+)\)",
- r"](\1)",
- md_content,
- )
-
- # Remove leftover tags (screen-reader, callout-icon, etc.)
- md_content = re.sub(
- r'(.*?)',
- r"\1",
- md_content,
- )
- # Remove empty tags (callout icons)
- md_content = re.sub(r"]*>", "", md_content)
-
- # Clean up excessive blank lines (3+ → 2)
- md_content = re.sub(r"\n{4,}", "\n\n\n", md_content)
-
- # Strip trailing whitespace
- md_content = md_content.strip() + "\n"
+ # Debug: Check if post-processing is needed
+ found_artifact = False
+ if "``:``" in md_content:
+ # Find the context around the artifact
+ lines = md_content.split("\n")
+ for i, line in enumerate(lines):
+ if "``:``" in line:
+ print(f" DEBUG [{rel}:{i}]: Line with artifact = {repr(line[:150])}")
+ found_artifact = True
+ break
+ md_content = _postprocess_markdown_content(md_content, rel)
+ # Debug: Verify post-processing worked
+ if "``:``" in md_content and found_artifact:
+ lines = md_content.split("\n")
+ for i, line in enumerate(lines):
+ if "``:``" in line:
+ print(f" WARNING [{rel}:{i}]: After post-processing = {repr(line[:150])}")
+ break
# ── 5. Write .md file ────────────────────────────────────────
md_file = html_file.rsplit(".", 1)[0] + ".md"
diff --git a/tests/test_post_render.py b/tests/test_post_render.py
index d371c93..b29e850 100644
--- a/tests/test_post_render.py
+++ b/tests/test_post_render.py
@@ -22,6 +22,8 @@ def _load_post_render():
def _get_functions():
"""Extract translate_sphinx_roles and translate_rst_directives via exec."""
+ import html as _html
+ import os as _os
import re as _re # noqa: F811
source = _SCRIPT.read_text()
@@ -36,6 +38,8 @@ def _t(key: str, fallback: str | None = None) -> str:
# Build a minimal namespace with the imports the functions need
ns = {
+ "html": _html,
+ "os": _os,
"re": _re,
"__builtins__": __builtins__,
"highlight": _highlight,
@@ -57,6 +61,7 @@ def _t(key: str, fallback: str | None = None) -> str:
"translate_rst_directives",
"translate_rst_math",
"fix_plain_doctest_code_blocks",
+ "_postprocess_markdown_content",
]
for func_name in funcs_to_extract:
@@ -87,6 +92,7 @@ def _t(key: str, fallback: str | None = None) -> str:
ns["translate_rst_directives"],
ns["translate_rst_math"],
ns["fix_plain_doctest_code_blocks"],
+ ns["_postprocess_markdown_content"],
)
@@ -95,9 +101,92 @@ def _t(key: str, fallback: str | None = None) -> str:
translate_rst_directives,
translate_rst_math,
fix_plain_doctest_code_blocks,
+ postprocess_markdown_content,
) = _get_functions()
+class TestPostprocessMarkdownContent:
+ """Tests for markdown cleanup used by generated .md reference pages."""
+
+ def test_removes_source_anchor_and_converts_links(self):
+ md = (
+ "Usage\n\n"
+ 'Source\n\n'
+ "The workflow is: "
+ 'install()'
+ " then "
+ 'build().\n'
+ )
+
+ out = postprocess_markdown_content(md, "reference/GreatDocs.md")
+
+ assert "Source" not in out
+ assert "[install()](GreatDocs.install.md#great_docs.GreatDocs.install)" in out
+ assert "[build()](GreatDocs.build.md#great_docs.GreatDocs.build)" in out
+ assert "