diff --git a/great_docs/assets/post-render.py b/great_docs/assets/post-render.py index 673bd0a..5a52d1b 100644 --- a/great_docs/assets/post-render.py +++ b/great_docs/assets/post-render.py @@ -1,4 +1,5 @@ import glob +import html import json import os import re @@ -3922,6 +3923,156 @@ def fix_page_metadata_script_paths(): # widget (copy to clipboard / view as plain Markdown). +def _postprocess_markdown_content(md_content: str, rel: str) -> str: + """Apply markdown cleanup and link normalization after pandoc conversion.""" + + # Decode HTML entities that may have been double-encoded or improperly handled. + # This fixes issues like: ’ -> ', “ -> ", etc. + md_content = html.unescape(md_content) + + # Fix common UTF-8 mojibake sequences (UTF-8 bytes interpreted as Latin-1). + # When UTF-8 bytes are incorrectly decoded as Latin-1, we get mojibake patterns. + # E.g., U+2019 (') is bytes E2 80 99 in UTF-8, but decoded as Latin-1 becomes + # the three characters U+00E2 U+20AC U+2122, which looks like: ’ + mojibake_fixes = { + "\u00e2\u20ac\u2122": "\u2019", # Right single quotation mark + "\u00e2\u20ac\u009c": "\u201c", # Left double quotation mark + "\u00e2\u20ac\u009d": "\u201d", # Right double quotation mark + "\u00e2\u20ac\u0093": "\u2013", # En dash + "\u00e2\u20ac\u0094": "\u2014", # Em dash + # Alternate mojibake form with C1 control characters. + "\u00e2\u0080\u0099": "\u2019", + "\u00e2\u0080\u009c": "\u201c", + "\u00e2\u0080\u009d": "\u201d", + "\u00e2\u0080\u0093": "\u2013", + "\u00e2\u0080\u0094": "\u2014", + } + for mojibake, correct in mojibake_fixes.items(): + md_content = md_content.replace(mojibake, correct) + + # Normalize typography to plain ASCII for robust display/copy across + # environments that may not preserve UTF-8 metadata for raw .md files. + typography_fixes = { + "\u2018": "'", # Left single quotation mark + "\u2019": "'", # Right single quotation mark + "\u201c": '"', # Left double quotation mark + "\u201d": '"', # Right double quotation mark + "\u2013": "-", # En dash + "\u2014": "--", # Em dash + } + for src, dst in typography_fixes.items(): + md_content = md_content.replace(src, dst) + + # Remove standalone Source links (HTML and markdown forms). + md_content = re.sub( + r"^\s*]*>\s*source\s*\s*\n?", + "", + md_content, + flags=re.IGNORECASE | re.MULTILINE, + ) + md_content = re.sub( + r"^\s*\[source\]\([^\n)]+\)\s*\n?", + "", + md_content, + flags=re.IGNORECASE | re.MULTILINE, + ) + + # Convert simple HTML anchors to markdown links. + # Keep this conservative: only convert when anchor text has no nested tags. + def _anchor_to_md(m): + href = m.group("href").strip() + text = m.group("text") + if "<" in text or ">" in text: + return m.group(0) + text = html.unescape(text).strip() + if not text: + return m.group(0) + return f"[{text}]({href})" + + md_content = re.sub( + r']*href="(?P[^"]+)"[^>]*>(?P.*?)', + _anchor_to_md, + md_content, + flags=re.IGNORECASE | re.DOTALL, + ) + + # Normalize parameter signature artifacts created by adjacent code spans. + # These can appear in various formats depending on how pandoc combined the spans. + # Pattern 1: `name``:`` ``type`` ``=`` ``default`` - full three-part with optional closing backtick + md_content = re.sub( + r"`([^`\n]+)``:``\s*``([^`\n]+)``\s*``=``\s*``([^`]+?)``?", + r"`\1`: `\2` = `\3`", + md_content, + ) + # Pattern 2: `name``:`` ``type`` - just name and type + md_content = re.sub( + r"`([^`\n]+)``:``\s*``([^`\n]+)``", + r"`\1`: `\2`", + md_content, + ) + + # Remove leftover HTML div wrappers that pandoc preserved. + md_content = re.sub( + r"^]*>\s*$", + "", + md_content, + flags=re.MULTILINE, + ) + md_content = re.sub( + r"^\s*$", + "", + md_content, + flags=re.MULTILINE, + ) + + # Remove leftover tags with parameter/annotation classes. + md_content = re.sub( + r']*>(.*?)', + r"\1", + md_content, + ) + + # Rewrite internal .html links to .md (relative paths only). + md_content = re.sub( + r"\]\((\.\./[^)]*?)\.html(\)?)", + r"](\1.md\2", + md_content, + ) + # Also in the same directory. + md_content = re.sub( + r"\]\(([A-Za-z0-9_][^):/]*?)\.html(\)?)", + r"](\1.md\2", + md_content, + ) + + # Simplify redundant ../current_dir/ paths to ./ + file_dir = os.path.dirname(rel) + if file_dir: + escaped = re.escape("../" + file_dir + "/") + md_content = re.sub( + r"\]\(" + escaped + r"([^)]+)\)", + r"](\1)", + md_content, + ) + + # Remove leftover tags (screen-reader, callout-icon, etc.) + md_content = re.sub( + r'(.*?)', + r"\1", + md_content, + ) + # Remove empty tags (callout icons) + md_content = re.sub(r"]*>", "", md_content) + + # Clean up excessive blank lines (3+ → 2) + md_content = re.sub(r"\n{4,}", "\n\n\n", md_content) + + # Strip trailing whitespace + md_content = md_content.strip() + "\n" + + return md_content + + def generate_markdown_pages(): """ Create a .md companion for every .html page in _site/. @@ -4174,9 +4325,9 @@ def _param_dl_to_html(m): dl_html = m.group(0) items = [] dt_dd_pattern = re.compile( - r'
.*?\s*(.*?)\s*' - r'(?:.*?(.*?))?' - r'(?:.*?(.*?))?' + r'
.*?\s*(.*?)\s*' + r'(?:.*?(.*?))?' + r'(?:.*?(.*?))?' r".*?
\s*
\s*(.*?)\s*
", re.DOTALL, ) @@ -4232,9 +4383,10 @@ def _param_dl_to_html(m): main_html, ) - # Remove parameter-* spans (parameter-name, parameter-annotation, etc.) + # Remove parameter-* spans (parameter-name, doc-parameter-name, parameter-annotation, etc.) + # Note: The HTML may have either 'parameter-*' or 'doc-parameter-*' class names main_html = re.sub( - r']*>(.*?)', + r']*>(.*?)', r"\1", main_html, ) @@ -4261,68 +4413,24 @@ def _param_dl_to_html(m): continue md_content = result.stdout - - # ── 4b. Post-pandoc cleanup ────────────────────────────────── - # Remove leftover HTML div wrappers that pandoc preserved - md_content = re.sub( - r"^]*>\s*$", - "", - md_content, - flags=re.MULTILINE, - ) - md_content = re.sub( - r"^\s*$", - "", - md_content, - flags=re.MULTILINE, - ) - - # Remove leftover tags with parameter/annotation classes - md_content = re.sub( - r']*>(.*?)', - r"\1", - md_content, - ) - - # Rewrite internal .html links to .md (relative paths only) - md_content = re.sub( - r"\]\((\.\./[^)]*?)\.html(\)?)", - r"](\1.md\2", - md_content, - ) - # Also in the same directory - md_content = re.sub( - r"\]\(([A-Za-z0-9_][^):/]*?)\.html(\)?)", - r"](\1.md\2", - md_content, - ) - - # Simplify redundant ../current_dir/ paths to ./ - file_dir = os.path.dirname(rel) - if file_dir: - # e.g. for user-guide/changelog.md, rewrite - # ../user-guide/foo.md → foo.md - escaped = re.escape("../" + file_dir + "/") - md_content = re.sub( - r"\]\(" + escaped + r"([^)]+)\)", - r"](\1)", - md_content, - ) - - # Remove leftover tags (screen-reader, callout-icon, etc.) - md_content = re.sub( - r'(.*?)', - r"\1", - md_content, - ) - # Remove empty tags (callout icons) - md_content = re.sub(r"]*>", "", md_content) - - # Clean up excessive blank lines (3+ → 2) - md_content = re.sub(r"\n{4,}", "\n\n\n", md_content) - - # Strip trailing whitespace - md_content = md_content.strip() + "\n" + # Debug: Check if post-processing is needed + found_artifact = False + if "``:``" in md_content: + # Find the context around the artifact + lines = md_content.split("\n") + for i, line in enumerate(lines): + if "``:``" in line: + print(f" DEBUG [{rel}:{i}]: Line with artifact = {repr(line[:150])}") + found_artifact = True + break + md_content = _postprocess_markdown_content(md_content, rel) + # Debug: Verify post-processing worked + if "``:``" in md_content and found_artifact: + lines = md_content.split("\n") + for i, line in enumerate(lines): + if "``:``" in line: + print(f" WARNING [{rel}:{i}]: After post-processing = {repr(line[:150])}") + break # ── 5. Write .md file ──────────────────────────────────────── md_file = html_file.rsplit(".", 1)[0] + ".md" diff --git a/tests/test_post_render.py b/tests/test_post_render.py index d371c93..b29e850 100644 --- a/tests/test_post_render.py +++ b/tests/test_post_render.py @@ -22,6 +22,8 @@ def _load_post_render(): def _get_functions(): """Extract translate_sphinx_roles and translate_rst_directives via exec.""" + import html as _html + import os as _os import re as _re # noqa: F811 source = _SCRIPT.read_text() @@ -36,6 +38,8 @@ def _t(key: str, fallback: str | None = None) -> str: # Build a minimal namespace with the imports the functions need ns = { + "html": _html, + "os": _os, "re": _re, "__builtins__": __builtins__, "highlight": _highlight, @@ -57,6 +61,7 @@ def _t(key: str, fallback: str | None = None) -> str: "translate_rst_directives", "translate_rst_math", "fix_plain_doctest_code_blocks", + "_postprocess_markdown_content", ] for func_name in funcs_to_extract: @@ -87,6 +92,7 @@ def _t(key: str, fallback: str | None = None) -> str: ns["translate_rst_directives"], ns["translate_rst_math"], ns["fix_plain_doctest_code_blocks"], + ns["_postprocess_markdown_content"], ) @@ -95,9 +101,92 @@ def _t(key: str, fallback: str | None = None) -> str: translate_rst_directives, translate_rst_math, fix_plain_doctest_code_blocks, + postprocess_markdown_content, ) = _get_functions() +class TestPostprocessMarkdownContent: + """Tests for markdown cleanup used by generated .md reference pages.""" + + def test_removes_source_anchor_and_converts_links(self): + md = ( + "Usage\n\n" + 'Source\n\n' + "The workflow is: " + 'install()' + " then " + 'build().\n' + ) + + out = postprocess_markdown_content(md, "reference/GreatDocs.md") + + assert "Source" not in out + assert "[install()](GreatDocs.install.md#great_docs.GreatDocs.install)" in out + assert "[build()](GreatDocs.build.md#great_docs.GreatDocs.build)" in out + assert "