From 56a70e85f09b08bfc3c5237a3fb88d114afc6727 Mon Sep 17 00:00:00 2001 From: Michael Matloob <matloob@golang.org> Date: Wed, 6 Dec 2023 16:00:10 -0500 Subject: [PATCH] internal/frontend: change id generation to use parsed markdown text To produce heading ids that match between the goldmark version of the code and the rsc.io/markdown version of the code, use the markdown parser to parse the markdown and then extract the text from it. We do this because rsc.io/markdown doesn't provide the raw markdown for us to generate the ids with. This will change the ids that are generated for some headings. For golang/go#61399 Change-Id: Id0f26b311b59e848ff1753e058d413ed3168926d Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/548255 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> kokoro-CI: kokoro <noreply+kokoro@google.com> Reviewed-by: Jonathan Amsterdam <jba@google.com> --- internal/frontend/goldmark.go | 37 ++++++++++++++++++++++---------- internal/frontend/markdown.go | 28 +++++++++++++----------- internal/frontend/readme_test.go | 16 ++++++++++++-- 3 files changed, 55 insertions(+), 26 deletions(-) diff --git a/internal/frontend/goldmark.go b/internal/frontend/goldmark.go index deb39f5d6..22f6e2cd1 100644 --- a/internal/frontend/goldmark.go +++ b/internal/frontend/goldmark.go @@ -10,7 +10,6 @@ import ( "bytes" "context" "fmt" - "regexp" "strings" "github.com/yuin/goldmark/ast" @@ -22,6 +21,7 @@ import ( "golang.org/x/pkgsite/internal" "golang.org/x/pkgsite/internal/log" "golang.org/x/pkgsite/internal/source" + "rsc.io/markdown" ) // astTransformer is a default transformer of the goldmark tree. We pass in @@ -185,20 +185,35 @@ func newIDs() parser.IDs { // unit page. Duplicated heading ids are given an incremental suffix. See // readme_test.go for examples. func (s *ids) Generate(value []byte, kind ast.NodeKind) []byte { - // Matches strings like `<tag attr="value">Text</tag>` or `[](link.html)`. - r := regexp.MustCompile(`(<[^<>]+>|\[\!\[[^\]]+]\([^\)]+\)\]\([^\)]+\))`) - str := r.ReplaceAllString(string(value), "") + var defaultID string + if kind == ast.KindHeading { + defaultID = "heading" + } else { + defaultID = "id" + } + + parser := &markdown.Parser{} + doc := parser.Parse("# " + string(value)) + return []byte(s.generateID(doc, defaultID)) +} + +func (s *ids) generateID(block markdown.Block, defaultID string) string { + var buf bytes.Buffer + walkBlocks([]markdown.Block{block}, func(b markdown.Block) error { + if t, ok := b.(*markdown.Text); ok { + for _, inl := range t.Inline { + inl.PrintText(&buf) + } + } + return nil + }) f := func(c rune) bool { return !('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z') && !('0' <= c && c <= '9') } - str = strings.Join(strings.FieldsFunc(str, f), "-") + str := strings.Join(strings.FieldsFunc(buf.String(), f), "-") str = strings.ToLower(str) if len(str) == 0 { - if kind == ast.KindHeading { - str = "heading" - } else { - str = "id" - } + str = defaultID } key := str for i := 1; ; i++ { @@ -208,7 +223,7 @@ func (s *ids) Generate(value []byte, kind ast.NodeKind) []byte { } key = fmt.Sprintf("%s-%d", str, i) } - return []byte("readme-" + key) + return "readme-" + key } // Put implements Put from the goldmark parser IDs interface. diff --git a/internal/frontend/markdown.go b/internal/frontend/markdown.go index 8f2da6cc3..98072e336 100644 --- a/internal/frontend/markdown.go +++ b/internal/frontend/markdown.go @@ -12,7 +12,6 @@ import ( "strings" "github.com/google/safehtml/template" - "github.com/yuin/goldmark/ast" "golang.org/x/pkgsite/internal" "golang.org/x/pkgsite/internal/derrors" "golang.org/x/pkgsite/internal/log" @@ -118,6 +117,8 @@ func walkBlocks(blocks []markdown.Block, walkFunc func(b markdown.Block) error) err = nil switch x := b.(type) { + case *markdown.Document: + err = walkBlocks(x.Blocks, walkFunc) case *markdown.Text: case *markdown.Paragraph: err = walkBlocks([]markdown.Block{x.Text}, walkFunc) @@ -130,7 +131,9 @@ func walkBlocks(blocks []markdown.Block, walkFunc func(b markdown.Block) error) case *markdown.Quote: err = walkBlocks(x.Blocks, walkFunc) case *markdown.HTMLBlock: - continue + case *markdown.CodeBlock: + case *markdown.Empty: + case *markdown.ThematicBreak: default: return fmt.Errorf("unhandled block type %T", x) } @@ -287,6 +290,12 @@ func transformHeadingsToHTML(doc *markdown.Document) { rewriteHeadingsBlocks = func(blocks []markdown.Block) { for i, b := range blocks { switch x := b.(type) { + case *markdown.Paragraph: + rewriteHeadingsBlocks([]markdown.Block{x.Text}) + case *markdown.List: + rewriteHeadingsBlocks(x.Items) + case *markdown.Item: + rewriteHeadingsBlocks(x.Blocks) case *markdown.Quote: rewriteHeadingsBlocks(x.Blocks) case *markdown.Heading: @@ -338,19 +347,12 @@ var htmlQuoteEscaper = strings.NewReplacer( // function, but we don't have the raw markdown anymore, so we use the // text instead. func rewriteHeadingIDs(doc *markdown.Document) { - ids := newIDs() + ids := &ids{ + values: map[string]bool{}, + } walkBlocks(doc.Blocks, func(b markdown.Block) error { if heading, ok := b.(*markdown.Heading); ok { - var buf bytes.Buffer - for _, inl := range heading.Text.Inline { - // Hack: use HTML because ids strips out html tags. - // TODO(matloob): change the goldmark code to not use - // raw markdown text and instead depend on the text of the - // nodes. - inl.PrintHTML(&buf) - } - - id := ids.Generate(buf.Bytes(), ast.KindHeading) + id := ids.generateID(heading, "heading") heading.ID = string(id) } return nil diff --git a/internal/frontend/readme_test.go b/internal/frontend/readme_test.go index e7abd1c32..df162c44d 100644 --- a/internal/frontend/readme_test.go +++ b/internal/frontend/readme_test.go @@ -414,9 +414,9 @@ func TestReadme(t *testing.T) { Contents: `# [](link.html) `, }, - wantHTML: `<h3 class="h1" id="readme-heading"><a href="https://github.com/valid/module_name/blob/v1.0.0/link.html" rel="nofollow"><img src="https://github.com/valid/module_name/raw/v1.0.0/file.svg" alt="Image Text"/></a></h3>`, + wantHTML: `<h3 class="h1" id="readme-image-text"><a href="https://github.com/valid/module_name/blob/v1.0.0/link.html" rel="nofollow"><img src="https://github.com/valid/module_name/raw/v1.0.0/file.svg" alt="Image Text"/></a></h3>`, wantOutline: []*Heading{ - {Level: 1, Text: "Image Text", ID: "readme-heading"}, + {Level: 1, Text: "Image Text", ID: "readme-image-text"}, }, }, { @@ -457,6 +457,18 @@ func TestReadme(t *testing.T) { {Level: 1, Text: "Heading", ID: "readme-heading-3"}, }, }, + { + name: "tag in heading", + unit: unit, + readme: &internal.Readme{ + Filepath: "README.md", + Contents: `# A link <a href="link">link</a>`, + }, + wantHTML: `<h3 class="h1" id="readme-a-link-link">A link <a href="link" rel="nofollow">link</a></h3>`, + wantOutline: []*Heading{ + {Level: 1, Text: "A link link", ID: "readme-a-link-link"}, + }, + }, } { t.Run(test.name, func(t *testing.T) { processReadmes := map[string]func(ctx context.Context, u *internal.Unit) (frontendReadme *Readme, err error){