Skip to content

Commit

Permalink
Check base tag for absolute urls
Browse files Browse the repository at this point in the history
  • Loading branch information
StJudeWasHere committed Feb 26, 2025
1 parent 74bd0bf commit 210d4dc
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 5 deletions.
60 changes: 60 additions & 0 deletions internal/services/html_parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -501,3 +501,63 @@ func TestEmptyBody(t *testing.T) {
t.Errorf("pageReport status code should be %d but received %d", statusCode, pageReport.StatusCode)
}
}

func TestBase(t *testing.T) {
u, err := url.Parse("https://example-base.com")
if err != nil {
fmt.Println(err)
}

body := []byte(
`<html>
<head><base href="https://example-base.com/test"></head>
<body><a href="/page.html">link</a></body>
</html>`)
statusCode := 200
headers := &http.Header{
"Content-Type": []string{"text/html"},
}

pageReport, _, err := services.NewHTMLParser(u, statusCode, headers, body, int64(len(body)))
if err != nil {
t.Fatal(err)
}

if len(pageReport.Links) != 1 {
t.Fatal("A link with base URL was expected")
}

if pageReport.Links[0].URL != "https://example-base.com/test/page.html" {
t.Errorf("Link with base URL does not match, got %s", pageReport.Links[0].URL)
}
}

// TestBaseRelativeURL checks that a root-relative link is resolved using a
// relative <base> href combined with the URL of the page itself.
func TestBaseRelativeURL(t *testing.T) {
	u, err := url.Parse(testURL)
	if err != nil {
		// A broken fixture URL must abort the test; continuing would hand a nil
		// URL to the parser.
		t.Fatal(err)
	}

	body := []byte(
		`<html>
<head><base href="/test"></head>
<body><a href="/page.html">link</a></body>
</html>`)
	statusCode := 200
	headers := &http.Header{
		"Content-Type": []string{"text/html"},
	}

	pageReport, _, err := services.NewHTMLParser(u, statusCode, headers, body, int64(len(body)))
	if err != nil {
		t.Fatal(err)
	}

	// Guard the index access below.
	if len(pageReport.Links) != 1 {
		t.Fatal("A link with base URL was expected")
	}

	if pageReport.Links[0].URL != "https://example.com/test/page.html" {
		t.Errorf("Link with base URL does not match, got %s", pageReport.Links[0].URL)
	}
}
38 changes: 33 additions & 5 deletions internal/services/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -645,23 +645,51 @@ func (p *Parser) newLink(n *html.Node) (models.Link, error) {
return l, nil
}

// absoluteURL returns s as an absolute URL with the fragment removed, taking
// into account the document's base tag if it exists.
//
// When the document has a base tag and s is a relative reference (no scheme and
// no host), the path of s is appended to the base href path, so a base of
// "/test" and an href of "/page.html" yield "/test/page.html". If the base href
// is itself absolute it replaces the page URL as the resolution base. References
// that already carry a scheme or host are left untouched by the base tag.
//
// It returns an error if s cannot be parsed or if the resulting scheme is
// neither http nor https.
func (p *Parser) absoluteURL(s string) (*url.URL, error) {
	u, err := url.Parse(strings.TrimSpace(s))
	if err != nil {
		return nil, err
	}

	base := p.ParsedURL
	if htmlBase, err := p.htmlBase(); err == nil {
		// Only relative references are affected by the base tag. Joining an
		// absolute or scheme-only reference (https://other.host/x, mailto:...)
		// would discard its scheme and host and turn it into a link to the
		// base URL.
		if u.Scheme == "" && u.Host == "" {
			joined := htmlBase.JoinPath(u.Path)
			// JoinPath copies the base URL, so carry over the query of the
			// reference instead of keeping the one from the base href.
			joined.RawQuery = u.RawQuery
			joined.ForceQuery = u.ForceQuery
			u = joined
		}

		if htmlBase.IsAbs() {
			base = htmlBase
		}
	}

	a := base.ResolveReference(u)
	a.Fragment = ""
	if a.Path == "" {
		// Normalize host-only URLs so they always have a root path.
		a.Path = "/"
	}

	if a.Scheme != "http" && a.Scheme != "https" {
		return nil, errors.New("protocol not supported")
	}

	return a, nil
}

// htmlBase returns the parsed href of the base element matched by the XPath
// expression //head/base[@href] in the parsed document. An error is returned if
// the query fails, if no such element is found, or if the href value cannot be
// parsed as a URL.
func (p *Parser) htmlBase() (*url.URL, error) {
	node, queryErr := htmlquery.Query(p.doc, "//head/base[@href]")
	switch {
	case queryErr != nil:
		return nil, queryErr
	case node == nil:
		return nil, errors.New("document does not have a base tag")
	}

	// Surrounding whitespace in the attribute value is ignored before parsing.
	rawHref := strings.TrimSpace(htmlquery.SelectAttr(node, "href"))

	return url.Parse(rawHref)
}

0 comments on commit 210d4dc

Please sign in to comment.