Skip to content

Commit

Permalink
Improve Content-Type parsing
Browse files Browse the repository at this point in the history
Instead of looking for an "html" substring, actually parse the MIME type
string. Don't use mime.ParseMediaType, though, as it rejects headers with
invalid duplicate parameters (e.g. "text/html; charset=UTF-8; charset=utf-8")
that occur in the wild.
  • Loading branch information
WGH- committed Mar 25, 2024
1 parent 4ccfe78 commit 73a1f68
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -1117,9 +1117,24 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
}

func (c *Collector) handleOnHTML(resp *Response) error {
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
if len(c.htmlCallbacks) == 0 {
return nil
}

contentType := resp.Headers.Get("Content-Type")
// implementation of mime.ParseMediaType without parsing the params
// part
mediatype, _, _ := strings.Cut(contentType, ";")
mediatype = strings.TrimSpace(strings.ToLower(mediatype))

// TODO we also want to parse application/xml as XHTML if it has
// appropriate doctype
switch mediatype {
case "text/html", "application/xhtml+xml":
default:
return nil
}

doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
if err != nil {
return err
Expand Down

0 comments on commit 73a1f68

Please sign in to comment.