Skip to content

Commit

Permalink
Improve Content-Type parsing
Browse files Browse the repository at this point in the history
Instead of looking for the "html" substring, actually parse the MIME type
string. Don't use mime.ParseMediaType, though, as it returns an error on
invalid duplicate parameters (e.g. "text/html; charset=UTF-8; charset=utf-8")
that occur in the wild.
  • Loading branch information
WGH- committed Mar 27, 2024
1 parent 26a5648 commit 31f0876
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -1117,9 +1117,24 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
}

func (c *Collector) handleOnHTML(resp *Response) error {
if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
if len(c.htmlCallbacks) == 0 {
return nil
}

contentType := resp.Headers.Get("Content-Type")
// implementation of mime.ParseMediaType without parsing the params
// part
mediatype, _, _ := strings.Cut(contentType, ";")
mediatype = strings.TrimSpace(strings.ToLower(mediatype))

// TODO we also want to parse application/xml as XHTML if it has
// appropriate doctype
switch mediatype {
case "text/html", "application/xhtml+xml":
default:
return nil
}

doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
if err != nil {
return err
Expand Down

0 comments on commit 31f0876

Please sign in to comment.