Improve Content-Type parsing

Instead of looking for the "html" substring, actually parse the MIME type string. Don't use mime.ParseMediaType, though, because it returns an error for invalid duplicate parameters (e.g. "text/html; charset=UTF-8; charset=utf-8"), which do occur in the wild.
gocolly · Mar 27, 2024 · 31f0876 · 31f0876
1 parent 26a5648
commit 31f0876
Showing 1 changed file with 16 additions and 1 deletion.
diff --git a/colly.go b/colly.go
@@ -1117,9 +1117,24 @@ func (c *Collector) handleOnResponseHeaders(r *Response) {
 }
 
 func (c *Collector) handleOnHTML(resp *Response) error {
-	if len(c.htmlCallbacks) == 0 || !strings.Contains(strings.ToLower(resp.Headers.Get("Content-Type")), "html") {
+	if len(c.htmlCallbacks) == 0 {
 		return nil
 	}
+
+	contentType := resp.Headers.Get("Content-Type")
+	// implementation of mime.ParseMediaType without parsing the params
+	// part
+	mediatype, _, _ := strings.Cut(contentType, ";")
+	mediatype = strings.TrimSpace(strings.ToLower(mediatype))
+
+	// TODO we also want to parse application/xml as XHTML if it has
+	// appropriate doctype
+	switch mediatype {
+	case "text/html", "application/xhtml+xml":
+	default:
+		return nil
+	}
+
 	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
 	if err != nil {
 		return err