diff --git a/colly.go b/colly.go
index 0496ab1c..ae74b7c3 100644
--- a/colly.go
+++ b/colly.go
@@ -1122,6 +1122,9 @@ func (c *Collector) handleOnHTML(resp *Response) error {
 	}
 
 	contentType := resp.Headers.Get("Content-Type")
+	if contentType == "" { // no Content-Type header: sniff the type from the body instead
+		contentType = http.DetectContentType(resp.Body)
+	}
 	// implementation of mime.ParseMediaType without parsing the params
 	// part
 	mediatype, _, _ := strings.Cut(contentType, ";")
diff --git a/colly_test.go b/colly_test.go
index 2382ecb1..e70d2774 100644
--- a/colly_test.go
+++ b/colly_test.go
@@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
 	})
 
 	mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
-		w.Header().Set("Content-Type", "text/html")
+		if r.URL.Query().Get("no-content-type") != "" {
+			w.Header()["Content-Type"] = nil // nil entry keeps net/http from adding its own sniffed Content-Type
+		} else {
+			w.Header().Set("Content-Type", "text/html")
+		}
 		w.Write([]byte(`
@@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
 	}
 }
 
+func TestCollectorContentSniffing(t *testing.T) {
+	ts := newTestServer()
+	defer ts.Close()
+	// Checks that OnHTML callbacks still run for a response that has no Content-Type header.
+	c := NewCollector()
+
+	htmlCallbackCalled := false
+	// Guard: confirm the response really arrived without a Content-Type header.
+	c.OnResponse(func(r *Response) {
+		if (*r.Headers)["Content-Type"] != nil {
+			t.Error("Content-Type unexpectedly not nil")
+		}
+	})
+
+	c.OnHTML("html", func(e *HTMLElement) {
+		htmlCallbackCalled = true
+	})
+	// The query parameter makes the /html test handler omit the Content-Type header.
+	err := c.Visit(ts.URL + "/html?no-content-type=yes")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !htmlCallbackCalled {
+		t.Error("OnHTML was not called")
+	}
+}
+
 func TestCollectorURLRevisit(t *testing.T) {
 	ts := newTestServer()
 	defer ts.Close()