Skip to content

Commit

Permalink
Implement content sniffing for HTML parsing
Browse files Browse the repository at this point in the history
Web pages can be served without a Content-Type header, in which case
browsers fall back to content sniffing. Do the same in Colly.
  • Loading branch information
WGH- committed Mar 25, 2024
1 parent 73a1f68 commit 40d3e41
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 1 deletion.
3 changes: 3 additions & 0 deletions colly.go
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,9 @@ func (c *Collector) handleOnHTML(resp *Response) error {
}

contentType := resp.Headers.Get("Content-Type")
if contentType == "" {
contentType = http.DetectContentType(resp.Body)
}
// implementation of mime.ParseMediaType without parsing the params
// part
mediatype, _, _ := strings.Cut(contentType, ";")

Check failure on line 1130 in colly.go

View workflow job for this annotation

GitHub Actions / Build 1.17

undefined: strings.Cut

Check failure on line 1130 in colly.go

View workflow job for this annotation

GitHub Actions / Test 1.17

undefined: strings.Cut
Expand Down
34 changes: 33 additions & 1 deletion colly_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ func newUnstartedTestServer() *httptest.Server {
})

mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
if r.URL.Query().Get("no-content-type") != "" {
w.Header()["Content-Type"] = nil
} else {
w.Header().Set("Content-Type", "text/html")
}
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
Expand Down Expand Up @@ -627,6 +631,34 @@ func TestCollectorOnHTML(t *testing.T) {
}
}

// TestCollectorContentSniffing requests the test server's /html endpoint with
// the no-content-type query parameter, which makes the handler leave the
// Content-Type header unset, and verifies that the OnHTML callback still runs.
func TestCollectorContentSniffing(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	var sawHTML bool
	collector := NewCollector()

	// Confirm the premise of the test: the response must arrive without a
	// Content-Type header, otherwise the OnHTML check below proves nothing.
	collector.OnResponse(func(resp *Response) {
		if values := (*resp.Headers)["Content-Type"]; values != nil {
			t.Error("Content-Type unexpectedly not nil")
		}
	})

	collector.OnHTML("html", func(_ *HTMLElement) {
		sawHTML = true
	})

	if err := collector.Visit(server.URL + "/html?no-content-type=yes"); err != nil {
		t.Fatal(err)
	}

	if !sawHTML {
		t.Error("OnHTML was not called")
	}
}

func TestCollectorURLRevisit(t *testing.T) {
ts := newTestServer()
defer ts.Close()
Expand Down

0 comments on commit 40d3e41

Please sign in to comment.