-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrap_html_test.go
111 lines (103 loc) · 2.75 KB
/
scrap_html_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
package main
import (
"fmt"
"io"
"net/url"
"os"
"reflect"
"sort"
"testing"
)
// openFile opens path for reading and returns the open file as an io.Reader.
//
// If the file cannot be opened, the calling test is failed immediately. The
// file is closed by a cleanup callback registered on t; an error from Close is
// logged but does not fail the test.
func openFile(t *testing.T, path string) io.Reader {
	// Mark this function as a helper so a failure is reported at the line of
	// the test that called openFile rather than inside this function.
	t.Helper()

	f, err := os.Open(path)
	if err != nil {
		t.Fatalf("Failed to open file %s: %v", path, err)
	}
	t.Cleanup(func() {
		if err := f.Close(); err != nil {
			t.Logf("Failed to close file %s: %v", path, err)
		}
	})
	return f
}
// urlsToStrings converts every URL in urls to its String form and returns the
// resulting strings sorted in increasing order.
//
// The returned slice is never nil: an empty or nil input yields an empty,
// non-nil slice.
func urlsToStrings(urls []*url.URL) []string {
	out := make([]string, len(urls))
	for i := range urls {
		out[i] = urls[i].String()
	}
	sort.Strings(out)
	return out
}
func TestScrap(t *testing.T) {
testcases := map[string]struct {
inHtml string
outUrls []string
}{
"No links": {
inHtml: "testdata/no_links.html",
outUrls: []string{},
},
"Absolute hrefs": {
inHtml: "testdata/absolute_hrefs.html",
outUrls: []string{
"http://localhost:8081/auckland.html",
"http://localhost:8081/christchurch.html",
"http://localhost:8081/wellington.html",
"http://localhost:8081/hamilton.html",
"http://localhost:8081/tauranga.html",
"http://localhost:8081/lowerhutt.html",
"http://localhost:8081/dunedin.html",
"http://localhost:8081/palmerstonnorth.html",
"http://localhost:8081/napier.html",
"http://localhost:8081/hibiscuscoast.html",
},
},
"Relative hrefs": {
inHtml: "testdata/relative_hrefs.html",
outUrls: []string{
"http://localhost:8081/auckland.html",
"http://localhost:8081/christchurch.html",
"http://localhost:8081/wellington.html",
"http://localhost:8081/hamilton.html",
"http://localhost:8081/tauranga.html",
"http://localhost:8081/lowerhutt.html",
"http://localhost:8081/dunedin.html",
"http://localhost:8081/palmerstonnorth.html",
"http://localhost:8081/napier.html",
"http://localhost:8081/hibiscuscoast.html",
},
},
"Relative hrefs with base": {
inHtml: "testdata/relative_hrefs_with_base.html",
outUrls: []string{
"http://island.nz/auckland.html",
"http://island.nz/christchurch.html",
"http://island.nz/wellington.html",
"http://island.nz/hamilton.html",
"http://island.nz/tauranga.html",
"http://island.nz/lowerhutt.html",
"http://island.nz/dunedin.html",
"http://island.nz/palmerstonnorth.html",
"http://island.nz/napier.html",
"http://island.nz/hibiscuscoast.html",
},
},
}
host := "http://localhost:8081/"
for name, tc := range testcases {
t.Run(name, func(t *testing.T) {
u, err := url.Parse(fmt.Sprint(host, tc.inHtml))
if err != nil {
t.Fatal("Failed to parse test url: ", err)
}
links := ScrapHtml(u, openFile(t, tc.inHtml))
slinks := urlsToStrings(links)
sort.Strings(slinks)
sort.Strings(tc.outUrls)
if !reflect.DeepEqual(tc.outUrls, slinks) {
t.Error("Failed to parse urls out of html: ", tc.outUrls, " != ", slinks)
}
})
}
}