
Commit 2daad72

Release/v1.5.3 (#40)
* code grooming, better usage info
1 parent ed2cdb2 commit 2daad72

16 files changed, +196 −166 lines changed


README.md

Lines changed: 2 additions & 2 deletions
@@ -53,15 +53,15 @@ crawley -headless -delay 0 -depth -1 -dirs only http://some-test.site
 
 # installation
 
-- [binaries](https://github.com/s0rg/crawley/releases) for Linux, FreeBSD, macOS and Windows, just download and run.
+- [binaries / deb / rpm](https://github.com/s0rg/crawley/releases) for Linux, FreeBSD, macOS and Windows.
 - [archlinux](https://aur.archlinux.org/packages/crawley-bin/) you can use your favourite AUR helper to install it, e. g. `paru -S crawley-bin`.
 
 # usage
 
 ```
 crawley [flags] url
 
-possible flags:
+possible flags with default values:
 
 -brute
     scan html comments

cmd/crawley/main.go

Lines changed: 85 additions & 48 deletions
@@ -8,7 +8,9 @@ import (
     "io"
     "log"
     "os"
+    "path/filepath"
     "runtime"
+    "strings"
     "time"
 
     "github.com/s0rg/crawley/pkg/crawler"
@@ -17,32 +19,30 @@ import (
 
 const (
     appName      = "Crawley"
+    appHelp      = "the unix-way web crawler"
     appSite      = "https://github.com/s0rg/crawley"
     defaultDelay = 150 * time.Millisecond
 )
 
+// build-time values.
 var (
-    GitHash   string
     GitTag    string
+    GitHash   string
     BuildDate string
     defaultUA = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + GitTag + "-" + GitHash
+)
 
-    cookies, headers values.Smart
-    tags, ignored    values.Simple
-
-    fDepth        = flag.Int("depth", 0, "scan depth (set -1 for unlimited)")
-    fWorkers      = flag.Int("workers", runtime.NumCPU(), "number of workers")
-    fBrute        = flag.Bool("brute", false, "scan html comments")
-    fNoHeads      = flag.Bool("headless", false, "disable pre-flight HEAD requests")
-    fScanJS       = flag.Bool("js", false, "scan js files for endpoints")
-    fSkipSSL      = flag.Bool("skip-ssl", false, "skip ssl verification")
-    fSilent       = flag.Bool("silent", false, "suppress info and error messages in stderr")
-    fVersion      = flag.Bool("version", false, "show version")
-    fDirsPolicy   = flag.String("dirs", "show", "policy for non-resource urls: show / hide / only")
-    fProxyAuth    = flag.String("proxy-auth", "", "credentials for proxy: user:password")
-    fRobotsPolicy = flag.String("robots", "ignore", "policy for robots.txt: ignore / crawl / respect")
-    fUA           = flag.String("user-agent", defaultUA, "user-agent string")
-    fDelay        = flag.Duration("delay", defaultDelay, "per-request delay (0 - disable)")
+// command-line flags.
+var (
+    fDepth, fWorkers        int
+    fSilent, fVersion       bool
+    fBrute, fNoHeads        bool
+    fSkipSSL, fScanJS       bool
+    fDirsPolicy, fProxyAuth string
+    fRobotsPolicy, fUA      string
+    fDelay                  time.Duration
+    cookies, headers        values.Smart
+    tags, ignored           values.List
 )
 
 func version() string {
@@ -56,6 +56,29 @@ func version() string {
     )
 }
 
+func usage() {
+    var sb strings.Builder
+
+    const twoCR = "\n\n"
+
+    sb.WriteString(appName)
+    sb.WriteString(" - ")
+    sb.WriteString(appHelp)
+    sb.WriteString(", usage:")
+    sb.WriteString(twoCR)
+
+    sb.WriteString(filepath.Base(os.Args[0]))
+    sb.WriteString(" [flags] url")
+    sb.WriteString(twoCR)
+
+    sb.WriteString("possible flags with default values:")
+    sb.WriteString(twoCR)
+
+    _, _ = os.Stderr.WriteString(sb.String())
+
+    flag.PrintDefaults()
+}
+
 func puts(s string) {
     _, _ = os.Stdout.WriteString(s + "\n")
 }
@@ -102,14 +125,14 @@ func loadSmart() (h, c []string, err error) {
 }
 
 func initOptions() (rv []crawler.Option, err error) {
-    robots, err := crawler.ParseRobotsPolicy(*fRobotsPolicy)
+    robots, err := crawler.ParseRobotsPolicy(fRobotsPolicy)
     if err != nil {
         err = fmt.Errorf("robots policy: %w", err)
 
         return
     }
 
-    dirs, err := crawler.ParseDirsPolicy(*fDirsPolicy)
+    dirs, err := crawler.ParseDirsPolicy(fDirsPolicy)
     if err != nil {
         err = fmt.Errorf("dirs policy: %w", err)
 
@@ -124,58 +147,72 @@ func initOptions() (rv []crawler.Option, err error) {
     }
 
     rv = []crawler.Option{
-        crawler.WithUserAgent(*fUA),
-        crawler.WithDelay(*fDelay),
-        crawler.WithMaxCrawlDepth(*fDepth),
-        crawler.WithWorkersCount(*fWorkers),
-        crawler.WithSkipSSL(*fSkipSSL),
-        crawler.WithBruteMode(*fBrute),
+        crawler.WithUserAgent(fUA),
+        crawler.WithDelay(fDelay),
+        crawler.WithMaxCrawlDepth(fDepth),
+        crawler.WithWorkersCount(fWorkers),
+        crawler.WithSkipSSL(fSkipSSL),
+        crawler.WithBruteMode(fBrute),
         crawler.WithDirsPolicy(dirs),
         crawler.WithRobotsPolicy(robots),
-        crawler.WithoutHeads(*fNoHeads),
-        crawler.WithScanJS(*fScanJS),
+        crawler.WithoutHeads(fNoHeads),
+        crawler.WithScanJS(fScanJS),
         crawler.WithExtraHeaders(h),
         crawler.WithExtraCookies(c),
         crawler.WithTagsFilter(tags.Values),
        crawler.WithIgnored(ignored.Values),
-        crawler.WithProxyAuth(*fProxyAuth),
+        crawler.WithProxyAuth(fProxyAuth),
     }
 
     return rv, nil
 }
 
-func main() {
-    flag.Var(
-        &headers,
-        "header",
+func setupFlags() {
+    flag.Var(&headers, "header",
         "extra headers for request, can be used multiple times, accept files with '@'-prefix",
     )
-    flag.Var(
-        &cookies,
-        "cookie",
+    flag.Var(&cookies, "cookie",
         "extra cookies for request, can be used multiple times, accept files with '@'-prefix",
     )
-    flag.Var(
-        &tags,
-        "tag",
-        "tags filter, single or comma-separated tag names",
-    )
-    flag.Var(
-        &ignored,
-        "ignore",
-        "patterns (in urls) to be ignored in crawl process",
-    )
+
+    flag.Var(&tags, "tag", "tags filter, single or comma-separated tag names")
+    flag.Var(&ignored, "ignore", "patterns (in urls) to be ignored in crawl process")
+
+    flag.IntVar(&fDepth, "depth", 0, "scan depth (set -1 for unlimited)")
+    flag.IntVar(&fWorkers, "workers", runtime.NumCPU(), "number of workers")
+
+    flag.BoolVar(&fBrute, "brute", false, "scan html comments")
+    flag.BoolVar(&fNoHeads, "headless", false, "disable pre-flight HEAD requests")
+    flag.BoolVar(&fScanJS, "js", false, "scan js files for endpoints")
+    flag.BoolVar(&fSkipSSL, "skip-ssl", false, "skip ssl verification")
+    flag.BoolVar(&fSilent, "silent", false, "suppress info and error messages in stderr")
+    flag.BoolVar(&fVersion, "version", false, "show version")
+
+    flag.StringVar(&fDirsPolicy, "dirs", crawler.DefaultDirsPolicy,
+        "policy for non-resource urls: show / hide / only")
+    flag.StringVar(&fRobotsPolicy, "robots", crawler.DefaultRobotsPolicy,
+        "policy for robots.txt: ignore / crawl / respect")
+    flag.StringVar(&fUA, "user-agent", defaultUA, "user-agent string")
+    flag.StringVar(&fProxyAuth, "proxy-auth", "", "credentials for proxy: user:password")
+
+    flag.DurationVar(&fDelay, "delay", defaultDelay, "per-request delay (0 - disable)")
+
+    flag.Usage = usage
+}
+
+func main() {
+    setupFlags()
 
     flag.Parse()
 
-    if *fVersion {
+    if fVersion {
         puts(version())
 
         return
     }
 
     if flag.NArg() != 1 {
-        flag.Usage()
+        usage()
 
         return
     }
@@ -185,7 +222,7 @@ func main() {
         log.Fatal("[-] options:", err)
     }
 
-    if *fSilent {
+    if fSilent {
         log.SetOutput(io.Discard)
     }
 
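The -tag and -ignore flags are now plain flag.Var registrations backed by values.List (previously values.Simple); that type lives in pkg/values and is outside this diff. A minimal sketch of what such a flag.Value could look like, assuming comma-separated accumulation as the -tag help text describes — the names and details below are illustrative, not the project's actual code:

```go
package values

import "strings"

// List is a hypothetical stand-in for the values.List registered with
// flag.Var above; the real implementation is not part of the shown hunks.
type List struct {
    Values []string
}

// String implements flag.Value.
func (l *List) String() string { return strings.Join(l.Values, ",") }

// Set implements flag.Value, splitting on commas and accumulating values
// across repeated uses of the flag.
func (l *List) Set(v string) error {
    l.Values = append(l.Values, strings.Split(v, ",")...)

    return nil
}
```

Anything with this shape satisfies flag.Var, and exposing a Values slice matches how main.go reads tags.Values and ignored.Values.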

pkg/crawler/config_test.go

Lines changed: 4 additions & 2 deletions
@@ -190,10 +190,12 @@ func TestString(t *testing.T) {
 func TestProxyAuth(t *testing.T) {
     t.Parallel()
 
+    const creds = "user:pass"
+
     var (
         c       = &config{}
-        opts    = []Option{WithProxyAuth("user:pass")}
-        headers = []string{proxyAuthHdr + ": " + proxyAuthTyp + " dXNlcjpwYXNz"}
+        opts    = []Option{WithProxyAuth(creds)}
+        headers = []string{proxyAuthHeader(creds)}
     )
 
     for _, o := range opts {

pkg/crawler/crawler.go

Lines changed: 14 additions & 14 deletions
@@ -46,6 +46,7 @@ const (
 
 type crawlResult struct {
     URI  string
+    Hash uint64
     Flag taskFlag
 }
 
@@ -92,8 +93,8 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {
 
     defer c.close()
 
-    seen := make(set.URI)
-    seen.Add(uri)
+    seen := make(set.Set[uint64])
+    seen.Add(urlhash(uri))
 
     web := client.New(
         c.cfg.UserAgent,
@@ -122,7 +123,7 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {
         switch {
         case t.Flag == TaskDone:
             w--
-        case seen.Add(t.URI):
+        case seen.TryAdd(t.Hash):
             if t.Flag == TaskCrawl && c.crawl(base, &t) {
                 w++
             }
@@ -260,14 +261,6 @@ func (c *Crawler) crawlRobots(host *url.URL) {
     }
 }
 
-func (c *Crawler) sitemapHandler(s string) {
-    c.linkHandler(atom.A, s)
-}
-
-func (c *Crawler) jsHandler(s string) {
-    c.linkHandler(atom.Link, s)
-}
-
 func (c *Crawler) isIgnored(v string) (yes bool) {
     if len(c.cfg.Ignored) == 0 {
         return
@@ -283,7 +276,10 @@ func (c *Crawler) isIgnored(v string) (yes bool) {
 }
 
 func (c *Crawler) linkHandler(a atom.Atom, s string) {
-    r := crawlResult{URI: s}
+    r := crawlResult{
+        URI:  s,
+        Hash: urlhash(s),
+    }
 
     fetch := (a == atom.A || a == atom.Iframe) ||
         (c.cfg.ScanJS && a == atom.Script)
@@ -328,9 +324,13 @@ func (c *Crawler) fetch(
             Handler: c.linkHandler,
         })
     case isSitemap(uri):
-        links.ExtractSitemap(body, base, c.sitemapHandler)
+        links.ExtractSitemap(body, base, func(s string) {
+            c.linkHandler(atom.A, s)
+        })
     case c.cfg.ScanJS && isJS(content, uri):
-        links.ExtractJS(body, c.jsHandler)
+        links.ExtractJS(body, func(s string) {
+            c.linkHandler(atom.Link, s)
+        })
     }
 
     client.Discard(body)
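The de-duplication set now stores uint64 hashes (set.Set[uint64] and the new crawlResult.Hash field) produced by a urlhash helper whose definition is outside the hunks shown here. A rough sketch, under the assumption that it is a plain 64-bit string hash (FNV-1a below); the real helper may also normalize the URL first:

```go
package crawler

import "hash/fnv"

// urlhash (sketch): hash a URL string down to the uint64 key kept in the
// seen-set. This is an assumed implementation, not the one from the commit.
func urlhash(s string) uint64 {
    h := fnv.New64a()
    _, _ = h.Write([]byte(s))

    return h.Sum64()
}
```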

pkg/crawler/crawler_test.go

Lines changed: 2 additions & 2 deletions
@@ -759,7 +759,7 @@ func TestCrawlerProxyAuth(t *testing.T) {
     )
 
     ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-        creds := r.Header.Get(proxyAuthHdr)
+        creds := r.Header.Get(proxyAuthKey)
         if creds == "" {
             t.Fatal("auth header empty")
         }
@@ -769,7 +769,7 @@
             t.Fatalf("invalid fields count: %d", len(parts))
         }
 
-        if !strings.EqualFold(parts[0], proxyAuthTyp) {
+        if !strings.EqualFold(parts[0], proxyAuthBasic) {
             t.Fatalf("invalid auth type: %s", parts[0])
         }

pkg/crawler/options.go

Lines changed: 1 addition & 5 deletions
@@ -1,7 +1,6 @@
 package crawler
 
 import (
-    "encoding/base64"
     "time"
 )
 
@@ -109,9 +108,6 @@ func WithScanJS(v bool) Option {
 // WithProxyAuth enables proxy credentials.
 func WithProxyAuth(v string) Option {
     return func(c *config) {
-        c.Headers = append(
-            c.Headers,
-            proxyAuthHdr+": "+proxyAuthTyp+" "+base64.StdEncoding.EncodeToString([]byte(v)),
-        )
+        c.Headers = append(c.Headers, proxyAuthHeader(v))
     }
 }
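WithProxyAuth now delegates to a proxyAuthHeader helper (also used by the updated tests); its definition, and the renamed proxyAuthKey / proxyAuthBasic constants, sit outside the shown hunks. A sketch that reproduces the removed inline code, assuming the constants hold the standard proxy-auth header name and scheme:

```go
package crawler

import "encoding/base64"

// Assumed values: only the constant names appear in this commit's diff.
const (
    proxyAuthKey   = "Proxy-Authorization"
    proxyAuthBasic = "Basic"
)

// proxyAuthHeader (sketch) turns "user:password" credentials into a single
// header line, exactly as the removed inline code in WithProxyAuth did.
func proxyAuthHeader(creds string) string {
    return proxyAuthKey + ": " + proxyAuthBasic + " " +
        base64.StdEncoding.EncodeToString([]byte(creds))
}
```

With creds = "user:pass" this produces the "Basic dXNlcjpwYXNz" value the old test asserted literally.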

pkg/crawler/policies.go

Lines changed: 12 additions & 5 deletions
@@ -5,6 +5,13 @@ import (
     "strings"
 )
 
+const (
+    // DefaultRobotsPolicy is a default policy name for robots handling.
+    DefaultRobotsPolicy = "ignore"
+    // DefaultDirsPolicy is a default policy name for non-resource URLs.
+    DefaultDirsPolicy = "show"
+)
+
 // ErrUnknownPolicy is returned when requested policy unknown.
 var ErrUnknownPolicy = errors.New("unknown policy")
 
@@ -33,21 +40,21 @@
 )
 
 // ParseRobotsPolicy parses robots policy from string.
-func ParseRobotsPolicy(s string) (a RobotsPolicy, err error) {
+func ParseRobotsPolicy(s string) (p RobotsPolicy, err error) {
     switch strings.ToLower(s) {
     case "ignore":
-        a = RobotsIgnore
+        p = RobotsIgnore
     case "crawl":
-        a = RobotsCrawl
+        p = RobotsCrawl
     case "respect":
-        a = RobotsRespect
+        p = RobotsRespect
     default:
         err = ErrUnknownPolicy
 
         return
     }
 
-    return a, nil
+    return p, nil
 }
 
 // ParseDirsPolicy parses dirs policy from string.
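The new DefaultRobotsPolicy and DefaultDirsPolicy constants are what cmd/crawley/main.go now passes as flag defaults, so they must always be accepted by ParseRobotsPolicy / ParseDirsPolicy. An illustrative check (not part of the commit) that pins this invariant down:

```go
package crawler

import "testing"

// TestDefaultPoliciesParse is a sketch: it only asserts that the exported
// default policy names round-trip through their parsers without
// ErrUnknownPolicy.
func TestDefaultPoliciesParse(t *testing.T) {
    t.Parallel()

    if _, err := ParseRobotsPolicy(DefaultRobotsPolicy); err != nil {
        t.Errorf("robots default: %v", err)
    }

    if _, err := ParseDirsPolicy(DefaultDirsPolicy); err != nil {
        t.Errorf("dirs default: %v", err)
    }
}
```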
