This repository has been archived by the owner on Sep 27, 2024. It is now read-only.
forked from philipithomas/iterscraper
-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.go
160 lines (139 loc) · 4.18 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
package main
import (
"strconv"
"strings"
"go.ilie.io/grape/flags"
"go.ilie.io/grape/output"
"go.ilie.io/grape/scraper"
"bufio"
"flag"
"fmt"
"log"
"os"
"sync"
)
const (
// defaultConcurrency is the number of parallel scrapers assigned to the
// concurrency flag when -concurrency is not set on the command line.
defaultConcurrency = 1
)
// Command-line flag values. Each variable is registered with the flag package
// in init and read by main and the task producers.
var (
// urlTemplate is the -url flag: the format string into which each token is
// substituted to build the address to scrape.
urlTemplate flags.String
// concurrency is the -concurrency flag: how many scrapers run in parallel.
concurrency flags.Int
// dictfile is the -dict flag: the optional file that supplies the tokens.
dictfile flags.String
// idLow is the -from flag: the inclusive start of the search range.
idLow flags.Int
// idHigh is the -to flag: the exclusive end of the search range.
idHigh flags.Int
// query is the -query flag: the space-separated queries run against each page.
query flags.String
// outfile is the -outfile flag: the CSV file to write; when it is not set the
// results are printed to stdout instead.
outfile flags.String
)
// init registers the command-line flags, parses them, and validates the result
// before main runs.
//
// When -concurrency is omitted it falls back to defaultConcurrency. The program
// is terminated through the log package when:
//   - the concurrency is below 1,
//   - no URL template is given, or
//   - neither a dictionary file nor a complete -from/-to range is given.
func init() {
	flag.Var(&urlTemplate, "url", "The URL you wish to scrape, containing \"%s\" or \"%d\" where the token will be substituted")
	flag.Var(&concurrency, "concurrency", "How many scrapers to run in parallel. (More scrapers are faster, but more prone to rate limiting or bandwidth issues)")
	flag.Var(&dictfile, "dict", "Filename to import a dictionary from (optional: if not provided, to and from will be used to generate integer IDs to be scraped; if present along to and from, will select the range of words in the dictionary)")
	flag.Var(&idLow, "from", "Start of the search range - inclusive")
	flag.Var(&idHigh, "to", "End of the search range - exclusive")
	flag.Var(&query, "query", "JQuery-style query for the element")
	flag.Var(&outfile, "outfile", "Filename to export the CSV results")
	flag.Parse()

	// Fall back to the default worker count when the flag was not given.
	if !concurrency.IsSet() {
		concurrency.Value = defaultConcurrency
	}

	// main hands this value to sync.WaitGroup.Add, which panics on a negative
	// counter, and starts that many workers; with zero workers nothing would
	// ever be scraped. Reject both cases up front with a clear message.
	if concurrency.Value < 1 {
		log.Fatalf("concurrency must be at least 1, got %d", concurrency.Value)
	}

	// Without a template every generated URL would be a formatting error string.
	if !urlTemplate.IsSet() {
		log.Fatal("you must provide a url template")
	}

	// A token source is required: a dictionary file, or both range bounds.
	if !dictfile.IsSet() && (!idLow.IsSet() || !idHigh.IsSet()) {
		log.Fatal("you must either provide a dictionary file or an index range")
	}
}
// main wires the scraping pipeline together:
//
//  1. createTasks produces tasks on an unbuffered channel and closes it when
//     the token source is exhausted.
//  2. concurrency.Value workers fetch each task's URL and emit one result row
//     per successful fetch; failed fetches are logged and skipped.
//  3. The rows are written as CSV when -outfile is set, or printed as a table
//     to stdout otherwise.
//
// A failure to write the output terminates the program with a non-zero status.
func main() {
	// Every space-separated part of -query is both a query passed to the
	// scraper and the header of the column holding its result.
	queries := strings.Split(query.Value, " ")
	// The token is emitted as the first column of every row, so its header
	// ("id") comes first.
	headers := append([]string{"id"}, queries...)

	// Create tasks and send them to the channel.
	tasks := make(chan task)
	go createTasks(tasks)

	// Create workers and schedule closing results when all work is done, so
	// the output writer's receive loop terminates.
	results := make(chan []string)
	var wg sync.WaitGroup
	wg.Add(concurrency.Value)
	go func() {
		wg.Wait()
		close(results)
	}()

	for i := 0; i < concurrency.Value; i++ {
		go func() {
			defer wg.Done()
			for t := range tasks {
				r, err := scraper.Fetch(t.url, queries)
				if err != nil {
					log.Printf("could not fetch %v: %v", t.url, err)
					continue
				}
				log.Printf("fetched %v", t.url)
				// Prefix the row with the token that produced it.
				results <- append([]string{t.token}, r...)
			}
		}()
	}

	// Output errors are fatal: a run that could not deliver its results must
	// not report success to the calling shell.
	if outfile.IsSet() {
		// If we have an outfile, print to CSV.
		if err := output.CSV(outfile.Value, headers, results); err != nil {
			log.Fatalf("could not write to %s: %v", outfile.Value, err)
		}
		return
	}
	// Otherwise print a table to stdout.
	if err := output.Stdout(headers, results); err != nil {
		log.Fatalf("could not print table: %v", err)
	}
}
// task is one unit of work handed from the task producers to a scraper worker.
type task struct {
// url is the address to fetch: the URL template with the token substituted.
url string
// token is the dictionary word or decimal ID the URL was built from; main
// emits it as the first column of the result row.
token string
}
// createTasks selects the task producer that matches the parsed flags, lets it
// send every task on tasks, and closes the channel once the producer returns.
//
// Producer selection:
//   - dictionary file plus both range bounds: the words in that index range
//   - dictionary file only: every word in the file
//   - both range bounds only: the integer IDs in the range
//
// With none of these the program is terminated through log.Fatal.
func createTasks(tasks chan task) {
	defer close(tasks)

	fromDict := dictfile.IsSet()
	ranged := idLow.IsSet() && idHigh.IsSet()

	switch {
	case fromDict && ranged:
		passTasksFromDictRange(urlTemplate.Value, tasks, dictfile.Value, idLow.Value, idHigh.Value)
	case fromDict:
		passTasksFromDict(urlTemplate.Value, tasks, dictfile.Value)
	case ranged:
		passTasksFromRange(urlTemplate.Value, tasks, idLow.Value, idHigh.Value)
	default:
		log.Fatal("you must either provide a dictionary file or an index range")
	}
}
// passTasksFromDict reads dictFile line by line and sends one task per line on
// tasks. Each line is the task's token and is substituted into url with
// fmt.Sprintf as a string, so url should carry a string verb such as "%s".
//
// The program is terminated through log.Fatalf when the file cannot be opened.
// A read error part-way through the file (including a line too long for the
// scanner's buffer) ends the iteration and is logged. The function does not
// close tasks.
func passTasksFromDict(url string, tasks chan task, dictFile string) {
	file, err := os.Open(dictFile)
	if err != nil {
		log.Fatalf("cannot open dictionary file: %v", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		t := scanner.Text()
		tasks <- task{url: fmt.Sprintf(url, t), token: t}
	}
	// Scan returns false on failure as well as at end of input; without this
	// check a truncated dictionary would be indistinguishable from a full one.
	if err := scanner.Err(); err != nil {
		log.Printf("could not read dictionary file %s: %v", dictFile, err)
	}
}
// passTasksFromDictRange sends one task for every line of dictFile whose
// zero-based index lies in [idLow, idHigh). Each selected line is the task's
// token and is substituted into url with fmt.Sprintf as a string, so url
// should carry a string verb such as "%s".
//
// A file with fewer than idHigh lines simply yields fewer tasks. The program
// is terminated through log.Fatalf when the file cannot be opened. A read
// error (including a line too long for the scanner's buffer) ends the
// iteration and is logged. The function does not close tasks.
func passTasksFromDictRange(url string, tasks chan task, dictFile string, idLow, idHigh int) {
	file, err := os.Open(dictFile)
	if err != nil {
		log.Fatalf("cannot open dictionary file: %v", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)

	// Skip the first idLow tokens, stopping as soon as the input runs out.
	for i := 0; i < idLow; i++ {
		if !scanner.Scan() {
			break
		}
	}

	// Test the bound before scanning so no line beyond the range is read.
	for i := idLow; i < idHigh && scanner.Scan(); i++ {
		t := scanner.Text()
		tasks <- task{url: fmt.Sprintf(url, t), token: t}
	}

	// Scan returns false on failure as well as at end of input; report the
	// former so a cut-short range does not go unnoticed.
	if err := scanner.Err(); err != nil {
		log.Printf("could not read dictionary file %s: %v", dictFile, err)
	}
}
// passTasksFromRange sends one task on tasks for every integer in
// [idLow, idHigh). The task's token is the decimal form of the integer and its
// URL is url with that integer substituted by fmt.Sprintf.
//
// Integer verbs such as "%d" or "%03d" receive the int itself. A template
// that uses a string verb ("%s") receives the decimal string instead, so it
// does not render as fmt's "%!s(int=N)" bad-verb marker. An empty or inverted
// range sends nothing. The function does not close tasks.
func passTasksFromRange(url string, tasks chan task, idLow, idHigh int) {
	// fmt does not fail on a verb/type mismatch; it embeds "%!s(int=N)" in the
	// output. Probe the template once to learn whether it needs a string.
	wantsString := strings.Contains(fmt.Sprintf(url, 0), "%!s(int=")

	for i := idLow; i < idHigh; i++ {
		token := strconv.Itoa(i)

		var target string
		if wantsString {
			target = fmt.Sprintf(url, token)
		} else {
			target = fmt.Sprintf(url, i)
		}
		tasks <- task{url: target, token: token}
	}
}