-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
69 lines (62 loc) · 2.39 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import QuickLRU from 'quick-lru'
import pTimeout from 'p-timeout'
import normalizeUrl from './utils/normalize-url.js'
import httpClientGen from './utils/http-client.js'
import dnsLookupGen from './utils/dns-lookup.js'
import logger from './utils/logger.js'
import CacheableLookup from 'cacheable-lookup'
import { gotSsrf } from 'got-ssrf'
import { PRESETS } from 'header-generator'
// Module-scoped debug logger, created by the project's logger helper with the label 'index.js'.
const debug = logger('index.js')
/**
 * Builds a URL normalizer.
 *
 * Wires together the DNS lookup helper, the "base" normalizer and the HTTP
 * client from the supplied options, and returns a function that runs a URL
 * through both passes under a single overall time limit.
 *
 * @param {object} [normalizeUrlOptions] - Forwarded as the first argument to the
 *   `./utils/normalize-url.js` factory.
 * @param {object} [gotOptions] - Forwarded to the DNS lookup factory, to
 *   `gotSsrf.extend()` and to the HTTP client factory. Note that supplying this
 *   argument replaces the whole default object (no merging is done here).
 * @param {number} [timeoutMs=15000] - Time limit handed to `pTimeout` for one
 *   complete normalization. The default sits just above the 14000 ms
 *   `timeout.request` in the default `gotOptions`.
 * @param {object} [canonicizeMemOpts] - Forwarded to the HTTP client factory
 *   (a `{ cache }` object by default).
 * @param {object} [stripTrackersMemOpts] - Forwarded to the normalize-url
 *   factory (a `{ cache }` object by default).
 * @returns {(url: string) => Promise<string>} Function resolving to the
 *   normalized URL (the `url` property of the HTTP client's response).
 */
export default (
  normalizeUrlOptions = {
    stripHash: true,
    removeQueryParameters: []
  },
  gotOptions = {
    https: {
      rejectUnauthorized: true
    },
    followRedirect: true,
    maxRedirects: 10,
    throwHttpErrors: true,
    timeout: {
      request: 14000 // global timeout
    },
    cache: new QuickLRU({ maxSize: 10000 }),
    dnsCache: new CacheableLookup({ cache: new QuickLRU({ maxSize: 100000 }) }),
    context: {
      insecureHTTPParser: false,
      proxyUrl: process.env.HTTP_PROXY_URL,
      headerGeneratorOptions: PRESETS.MODERN_WINDOWS_CHROME
    }
  },
  timeoutMs = 15000,
  canonicizeMemOpts = { cache: new QuickLRU({ maxSize: 100000 }) },
  stripTrackersMemOpts = { cache: new QuickLRU({ maxSize: 100000 }) }
  // The cache sizes above are rough guesses, not measured values.
) => {
  const dnsLookup = dnsLookupGen(gotOptions)

  const normalize = normalizeUrl(
    normalizeUrlOptions,
    dnsLookup,
    // A plain SSRF-guarded client is enough for this pass: no need to mimic
    // browser behaviour or canonicize anything here.
    gotSsrf.extend(gotOptions),
    stripTrackersMemOpts
  )

  const httpClient = httpClientGen(normalize, gotOptions, canonicizeMemOpts)

  /**
   * Normalizes a URL so that it can be used as a lookup key.
   *
   * @param {string} [url=''] - The URL to normalize.
   * @returns {Promise<string>} The `url` reported by the HTTP client response.
   */
  async function normalizePlus(url = '') {
    debug('Normalizing URL %s', url)

    // 1. "Base" normalization using normalize-url + stripping trackers.
    //    When an invalid link is passed, it will throw.
    const link = await normalize(url)
    // Log the normalized result, not the raw input, so the trace shows what
    // the first pass actually produced.
    debug('Normalization first pass: %s', link)

    // 2. Follow redirects to deal with "intermediate" links (such as the
    //    links on Google search results).
    const res = await httpClient.get(link)
    debug('Normalization second pass: %s', res.url)

    // At this point, the link is completely normalized based on canonical
    // links (if one exists).
    return res.url
  }

  // Overall time limit for the ENTIRE normalization, as a guard against the
  // more compute-intensive steps tying things up for too long.
  return url => pTimeout(normalizePlus(url), timeoutMs)
}