-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
executable file
·120 lines (110 loc) · 4.38 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python
import subprocess
import os
import signal
from httplib import HTTPException
from time import time,sleep
from subprocess import Popen
from collections import OrderedDict
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
# --- Crawl configuration ---------------------------------------------------

# File holding the sites to crawl, one per line.
SITES_LIST = 'data/top-1m-pruned.csv'

# Shell command that is launched once per page load to record measurements.
MEASURING_SCRIPT = 'stap stap_all.stp'

# User-agent string sent when a site is loaded with the "mobile" setting.
MOBILE_UA = (
    'Mozilla/5.0 (Linux; U; Android 2.3.3; en-us; HTC_DesireS_S510e Build/GRI40) '
    'AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile'
)

# Used both as the browser's page-load timeout and as the time to wait after
# a page load before the timers are collected (seconds).
SECONDS_PER_SITE = 150

# Upper bound on the slice of the site list that is crawled.  This must be an
# int: it ends up as a slice index, and a float such as 1e6 raises TypeError
# there as soon as the list is longer than the bound.
MAX_SITES = 1000000

# Index into the site list of the first site to crawl.
START_INDEX = 115

# When true, print the browser PID, the measuring command and the timers.
VERBOSE = True
def _read_sites(path):
    """Return the non-blank lines of the file at *path*, whitespace-stripped."""
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        # Dropping blank lines keeps a trailing newline in the file from
        # turning into an empty site that would be crawled as 'http://'.
        return [entry for entry in stripped if entry]


def _on_sigint(signum, frame):
    """SIGINT handler: announce the interrupt and stop the crawl.

    Raising SystemExit unwinds through the ``finally`` clause in
    ``_crawl_once``, which stops the measuring processes and the browser.
    """
    print('Caught sigint--terminating.')
    raise SystemExit(1)


def _launch_browser(chrome, mobile):
    """Start a Selenium browser and return ``(browser, pid)``.

    chrome -- true for Chrome, false for Firefox.
    mobile -- true to override the user agent with MOBILE_UA.

    ``pid`` is read from the process object Selenium exposes for the
    browser (``service.process`` for Chrome, ``binary.process`` for Firefox).
    """
    if chrome:
        options = webdriver.ChromeOptions()
        if mobile:
            options.add_argument('--user-agent=' + MOBILE_UA)
        browser = None
        # Keep retrying until Chrome comes up.
        while browser is None:
            try:
                browser = webdriver.Chrome(chrome_options=options)
            except WebDriverException as e:
                print('Chrome load error: ' + str(e))
                sleep(5)
        pid = browser.service.process.pid
    else:
        # The profile is only built for Firefox runs; building one for every
        # run, Chrome included, creates profile objects that are never used.
        profile = webdriver.FirefoxProfile()
        if mobile:
            profile.set_preference("general.useragent.override", MOBILE_UA)
        browser = webdriver.Firefox(profile)
        pid = browser.binary.process.pid
    browser.set_page_load_timeout(SECONDS_PER_SITE)
    browser.set_script_timeout(3)
    return browser, pid


def _record_load_times(browser, url, out_path):
    """Load *url*, wait SECONDS_PER_SITE, and write the timers to *out_path*.

    The timers are differences between fields of the page's
    ``performance.timing`` object, written as one comma-separated line.
    Selenium/HTTP errors are not handled here; the caller does that.
    """
    browser.get(url)  # Load page
    sleep(SECONDS_PER_SITE)
    raw = browser.execute_async_script(
        "arguments[arguments.length - 1](performance.timing)")
    # NOTE(review): keyword arguments reach OrderedDict through an ordinary
    # dict, so on interpreters with unordered dicts (this file is Python 2)
    # the column order is whatever that dict yields, not the order written
    # here.  Deliberately left alone so new files keep the column layout of
    # the ones already produced -- confirm before switching to a pair list.
    timing = OrderedDict(
        timeConnect=raw['connectEnd'] - raw['connectStart'],
        timeDomLoad=raw['domComplete'] - raw['domLoading'],
        timeDns=raw['domainLookupEnd'] - raw['domainLookupStart'],
        timeRedirect=raw['redirectEnd'] - raw['redirectStart'],
        timeResponse=raw['responseEnd'] - raw['responseStart']
    )
    if VERBOSE:
        print("Page load timers:")
        for name in timing:
            print(' * %s: %dms' % (name, timing[name]))
    with open(out_path, 'w') as f:
        f.write(','.join(str(value) for value in timing.values()))


def _crawl_once(site_full, label, chrome, mobile):
    """Measure one load of *site_full* in one browser/user-agent combination.

    label -- file-name stem for everything written under ``output/``.

    The measuring processes and the browser are always shut down on the way
    out, whether the load succeeded, failed, or was interrupted.
    """
    browser, browser_pid = _launch_browser(chrome, mobile)
    procs = ()
    # Nothing ever reads the measurers' own stdout/stderr (their data goes to
    # files through shell redirection), so it is discarded instead of being
    # piped: an unread pipe can fill up and block the child.
    devnull = open(os.devnull, 'w')
    try:
        sleep(5)
        # NOTE(review): the site name is interpolated into shell command
        # lines unquoted; this is only safe while the site list is trusted.
        cmd = '%s -G parent_id=%s -G browser_id=%s > output/%s-stap.csv' % \
            (MEASURING_SCRIPT, str(os.getpid()), str(browser_pid), label)
        if VERBOSE:
            print('Browser PID: ' + str(browser_pid))
            print('Running command: ' + cmd)
        stap_proc = Popen(cmd,
                          stderr=subprocess.STDOUT, stdout=devnull, shell=True)
        procs = (stap_proc,)
        conn_proc = Popen('watch -n .2 "bash measure-connections.sh >> '
                          'output/%s-conns.csv"' % label,
                          stderr=subprocess.STDOUT, stdout=devnull, shell=True)
        procs = (conn_proc, stap_proc)
        sleep(5)
        try:
            _record_load_times(browser, site_full,
                               'output/%s-loadtime.csv' % label)
        except (WebDriverException, TypeError, HTTPException) as e:
            print('Page load/timer error: ' + str(e))
    finally:
        try:
            kill(procs)
        finally:
            devnull.close()
            browser.quit()


def main():
    """Crawl the configured slice of the site list.

    Every site is loaded four times (Chrome and Firefox, each with the
    default and the mobile user agent).  Measurements are written to
    ``output/`` and compressed with bzip2 after each site.

    Raises Exception when the effective user is not root.
    """
    if os.getuid() != 0:
        raise Exception('Not running as root')
    sites = _read_sites(SITES_LIST)
    # int() guards the slice below against a non-integer MAX_SITES.
    max_sites = int(min(MAX_SITES, len(sites)))
    print('Read in %d sites. Starting crawl of %d.' % (len(sites), max_sites))
    if not os.path.isdir('output'):
        os.makedirs('output')
    signal.signal(signal.SIGINT, _on_sigint)
    for offset, site in enumerate(sites[START_INDEX:max_sites]):
        site_full = 'http://' + site
        print("[%d of %d] Loading site: %s"
              % (offset + 1 + START_INDEX, max_sites, site_full))
        for chrome in (True, False):
            print('Trying browser %s...' % ('Chrome' if chrome else 'Firefox'))
            for mobile in (False, True):
                print('Trying %s user agent...'
                      % ('mobile' if mobile else 'default'))
                label = '%s-%s-%s' % (site,
                                      'mobile' if mobile else 'desktop',
                                      'chrome' if chrome else 'firefox')
                _crawl_once(site_full, label, chrome, mobile)
        # since the files are getting somewhat large, ~3-5MB, compress them
        os.system('bzip2 -f output/*.csv')
        print('')
    print("Terminated successfully!")
def kill(procs):
    """Stop every process in *procs* and wait for each one to exit.

    procs -- iterable of ``subprocess.Popen`` objects.

    Processes that have already exited are skipped, so calling this twice
    on the same handles is safe.  Afterwards ``killall watch`` is run as a
    system-wide fallback.
    """
    for proc in procs:
        # poll() reaps the child if it is finished.  Signalling a pid that
        # has already been reaped is pointless at best, and at worst hits an
        # unrelated process that has since been given the same pid.
        if proc.poll() is not None:
            continue
        try:
            proc.terminate()
            # Only has an effect when a process group with id == proc.pid
            # exists; otherwise it raises OSError, which is ignored below.
            os.killpg(proc.pid, signal.SIGTERM)
        except OSError:
            # Best effort: the process (or its group) is already gone.
            pass
        proc.wait()
    # hacky, but the above doesn't work sometimes
    os.system('killall watch')
# Script entry point: the crawl starts as soon as this file is executed.
# There is no __main__ guard, so importing the module starts it as well.
main()