#!/usr/bin/env python3
import socket, argparse, os, sys, json, concurrent.futures, urllib.parse, time
import threading, shutil
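# utils is expected to provide name_to_address, rerun_traceroute, gen_message,
# save_file, parseURI, TTLProbeError, SSL_CONTEXT, BUFSIZE, and TRACEROUTE_MAX
# (all of which are referenced below)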
from utils import *
REPEAT_COUNT = 5
FILE_LIST = 'jsfiles.json'
STORAGE_DIR = 'gcprobefiles'
SUSPICIOUS_DIR = 'suspicious'
RESULTS_FILE = 'gcprobe.json'
TIMEOUT = 3  # seconds
MAX_WORKERS = 512  # ThreadPool size

# Issue a single HTTP GET request to the given address, optionally over TLS
# and with a limited IP TTL; return the raw response bytes, or None if the
# connection fails
def issue_request(host, address, port, message, tls=False, ttl=None):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    if tls:
        s = SSL_CONTEXT.wrap_socket(s, server_hostname=host)
    s.settimeout(TIMEOUT)  # bounds connect(), the TLS handshake, and recv()
    try: s.connect((address, port))
    except OSError:
        s.close()
        return None
    # The TTL is set only after the handshake completes, so the connection is
    # established normally but the request itself is limited to 'ttl' hops
    if ttl: s.setsockopt(socket.SOL_IP, socket.IP_TTL, ttl)
    s.sendall(message)
    full_response = b''
    while True:
        try:
            response = s.recv(BUFSIZE)
            if response: full_response += response
            else: break
        except socket.timeout:
            break
    s.close()
    return full_response
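
# Worked example of the TTL binary search in probe_domain below (illustrative
# numbers): if traceroute reports a distance of 12, the search range starts as
# lowerbound=0, upperbound=15.  If downloads succeed at any ttl >= 5, the
# probes go: ttl=7 (success, upperbound=7), ttl=3 (failure, lowerbound=4),
# ttl=5 (success, upperbound=5), ttl=4 (failure, lowerbound=5); the range is
# now empty and 'ttlrequired' is recorded as 5.  Since earlyby = 12 - 5 = 7
# exceeds 3, the file would also be copied to the suspicious directory.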

# Probe one domain ('domain'), identifying the minimum TTL required in the HTTP
# request to download each of the JS files from 'script_list', and writing the
# results to the results file
def probe_domain(domain, script_list, traceroute_lock, outfile_lock):
    address = name_to_address(domain)
    first_port = script_list[0][2]
    with traceroute_lock: distance = rerun_traceroute(address, first_port)
    for script_info in script_list:
        # setup
        result = dict()
        tls = script_info[0]
        protocol = {True: 'https://', False: 'http://'}[tls]
        host = script_info[1]
        port = script_info[2]
        request = script_info[3]
        filename = script_info[4]
        referer = script_info[5]
        script = urllib.parse.urljoin(protocol + host, request)
        result['script'] = script
        result['referer'] = referer
        result['address'] = address
        result['traceroute'] = distance
        message = gen_message(host, request, referer)
        # do the probe (start with a range of possible TTL values from
        # lowerbound to upperbound; when the file is received, decrease
        # the upperbound; when it is not, increase the lowerbound; set
        # the TTL value in the middle of the range and repeat until the
        # range has size zero)
        print(time.strftime('%Y-%m-%d %H:%M:%S ') + 'Probing for ' +
              script + ', referred by ' + referer)
        sys.stdout.flush()
        if distance: upperbound = distance + 3
        else: upperbound = TRACEROUTE_MAX
        lowerbound = 0
        downloaded = False
        while lowerbound != upperbound:
            # midpoint of [lowerbound, upperbound); always < upperbound
            ttl = (lowerbound + upperbound) // 2
            print("%s\tlowerbound:%d\tupperbound:%d\tttl:%d" %
                  (script, lowerbound, upperbound, ttl))
            downloaded_this_ttl = False
            for i in range(args.repeat):
                response = issue_request(host, address, port,
                                         message, tls, ttl)
                if response:
                    # file received - save it
                    save_file(args.dir, host, filename, response)
                    downloaded = downloaded_this_ttl = True
                    upperbound = ttl
                    break
            if not downloaded_this_ttl:
                lowerbound = ttl + 1
        # record the result ('ttlrequired' is only meaningful if 'downloaded')
        result['ttlrequired'] = lowerbound
        earlyby = 0
        if distance:
            earlyby = distance - lowerbound
            result['earlyby'] = earlyby
        else:
            result['traceroutefailed'] = True
        result['downloaded'] = downloaded
        with outfile_lock:
            with open(args.outfile, 'a') as f:
                json.dump(result, f)
                f.write(',\n')
        if downloaded and earlyby > 3:
            # file received with a request sent 3 hops or more short
            # of the traceroute value - save it separately
            path = os.path.join(args.dir, SUSPICIOUS_DIR, host)
            if not os.path.isdir(path): os.makedirs(path)
            shutil.copy(os.path.join(args.dir, host, filename), path)

# command-line argument parser
parser = argparse.ArgumentParser(description='Determine the smallest TTL required to download various JS files from China.')
parser.add_argument('-r', '--repeat', default=REPEAT_COUNT, type=int, help='the number of times to download each file')
parser.add_argument('-f', '--filelist', default=FILE_LIST, type=str, help='the path to the file containing the list of candidate files to be downloaded')
parser.add_argument('-d', '--dir', default=STORAGE_DIR, type=str, help='the path at which to store the downloaded files')
parser.add_argument('-o', '--outfile', default=RESULTS_FILE, type=str, help='the path of the file in which to store the results')
args = parser.parse_args()
# echo the command line arguments for logging purposes
print(str(sys.argv) + ' on ' + socket.gethostname())
# make sure the download folder and results file don't exist yet
if os.path.exists(args.dir):
    print('ERROR: Download directory ' + args.dir + ' already exists')
    sys.exit(1)
if os.path.exists(args.outfile):
    print('ERROR: Outfile ' + args.outfile + ' already exists')
    sys.exit(1)
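
# The file list ('--filelist') is expected, based on how it is parsed below,
# to be JSON of the form (URLs are placeholders):
# [
#   {"referer": "http://example.com/page",
#    "scripts": ["http://example.com/a.js", "https://cdn.example.net/b.js"]}
# ]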

# get the list of JS files
with open(args.filelist) as f: jsondata = json.load(f)

# parse the URIs and refactor the list of JS files by domain
list_by_domain = dict()
for referer in jsondata:
    for script in referer['scripts']:
        try:
            tls, host, port, request, filename = parseURI(script)
        except TTLProbeError as e:
            print(e)
            continue
        if host not in list_by_domain: list_by_domain[host] = list()
        list_by_domain[host].append((tls, host, port, request, filename,
                                     referer['referer']))

# probe all of the JS files in domain-specific threads (concurrently by domain,
# serially by script)
with open(args.outfile, 'w') as f: f.write('[')
traceroute_lock = threading.Lock()
outfile_lock = threading.Lock()
futures = list()
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    for domain in list_by_domain:
        futures.append(executor.submit(probe_domain, domain,
                                       list_by_domain[domain], traceroute_lock,
                                       outfile_lock))
    concurrent.futures.wait(futures)
# tidy up the end of the results file so that it can be json.load()'d
shutil.copy(args.outfile, args.outfile+'.bak')
with open(args.outfile) as f: contents = f.read()
contents = contents.rstrip(',\n') + ']'
with open(args.outfile, 'w') as f: f.write(contents)
os.remove(args.outfile+'.bak')
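
# Example invocation (illustrative paths; the defaults are defined above):
#   ./gcprobe.py --filelist jsfiles.json --dir gcprobefiles \
#       --outfile gcprobe.json --repeat 5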