Skip to content
This repository was archived by the owner on Aug 1, 2024. It is now read-only.

Commit 3e1ed1d

Browse files
committed
one more try. remove class methods from multiprocessing pool. Saves pickling operation
1 parent cee1fc8 commit 3e1ed1d

File tree

1 file changed

+80
-81
lines changed

1 file changed

+80
-81
lines changed

src/robot_api/api/aggregation.py

Lines changed: 80 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,83 @@
2424

2525
from robot_api.parse import join_abs
2626

27+
def reverse_ip_lookup(domain, queue, filename):
    """Extract hostnames and IPs from a file and push them onto a queue.

    Each line of ``filename`` is searched for the first hostname ending in
    ``.<domain>`` (optionally followed by a port) and for the first
    IPv4-looking address. When only one of the two is found, the other is
    resolved through DNS on a best-effort basis. Every line that yields a
    host or an IP produces one ``(host, ip)`` tuple.

    Args:
        domain: domain name that extracted hostnames must end with; it is
            matched literally (regex metacharacters are escaped).
        queue: object with a ``put`` method; receives each ``(host, ip)``
            tuple as it is found.
        filename: path of the text file to parse (read as UTF-8).

    Returns:
        A list of the ``(host, ip)`` tuples that were put on the queue.
        Either element of a tuple may be None. If reading or parsing fails,
        the error is logged and whatever was collected so far is returned.
    """
    # Imported locally so this module-level worker does not depend on the
    # file's top-of-file imports already providing ``logging``.
    import logging

    ip_reg = re.compile(
        r"(?:(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)\.){3}(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)")
    # Escape the domain so that e.g. the "." in "example.com" only matches a
    # literal dot instead of any character.
    hostname_reg = re.compile(
        r"([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*?\."
        + re.escape(domain)
        + r"(\:?[0-9]{1,5})?")
    results = []
    try:
        with open(filename, "r", encoding='utf-8') as _file:
            # readlines() is kept so tqdm knows the total and can render a
            # proper progress bar.
            for line in tqdm(_file.readlines(), desc=f"{filename} parsing..."):
                host_match = hostname_reg.search(line)
                ip_match = ip_reg.search(line)
                _host = host_match.group(0) if host_match is not None else None
                _ip = ip_match.group(0) if ip_match is not None else None
                try:
                    if _host is not None and _ip is None:
                        # The match may carry a ":port" suffix, which is not
                        # part of the name to resolve.
                        _ip = socket.gethostbyname(_host.partition(":")[0])
                    elif _ip is not None and _host is None:
                        # gethostbyaddr returns (hostname, aliases, ips);
                        # only the primary hostname is wanted.
                        _host = socket.gethostbyaddr(_ip)[0]
                except (OSError, UnicodeError):
                    # Resolution is best-effort: keep the half we found.
                    pass
                if _host or _ip:
                    record = (_host, _ip)
                    queue.put(record)
                    results.append(record)
    except Exception:
        # Runs inside a pool worker: log instead of raising so one bad file
        # does not abort the whole map, but do not hide the failure.
        logging.getLogger(__name__).exception(
            "Error parsing file %s", filename)

    return results
65+
66+
def get_headers(queue, target):
    """Fetch HTTP and HTTPS response headers for a target and queue them.

    One GET request is made to ``http://<target>`` and one to
    ``https://<target>``, each with a one-second timeout and certificate
    verification disabled. A request that fails with a connection error or
    any other ``OSError`` leaves its slot as None.

    Args:
        queue: object with a ``put`` method that receives the result.
        target: ip or hostname string to make the requests to.

    Returns:
        None. The result is delivered through ``queue`` as
        ``[target, (http_headers, https_headers)]``, where each headers
        value is the string form of the response headers or None.
    """
    # The User-Agent is fixed for now; an option to set it may come later.
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0"
    }
    # Slot 0 holds the plain-HTTP result, slot 1 the HTTPS result.
    scraped = [None, None]
    urls = (f"http://{target}", f"https://{target}")
    for slot, url in enumerate(urls):
        try:
            response = requests.get(
                url,
                headers=request_headers,
                timeout=1,
                verify=False)
            scraped[slot] = str(response.headers)
        except (requests.ConnectionError, OSError):
            # Unreachable or misbehaving endpoint: leave this slot as None.
            continue
    queue.put([target, (scraped[0], scraped[1])])
27104

28105
class Aggregation:
29106
"""Aggregation module
@@ -225,7 +302,7 @@ def aggregate(self, output_files=[], output_folders=[]):
225302
qu_manager = multiprocessing.Manager()
226303
pool = multiprocessing.Pool(5)
227304
queue = qu_manager.Queue()
228-
reverse_partial = partial(self._reverse_ip_lookup, queue)
305+
reverse_partial = partial(reverse_ip_lookup, self.domain, queue)
229306
pool.map(reverse_partial, all_files)
230307
pool.close()
231308
self._build_db(queue, dbcurs)
@@ -235,85 +312,7 @@ def aggregate(self, output_files=[], output_folders=[]):
235312
finally:
236313
dbconn.close()
237314

238-
def _reverse_ip_lookup(self, queue, filename):
239-
"""Read in filesnames and use regex to extract all ips and hostnames.
240-
241-
Args:
242-
filename: string to filename to parse
243-
244-
Returns:
245-
A list of tuples containing the extracted host and ip
246-
"""
247-
ip_reg = re.compile(
248-
r"(?:(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)\.){3}(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)")
249-
# hostname_reg = re.compile(r"([A-Za-z0-9\-]*\.?)*\." + self.domain)
250-
hostname_reg = re.compile(
251-
r"([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*?\."
252-
+ self.domain
253-
+ r"(\:?[0-9]{1,5})?")
254-
results = []
255-
try:
256-
with open(filename, "r", encoding='utf-8') as _file:
257-
for line in tqdm(_file.readlines(), desc=f"{filename} parsing..."):
258-
_host = hostname_reg.search(line)
259-
if _host is not None:
260-
_host = _host.group(0)
261-
_ip = ip_reg.search(line)
262-
if _ip is not None:
263-
_ip = _ip.group(0)
264-
try:
265-
if _host is not None and _ip is None:
266-
_ip = socket.gethostbyname(_host)
267-
if _ip is not None and _host is None:
268-
_host = socket.gethostbyaddr(_ip)
269-
except Exception:
270-
pass
271-
if _host or _ip:
272-
queue.put((_host, _ip))
273-
except Exception:
274-
self.logger.exception(f"Error opening file {filename}")
275-
276-
return results
277-
278-
def _get_headers(self, queue, target):
279-
"""Static method for request to scrape header information from ip
280-
281-
Args:
282-
target: string to make request to
283315

284-
Returns:
285-
ip/hostname and tuple containing headers
286-
"""
287-
http = None
288-
https = None
289-
# May add option later to set UserAgent
290-
headers = {
291-
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0"
292-
}
293-
try:
294-
http = requests.get(
295-
f"http://{target}",
296-
headers=headers,
297-
timeout=1,
298-
verify=False).headers
299-
http = str(http)
300-
except requests.ConnectionError:
301-
pass
302-
except OSError:
303-
pass
304-
try:
305-
https = requests.get(
306-
f"https://{target}",
307-
headers=headers,
308-
timeout=1,
309-
verify=False).headers
310-
https = str(https)
311-
except requests.ConnectionError:
312-
pass
313-
except OSError:
314-
pass
315-
queue.put([target, (http, https)])
316-
# return target, (http, https)
317316

318317
def headers(self):
319318
"""Attempts to grab header data for all ips/hostnames
@@ -338,7 +337,7 @@ def headers(self):
338337

339338
qu_manager = multiprocessing.Manager()
340339
queue = qu_manager.Queue()
341-
get_headers_partial = partial(self._get_headers, queue)
340+
get_headers_partial = partial(get_headers, queue)
342341
_ = list(tqdm(pool.imap_unordered(get_headers_partial, ips), total=len(ips), desc="Getting headers for ip..."))
343342
pool.close()
344343
pool.join()
@@ -365,7 +364,7 @@ def headers(self):
365364

366365
pool = multiprocessing.Pool(40)
367366
queue = qu_manager.Queue()
368-
get_headers_partial = partial(self._get_headers, queue)
367+
get_headers_partial = partial(get_headers, queue)
369368
_ = list(tqdm(pool.map(get_headers_partial, hostnames), total=len(hostnames), desc="Getting headers for host..."))
370369

371370
pool.close()

0 commit comments

Comments
 (0)