24
24
25
25
from robot_api .parse import join_abs
26
26
27
def reverse_ip_lookup(domain, queue, filename):
    """Read in filenames and use regex to extract all ips and hostnames.

    Scans each line of ``filename`` for IPv4 addresses and for hostnames
    under ``domain``.  When only one half of a (host, ip) pair is found, a
    best-effort DNS lookup fills in the other.  Every pair found is pushed
    onto ``queue`` and also collected into the returned list.

    Args:
        domain: root domain string used to anchor the hostname regex
        queue: multiprocessing-safe queue receiving (host, ip) tuples
        filename: string path of the file to parse

    Returns:
        A list of tuples containing the extracted host and ip.
    """
    ip_reg = re.compile(
        r"(?:(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)\.){3}"
        r"(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)")
    # re.escape so dots (or any other regex metacharacters) in the domain
    # are matched literally instead of as wildcards.
    hostname_reg = re.compile(
        r"([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])"
        r"(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*?\."
        + re.escape(domain)
        + r"(\:?[0-9]{1,5})?")
    results = []
    try:
        with open(filename, "r", encoding='utf-8') as _file:
            for line in tqdm(_file.readlines(), desc=f"{filename} parsing..."):
                _host = hostname_reg.search(line)
                if _host is not None:
                    _host = _host.group(0)
                _ip = ip_reg.search(line)
                if _ip is not None:
                    _ip = _ip.group(0)
                try:
                    # Best effort: resolve whichever half is missing.
                    if _host is not None and _ip is None:
                        _ip = socket.gethostbyname(_host)
                    if _ip is not None and _host is None:
                        # gethostbyaddr returns (hostname, aliases, ips);
                        # keep only the primary hostname string.
                        _host = socket.gethostbyaddr(_ip)[0]
                except OSError:
                    # DNS failures (herror/gaierror) are expected here;
                    # keep whatever half we already have.
                    pass
                if _host or _ip:
                    pair = (_host, _ip)
                    queue.put(pair)
                    results.append(pair)
    except OSError:
        # Unreadable or missing file: best-effort, return what we gathered.
        pass

    return results
65
+
66
def get_headers(queue, target):
    """Scrape HTTP and HTTPS response headers from a target.

    Makes a best-effort GET over both http:// and https:// and pushes
    ``[target, (http_headers, https_headers)]`` onto ``queue``; either
    element of the tuple is None when that scheme could not be reached.

    Args:
        queue: multiprocessing-safe queue receiving the result
        target: ip or hostname string to make the requests to

    Returns:
        None. The result is delivered through ``queue``.
    """
    # May add option later to set UserAgent
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0"
    }

    def fetch(url):
        """Return stringified response headers for url, or None on failure."""
        try:
            return str(requests.get(
                url,
                headers=headers,
                timeout=1,
                verify=False).headers)
        except (requests.RequestException, OSError):
            # RequestException also covers ReadTimeout, which the previous
            # ConnectionError-only handler let escape and kill the worker.
            return None

    http = fetch(f"http://{target}")
    https = fetch(f"https://{target}")
    queue.put([target, (http, https)])
27
104
28
105
class Aggregation :
29
106
"""Aggregation module
@@ -225,7 +302,7 @@ def aggregate(self, output_files=[], output_folders=[]):
225
302
qu_manager = multiprocessing .Manager ()
226
303
pool = multiprocessing .Pool (5 )
227
304
queue = qu_manager .Queue ()
228
- reverse_partial = partial (self ._reverse_ip_lookup , queue )
305
+ reverse_partial = partial (reverse_ip_lookup , self .domain , queue )
229
306
pool .map (reverse_partial , all_files )
230
307
pool .close ()
231
308
self ._build_db (queue , dbcurs )
@@ -235,85 +312,7 @@ def aggregate(self, output_files=[], output_folders=[]):
235
312
finally :
236
313
dbconn .close ()
237
314
238
- def _reverse_ip_lookup (self , queue , filename ):
239
- """Read in filesnames and use regex to extract all ips and hostnames.
240
-
241
- Args:
242
- filename: string to filename to parse
243
-
244
- Returns:
245
- A list of tuples containing the extracted host and ip
246
- """
247
- ip_reg = re .compile (
248
- r"(?:(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)\.){3}(?:1\d\d|2[0-5][0-5]|2[0-4]\d|0?[1-9]\d|0?0?\d)" )
249
- # hostname_reg = re.compile(r"([A-Za-z0-9\-]*\.?)*\." + self.domain)
250
- hostname_reg = re .compile (
251
- r"([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])(\.([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]{0,61}[a-zA-Z0-9]))*?\."
252
- + self .domain
253
- + r"(\:?[0-9]{1,5})?" )
254
- results = []
255
- try :
256
- with open (filename , "r" , encoding = 'utf-8' ) as _file :
257
- for line in tqdm (_file .readlines (), desc = f"{ filename } parsing..." ):
258
- _host = hostname_reg .search (line )
259
- if _host is not None :
260
- _host = _host .group (0 )
261
- _ip = ip_reg .search (line )
262
- if _ip is not None :
263
- _ip = _ip .group (0 )
264
- try :
265
- if _host is not None and _ip is None :
266
- _ip = socket .gethostbyname (_host )
267
- if _ip is not None and _host is None :
268
- _host = socket .gethostbyaddr (_ip )
269
- except Exception :
270
- pass
271
- if _host or _ip :
272
- queue .put ((_host , _ip ))
273
- except Exception :
274
- self .logger .exception (f"Error opening file { filename } " )
275
-
276
- return results
277
-
278
- def _get_headers (self , queue , target ):
279
- """Static method for request to scrape header information from ip
280
-
281
- Args:
282
- target: string to make request to
283
315
284
- Returns:
285
- ip/hostname and tuple containing headers
286
- """
287
- http = None
288
- https = None
289
- # May add option later to set UserAgent
290
- headers = {
291
- "User-Agent" : "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0"
292
- }
293
- try :
294
- http = requests .get (
295
- f"http://{ target } " ,
296
- headers = headers ,
297
- timeout = 1 ,
298
- verify = False ).headers
299
- http = str (http )
300
- except requests .ConnectionError :
301
- pass
302
- except OSError :
303
- pass
304
- try :
305
- https = requests .get (
306
- f"https://{ target } " ,
307
- headers = headers ,
308
- timeout = 1 ,
309
- verify = False ).headers
310
- https = str (https )
311
- except requests .ConnectionError :
312
- pass
313
- except OSError :
314
- pass
315
- queue .put ([target , (http , https )])
316
- # return target, (http, https)
317
316
318
317
def headers (self ):
319
318
"""Attempts to grab header data for all ips/hostnames
@@ -338,7 +337,7 @@ def headers(self):
338
337
339
338
qu_manager = multiprocessing .Manager ()
340
339
queue = qu_manager .Queue ()
341
- get_headers_partial = partial (self . _get_headers , queue )
340
+ get_headers_partial = partial (get_headers , queue )
342
341
_ = list (tqdm (pool .imap_unordered (get_headers_partial , ips ), total = len (ips ), desc = "Getting headers for ip..." ))
343
342
pool .close ()
344
343
pool .join ()
@@ -365,7 +364,7 @@ def headers(self):
365
364
366
365
pool = multiprocessing .Pool (40 )
367
366
queue = qu_manager .Queue ()
368
- get_headers_partial = partial (self . _get_headers , queue )
367
+ get_headers_partial = partial (get_headers , queue )
369
368
_ = list (tqdm (pool .map (get_headers_partial , hostnames ), total = len (hostnames ), desc = "Getting headers for host..." ))
370
369
371
370
pool .close ()
0 commit comments