# A Git submission to GitHub: crawl free proxy IPs for use in web crawlers.
import requests
import bs4
import re
import time


class Spyder:
    # Browser header so requests look like they come from a real browser.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    def download_html(self, url, trytime=2):
        '''
        Get the HTML from the given url, retrying on timeout.
        '''
        try:
            response = requests.get(url, headers=self.headers, timeout=5)
            return response.text
        except requests.exceptions.Timeout:
            if trytime > 0:
                print("Request timed out, trying again...")
                # Retry recursively inside the class; return the result so it
                # propagates back to the original caller.
                return self.download_html(url, trytime - 1)
            else:
                print("Request timed out!")
                return None
        except requests.exceptions.HTTPError as e:
            if hasattr(e, 'errno'):
                print(e.errno)
            return None
        except requests.exceptions.ConnectionError:
            print('Connection error, please check whether your url is valid!')
            return None
    def get_proxy_info(self, html, filtertime=1):
        '''
        Extract the proxy information from the html with bs4.
        ip_info_list structure: [[IP, PORT, SITE, RespTime, CheckTime], ...]
        Returns the list of IPs whose response time is at most filtertime seconds.
        '''
        if html is None:
            # download_html returns None on failure; nothing to parse.
            return []
        soup = bs4.BeautifulSoup(html, 'html.parser')
        ip_list = soup.select('tr')
        ip_info_list = []
        fast_ip = []
        for ip_info in ip_list:
            IP = re.search('<td data-title="IP">(.+?)</td>', str(ip_info))
            if IP is not None:
                IP = IP.group(1)
            PORT = re.search('<td data-title="PORT">(.+?)</td>', str(ip_info))
            if PORT is not None:
                PORT = PORT.group(1)
            # "位置" is the site's column header for the proxy's location.
            SITE = re.search('<td data-title="位置">(.+?)</td>', str(ip_info))
            if SITE is not None:
                SITE = SITE.group(1)
            # "响应速度" is the response-speed column; values end with "秒" (seconds).
            RespTime = re.search('<td data-title="响应速度">(.+?)秒</td>', str(ip_info))
            if RespTime is not None:
                RespTime = RespTime.group(1)
            # "最后验证时间" is the last-verified-time column.
            CheckTime = re.search('<td data-title="最后验证时间">(.+?)</td>', str(ip_info))
            if CheckTime is not None:
                CheckTime = CheckTime.group(1)
            ip_info_list.append([IP, PORT, SITE, RespTime, CheckTime])
            # Keep only the proxies that responded within filtertime seconds.
            if RespTime is not None and float(RespTime) <= float(filtertime):
                fast_ip.append(IP)
        return fast_ip
    def download_proxy(self, page_num=1):
        '''
        Get free proxy IPs from www.kuaidaili.com.
        page_num controls how many pages are crawled, and thus how many IPs are collected.
        '''
        url = 'https://www.kuaidaili.com/free/inha/'
        pages = list(range(1, max(page_num, 1) + 1))
        all_proxies = []
        for page in pages:
            url_page = url + str(page)
            html = self.download_html(url_page)
            print(url_page)
            fast_ip = self.get_proxy_info(html)
            print(fast_ip)
            all_proxies = all_proxies + fast_ip
            sleep_time = 2
            print("Now, let's have a rest for %s sec" % sleep_time)
            # Without a delay between requests this site returns nothing at all.
            time.sleep(sleep_time)
        return all_proxies
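
    # A minimal sketch, not part of the original script: verify that a harvested
    # proxy actually works before using it. get_proxy_info only keeps bare IPs,
    # so this hypothetical helper assumes an "ip:port" string is supplied.
    def check_proxy(self, ip_port, test_url='https://www.kuaidaili.com'):
        '''
        Return True if a GET through the proxy succeeds within 5 seconds.
        ip_port is assumed to look like "1.2.3.4:8080".
        '''
        proxies = {'http': 'http://' + ip_port, 'https': 'http://' + ip_port}
        try:
            r = requests.get(test_url, headers=self.headers, proxies=proxies, timeout=5)
            return r.status_code == 200
        except requests.exceptions.RequestException:
            return False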
if __name__ == '__main__':
    spyder = Spyder()
    proxies_ips = spyder.download_proxy(page_num=3)  # collect the fast proxy IPs
    print(proxies_ips)
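
    # Usage sketch (an assumption, not in the original script): requests accepts a
    # `proxies` mapping. The IPs returned above carry no port, so ":8080" below is
    # a placeholder port for illustration only.
    if proxies_ips:
        proxy = {'http': 'http://' + proxies_ips[0] + ':8080'}
        try:
            r = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
            print(r.text)
        except requests.exceptions.RequestException as e:
            print('Proxy test failed: %s' % e)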