official_accounts_spider.py
# encoding=utf-8
from random import choice
import requests
from bs4 import BeautifulSoup
import bs4  # importing the whole package as well is not recommended
import re
# import request_ip
import urllib.request
import urllib.error
import simplejson
import http.cookiejar
import pymysql
import types
import json
import time
import uuid
import os
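
# The spider walks jiweixin168.com: get_index_page() reads the top-level category
# menu on the home page and stores each category and subcategory in MySQL, then
# follows every subcategory listing page by page (get_subcategories) and scrapes
# each official-account detail page (get_thirdcategories), downloading the QR code
# and cover images and batching one row per account into the
# official_accounts_thirdcategories table.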
class OfficialAccounts:
    def __init__(self, url=False):
        # get a random proxy IP
        # self._host = request_ip.RequestIP().getIP()
        self._url = url if url else 'http://www.jiweixin168.com'
        self._headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
        }
        # set up the cookie jar
        self.cookie_filename = 'cookie.txt'
        self.cookie = http.cookiejar.MozillaCookieJar(self.cookie_filename)
        self.handler = urllib.request.HTTPCookieProcessor(self.cookie)
        self.opener = urllib.request.build_opener(self.handler)
        self.official_accounts_array = []
        self.base_url = os.path.split(os.path.realpath(__file__))[0]
        self.qrcode_url = self.base_url + '/images/qrcode/'
        self.cover_url = self.base_url + '/images/cover/'
        # connect to the database used for all inserts below
        self.db = pymysql.connect(host="localhost", user="123456", charset="utf8", password="123456", db="wxpn")
        self.cursor = self.db.cursor()

    def get_index_page(self):
        request = urllib.request.Request(self._url, headers=self._headers)
        response = self.opener.open(request)
        # content = response.read().decode('UTF-8')
        content = response.read()
        # save the cookie to cookie.txt (not strictly necessary)
        self.cookie.save(ignore_discard=True, ignore_expires=True)
        # parse the page as HTML
        soup = bs4.BeautifulSoup(content, "html.parser")
        # f = open("jiweixin.html", "wb")
        # f.write(content)
        # f.close()
        sub_nav_ul = soup.find_all("ul", class_="sub_nav_ul")
        catalog = sub_nav_ul[0].find_all("li")
        for item in catalog:
            tag_a = item.find_all("a")
            self.cursor.execute('INSERT INTO official_accounts_categories (name) VALUES (%s)', (tag_a[0].string,))
            self.db.commit()
            self.cursor.execute('select LAST_INSERT_ID()')
            last_id = self.cursor.fetchone()
            category_id = last_id[0]
            # self.db.close()
            # self.cursor.close()
            print('---------------- started crawling top-level category ' + tag_a[0].string + ' --------------------------')
            for index, content in enumerate(tag_a):
                if index > 0:
                    sub_database_data = (content.string, int(category_id))
                    self.cursor.execute('INSERT INTO official_accounts_subcategories (name, parent_id) VALUES (%s, %s)', sub_database_data)
                    self.db.commit()
                    self.cursor.execute('select LAST_INSERT_ID()')
                    sub_last_id = self.cursor.fetchone()
                    subcategory_id = int(sub_last_id[0])
                    # self.db.close()
                    # self.cursor.close()
                    subcategory_url = self._url + content['href']
                    print('---------------- started crawling subcategory ' + content.string + ' --------------------------')
                    self.get_subcategories(subcategory_url, subcategory_id)
                    print('---------------- finished crawling subcategory ' + content.string + ' --------------------------')
                    print('-------- finished subcategory ' + content.string + ', sleeping for two seconds ----------------')
                    time.sleep(2)
                    # return
                    # print(subcategory_url)
            print('---------------- finished crawling top-level category ' + tag_a[0].string + ' --------------------------')
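
    # Crawl one subcategory listing page: scrape every account linked from the
    # page, batch-insert the collected rows, then recurse into the next page
    # while a next-page link is present.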
    def get_subcategories(self, url, parent_id):
        request = urllib.request.Request(url, headers=self._headers)
        response = self.opener.open(request)
        # content = response.read().decode('UTF-8')
        content = response.read()
        # f = open("subcategory.html", "wb")
        # f.write(content)
        # f.close()
        soup = bs4.BeautifulSoup(content, "html.parser")
        sub_nav_ul = soup.find_all("div", class_="newslist")
        catalog = sub_nav_ul[0].find_all("dt")
        try:
            # read the pagination block so the next page can be crawled
            sub_page_div = soup.find_all("div", class_="page")
            # current page number
            current = sub_page_div[0].find_all('span', class_="current")[0].string
        except IndexError:
            sub_page_div = []
            current = '0'
        # reset the batch of rows to be inserted for this page
        self.official_accounts_array = []
        for item in catalog:
            tag_a = item.find_all("a")
            thirdcategory_url = self._url + tag_a[0]['href']
            self.get_thirdcategories(thirdcategory_url, parent_id)
        self.cursor.executemany('INSERT INTO official_accounts_thirdcategories (name, wechat_id, attention_count, address, include_time, parent_id, descriptions, qrcode, wechat_link, background_cover) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', self.official_accounts_array)
        self.db.commit()
        print('---------- inserting page ' + current + ' into the database... ----------------')
        print('---------- page ' + current + ' inserted successfully ---------------------')
        # sleep for two seconds after finishing a page
        print('---------- finished crawling page ' + current + ', sleeping for two seconds -------')
        time.sleep(2)
        # the last <a> tag in the pagination block points to the next page
        try:
            tag_a_array = sub_page_div[0].find_all('a')
            tag_a_last = tag_a_array[len(tag_a_array) - 1].string
            tag_a_link = tag_a_array[len(tag_a_array) - 1]['href']
        except IndexError:
            tag_a_last = ''
            tag_a_link = ''
        # '下一页' is the site's "next page" link text
        if tag_a_last == '下一页':
            next_url = self._url + tag_a_link
            self.get_subcategories(next_url, parent_id)
        # self.db.close()
        # self.cursor.close()
        # return
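
    # Scrape a single official-account detail page, download its QR code and
    # cover image, and append the assembled row to self.official_accounts_array.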
    def get_thirdcategories(self, url, parent_id):
        request = urllib.request.Request(url, headers=self._headers)
        response = self.opener.open(request)
        # content = response.read().decode('UTF-8')
        content = response.read()
        # f = open("thirdcategory.html", "wb")
        # f.write(content)
        # f.close()
        soup = bs4.BeautifulSoup(content, "html.parser")
        # main block with the official-account details
        sub_nav_ul = soup.find_all("div", class_="newsinfo")
        # QR code of the official account
        try:
            content_img = sub_nav_ul[0].find_all("div", class_="qucode_img")
            tag_img = content_img[0].find_all("img") or ''
        except IndexError:
            tag_img = ''
        # name of the official account
        try:
            tag_h1 = sub_nav_ul[0].find_all("h1", class_="tit_show")
        except IndexError:
            tag_h1 = []
        # remaining details of the official account
        try:
            tag_tit_bot = sub_nav_ul[0].find_all("div", class_="tit_bot")
            tag_lis = tag_tit_bot[0].find_all('li')
        except IndexError:
            tag_lis = []
        # WeChat ID
        try:
            wechat_id = tag_lis[0].find_all('strong')[0].string or ''
        except IndexError:
            wechat_id = ''
        # follower count
        try:
            attention = re.findall(r"(\d+)", str(tag_lis[2]))
            attention_count = attention[0] or ''
        except IndexError:
            attention_count = '0'
        # location
        try:
            chinese_word = re.compile(u"[\u4e00-\u9fa5]+")
            address_array = re.findall(chinese_word, str(tag_lis[4]))
            address = address_array[1] or ''
        except IndexError:
            address = ''
        # date the account was indexed by the site
        try:
            include_time_array = re.findall(r"(\d+)", str(tag_lis[5])) or ''
            include_time = include_time_array[0] + '-' + include_time_array[1] + '-' + include_time_array[2]
        except IndexError:
            include_time = ''
        # download the QR code image
        try:
            qrcode_link = tag_img[0]['src']
            random_str_q = uuid.uuid5(uuid.NAMESPACE_DNS, str(uuid.uuid1()))
            qrcode = str(self.qrcode_url) + str(random_str_q) + '.jpg'
            qr = requests.get(qrcode_link)
            with open(qrcode, 'wb') as outfile:
                outfile.write(qr.content)
            # urllib.request.urlretrieve(qrcode_link)
        except (IndexError, KeyError):
            qrcode = ''
        # WeChat name
        try:
            name = tag_h1[0].string or ''
        except IndexError:
            name = ''
        # account description block
        try:
            sub_description_array = soup.find_all("div", class_="view view_c")
            sub_description_div = sub_description_array[0].find_all('div')
        except IndexError:
            sub_description_div = []
        # download the background cover image
        try:
            background_cover_link = sub_description_div[0].find_all('img')[0]['src']
            random_str_b = uuid.uuid5(uuid.NAMESPACE_DNS, str(uuid.uuid1()))
            background_cover = str(self.cover_url) + str(random_str_b) + '.jpg'
            br = requests.get(background_cover_link)
            with open(background_cover, 'wb') as outfile:
                outfile.write(br.content)
        except (IndexError, KeyError):
            background_cover = ''
        # description text
        try:
            descriptions = sub_description_div[1].string or ''
        except IndexError:
            descriptions = ''
        # link to the official account; keep it only if it points to mp.weixin.qq.com
        try:
            wechat_link = sub_description_div[2].find_all('a')[0]['href'] or ''
            if not re.search('mp.weixin.qq.com', wechat_link):
                wechat_link = ''
        except (IndexError, KeyError):
            wechat_link = ''
        print('WeChat name --------------' + name)
        print('WeChat ID ----------------' + wechat_id)
        print('Followers ----------------' + attention_count)
        print('Location -----------------' + address)
        print('Indexed on ---------------' + include_time)
        print('Parent ID ----------------' + str(parent_id))
        print('Description --------------' + descriptions)
        print('QR code ------------------' + qrcode)
        print('Account link -------------' + wechat_link)
        print('Background cover ---------' + background_cover)
        # assemble the row for batch insertion
        one_account_data = (name, wechat_id, attention_count, address, include_time, parent_id, descriptions, qrcode, wechat_link, background_cover)
        self.official_accounts_array.append(one_account_data)
        print('------------------ ' + name + ' assembled ------------------------')
official_accounts = OfficialAccounts()
official_accounts.get_index_page()
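
# The INSERT statements above assume that three tables already exist in the `wxpn`
# database; this script never creates them. The helper below is a minimal sketch of
# one plausible schema inferred from the column lists used in the INSERTs -- the
# column types and lengths are assumptions, not a schema shipped with this project.
# It is not called anywhere; run it once by hand if the tables are missing.
def create_tables_sketch(db):
    ddl = [
        """CREATE TABLE IF NOT EXISTS official_accounts_categories (
               id INT AUTO_INCREMENT PRIMARY KEY,
               name VARCHAR(255)
           )""",
        """CREATE TABLE IF NOT EXISTS official_accounts_subcategories (
               id INT AUTO_INCREMENT PRIMARY KEY,
               name VARCHAR(255),
               parent_id INT
           )""",
        """CREATE TABLE IF NOT EXISTS official_accounts_thirdcategories (
               id INT AUTO_INCREMENT PRIMARY KEY,
               name VARCHAR(255),
               wechat_id VARCHAR(255),
               attention_count VARCHAR(64),
               address VARCHAR(255),
               include_time VARCHAR(64),
               parent_id INT,
               descriptions TEXT,
               qrcode VARCHAR(512),
               wechat_link VARCHAR(512),
               background_cover VARCHAR(512)
           )"""
    ]
    cursor = db.cursor()
    for statement in ddl:
        cursor.execute(statement)
    db.commit()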