-
Notifications
You must be signed in to change notification settings - Fork 17
/
BaiduIndex.py
193 lines (174 loc) · 8.23 KB
/
BaiduIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
# coding: utf-8
# File: BaiduIndex.py
# Author: lhy<[email protected],https://huangyong.github.io>
# Date: 18-5-23
from selenium import webdriver
from PIL import Image
import requests
import time
import re
import urllib
import pytesseract
import datetime
import os
import urllib.parse
import random
class BaiduIndex:
    """Collect Baidu Index values for a keyword by OCR-ing the digit images.

    The pipeline (see :meth:`spider`) logs in through a Selenium-driven
    Chrome, downloads one sprite image per day, re-assembles the digit
    slices, runs Tesseract on the result and merges the per-year files.
    All working files are written beneath ``<word>/`` relative to the
    current working directory.
    """

    # Character-level corrections applied to lower-cased OCR output:
    # punctuation noise is dropped and look-alike glyphs become digits.
    _OCR_FIXES = str.maketrans({
        "'": None, '!': None, '.': None, ',': None,
        '?': '7', 's': '5', 'e': '8', 'b': '8', 'i': '1',
        '$': '8', 'a': '8', 'n': '11', 'o': '0',
    })

    def __init__(self, user_name, password, chromepath):
        """Store the Baidu credentials and the path handed to ``webdriver.Chrome``."""
        self.user_name = user_name
        self.password = password
        self.chromepath = chromepath
        self.current_dir = os.path.abspath(os.path.dirname(__file__))

    def open_homepage(self, search_word):
        """Open the trend page for *search_word*, submit the login form and
        return the live driver.

        The caller owns the returned driver and is responsible for quitting it.
        """
        # The ``word`` query parameter is sent GB2312 percent-encoded.
        keys = urllib.parse.quote(search_word.encode('gb2312'))
        driver = webdriver.Chrome(self.chromepath)
        try:
            driver.get('http://index.baidu.com/?tpl=trend&word=%s' % keys)
            driver.find_element_by_id("TANGRAM__PSP_4__userName").send_keys(self.user_name)
            driver.find_element_by_id("TANGRAM__PSP_4__password").send_keys(self.password)
            driver.find_element_by_id("TANGRAM__PSP_4__submit").click()
            # Fixed pause after submitting the form before the page is used.
            time.sleep(2)
        except Exception:
            # Do not leave an orphaned browser behind when login fails.
            driver.quit()
            raise
        return driver

    def get_indexinfo(self, search_word, start_date, end_date):
        """Log in and fetch the encoded per-day index tokens.

        Returns a tuple ``(res, res2, req, date_list, header)``: the two
        ``PPval`` page values, the decoded JSON of the ``getSubIndex``
        call, the list of dates in the range and the HTTP headers
        (including the browser's cookies) to reuse for later requests.
        """
        # Parse the dates first so a malformed range fails before a browser starts.
        date_list = self.collect_days(start_date, end_date)
        driver = self.open_homepage(search_word)
        try:
            new_cookies = ';'.join(
                '%s=%s' % (cookie['name'], cookie['value'])
                for cookie in driver.get_cookies()
            )
            res = driver.execute_script('return PPval.ppt;')
            res2 = driver.execute_script('return PPval.res2;')
        finally:
            # Everything needed has been copied out; release the browser.
            driver.quit()
        header = {
            'Host': 'index.baidu.com',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
            'Referer': 'http://index.baidu.com/?tpl=trend&word=%CE%A4%B5%C2',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': new_cookies
        }
        url = 'http://index.baidu.com/Interface/Search/getSubIndex/?res={}&res2={}&type=0&startdate={}&enddate={}&forecast=0&word={}'.format(res, res2, start_date, end_date, search_word)
        req = requests.get(url, headers=header).json()
        return res, res2, req, date_list, header

    def get_image(self, year, search_word, start_date, end_date):
        """Download one digit sprite per day into ``<word>/<year>_tmp1/``.

        Returns ``(region_dict, date_dict)``: a list holding, per day, the
        slice widths and left offsets parsed from the returned markup, and
        a mapping from the running image index to its date string.
        """
        res, res2, req, date_list, header = self.get_indexinfo(search_word, start_date, end_date)
        # One encoded token per day, comma separated, paired with the dates in order.
        res3_list = req['data']['all'][0]['userIndexes_enc'].split(',')
        region_dict = []
        date_dict = {}
        for suffix in ('tmp1', 'tmp2', 'tmp3'):
            # makedirs also creates the parent ``<word>/`` directory when missing.
            os.makedirs('%s/%s_%s' % (search_word, year, suffix), exist_ok=True)
        for date_index, (date, res3) in enumerate(zip(date_list, res3_list)):
            date_dict[date_index] = date
            timestamp = int(time.time())
            viewbox_url = 'http://index.baidu.com/Interface/IndexShow/show/?res=%s&res2=%s&classType=1&res3[]=%s&className=view-value&%s' % (
                res, res2, res3, timestamp)
            payload = requests.get(viewbox_url, headers=header).json()
            print(search_word, date, '请求成功')
            response = payload['data']['code'][0]
            width = [int(x) for x in re.findall(r'width:(.*?)px', response)]
            margin_left = [int(x) for x in re.findall(r'margin-left:-(.*?)px', response)]
            region_dict.append({'width': width, 'margin_left': margin_left})
            img_url = 'http://index.baidu.com' + re.findall(r'url\("(.*?)"', response)[0]
            img_content = requests.get(img_url, headers=header)
            # Random sub-second pause between consecutive requests.
            time.sleep(random.uniform(0, 1))
            if img_content.status_code == requests.codes.ok:
                with open('%s/%s_tmp1/%s.png' % (search_word, year, date_index), 'wb') as file:
                    file.write(img_content.content)
                print(search_word, date, '下载成功')
        return region_dict, date_dict

    def decode_image(self, keyword, year, region_dict, date_dict):
        """Re-assemble each downloaded sprite into a readable number image.

        For every entry of *region_dict* the slices described by its
        ``margin_left``/``width`` pairs are cut out of
        ``<keyword>/<year>_tmp1/<index>.png`` and pasted side by side; the
        result is saved as ``<keyword>/<year>_tmp2/<date>.png``.
        """
        for index, region in enumerate(region_dict):
            with Image.open('%s/%s_tmp1/%s.png' % (keyword, year, index)) as code:
                hight = code.size[1]
                target = Image.new('RGB', (sum(region['width']), hight))
                offset = 0  # x position of the next slice in the target
                for left, width in zip(region['margin_left'], region['width']):
                    img = code.crop((left, 0, left + width, hight))
                    target.paste(img, (offset, 0))
                    offset += width
            target.save('%s/%s_tmp2/%s.png' % (keyword, year, date_dict[index]))

    def transwrite_image(self, year, word):
        """OCR every image under ``<word>/<year>_tmp2`` and write
        ``<date>\\t<value>`` lines to ``<word>/<year>_index.txt``."""
        with open('%s/%s_index.txt' % (word, year), 'w+') as f:
            for root, dirs, files in os.walk('%s/%s_tmp2' % (word, year)):
                for file in files:
                    filepath = os.path.join(root, file)
                    date = file.split('.')[0]
                    num = self.char_to_num(word, year, filepath)
                    f.write(date + '\t' + num + '\n')

    @staticmethod
    def _clean_ocr_text(text):
        """Normalise raw OCR output: drop all whitespace, lower-case, then
        apply the look-alike corrections in ``_OCR_FIXES``."""
        compact = ''.join(text.split()).lower()
        return compact.translate(BaiduIndex._OCR_FIXES)

    def char_to_num(self, keyword, year, filepath):
        """Recognise the number shown in the image at *filepath*.

        The image is enlarged to twice its size, stored as
        ``<keyword>/<year>_tmp3/<stem>.jpg`` and passed to Tesseract.
        Returns the cleaned text, or ``'error'`` when nothing is recognised.
        """
        with Image.open(filepath) as jpgzoom:
            (x, y) = jpgzoom.size
            # LANCZOS is the same filter formerly exposed as ANTIALIAS,
            # a name that newer Pillow releases no longer provide.
            out = jpgzoom.resize((2 * x, 2 * y), Image.LANCZOS)
        # basename handles both '/' and the platform separator from os.path.join.
        file = os.path.basename(filepath).split('.')[0]
        out.save('%s/%s_tmp3/%s.jpg' % (keyword, year, file), "JPEG", quality=100)
        num = self._clean_ocr_text(pytesseract.image_to_string(out))
        if not num:
            return 'error'
        print(num)
        return num

    def collect_days(self, start_date, end_date):
        """Return every date from *start_date* to *end_date* inclusive as
        ``YYYY-MM-DD`` strings; the list is empty when the range is reversed."""
        first = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        last = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        return [
            (first + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range((last - first).days + 1)
        ]

    def merge_index(self, word):
        """Merge all ``<word>/*_index.txt`` files into ``<word>/<word>.txt``.

        Records are ``<date>\\t<value>`` lines. The output is sorted by
        date; when a date occurs more than once the last record read wins.
        Blank lines and lines without a tab separator are skipped.
        """
        index_paths = sorted('%s/%s' % (word, file) for file in os.listdir(word) if file.endswith('_index.txt'))
        index_dict = {}
        for filepath in index_paths:
            with open(filepath) as source:
                for line in source:
                    fields = line.strip().split('\t')
                    if len(fields) < 2:
                        continue
                    index_dict[int(fields[0].replace('-', ''))] = fields[1]
        with open('%s/%s.txt' % (word, word), 'w+') as f:
            for key, value in sorted(index_dict.items()):
                stamp = str(key)
                f.write(stamp[:4] + '-' + stamp[4:6] + '-' + stamp[6:] + '\t' + value + '\n')

    def spider(self, year, word, start_date, end_date):
        """Run the whole pipeline for *word*: download, decode, OCR, merge."""
        print('step1, spider data..')
        region_dict, date_dict = self.get_image(year, word, start_date, end_date)
        print('step2, deocde image..')
        self.decode_image(word, year, region_dict, date_dict)
        print('step3, transfer image..')
        self.transwrite_image(year, word)
        print('step4, merge index..')
        self.merge_index(word)