-
Notifications
You must be signed in to change notification settings - Fork 2
/
de_mitmproxy.py
44 lines (40 loc) · 1.9 KB
/
de_mitmproxy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
import logging
import os
import re
import time

import mitmproxy.http
def validateTitle(title):
    """Return ``title`` rewritten so it can be used as a file name.

    Punctuation that Windows reserves in file names is replaced by an
    underscore, and spaces plus ASCII control characters (newline, carriage
    return, tab, NUL, ...) are removed entirely.

    Args:
        title: The text to sanitise.

    Returns:
        The sanitised text; an empty string stays empty.
    """
    # Reserved in Windows file names: / \ : * ? " < > |
    new_title = re.sub(r'[/\\:*?"<>|]', "_", title)
    # Control characters are dropped together with spaces: they are not
    # allowed in Windows file names, and NUL makes open() raise everywhere.
    return re.sub(r"[\x00-\x1f ]", "", new_title)
class DouyinCrawl:
    """mitmproxy addon that dumps intercepted definition entries to disk.

    Every successful response whose URL contains ``TARGET_URL`` is parsed as
    JSON; when the payload is a list, each element is written to its own
    ``.json`` file inside ``SAVE_DIR``. Despite the class name, the only URL
    matched here is the jikipedia ``browse_definitions`` endpoint.
    """

    # URL fragment identifying the responses to capture.
    TARGET_URL = "api.jikipedia.com/go/browse_definitions"
    # Output directory, relative to the current working directory.
    SAVE_DIR = "saveData"

    _log = logging.getLogger(__name__)

    def __init__(self):
        """Create the output directory if it does not exist yet."""
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(self.SAVE_DIR, exist_ok=True)

    def response(self, flow: mitmproxy.http.HTTPFlow):
        """Save each entry of a matching response as an individual JSON file.

        Args:
            flow: The intercepted flow; its request URL selects the responses
                of interest and its response body supplies the JSON payload.
        """
        if self.TARGET_URL not in flow.request.url:
            return

        resp = flow.response
        # Only a 200 status is treated as a response carrying usable content.
        if resp.status_code != 200:
            return

        try:
            # json.loads turns the JSON text into Python objects.
            data_obj = json.loads(resp.text)
        except (TypeError, ValueError) as exc:
            # Body missing, undecodable, or not valid JSON.
            self._log.warning("Ignoring unparsable response body: %r", exc)
            return

        if not isinstance(data_obj, list):
            return

        for item in data_obj:
            try:
                title = (
                    str(item["id"])
                    + "_"
                    + str(item["term"]["id"])
                    + "_"
                    + item["term"]["title"]
                )
                # Make the title acceptable as a file name.
                title = validateTitle(title)
                path = os.path.join(self.SAVE_DIR, title + ".json")
                # ensure_ascii=False keeps Chinese text readable instead of
                # \uXXXX escapes, so the file encoding is pinned to UTF-8
                # rather than left to the platform's locale default.
                with open(path, "w", encoding="utf-8") as f:
                    json.dump(item, f, ensure_ascii=False)
            except (KeyError, TypeError, ValueError, OSError) as exc:
                # Best effort: one malformed or unwritable entry must not
                # stop the remaining entries from being saved.
                self._log.warning("Skipping entry that could not be saved: %r", exc)
# Module-level ``addons`` list: the addon instance registered with mitmproxy
# when this script is loaded.
addons = [DouyinCrawl()]