Skip to content

Commit e78eba2

Browse files
authored May 30, 2024
Update spider.py
1 parent 1b1b7b4 commit e78eba2

File tree

1 file changed

+40
-10
lines changed

1 file changed

+40
-10
lines changed
 

‎spider.py

+40-10
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,16 @@
55
from os.path import exists
66

77
# Target stock (by its short Chinese name on cninfo) and which announcement
# category to download.
stock = '博思软件'
# Known category keywords; kept for reference when switching `announcement`.
announcement_list = ['分红派息实施公告', '利润分配预案', '年度报告', '半年度|季度', '招股说明书']
announcement = '季度报告'
# Regex of title fragments to skip: abstracts, cancelled, notice-only announcements.
ban = '摘要|已取消|提示性公告'
RESULTS_DIR = f'D:\\报告\\{stock}\\{announcement}'
# exist_ok=True is idempotent and avoids the check-then-create race of the
# original `exists(RESULTS_DIR) or makedirs(RESULTS_DIR)` idiom.
makedirs(RESULTS_DIR, exist_ok=True)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0'}
# Stock code -> orgId lookup table.
orgid_url = 'http://www.cninfo.com.cn/new/data/szse_stock.json'
# Paginated historical-announcement query endpoint.
url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
# Base URL that adjunctUrl paths are joined onto.
DETAIL_URL = 'http://static.cninfo.com.cn/'

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
1719

1820

@@ -24,25 +26,23 @@ def get_orgid():
2426
stock_lists = orgids['stockList']
2527
for stock_list in stock_lists:
2628
if stock_list['zwjc'] == stock:
27-
logging.info(f'获得股票信息: {stock}')
2829
return {
2930
'code': stock_list['code'],
3031
'orgid': stock_list['orgId']
3132
}
3233

3334

3435
def get_pdf_url(page, data):
35-
"""获得年报及招股说明书pdf下载信息"""
36+
"""获得公告的pdf下载信息"""
3637
code = data.get('code')
3738
orgid = data.get('orgid')
38-
3939
post_data = {
4040
'stock': f'{code},{orgid}',
4141
'tabName': 'fulltext',
4242
'pageSize': 30,
4343
'pageNum': page,
4444
'column': 'szse',
45-
'category': 'category_ndbg_szsh;category_sf_szsh;',
45+
'category': '',
4646
'plate': 'sz',
4747
'seDate': '',
4848
'searchkey': '',
@@ -58,18 +58,44 @@ def get_pdf_url(page, data):
5858
dats = an.get('announcements')
5959
stock_list = []
6060
for dat in dats:
61-
if re.search('摘要|已取消', dat['announcementTitle']):
61+
if re.search(ban, dat['announcementTitle']):
6262
continue
63-
elif re.search('招股说明书|年度报告', dat['announcementTitle']):
63+
elif re.search(announcement, dat['announcementTitle']):
6464
stock_list.append({
6565
'announcementTitle': dat['announcementTitle'],
6666
'adjunctUrl': dat['adjunctUrl']
6767
})
6868
return stock_list
6969

7070

71+
def get_totalpages(data):
    """Return the total number of result pages for the stock's announcements.

    Args:
        data: dict with 'code' and 'orgid' keys (as returned by get_orgid).

    Returns:
        int: page count reported by the cninfo query API; 0 when the field
        is absent, so callers iterating range(1, pages + 1) simply do nothing
        instead of raising TypeError on None.
    """
    code = data.get('code')
    orgid = data.get('orgid')
    post_data = {
        'stock': f'{code},{orgid}',
        'tabName': 'fulltext',
        'pageSize': 30,
        'pageNum': 1,  # only the first page is needed to read the page count
        'column': 'szse',
        'category': '',
        'plate': 'sz',
        'seDate': '',
        'searchkey': '',
        'secid': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true'
    }
    with httpx.Client(headers=headers) as client:
        res = client.post(url, data=post_data)
        # Fail fast on HTTP errors instead of trying to json-decode an error page.
        res.raise_for_status()
        an = res.json()
    return an.get('totalpages') or 0
95+
96+
7197
def save_pdf(datas):
72-
"""保存年报pdf"""
98+
"""保存公告pdf"""
7399
for data in datas:
74100
part_url = data.get('adjunctUrl')
75101
name = data.get('announcementTitle')
@@ -84,9 +110,13 @@ def save_pdf(datas):
84110

85111

86112
def main():
    """Fetch the page count, then download every matching announcement PDF."""
    # Resolve the stock's code/orgId exactly once; the original called
    # get_orgid() again inside the loop, issuing one redundant HTTP
    # request per page.
    org = get_orgid()
    pages = get_totalpages(org)
    logging.info(f'一共{pages}页公告信息...')
    for page in range(1, pages + 1):
        pdfdata = get_pdf_url(page, org)
        logging.info(f'获得第{page}页股票信息...')
        save_pdf(pdfdata)
    logging.info('下载完成')
90120

91121

92122
if __name__ == '__main__':

0 commit comments

Comments
 (0)
Please sign in to comment.