This repository was archived by the owner on Sep 6, 2025. It is now read-only.

Commit 904af59 (parent: 57de7df)

feat: output crawler abnormal record

File tree: 5 files changed, +99 −53 lines

src/acquire_fund_quarter.py

Lines changed: 39 additions & 38 deletions

@@ -22,6 +22,7 @@
 from sql_model.fund_query import FundQuery
 from utils.driver import create_chrome_driver
 from utils.index import bootstrap_thread
+from utils.file_op import read_error_code_from_json, write_fund_json_data
 from utils.login import login_morning_star
 
 # fetch the total assets of peer funds via the API
@@ -44,13 +45,17 @@ def get_total_asset(fund_code, platform):
 def acquire_fund_quarter():
     lock = Lock()
     each_fund_query = FundQuery()
-
     idWorker = IdWorker()
-    result_dir = './output/'
-    fund_csv = FundCSV(result_dir)
-    fund_csv.write_season_catch_fund(True)
-    fund_csv.write_abnormal_url_fund(True)
-
+    # result_dir = './output/'
+    # fund_csv = FundCSV(result_dir)
+    # fund_csv.write_season_catch_fund(True)
+    # fund_csv.write_abnormal_url_fund(True)
+    err_info = read_error_code_from_json()
+    error_funds_with_page = err_info.get('error_funds_with_page')
+    error_funds_with_found_date = err_info.get('error_funds_with_found_date')
+    error_funds_with_unmatch = err_info.get('error_funds_with_unmatch')
+    filename = err_info.get('filename')
+    file_dir = err_info.get('file_dir')
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
         chrome_driver = create_chrome_driver()
@@ -62,29 +67,30 @@ def crawlData(start, end):
            results = each_fund_query.select_quarter_fund(
                page_start, page_limit)
            for record in results:
-                sleep(1)
-                # 0P000179WG
-                # 001811 中欧明睿新常态混合A
-                each_fund = FundSpider(
-                    record[0], record[1], record[2], chrome_driver)
+                fund_code = record[0]
+                if fund_code in error_funds_with_page or fund_code in error_funds_with_found_date or fund_code in error_funds_with_unmatch:
+                    print('error fund: ', fund_code)
+                    continue
+                each_fund = FundSpider(fund_code, record[1], record[2], chrome_driver)
                 each_fund.set_found_data(record[3])
                 is_error_page = each_fund.go_fund_url()
                 # can we reach the fund detail page? if not, write to csv and skip this iteration
                 if is_error_page == True:
                     # error_funds.append(each_fund.fund_code)
-                    fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
-                                  each_fund.fund_name, record[3], page_start, '页面跳转有问题']
-                    output_line = ', '.join(str(x)
-                                            for x in fund_infos) + '\n'
-                    fund_csv.write_abnormal_url_fund(False, output_line)
+                    # fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
+                    #               each_fund.fund_name, record[3], page_start, '页面跳转有问题']
+                    # output_line = ', '.join(str(x)
+                    #                         for x in fund_infos) + '\n'
+                    # fund_csv.write_abnormal_url_fund(False, output_line)
+                    error_funds_with_page.append(each_fund.fund_code)
 
                     continue
                 # start crawling the data
                 quarter_index = each_fund.get_quarter_index()  # data update time; if it differs, skip the data below
                 if quarter_index != each_fund.quarter_index:
-                    print('quarter_index', quarter_index, each_fund.update_date,
-                          each_fund.fund_code, each_fund.fund_name)
+                    # print('quarter_index', quarter_index, each_fund.update_date,
+                    #       each_fund.fund_code, each_fund.fund_name)
+                    error_funds_with_unmatch.append(each_fund.fund_code)
                     continue
 
                 each_fund.get_fund_season_info()  # basic quarterly data
@@ -95,14 +101,14 @@ def crawlData(start, end):
                 if each_fund.stock_position['total'] != '0.00' and each_fund.total_asset != None:
                     each_fund.get_asset_composition_info()
                 # if an exception was caught while crawling, store it in the csv
-                if each_fund._is_trigger_catch == True:
-                    fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
-                                  each_fund.fund_name, record[3],
-                                  each_fund.stock_position['total'],
-                                  page_start, each_fund._catch_detail]
-                    output_line = ', '.join(str(x)
-                                            for x in fund_infos) + '\n'
-                    fund_csv.write_season_catch_fund(False, output_line)
+                # if each_fund._is_trigger_catch == True:
+                #     fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
+                #                   each_fund.fund_name, record[3],
+                #                   each_fund.stock_position['total'],
+                #                   page_start, each_fund._catch_detail]
+                #     output_line = ', '.join(str(x)
+                #                             for x in fund_infos) + '\n'
+                #     fund_csv.write_season_catch_fund(False, output_line)
                 # write to the database
                 lock.acquire()
                 snow_flake_id = idWorker.get_id()
@@ -157,7 +163,6 @@ def crawlData(start, end):
                     'morning_star_rating_5': each_fund.morning_star_rating.get(5),
                     'morning_star_rating_10': each_fund.morning_star_rating.get(10),
                 }
-
                 # store the top-ten stock positions
                 stock_position_total = each_fund.stock_position.get(
                     'total', '0.00')
@@ -192,7 +197,6 @@ def crawlData(start, end):
                     item_code = item[0]
                     if item_code == each_fund.fund_code:
                         continue
-                    print("item_code", item_code, platform)
                     total_asset = get_total_asset(item_code, platform)
                     if total_asset != None:
                         init_total_asset = init_total_asset - total_asset
@@ -233,25 +237,22 @@ def crawlData(start, end):
            raise BaseException
        chrome_driver.close()
     thread_count = 6
-
-    # for count in range(6):
     total_start_time = time()
     # record_total = each_fund_query.select_quarter_fund_total()  # record count
-    # print("record_total", record_total)
     # bootstrap_thread(crawlData, record_total, thread_count)
-
-    for i in range(3):
-        print("i", i)
+    record_total = each_fund_query.select_quarter_fund_total()  # record count
+    for i in range(2):
         start_time = time()
-        record_total = each_fund_query.select_quarter_fund_total()  # record count
         print('record_total', record_total)
         try:
             bootstrap_thread(crawlData, record_total, thread_count)
         except:
-            end_time = time()
-            print("耗时: {:.2f}秒".format(end_time - start_time))
+            cur_total = each_fund_query.select_quarter_fund_total()  # record count
+            print('crawler item count:', record_total - cur_total)
+            record_total = cur_total
         end_time = time()
         print("耗时: {:.2f}秒".format(end_time - start_time))
+    write_fund_json_data({'error_funds_with_page': error_funds_with_page, 'error_funds_with_found_date': error_funds_with_found_date, 'error_funds_with_unmatch': error_funds_with_unmatch}, filename=filename, file_dir=file_dir)
     total_end_time = time()
     print("total耗时: {:.2f}秒".format(total_end_time - total_start_time))
     exit()
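
Note: the driver loop above replaces per-round debug prints with progress accounting: after a crashed round it re-queries the remaining record count, reports how many items were crawled, and retries with the remainder; the three abnormal-fund lists are persisted once, after the loop. A minimal standalone sketch of this retry-with-progress pattern (count_remaining and run_batch are hypothetical stand-ins for select_quarter_fund_total and bootstrap_thread):

    from time import time

    def crawl_with_retries(count_remaining, run_batch, max_rounds=2):
        # count_remaining(): how many records still need crawling
        # run_batch(total): crawls `total` records; may raise mid-batch
        record_total = count_remaining()
        for _ in range(max_rounds):
            start_time = time()
            print('record_total', record_total)
            try:
                run_batch(record_total)
            except Exception:
                # Crashed mid-batch: progress equals the drop in the
                # remaining-record count; retry with what is left.
                cur_total = count_remaining()
                print('crawler item count:', record_total - cur_total)
                record_total = cur_total
            print('elapsed: {:.2f}s'.format(time() - start_time))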

src/crud/query.py

Lines changed: 5 additions & 2 deletions

@@ -11,7 +11,7 @@
 import sys
 
 sys.path.append('./src')
-from sqlalchemy import and_
+from sqlalchemy import and_, or_
 from sqlalchemy.orm import Session
 
 from models.fund import FundBase, FundQuarter
@@ -53,9 +53,12 @@ def query_empty_company_and_found_date_fund(start, size):
     all_funds = session.query(FundBase).where(FundBase.company == None, FundBase.found_date == None, FundBase.is_archive==0).offset(start).limit(size).all()
     return all_funds
 
+def query_empty_company_or_found_date_fund(start, size):
+    all_funds = session.query(FundBase).where(FundBase.is_archive==0).filter(or_(FundBase.company == None, FundBase.found_date == None)).offset(start).limit(size).all()
+    return all_funds
 if __name__ == '__main__':
     quarter_index = '2022-Q2'
     # fund_list = query_high_score_funds(quarter_index)
-    query_empty_company_and_found_date_fund(2, 10)
+    query_empty_company_or_found_date_fund(0, 5000)
     # print("fund_list",fund_list)
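
Note: the new query widens the old one. Passing several conditions to SQLAlchemy's .where() ANDs them, so query_empty_company_and_found_date_fund only matches funds missing both company and found_date; the or_() version matches funds missing either field. A sketch of the difference, with the session and FundBase model injected as parameters since they are defined elsewhere in this module:

    from sqlalchemy import or_

    def missing_both(session, FundBase):
        # AND semantics: company IS NULL AND found_date IS NULL AND is_archive = 0
        return session.query(FundBase).where(
            FundBase.company == None, FundBase.found_date == None,
            FundBase.is_archive == 0)

    def missing_either(session, FundBase):
        # OR semantics: is_archive = 0 AND (company IS NULL OR found_date IS NULL)
        return session.query(FundBase).where(FundBase.is_archive == 0).filter(
            or_(FundBase.company == None, FundBase.found_date == None))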

src/fund_info/crawler.py

Lines changed: 9 additions & 3 deletions

@@ -61,14 +61,20 @@ def go_fund_url(self, cookie_str=None):
             self.morning_star_code
 
         self._chrome_driver.get(morning_fund_selector_url)  # open the page to crawl
-        sleep(6)
+        sleep(5)
         # did the page error out or redirect? if so, skip it
         if self._chrome_driver.current_url == 'https://www.morningstar.cn/errors/defaulterror.html':
             return True
-        while self._chrome_driver.page_source == None:
+        if 'Value cannot be null' in self._chrome_driver.title:
+            return True
+        try_count = 5
+        while self._chrome_driver.page_source == None and try_count > 0:
             self._chrome_driver.refresh()
             print('wait:fund_code', self.fund_code)
             sleep(9)
+            try_count -= 1
+        if self._chrome_driver.page_source == None:
+            return True
         return False
         # self._chrome_driver.execute_script('location.reload()')
 
@@ -140,7 +146,7 @@ def get_fund_base_info(self):
     # get the fund manager info (if several managers are in office, only the first is needed)
     def get_fund_manager_info(self):
         manager_ele_list = self._chrome_driver.find_element(By.ID,
-            'qt_manager').find_elements_by_xpath("ul")
+            'qt_manager').find_elements(By.XPATH, "ul")
         for manager_ele in manager_ele_list:
             try:
                 # fund manager
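
Note: go_fund_url previously looped forever while page_source was empty; it now gives up after five refreshes and also bails out early when Morningstar serves its error page or a "Value cannot be null" title. (The second hunk is the Selenium 4 API migration: find_elements(By.XPATH, ...) replaces the removed find_elements_by_xpath helper.) A driver-free sketch of the bounded-retry shape, where load and loaded are hypothetical callables standing in for driver.refresh and the page_source check:

    from time import sleep

    def load_with_bounded_retries(load, loaded, max_tries=5, wait_seconds=9):
        # Returns True on failure, matching go_fund_url's error convention.
        try_count = max_tries
        while not loaded() and try_count > 0:
            load()          # e.g. driver.refresh()
            sleep(wait_seconds)
            try_count -= 1
        # Still no page source after max_tries refreshes: report an error page.
        return not loaded()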

src/sync_fund_base.py

Lines changed: 23 additions & 8 deletions

@@ -14,11 +14,14 @@
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 
-from crud.query import query_all_fund, query_empty_company_and_found_date_fund
-from models.fund import FundBase
+from crud.query import (query_all_fund,
+                        query_empty_company_and_found_date_fund,
+                        query_empty_company_or_found_date_fund)
 from fund_info.crawler import FundSpider
-from utils.index import bootstrap_thread
+from models.fund import FundBase
 from utils.driver import create_chrome_driver, text_to_be_present_in_element
+from utils.file_op import read_error_code_from_json, write_fund_json_data
+from utils.index import bootstrap_thread
 from utils.login import login_morning_star
 
 
@@ -94,8 +97,13 @@ def sync_fund_base(page_index):
     print('end')
 
 def further_complete_base_info():
-    all_funds = query_empty_company_and_found_date_fund(0, 10000)
-    error_funds = []
+    all_funds = query_empty_company_or_found_date_fund(0, 10000)
+    err_info = read_error_code_from_json()
+    error_funds_with_page = err_info.get('error_funds_with_page')
+    error_funds_with_found_date = err_info.get('error_funds_with_found_date')
+    error_funds_with_unmatch = err_info.get('error_funds_with_unmatch')
+    filename = err_info.get('filename')
+    file_dir = err_info.get('file_dir')
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
         chrome_driver = create_chrome_driver()
@@ -109,19 +117,21 @@ def crawlData(start, end):
            # results = query_empty_company_and_found_date_fund(page_start, page_limit)
            for record in results:
                fund_code = record.fund_code
+                if fund_code in error_funds_with_page or fund_code in error_funds_with_found_date:
+                    continue
                morning_star_code = record.morning_star_code
                fund_name = record.fund_name
                each_fund = FundSpider(fund_code, morning_star_code, fund_name, chrome_driver)
                # can we reach the fund detail page?
                is_error_page = each_fund.go_fund_url()
                if is_error_page == True:
-                    error_funds.append(each_fund.fund_code)
+                    error_funds_with_page.append(each_fund.fund_code)
                    continue
                each_fund.get_fund_base_info()
                # drop funds with no inception date
                if each_fund.found_date == '-' or each_fund.found_date == None:
                    # lock.acquire()
-                    error_funds.append(each_fund.fund_code)
+                    error_funds_with_found_date.append(each_fund.fund_code)
                    # lock.release()
                    continue
                # assemble the data needed for the SQL
@@ -138,7 +148,12 @@ def crawlData(start, end):
            print('page_start', page_start)
            page_start = page_start + page_limit
        chrome_driver.close()
-    bootstrap_thread(crawlData, len(all_funds), 3)
+    try:
+        bootstrap_thread(crawlData, len(all_funds), 6)
+        write_fund_json_data({'error_funds_with_page': error_funds_with_page, 'error_funds_with_found_date': error_funds_with_found_date, 'error_funds_with_unmatch': error_funds_with_unmatch}, filename=filename, file_dir=file_dir)
+    except:
+        write_fund_json_data({'error_funds_with_page': error_funds_with_page, 'error_funds_with_found_date': error_funds_with_found_date, 'error_funds_with_unmatch': error_funds_with_unmatch}, filename=filename, file_dir=file_dir)
+
 if __name__ == '__main__':
     #127, 300, 600-
     page_index = 1
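
Note: write_fund_json_data is called in both the try and the except branch so the error lists collected so far are persisted whether or not a crawl thread raises. A try/finally would express that once, though unlike the bare except: above it re-raises the exception rather than swallowing it; a sketch under that assumption:

    def run_and_always_persist(crawl, persist):
        # crawl(): wraps bootstrap_thread(crawlData, len(all_funds), 6)
        # persist(): wraps the write_fund_json_data(...) call
        try:
            crawl()
        finally:
            # Runs on success and on failure, so collected error
            # records are never lost; exceptions still propagate.
            persist()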

src/utils/file_op.py

Lines changed: 23 additions & 2 deletions

@@ -7,12 +7,15 @@
 -----
 Copyright (c) 2021 Camel Lu
 '''
+import json
 import os
 import time
 
 import pandas as pd
 from openpyxl import load_workbook
 
+from .index import get_last_quarter_str
+
 
 # write a json file
 def write_fund_json_data(data, filename, file_dir=None):
@@ -30,8 +33,6 @@ def write_fund_json_data(data, filename, file_dir=None):
 def read_dir_all_file(path):
     return os.listdir(path)
 
-
-
 def update_xlsx_file(path, df_data, sheet_name):
     try:
         if os.path.exists(path):
@@ -110,3 +111,23 @@ def update_xlsx_file_with_insert(path, df_data, sheet_name, index = 0):
     except BaseException:
         print("path", path)
         raise BaseException('更新excel失败')
+
+def read_error_code_from_json():
+    quarter_index = get_last_quarter_str()
+    filename = 'error_funds_' + quarter_index + '.json'
+    file_dir = './output/json/'
+    error_funds_with_page = []
+    error_funds_with_unmatch = []
+    error_funds_with_found_date = []
+    if os.path.exists(file_dir + filename):
+        with open(file_dir + filename) as json_file:
+            my_data = json.load(json_file)
+            error_funds_with_page = my_data.get('error_funds_with_page')
+            error_funds_with_found_date = my_data.get('error_funds_with_found_date')
+    return {
+        "file_dir": file_dir,
+        "filename": filename,
+        'error_funds_with_unmatch': error_funds_with_unmatch,
+        'error_funds_with_page': error_funds_with_page,
+        'error_funds_with_found_date': error_funds_with_found_date
+    }
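
Note: read_error_code_from_json pairs with write_fund_json_data to round-trip a per-quarter error file such as ./output/json/error_funds_2022-Q3.json (the quarter string comes from get_last_quarter_str(); 2022-Q3 is illustrative). Only error_funds_with_page and error_funds_with_found_date are loaded back; error_funds_with_unmatch always starts empty, so funds with an unmatched quarter index are retried on the next run. A usage sketch, assuming it runs from the repo root so utils.file_op imports (the fund code is made up):

    from utils.file_op import read_error_code_from_json, write_fund_json_data

    err_info = read_error_code_from_json()   # empty lists on a quarter's first run
    err_info['error_funds_with_page'].append('000001')  # hypothetical fund code

    # Persist; the next run picks the lists back up and skips those funds.
    write_fund_json_data(
        {key: err_info[key] for key in (
            'error_funds_with_page',
            'error_funds_with_found_date',
            'error_funds_with_unmatch')},
        filename=err_info['filename'],   # error_funds_<quarter>.json
        file_dir=err_info['file_dir'],   # ./output/json/
    )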
