 from sql_model.fund_query import FundQuery
 from utils.driver import create_chrome_driver
 from utils.index import bootstrap_thread
+from utils.file_op import read_error_code_from_json, write_fund_json_data
 from utils.login import login_morning_star

 # Fetch the assets of same-category funds via the API
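
The new import pulls two helpers from utils.file_op that this diff does not show. Below is a minimal sketch consistent with how they are called in the rest of the patch; the default filename, directory, and encoding are assumptions, not the project's actual implementation:

    # Hypothetical sketch of utils/file_op.py, inferred from the call sites in
    # this diff; default path values are assumptions.
    import json
    import os


    def read_error_code_from_json(filename='error_code.json', file_dir='./output/'):
        # Return the persisted error lists, plus the location they came from,
        # so the caller can write the updated lists back to the same file.
        path = os.path.join(file_dir, filename)
        data = {}
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        return {
            'error_funds_with_page': data.get('error_funds_with_page', []),
            'error_funds_with_found_date': data.get('error_funds_with_found_date', []),
            'error_funds_with_unmatch': data.get('error_funds_with_unmatch', []),
            'filename': filename,
            'file_dir': file_dir,
        }


    def write_fund_json_data(data, filename, file_dir='./output/'):
        # Persist the error lists as JSON so the next run can skip known-bad funds.
        os.makedirs(file_dir, exist_ok=True)
        with open(os.path.join(file_dir, filename), 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
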
@@ -44,13 +45,17 @@ def get_total_asset(fund_code, platform):
 def acquire_fund_quarter():
     lock = Lock()
     each_fund_query = FundQuery()
-
     idWorker = IdWorker()
-    result_dir = './output/'
-    fund_csv = FundCSV(result_dir)
-    fund_csv.write_season_catch_fund(True)
-    fund_csv.write_abnormal_url_fund(True)
-
+    # result_dir = './output/'
+    # fund_csv = FundCSV(result_dir)
+    # fund_csv.write_season_catch_fund(True)
+    # fund_csv.write_abnormal_url_fund(True)
+    err_info = read_error_code_from_json()
+    error_funds_with_page = err_info.get('error_funds_with_page')
+    error_funds_with_found_date = err_info.get('error_funds_with_found_date')
+    error_funds_with_unmatch = err_info.get('error_funds_with_unmatch')
+    filename = err_info.get('filename')
+    file_dir = err_info.get('file_dir')
     def crawlData(start, end):
         login_url = 'https://www.morningstar.cn/membership/signin.aspx'
         chrome_driver = create_chrome_driver()
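
The dict returned by read_error_code_from_json() is expected to carry the three skip lists plus the location to write them back to, roughly in this shape (the fund codes are placeholders, not real data):

    # Assumed shape of err_info; codes and file location are illustrative only.
    err_info = {
        'error_funds_with_page': ['000001'],      # detail page failed to load
        'error_funds_with_found_date': [],        # inception date could not be matched
        'error_funds_with_unmatch': ['000002'],   # quarter index out of date
        'filename': 'error_code.json',            # assumed; not shown in the diff
        'file_dir': './output/',                  # assumed; not shown in the diff
    }
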
@@ -62,29 +67,30 @@ def crawlData(start, end):
         results = each_fund_query.select_quarter_fund(
             page_start, page_limit)
         for record in results:
-            sleep(1)
-            # 0P000179WG
-            # 001811 中欧明睿新常态混合A
-            each_fund = FundSpider(
-                record[0], record[1], record[2], chrome_driver)
-
+            fund_code = record[0]
+            if fund_code in error_funds_with_page or fund_code in error_funds_with_found_date or fund_code in error_funds_with_unmatch:
+                print('error fund: ', fund_code)
+                continue
+            each_fund = FundSpider(fund_code, record[1], record[2], chrome_driver)
             each_fund.set_found_data(record[3])
             is_error_page = each_fund.go_fund_url()
             # Whether the fund detail page loads normally; if not, write it out and skip this iteration
             if is_error_page == True:
                 # error_funds.append(each_fund.fund_code)
-                fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
-                              each_fund.fund_name, record[3], page_start, '页面跳转有问题']
-                output_line = ', '.join(str(x)
-                                        for x in fund_infos) + '\n'
-                fund_csv.write_abnormal_url_fund(False, output_line)
+                # fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
+                #               each_fund.fund_name, record[3], page_start, '页面跳转有问题']
+                # output_line = ', '.join(str(x)
+                #                         for x in fund_infos) + '\n'
+                # fund_csv.write_abnormal_url_fund(False, output_line)
+                error_funds_with_page.append(each_fund.fund_code)

                 continue
             # Start crawling the data
             quarter_index = each_fund.get_quarter_index()  # data update time; if it does not match, skip the crawl below
             if quarter_index != each_fund.quarter_index:
-                print('quarter_index', quarter_index, each_fund.update_date,
-                      each_fund.fund_code, each_fund.fund_name)
+                # print('quarter_index', quarter_index, each_fund.update_date,
+                #       each_fund.fund_code, each_fund.fund_name)
+                error_funds_with_unmatch.append(each_fund.fund_code)
                 continue

             each_fund.get_fund_season_info()  # basic quarterly data
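
The three chained `in` checks added above scan a Python list on every record. A set union built once before the loop keeps the same skip behaviour with O(1) lookups; a small refactoring sketch using the names from this diff:

    # One O(1)-lookup skip set instead of three O(n) list scans per record.
    skip_codes = set(error_funds_with_page) \
        | set(error_funds_with_found_date) \
        | set(error_funds_with_unmatch)

    for record in results:
        fund_code = record[0]
        if fund_code in skip_codes:
            print('error fund: ', fund_code)
            continue
        # ... crawl as above; a newly failed code should be added to both
        # skip_codes and the matching error list so it persists via the JSON file.
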
@@ -95,14 +101,14 @@ def crawlData(start, end):
             if each_fund.stock_position['total'] != '0.00' and each_fund.total_asset != None:
                 each_fund.get_asset_composition_info()
             # Whether any exception was caught while crawling; if so, save it to csv
-            if each_fund._is_trigger_catch == True:
-                fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
-                              each_fund.fund_name, record[3],
-                              each_fund.stock_position['total'],
-                              page_start, each_fund._catch_detail]
-                output_line = ', '.join(str(x)
-                                        for x in fund_infos) + '\n'
-                fund_csv.write_season_catch_fund(False, output_line)
+            # if each_fund._is_trigger_catch == True:
+            #     fund_infos = [each_fund.fund_code, each_fund.morning_star_code,
+            #                   each_fund.fund_name, record[3],
+            #                   each_fund.stock_position['total'],
+            #                   page_start, each_fund._catch_detail]
+            #     output_line = ', '.join(str(x)
+            #                             for x in fund_infos) + '\n'
+            #     fund_csv.write_season_catch_fund(False, output_line)
             # Write to the database
             lock.acquire()
             snow_flake_id = idWorker.get_id()
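
lock.acquire() serializes ID generation and the insert across the crawler threads; IdWorker itself is not shown in this diff, but get_id() presumably produces a snowflake-style unique ID. An illustrative, self-locking version of that pattern; the epoch and bit layout are assumptions, not the project's actual IdWorker:

    import threading
    import time


    class IdWorker:
        # Illustrative snowflake-style generator: 41-bit timestamp, 10-bit
        # worker id, 12-bit per-millisecond sequence. Layout and epoch are
        # assumptions for the sketch.
        EPOCH_MS = 1288834974657

        def __init__(self, worker_id=0):
            self.worker_id = worker_id & 0x3FF
            self.sequence = 0
            self.last_ms = -1
            self._lock = threading.Lock()

        def get_id(self):
            with self._lock:
                now = int(time.time() * 1000)
                if now == self.last_ms:
                    self.sequence = (self.sequence + 1) & 0xFFF
                    if self.sequence == 0:
                        # Sequence exhausted within this millisecond: wait for the next one.
                        while now <= self.last_ms:
                            now = int(time.time() * 1000)
                else:
                    self.sequence = 0
                self.last_ms = now
                return ((now - self.EPOCH_MS) << 22) | (self.worker_id << 12) | self.sequence

With the lock inside get_id(), the caller-side lock would only be needed to protect the DB insert itself.
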
@@ -157,7 +163,6 @@ def crawlData(start, end):
                 'morning_star_rating_5': each_fund.morning_star_rating.get(5),
                 'morning_star_rating_10': each_fund.morning_star_rating.get(10),
             }
-
             # Store the top-ten stock holdings
             stock_position_total = each_fund.stock_position.get(
                 'total', '0.00')
@@ -192,7 +197,6 @@ def crawlData(start, end):
                 item_code = item[0]
                 if item_code == each_fund.fund_code:
                     continue
-                print("item_code", item_code, platform)
                 total_asset = get_total_asset(item_code, platform)
                 if total_asset != None:
                     init_total_asset = init_total_asset - total_asset
@@ -233,25 +237,22 @@ def crawlData(start, end):
             raise BaseException
         chrome_driver.close()
     thread_count = 6
-
-    # for count in range(6):
     total_start_time = time()
     # record_total = each_fund_query.select_quarter_fund_total()  # get the record count
-    # print("record_total", record_total)
     # bootstrap_thread(crawlData, record_total, thread_count)
-
-    for i in range(3):
-        print("i", i)
+    record_total = each_fund_query.select_quarter_fund_total()  # get the record count
+    for i in range(2):
         start_time = time()
-        record_total = each_fund_query.select_quarter_fund_total()  # get the record count
         print('record_total', record_total)
         try:
             bootstrap_thread(crawlData, record_total, thread_count)
         except:
-            end_time = time()
-            print("耗时: {:.2f}秒".format(end_time - start_time))
+            cur_total = each_fund_query.select_quarter_fund_total()  # get the record count
+            print('crawler item count:', record_total - cur_total)
+            record_total = cur_total
         end_time = time()
         print("耗时: {:.2f}秒".format(end_time - start_time))
+    write_fund_json_data({'error_funds_with_page': error_funds_with_page, 'error_funds_with_found_date': error_funds_with_found_date, 'error_funds_with_unmatch': error_funds_with_unmatch}, filename=filename, file_dir=file_dir)
     total_end_time = time()
     print("total耗时: {:.2f}秒".format(total_end_time - total_start_time))
     exit()
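
The two-pass loop above works because select_quarter_fund_total() appears to count only records still missing quarterly data, so it shrinks as inserts land: after a crashed pass the code logs how many items that pass managed and retries with the fresh count. The same idea as a generic helper, for illustration only; the names and signatures here are assumptions:

    def run_with_retry(crawl, count_remaining, max_passes=2):
        # Re-run the crawler until nothing is left or the pass budget is spent.
        # count_remaining() must shrink as records are persisted, which is how
        # select_quarter_fund_total() behaves above.
        for _ in range(max_passes):
            remaining = count_remaining()
            if remaining == 0:
                break
            try:
                crawl(remaining)
            except Exception:
                print('crawler item count:', remaining - count_remaining())

    # e.g. run_with_retry(
    #     lambda total: bootstrap_thread(crawlData, total, thread_count),
    #     each_fund_query.select_quarter_fund_total)

Catching Exception rather than using a bare except: also keeps Ctrl-C usable during long crawls.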