'''
Desc:
File: /sync_fund_base.py
File Created: Sunday, 30th October 2022 2:53:56 pm

-----
Copyright (c) 2022 Camel Lu
'''
import math
import re
from time import sleep

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

from crud.query import query_all_fund, query_empty_company_and_found_date_fund
from fund_info.crawler import FundSpider
from models.fund import FundBase
from utils.driver import create_chrome_driver, text_to_be_present_in_element
from utils.index import bootstrap_thread
from utils.login import login_morning_star


def sync_fund_base(page_index):
    """Crawl the Morningstar fund-selector pages and upsert fund basics.

    Starting at *page_index*, walks every result page of the Morningstar
    fund screener, parses each row (fund code, Morningstar code, name,
    category) and upserts a ``FundBase`` record whenever the fund is new
    or any of those fields changed since the last sync.

    :param page_index: 1-based page number to start from (allows resuming
                       an interrupted run).
    """
    morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
    chrome_driver = create_chrome_driver()
    try:
        login_morning_star(chrome_driver, morning_fund_selector_url)
        page_count = 25  # Morningstar uses a fixed page size of 25 rows
        page_total = math.ceil(int(chrome_driver.find_element(
            By.XPATH,
            '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count)
        all_fund_dict = query_all_fund()
        all_fund_codes = all_fund_dict.keys()
        # The pager shows at most 10 numbered links, so the anchor index of
        # the "next page" link depends on whether we are in the last group.
        # This is loop-invariant; hoist it out of the while loop.
        remainder = page_total % 10
        while page_index <= page_total:
            # Anchor index of the "next page" link inside the pager widget.
            num = (remainder + 2) if page_index > (page_total - remainder) else 12
            xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
                num)
            print('page_index', page_index)
            # Wait until the highlighted (bold red) pager entry shows page_index.
            WebDriverWait(chrome_driver, timeout=600).until(text_to_be_present_in_element(
                "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']",
                str(page_index), xpath_str))
            sleep(1)
            # Parse the rendered page source with BeautifulSoup.
            bs = BeautifulSoup(chrome_driver.page_source, 'lxml')
            # Data rows alternate between these two CSS classes.
            for class_name in ('gridItem', 'gridAlternateItem'):
                for tr in bs.find_all('tr', {'class': class_name}):
                    tds_text = tr.find_all('td', {'class': "msDataText"})
                    # Fund code and Morningstar code come from the first cell's link.
                    code_a_element = tds_text[0].find_all('a')[0]
                    cur_fund_code = code_a_element.string
                    cur_morning_star_code = re.findall(
                        r'(?<=/quicktake/)(\w+)$', code_a_element.get('href'))[0]
                    cur_fund_name = tds_text[1].find_all('a')[0].string
                    cur_fund_cat = tds_text[2].string
                    if cur_fund_code in all_fund_codes:
                        exit_fund = all_fund_dict.get(cur_fund_code)
                        # Only hit the DB when one of the tracked fields changed.
                        if (cur_morning_star_code != exit_fund['morning_star_code']
                                or cur_fund_name != exit_fund['fund_name']
                                or cur_fund_cat != exit_fund['fund_cat']):
                            fund_base_params = {
                                **exit_fund,
                                'morning_star_code': cur_morning_star_code,
                                'fund_name': cur_fund_name,
                                'fund_cat': cur_fund_cat
                            }
                            fund_base = FundBase(**fund_base_params)
                            fund_base.upsert()
                    elif cur_fund_code:
                        # Unseen fund: insert a fresh row.
                        fund_base_params = {
                            'fund_code': cur_fund_code,
                            'morning_star_code': cur_morning_star_code,
                            'fund_name': cur_fund_name,
                            'fund_cat': cur_fund_cat
                        }
                        fund_base = FundBase(**fund_base_params)
                        print('fund_name:', cur_fund_name, 'fund_code:',
                              cur_fund_code, 'morning_star_code:', cur_morning_star_code)
                        fund_base.upsert()
            # Click through to the next page.
            next_page = chrome_driver.find_element(By.XPATH, xpath_str)
            next_page.click()
            sleep(3)
            page_index += 1
    finally:
        # quit() tears down the whole browser session; close() only closes the
        # current window and leaks the chromedriver process on exceptions.
        chrome_driver.quit()
    print('end')
def further_complete_base_info():
    """Backfill company and inception date for funds missing them.

    Loads every fund whose company/found-date fields are empty, then crawls
    each fund's Morningstar detail page across 3 worker threads and upserts
    the completed record. Funds whose page fails to load or that report no
    inception date are collected and reported at the end instead of being
    silently dropped.
    """
    all_funds = query_empty_company_and_found_date_fund(0, 10000)
    # Fund codes we could not complete (appended to from worker threads;
    # list.append is atomic under CPython, so no explicit lock is needed).
    error_funds = []

    def crawl_data(start, end):
        """Worker: process ``all_funds[start:end]`` in batches of 10."""
        login_url = 'https://www.morningstar.cn/membership/signin.aspx'
        chrome_driver = create_chrome_driver()
        try:
            login_morning_star(chrome_driver, login_url)
            page_start = start
            page_limit = 10
            # Iterate over this worker's slice of the fund list.
            while page_start < end:
                for record in all_funds[page_start:page_start + page_limit]:
                    fund_code = record.fund_code
                    morning_star_code = record.morning_star_code
                    each_fund = FundSpider(
                        fund_code, morning_star_code, record.fund_name, chrome_driver)
                    # Skip funds whose detail page fails to load.
                    if each_fund.go_fund_url():
                        error_funds.append(each_fund.fund_code)
                        continue
                    each_fund.get_fund_base_info()
                    # Skip funds with no inception date ('-' or missing).
                    if each_fund.found_date == '-' or each_fund.found_date is None:
                        error_funds.append(each_fund.fund_code)
                        continue
                    # Assemble the upsert payload.
                    base_dict = {
                        'fund_code': fund_code,
                        'morning_star_code': morning_star_code,
                        'fund_name': each_fund.fund_name,
                        'fund_cat': each_fund.fund_cat,
                        'company': each_fund.company,
                        'found_date': each_fund.found_date
                    }
                    fund_base = FundBase(**base_dict)
                    fund_base.upsert()
                page_start = page_start + page_limit
                print('page_start', page_start)
        finally:
            # quit() also terminates the chromedriver process, even when the
            # crawl above raised; close() would leak it.
            chrome_driver.quit()

    bootstrap_thread(crawl_data, len(all_funds), 3)
    if error_funds:
        # Surface the funds we failed to complete so they can be retried.
        print('error_funds:', error_funds)
def _main():
    """Script entry point: backfill missing fund base info."""
    # Resume checkpoints observed during past full syncs: 127, 300, 600-
    start_page = 1
    # sync_fund_base(start_page)
    further_complete_base_info()


if __name__ == '__main__':
    _main()