Skip to content
This repository was archived by the owner on Sep 6, 2025. It is now read-only.

Commit 31bafab

Browse files
committed
feat: add sync fund base
1 parent 2f08f98 commit 31bafab

File tree

5 files changed

+165
-15
lines changed

5 files changed

+165
-15
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
selenium==3.11.0
1+
selenium==4.5.0
22
PyMySQL==1.0.2
33
pandas==1.1.5
44
requests==2.18.4

src/acquire_fund_base.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,13 @@
88
Copyright (c) 2020 Camel Lu
99
'''
1010
from threading import Lock
11-
11+
from utils.login import login_morning_star
12+
from utils.driver import create_chrome_driver
13+
from utils.index import bootstrap_thread
1214
from fund_info.crawler import FundSpider
1315
from lib.mysnowflake import IdWorker
14-
from sql_model.fund_insert import FundInsert
1516
from sql_model.fund_query import FundQuery
16-
from utils.driver import create_chrome_driver
17-
from utils.index import bootstrap_thread
18-
from utils.login import login_morning_star
19-
17+
from sql_model.fund_insert import FundInsert
2018

2119
def acquire_fund_base():
2220
lock = Lock()
@@ -45,16 +43,16 @@ def crawlData(start, end):
4543
for record in results:
4644
each_fund = FundSpider(
4745
record[0], record[1], record[2], chrome_driver)
48-
# 从晨星网上更新信息
49-
is_normal = each_fund.go_fund_url()
50-
if is_normal == False:
46+
# 是否能正常跳转到基金详情页
47+
is_error_page = each_fund.go_fund_url()
48+
if is_error_page == True:
5149
lock.acquire()
5250
error_funds.append(each_fund.fund_code)
5351
lock.release()
5452
continue
5553
each_fund.get_fund_base_info()
5654
# 去掉没有成立时间的
57-
if each_fund.found_date == '-':
55+
if each_fund.found_date == '-' or each_fund.found_date == None:
5856
lock.acquire()
5957
error_funds.append(each_fund.fund_code)
6058
lock.release()

src/crud/query.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,13 @@ def query_all_fund():
4949
}
5050
return all_fund_dict
5151

52+
def query_empty_company_and_found_date_fund(start, size):
    """Return a page of non-archived funds whose company and found_date are NULL.

    :param start: row offset into the result set.
    :param size: maximum number of rows to return.
    :return: list of matching FundBase rows.
    """
    # NOTE: '== None' is the SQLAlchemy idiom that compiles to IS NULL;
    # do not rewrite it as 'is None'.
    stmt = (
        session.query(FundBase)
        .where(
            FundBase.company == None,
            FundBase.found_date == None,
            FundBase.is_archive == 0,
        )
        .offset(start)
        .limit(size)
    )
    return stmt.all()
55+
5256
if __name__ == '__main__':
5357
quarter_index = '2022-Q2'
54-
fund_list = query_high_score_funds(quarter_index)
58+
# fund_list = query_high_score_funds(quarter_index)
59+
query_empty_company_and_found_date_fund(2, 10)
5560
# print("fund_list",fund_list)
5661

src/fund_info/supplement.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,17 @@
88
Copyright (c) 2021 Camel Lu
99
'''
1010

11-
from utils.index import get_last_quarter_str
11+
from fund_info.api import FundApier
1212
from sql_model.fund_query import FundQuery
1313
from sql_model.fund_update import FundUpdate
14-
from fund_info.api import FundApier
14+
from utils.index import get_last_quarter_str
15+
1516

1617
class FundSupplement:
1718
def __init__(self, code=None):
1819
self.fund_code = code
1920
# 动态计算季度信息
2021
self.quarter_index = get_last_quarter_str()
21-
2222
def update_archive_status(self):
2323
fund_query = FundQuery()
2424
each_fund_update = FundUpdate()

src/sync_fund_base.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
'''
2+
Desc:
3+
File: /sync_fund_base.py
4+
File Created: Sunday, 30th October 2022 2:53:56 pm
5+
6+
-----
7+
Copyright (c) 2022 Camel Lu
8+
'''
9+
import math
10+
import re
11+
from time import sleep
12+
13+
from bs4 import BeautifulSoup
14+
from selenium.webdriver.common.by import By
15+
from selenium.webdriver.support.ui import WebDriverWait
16+
17+
from crud.query import query_all_fund, query_empty_company_and_found_date_fund
18+
from models.fund import FundBase
19+
from fund_info.crawler import FundSpider
20+
from utils.index import bootstrap_thread
21+
from utils.driver import create_chrome_driver, text_to_be_present_in_element
22+
from utils.login import login_morning_star
23+
24+
25+
def sync_fund_base(page_index):
    """Crawl the Morningstar fund-selector list and upsert every fund's base record.

    Walks the paginated fund list starting at *page_index*, scrapes each row
    (fund code, Morningstar quicktake code, name, category) and writes it to
    the fund_base table: known funds are upserted only when a scraped field
    differs from the stored one; unknown funds are inserted.

    Fix over the original: the Chrome driver is now released in a ``finally``
    block so a scraping/parsing exception no longer leaks the browser session.

    :param page_index: 1-based page number to start from (allows resuming a
        crashed run part-way through the list).
    """
    morning_fund_selector_url = "https://www.morningstar.cn/fundselect/default.aspx"
    chrome_driver = create_chrome_driver()
    login_morning_star(chrome_driver, morning_fund_selector_url)
    page_count = 25  # Morningstar's fixed rows-per-page
    # The total record count is rendered in this span; derive the page total.
    page_total = math.ceil(int(chrome_driver.find_element(
        By.XPATH,
        '/html/body/form/div[8]/div/div[4]/div[3]/div[2]/span').text) / page_count)
    all_fund_dict = query_all_fund()
    all_fund_codes = all_fund_dict.keys()
    try:
        while page_index <= page_total:
            # The pager shows at most 10 numbered links; within the last group
            # the "next page" anchor sits at a different position, so compute
            # its index from how many pages remain.
            remainder = page_total % 10
            num = (remainder + 2) if page_index > (page_total - remainder) else 12
            xpath_str = '/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/a[%s]' % (
                num)
            print('page_index', page_index)
            # Wait until the highlighted (bold red) pager entry equals
            # page_index, i.e. the requested page has actually rendered.
            WebDriverWait(chrome_driver, timeout=600).until(text_to_be_present_in_element(
                "/html/body/form/div[8]/div/div[4]/div[3]/div[3]/div[1]/span[@style='margin-right:5px;font-weight:Bold;color:red;']", str(page_index), xpath_str))
            sleep(1)
            # Parse the rendered page; data rows alternate between these two
            # row classes in the grid.
            bs = BeautifulSoup(chrome_driver.page_source, 'lxml')
            for row_class in ('gridItem', 'gridAlternateItem'):
                for tr in bs.find_all('tr', {'class': row_class}):
                    tds_text = tr.find_all('td', {'class': "msDataText"})
                    # The fund-code cell's link href carries the Morningstar
                    # quicktake code after '/quicktake/'.
                    code_a_element = tds_text[0].find_all('a')[0]
                    cur_fund_code = code_a_element.string
                    cur_morning_star_code = re.findall(
                        r'(?<=/quicktake/)(\w+)$', code_a_element.get('href')).pop(0)
                    cur_fund_name = tds_text[1].find_all('a')[0].string
                    cur_fund_cat = tds_text[2].string
                    if cur_fund_code in all_fund_codes:
                        existing_fund = all_fund_dict.get(cur_fund_code)
                        # Upsert only when something actually changed.
                        if (cur_morning_star_code != existing_fund['morning_star_code']) \
                                or (cur_fund_name != existing_fund['fund_name']) \
                                or (cur_fund_cat != existing_fund['fund_cat']):
                            fund_base_params = {
                                **existing_fund,
                                'morning_star_code': cur_morning_star_code,
                                'fund_name': cur_fund_name,
                                'fund_cat': cur_fund_cat
                            }
                            FundBase(**fund_base_params).upsert()
                    elif cur_fund_code:
                        # Brand-new fund: insert a fresh base record.
                        fund_base_params = {
                            'fund_code': cur_fund_code,
                            'morning_star_code': cur_morning_star_code,
                            'fund_name': cur_fund_name,
                            'fund_cat': cur_fund_cat
                        }
                        fund_base = FundBase(**fund_base_params)
                        print('fund_name:', cur_fund_name, 'fund_code:',
                              cur_fund_code, 'morning_star_code:', cur_morning_star_code)
                        fund_base.upsert()
            # Advance via the pager link resolved above.
            next_page = chrome_driver.find_element(By.XPATH, xpath_str)
            next_page.click()
            sleep(3)
            page_index += 1
    finally:
        # Release the browser even when scraping raised, so no session leaks.
        chrome_driver.close()
    print('end')
95+
96+
def further_complete_base_info():
    """Backfill ``company`` and ``found_date`` for fund_base rows missing them.

    Loads every non-archived fund whose company and found_date are both NULL,
    then scrapes the missing fields from each fund's Morningstar detail page
    using several worker threads (each with its own browser session). Funds
    whose page errors out, or that report no founding date, are collected in
    ``error_funds`` and skipped.

    Fixes over the original: each worker's Chrome driver is released in a
    ``finally`` block (no leak on a scraping exception); dead commented-out
    lock calls removed; ``== True`` / ``== None`` comparisons made idiomatic.
    """
    all_funds = query_empty_company_and_found_date_fund(0, 10000)
    # Fund codes we could not complete. CPython's list.append is atomic under
    # the GIL, so the worker threads can share this list without a lock.
    error_funds = []

    def crawl_data(start, end):
        # Worker: process all_funds[start:end] with its own browser session.
        login_url = 'https://www.morningstar.cn/membership/signin.aspx'
        chrome_driver = create_chrome_driver()
        login_morning_star(chrome_driver, login_url)
        page_start = start
        page_limit = 10  # records handled per progress step
        try:
            while page_start < end:
                for record in all_funds[page_start:page_start + page_limit]:
                    fund_code = record.fund_code
                    morning_star_code = record.morning_star_code
                    fund_name = record.fund_name
                    each_fund = FundSpider(
                        fund_code, morning_star_code, fund_name, chrome_driver)
                    # Skip funds whose detail page cannot be reached.
                    is_error_page = each_fund.go_fund_url()
                    if is_error_page:
                        error_funds.append(each_fund.fund_code)
                        continue
                    each_fund.get_fund_base_info()
                    # Skip funds that still report no founding date.
                    if each_fund.found_date == '-' or each_fund.found_date is None:
                        error_funds.append(each_fund.fund_code)
                        continue
                    # Assemble the row for the upsert.
                    base_dict = {
                        'fund_code': fund_code,
                        'morning_star_code': morning_star_code,
                        'fund_name': each_fund.fund_name,
                        'fund_cat': each_fund.fund_cat,
                        'company': each_fund.company,
                        'found_date': each_fund.found_date
                    }
                    FundBase(**base_dict).upsert()
                page_start = page_start + page_limit
                print('page_start', page_start)
        finally:
            # Always release this worker's browser, even on scraping errors.
            chrome_driver.close()

    bootstrap_thread(crawl_data, len(all_funds), 3)
142+
if __name__ == '__main__':
    # Page checkpoints from earlier manual runs (127, 300, 600-), kept for
    # reference when resuming a crashed crawl.
    page_index = 1
    # Full page-by-page list sync is disabled here; re-enable to re-crawl.
    # sync_fund_base(page_index)
    further_complete_base_info()
147+

0 commit comments

Comments
 (0)