Commit 0af5f27

Fix a bug where data downloaded by week was merged incorrectly by year

1 parent 3c2593f commit 0af5f27

File tree

8 files changed: +72 -35 lines

ddump/_version.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "0.1.5"
+__version__ = "0.1.6"

ddump/api/merge.py

Lines changed: 5 additions & 0 deletions

@@ -44,6 +44,7 @@ def path_groupby_date(input_path, output_path,
     df = pd.DataFrame([f.name.split('.')[0].split(START_SEP_END) for f in files], columns=['start', 'end'])
     df['path'] = files
     df['key'] = pd.to_datetime(df['start'])
+    df['key2'] = df['key']
     df.index = df['key'].copy()
     df.index.name = 'date'  # otherwise groupby can fail

@@ -67,11 +68,15 @@ def path_groupby_date(input_path, output_path,
     df['10Y_1'] = df['key'].apply(lambda x: x.date() + relativedelta(year=x.year // 10 * 10, month=1, day=1))
     df['10Y_2'] = df['key'].apply(lambda x: x.date() + relativedelta(year=x.year // 10 * 10 + 9, month=12, day=31))

+    df['1M_1'] = pd.to_datetime(df['1M_1'])
+    df['1Y_1'] = pd.to_datetime(df['1Y_1'])
+
     # Leave the most recent two months untouched; anything older than two months merges by month
     t = f'{datetime.now() - timedelta(days=31 * 2):%Y-%m}'
     df['key'] = df.loc[:t, '1M_1']
     t = f'{datetime.now() - timedelta(days=365 * 1):%Y}'
     df['key'] = df.loc[:t, '1Y_1']
+    df['key'].fillna(df['key2'], inplace=True)

     # Group by key
     fss = {}
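Why the key2 backup matters: df.loc[:t, '1M_1'] only selects rows up to the cutoff t, so assigning it back to the 'key' column leaves NaT on every row newer than the cutoff, and those files silently drop out of all merge groups. That is how weekly downloads broke the yearly merge. A minimal self-contained sketch of the failure and the fix (dates invented; same idiom as the diff above, which equally applies to ddump/db/merge.py below):

import pandas as pd

df = pd.DataFrame({'key': pd.to_datetime(['2022-12-05', '2023-01-02', '2023-02-06'])})
df.index = df['key'].copy()
df.index.name = 'date'
df['key2'] = df['key']                                      # backup of the original key
df['1M_1'] = df['key'].dt.to_period('M').dt.to_timestamp()  # first day of each month

t = '2023-01'                     # cutoff: everything up to here merges by month
df['key'] = df.loc[:t, '1M_1']    # rows after the cutoff become NaT...
print(df['key'].isna().tolist())  # [False, False, True]

df['key'].fillna(df['key2'], inplace=True)  # ...and the fix restores their keys
print(df['key'].isna().tolist())  # [False, False, False]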

ddump/db/merge.py

Lines changed: 5 additions & 0 deletions

@@ -110,6 +110,7 @@ def path_groupby_date(input_path, output_path,
     df = pd.DataFrame([f.name.split('.')[0].split(KEY_SEP_ID) for f in files], columns=['key', 'id'])
     df['path'] = files
     df['key'] = pd.to_datetime(df['key'])
+    df['key2'] = df['key']
     df.index = df['key'].copy()
     df.index.name = 'date'  # otherwise groupby can fail

@@ -133,11 +134,15 @@ def path_groupby_date(input_path, output_path,
     df['10Y_1'] = df['key'].apply(lambda x: x.date() + relativedelta(year=x.year // 10 * 10, month=1, day=1))
     df['10Y_2'] = df['key'].apply(lambda x: x.date() + relativedelta(year=x.year // 10 * 10 + 9, month=12, day=31))

+    df['1M_1'] = pd.to_datetime(df['1M_1'])
+    df['1Y_1'] = pd.to_datetime(df['1Y_1'])
+
     # Leave the most recent two months untouched; anything older than two months merges by month
     t = f'{datetime.now() - timedelta(days=31 * 2):%Y-%m}'
     df['key'] = df.loc[:t, '1M_1']
     t = f'{datetime.now() - timedelta(days=365 * 1):%Y}'
     df['key'] = df.loc[:t, '1Y_1']
+    df['key'].fillna(df['key2'], inplace=True)

     # Group by key
     fss = {}

ddump/merge.py

Lines changed: 24 additions & 6 deletions

@@ -1,10 +1,13 @@
+import shutil
+
 import pandas as pd
 from loguru import logger


 def merge_files_to_file(path, files,
                         ignore_index=True,
-                        delete_src=False):
+                        delete_src=False,
+                        single_overwrite=True):
     """Merge a list of files into one file

@@ -17,17 +20,30 @@ def merge_files_to_file(path, files,
         Whether to ignore the index when merging; ignoring a meaningless index speeds things up
     delete_src: bool
         Whether to delete the source files
-
-    Returns
-    -------
+    single_overwrite: bool
+        Whether a single file should overwrite the existing target

     """
     if len(files) == 0:
         return
+
     if len(files) == 1:
         if path == files[0]:
             # Same file, no need to merge
             return
+        else:
+            if path.exists() and not single_overwrite:
+                logger.info('single path, already exists, skipping {}', path)
+                return
+            else:
+                logger.info('single path, overwriting {}', path)
+                path.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copy(files[0], path)
+                return
+
+    if path.exists():
+        logger.info('merge target already exists, skipping {}', path)
+        return

     # Load
     dfs = []
@@ -69,5 +85,7 @@ def merge_files_dict(files_dict,
        key is the path
        value is a list of files
     """
-    for k, v in files_dict.items():
-        merge_files_to_file(k, v, ignore_index, delete_src)
+    for i, (k, v) in enumerate(files_dict.items()):
+        # Always try to overwrite for the last 5 single files
+        single_overwrite = i >= len(files_dict) - 5
+        merge_files_to_file(k, v, ignore_index, delete_src, single_overwrite)
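The enumerate change is subtle: only the tail of files_dict gets the overwrite behavior. A tiny sketch of just the flag logic (placeholder paths, no actual merging):

from pathlib import Path

# 12 monthly merge targets, each backed by a single source file
files_dict = {Path(f'out/2023-{m:02d}.parquet'): [Path(f'src/2023-{m:02d}.parquet')]
              for m in range(1, 13)}

for i, (k, v) in enumerate(files_dict.items()):
    single_overwrite = i >= len(files_dict) - 5
    # Only the last 5 targets (the most recent, possibly still growing)
    # get re-copied even when they already exist; older ones are skipped.
    print(k.name, single_overwrite)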

examples/jqresearch/get_price_daily.py

Lines changed: 18 additions & 3 deletions

@@ -1,6 +1,7 @@
 from datetime import datetime

 import pandas as pd
+from dateutil.relativedelta import relativedelta

 from ddump.api.dump import Dump__start__end
 from examples.jqresearch.config import DATA_ROOT, jq
@@ -66,6 +67,7 @@ def do_get_industry(d, start_date, end_date, symbols):
     # Download daily data
     d.set_parameters('get_industry',
                      start_date=f'{start_date:%Y-%m-%d}',
+                     end_date=f'{end_date:%Y-%m-%d}',
                      date=f'{end_date:%Y-%m-%d}',
                      security=symbols.index.tolist())
     if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
@@ -84,18 +86,31 @@ def do_get_industry(d, start_date, end_date, symbols):
 d1 = Dump__start__end(jq, path1, 'start_date', 'end_date')
 d2 = Dump__start__end(jq, path2, 'start_date', 'end_date')
 d3 = Dump__start__end(jq, path3, 'start_date', 'end_date')
-d4 = Dump__start__end(jq, path4, 'start_date', 'date')
+d4 = Dump__start__end(jq, path4, 'start_date', 'end_date')

 # First half: query by week, which is faster
 end = pd.to_datetime('2023-01-15')  # a Sunday
 # Next week; date_range snaps it to this week's Sunday
 end = pd.to_datetime(datetime.today().date()) + pd.Timedelta(days=6)
 start = pd.to_datetime('2023-01-02')  # a Monday
-start = pd.to_datetime('2015-01-01')  # a Monday
+
+# Whenever a week crosses a month boundary, split it into two parts,
+# so loading by a specified month also works without errors
+start_list = []
+end_list = []
 for dr in pd.date_range(start=start, end=end, freq='W'):
     start_date = dr - pd.Timedelta(days=6)
     end_date = dr
-
+    if start_date.month == end_date.month:
+        start_list.append(start_date)
+        end_list.append(end_date)
+    else:
+        start_list.append(start_date)
+        end_list.append(start_date + relativedelta(day=31))
+        start_list.append(end_date + relativedelta(day=1))
+        end_list.append(end_date)
+
+# Download the data
+for start_date, end_date in zip(start_list, end_list):
     symbols = universe.query(f'start_date<=@end_date.date() and end_date>=@start_date.date()')

     do_get_price(d1, start_date, end_date, symbols, fields1, fq1)
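The month-boundary split is the core of this commit; here is the same logic as a standalone sketch (pandas and dateutil only, dates chosen for illustration):

import pandas as pd
from dateutil.relativedelta import relativedelta

start = pd.to_datetime('2023-01-23')  # a Monday
end = pd.to_datetime('2023-02-05')    # a Sunday

ranges = []
for dr in pd.date_range(start=start, end=end, freq='W'):  # 'W' yields Sundays
    start_date, end_date = dr - pd.Timedelta(days=6), dr
    if start_date.month == end_date.month:
        ranges.append((start_date, end_date))
    else:
        # relativedelta(day=31) clamps to the last day of the month,
        # relativedelta(day=1) snaps to the first day of the month
        ranges.append((start_date, start_date + relativedelta(day=31)))
        ranges.append((end_date + relativedelta(day=1), end_date))

for s, e in ranges:
    print(f'{s:%Y-%m-%d} .. {e:%Y-%m-%d}')
# 2023-01-23 .. 2023-01-29
# 2023-01-30 .. 2023-01-31   <- the straddling week, first half
# 2023-02-01 .. 2023-02-05   <- second half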
Lines changed: 19 additions & 9 deletions

@@ -1,12 +1,17 @@
 """
 After downloading historical data, data from long ago can be merged. Merging once a year is enough
+
+Historical data that has not been verified yet should be cross-checked by other means several times before merging,
+because if a problem is found in some day's data, deleting that day's file is enough to re-download the affected part
 """
 import pathlib

+from loguru import logger
+
 from ddump.api.merge import path_groupby_date
 from ddump.merge import merge_files_dict

-paths = [
+paths1 = [
     r'D:\data\jqresearch\get_extras_stock_is_st',
     r'D:\data\jqresearch\get_industry_stock',
     r'D:\data\jqresearch\get_price_stock_factor',
@@ -17,16 +22,21 @@
     r'D:\data\jqresearch\get_fundamentals_indicator',
     r'D:\data\jqresearch\get_fundamentals_valuation',
 ]
-for path in paths:
-    path = pathlib.Path(path)
-    files = path_groupby_date(path, path)
-    merge_files_dict(files, ignore_index=False, delete_src=True)

-if False:
-    # For testing only
-    path1 = r'D:\data\jqresearch\get_fundamentals_balance'
+paths2 = [
+    r'M:\data\jqresearch\get_extras_stock_is_st',
+    r'M:\data\jqresearch\get_industry_stock',
+    r'M:\data\jqresearch\get_price_stock_factor',
+    r'M:\data\jqresearch\get_price_stock_daily',
+    r'M:\data\jqresearch\get_fundamentals_balance',
+    r'M:\data\jqresearch\get_fundamentals_cash_flow',
+    r'M:\data\jqresearch\get_fundamentals_income',
+    r'M:\data\jqresearch\get_fundamentals_indicator',
+    r'M:\data\jqresearch\get_fundamentals_valuation',
+]
+for path1, path2 in zip(paths1, paths2):
+    logger.info('=' * 60, )
     path1 = pathlib.Path(path1)
-    path2 = r'D:\data\jqresearch\get_fundamentals_balance_2'
     path2 = pathlib.Path(path2)
     files = path_groupby_date(path1, path2)
     merge_files_dict(files, ignore_index=False, delete_src=False)
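For reference, a hedged sketch of the mapping that path_groupby_date hands to merge_files_dict, inferred from the diffs above and the "key is the path, value is a list" contract; the file-name pattern, separator, and extension are assumptions, not taken from the repo:

from pathlib import Path

# Hypothetical illustration only. Keys are merge targets under output_path,
# values are the source files grouped under that key.
fss = {
    # Data older than a year collapses to one file per year
    Path(r'M:\data\jqresearch\get_price_stock_daily\2015-01-05__2015-12-27.parquet'): [
        Path(r'D:\data\jqresearch\get_price_stock_daily\2015-01-05__2015-01-11.parquet'),
        Path(r'D:\data\jqresearch\get_price_stock_daily\2015-01-12__2015-01-18.parquet'),
        # ... the rest of 2015's weekly files
    ],
    # The most recent two months keep their original weekly keys,
    # which is exactly what the fillna(df['key2']) fix preserves.
}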

tests/test_folder.py

Lines changed: 0 additions & 12 deletions
This file was deleted.

tests/test_show.py

Lines changed: 0 additions & 4 deletions
This file was deleted.
