-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine.py
141 lines (116 loc) · 5.7 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pandas as pd
import numpy as np
import os
import yaml
import pdb
# Load the combine configuration (combine.yaml) that lives next to this script.
# os.path.join is used instead of raw '\' concatenation so the path is portable;
# the redundant f.close() is gone — the `with` block already closes the file.
base_path = os.path.dirname(os.path.realpath(__file__))
config_file = os.path.join(base_path, 'combine.yaml')
with open(config_file, 'r') as f:
    config = yaml.safe_load(f)
# For every table listed in the config: read that table's CSV from each run,
# optionally reshape it (transpose/melt), qualify its value columns with the
# run name, outer-merge all runs on the configured key columns, then apply
# post-processing (normalize / pct_diff) and write the combined CSV(s).
for table in config['tables']:
    cfg = config['tables'][table]  # per-table configuration
    merge_cols = cfg['merge']      # key columns shared by all runs
    first_run = True
    for run in config['runs']:
        # config['runs'][run] carries its own leading path separator, so it is
        # string-concatenated onto base_path (join would discard base_path).
        in_df = pd.read_csv(os.path.join(base_path + config['runs'][run], table + '.csv'))
        if 'transpose' in cfg:
            # Flip rows/columns; config supplies the name for the new key column.
            in_df = in_df.T
            in_df.index.name = cfg['transpose']
            in_df = in_df.reset_index()
        if 'melt' in cfg:
            # Unpivot using keyword arguments taken verbatim from the config.
            in_df = pd.melt(in_df, **cfg['melt'])
        name_map = {}  # original column -> run-qualified output column
        names = []     # order of 'separate' columns (rebuilt each run;
                       # assumed identical across runs — TODO confirm)
        counter = 0    # for separating columns into different output tables
        # Optional explicit dtypes. A missing 'dtype' key is skipped; a dtype
        # entry naming a missing column now raises KeyError (previously the
        # except clause silently aborted the remaining casts as well).
        for col in cfg.get('dtype', {}):
            in_df[col] = in_df[col].astype(cfg['dtype'][col])
        for col in in_df.columns:
            if 'skip' in cfg and col in cfg['skip']:
                # Remove columns to be skipped.
                del in_df[col]
                continue
            elif col in merge_cols:
                # Normalize join keys to strings so runs with mixed dtypes still
                # merge. Strip only a TRAILING '.0' (integer-valued floats); a
                # bare replace('.0', '') would corrupt values such as '1.05'.
                in_df[col] = in_df[col].apply(
                    lambda x: str(x)[:-2] if str(x).endswith('.0') else str(x))
            elif 'separate' in cfg and col in cfg['separate']:
                name_map[col] = '{0}_{1}'.format(run, counter)
                names.append(col)
                counter += 1
            else:
                name_map[col] = run
        # Add marginal totals if needed.
        if 'total' in cfg:
            for col in cfg['total']:
                assert col in merge_cols, 'Column `{}` must be in merge columns'.format(col)
                if ('Total' in in_df[col].values or 'All' in in_df[col].values
                        or 'All Households' in in_df[col].values):
                    continue  # a totals row already exists in the data
                in_df = in_df.set_index(merge_cols)
                level = list(range(len(merge_cols)))
                level.remove(merge_cols.index(col))
                # Sum out `col` while keeping the other key levels.
                # groupby(level=...) replaces sum(level=...), which was removed
                # in pandas 2.0.
                totals = in_df.groupby(level=level).sum().reset_index()
                totals[col] = cfg['total'][col]
                in_df = pd.concat((in_df.reset_index(), totals))
        # Merge tables from different runs together; the first run seeds out_df.
        if first_run:
            out_df = in_df.rename(columns=name_map)
            first_run = False
        else:
            out_df = out_df.merge(in_df.rename(columns=name_map), how='outer', on=merge_cols)
    if cfg.get('normalize'):
        # Scale every numeric column so it sums to 100 (percent shares).
        for col in out_df:
            try:
                out_df[col] *= (100 / out_df[col].sum())
            except TypeError:  # not numeric
                continue
    if 'pct_diff' in cfg:
        # pct_diff maps column -> [numerator_run, denominator_run].
        for col in cfg['pct_diff']:
            num_run = cfg['pct_diff'][col][0]
            den_run = cfg['pct_diff'][col][1]
            out_df['%Diff_' + col] = out_df[num_run + '_' + col] / out_df[den_run + '_' + col] - 1
    if 'separate' in cfg:
        # Split the run-qualified '<run>_<i>' columns back out into one CSV per
        # original column, keeping the key columns in each.
        out_dfs = {}
        for i in range(counter):
            out_dfs[names[i]] = pd.DataFrame()
            for col in out_df.columns:
                if col in merge_cols:
                    out_dfs[names[i]][col] = out_df[col]
                elif col.endswith('_{}'.format(i)):
                    # endswith (not substring) so '_1' does not also match
                    # '..._10'; rsplit so run names containing '_' survive.
                    out_dfs[names[i]][col.rsplit('_', 1)[0]] = out_df[col]
            outfile = os.path.join(base_path + config['outpath'],
                                   names[i].replace(' ', '') + '.csv')
            if 'suffix' in cfg:
                outfile = outfile.replace('.csv', '_{}.csv'.format(cfg['suffix']))
            out_dfs[names[i]].to_csv(outfile, index=False)
        del in_df, out_df, out_dfs
    elif 'subsets' in cfg:
        for subset in cfg['subsets']:
            sub_cfg = cfg['subsets'][subset]
            qry = sub_cfg['query']
            # A null query means "keep everything".
            subset_df = out_df.copy() if qry is None else out_df.query(qry)
            if 'rename' in sub_cfg:
                # Remap cell values (not column names) per the config.
                for col in sub_cfg['rename']:
                    subset_df[col] = subset_df[col].replace(sub_cfg['rename'][col])
            if 'exclude' in sub_cfg:
                for col in sub_cfg['exclude']:
                    del subset_df[col]
            out_path = os.path.join(base_path + config['outpath'], subset + '.csv')
            if 'transpose' in sub_cfg:
                subset_df = subset_df.set_index(subset_df.columns[0]).T
                subset_df.index.name = sub_cfg['transpose']
                subset_df.to_csv(out_path)
            else:
                subset_df.to_csv(out_path, index=False)
    else:
        out_df.index.name = 'index'
        out_df.fillna(0).to_csv(os.path.join(base_path + config['outpath'], table + '.csv'),
                                index=False)
        del in_df, out_df
print('Done')