Skip to content

Commit 0b2f4be

Browse files
committed
updates for 3phase strategy
1 parent 5fbb13c commit 0b2f4be

File tree

7 files changed

+191
-79
lines changed

7 files changed

+191
-79
lines changed

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@ Backend/build/**
77
Backend/OEDA_Backend.egg-info/**
88
Backend/tools/**
99
Backend/tests/http-test-server/node_modules
10-
# Templates for database configuration
11-
Backend/oeda/databases/experiment_db_config.json
12-
Backend/oeda/databases/user_db_config.json
1310

1411
# Byte-compiled / optimized / DLL files
1512
__pycache__/

Backend/oeda/analysis/analysis_execution.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -128,10 +128,14 @@ def start_factorial_tests(wf):
128128
stage_ids, samples, knobs = get_tuples(id, key)
129129
test = FactorialAnova(stage_ids=stage_ids, y_key=key, knob_keys=None, stages_count=len(stage_ids))
130130
aov_table, aov_table_sqr = test.run(data=samples, knobs=knobs)
131+
# before saving and merging tables, extract useful information
132+
delete_combination_notation(aov_table)
133+
delete_combination_notation(aov_table_sqr)
134+
131135
# type(dd) is defaultdict with unique keys
132136
dd = iterate_anova_tables(aov_table=aov_table, aov_table_sqr=aov_table_sqr)
133137

134-
# keys e.g. C(exploration_percentage), C(route_random_sigma), Residual
138+
# keys = [exploration_percentage, route_random_sigma, exploration_percentage,route_random_sigma...]
135139
# resultDict e.g. {'PR(>F)': 0.0949496951695454, 'F': 2.8232330924997346 ...
136140
anova_result = dict()
137141
for key, resultDict in dd.items():
@@ -140,6 +144,7 @@ def start_factorial_tests(wf):
140144
if str(value) == 'nan':
141145
value = None
142146
anova_result[key][inner_key] = value
147+
# TODO: before saving, should we also filter according to the significant interactions?
143148
db().save_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name=test.name, anova_result=anova_result)
144149
return True, aov_table, aov_table_sqr
145150
else:
@@ -168,23 +173,66 @@ def extract_inner_values(key, stage_ids, data):
168173

169174

170175
# type(table) is DataFrame
171-
# rows are keys of the result obj
172-
# values are inner keys of those keys
176+
# rows are keys of the result obj, e.g. C(param1), C(param2), C(param1):C(param2), etc.
177+
# values are inner keys of those keys, type of values is dict
178+
# also, while iterating original tables, remove C(...): notation and extract exact parameters
173179
def iterate_anova_tables(aov_table, aov_table_sqr):
    """Merge two ANOVA result tables into one nested dictionary.

    Both arguments are pandas DataFrames whose index labels identify the
    factor / interaction rows and whose columns hold the statistics.

    :param aov_table: first ANOVA table.
    :param aov_table_sqr: second ANOVA table; its values are written after
        (and therefore override) those of ``aov_table`` for identical
        row label / column name pairs.
    :return: ``defaultdict(dict)`` mapping row label -> {column name: value}.
    """
    dd = defaultdict(dict)
    for table in (aov_table, aov_table_sqr):
        col_names = list(table)
        for row in table.itertuples():
            # itertuples() exposes columns as attributes, but names that are
            # not valid identifiers (such as "PR(>F)") get replaced by
            # positional names like "_4". Pairing the column names with the
            # row values by position avoids depending on that renaming or on
            # the column order. row[0] is the index label.
            row_label = row[0]
            for col_name, value in zip(col_names, row[1:]):
                dd[row_label][col_name] = value
    return dd
195+
196+
197+
# https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary
198+
# https://stackoverflow.com/questions/40855900/pandas-rename-index-values
199+
def delete_combination_notation(table):
    """Strip the ``C(...)`` notation from the index labels of an ANOVA table.

    Every index label is split on ':'; from each part the text 'C(' and all
    ')' characters are removed, and the parts are re-joined with ','. For
    example ``C(a):C(b)`` becomes ``a,b``, while a label without the notation
    (e.g. ``Residual``) is left unchanged.

    The index is renamed in place, so call sites that invoke this function
    without using its return value still see the cleaned labels. The same
    table object is also returned for callers that re-assign the result.

    :param table: pandas DataFrame whose index labels should be cleaned.
    :return: the same ``table`` object, with its index renamed.
    """
    renamed = {}
    for label in table.index:
        parts = str(label).split(':')
        renamed[label] = ",".join(
            part.replace('C(', '').replace(')', '') for part in parts
        )
    table.rename(index=renamed, inplace=True)
    return table
214+
215+
216+
# https://stackoverflow.com/questions/16412563/python-sorting-dictionary-of-dictionaries
217+
def get_significant_interactions(anova_result, alpha, nrOfParameters):
    """Select the most significant entries of an ANOVA result.

    An entry is significant when its 'PR(>F)' value is present and strictly
    smaller than ``alpha``. Entries whose p-value is None (or missing) are
    skipped, and NaN never compares smaller than ``alpha``, so it is skipped
    as well.

    Significant entries are ranked by ascending p-value (ties broken by key)
    and at most ``nrOfParameters`` of them are kept. Each kept entry gets
    ``"is_selected": True`` written into its dictionary; note that this is
    the same dictionary object held by ``anova_result``, so the input is
    marked as well.

    :param anova_result: dict mapping interaction key -> dict of statistics.
    :param alpha: significance threshold for the p-value.
    :param nrOfParameters: maximum number of entries to return.
    :return: ``defaultdict`` (without default factory) mapping the selected
        interaction keys to their statistics dictionaries.
    """
    candidates = []
    for interaction_key, res in anova_result.items():
        pvalue = res.get('PR(>F)')
        # The None check must come first: comparing None with a number
        # raises TypeError on Python 3.
        if pvalue is not None and pvalue < alpha:
            candidates.append((pvalue, interaction_key, res))

    # Sort on (pvalue, key) only, so the statistics dicts are never compared.
    candidates.sort(key=lambda entry: entry[:2])

    dd = defaultdict()
    # max(..., 0) keeps a negative limit meaning "select nothing" rather
    # than turning it into a from-the-end slice.
    for pvalue, interaction_key, res in candidates[:max(nrOfParameters, 0)]:
        res["is_selected"] = True
        dd[interaction_key] = res
    return dd

Backend/oeda/analysis/factorial_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def run(self, data, knobs):
6767
# print "------------------"
6868

6969
aov_table = anova_lm(data_lm, typ=2)
70-
aov_table_sqr= deepcopy(aov_table)
70+
aov_table_sqr = deepcopy(aov_table)
7171
self.eta_squared(aov_table_sqr)
7272
self.omega_squared(aov_table_sqr)
7373
# with pd.option_context('display.max_rows', self.stages_count, 'display.max_columns', 6, 'max_colwidth', 10000):
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
{
2+
"db_type": "elasticsearch",
3+
"host": "localhost",
4+
"port": 9200,
5+
"settings": {
6+
"number_of_shards" : 1,
7+
"number_of_replicas" : 1
8+
},
9+
"index_definitions": {
10+
"experiment": {
11+
"name": "experiment",
12+
"index_name": "oeda_experiment",
13+
"mappings": {
14+
"experiment": {
15+
"properties": {
16+
"executionStrategy": {
17+
"type": "nested",
18+
"properties": {
19+
"knobs": { "type": "object" },
20+
"sample_size": { "type": "integer" },
21+
"type": { "type": "keyword" },
22+
"stages_count": { "type": "integer" }
23+
}
24+
},
25+
"createdDate": { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"},
26+
"targetSystemId": { "type": "keyword" },
27+
"variable_to_be_optimized": { "type": "keyword" }
28+
}
29+
}
30+
}
31+
},
32+
"stage": {
33+
"name": "stage",
34+
"index_name": "oeda_stage",
35+
"mappings": {
36+
"stage": {
37+
"properties": {
38+
"experiment_id": {"type": "keyword"},
39+
"number": { "type": "integer" },
40+
"knobs": { "type": "object" },
41+
"createdDate": { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"},
42+
"stage_result": { "type": "float" }
43+
}
44+
}
45+
}
46+
},
47+
"analysis": {
48+
"name": "analysis",
49+
"index_name": "oeda_analysis",
50+
"mappings": {
51+
"analysis": {
52+
"properties": {
53+
"stage_ids": { "type": "keyword" },
54+
"name": { "type": "keyword" },
55+
"sample_size": { "type": "integer" },
56+
"result": { "type": "object" },
57+
"anova_result":
58+
{
59+
"type": "nested",
60+
"dynamic": true,
61+
"properties": {}
62+
},
63+
"data_type": { "type": "object" },
64+
"createdDate": { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"}
65+
}
66+
}
67+
}
68+
},
69+
"data_point": {
70+
"name": "data_point",
71+
"index_name": "oeda_data_point",
72+
"mappings": {
73+
"data_point": {
74+
"properties": {
75+
"stage_id": { "type": "keyword"},
76+
"payload": { "type": "object" },
77+
"createdDate": { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"}
78+
}
79+
}
80+
}
81+
},
82+
"target_system": {
83+
"name": "target_system",
84+
"index_name": "oeda_target_system",
85+
"mappings": {
86+
"target_system": {
87+
"properties": {
88+
"primary_data_provider": {
89+
"type": "nested",
90+
"properties": {
91+
"type": { "type": "keyword" },
92+
"serializer": { "type": "keyword" },
93+
"ignore_first_n_samples": { "type": "integer" }
94+
}
95+
},
96+
"secondary_data_providers": {
97+
"type": "nested"
98+
},
99+
"change_provider": {
100+
"type": "nested",
101+
"properties": {
102+
"type": { "type": "keyword" },
103+
"serializer": { "type": "keyword" }
104+
}
105+
},
106+
"name": { "type": "keyword" },
107+
"description": { "type": "text" },
108+
"status": { "type": "keyword" },
109+
"createdDate": { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"}
110+
}
111+
}
112+
}
113+
}
114+
}
115+
}

Backend/oeda/rtxlib/executionstrategy/ThreePhaseStrategy.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from oeda.analysis.analysis_execution import start_factorial_tests
55
from oeda.databases import db
66
from oeda.rtxlib.executionstrategy.StepStrategy import start_step_strategy
7-
from oeda.analysis.analysis_execution import get_tuples
7+
from oeda.analysis.analysis_execution import get_tuples, delete_combination_notation, \
8+
iterate_anova_tables, get_significant_interactions
89

910
def start_three_phase_strategy(wf):
1011
""" executes ANOVA, bayesian opt, and Ttest """

Backend/tests/analysis.py

Lines changed: 19 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
from oeda.databases import setup_experiment_database, setup_user_database, db
22
from collections import defaultdict
33
from oeda.analysis.factorial_tests import FactorialAnova
4+
from oeda.analysis.analysis_execution import delete_combination_notation, \
5+
iterate_anova_tables, get_significant_interactions, get_tuples, extract_inner_values
6+
7+
import pprint
8+
pp = pprint.PrettyPrinter(indent=4)
49

510
# there are >= 2 samples for anova
6-
def start_anova(id):
7-
key = "overhead"
11+
def start_anova(id, key, alpha, nrOfParameters):
812
stage_ids, samples, knobs = get_tuples(id, key)
913
test = FactorialAnova(stage_ids=stage_ids, y_key=key, knob_keys=None, stages_count=len(stage_ids))
1014
aov_table, aov_table_sqr = test.run(data=samples, knobs=knobs)
15+
1116
# type(dd) is defaultdict with unique keys
1217
dd = iterate_anova_tables(aov_table=aov_table, aov_table_sqr=aov_table_sqr)
1318

@@ -21,74 +26,24 @@ def start_anova(id):
2126
value = None
2227
anova_result[key][inner_key] = value
2328
db().save_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name=test.name, anova_result=anova_result)
24-
return
25-
26-
def get_tuples(id, key):
27-
stage_ids = db().get_stages(id)[0]
28-
data, knobs = db().get_data_for_analysis(id)
29-
extract_inner_values(key=key, stage_ids=stage_ids, data=data)
30-
# parse data & knobs (k-v pairs) to a proper array of values
31-
samples = [data[stage_id] for stage_id in stage_ids]
32-
knobs = [knobs[stage_id] for stage_id in stage_ids]
33-
return stage_ids, samples, knobs
34-
35-
def extract_inner_values(key, stage_ids, data):
36-
outer_key = "payload"
37-
for stage_id in stage_ids:
38-
res = []
39-
# AnalysisTest.data is a dict of stage_ids and data_points
40-
for data_point in data[stage_id]:
41-
if key in data_point[outer_key]:
42-
res.append(data_point[outer_key][key])
43-
data[stage_id] = res
4429

4530

46-
# type(table) is DataFrame
47-
# rows are keys of the result obj
48-
# values are inner keys of those keys
49-
def iterate_anova_tables(aov_table, aov_table_sqr):
50-
dd = defaultdict(dict)
51-
# iterate first table
52-
for row in aov_table.itertuples():
53-
for col_name in list(aov_table):
54-
if col_name == "PR(>F)" and hasattr(row, "_4"): # PR(>F) is translated to _4 TODO: why?
55-
dd[row.Index][col_name] = getattr(row, "_4")
56-
elif hasattr(row, col_name):
57-
dd[row.Index][col_name] = getattr(row, col_name)
58-
# iterate second table
59-
for row in aov_table_sqr.itertuples():
60-
for col_name in list(aov_table_sqr):
61-
if hasattr(row, col_name):
62-
dd[row.Index][col_name] = getattr(row, col_name)
63-
return dd
31+
retrieved = db().get_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name='two-way-anova')
32+
pp.pprint(retrieved)
6433

65-
def get_influence_parameters(anova_result, alpha, nrOfParameters):
34+
# following part will be integrated to ThreePhaseStr.
35+
# aov_table = delete_combination_notation(aov_table)
36+
# aov_table_sqr = delete_combination_notation(aov_table_sqr)
6637

67-
print "##########"
68-
import pprint
69-
pp = pprint.PrettyPrinter(indent=4)
70-
pp.pprint(anova_result)
71-
print "##########"
72-
# now we want to select the most important factors out of anova result
38+
# si = get_significant_interactions(dd, alpha, nrOfParameters)
39+
# pp.pprint(si)
7340

74-
significant_interactions = []
75-
for interaction_key in anova_result.keys():
76-
pvalue = anova_result[interaction_key]['PR(>F)']
77-
if pvalue < alpha:
78-
significant_interactions.append((interaction_key, pvalue))
79-
80-
sorted_significant_interactions = sorted((value, key) for (key, value) in significant_interactions)
81-
print "!!!!!!!!!"
82-
pp.pprint(sorted_significant_interactions)
83-
print "!!!!!!!!!"
84-
return anova_result
41+
return
8542

8643
if __name__ == '__main__':
87-
# setup_user_database()
44+
nrOfParameters = 3 # to be retrieved from analysis definition
45+
alpha = 0.5 # to be retrieved from analysis definition
8846
setup_experiment_database("elasticsearch", "localhost", 9200)
8947
id = "a780bba9-a2c7-20a5-7be9-ede26d9c9b64"
90-
stage_ids = ["6dc62e9c-3625-85ca-657e-3b06cc269828#1", "6dc62e9c-3625-85ca-657e-3b06cc269828#2", "6dc62e9c-3625-85ca-657e-3b06cc269828#3", "6dc62e9c-3625-85ca-657e-3b06cc269828#4"]
91-
retrieved = db().get_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name="two-way-anova")
92-
nrOfParameters = 2 # to be retrieved from analysis definition
93-
alpha = 0.5 # to be retrieved from analysis definition
94-
get_influence_parameters(retrieved["anova_result"], alpha, nrOfParameters)
48+
key = "overhead"
49+
start_anova(id, key, alpha, nrOfParameters)

Backend/tests/analysis_test.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -247,10 +247,6 @@ def test_s_FactorialAnova(self):
247247
test = FactorialAnova(stage_ids=stage_ids, y_key=AnalysisTest.key, knob_keys=None, stages_count=len(stage_ids))
248248
result = test.run(data=samples, knobs=knobs)
249249
self.assertTrue(result is not None)
250-
# TODO: Ilias, how can we save Factorial Analysis table to DB?
251-
# db().save_analysis(AnalysisTest.stage_ids, test.name, result)
252-
# retrieved = db().get_analysis(AnalysisTest.stage_ids, test.name)
253-
# self.assertTrue(retrieved)
254250
except Exception as e:
255251
error_name = type(e).__name__
256252
self.assertTrue(error_name == "LinAlgError" or error_name == "ValueError")

0 commit comments

Comments
 (0)