updates for 3phase strategy

alinaciuysal · alinaciuysal · commit 0b2f4befd709 · 2018-06-09T01:11:23.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -7,9 +7,6 @@ Backend/build/**
 Backend/OEDA_Backend.egg-info/**
 Backend/tools/**
 Backend/tests/http-test-server/node_modules
-# Templates for database configuration
-Backend/oeda/databases/experiment_db_config.json
-Backend/oeda/databases/user_db_config.json
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/Backend/oeda/analysis/analysis_execution.py b/Backend/oeda/analysis/analysis_execution.py
@@ -128,10 +128,14 @@ def start_factorial_tests(wf):
         stage_ids, samples, knobs = get_tuples(id, key)
         test = FactorialAnova(stage_ids=stage_ids, y_key=key, knob_keys=None, stages_count=len(stage_ids))
         aov_table, aov_table_sqr = test.run(data=samples, knobs=knobs)
+        # before saving and merging tables, extract useful information
+        delete_combination_notation(aov_table)
+        delete_combination_notation(aov_table_sqr)
+
         # type(dd) is defaultdict with unique keys
         dd = iterate_anova_tables(aov_table=aov_table, aov_table_sqr=aov_table_sqr)
 
-        # keys e.g. C(exploration_percentage), C(route_random_sigma), Residual
+        # keys = [exploration_percentage, route_random_sigma, exploration_percentage,route_random_sigma...]
         # resultDict e.g. {'PR(>F)': 0.0949496951695454, 'F': 2.8232330924997346 ...
         anova_result = dict()
         for key, resultDict in dd.items():
@@ -140,6 +144,7 @@ def start_factorial_tests(wf):
                 if str(value) == 'nan':
                     value = None
                 anova_result[key][inner_key] = value
+        # TODO: before saving, should we also filter according to the significant interactions?
         db().save_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name=test.name, anova_result=anova_result)
         return True, aov_table, aov_table_sqr
     else:
@@ -168,23 +173,66 @@ def extract_inner_values(key, stage_ids, data):
 
 
 # type(table) is DataFrame
-# rows are keys of the result obj
-# values are inner keys of those keys
+# row labels become the keys of the result obj, e.g. C(param1), C(param2), C(param1):C(param2)
+# column values become the inner entries of those keys, so each key maps to a dict
+# the C(...) notation is not stripped here; stripping it is the job of delete_combination_notation
 def iterate_anova_tables(aov_table, aov_table_sqr):
     dd = defaultdict(dict)
-    print(aov_table)
     # iterate first table
     for row in aov_table.itertuples():
         for col_name in list(aov_table):
-            if col_name == "PR(>F)" and hasattr(row, "_4"): # PR(>F) is translated to _4 TODO: why?
+            if col_name == "PR(>F)" and hasattr(row, "_4"): # itertuples() yields namedtuples; "PR(>F)" is not a valid identifier, so pandas renames it positionally (here _4)
                 dd[row.Index][col_name] = getattr(row, "_4")
             elif hasattr(row, col_name):
                 dd[row.Index][col_name] = getattr(row, col_name)
 
-    print(aov_table_sqr)
     # iterate second table
     for row in aov_table_sqr.itertuples():
         for col_name in list(aov_table_sqr):
             if hasattr(row, col_name):
                 dd[row.Index][col_name] = getattr(row, col_name)
+    return dd
+
+
+# https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary
+# https://stackoverflow.com/questions/40855900/pandas-rename-index-values
+def delete_combination_notation(table):
+    """Strip the C(...) notation from the row labels of an ANOVA table.
+
+    "C(a):C(b)" becomes "a,b"; labels without the notation are kept as-is.
+    The index is rewritten in place, because start_factorial_tests calls
+    this without using the return value; the table is returned as well.
+
+    :param table: pandas DataFrame whose index holds the ANOVA term labels
+    :return: the same DataFrame object, with cleaned index labels
+    """
+    def clean(label):
+        terms = str(label).split(':')
+        return ",".join(t.replace('C(', '').replace(')', '') for t in terms)
+    table.rename(index=clean, inplace=True)
+    return table
+
+
+# https://stackoverflow.com/questions/16412563/python-sorting-dictionary-of-dictionaries
+def get_significant_interactions(anova_result, alpha, nrOfParameters):
+    """Return the (at most nrOfParameters) terms with the lowest p-value < alpha.
+
+    :param anova_result: dict of term key -> result dict holding 'PR(>F)'
+        (the p-value; None, e.g. for Residual, is skipped)
+    :return: defaultdict of term key -> the caller's result dict, which is
+        additionally marked with "is_selected": True (may be used by the UI)
+    """
+    # test for None first: None < alpha raises TypeError on Python 3 and is
+    # a meaningless comparison (None sorts below every number) on Python 2
+    candidates = [(res['PR(>F)'], interaction_key, res)
+                  for interaction_key, res in anova_result.items()
+                  if res['PR(>F)'] is not None and res['PR(>F)'] < alpha]
+    # order by (p-value, key) only, so the result dicts are never compared
+    candidates.sort(key=lambda entry: entry[:2])
+    dd = defaultdict()
+    for idx, (pvalue, interaction_key, res) in enumerate(candidates):
+        if idx >= nrOfParameters:
+            break
+        res["is_selected"] = True
+        dd[interaction_key] = res
     return dd
diff --git a/Backend/oeda/analysis/factorial_tests.py b/Backend/oeda/analysis/factorial_tests.py
@@ -67,7 +67,7 @@ def run(self, data, knobs):
         # print "------------------"
 
         aov_table = anova_lm(data_lm, typ=2)
-        aov_table_sqr= deepcopy(aov_table)
+        aov_table_sqr = deepcopy(aov_table)
         self.eta_squared(aov_table_sqr)
         self.omega_squared(aov_table_sqr)
         # with pd.option_context('display.max_rows', self.stages_count, 'display.max_columns', 6, 'max_colwidth', 10000):
diff --git a/Backend/oeda/databases/experiment_db_config.json b/Backend/oeda/databases/experiment_db_config.json
@@ -0,0 +1,115 @@
+{
+  "db_type": "elasticsearch",
+  "host": "localhost",
+  "port": 9200,
+  "settings": {
+    "number_of_shards" : 1,
+    "number_of_replicas" : 1
+  },
+  "index_definitions": {
+    "experiment": {
+      "name": "experiment",
+      "index_name": "oeda_experiment",
+      "mappings": {
+        "experiment": {
+          "properties": {
+            "executionStrategy": {
+              "type": "nested",
+              "properties": {
+                "knobs":                   { "type": "object"  },
+                "sample_size":             { "type": "integer" },
+                "type":                    { "type": "keyword" },
+                "stages_count":            { "type": "integer" }
+              }
+            },
+            "createdDate":                  { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"},
+            "targetSystemId":           { "type": "keyword"  },
+            "variable_to_be_optimized": { "type": "keyword"  }
+          }
+        }
+      }
+    },
+    "stage": {
+      "name": "stage",
+      "index_name": "oeda_stage",
+      "mappings": {
+        "stage": {
+          "properties":     {
+            "experiment_id": {"type": "keyword"},
+            "number":       { "type": "integer" },
+            "knobs":        { "type": "object"  },
+            "createdDate":  { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"},
+            "stage_result": { "type": "float" }
+          }
+        }
+      }
+    },
+    "analysis": {
+      "name": "analysis",
+      "index_name": "oeda_analysis",
+      "mappings": {
+        "analysis": {
+          "properties": {
+            "stage_ids":     { "type": "keyword" },
+            "name":          { "type": "keyword" },
+            "sample_size":   { "type": "integer" },
+            "result":        { "type": "object"  },
+            "anova_result":
+            {
+              "type": "nested",
+              "dynamic": true,
+              "properties": {}
+            },
+            "data_type":        { "type": "object"  },
+            "createdDate":       { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"}
+          }
+        }
+      }
+    },
+    "data_point": {
+      "name": "data_point",
+      "index_name": "oeda_data_point",
+      "mappings": {
+        "data_point": {
+          "properties": {
+            "stage_id": { "type": "keyword"},
+            "payload":  { "type": "object"  },
+            "createdDate":  { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"}
+          }
+        }
+      }
+    },
+    "target_system": {
+      "name": "target_system",
+      "index_name": "oeda_target_system",
+      "mappings": {
+        "target_system": {
+          "properties": {
+            "primary_data_provider": {
+              "type": "nested",
+              "properties": {
+                "type":          { "type": "keyword" },
+                "serializer":    { "type": "keyword" },
+                "ignore_first_n_samples":  { "type": "integer" }
+              }
+            },
+            "secondary_data_providers": {
+              "type": "nested"
+            },
+            "change_provider": {
+              "type": "nested",
+              "properties": {
+                "type":          { "type": "keyword" },
+                "serializer":    { "type": "keyword" }
+              }
+            },
+            "name":         { "type": "keyword"  },
+            "description":  { "type": "text"     },
+            "status":       { "type": "keyword"  },
+            "createdDate":      { "type": "date", "format": "yyyy-MM-dd HH||yyyy-MM-dd HH:mm||yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ss.SSSSSS"}
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/Backend/oeda/rtxlib/executionstrategy/ThreePhaseStrategy.py b/Backend/oeda/rtxlib/executionstrategy/ThreePhaseStrategy.py
@@ -4,7 +4,8 @@
 from oeda.analysis.analysis_execution import start_factorial_tests
 from oeda.databases import db
 from oeda.rtxlib.executionstrategy.StepStrategy import start_step_strategy
-from oeda.analysis.analysis_execution import get_tuples
+from oeda.analysis.analysis_execution import get_tuples, delete_combination_notation, \
+                                             iterate_anova_tables, get_significant_interactions
 
 def start_three_phase_strategy(wf):
     """ executes ANOVA, bayesian opt, and Ttest """
diff --git a/Backend/tests/analysis.py b/Backend/tests/analysis.py
@@ -1,13 +1,18 @@
 from oeda.databases import setup_experiment_database, setup_user_database, db
 from collections import defaultdict
 from oeda.analysis.factorial_tests import FactorialAnova
+from oeda.analysis.analysis_execution import delete_combination_notation, \
+    iterate_anova_tables, get_significant_interactions, get_tuples, extract_inner_values
+
+import pprint
+pp = pprint.PrettyPrinter(indent=4)
 
 # there are >= 2 samples for anova
-def start_anova(id):
-    key = "overhead"
+def start_anova(id, key, alpha, nrOfParameters):
     stage_ids, samples, knobs = get_tuples(id, key)
     test = FactorialAnova(stage_ids=stage_ids, y_key=key, knob_keys=None, stages_count=len(stage_ids))
     aov_table, aov_table_sqr = test.run(data=samples, knobs=knobs)
+
     # type(dd) is defaultdict with unique keys
     dd = iterate_anova_tables(aov_table=aov_table, aov_table_sqr=aov_table_sqr)
 
@@ -21,74 +26,24 @@ def start_anova(id):
                 value = None
             anova_result[key][inner_key] = value
     db().save_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name=test.name, anova_result=anova_result)
-    return
-
-def get_tuples(id, key):
-    stage_ids = db().get_stages(id)[0]
-    data, knobs = db().get_data_for_analysis(id)
-    extract_inner_values(key=key, stage_ids=stage_ids, data=data)
-    # parse data & knobs (k-v pairs) to a proper array of values
-    samples = [data[stage_id] for stage_id in stage_ids]
-    knobs = [knobs[stage_id] for stage_id in stage_ids]
-    return stage_ids, samples, knobs
-
-def extract_inner_values(key, stage_ids, data):
-    outer_key = "payload"
-    for stage_id in stage_ids:
-        res = []
-        # AnalysisTest.data is a dict of stage_ids and data_points
-        for data_point in data[stage_id]:
-            if key in data_point[outer_key]:
-                res.append(data_point[outer_key][key])
-        data[stage_id] = res
 
 
-# type(table) is DataFrame
-# rows are keys of the result obj
-# values are inner keys of those keys
-def iterate_anova_tables(aov_table, aov_table_sqr):
-    dd = defaultdict(dict)
-    # iterate first table
-    for row in aov_table.itertuples():
-        for col_name in list(aov_table):
-            if col_name == "PR(>F)" and hasattr(row, "_4"): # PR(>F) is translated to _4 TODO: why?
-                dd[row.Index][col_name] = getattr(row, "_4")
-            elif hasattr(row, col_name):
-                dd[row.Index][col_name] = getattr(row, col_name)
-    # iterate second table
-    for row in aov_table_sqr.itertuples():
-        for col_name in list(aov_table_sqr):
-            if hasattr(row, col_name):
-                dd[row.Index][col_name] = getattr(row, col_name)
-    return dd
+    retrieved = db().get_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name='two-way-anova')
+    pp.pprint(retrieved)
 
-def get_influence_parameters(anova_result, alpha, nrOfParameters):
+    # following part will be integrated to ThreePhaseStr.
+    # aov_table = delete_combination_notation(aov_table)
+    # aov_table_sqr = delete_combination_notation(aov_table_sqr)
 
-    print "##########"
-    import pprint
-    pp = pprint.PrettyPrinter(indent=4)
-    pp.pprint(anova_result)
-    print "##########"
-    # now we want to select the most important factors out of anova result
+    # si = get_significant_interactions(dd, alpha, nrOfParameters)
+    # pp.pprint(si)
 
-    significant_interactions = []
-    for interaction_key in anova_result.keys():
-        pvalue = anova_result[interaction_key]['PR(>F)']
-        if pvalue < alpha:
-            significant_interactions.append((interaction_key, pvalue))
-
-    sorted_significant_interactions = sorted((value, key) for (key, value) in significant_interactions)
-    print "!!!!!!!!!"
-    pp.pprint(sorted_significant_interactions)
-    print "!!!!!!!!!"
-    return anova_result
+    return
 
 if __name__ == '__main__':
-    # setup_user_database()
+    nrOfParameters = 3 # to be retrieved from analysis definition
+    alpha = 0.5 # to be retrieved from analysis definition
     setup_experiment_database("elasticsearch", "localhost", 9200)
     id = "a780bba9-a2c7-20a5-7be9-ede26d9c9b64"
-    stage_ids = ["6dc62e9c-3625-85ca-657e-3b06cc269828#1", "6dc62e9c-3625-85ca-657e-3b06cc269828#2", "6dc62e9c-3625-85ca-657e-3b06cc269828#3", "6dc62e9c-3625-85ca-657e-3b06cc269828#4"]
-    retrieved = db().get_analysis(experiment_id=id, stage_ids=stage_ids, analysis_name="two-way-anova")
-    nrOfParameters = 2 # to be retrieved from analysis definition
-    alpha = 0.5 # to be retrieved from analysis definition
-    get_influence_parameters(retrieved["anova_result"], alpha, nrOfParameters)
+    key = "overhead"
+    start_anova(id, key, alpha, nrOfParameters)
diff --git a/Backend/tests/analysis_test.py b/Backend/tests/analysis_test.py
@@ -247,10 +247,6 @@ def test_s_FactorialAnova(self):
             test = FactorialAnova(stage_ids=stage_ids, y_key=AnalysisTest.key, knob_keys=None, stages_count=len(stage_ids))
             result = test.run(data=samples, knobs=knobs)
             self.assertTrue(result is not None)
-            # TODO: Ilias, how can we save Factorial Analysis table to DB?
-            # db().save_analysis(AnalysisTest.stage_ids, test.name, result)
-            # retrieved = db().get_analysis(AnalysisTest.stage_ids, test.name)
-            # self.assertTrue(retrieved)
         except Exception as e:
             error_name = type(e).__name__
             self.assertTrue(error_name == "LinAlgError" or error_name == "ValueError")