From 4c5a80477bbc69c038b220d95d2d8bc07c41fefe Mon Sep 17 00:00:00 2001
From: "diego.esteves" <diego.esteves@farfetch.com>
Date: Fri, 1 May 2020 00:29:27 +0100
Subject: [PATCH] #51 - text features dict - feat extraction - bash update
 (spacy models)

---
 src/features/horus_feature_extraction.py | 172 ++++++++++++-----------
 src/horus_meta.py                        | 103 ++++++++------
 2 files changed, 146 insertions(+), 129 deletions(-)

diff --git a/src/features/horus_feature_extraction.py b/src/features/horus_feature_extraction.py
index 6a06d50..83a11d1 100644
--- a/src/features/horus_feature_extraction.py
+++ b/src/features/horus_feature_extraction.py
@@ -1,3 +1,4 @@
+import heapq
 from abc import ABC, abstractmethod, ABCMeta
 import gensim as gensim
 from src import definitions
@@ -23,7 +24,7 @@
 from sklearn import preprocessing
 from sklearn.preprocessing import normalize
 
-from src.utils.definitions_sql import SQL_TEXT_CLASS_SEL
+from src.utils.definitions_sql import SQL_TEXT_CLASS_SEL, SQL_TEXT_CLASS_UPD
 from src.utils.nlp_tools import NLPTools
 from src.utils.translation.azure import bing_detect_language, bing_translate_text
 from src.utils.translation.bingtranslation import BingTranslator
@@ -276,14 +277,14 @@ def extract_features(self, horus: Horus) -> bool:
 
                             limit_txt = min(nr_results_txt, int(self.config.search_engine_tot_resources))
                             tot_error_translation = 0
-                            embs, top5_sim = self.__get_number_classes_in_embeedings(term)
+                            embs, top5_sim = self.__get_number_classes_in_embeedings(token.text)
 
                             klass_top = []
                             tm_cnn_w = []
                             tm_cnn_w_exp = []
                             if self.text_tm is not None:
                                 # TM+CNN - term
-                                tm_cnn_w = self.text_tm.detect_text_klass(term)
+                                tm_cnn_w = self.text_tm.detect_text_klass(token.text)
                                 tm_cnn_w_exp = [np.math.pow(i, 2) for i in tm_cnn_w]
 
                             if self.text_tm is not None and top5_sim is not None:
@@ -333,8 +334,6 @@ def extract_features(self, horus: Horus) -> bool:
                                         embs.append(rows[itxt][19])
                                         embs.append(rows[itxt][20])
 
-
-
                                 except Exception as e:
                                     self.config.logger.error(str(e.message))
                                     pass
@@ -364,90 +363,97 @@ def extract_features(self, horus: Horus) -> bool:
 
                             horus_tx_ner = gpb.index(max(gpb)) + 1
 
-                            self.horus_matrix[index][definitions.INDEX_TOT_RESULTS_TX] = limit_txt
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_LOC] = gpb[0]
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_ORG] = gpb[1]
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_PER] = gpb[2]
-                            self.horus_matrix[index][definitions.INDEX_TOT_ERR_TRANS] = tot_error_translation
-
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_LOC_TM_CNN] = 0 if len(tm_cnn_w) == 0 else \
-                            tm_cnn_w[0]
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_ORG_TM_CNN] = 0 if len(tm_cnn_w) == 0 else \
-                            tm_cnn_w[1]
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_PER_TM_CNN] = 0 if len(tm_cnn_w) == 0 else \
-                            tm_cnn_w[2]
-                            self.horus_matrix[index][definitions.INDEX_TOT_TX_NONE_TM_CNN] = 0 if len(
-                                tm_cnn_w) == 0 else tm_cnn_w[3]
+                            token.features.text.values[tx_dict_reversed.get('total.retrieved.results.search_engine')] \
+                                = limit_txt
+
+                            token.features.text.values[tx_dict_reversed.get('total.binary.k.loc')] = gpb[0]
+                            token.features.text.values[tx_dict_reversed.get('total.binary.k.org')] = gpb[1]
+                            token.features.text.values[tx_dict_reversed.get('total.binary.k.per')] = gpb[2]
+                            token.features.text.values[tx_dict_reversed.get('total.binary.k.other')] = 0
+                            token.features.text.values[tx_dict_reversed.get('total.error.translation')] = \
+                                tot_error_translation
+
+                            token.features.text.values[tx_dict_reversed.get('total.topic.k.loc')] = \
+                                0 if len(tm_cnn_w) == 0 else tm_cnn_w[0]
+                            token.features.text.values[tx_dict_reversed.get('total.topic.k.org')] = \
+                                0 if len(tm_cnn_w) == 0 else tm_cnn_w[1]
+                            token.features.text.values[tx_dict_reversed.get('total.topic.k.per')] = \
+                                0 if len(tm_cnn_w) == 0 else tm_cnn_w[2]
+                            token.features.text.values[tx_dict_reversed.get('total.topic.k.other')] = \
+                                0 if len(tm_cnn_w) == 0 else tm_cnn_w[3]
 
                             maxs_tx = heapq.nlargest(2, gpb)
                             maxs_tm = 0 if len(tm_cnn_w) == 0 else heapq.nlargest(2, tm_cnn_w)
                             dist_tx_indicator = max(maxs_tx) - min(maxs_tx)
                             dist_tx_indicator_tm = 0 if len(yytm) == 0 else (max(maxs_tm) - min(maxs_tm))
 
-                            self.horus_matrix[index][definitions.INDEX_DIST_TX_I] = dist_tx_indicator
-                            self.horus_matrix[index][definitions.INDEX_NR_RESULTS_SE_TX] = nr_results_txt
-                            self.horus_matrix[index][definitions.INDEX_DIST_TX_I_TM_CNN] = dist_tx_indicator_tm
-
-                            self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_LOC] = embs[0]
-                            self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_ORG] = embs[1]
-                            self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_PER] = embs[2]
-                            self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_NONE] = embs[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_LOC] = \
-                            topic_klass_top_sums[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_ORG] = \
-                            topic_klass_top_sums[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_PER] = \
-                            topic_klass_top_sums[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_NONE] = \
-                            topic_klass_top_sums[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_LOC] = \
-                            topic_klass_top_avg[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_ORG] = \
-                            topic_klass_top_avg[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_PER] = \
-                            topic_klass_top_avg[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_NONE] = \
-                            topic_klass_top_avg[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_LOC] = \
-                            topic_klass_top_max[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_ORG] = \
-                            topic_klass_top_max[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_PER] = \
-                            topic_klass_top_max[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_NONE] = \
-                            topic_klass_top_max[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_LOC] = \
-                            topic_klass_top_min[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_ORG] = \
-                            topic_klass_top_min[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_PER] = \
-                            topic_klass_top_min[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_NONE] = \
-                            topic_klass_top_min[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_LOC] = topic_sums[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_ORG] = topic_sums[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_PER] = topic_sums[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_NONE] = topic_sums[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_LOC] = topic_avg[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_ORG] = topic_avg[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_PER] = topic_avg[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_NONE] = topic_avg[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_LOC] = topic_max[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_ORG] = topic_max[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_PER] = topic_max[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_NONE] = topic_max[3]
-
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_LOC] = topic_min[0]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_ORG] = topic_min[1]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_PER] = topic_min[2]
-                            self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_NONE] = topic_min[3]
+                            token.features.text.values[tx_dict_reversed.get('dist.k')] = dist_tx_indicator
+                            token.features.text.values[tx_dict_reversed.get('dist.k.topic_model')] = \
+                                dist_tx_indicator_tm
+                            token.features.text.values[tx_dict_reversed.get('total.results.search_engine')] = \
+                                nr_results_txt
+
+                            token.features.text.values[tx_dict_reversed.get('total.emb.similar.loc')] = embs[0]
+                            token.features.text.values[tx_dict_reversed.get('total.emb.similar.org')] = embs[1]
+                            token.features.text.values[tx_dict_reversed.get('total.emb.similar.per')] = embs[2]
+                            token.features.text.values[tx_dict_reversed.get('total.emb.similar.other')] = embs[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.loc')] = \
+                                topic_klass_top_sums[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.org')] = \
+                                topic_klass_top_sums[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.per')] = \
+                                topic_klass_top_sums[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.other')] = \
+                                topic_klass_top_sums[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.loc')] = \
+                                topic_klass_top_avg[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.org')] = \
+                                topic_klass_top_avg[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.per')] = \
+                                topic_klass_top_avg[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.other')] = \
+                                topic_klass_top_avg[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.loc')] = \
+                                topic_klass_top_max[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.org')] = \
+                                topic_klass_top_max[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.per')] = \
+                                topic_klass_top_max[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.other')] = \
+                                topic_klass_top_max[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.loc')] = \
+                                topic_klass_top_min[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.org')] = \
+                                topic_klass_top_min[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.per')] = \
+                                topic_klass_top_min[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.other')] = \
+                                topic_klass_top_min[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.sum.loc')] = topic_sums[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.sum.org')] = topic_sums[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.sum.per')] = topic_sums[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.sum.other')] = topic_sums[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.avg.loc')] = topic_avg[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.avg.org')] = topic_avg[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.avg.per')] = topic_avg[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.avg.other')] = topic_avg[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.max.loc')] = topic_max[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.max.org')] = topic_max[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.max.per')] = topic_max[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.max.other')] = topic_max[3]
+
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.min.loc')] = topic_min[0]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.min.org')] = topic_min[1]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.min.per')] = topic_min[2]
+                            token.features.text.values[tx_dict_reversed.get('stats.topic.min.other')] = topic_min[3]
+
 
                             if limit_txt != 0:
                                 self.horus_matrix[index][definitions.INDEX_MAX_KLASS_PREDICT_TX] = \
diff --git a/src/horus_meta.py b/src/horus_meta.py
index 69ac6c7..e09e0c4 100644
--- a/src/horus_meta.py
+++ b/src/horus_meta.py
@@ -28,57 +28,68 @@ def get_visual() -> dict:
     def get_textual() -> dict:
 
         features = {
-            0: 'total.global.results.search_engine',
+            0: 'total.results.search_engine',
             1: 'total.retrieved.results.search_engine',
             2: 'total.error.translation',
             3: 'total.binary.k.loc',
             4: 'total.binary.k.org',
             5: 'total.binary.k.per',
-            6: 'total.binary.k.other',
-            7: 'top.binary.k',
-            8: 'dist.k',
-            9: 'total.topic.k.loc',
-            10: 'total.topic.k.org',
-            11: 'total.topic.k.per',
-            12: 'total.topic.k.other',
-            13: 'top.topic.k',
-            14: 'dist.k.topic_model',
-            15: 'total.emb.similar.loc',
-            16: 'total.emb.similar.org',
-            17: 'total.emb.similar.per',
-            18: 'total.emb.similar.other',
-            19: 'stats.topic.top.k.sum.loc',
-            20: 'stats.topic.top.k.sum.org',
-            21: 'stats.topic.top.k.sum.per',
-            22: 'stats.topic.top.k.sum.other',
-            23: 'stats.topic.top.k.avg.loc',
-            24: 'stats.topic.top.k.avg.org',
-            25: 'stats.topic.top.k.avg.per',
-            26: 'stats.topic.top.k.avg.other',
-            27: 'stats.topic.top.k.max.loc',
-            28: 'stats.topic.top.k.max.org',
-            29: 'stats.topic.top.k.max.per',
-            30: 'stats.topic.top.k.max.other',
-            31: 'stats.topic.top.k.min.loc',
-            32: 'stats.topic.top.k.min.org',
-            33: 'stats.topic.top.k.min.per',
-            34: 'stats.topic.top.k.min.other',
-            35: 'stats.topic.sum.loc',
-            36: 'stats.topic.sum.org',
-            37: 'stats.topic.sum.per',
-            38: 'stats.topic.sum.other',
-            39: 'stats.topic.avg.loc',
-            40: 'stats.topic.avg.org',
-            41: 'stats.topic.avg.per',
-            42: 'stats.topic.avg.other',
-            43: 'stats.topic.max.loc',
-            44: 'stats.topic.max.org',
-            45: 'stats.topic.max.per',
-            46: 'stats.topic.max.other',
-            47: 'stats.topic.min.loc',
-            48: 'stats.topic.min.org',
-            49: 'stats.topic.min.per',
-            50: 'stats.topic.min.other',
+            6: 'total.binary.k.misc',
+            7: 'total.binary.k.other',
+            8: 'top.binary.k',
+            9: 'dist.k',
+            10: 'total.topic.k.loc',
+            11: 'total.topic.k.org',
+            12: 'total.topic.k.per',
+            13: 'total.topic.k.misc',
+            14: 'total.topic.k.other',
+            15: 'top.topic.k',
+            16: 'dist.k.topic_model',
+            17: 'total.emb.similar.loc',
+            18: 'total.emb.similar.org',
+            19: 'total.emb.similar.per',
+            20: 'total.emb.similar.misc',
+            21: 'total.emb.similar.other',
+            22: 'stats.topic.top.k.sum.loc',
+            23: 'stats.topic.top.k.sum.org',
+            24: 'stats.topic.top.k.sum.per',
+            25: 'stats.topic.top.k.sum.misc',
+            26: 'stats.topic.top.k.sum.other',
+            27: 'stats.topic.top.k.avg.loc',
+            28: 'stats.topic.top.k.avg.org',
+            29: 'stats.topic.top.k.avg.per',
+            30: 'stats.topic.top.k.avg.misc',
+            31: 'stats.topic.top.k.avg.other',
+            32: 'stats.topic.top.k.max.loc',
+            33: 'stats.topic.top.k.max.org',
+            34: 'stats.topic.top.k.max.per',
+            35: 'stats.topic.top.k.max.misc',
+            36: 'stats.topic.top.k.max.other',
+            37: 'stats.topic.top.k.min.loc',
+            38: 'stats.topic.top.k.min.org',
+            39: 'stats.topic.top.k.min.per',
+            40: 'stats.topic.top.k.min.misc',
+            41: 'stats.topic.top.k.min.other',
+            42: 'stats.topic.sum.loc',
+            43: 'stats.topic.sum.org',
+            44: 'stats.topic.sum.per',
+            45: 'stats.topic.sum.misc',
+            46: 'stats.topic.sum.other',
+            47: 'stats.topic.avg.loc',
+            48: 'stats.topic.avg.org',
+            49: 'stats.topic.avg.per',
+            50: 'stats.topic.avg.misc',
+            51: 'stats.topic.avg.other',
+            52: 'stats.topic.max.loc',
+            53: 'stats.topic.max.org',
+            54: 'stats.topic.max.per',
+            55: 'stats.topic.max.misc',
+            56: 'stats.topic.max.other',
+            57: 'stats.topic.min.loc',
+            58: 'stats.topic.min.org',
+            59: 'stats.topic.min.per',
+            60: 'stats.topic.min.misc',
+            61: 'stats.topic.min.other',
         }
 
         reversed_features = dict([(value, key) for key, value in features.items()])