From 4c5a80477bbc69c038b220d95d2d8bc07c41fefe Mon Sep 17 00:00:00 2001 From: "diego.esteves" Date: Fri, 1 May 2020 00:29:27 +0100 Subject: [PATCH] #51 - text features dict - feat extraction - bash update (spacy models) --- src/features/horus_feature_extraction.py | 172 ++++++++++++----------- src/horus_meta.py | 103 ++++++++------ 2 files changed, 146 insertions(+), 129 deletions(-) diff --git a/src/features/horus_feature_extraction.py b/src/features/horus_feature_extraction.py index 6a06d50..83a11d1 100644 --- a/src/features/horus_feature_extraction.py +++ b/src/features/horus_feature_extraction.py @@ -1,3 +1,4 @@ +import heapq from abc import ABC, abstractmethod, ABCMeta import gensim as gensim from src import definitions @@ -23,7 +24,7 @@ from sklearn import preprocessing from sklearn.preprocessing import normalize -from src.utils.definitions_sql import SQL_TEXT_CLASS_SEL +from src.utils.definitions_sql import SQL_TEXT_CLASS_SEL, SQL_TEXT_CLASS_UPD from src.utils.nlp_tools import NLPTools from src.utils.translation.azure import bing_detect_language, bing_translate_text from src.utils.translation.bingtranslation import BingTranslator @@ -276,14 +277,14 @@ def extract_features(self, horus: Horus) -> bool: limit_txt = min(nr_results_txt, int(self.config.search_engine_tot_resources)) tot_error_translation = 0 - embs, top5_sim = self.__get_number_classes_in_embeedings(term) + embs, top5_sim = self.__get_number_classes_in_embeedings(token.text) klass_top = [] tm_cnn_w = [] tm_cnn_w_exp = [] if self.text_tm is not None: # TM+CNN - term - tm_cnn_w = self.text_tm.detect_text_klass(term) + tm_cnn_w = self.text_tm.detect_text_klass(token.text) tm_cnn_w_exp = [np.math.pow(i, 2) for i in tm_cnn_w] if self.text_tm is not None and top5_sim is not None: @@ -333,8 +334,6 @@ def extract_features(self, horus: Horus) -> bool: embs.append(rows[itxt][19]) embs.append(rows[itxt][20]) - - except Exception as e: self.config.logger.error(str(e.message)) pass @@ -364,90 +363,97 @@ def extract_features(self, horus: Horus) -> bool: horus_tx_ner = gpb.index(max(gpb)) + 1 - self.horus_matrix[index][definitions.INDEX_TOT_RESULTS_TX] = limit_txt - self.horus_matrix[index][definitions.INDEX_TOT_TX_LOC] = gpb[0] - self.horus_matrix[index][definitions.INDEX_TOT_TX_ORG] = gpb[1] - self.horus_matrix[index][definitions.INDEX_TOT_TX_PER] = gpb[2] - self.horus_matrix[index][definitions.INDEX_TOT_ERR_TRANS] = tot_error_translation - - self.horus_matrix[index][definitions.INDEX_TOT_TX_LOC_TM_CNN] = 0 if len(tm_cnn_w) == 0 else \ - tm_cnn_w[0] - self.horus_matrix[index][definitions.INDEX_TOT_TX_ORG_TM_CNN] = 0 if len(tm_cnn_w) == 0 else \ - tm_cnn_w[1] - self.horus_matrix[index][definitions.INDEX_TOT_TX_PER_TM_CNN] = 0 if len(tm_cnn_w) == 0 else \ - tm_cnn_w[2] - self.horus_matrix[index][definitions.INDEX_TOT_TX_NONE_TM_CNN] = 0 if len( - tm_cnn_w) == 0 else tm_cnn_w[3] + token.features.text.values[tx_dict_reversed.get('total.retrieved.results.search_engine')] \ + = limit_txt + + token.features.text.values[tx_dict_reversed.get('total.binary.k.loc')] = gpb[0] + token.features.text.values[tx_dict_reversed.get('total.binary.k.org')] = gpb[1] + token.features.text.values[tx_dict_reversed.get('total.binary.k.per')] = gpb[2] + token.features.text.values[tx_dict_reversed.get('total.binary.k.other')] = 0 + token.features.text.values[tx_dict_reversed.get('total.error.translation')] = \ + tot_error_translation + + token.features.text.values[tx_dict_reversed.get('total.topic.k.loc')] = \ + 0 if len(tm_cnn_w) == 0 else tm_cnn_w[0] + token.features.text.values[tx_dict_reversed.get('total.topic.k.org')] = \ + 0 if len(tm_cnn_w) == 0 else tm_cnn_w[1] + token.features.text.values[tx_dict_reversed.get('total.topic.k.per')] = \ + 0 if len(tm_cnn_w) == 0 else tm_cnn_w[2] + token.features.text.values[tx_dict_reversed.get('total.topic.k.other')] = \ + 0 if len(tm_cnn_w) == 0 else tm_cnn_w[3] maxs_tx = heapq.nlargest(2, gpb) maxs_tm = 0 if len(tm_cnn_w) == 0 else heapq.nlargest(2, tm_cnn_w) dist_tx_indicator = max(maxs_tx) - min(maxs_tx) dist_tx_indicator_tm = 0 if len(yytm) == 0 else (max(maxs_tm) - min(maxs_tm)) - self.horus_matrix[index][definitions.INDEX_DIST_TX_I] = dist_tx_indicator - self.horus_matrix[index][definitions.INDEX_NR_RESULTS_SE_TX] = nr_results_txt - self.horus_matrix[index][definitions.INDEX_DIST_TX_I_TM_CNN] = dist_tx_indicator_tm - - self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_LOC] = embs[0] - self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_ORG] = embs[1] - self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_PER] = embs[2] - self.horus_matrix[index][definitions.INDEX_TOT_EMB_SIMILAR_NONE] = embs[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_LOC] = \ - topic_klass_top_sums[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_ORG] = \ - topic_klass_top_sums[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_PER] = \ - topic_klass_top_sums[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_SUM_NONE] = \ - topic_klass_top_sums[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_LOC] = \ - topic_klass_top_avg[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_ORG] = \ - topic_klass_top_avg[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_PER] = \ - topic_klass_top_avg[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_AVG_NONE] = \ - topic_klass_top_avg[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_LOC] = \ - topic_klass_top_max[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_ORG] = \ - topic_klass_top_max[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_PER] = \ - topic_klass_top_max[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MAX_NONE] = \ - topic_klass_top_max[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_LOC] = \ - topic_klass_top_min[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_ORG] = \ - topic_klass_top_min[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_PER] = \ - topic_klass_top_min[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_T_PLUS_TOP5_K_MIN_NONE] = \ - topic_klass_top_min[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_LOC] = topic_sums[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_ORG] = topic_sums[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_PER] = topic_sums[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_SUM_NONE] = topic_sums[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_LOC] = topic_avg[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_ORG] = topic_avg[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_PER] = topic_avg[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_AVG_NONE] = topic_avg[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_LOC] = topic_max[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_ORG] = topic_max[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_PER] = topic_max[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MAX_NONE] = topic_max[3] - - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_LOC] = topic_min[0] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_ORG] = topic_min[1] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_PER] = topic_min[2] - self.horus_matrix[index][definitions.INDEX_TX_CNN_STAT_MIN_NONE] = topic_min[3] + token.features.text.values[tx_dict_reversed.get('dist.k')] = dist_tx_indicator + token.features.text.values[tx_dict_reversed.get('dist.k.topic_model')] = \ + dist_tx_indicator_tm + token.features.text.values[tx_dict_reversed.get('total.results.search_engine')] = \ + nr_results_txt + + token.features.text.values[tx_dict_reversed.get('total.emb.similar.loc')] = embs[0] + token.features.text.values[tx_dict_reversed.get('total.emb.similar.org')] = embs[1] + token.features.text.values[tx_dict_reversed.get('total.emb.similar.per')] = embs[2] + token.features.text.values[tx_dict_reversed.get('total.emb.similar.other')] = embs[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.loc')] = \ + topic_klass_top_sums[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.org')] = \ + topic_klass_top_sums[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.per')] = \ + topic_klass_top_sums[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.sum.other')] = \ + topic_klass_top_sums[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.loc')] = \ + topic_klass_top_avg[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.org')] = \ + topic_klass_top_avg[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.per')] = \ + topic_klass_top_avg[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.avg.other')] = \ + topic_klass_top_avg[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.loc')] = \ + topic_klass_top_max[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.org')] = \ + topic_klass_top_max[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.per')] = \ + topic_klass_top_max[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.max.other')] = \ + topic_klass_top_max[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.loc')] = \ + topic_klass_top_min[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.org')] = \ + topic_klass_top_min[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.per')] = \ + topic_klass_top_min[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.top.k.min.other')] = \ + topic_klass_top_min[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.sum.loc')] = topic_sums[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.sum.org')] = topic_sums[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.sum.per')] = topic_sums[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.sum.other')] = topic_sums[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.avg.loc')] = topic_avg[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.avg.org')] = topic_avg[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.avg.per')] = topic_avg[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.avg.other')] = topic_avg[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.max.loc')] = topic_max[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.max.org')] = topic_max[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.max.per')] = topic_max[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.max.other')] = topic_max[3] + + token.features.text.values[tx_dict_reversed.get('stats.topic.min.loc')] = topic_min[0] + token.features.text.values[tx_dict_reversed.get('stats.topic.min.org')] = topic_min[1] + token.features.text.values[tx_dict_reversed.get('stats.topic.min.per')] = topic_min[2] + token.features.text.values[tx_dict_reversed.get('stats.topic.min.other')] = topic_min[3] + if limit_txt != 0: self.horus_matrix[index][definitions.INDEX_MAX_KLASS_PREDICT_TX] = \ diff --git a/src/horus_meta.py b/src/horus_meta.py index 69ac6c7..e09e0c4 100644 --- a/src/horus_meta.py +++ b/src/horus_meta.py @@ -28,57 +28,68 @@ def get_visual() -> dict: def get_textual() -> dict: features = { - 0: 'total.global.results.search_engine', + 0: 'total.results.search_engine', 1: 'total.retrieved.results.search_engine', 2: 'total.error.translation', 3: 'total.binary.k.loc', 4: 'total.binary.k.org', 5: 'total.binary.k.per', - 6: 'total.binary.k.other', - 7: 'top.binary.k', - 8: 'dist.k', - 9: 'total.topic.k.loc', - 10: 'total.topic.k.org', - 11: 'total.topic.k.per', - 12: 'total.topic.k.other', - 13: 'top.topic.k', - 14: 'dist.k.topic_model', - 15: 'total.emb.similar.loc', - 16: 'total.emb.similar.org', - 17: 'total.emb.similar.per', - 18: 'total.emb.similar.other', - 19: 'stats.topic.top.k.sum.loc', - 20: 'stats.topic.top.k.sum.org', - 21: 'stats.topic.top.k.sum.per', - 22: 'stats.topic.top.k.sum.other', - 23: 'stats.topic.top.k.avg.loc', - 24: 'stats.topic.top.k.avg.org', - 25: 'stats.topic.top.k.avg.per', - 26: 'stats.topic.top.k.avg.other', - 27: 'stats.topic.top.k.max.loc', - 28: 'stats.topic.top.k.max.org', - 29: 'stats.topic.top.k.max.per', - 30: 'stats.topic.top.k.max.other', - 31: 'stats.topic.top.k.min.loc', - 32: 'stats.topic.top.k.min.org', - 33: 'stats.topic.top.k.min.per', - 34: 'stats.topic.top.k.min.other', - 35: 'stats.topic.sum.loc', - 36: 'stats.topic.sum.org', - 37: 'stats.topic.sum.per', - 38: 'stats.topic.sum.other', - 39: 'stats.topic.avg.loc', - 40: 'stats.topic.avg.org', - 41: 'stats.topic.avg.per', - 42: 'stats.topic.avg.other', - 43: 'stats.topic.max.loc', - 44: 'stats.topic.max.org', - 45: 'stats.topic.max.per', - 46: 'stats.topic.max.other', - 47: 'stats.topic.min.loc', - 48: 'stats.topic.min.org', - 49: 'stats.topic.min.per', - 50: 'stats.topic.min.other', + 6: 'total.binary.k.misc', + 7: 'total.binary.k.other', + 8: 'top.binary.k', + 9: 'dist.k', + 10: 'total.topic.k.loc', + 11: 'total.topic.k.org', + 12: 'total.topic.k.per', + 13: 'total.topic.k.misc', + 14: 'total.topic.k.other', + 15: 'top.topic.k', + 16: 'dist.k.topic_model', + 17: 'total.emb.similar.loc', + 18: 'total.emb.similar.org', + 19: 'total.emb.similar.per', + 20: 'total.emb.similar.misc', + 21: 'total.emb.similar.other', + 22: 'stats.topic.top.k.sum.loc', + 23: 'stats.topic.top.k.sum.org', + 24: 'stats.topic.top.k.sum.per', + 25: 'stats.topic.top.k.sum.misc', + 26: 'stats.topic.top.k.sum.other', + 27: 'stats.topic.top.k.avg.loc', + 28: 'stats.topic.top.k.avg.org', + 29: 'stats.topic.top.k.avg.per', + 30: 'stats.topic.top.k.avg.misc', + 31: 'stats.topic.top.k.avg.other', + 32: 'stats.topic.top.k.max.loc', + 33: 'stats.topic.top.k.max.org', + 34: 'stats.topic.top.k.max.per', + 35: 'stats.topic.top.k.max.misc', + 36: 'stats.topic.top.k.max.other', + 37: 'stats.topic.top.k.min.loc', + 38: 'stats.topic.top.k.min.org', + 39: 'stats.topic.top.k.min.per', + 40: 'stats.topic.top.k.min.misc', + 41: 'stats.topic.top.k.min.other', + 42: 'stats.topic.sum.loc', + 43: 'stats.topic.sum.org', + 44: 'stats.topic.sum.per', + 45: 'stats.topic.sum.misc', + 46: 'stats.topic.sum.other', + 47: 'stats.topic.avg.loc', + 48: 'stats.topic.avg.org', + 49: 'stats.topic.avg.per', + 50: 'stats.topic.avg.misc', + 51: 'stats.topic.avg.other', + 52: 'stats.topic.max.loc', + 53: 'stats.topic.max.org', + 54: 'stats.topic.max.per', + 55: 'stats.topic.max.misc', + 56: 'stats.topic.max.other', + 57: 'stats.topic.min.loc', + 58: 'stats.topic.min.org', + 59: 'stats.topic.min.per', + 60: 'stats.topic.min.misc', + 61: 'stats.topic.min.other', } reversed_features = dict([(value, key) for key, value in features.items()])