Skip to content

Commit

Permalink
#51
Browse files Browse the repository at this point in the history
- fix text predictions
- fix label encoder_4MUC_cat2id_id2cat.joblib
- fix klass mappings
- update notebook text
  • Loading branch information
diegoesteves committed Jun 30, 2020
1 parent 7d58117 commit 9845c6b
Show file tree
Hide file tree
Showing 10 changed files with 540 additions and 281 deletions.
523 changes: 396 additions & 127 deletions notebooks/horus_v1/02-horus-training-news-classifiers-wiki.ipynb

Large diffs are not rendered by default.

Binary file modified notebooks/horus_v1/encoder_4MUC_cat2id_id2cat.joblib
Binary file not shown.
Binary file modified notebooks/horus_v1/feature_extract_tfidf_ngram2_5000.joblib
Binary file not shown.
Binary file modified notebooks/horus_v1/multi_cls-calib_linear.joblib
Binary file not shown.
Binary file modified notebooks/horus_v1/ovr_cls-linearSVC.joblib
Binary file not shown.
8 changes: 4 additions & 4 deletions scripts/05_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,31 @@
config.logger.info(f'finish ok?: {out}')
horus.update_status(PRE_PROCESSING_STATUS["FEATURE_LEXICAL"])
else:
config.logger.info('feature extraction (lexical): either not active or already cached')
config.logger.info('feature extraction (lexical): either not active or already processed')

if EXTRACT_IMAGE and (str(PRE_PROCESSING_STATUS["FEATURE_IMAGE"]) not in str(horus.processing_status)):
config.logger.info('feature extraction (image)')
out = fe_image.extract_features(horus)
config.logger.info(f'finish ok?: {out}')
horus.update_status(PRE_PROCESSING_STATUS["FEATURE_IMAGE"])
else:
config.logger.info('feature extraction (image): either not active or already cached')
config.logger.info('feature extraction (image): either not active or already processed')

if EXTRACT_TEXT and (str(PRE_PROCESSING_STATUS["FEATURE_TEXT"]) not in str(horus.processing_status)):
config.logger.info('feature extraction (text)')
out = fe_text.extract_features(horus)
config.logger.info(f'finish ok?: {out}')
horus.update_status(PRE_PROCESSING_STATUS["FEATURE_TEXT"])
else:
config.logger.info('feature extraction (text): either not active or already cached')
config.logger.info('feature extraction (text): either not active or already processed')

config.logger.info('done! saving files')
horus_file_stage3_simple_json = conll_file.replace('.horusx', '.horus3.simple.json')
horus_file_stage3 = conll_file.replace('.horusx', '.horus3.json')

# TODO: for now I am saving in a different json file just to compare and check things are fine.
# later just update the status of the horus file (definitions.PRE_PROCESSING_STATUS)
#HorusDataLoader.save_metadata_to_file(horus=horus, file=horus_file_stage3_simple_json, simple_json=True)
HorusDataLoader.save_metadata_to_file(horus=horus, file=horus_file_stage3_simple_json, simple_json=True)
HorusDataLoader.save_metadata_to_file(horus=horus, file=horus_file_stage3, simple_json=False)

config.logger.info('hooray!')
Expand Down
22 changes: 11 additions & 11 deletions src/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,27 +127,27 @@
NER_TAGS.extend(NER_TAGS_LOC)
NER_TAGS.extend(NER_TAGS_MISC)

# Legacy label encoding (index -> label and its inverse):
# 1=LOC, 2=ORG, 3=PER, 4=MISC, 5=O.
PLOMNone_index2label = {1: "LOC", 2: "ORG", 3: "PER", 4: "MISC", 5: "O"} #KLASSES
PLOMNone_label2index = {"LOC": 1, "ORG": 2, "PER": 3, "MISC": 4, "O": 5} #KLASSES2
# PLOM_index2label is a copy of the legacy table with "O" (5) and "MISC" (4)
# removed, i.e. it keeps only 1=LOC, 2=ORG, 3=PER.
PLOM_index2label = PLOMNone_index2label.copy()
del PLOM_index2label[5]
# not testing MISC for now
del PLOM_index2label[4]
# 4MUC label encoding (index -> category and its inverse):
# 0=O, 1=PER, 2=ORG, 3=LOC, 4=MISC.
# NOTE(review): the integer ids differ from the legacy PLOMNone tables
# (e.g. 1 is "LOC" there but "PER" here, 3 is "PER" there but "LOC" here, and
# "O" moves from 5 to 0), so any code that indexes these tables with literal
# integers must be re-checked when switching from one encoding to the other.
encoder_4MUC_NER_idx2category = {0: "O", 1: "PER", 2: "ORG", 3: "LOC", 4: "MISC"} #KLASSES
encoder_4MUC_NER_category2idx = {"O": 0, "PER": 1, "ORG": 2, "LOC": 3, "MISC": 4} #KLASSES2
#PLOM_index2label = encoder_4MUC_NER_idx2category.copy()
#del PLOM_index2label[5]
## not testing MISC for now
#del PLOM_index2label[4]

# Tab-separated report header (12 columns) and the matching %-format template
# for one result row (12 fields; precision, recall and f1 use 5 decimals).
header = 'cross-validation\tconfig\trun\tlabel\tprecision\trecall\tf1\tsupport\talgo\tdataset1\tdataset2\ttask\n'
line = '%s\t%s\t%s\t%s\t%.5f\t%.5f\t%.5f\t%s\t%s\t%s\t%s\t%s\n'

def tags_to_3muc_simple(tags):
    """Convert raw NER tags to integer class ids, in place.

    Every element of ``tags`` is replaced by the id that
    ``encoder_4MUC_NER_category2idx`` assigns to its category:

    * ``'PER'`` when the tag is in ``NER_TAGS_PER``
    * ``'ORG'`` when the tag is in ``NER_TAGS_ORG``
    * ``'LOC'`` when the tag is in ``NER_TAGS_LOC``
    * ``'O'`` for anything else (there is no separate MISC branch here, so
      any tag outside the three groups above becomes ``'O'``)

    Args:
        tags: mutable sequence of raw NER tags; it is modified in place.

    Returns:
        The same ``tags`` object, now holding the integer ids.
    """
    for i, tag in enumerate(tags):
        if tag in NER_TAGS_PER:
            category = 'PER'
        elif tag in NER_TAGS_ORG:
            category = 'ORG'
        elif tag in NER_TAGS_LOC:
            category = 'LOC'
        else:
            category = 'O'
        # Single store per element, using only the 4MUC encoder; the legacy
        # PLOMNone_label2index value was always overwritten and is not needed.
        tags[i] = encoder_4MUC_NER_category2idx[category]
    return tags


Expand Down
206 changes: 98 additions & 108 deletions src/features/horus_feature_extraction.py

Large diffs are not rendered by default.

26 changes: 13 additions & 13 deletions src/horus_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,69 +34,69 @@ def get_textual() -> dict:
3: 'total.ovr.k.loc',
4: 'total.ovr.k.org',
5: 'total.ovr.k.per',
6: 'total.ovr.k.other',
6: 'total.ovr.k.misc',
7: 'avg.probs1.k.loc',
8: 'avg.probs1.k.org',
9: 'avg.probs1.k.per',
10: 'avg.probs1.k.other',
10: 'avg.probs1.k.misc',
11: 'avg.probs2.k.loc',
12: 'avg.probs2.k.org',
13: 'avg.probs2.k.per',
14: 'avg.probs2.k.other',
14: 'avg.probs2.k.misc',
15: 'top.binary.k',
16: 'dist.k',
17: 'total.topic.k.loc',
18: 'total.topic.k.org',
19: 'total.topic.k.per',
20: 'total.topic.k.misc',
21: 'total.topic.k.other',
21: 'total.topic.k.misc',
22: 'top.topic.k',
23: 'dist.k.topic_model',
24: 'total.emb.similar.loc',
25: 'total.emb.similar.org',
26: 'total.emb.similar.per',
27: 'total.emb.similar.misc',
28: 'total.emb.similar.other',
28: 'total.emb.similar.misc',
29: 'stats.topic.top.k.sum.loc',
30: 'stats.topic.top.k.sum.org',
31: 'stats.topic.top.k.sum.per',
32: 'stats.topic.top.k.sum.misc',
33: 'stats.topic.top.k.sum.other',
33: 'stats.topic.top.k.sum.misc',
34: 'stats.topic.top.k.avg.loc',
35: 'stats.topic.top.k.avg.org',
36: 'stats.topic.top.k.avg.per',
37: 'stats.topic.top.k.avg.misc',
38: 'stats.topic.top.k.avg.other',
38: 'stats.topic.top.k.avg.misc',
39: 'stats.topic.top.k.max.loc',
40: 'stats.topic.top.k.max.org',
41: 'stats.topic.top.k.max.per',
42: 'stats.topic.top.k.max.misc',
43: 'stats.topic.top.k.max.other',
43: 'stats.topic.top.k.max.misc',
44: 'stats.topic.top.k.min.loc',
45: 'stats.topic.top.k.min.org',
46: 'stats.topic.top.k.min.per',
47: 'stats.topic.top.k.min.misc',
48: 'stats.topic.top.k.min.other',
48: 'stats.topic.top.k.min.misc',
49: 'stats.topic.sum.loc',
50: 'stats.topic.sum.org',
51: 'stats.topic.sum.per',
52: 'stats.topic.sum.misc',
53: 'stats.topic.sum.other',
53: 'stats.topic.sum.misc',
54: 'stats.topic.avg.loc',
55: 'stats.topic.avg.org',
56: 'stats.topic.avg.per',
57: 'stats.topic.avg.misc',
58: 'stats.topic.avg.other',
58: 'stats.topic.avg.misc',
59: 'stats.topic.max.loc',
60: 'stats.topic.max.org',
61: 'stats.topic.max.per',
62: 'stats.topic.max.misc',
63: 'stats.topic.max.other',
63: 'stats.topic.max.misc',
64: 'stats.topic.min.loc',
65: 'stats.topic.min.org',
66: 'stats.topic.min.per',
67: 'stats.topic.min.misc',
68: 'stats.topic.min.other'
68: 'stats.topic.min.misc'
}

reversed_features = dict([(value, key) for key, value in features.items()])
Expand Down
36 changes: 18 additions & 18 deletions src/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,13 +591,13 @@ def populate_matrix_new_columns(self):
temp.extend([0] * (int(definitions.HORUS_TOT_FEATURES)-8))
# do NOT append the last column here (y)

temp[18] = definitions.PLOMNone_index2label[4]
temp[26] = definitions.PLOMNone_index2label[4]
temp[26] = definitions.PLOMNone_index2label[4]
temp[38] = definitions.PLOMNone_index2label[4]
temp[39] = definitions.PLOMNone_index2label[4]
temp[40] = definitions.PLOMNone_index2label[4]
temp[41] = definitions.PLOMNone_index2label[4]
temp[18] = definitions.encoder_4MUC_NER_idx2category[4]
temp[26] = definitions.encoder_4MUC_NER_idx2category[4]
temp[26] = definitions.encoder_4MUC_NER_idx2category[4]
temp[38] = definitions.encoder_4MUC_NER_idx2category[4]
temp[39] = definitions.encoder_4MUC_NER_idx2category[4]
temp[40] = definitions.encoder_4MUC_NER_idx2category[4]
temp[41] = definitions.encoder_4MUC_NER_idx2category[4]

return temp

Expand All @@ -618,9 +618,9 @@ def sentence_to_horus_matrix(self, sentences):
word_index_ref = sent[6][self.config.models_pos_tag_lib][c][0]
compound = sent[6][self.config.models_pos_tag_lib][c][1]
compound_size = sent[6][self.config.models_pos_tag_lib][c][2]
temp = [0, sent_index, word_index_ref, compound, '', '', definitions.PLOMNone_index2label[4], 1, compound_size]
temp = [0, sent_index, word_index_ref, compound, '', '', definitions.encoder_4MUC_NER_idx2category[4], 1, compound_size]
temp.extend(self.populate_matrix_new_columns())
temp[definitions.INDEX_TARGET_NER] = definitions.PLOMNone_index2label[4]
temp[definitions.INDEX_TARGET_NER] = definitions.encoder_4MUC_NER_idx2category[4]
converted.append(temp)
word_index = 0
starty = 0
Expand All @@ -646,24 +646,24 @@ def sentence_to_horus_matrix(self, sentences):
if len(sent[3][0]) > 0:
tag_ner_y = sent[3][0][ind_ner_real]
if tag_ner_y in definitions.NER_TAGS_LOC:
tag_ner_y = definitions.PLOMNone_index2label[1]
tag_ner_y = definitions.encoder_4MUC_NER_idx2category[1]
elif tag_ner_y in definitions.NER_TAGS_ORG:
tag_ner_y = definitions.PLOMNone_index2label[2]
tag_ner_y = definitions.encoder_4MUC_NER_idx2category[2]
elif tag_ner_y in definitions.NER_TAGS_PER:
tag_ner_y = definitions.PLOMNone_index2label[3]
tag_ner_y = definitions.encoder_4MUC_NER_idx2category[3]
else:
tag_ner_y = definitions.PLOMNone_index2label[4]
tag_ner_y = definitions.encoder_4MUC_NER_idx2category[4]
else:
tag_ner_y = definitions.PLOMNone_index2label[4]
tag_ner_y = definitions.encoder_4MUC_NER_idx2category[4]

if tag_ner in definitions.NER_TAGS_LOC:
tag_ner = definitions.PLOMNone_index2label[1]
tag_ner = definitions.encoder_4MUC_NER_idx2category[1]
elif tag_ner in definitions.NER_TAGS_ORG:
tag_ner = definitions.PLOMNone_index2label[2]
tag_ner = definitions.encoder_4MUC_NER_idx2category[2]
elif tag_ner in definitions.NER_TAGS_PER:
tag_ner = definitions.PLOMNone_index2label[3]
tag_ner = definitions.encoder_4MUC_NER_idx2category[3]
else:
tag_ner = definitions.PLOMNone_index2label[4]
tag_ner = definitions.encoder_4MUC_NER_idx2category[4]

temp = [has_NER, sent_index, word_index, term, tag_pos_uni, tag_pos, tag_ner, 0, 0] # 0-8
temp.extend(self.populate_matrix_new_columns())
Expand Down

0 comments on commit 9845c6b

Please sign in to comment.