Skip to content

Commit

Permalink
#51
Browse files (browse the repository at this point in the history)
- text features dict
- feat extraction
  • Loading branch information
diegoesteves committed Apr 30, 2020
1 parent c5fd413 commit fd1e618
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 12 deletions.
57 changes: 51 additions & 6 deletions scripts/05_feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,11 @@ def _shape(word):
return word_shape


def _extract_lexical(horus: Horus) -> Horus:
def _extract_lexical(horus: Horus) -> bool:

try:
lx_dict = WordFeaturesInterface.get_lexical()
lx_dict, lx_dict_reversed = WordFeaturesInterface.get_lexical()
tot_slide_brown_cluster = 5
lx_dict_reversed = dict([(value, key) for key, value in lx_dict.items()])

for sentence in horus.sentences:
for token in sentence.tokens:
brown_1000_path = '{:<016}'.format(dict_brown_c1000.get(token.text, '0000000000000000'))
Expand Down Expand Up @@ -120,7 +119,54 @@ def _extract_text(horus: Horus) -> bool:
:param horus:
:return:
'''
return True
try:
tx_dict, tx_dict_reversed = WordFeaturesInterface.get_textual()
for sentence in horus.sentences:
for token in sentence.tokens:
brown_1000_path = '{:<016}'.format(dict_brown_c1000.get(token.text, '0000000000000000'))
brown_640_path = '{:<016}'.format(dict_brown_c640.get(token.text, '0000000000000000'))
brown_320_path = '{:<016}'.format(dict_brown_c320.get(token.text, '0000000000000000'))

for i in range(0, tot_slide_brown_cluster - 1):
token.features.lexical.values[lx_dict_reversed.get('brown_1000.' + str(i + 1))] = brown_1000_path[
:i + 1]
token.features.lexical.values[lx_dict_reversed.get('brown_640.' + str(i + 1))] = brown_640_path[
:i + 1]
token.features.lexical.values[lx_dict_reversed.get('brown_320.' + str(i + 1))] = brown_320_path[
:i + 1]

token.features.lexical.values[lx_dict_reversed.get('word.lower')] = token.text.lower()

lemma = ''
try:
lemma = lemmatize(token.text.lower())
except:
pass

stem = ''
try:
stem = stemo(token.text.lower())
except:
pass

token.features.lexical.values[lx_dict_reversed.get('word.lemma')] = lemma
token.features.lexical.values[lx_dict_reversed.get('word.stem')] = stem
token.features.lexical.values[lx_dict_reversed.get('word.len.1')] = int(len(token.text) == 1)
token.features.lexical.values[lx_dict_reversed.get('word.has.special')] = int(
len(re.findall('(http://\S+|\S*[^\w\s]\S*)', token.text)) > 0)
token.features.lexical.values[lx_dict_reversed.get('word[0].isupper')] = int(token.text[0].isupper())
token.features.lexical.values[lx_dict_reversed.get('word.isupper')] = int(token.text.isupper())
token.features.lexical.values[lx_dict_reversed.get('word.istitle')] = int(token.text.istitle())
token.features.lexical.values[lx_dict_reversed.get('word.isdigit')] = int(token.text.isdigit())
token.features.lexical.values[lx_dict_reversed.get('word.len.issmall')] = int(len(token.text) <= 2)
token.features.lexical.values[lx_dict_reversed.get('word.has.minus')] = int('-' in token.text)
token.features.lexical.values[lx_dict_reversed.get('word.stop')] = int(token.text in stop)
token.features.lexical.values[lx_dict_reversed.get('word.shape')] = _shape(token.text)

return True

except Exception as e:
raise e


def _extract_visual(horus: Horus) -> bool:
Expand All @@ -138,7 +184,6 @@ def extract_features(horus: Horus, lexical: bool = False, text: bool = False, im
if lexical:
_extract_lexical(horus)

# TODO: implement
if text:
_extract_text(horus)

Expand Down
23 changes: 17 additions & 6 deletions src/horus_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def get_visual() -> dict:

@staticmethod
def get_textual() -> dict:
return {

features = {
0: 'total.global.results.search_engine',
1: 'total.retrieved.results.search_engine',
2: 'total.error.translation',
Expand Down Expand Up @@ -75,12 +76,15 @@ def get_textual() -> dict:
48: 'stats.topic.min.org',
49: 'stats.topic.min.per',
50: 'stats.topic.min.other',

}

reversed_features = dict([(value, key) for key, value in features.items()])

return features, reversed_features

@staticmethod
def get_lexical() -> dict:
return {
features = {
0: 'word.lower',
1: 'word.lemma',
2: 'word.stem',
Expand Down Expand Up @@ -111,6 +115,10 @@ def get_lexical() -> dict:
27: 'brown_1000.5'
}

reversed_features = dict([(value, key) for key, value in features.items()])

return features, reversed_features


class HorusWordFeatures(object):
def __init__(self,
Expand Down Expand Up @@ -146,23 +154,26 @@ def __init__(self,
image: HorusWordFeatures = None):

if lexical is None:
d, dv = WordFeaturesInterface.get_lexical()
self.lexical = HorusWordFeatures(alias='Lexical',
acronym='LX',
dictionary_size=len(WordFeaturesInterface.get_lexical()))
dictionary_size=len(d))
else:
self.lexical = lexical

if text is None:
d, dv = WordFeaturesInterface.get_textual()
self.text = HorusWordFeatures(alias='Text',
acronym='TX',
dictionary_size=len(WordFeaturesInterface.get_textual()))
dictionary_size=len(d))
else:
self.text = text

if image is None:
d, dv = WordFeaturesInterface.get_visual()
self.image = HorusWordFeatures(alias='Image',
acronym='CV',
dictionary_size=len(WordFeaturesInterface.get_visual()))
dictionary_size=len(d))
else:
self.image = image

Expand Down

0 comments on commit fd1e618

Please sign in to comment.