forked from pedrobalage/STIL_LIWC_Evaluation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SentiLex.py
106 lines (85 loc) · 3.01 KB
/
SentiLex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
#### Class to provide data and methods to read SentiLex dictionary
####
#### Author: Pedro Paulo Balage Filho
#### Version: 1.0
#### Date: 05/12/12
import codecs
import re
class SentiLexReader(dict):
    """
    Reader for the SentiLex sentiment dictionary.

    The object is itself a dict mapping each inflected word/phrase to a list
    of ``(pos, polarities)`` tuples, one per dictionary line that mentions the
    phrase.  ``pos`` is the part-of-speech tag and ``polarities`` is a list of
    ``(srl, value)`` pairs such as ``[('N0', -1)]``.

    Dictionary format:
    à-vontade,à-vontade.PoS=N;FLEX=ms;TG=HUM:N0;POL:N0=1;ANOT=MAN
    abafada,abafado.PoS=Adj;FLEX=fs;TG=HUM:N0;POL:N0=-1;ANOT=JALC
    abafadas,abafado.PoS=Adj;FLEX=fp;TG=HUM:N0;POL:N0=-1;ANOT=JALC
    abafado,abafado.PoS=Adj;FLEX=ms;TG=HUM:N0;POL:N0=-1;ANOT=JALC
    abafados,abafado.PoS=Adj;FLEX=mp;TG=HUM:N0;POL:N0=-1;ANOT=JALC
    """

    # One dictionary line: phrase, lemma, PoS, flexion, target, polarity
    # fields and annotator, in that order.
    _ENTRY_RE = re.compile(
        r'([^,]*),([^.]*)\.PoS=([^;]*);([^;]*);TG=([^;]*);(POL:.*);ANOT=(.*)',
        re.I)
    # One polarity field inside the POL section, e.g. "POL:N0=-1".
    _POLARITY_RE = re.compile(r'POL:(N[0-9])=(-?[0-9])')

    # Constructor
    # dict_file: the path to dictionary file
    def __init__(self, dict_file='Dictionaries/SentiLex/SentiLex-flex-PT02.txt'):
        """
        Load the dictionary found at ``dict_file`` (read as UTF-8).

        Lines that do not follow the dictionary format are printed to
        standard output and otherwise ignored.
        """
        dict.__init__(self)
        # The context manager closes the file even if parsing raises.
        with codecs.open(dict_file, 'r', 'utf-8') as handle:
            for line in handle:
                m = self._ENTRY_RE.match(line)
                if not m:
                    # Report the malformed line and keep loading.
                    print(line)
                    continue
                # Only the word/phrase, the PoS and the polarities are kept.
                phrase = m.group(1)
                pos = m.group(3)
                polarities = [
                    (srl, int(value))
                    for srl, value in self._POLARITY_RE.findall(m.group(6))
                ]
                # A phrase may appear on several lines (e.g. different PoS).
                self.setdefault(phrase, []).append((pos, polarities))

    def match_words(self, sentence):
        """
        Return all dictionary matches for a sentence given as a list of words.

        Matching is greedy: at every position the longest run of words
        (joined with single spaces) present in the dictionary wins, and the
        scan resumes right after it.  Each match is a
        ``(phrase, pos, polarities)`` tuple built from the first dictionary
        entry of the phrase, the same choice ``polarity`` makes.
        """
        matches = []
        length = len(sentence)
        start = 0
        while start < length:
            # Try the widest window first, shrinking it from the right.
            for end in range(length, start, -1):
                phrase = ' '.join(sentence[start:end])
                if phrase in self:
                    # Values are lists of (pos, polarities); take the first.
                    pos, polarities = self[phrase][0]
                    matches.append((phrase, pos, polarities))
                    start = end
                    break
            else:
                # Nothing starts at this word; move on to the next one.
                start += 1
        return matches

    def print_statistics(self):
        """Placeholder; no statistics are computed. Always returns None."""
        return None

    def vocabulary(self):
        """Return the set of all words/phrases in the dictionary."""
        return set(self.keys())

    def vocabulary_polar(self):
        """Return the set of words/phrases with a known, non-zero polarity."""
        vocabulary = set()
        for key in self:
            # None means the entry carries no parsable polarity value.
            if self.polarity(key) not in (0, None):
                vocabulary.add(key)
        return vocabulary

    def polarity(self, word):
        """
        Return the polarity value of ``word``.

        Returns None when the word is not in the dictionary or its first
        entry has no polarity value.
        """
        entries = self.get(word)
        if not entries:
            return None
        # how to select the polarity most representative among different
        # PoS and SRL?
        # The first PoS occurrence and the first SRL are always taken.
        polarities = entries[0][1]
        if not polarities:
            return None
        return polarities[0][1]

    def get_name(self):
        """Return the name of this lexicon."""
        return 'SentiLex'