forked from augustocsmartins/coh-metrix-dementia
-
Notifications
You must be signed in to change notification settings - Fork 1
/
base.py
311 lines (249 loc) · 10.6 KB
/
base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#-*- coding: utf-8 -*-
# base.py - Basic classes for accessing Coh-Metrix-Port's functionality.
# Copyright (C) 2014 Andre Luiz Verucci da Cunha
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
from itertools import chain
from coh.resources import senter, word_tokenize,\
pos_tagger
import codecs
class Text(object):
    """Represents a text: its content and metadata.

    A text has several (optional) attributes: title, author,
    source, publication date and genre.

    The derived views (sentences, words, all_words, tagged_sentences and
    tagged_words) are computed on first access and then cached on the
    instance in a private attribute of the same name.
    """

    def __init__(self, filepath, encoding='utf-8', title='', author='',
                 source='', publication_date='', genre=''):
        """Form a text.

        Required arguments:
        filepath -- a path to the file containing the text. The text is
            supposed to be formatted as one paragraph per line, with
            multiple sentences per paragraph. Blank lines are ignored.

        Keyword arguments:
        encoding -- The encoding of the input file (default "utf-8")
        title -- The title of the text (default "").
        author -- The author of the text (default "").
        source -- Where the text came from, usually a URL (default "").
        publication_date -- When the text was released (default "").
        genre -- The textual genre that better fits the text (default "").
        """
        self.title = title
        self.author = author
        self.source = source
        self.publication_date = publication_date
        self.genre = genre

        with codecs.open(filepath, mode='r', encoding=encoding) as input_file:
            content = input_file.readlines()

        # Strip each line once and keep only the lines that still have
        # content, i.e. drop blank and whitespace-only lines.
        stripped_lines = (line.strip() for line in content)
        self.paragraphs = [line for line in stripped_lines if line]

    def __str__(self):
        # A file with no non-blank lines yields an empty paragraph list;
        # fall back to an empty preview instead of raising IndexError.
        preview = self.paragraphs[0][:70] if self.paragraphs else ''
        return '<Text: "%s...">' % preview

    @property
    def sentences(self):
        """Return a list of strings, each one being a sentence of the text.

        The list is built by applying senter.tokenize to every paragraph
        and concatenating the results in paragraph order.
        """
        if not hasattr(self, '_sentences'):
            per_paragraph = map(senter.tokenize, self.paragraphs)
            self._sentences = list(chain.from_iterable(per_paragraph))
        return self._sentences

    @property
    def words(self):
        """Return a list of lists of strings, where each list of strings
            corresponds to a sentence, and each string in the list is a word.
        """
        if not hasattr(self, '_words'):
            self._words = list(map(word_tokenize, self.sentences))
        return self._words

    @property
    def all_words(self):
        """Return all words of the text in a single list.
        """
        if not hasattr(self, '_all_words'):
            self._all_words = list(chain.from_iterable(self.words))
        return self._all_words

    @property
    def tagged_sentences(self):
        """Return a list of lists of pairs (string, string), representing
            the sentences with tagged words.
        """
        if not hasattr(self, '_tagged_sentences'):
            # NOTE(review): 'batch_tag' is presumably the NLTK 2.x tagger
            # API (renamed 'tag_sents' in NLTK 3) -- confirm against the
            # tagger object that coh.resources actually provides.
            self._tagged_sentences = pos_tagger.batch_tag(self.words)
        return self._tagged_sentences

    @property
    def tagged_words(self):
        """Return a list of pair (string, string), representing the tokens
            not separated in sentences.
        """
        if not hasattr(self, '_tagged_words'):
            self._tagged_words = list(
                chain.from_iterable(self.tagged_sentences))
        return self._tagged_words
class Category(object):
    """Represents a set of taxonomically related metrics.
    """

    def __init__(self, name="", table_name="", desc=None):
        """Form a category.

        Keyword arguments:
        name -- A succinct name of the category (e.g., 'Basic Counts'). If
            no name is provided, the class name is used. (default "")
        table_name -- The name of the table in coh_user_data that contains
            the values of this category on the user's texts. If no value is
            specified, 'name' is used as the table name; whether it is a
            valid table name is not checked yet. (default "")
        desc -- A longer description of the category. Used for UI purposes.
            If no value is passed, the docstring of the class is used.
            (default None)
        """
        if name == '':
            name = self.__class__.__name__
        self.name = name

        if table_name == '':
            # TODO: check if 'name' is a valid table name.
            table_name = name
        self.table_name = table_name

        if desc is None:
            desc = self.__doc__
        self.desc = desc

    def _set_metrics_from_module(self, module):
        """Set self.metrics as the list of Metric subclasses found in
            a module.

        Every class among the module's members that passes
        issubclass(obj, Metric) is instantiated with no arguments.

        Required arguments:
        module -- the name of module that will be scanned for metrics. It
            must already be present in sys.modules.
        """
        import sys
        import inspect

        # NOTE(review): issubclass(Metric, Metric) is True, so a module
        # that imports the Metric base class gets it instantiated here as
        # well -- confirm whether that is intended.
        self.metrics = [obj() for _, obj
                        in inspect.getmembers(sys.modules[module])
                        if inspect.isclass(obj) and issubclass(obj, Metric)]

    def values_for_text(self, text):
        """Calculate the value of each metric in a text and return it in a
            ResultSet.

        Required arguments:
        text -- the text whose metrics will be extracted.

        Returns: a ResultSet mapping each metric object in self.metrics to
            the value returned by its value_for_text method.
        """
        return ResultSet([(m, m.value_for_text(text))
                          for m in self.metrics])

    def __str__(self):
        return '<Category: %s: %s>' % \
            (self.name, str([m.name for m in self.metrics]))

    def __getattr__(self, attr):
        # A metric's column name can be used as an attribute to access its
        # object in self.metrics.
        #
        # The list is read from the instance dict on purpose: __getattr__
        # only runs when normal lookup fails, so evaluating self.metrics
        # here while 'metrics' is still unset would call __getattr__ again
        # and recurse until RecursionError (breaking hasattr, copy, ...).
        for m in self.__dict__.get('metrics', ()):
            if m.column_name == attr:
                return m
        raise AttributeError('%s: no such metric.' % attr)

    def __getitem__(self, key):
        # A metric's column name and its name can be used as an index
        #   to access its object in self.metrics.
        for m in self.metrics:
            if m.column_name == key or m.name == key:
                return m
        raise KeyError('%s: no such metric.' % key)
class Metric(object):
    """A metric is a textual characteristic.
    """

    def __init__(self, name="", column_name="", desc=""):
        """Form a metric.

        Keyword arguments:
        name -- A succinct name of the metric (e.g., 'Flesch index'). When
            it is the empty string, the class name is used. (default "")
        column_name -- The name of the column that stores this metric in
            the table of its category in coh_user_data. When it is the
            empty string, the (resolved) name is used; its validity as a
            column name is not checked. (default "")
        desc -- A longer description of the metric. Used for UI purposes.
            (default "")
        """
        # Resolve the display name first; the column name may fall back
        # to it.
        self.name = self.__class__.__name__ if name == '' else name

        # TODO: check if 'name' is a valid table name.
        self.column_name = self.name if column_name == '' else column_name

        self.desc = desc

    def value_for_text(self, text):
        """Calculate the value of the metric in the text.

        This base implementation is a placeholder: it ignores the text
        and returns a random integer.

        Required arguments:
        text -- The text to be analyzed.

        Returns: an integer in the range [1, 100).
        """
        import random

        #TODO: replace by an exception raising.
        placeholder = random.randrange(1, 100)
        return placeholder

    def __str__(self):
        return '<Metric: %s> ' % self.name
class MetricsSet(object):
    """A group of categories whose metrics are evaluated together.
    """

    def __init__(self, categories):
        """Form a metrics set.

        Required arguments:
        categories -- the category objects of this set; stored as given.
        """
        self.categories = categories

    def _set_categories_from_module(self, module):
        """Replace self.categories with one instance of each Category
            subclass found among the members of a module.

        Required arguments:
        module -- the name of module that will be scanned for categories.
            It is looked up in sys.modules.
        """
        import inspect
        import sys

        scanned = sys.modules[module]
        instances = []
        for _, member in inspect.getmembers(scanned):
            if not inspect.isclass(member):
                continue
            if issubclass(member, Category):
                # Categories are built with their default arguments.
                instances.append(member())
        self.categories = instances

    def values_for_text(self, t):
        """Evaluate every category on a text.

        Required arguments:
        t -- the text passed to each category's values_for_text.

        Returns: a ResultSet built from (category, category result) pairs,
            in the order of self.categories.
        """
        pairs = []
        for category in self.categories:
            pairs.append((category, category.values_for_text(t)))
        return ResultSet(pairs)
import collections
class ResultSet(object):
    """A dictionary-like structure that represents the values of
        a set of metrics extracted from a text.

    Keys are kept in insertion order. Besides regular key access, a value
    can be reached by integer position (rs[0]) and, for Category and
    Metric keys, by attribute (rs.<table_name> or rs.<column_name>).
    """

    def __init__(self, *args, **kwargs):
        """Form a result set.

        All arguments are forwarded to collections.OrderedDict.
        """
        #TODO: To improve performance, replace OrderedDict by namedtuple.
        self.store = collections.OrderedDict(*args, **kwargs)

    def items(self):
        """Return the (key, value) pairs of the underlying store."""
        return self.store.items()

    def __getitem__(self, key):
        # An integer is treated as a position in insertion order, not as
        # a key; it is translated to the key stored at that position.
        if isinstance(key, int):
            key = list(self.store)[key]
        return self.store[self.__keytransform__(key)]

    def __setitem__(self, key, value):
        self.store[self.__keytransform__(key)] = value

    def __delitem__(self, key):
        del self.store[self.__keytransform__(key)]

    def __iter__(self):
        return iter(self.store)

    def __len__(self):
        return len(self.store)

    def __keytransform__(self, key):
        # Hook applied to every key before it reaches the store; the
        # identity here.
        return key

    def __getattr__(self, attr):
        """Return the value stored under the Category whose table_name, or
            the Metric whose column_name, equals attr.

        Raises: AttributeError if no key matches.
        """
        # Read 'store' from the instance dict: __getattr__ only runs when
        # normal lookup fails, so touching self.store while it is unset
        # (e.g. on an instance being rebuilt by copy/pickle) would recurse
        # until RecursionError.
        store = self.__dict__.get('store')
        if store is not None:
            for key in store:
                if (isinstance(key, Category) and key.table_name == attr) or \
                        (isinstance(key, Metric) and key.column_name == attr):
                    return store[key]
        # Falling through used to return None implicitly, which made
        # hasattr() true for every name; signal the miss explicitly.
        raise AttributeError('%s: no such category or metric.' % attr)

    def __str__(self):
        parts = []
        for key, value in self.store.items():
            if isinstance(key, Category):
                # The nested value is rendered without a trailing newline,
                # so one is added here; otherwise the next category name
                # would be glued to the last metric line of this one.
                parts.append('%s:\n%s\n' % (key.name, value))
            elif isinstance(key, Metric):
                parts.append(' %s: %s\n' % (key.name, value))
        return ''.join(parts).rstrip()