-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
226 lines (165 loc) · 6.93 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import os.path
import configparser
import math
import numpy as np
from ital import *
from datasets import load_dataset, RetrievalDataset, MultitaskRetrievalDataset, RegressionDataset
############
## Config ##
############
# Maps method names (as they appear in config files under the "method" option)
# to the corresponding active learning classes brought in via `from ital import *`.
# This table covers the retrieval/classification setting.
LEARNERS = {
    'ITAL'      : ITAL,
    'EMOC'      : EMOC,
    'MCMI'      : MCMI_min,
    'AdaptAL'   : AdaptAL,
    'SUD'       : SUD,
    'RBMAL'     : RBMAL,
    'TCAL'      : TCAL,
    'USDM'      : USDM,
    'entropy'   : EntropySampling,
    'random'    : RandomRetrieval,
    'border'    : BorderlineSampling,
    'border_div': BorderlineDiversitySampling,
    'topscoring': TopscoringSampling,
    'var'       : VarianceSampling,
    'unc'       : UncertaintySampling
}

# Same mapping for the regression setting; used by `load_config` when the
# configured dataset is a RegressionDataset.
REGRESSION_LEARNERS = {
    'ITAL'      : ITAL_Regression,
    'EMOC'      : EMOC_Regression,
    'entropy'   : EntropySampling_Regression,
    'random'    : RandomRetrieval_Regression,
    'var'       : VarianceSampling_Regression
}
def read_config_file(config_file, section, overrides):
    """ Reads a configuration file.

    Config files follow the format understood by `configparser.ConfigParser`.
    In addition, all values are passed through `ConversionInterpolation`, which
    tries to cast them to int, float, or boolean. The special "import" option
    in `section` may contain a white-space separated list of other config files
    that will be read before this one (the main file is re-read last, so its
    values take precedence).

    # Arguments:
    - config_file: path to the config file.
    - section: name of the section to search for "import" options in and to apply overrides to.
    - overrides: dictionary with options overriding the ones read from the config file in the
                 section given by `section`.

    # Returns:
        a configparser.ConfigParser instance.
    """
    parser = configparser.ConfigParser(interpolation = ConversionInterpolation())
    with open(config_file) as fh:
        parser.read_file(fh)

    # Resolve and read imported config files, if any, relative to the main file.
    import_spec = parser.get(section, 'import', fallback = None)
    if import_spec:
        base_dir = os.path.dirname(config_file) or '.'
        resolved = []
        for raw in import_spec.split():
            path = raw.strip()
            if not os.path.isabs(path):
                path = os.path.join(base_dir, path)
            resolved.append(path)
        # Re-read the main file last so that it wins over imported values.
        parser.read(resolved + [config_file])

    # Explicit overrides take precedence over everything read from disk.
    for key, value in overrides.items():
        parser[section][key] = value
    return parser
def load_config(config_file, section, overrides = None):
    """ Instantiates a dataset and a learner from a given config file.

    See `read_config_file` for information about the format of the config file
    and the arguments of this function.

    # Arguments:
    - config_file: path to the config file.
    - section: name of the section to read the "dataset" and "method" options from.
    - overrides: optional dictionary with options overriding the ones read from the
                 config file in the section given by `section`.

    # Returns:
        a (parser, dataset, learner) tuple whose individual components are:
        - parser: a configparser.ConfigParser instance,
        - dataset: a dataset.Dataset instance,
        - learner: either an ital.retrieval_base.ActiveRetrievalBase instance or an
                   ital.regression_base.ActiveRegressionBase instance.
                   The learner is usually initialized with the data from the dataset,
                   except in the case of a MultitaskRetrievalDataset, where an
                   uninitialized learner will be returned.
    """
    # `None` instead of a mutable `{}` default avoids sharing state across calls.
    config = read_config_file(config_file, section, {} if overrides is None else overrides)
    
    # Set up dataset (the option value names the config section with its parameters)
    dataset_name = config[section]['dataset']
    dataset = load_dataset(dataset_name, **config[dataset_name])
    
    # Set up learner: start from METHOD_DEFAULTS (if present) and let a
    # method-specific section override those defaults.
    learner_name = config[section]['method']
    learner_config = dict(config['METHOD_DEFAULTS']) if 'METHOD_DEFAULTS' in config else dict()
    if learner_name in config:
        learner_config.update(config[learner_name])
    if isinstance(dataset, RegressionDataset):
        learner = REGRESSION_LEARNERS[learner_name](dataset.X_train_norm, **learner_config)
    elif isinstance(dataset, MultitaskRetrievalDataset):
        # Multitask datasets provide data per task, so the learner is created uninitialized.
        learner = LEARNERS[learner_name](**learner_config)
    else:
        learner = LEARNERS[learner_name](dataset.X_train_norm, **learner_config)
    
    return config, dataset, learner
def load_dataset_from_config(config_file, section, overrides = None):
    """ Instantiates a dataset from a given config file.

    See `read_config_file` for information about the format of the config file
    and the arguments of this function.

    # Arguments:
    - config_file: path to the config file.
    - section: name of the section to read the "dataset" option from.
    - overrides: optional dictionary with options overriding the ones read from the
                 config file in the section given by `section`.

    # Returns:
        a (configparser.ConfigParser, datasets.Dataset) tuple.
    """
    # `None` instead of a mutable `{}` default avoids sharing state across calls.
    config = read_config_file(config_file, section, {} if overrides is None else overrides)
    
    # Set up dataset (the option value names the config section with its parameters)
    dataset_name = config[section]['dataset']
    dataset = load_dataset(dataset_name, **config[dataset_name])
    
    return config, dataset
class ConversionInterpolation(configparser.BasicInterpolation):
    """ Interpolation for ConfigParser instances trying to cast all values to int, float, or boolean. """

    def before_get(self, parser, section, option, value, defaults):
        """ Returns the interpolated value cast to int, float, or bool where possible, else the raw string. """
        raw = super().before_get(parser, section, option, value, defaults)
        # Numeric casts are attempted in order of specificity: int before float.
        for cast in (int, float):
            try:
                return cast(raw)
            except ValueError:
                pass
        # Boolean keywords (case-insensitive); anything else stays a string.
        lowered = raw.lower()
        if lowered in ('yes', 'on', 'true'):
            return True
        if lowered in ('no', 'off', 'false'):
            return False
        return raw
########################
## Evaluation Metrics ##
########################
def ndcg(y_true, y_score):
    """ Computes the Normalized Discounted Cumulative Gain (NDCG) of given retrieval results.

    Only samples with a non-zero ground-truth label contribute to the ranking;
    positive labels add to the gain, while the normalizer accumulates the ideal
    gain obtained by ranking all positive samples first among the labeled ones.

    # Arguments:
    - y_true: ground-truth relevance labels of the retrieved samples
              (positive = relevant, negative = labeled irrelevant, 0 = unlabeled).
    - y_score: predicted relevance scores of the retrieved samples.

    # Returns:
        float in [0, 1]; 0.0 if no labeled sample was retrieved at all.
    """
    num_relevant = sum(yt > 0 for yt in y_true)
    retrieved = np.argsort(y_score)[::-1]  # indices sorted by descending score
    rank, cgain, normalizer = 0, 0.0, 0.0
    for ret in retrieved:
        if y_true[ret] != 0:
            rank += 1
            gain = 1.0 / math.log2(rank + 1)
            if y_true[ret] > 0:
                cgain += gain
            if rank <= num_relevant:
                normalizer += gain
    # Guard against division by zero when y_true contains no non-zero labels.
    return cgain / normalizer if normalizer > 0 else 0.0
def area_under_curve(perf, normalized = True):
    """ Computes the area under curve for a sequence of performance metrics.

    The area is computed with the trapezoidal rule assuming unit spacing
    between consecutive active learning steps.

    # Arguments:
    - perf: either a vector of performance measures for a number of consecutive
            active learning steps or a 2-D array containing one such vector per row.
    - normalized: if True, the x-axis will be re-scaled to [0, 1] so that the best
                  possible AUC (a constant performance of 1.0) is exactly 1.0.
                  If False, the raw trapezoidal area is returned.

    # Returns:
        float if perf is a vector, or vector with as many rows as perf if it is a 2-D array
    """
    perf = np.asarray(perf, dtype = float)
    if perf.ndim == 1:
        single = True
        perf = perf[None,:]
    else:
        single = False
    
    n = perf.shape[1]
    # Trapezoidal rule with unit spacing: interior points count fully, endpoints half.
    auc = perf[:,1:-1].sum(axis = -1) + (perf[:,0] + perf[:,-1]) / 2
    if normalized:
        # n points span n-1 unit intervals; dividing by n-1 maps a constant
        # performance of 1.0 to an AUC of exactly 1.0. A single point has no
        # extent, so it is returned as-is.
        auc = auc / (n - 1) if n > 1 else perf[:,0]
    
    return auc[0] if single else auc