#!/usr/bin/env python

- import sys
import argparse
- import logging
- import time
+ import sys
+
import morfessor
from morfessor import _logger

- def main(argv):
-     parser = argparse.ArgumentParser(
-         prog='morfessor-train',
-         description="""
- Morfessor %s
-
- Copyright (c) 2012, Sami Virpioja and Peter Smit
- All rights reserved.

- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
-
- Command-line arguments:
- """ % morfessor.__version__,
-         epilog="""
+ def main(argv):
+     parser = morfessor.get_default_argparser()
+     parser.prog = "morfessor-train"
+     parser.epilog = """
Simple usage example (train a model and save it to model.pickled):

%(prog)s -s model.pickled training_corpus.txt
@@ -52,312 +19,28 @@ Interactive use (read corpus from user):

%(prog)s -m online -v 2 -

- """,
-         formatter_class=argparse.RawDescriptionHelpFormatter,
-         add_help=False)
+ """
+
+     keep_options = ['savesegfile', 'savefile', 'trainmode', 'dampening',
+                     'encoding', 'list', 'skips', 'annofile', 'develfile',
+                     'corpusweight', 'annotationweight', 'help', 'version']
+     for action_group in parser._action_groups:
+         for arg in action_group._group_actions:
+             if arg.dest not in keep_options:
+                 arg.help = argparse.SUPPRESS

-     # Positional arguments
    parser.add_argument('trainfiles', metavar='<file>', nargs='+',
                        help='training data files')

-     # Options for input data files
-     add_arg = parser.add_argument_group('input data files').add_argument
-     add_arg('-l', '--load', dest="loadfile", default=None, metavar='<file>',
-             help="load existing model from file (pickled model object)")
-     add_arg('-L', '--load-segmentation', dest="loadsegfile", default=None,
-             metavar='<file>',
-             help="load existing model from segmentation "
-                  "file (Morfessor 1.0 format)")
-
-     # Options for output data files
-     add_arg = parser.add_argument_group('output data files').add_argument
-     add_arg('-s', '--save', dest="savefile", default=None, metavar='<file>',
-             help="save final model to file (pickled model object)")
-     add_arg('-S', '--save-segmentation', dest="savesegfile", default=None,
-             metavar='<file>',
-             help="save model segmentations to file (Morfessor 1.0 format)")
-     add_arg('-x', '--lexicon', dest="lexfile", default=None, metavar='<file>',
-             help="output final lexicon to given file")
-
-     # Options for data formats
-     add_arg = parser.add_argument_group(
-         'data format options').add_argument
-     add_arg('-e', '--encoding', dest='encoding', metavar='<encoding>',
-             help="encoding of input and output files (if none is given, "
-                  "both the local encoding and UTF-8 are tried)")
-     add_arg('--traindata-list', dest="list", default=False,
-             action='store_true',
-             help="input file(s) for batch training are lists "
-                  "(one compound per line, optionally count as a prefix)")
-     add_arg('--atom-separator', dest="separator", type=str, default=None,
-             metavar='<regexp>',
-             help="atom separator regexp (default %(default)s)")
-     add_arg('--compound-separator', dest="cseparator", type=str, default='\s+',
-             metavar='<regexp>',
-             help="compound separator regexp (default '%(default)s')")
-     add_arg('--analysis-separator', dest='analysisseparator', type=str,
-             default=',', metavar='<regexp>',
-             help="separator for different analyses in an annotation file. Use"
-                  " NONE for only allowing one analysis per line")
-
-     # Options for model training
-     add_arg = parser.add_argument_group(
-         'training and segmentation options').add_argument
-     add_arg('-m', '--mode', dest="trainmode", default='init+batch',
-             metavar='<mode>',
-             choices=['none', 'batch', 'init', 'init+batch', 'online',
-                      'online+batch'],
-             help="training mode ('none', 'init', 'batch', 'init+batch', "
-                  "'online', or 'online+batch'; default '%(default)s')")
-     add_arg('-a', '--algorithm', dest="algorithm", default='recursive',
-             metavar='<algorithm>', choices=['recursive', 'viterbi'],
-             help="algorithm type ('recursive', 'viterbi'; default "
-                  "'%(default)s')")
-     add_arg('-d', '--dampening', dest="dampening", type=str, default='none',
-             metavar='<type>', choices=['none', 'log', 'ones'],
-             help="frequency dampening for training data ('none', 'log', or "
-                  "'ones'; default '%(default)s')")
-     add_arg('-f', '--forcesplit', dest="forcesplit", type=list, default=['-'],
-             metavar='<list>',
-             help="force split on given atoms (default %(default)s)")
-     add_arg('-F', '--finish-threshold', dest='finish_threshold', type=float,
-             default=0.005, metavar='<float>',
-             help="Stopping threshold. Training stops when "
-                  "the improvement of the last iteration is"
-                  "smaller then finish_threshold * #boundaries; "
-                  "(default '%(default)s')")
-     add_arg('-r', '--randseed', dest="randseed", default=None,
-             metavar='<seed>',
-             help="seed for random number generator")
-     add_arg('-R', '--randsplit', dest="splitprob", default=None, type=float,
-             metavar='<float>',
-             help="initialize new words by random splitting using the given "
-                  "split probability (default no splitting)")
-     add_arg('--skips', dest="skips", default=False, action='store_true',
-             help="use random skips for frequently seen compounds to speed up "
-                  "training")
-     add_arg('--batch-minfreq', dest="freqthreshold", type=int, default=1,
-             metavar='<int>',
-             help="compound frequency threshold for batch training (default "
-                  "%(default)s)")
-     add_arg('--online-epochint', dest="epochinterval", type=int,
-             default=10000, metavar='<int>',
-             help="epoch interval for online training (default %(default)s)")
-     add_arg('--viterbi-smoothing', dest="viterbismooth", default=0,
-             type=float, metavar='<float>',
-             help="additive smoothing parameter for Viterbi training "
-                  "and segmentation (default %(default)s)")
-     add_arg('--viterbi-maxlen', dest="viterbimaxlen", default=30,
-             type=int, metavar='<int>',
-             help="maximum construction length in Viterbi training "
-                  "and segmentation (default %(default)s)")
-
-     # Options for semi-supervised model training
-     add_arg = parser.add_argument_group(
-         'semi-supervised training options').add_argument
-     add_arg('-A', '--annotations', dest="annofile", default=None,
-             metavar='<file>',
-             help="load annotated data for semi-supervised learning")
-     add_arg('-D', '--develset', dest="develfile", default=None,
-             metavar='<file>',
-             help="load annotated data for tuning the corpus weight parameter")
-     add_arg('-w', '--corpusweight', dest="corpusweight", type=float,
-             default=1.0, metavar='<float>',
-             help="corpus weight parameter (default %(default)s); "
-                  "sets the initial value if --develset is used")
-     add_arg('-W', '--annotationweight', dest="annotationweight",
-             type=float, default=None, metavar='<float>',
-             help="corpus weight parameter for annotated data (if unset, the "
-                  "weight is set to balance the number of tokens in annotated "
-                  "and unannotated data sets)")
-
-     # Options for logging
-     add_arg = parser.add_argument_group('logging options').add_argument
-     add_arg('-v', '--verbose', dest="verbose", type=int, default=1,
-             metavar='<int>',
-             help="verbose level; controls what is written to the standard "
-                  "error stream or log file (default %(default)s)")
-     add_arg('--logfile', dest='log_file', metavar='<file>',
-             help="write log messages to file in addition to standard "
-                  "error stream")
-     add_arg('--progressbar', dest='progress', default=False,
-             action='store_true',
-             help="Force the progressbar to be displayed (possibly lowers the "
-                  "log level for the standard error stream)")
-
-     add_arg = parser.add_argument_group('other options').add_argument
-     add_arg('-h', '--help', action='help',
-             help="show this help message and exit")
-     add_arg('--version', action='version',
-             version='%(prog)s ' + morfessor.__version__,
-             help="show version number and exit")
-
-     args = parser.parse_args(argv[1:])
-
-     if args.verbose >= 2:
-         loglevel = logging.DEBUG
-     elif args.verbose >= 1:
-         loglevel = logging.INFO
-     else:
-         loglevel = logging.WARNING
-
-     logging_format = '%(asctime)s - %(message)s'
-     date_format = '%Y-%m-%d %H:%M:%S'
-     default_formatter = logging.Formatter(logging_format, date_format)
-     plain_formatter = logging.Formatter('%(message)s')
-     logging.basicConfig(level=loglevel)
-     _logger.propagate = False  # do not forward messages to the root logger
-
-     # Basic settings for logging to the error stream
-     ch = logging.StreamHandler()
-     ch.setLevel(loglevel)
-     ch.setFormatter(plain_formatter)
-     _logger.addHandler(ch)
-
-     # Settings for when log_file is present
-     if args.log_file is not None:
-         fh = logging.FileHandler(args.log_file, 'w')
-         fh.setLevel(loglevel)
-         fh.setFormatter(default_formatter)
-         _logger.addHandler(fh)
-         # If logging to a file, make INFO the highest level for the
-         # error stream
-         ch.setLevel(max(loglevel, logging.INFO))
-
-     # If debug messages are printed to screen or if stderr is not a tty (but
-     # a pipe or a file), don't show the progressbar
-     global show_progress_bar
-     if (ch.level > logging.INFO or
-             (hasattr(sys.stderr, 'isatty') and not sys.stderr.isatty())):
-         show_progress_bar = False
-
-     if args.progress:
-         show_progress_bar = True
-         ch.setLevel(min(ch.level, logging.INFO))
-
-     if len(args.trainfiles) == 0:
-         parser.error("not training data files defined")
-
-     if args.randseed is not None:
-         random.seed(args.randseed)
-
-     io = morfessor.MorfessorIO(encoding=args.encoding,
-                                compound_separator=args.cseparator,
-                                atom_separator=args.separator)
-
-     # Load exisiting model or create a new one
-     if args.loadfile is not None:
-         model = io.read_binary_model_file(args.loadfile)
-
-     else:
-         model = morfessor.BaselineModel(forcesplit_list=args.forcesplit,
-                                         corpusweight=args.corpusweight,
-                                         use_skips=args.skips)
-
-     if args.loadsegfile is not None:
-         model.load_segmentations(io.read_segmentation_file(args.loadsegfile))
-
-     analysis_sep = (args.analysisseparator
-                     if args.analysisseparator != 'NONE' else None)
-
-     if args.annofile is not None:
-         annotations = io.read_annotations_file(args.annofile,
-                                                analysis_sep=analysis_sep)
-         model.set_annotations(annotations, args.annotationweight)
-
-     if args.develfile is not None:
-         develannots = io.read_annotations_file(args.develfile,
-                                                analysis_sep=analysis_sep)
-     else:
-         develannots = None
-
-     # Set frequency dampening function
-     if args.dampening == 'none':
-         dampfunc = lambda x: x
-     elif args.dampening == 'log':
-         dampfunc = lambda x: int(round(math.log(x + 1, 2)))
-     elif args.dampening == 'ones':
-         dampfunc = lambda x: 1
-     else:
-         parser.error("unknown dampening type '%s'" % args.dampening)
-
-     # Set algorithm parameters
-     if args.algorithm == 'viterbi':
-         algparams = (args.viterbismooth, args.viterbimaxlen)
-     else:
-         algparams = ()
-
-     # Train model
-     if args.trainmode == 'none':
-         pass
-     elif args.trainmode == 'batch':
-         if len(model.get_compounds()) == 0:
-             _logger.warning("Model contains no compounds for batch training."
-                             " Use 'init+batch' mode to add new data.")
-         else:
-             if len(args.trainfiles) > 0:
-                 _logger.warning("Training mode 'batch' ignores new data "
-                                 "files. Use 'init+batch' or 'online' to "
-                                 "add new compounds.")
-             ts = time.time()
-             e, c = model.train_batch(args.algorithm, algparams, develannots,
-                                      args.finish_threshold)
-             te = time.time()
-             _logger.info("Epochs: %s" % e)
-             _logger.info("Final cost: %s" % c)
-             _logger.info("Training time: %.3fs" % (te - ts))
-     elif len(args.trainfiles) > 0:
-         ts = time.time()
-         if args.trainmode == 'init':
-             for f in args.trainfiles:
-                 if args.list:
-                     data = io.read_corpus_list_file(f)
-                 else:
-                     data = io.read_corpus_file(f)
-                 c = model.load_data(data, args.freqthreshold, dampfunc,
-                                     args.splitprob)
-         elif args.trainmode == 'init+batch':
-             for f in args.trainfiles:
-                 if args.list:
-                     data = io.read_corpus_list_file(f)
-                 else:
-                     data = io.read_corpus_file(f)
-                 model.load_data(data, args.freqthreshold, dampfunc,
-                                 args.splitprob)
-             e, c = model.train_batch(args.algorithm, algparams, develannots,
-                                      args.finish_threshold)
-             _logger.info("Epochs: %s" % e)
-         elif args.trainmode == 'online':
-             data = io.read_corpus_files(args.trainfiles)
-             e, c = model.train_online(data, dampfunc, args.epochinterval,
-                                       args.algorithm, algparams,
-                                       args.splitprob)
-             _logger.info("Epochs: %s" % e)
-         elif args.trainmode == 'online+batch':
-             data = io.read_corpus_files(args.trainfiles)
-             e, c = model.train_online(data, dampfunc, args.epochinterval,
-                                       args.algorithm, algparams,
-                                       args.splitprob)
-             e, c = model.train_batch(args.algorithm, algparams, develannots,
-                                      args.finish_threshold)
-             _logger.info("Epochs: %s" % e)
-         else:
-             parser.error("unknown training mode '%s'" % args.trainmode)
-         te = time.time()
-         _logger.info("Final cost: %s" % c)
-         _logger.info("Training time: %.3fs" % (te - ts))
-     else:
-         _logger.warning("No training data files specified.")
-
-     # Save model
-     if args.savefile is not None:
-         io.write_binary_model_file(args.savefile, model)
-
-     if args.savesegfile is not None:
-         io.write_segmentation_file(args.savesegfile, model.get_segmentations())
+     try:
+         args = parser.parse_args(argv)
+         morfessor.main(args)
+     except morfessor.ArgumentException as e:
+         parser.error(e.message)
+     except Exception as e:
+         _logger.error("Fatal Error %s %s" % (type(e), str(e)))
+         raise

-     # Output lexicon
-     if args.lexfile is not None:
-         io.write_lexicon_file(args.lexfile, model.get_constructions())

- main(sys.argv)
+ if __name__ == "__main__":
+     main(sys.argv[1:])
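Side note on the option-hiding pattern in the new wrapper: get_default_argparser() and morfessor.main() come from the morfessor package itself, but the loop that trims the help output relies on argparse's private _action_groups and _group_actions attributes. A minimal standalone sketch of the same idea, with a made-up toy parser and option names purely for illustration:

    import argparse

    # Stand-in for a shared parser that defines more options than this
    # particular front-end wants to advertise in its help text.
    parser = argparse.ArgumentParser(prog='demo')
    parser.add_argument('-s', '--save', dest='savefile', help='save model to file')
    parser.add_argument('-x', '--lexicon', dest='lexfile', help='output lexicon file')

    keep_options = ['savefile', 'help']
    for action_group in parser._action_groups:    # private attributes, not documented API
        for arg in action_group._group_actions:
            if arg.dest not in keep_options:
                arg.help = argparse.SUPPRESS      # hidden from --help ...

    args = parser.parse_args(['-x', 'lexicon.txt'])  # ... but still accepted
    print(args.lexfile)

Suppressed options keep working as before; they simply no longer appear in the generated help, which is what lets morfessor-train expose only the training-related subset of the shared argument parser.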