Skip to content

Commit 4d9baa8

Browse files
committed Feb 28, 2013
Create new version of convenience scripts (morfessor-train, morfessor-segment)
1 parent 1257427 commit 4d9baa8

File tree

5 files changed

+109
-555
lines changed

5 files changed

+109
-555
lines changed
 

‎.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ develop-eggs
1818
lib
1919
lib64
2020
MANIFEST
21+
env*
2122

2223
# Installer logs
2324
pip-log.txt

‎morfessor.py

+42-30
Original file line number | Diff line number | Diff line change
@@ -1275,11 +1275,11 @@ def get_cost(self):
12751275
return 0.0
12761276

12771277
n = self.tokens + self.boundaries
1278-
return ((n * math.log(n)
1279-
- self.boundaries * math.log(self.boundaries)
1280-
- self.logtokensum
1281-
+ self.permutations_cost()) * self.weight
1282-
+ self.frequency_distribution_cost())
1278+
return ((n * math.log(n)
1279+
- self.boundaries * math.log(self.boundaries)
1280+
- self.logtokensum
1281+
+ self.permutations_cost()) * self.weight
1282+
+ self.frequency_distribution_cost())
12831283

12841284

12851285
class CorpusEncoding(Encoding):
@@ -1322,10 +1322,10 @@ def get_cost(self):
13221322
return 0.0
13231323

13241324
n = self.tokens + self.boundaries
1325-
return ((n * math.log(n)
1326-
- self.boundaries * math.log(self.boundaries)
1327-
- self.logtokensum) * self.weight
1328-
+ self.frequency_distribution_cost())
1325+
return ((n * math.log(n)
1326+
- self.boundaries * math.log(self.boundaries)
1327+
- self.logtokensum) * self.weight
1328+
+ self.frequency_distribution_cost())
13291329

13301330

13311331
class AnnotatedCorpusEncoding(Encoding):
@@ -1410,10 +1410,10 @@ def get_cost(self):
14101410
if self.boundaries == 0:
14111411
return 0.0
14121412
n = self.tokens + self.boundaries
1413-
return ((n * math.log(self.corpus_coding.tokens +
1414-
self.corpus_coding.boundaries)
1415-
- self.boundaries * math.log(self.corpus_coding.boundaries)
1416-
- self.logtokensum) * self.weight)
1413+
return ((n * math.log(self.corpus_coding.tokens +
1414+
self.corpus_coding.boundaries)
1415+
- self.boundaries * math.log(self.corpus_coding.boundaries)
1416+
- self.logtokensum) * self.weight)
14171417

14181418

14191419
class LexiconEncoding(Encoding):
@@ -1468,7 +1468,7 @@ def get_codelength(self, construction):
14681468
return cost
14691469

14701470

1471-
def main(argv):
1471+
def get_default_argparser():
14721472
import argparse
14731473

14741474
parser = argparse.ArgumentParser(
@@ -1557,7 +1557,7 @@ def main(argv):
15571557
'data format options').add_argument
15581558
add_arg('-e', '--encoding', dest='encoding', metavar='<encoding>',
15591559
help="encoding of input and output files (if none is given, "
1560-
"both the local encoding and UTF-8 are tried)")
1560+
"both the local encoding and UTF-8 are tried)")
15611561
add_arg('--traindata-list', dest="list", default=False,
15621562
action='store_true',
15631563
help="input file(s) for batch training are lists "
@@ -1574,9 +1574,9 @@ def main(argv):
15741574
" NONE for only allowing one analysis per line")
15751575
add_arg('--output-format', dest='outputformat', type=str,
15761576
default=r'{analysis}\n', metavar='<format>',
1577-
help="format string for --output file. Valid keywords are "
1578-
"{analysis}, {compound}, {count}, and {logprob} "
1579-
"(default: '%(default)s')")
1577+
help="format string for --output file. Valid keywords are "
1578+
"{analysis}, {compound}, {count}, and {logprob} "
1579+
"(default: '%(default)s')")
15801580

15811581
# Options for model training
15821582
add_arg = parser.add_argument_group(
@@ -1586,7 +1586,7 @@ def main(argv):
15861586
choices=['none', 'batch', 'init', 'init+batch', 'online',
15871587
'online+batch'],
15881588
help="training mode ('none', 'init', 'batch', 'init+batch', "
1589-
"'online', or 'online+batch'; default '%(default)s')")
1589+
"'online', or 'online+batch'; default '%(default)s')")
15901590
add_arg('-a', '--algorithm', dest="algorithm", default='recursive',
15911591
metavar='<algorithm>', choices=['recursive', 'viterbi'],
15921592
help="algorithm type ('recursive', 'viterbi'; default "
@@ -1624,11 +1624,11 @@ def main(argv):
16241624
add_arg('--viterbi-smoothing', dest="viterbismooth", default=0,
16251625
type=float, metavar='<float>',
16261626
help="additive smoothing parameter for Viterbi training "
1627-
"and segmentation (default %(default)s)")
1627+
"and segmentation (default %(default)s)")
16281628
add_arg('--viterbi-maxlen', dest="viterbimaxlen", default=30,
16291629
type=int, metavar='<int>',
16301630
help="maximum construction length in Viterbi training "
1631-
"and segmentation (default %(default)s)")
1631+
"and segmentation (default %(default)s)")
16321632

16331633
# Options for semi-supervised model training
16341634
add_arg = parser.add_argument_group(
@@ -1642,7 +1642,7 @@ def main(argv):
16421642
add_arg('-w', '--corpusweight', dest="corpusweight", type=float,
16431643
default=1.0, metavar='<float>',
16441644
help="corpus weight parameter (default %(default)s); "
1645-
"sets the initial value if --develset is used")
1645+
"sets the initial value if --develset is used")
16461646
add_arg('-W', '--annotationweight', dest="annotationweight",
16471647
type=float, default=None, metavar='<float>',
16481648
help="corpus weight parameter for annotated data (if unset, the "
@@ -1657,7 +1657,7 @@ def main(argv):
16571657
"error stream or log file (default %(default)s)")
16581658
add_arg('--logfile', dest='log_file', metavar='<file>',
16591659
help="write log messages to file in addition to standard "
1660-
"error stream")
1660+
"error stream")
16611661
add_arg('--progressbar', dest='progress', default=False,
16621662
action='store_true',
16631663
help="Force the progressbar to be displayed (possibly lowers the "
@@ -1670,8 +1670,10 @@ def main(argv):
16701670
version='%(prog)s ' + __version__,
16711671
help="show version number and exit")
16721672

1673-
args = parser.parse_args(argv[1:])
1673+
return parser
16741674

1675+
1676+
def main(args):
16751677
if args.verbose >= 2:
16761678
loglevel = logging.DEBUG
16771679
elif args.verbose >= 1:
@@ -1716,7 +1718,8 @@ def main(argv):
17161718
if (args.loadfile is None and
17171719
args.loadsegfile is None and
17181720
len(args.trainfiles) == 0):
1719-
parser.error("either model file or training data should be defined")
1721+
raise ArgumentException("either model file or training data should "
1722+
"be defined")
17201723

17211724
if args.randseed is not None:
17221725
random.seed(args.randseed)
@@ -1759,7 +1762,7 @@ def main(argv):
17591762
elif args.dampening == 'ones':
17601763
dampfunc = lambda x: 1
17611764
else:
1762-
parser.error("unknown dampening type '%s'" % args.dampening)
1765+
raise ArgumentException("unknown dampening type '%s'" % args.dampening)
17631766

17641767
# Set algorithm parameters
17651768
if args.algorithm == 'viterbi':
@@ -1822,7 +1825,8 @@ def main(argv):
18221825
args.finish_threshold)
18231826
_logger.info("Epochs: %s" % e)
18241827
else:
1825-
parser.error("unknown training mode '%s'" % args.trainmode)
1828+
raise ArgumentException("unknown training mode '%s'"
1829+
% args.trainmode)
18261830
te = time.time()
18271831
_logger.info("Final cost: %s" % c)
18281832
_logger.info("Training time: %.3fs" % (te - ts))
@@ -1856,17 +1860,25 @@ def main(argv):
18561860
atoms, args.viterbismooth, args.viterbimaxlen)
18571861
analysis = ' '.join(constructions)
18581862
fobj.write(outformat.format(
1859-
analysis=analysis, compound=compound,
1860-
count=count, logprob=logp))
1863+
analysis=analysis, compound=compound,
1864+
count=count, logprob=logp))
18611865
i += 1
18621866
if i % 10000 == 0:
18631867
sys.stderr.write(".")
18641868
sys.stderr.write("\n")
18651869
_logger.info("Done.")
18661870

1871+
1872+
class ArgumentException(Exception):
    """Signal that the parsed command-line arguments cannot be used.

    main() raises this instead of calling parser.error() itself, so the
    caller that owns the parser can report the problem.

    The first constructor argument is also stored as ``message``: the
    callers report the error with ``parser.error(e.message)``, and
    exceptions have no built-in ``message`` attribute on Python 3 (it is
    deprecated since Python 2.6).
    """

    def __init__(self, *args):
        Exception.__init__(self, *args)
        # Keep e.message working on every Python version; an exception
        # raised without arguments reports an empty message.
        self.message = args[0] if args else ""
1874+
18671875
if __name__ == "__main__":
1876+
parser = get_default_argparser()
18681877
try:
1869-
main(sys.argv)
1878+
args = parser.parse_args(sys.argv[1:])
1879+
main(args)
1880+
except ArgumentException as e:
1881+
parser.error(e.message)
18701882
except Exception as e:
18711883
_logger.error("Fatal Error %s %s" % (type(e), str(e)))
18721884
raise

‎scripts/morfessor

+17-2
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,21 @@
22

33
import sys
44

5-
from morfessor import main
5+
import morfessor
6+
from morfessor import _logger
67

7-
main(sys.argv)
8+
9+
def main(argv):
    """Run the Morfessor command-line tool on an argument list.

    argv -- command-line arguments without the program name (the
            entry point below passes sys.argv[1:])

    Builds the default argument parser, parses argv and hands the result
    to morfessor.main().  An ArgumentException raised there is reported
    through parser.error(); any other exception is logged and re-raised.
    """
    parser = morfessor.get_default_argparser()
    try:
        args = parser.parse_args(argv)
        morfessor.main(args)
    except morfessor.ArgumentException as e:
        # str(e) rather than e.message: the message attribute is deprecated
        # since Python 2.6 and does not exist on Python 3 exceptions.
        parser.error(str(e))
    except Exception as e:
        _logger.error("Fatal Error %s %s" % (type(e), str(e)))
        raise
19+
20+
21+
if __name__ == "__main__":
22+
main(sys.argv[1:])

‎scripts/morfessor-segment

+24-181
Original file line number | Diff line number | Diff line change
@@ -1,49 +1,16 @@
11
#!/usr/bin/env python
22

3-
import sys
43
import argparse
5-
import logging
6-
import time
4+
import sys
5+
76
import morfessor
87
from morfessor import _logger
98

10-
def main(argv):
11-
parser = argparse.ArgumentParser(
12-
prog='morfessor-segment',
13-
description="""
14-
Morfessor %s
15-
16-
Copyright (c) 2012, Sami Virpioja and Peter Smit
17-
All rights reserved.
18-
19-
Redistribution and use in source and binary forms, with or without
20-
modification, are permitted provided that the following conditions
21-
are met:
22-
23-
1. Redistributions of source code must retain the above copyright
24-
notice, this list of conditions and the following disclaimer.
259

26-
2. Redistributions in binary form must reproduce the above
27-
copyright notice, this list of conditions and the following
28-
disclaimer in the documentation and/or other materials provided
29-
with the distribution.
30-
31-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
32-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
33-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
34-
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
35-
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
36-
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
37-
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38-
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40-
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
41-
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
42-
POSSIBILITY OF SUCH DAMAGE.
43-
44-
Command-line arguments:
45-
""" % morfessor.__version__,
46-
epilog="""
10+
def main(argv):
11+
parser = morfessor.get_default_argparser()
12+
parser.prog = "morfessor-segment"
13+
parser.epilog = """
4714
Simple usage example (load model.pickled and use it to segment test corpus):
4815
4916
%(prog)s -l model.pickled -o test_corpus.segmented test_corpus.txt
@@ -52,151 +19,27 @@ Interactive use (read corpus from user):
5219
5320
%(prog)s -l model.pickled -
5421
55-
""",
56-
formatter_class=argparse.RawDescriptionHelpFormatter,
57-
add_help=False)
22+
"""
23+
24+
keep_options = ['encoding', 'loadfile', 'loadsegfile', 'outfile', 'help',
25+
'version']
26+
for action_group in parser._action_groups:
27+
for arg in action_group._group_actions:
28+
if arg.dest not in keep_options:
29+
arg.help = argparse.SUPPRESS
5830

59-
# Positional arguments
6031
parser.add_argument('testfiles', metavar='<file>', nargs='+',
6132
help='corpus files to segment')
6233

63-
# Options for input data files
64-
add_arg = parser.add_argument_group('input data files').add_argument
65-
add_arg('-l', '--load', dest="loadfile", default=None, metavar='<file>',
66-
help="load existing model from file (pickled model object)")
67-
add_arg('-L', '--load-segmentation', dest="loadsegfile", default=None,
68-
metavar='<file>',
69-
help="load existing model from segmentation "
70-
"file (Morfessor 1.0 format)")
71-
72-
# Options for output data files
73-
add_arg = parser.add_argument_group('output data files').add_argument
74-
add_arg('-o', '--output', dest="outfile", default='-', metavar='<file>',
75-
help="output file for test data results (for standard output, "
76-
"use '-'; default '%(default)s')")
77-
add_arg('-x', '--lexicon', dest="lexfile", default=None, metavar='<file>',
78-
help="output model lexicon to given file")
79-
80-
# Options for data formats
81-
add_arg = parser.add_argument_group(
82-
'data format options').add_argument
83-
add_arg('-e', '--encoding', dest='encoding', metavar='<encoding>',
84-
help="encoding of input and output files (if none is given, "
85-
"both the local encoding and UTF-8 are tried)")
86-
add_arg('--atom-separator', dest="separator", type=str, default=None,
87-
metavar='<regexp>',
88-
help="atom separator regexp (default %(default)s)")
89-
add_arg('--compound-separator', dest="cseparator", type=str, default='\s+',
90-
metavar='<regexp>',
91-
help="compound separator regexp (default '%(default)s')")
92-
93-
# Options for model training
94-
add_arg = parser.add_argument_group(
95-
'segmentation options').add_argument
96-
add_arg('--viterbi-smoothing', dest="viterbismooth", default=0,
97-
type=float, metavar='<float>',
98-
help="additive smoothing parameter for Viterbi training "
99-
"and segmentation (default %(default)s)")
100-
add_arg('--viterbi-maxlen', dest="viterbimaxlen", default=30,
101-
type=int, metavar='<int>',
102-
help="maximum construction length in Viterbi training "
103-
"and segmentation (default %(default)s)")
104-
105-
# Options for logging
106-
add_arg = parser.add_argument_group('logging options').add_argument
107-
add_arg('-v', '--verbose', dest="verbose", type=int, default=1,
108-
metavar='<int>',
109-
help="verbose level; controls what is written to the standard "
110-
"error stream or log file (default %(default)s)")
111-
add_arg('--logfile', dest='log_file', metavar='<file>',
112-
help="write log messages to file in addition to standard "
113-
"error stream")
114-
add_arg('--progressbar', dest='progress', default=False,
115-
action='store_true',
116-
help="Force the progressbar to be displayed (possibly lowers the "
117-
"log level for the standard error stream)")
118-
119-
add_arg = parser.add_argument_group('other options').add_argument
120-
add_arg('-h', '--help', action='help',
121-
help="show this help message and exit")
122-
add_arg('--version', action='version',
123-
version='%(prog)s ' + morfessor.__version__,
124-
help="show version number and exit")
125-
126-
args = parser.parse_args(argv[1:])
127-
128-
if args.verbose >= 2:
129-
loglevel = logging.DEBUG
130-
elif args.verbose >= 1:
131-
loglevel = logging.INFO
132-
else:
133-
loglevel = logging.WARNING
134-
135-
logging_format = '%(asctime)s - %(message)s'
136-
date_format = '%Y-%m-%d %H:%M:%S'
137-
default_formatter = logging.Formatter(logging_format, date_format)
138-
plain_formatter = logging.Formatter('%(message)s')
139-
logging.basicConfig(level=loglevel)
140-
_logger.propagate = False # do not forward messages to the root logger
141-
142-
# Basic settings for logging to the error stream
143-
ch = logging.StreamHandler()
144-
ch.setLevel(loglevel)
145-
ch.setFormatter(plain_formatter)
146-
_logger.addHandler(ch)
147-
148-
# Settings for when log_file is present
149-
if args.log_file is not None:
150-
fh = logging.FileHandler(args.log_file, 'w')
151-
fh.setLevel(loglevel)
152-
fh.setFormatter(default_formatter)
153-
_logger.addHandler(fh)
154-
# If logging to a file, make INFO the highest level for the
155-
# error stream
156-
ch.setLevel(max(loglevel, logging.INFO))
157-
158-
# If debug messages are printed to screen or if stderr is not a tty (but
159-
# a pipe or a file), don't show the progressbar
160-
global show_progress_bar
161-
if (ch.level > logging.INFO or
162-
(hasattr(sys.stderr, 'isatty') and not sys.stderr.isatty())):
163-
show_progress_bar = False
164-
165-
if args.progress:
166-
show_progress_bar = True
167-
ch.setLevel(min(ch.level, logging.INFO))
168-
169-
io = morfessor.MorfessorIO(encoding=args.encoding,
170-
compound_separator=args.cseparator,
171-
atom_separator=args.separator)
172-
173-
# Load exisiting model or create a new one
174-
if args.loadfile is not None:
175-
model = io.read_binary_model_file(args.loadfile)
176-
177-
else:
178-
model = morfessor.BaselineModel()
179-
180-
if args.loadsegfile is not None:
181-
model.load_segmentations(io.read_segmentation_file(args.loadsegfile))
182-
183-
# Output lexicon
184-
if args.lexfile is not None:
185-
io.write_lexicon_file(args.lexfile, model.get_constructions())
34+
try:
35+
args = parser.parse_args(argv)
36+
morfessor.main(args)
37+
except morfessor.ArgumentException as e:
38+
parser.error(e.message)
39+
except Exception as e:
40+
_logger.error("Fatal Error %s %s" % (type(e), str(e)))
41+
raise
18642

187-
# Segment test data
188-
_logger.info("Segmenting test data...")
189-
with io._open_text_file_write(args.outfile) as fobj:
190-
testdata = io.read_corpus_files(args.testfiles)
191-
i = 0
192-
for _, _, compound in testdata:
193-
constructions, logp = model.viterbi_segment(
194-
compound, args.viterbismooth, args.viterbimaxlen)
195-
fobj.write("%s\n" % ' '.join(constructions))
196-
i += 1
197-
if i % 10000 == 0:
198-
sys.stderr.write(".")
199-
sys.stderr.write("\n")
200-
_logger.info("Done.")
20143

202-
main(sys.argv)
44+
if __name__ == "__main__":
45+
main(sys.argv[1:])

‎scripts/morfessor-train

+25-342
Original file line number | Diff line number | Diff line change
@@ -1,49 +1,16 @@
11
#!/usr/bin/env python
22

3-
import sys
43
import argparse
5-
import logging
6-
import time
4+
import sys
5+
76
import morfessor
87
from morfessor import _logger
98

10-
def main(argv):
11-
parser = argparse.ArgumentParser(
12-
prog='morfessor-train',
13-
description="""
14-
Morfessor %s
15-
16-
Copyright (c) 2012, Sami Virpioja and Peter Smit
17-
All rights reserved.
189

19-
Redistribution and use in source and binary forms, with or without
20-
modification, are permitted provided that the following conditions
21-
are met:
22-
23-
1. Redistributions of source code must retain the above copyright
24-
notice, this list of conditions and the following disclaimer.
25-
26-
2. Redistributions in binary form must reproduce the above
27-
copyright notice, this list of conditions and the following
28-
disclaimer in the documentation and/or other materials provided
29-
with the distribution.
30-
31-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
32-
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
33-
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
34-
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
35-
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
36-
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
37-
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38-
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
40-
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
41-
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
42-
POSSIBILITY OF SUCH DAMAGE.
43-
44-
Command-line arguments:
45-
""" % morfessor.__version__,
46-
epilog="""
10+
def main(argv):
11+
parser = morfessor.get_default_argparser()
12+
parser.prog = "morfessor-train"
13+
parser.epilog = """
4714
Simple usage example (train a model and save it to model.pickled):
4815
4916
%(prog)s -s model.pickled training_corpus.txt
@@ -52,312 +19,28 @@ Interactive use (read corpus from user):
5219
5320
%(prog)s -m online -v 2 -
5421
55-
""",
56-
formatter_class=argparse.RawDescriptionHelpFormatter,
57-
add_help=False)
22+
"""
23+
24+
keep_options = ['savesegfile', 'savefile', 'trainmode', 'dampening',
25+
'encoding', 'list', 'skips', 'annofile', 'develfile',
26+
'corpusweight', 'annotationweight', 'help', 'version']
27+
for action_group in parser._action_groups:
28+
for arg in action_group._group_actions:
29+
if arg.dest not in keep_options:
30+
arg.help = argparse.SUPPRESS
5831

59-
# Positional arguments
6032
parser.add_argument('trainfiles', metavar='<file>', nargs='+',
6133
help='training data files')
6234

63-
# Options for input data files
64-
add_arg = parser.add_argument_group('input data files').add_argument
65-
add_arg('-l', '--load', dest="loadfile", default=None, metavar='<file>',
66-
help="load existing model from file (pickled model object)")
67-
add_arg('-L', '--load-segmentation', dest="loadsegfile", default=None,
68-
metavar='<file>',
69-
help="load existing model from segmentation "
70-
"file (Morfessor 1.0 format)")
71-
72-
# Options for output data files
73-
add_arg = parser.add_argument_group('output data files').add_argument
74-
add_arg('-s', '--save', dest="savefile", default=None, metavar='<file>',
75-
help="save final model to file (pickled model object)")
76-
add_arg('-S', '--save-segmentation', dest="savesegfile", default=None,
77-
metavar='<file>',
78-
help="save model segmentations to file (Morfessor 1.0 format)")
79-
add_arg('-x', '--lexicon', dest="lexfile", default=None, metavar='<file>',
80-
help="output final lexicon to given file")
81-
82-
# Options for data formats
83-
add_arg = parser.add_argument_group(
84-
'data format options').add_argument
85-
add_arg('-e', '--encoding', dest='encoding', metavar='<encoding>',
86-
help="encoding of input and output files (if none is given, "
87-
"both the local encoding and UTF-8 are tried)")
88-
add_arg('--traindata-list', dest="list", default=False,
89-
action='store_true',
90-
help="input file(s) for batch training are lists "
91-
"(one compound per line, optionally count as a prefix)")
92-
add_arg('--atom-separator', dest="separator", type=str, default=None,
93-
metavar='<regexp>',
94-
help="atom separator regexp (default %(default)s)")
95-
add_arg('--compound-separator', dest="cseparator", type=str, default='\s+',
96-
metavar='<regexp>',
97-
help="compound separator regexp (default '%(default)s')")
98-
add_arg('--analysis-separator', dest='analysisseparator', type=str,
99-
default=',', metavar='<regexp>',
100-
help="separator for different analyses in an annotation file. Use"
101-
" NONE for only allowing one analysis per line")
102-
103-
# Options for model training
104-
add_arg = parser.add_argument_group(
105-
'training and segmentation options').add_argument
106-
add_arg('-m', '--mode', dest="trainmode", default='init+batch',
107-
metavar='<mode>',
108-
choices=['none', 'batch', 'init', 'init+batch', 'online',
109-
'online+batch'],
110-
help="training mode ('none', 'init', 'batch', 'init+batch', "
111-
"'online', or 'online+batch'; default '%(default)s')")
112-
add_arg('-a', '--algorithm', dest="algorithm", default='recursive',
113-
metavar='<algorithm>', choices=['recursive', 'viterbi'],
114-
help="algorithm type ('recursive', 'viterbi'; default "
115-
"'%(default)s')")
116-
add_arg('-d', '--dampening', dest="dampening", type=str, default='none',
117-
metavar='<type>', choices=['none', 'log', 'ones'],
118-
help="frequency dampening for training data ('none', 'log', or "
119-
"'ones'; default '%(default)s')")
120-
add_arg('-f', '--forcesplit', dest="forcesplit", type=list, default=['-'],
121-
metavar='<list>',
122-
help="force split on given atoms (default %(default)s)")
123-
add_arg('-F', '--finish-threshold', dest='finish_threshold', type=float,
124-
default=0.005, metavar='<float>',
125-
help="Stopping threshold. Training stops when "
126-
"the improvement of the last iteration is"
127-
"smaller then finish_threshold * #boundaries; "
128-
"(default '%(default)s')")
129-
add_arg('-r', '--randseed', dest="randseed", default=None,
130-
metavar='<seed>',
131-
help="seed for random number generator")
132-
add_arg('-R', '--randsplit', dest="splitprob", default=None, type=float,
133-
metavar='<float>',
134-
help="initialize new words by random splitting using the given "
135-
"split probability (default no splitting)")
136-
add_arg('--skips', dest="skips", default=False, action='store_true',
137-
help="use random skips for frequently seen compounds to speed up "
138-
"training")
139-
add_arg('--batch-minfreq', dest="freqthreshold", type=int, default=1,
140-
metavar='<int>',
141-
help="compound frequency threshold for batch training (default "
142-
"%(default)s)")
143-
add_arg('--online-epochint', dest="epochinterval", type=int,
144-
default=10000, metavar='<int>',
145-
help="epoch interval for online training (default %(default)s)")
146-
add_arg('--viterbi-smoothing', dest="viterbismooth", default=0,
147-
type=float, metavar='<float>',
148-
help="additive smoothing parameter for Viterbi training "
149-
"and segmentation (default %(default)s)")
150-
add_arg('--viterbi-maxlen', dest="viterbimaxlen", default=30,
151-
type=int, metavar='<int>',
152-
help="maximum construction length in Viterbi training "
153-
"and segmentation (default %(default)s)")
154-
155-
# Options for semi-supervised model training
156-
add_arg = parser.add_argument_group(
157-
'semi-supervised training options').add_argument
158-
add_arg('-A', '--annotations', dest="annofile", default=None,
159-
metavar='<file>',
160-
help="load annotated data for semi-supervised learning")
161-
add_arg('-D', '--develset', dest="develfile", default=None,
162-
metavar='<file>',
163-
help="load annotated data for tuning the corpus weight parameter")
164-
add_arg('-w', '--corpusweight', dest="corpusweight", type=float,
165-
default=1.0, metavar='<float>',
166-
help="corpus weight parameter (default %(default)s); "
167-
"sets the initial value if --develset is used")
168-
add_arg('-W', '--annotationweight', dest="annotationweight",
169-
type=float, default=None, metavar='<float>',
170-
help="corpus weight parameter for annotated data (if unset, the "
171-
"weight is set to balance the number of tokens in annotated "
172-
"and unannotated data sets)")
173-
174-
# Options for logging
175-
add_arg = parser.add_argument_group('logging options').add_argument
176-
add_arg('-v', '--verbose', dest="verbose", type=int, default=1,
177-
metavar='<int>',
178-
help="verbose level; controls what is written to the standard "
179-
"error stream or log file (default %(default)s)")
180-
add_arg('--logfile', dest='log_file', metavar='<file>',
181-
help="write log messages to file in addition to standard "
182-
"error stream")
183-
add_arg('--progressbar', dest='progress', default=False,
184-
action='store_true',
185-
help="Force the progressbar to be displayed (possibly lowers the "
186-
"log level for the standard error stream)")
187-
188-
add_arg = parser.add_argument_group('other options').add_argument
189-
add_arg('-h', '--help', action='help',
190-
help="show this help message and exit")
191-
add_arg('--version', action='version',
192-
version='%(prog)s ' + morfessor.__version__,
193-
help="show version number and exit")
194-
195-
args = parser.parse_args(argv[1:])
196-
197-
if args.verbose >= 2:
198-
loglevel = logging.DEBUG
199-
elif args.verbose >= 1:
200-
loglevel = logging.INFO
201-
else:
202-
loglevel = logging.WARNING
203-
204-
logging_format = '%(asctime)s - %(message)s'
205-
date_format = '%Y-%m-%d %H:%M:%S'
206-
default_formatter = logging.Formatter(logging_format, date_format)
207-
plain_formatter = logging.Formatter('%(message)s')
208-
logging.basicConfig(level=loglevel)
209-
_logger.propagate = False # do not forward messages to the root logger
210-
211-
# Basic settings for logging to the error stream
212-
ch = logging.StreamHandler()
213-
ch.setLevel(loglevel)
214-
ch.setFormatter(plain_formatter)
215-
_logger.addHandler(ch)
216-
217-
# Settings for when log_file is present
218-
if args.log_file is not None:
219-
fh = logging.FileHandler(args.log_file, 'w')
220-
fh.setLevel(loglevel)
221-
fh.setFormatter(default_formatter)
222-
_logger.addHandler(fh)
223-
# If logging to a file, make INFO the highest level for the
224-
# error stream
225-
ch.setLevel(max(loglevel, logging.INFO))
226-
227-
# If debug messages are printed to screen or if stderr is not a tty (but
228-
# a pipe or a file), don't show the progressbar
229-
global show_progress_bar
230-
if (ch.level > logging.INFO or
231-
(hasattr(sys.stderr, 'isatty') and not sys.stderr.isatty())):
232-
show_progress_bar = False
233-
234-
if args.progress:
235-
show_progress_bar = True
236-
ch.setLevel(min(ch.level, logging.INFO))
237-
238-
if len(args.trainfiles) == 0:
239-
parser.error("not training data files defined")
240-
241-
if args.randseed is not None:
242-
random.seed(args.randseed)
243-
244-
io = morfessor.MorfessorIO(encoding=args.encoding,
245-
compound_separator=args.cseparator,
246-
atom_separator=args.separator)
247-
248-
# Load exisiting model or create a new one
249-
if args.loadfile is not None:
250-
model = io.read_binary_model_file(args.loadfile)
251-
252-
else:
253-
model = morfessor.BaselineModel(forcesplit_list=args.forcesplit,
254-
corpusweight=args.corpusweight,
255-
use_skips=args.skips)
256-
257-
if args.loadsegfile is not None:
258-
model.load_segmentations(io.read_segmentation_file(args.loadsegfile))
259-
260-
analysis_sep = (args.analysisseparator
261-
if args.analysisseparator != 'NONE' else None)
262-
263-
if args.annofile is not None:
264-
annotations = io.read_annotations_file(args.annofile,
265-
analysis_sep=analysis_sep)
266-
model.set_annotations(annotations, args.annotationweight)
267-
268-
if args.develfile is not None:
269-
develannots = io.read_annotations_file(args.develfile,
270-
analysis_sep=analysis_sep)
271-
else:
272-
develannots = None
273-
274-
# Set frequency dampening function
275-
if args.dampening == 'none':
276-
dampfunc = lambda x: x
277-
elif args.dampening == 'log':
278-
dampfunc = lambda x: int(round(math.log(x + 1, 2)))
279-
elif args.dampening == 'ones':
280-
dampfunc = lambda x: 1
281-
else:
282-
parser.error("unknown dampening type '%s'" % args.dampening)
283-
284-
# Set algorithm parameters
285-
if args.algorithm == 'viterbi':
286-
algparams = (args.viterbismooth, args.viterbimaxlen)
287-
else:
288-
algparams = ()
289-
290-
# Train model
291-
if args.trainmode == 'none':
292-
pass
293-
elif args.trainmode == 'batch':
294-
if len(model.get_compounds()) == 0:
295-
_logger.warning("Model contains no compounds for batch training."
296-
" Use 'init+batch' mode to add new data.")
297-
else:
298-
if len(args.trainfiles) > 0:
299-
_logger.warning("Training mode 'batch' ignores new data "
300-
"files. Use 'init+batch' or 'online' to "
301-
"add new compounds.")
302-
ts = time.time()
303-
e, c = model.train_batch(args.algorithm, algparams, develannots,
304-
args.finish_threshold)
305-
te = time.time()
306-
_logger.info("Epochs: %s" % e)
307-
_logger.info("Final cost: %s" % c)
308-
_logger.info("Training time: %.3fs" % (te - ts))
309-
elif len(args.trainfiles) > 0:
310-
ts = time.time()
311-
if args.trainmode == 'init':
312-
for f in args.trainfiles:
313-
if args.list:
314-
data = io.read_corpus_list_file(f)
315-
else:
316-
data = io.read_corpus_file(f)
317-
c = model.load_data(data, args.freqthreshold, dampfunc,
318-
args.splitprob)
319-
elif args.trainmode == 'init+batch':
320-
for f in args.trainfiles:
321-
if args.list:
322-
data = io.read_corpus_list_file(f)
323-
else:
324-
data = io.read_corpus_file(f)
325-
model.load_data(data, args.freqthreshold, dampfunc,
326-
args.splitprob)
327-
e, c = model.train_batch(args.algorithm, algparams, develannots,
328-
args.finish_threshold)
329-
_logger.info("Epochs: %s" % e)
330-
elif args.trainmode == 'online':
331-
data = io.read_corpus_files(args.trainfiles)
332-
e, c = model.train_online(data, dampfunc, args.epochinterval,
333-
args.algorithm, algparams,
334-
args.splitprob)
335-
_logger.info("Epochs: %s" % e)
336-
elif args.trainmode == 'online+batch':
337-
data = io.read_corpus_files(args.trainfiles)
338-
e, c = model.train_online(data, dampfunc, args.epochinterval,
339-
args.algorithm, algparams,
340-
args.splitprob)
341-
e, c = model.train_batch(args.algorithm, algparams, develannots,
342-
args.finish_threshold)
343-
_logger.info("Epochs: %s" % e)
344-
else:
345-
parser.error("unknown training mode '%s'" % args.trainmode)
346-
te = time.time()
347-
_logger.info("Final cost: %s" % c)
348-
_logger.info("Training time: %.3fs" % (te - ts))
349-
else:
350-
_logger.warning("No training data files specified.")
351-
352-
# Save model
353-
if args.savefile is not None:
354-
io.write_binary_model_file(args.savefile, model)
355-
356-
if args.savesegfile is not None:
357-
io.write_segmentation_file(args.savesegfile, model.get_segmentations())
35+
try:
36+
args = parser.parse_args(argv)
37+
morfessor.main(args)
38+
except morfessor.ArgumentException as e:
39+
parser.error(e.message)
40+
except Exception as e:
41+
_logger.error("Fatal Error %s %s" % (type(e), str(e)))
42+
raise
35843

359-
# Output lexicon
360-
if args.lexfile is not None:
361-
io.write_lexicon_file(args.lexfile, model.get_constructions())
36244

363-
main(sys.argv)
45+
if __name__ == "__main__":
46+
main(sys.argv[1:])

0 commit comments

Comments
 (0)
Please sign in to comment.