-
Notifications
You must be signed in to change notification settings - Fork 1
/
Converter.py
105 lines (91 loc) · 3.74 KB
/
Converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# credits to: https://stackoverflow.com/questions/14070060/convert-number-to-italian-and-italian-to-number-in-python
import re
class Converter:
    """Convert Italian number words (already joined into one string) to digits.

    The conversion works purely on strings: every recognized word maps to a
    digit string, and ``let2num`` merges those digit strings positionally.

    Attributes:
        NUMBERS_SEQ: tuple of ``(word, digits)`` pairs. Order matters: the
            pairs are turned into a regex alternation, so longer words that
            contain a shorter one (e.g. ``'sessanta'`` vs ``'sei'``) must be
            listed before the shorter word.
        NUMBERS: dictionary built from NUMBERS_SEQ (word -> digit string).
        TOKEN_REGEX: compiled regular expression used to split a text into
            number-word tokens.
    """

    def __init__(self):
        self.NUMBERS_SEQ = (
            ('dieci', '10'),
            ('undici', '11'),
            ('dodici', '12'),
            ('tredici', '13'),
            ('quattordici', '14'),
            ('quindici', '15'),
            ('sedici', '16'),
            # 'diciassette' is the correct spelling; the misspelled
            # 'diciasette' is kept so existing inputs keep converting.
            ('diciassette', '17'), ('diciasette', '17'),
            ('diciotto', '18'),
            ('diciannove', '19'),
            ('venti', '20'),
            ('trenta', '30'),
            ('quaranta', '40'),
            ('cinquanta', '50'),
            ('sessanta', '60'),
            ('settanta', '70'),
            ('ottanta', '80'),
            ('novanta', '90'),
            ('cento', '100'),
            ('mille', '1000'), ('mila', '1000'),
            ('milione', '1000000'), ('milioni', '1000000'),
            ('miliardo', '1000000000'), ('miliardi', '1000000000'),
            ('uno', '1'), ('un', '1'),
            ('due', '2'),
            ('tre', '3'),
            ('quattro', '4'),
            ('cinque', '5'),
            ('sei', '6'),
            ('sette', '7'),
            ('otto', '8'),
            ('nove', '9'),
        )
        self.NUMBERS = dict(self.NUMBERS_SEQ)
        # Each word is wrapped in a capturing group so that re.split() keeps
        # the matched words in its output instead of discarding them.
        self.TOKEN_REGEX = re.compile(
            '|'.join('(%s)' % num for num, val in self.NUMBERS_SEQ))

    def normalize_text(self, num_repr):
        """Return *num_repr* lower-cased with spaces and tabs removed.

        The result is in the form expected by ``let2num``.
        """
        # str.translate(None, ' \t') only exists on Python 2 byte strings and
        # raises TypeError on Python 3; str.replace works on both.
        return num_repr.lower().replace(' ', '').replace('\t', '')

    def let2num(self, num_repr):
        """Return the numeric representation of the number words in *num_repr*.

        *num_repr* is expected to be normalized already (see
        ``normalize_text``). On success the result is a digit string with
        ``'.'`` as thousand separator, e.g. ``'duemila'`` -> ``'2.000'``.

        The connectors ``'di'`` and ``'e'`` are skipped. Any other piece of
        text that is not a known number word stops the conversion, and that
        unrecognized piece is returned as-is (no exception is raised).
        """
        result = ''
        # re.split() yields None for groups that did not take part in a match
        # and '' between adjacent matches; both are dropped here.
        for token in (tok for tok in self.TOKEN_REGEX.split(num_repr) if tok):
            try:
                value = self.NUMBERS[token]
            except KeyError:
                if token not in ('di', 'e'):
                    # Not a number word: hand the offending piece back.
                    return token
                continue

            if token == 'miliardi':
                # Plural billions: scale what we have by appending 9 zeros.
                result += '0' * 9
            elif token in ('mila', 'milioni'):
                # Plural thousands/millions: take the multiplier from the
                # last (up to) three digits and put the zeros after it.
                zeros = '0' * value.count('0')
                piece = result[-3:].lstrip('0')
                result = result[:-len(piece) - len(zeros)] + piece + zeros
            elif not result:
                # First number word: start from its digits.
                result = value
            else:
                length = len(value)
                non_zero_values = len(value.strip('0'))
                if (token in ('cento', 'milione', 'miliardo')
                        and result[-1] != '0'):
                    # The previous units digit is a multiplier
                    # ('due' + 'cento' -> 200): move it in front of the zeros.
                    result = (result[:-length] +
                              result[-1] +
                              '0' * value.count('0'))
                    continue
                # Additive case: overwrite the trailing zeros of the running
                # result with the significant digits of this word.
                result = (result[:-length] +
                          value.rstrip('0') +
                          result[len(result) - length + non_zero_values:])
        return self.add_thousand_separator(result)

    def add_thousand_separator(self, s, sep='.'):
        """Return the digit string *s* with *sep* between groups of three.

        Groups are counted from the right, so only the leftmost group may be
        shorter than three characters. An empty *s* yields an empty string.
        """
        head = len(s) % 3
        groups = [s[:head]] if head else []
        groups.extend(s[i:i + 3] for i in range(head, len(s), 3))
        return sep.join(groups)