Skip to content

Commit 042951b

Browse files
isingoo authored and copybara-github committed
No public description
PiperOrigin-RevId: 630460036
1 parent e3fefc8 commit 042951b

File tree

7 files changed

+540
-7
lines changed

7 files changed

+540
-7
lines changed

nisaba/scripts/natural_translit/utils/BUILD.bazel

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,30 @@ py_test(
246246
],
247247
)
248248

249+
# Library target that builds alignment2.py.
# NOTE(review): ":type_op" is listed in deps, but the only `type_op` import
# visible in the Python source of this change is commented out; confirm the
# dep is needed before the commented-out code is enabled.
py_library(
250+
name = "alignment2",
251+
srcs = ["alignment2.py"],
252+
deps = [
253+
":expression",
254+
":operation",
255+
":type_op",
256+
],
257+
)
258+
259+
# Test target for the alignment2 library; alignment2_test.py is the entry
# point and absltest is the test framework dependency.
py_test(
260+
name = "alignment2_test",
261+
srcs = ["alignment2_test.py"],
262+
main = "alignment2_test.py",
263+
deps = [
264+
":alignment2",
265+
":expression",
266+
":operation",
267+
":symbol",
268+
":test_op",
269+
"@io_abseil_py//absl/testing:absltest",
270+
],
271+
)
272+
249273
py_library(
250274
name = "test_op",
251275
srcs = ["test_op.py"],
Lines changed: 358 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,358 @@
1+
# Copyright 2024 Nisaba Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Alignment class for defining a relation between two expressions.
16+
17+
Attributes:
18+
alias: Alias of the Expression.
19+
left: Left side of the Alignment.
20+
right: Right side of the Alignment.
21+
preceding: Preceding context.
22+
following: Following context.
23+
from_bos: If True, preceding context starts from the beginning of the
24+
sequence.
25+
to_eos: If True, the following context ends at the end of the sequence.
26+
operation: Operation that represents the relation between the sides of the
27+
alignment.
28+
priority: Priority of the alignment. When the applied costs of multiple
29+
rules are equal, the rule with the highest priority will be applied.
30+
applied_cost: Cost of the alignment when it's applied in context.
31+
source: Source of the alignment.
32+
33+
For inspection and debugging purposes, alignments are represented as strings in
34+
the following format:
35+
36+
([<preceding context>] <input>:<output> [<following context>], operation)
37+
38+
Alignments can be defined in an inventory as a set of rules to build grammars,
39+
or to assess the structural correspondence of two expressions.
40+
41+
Example:
42+
`([grapheme:nasal] a:b [grapheme:vowel], alignable (0.00))` means that
43+
expression `a` is rewritten as expression `b` with `alignable` operation with
44+
0 cost when it's preceded by a grapheme that corresponds to a nasal and
45+
followed by a grapheme that corresponds to a vowel.
46+
47+
Alignment sources:
48+
ALIGNER: Alignments from an aligner output that doesn't correspond to a
49+
predefined rule. Eg. identity or token boundary alignments.
50+
CONSTANT: Alignment class constants.
51+
ENGLISH: English alignables.
52+
FOREIGN: Alignables for foreign languages other than English.
53+
LEXICON: Alignables that come from a lexicon that will be prioritised over
54+
other rules. Eg. frequent affixes or high profile entity names.
55+
NATIVE: Alignables for the native language.
56+
SPELLOUT: Alignables for spelled out letters.
57+
UNSPECIFIED: Alignments from an unspecified source.
58+
"""
59+
60+
import enum
61+
# from typing import Union
62+
from nisaba.scripts.natural_translit.utils import expression as exp
63+
from nisaba.scripts.natural_translit.utils import operation as op
64+
# from nisaba.scripts.natural_translit.utils
65+
# import type_op as ty
66+
67+
68+
class Alignment(exp.Expression):
  """An Expression that relates a left and a right exp.Expression.

  A freshly constructed Alignment has `exp.Expression.ANY` on both sides and
  in both contexts, the `unassigned` common operation, priority 0, the
  operation's base cost as its applied cost, and an UNSPECIFIED source.

  The string form is
  `([<preceding>] <left>:<right> [<following>], <operation>)`, where the
  context brackets are omitted when the context has no text and the operation
  part is omitted when the operation is not assigned.
  """

  class Source(enum.StrEnum):
    """Labels for where an alignment comes from."""

    ALIGNER = 'aligner'
    CONSTANT = 'constant'
    ENGLISH = 'english'
    FOREIGN = 'foreign'
    LEXICON = 'lexicon'
    NATIVE = 'native'
    SPELLOUT = 'spellout'
    UNSPECIFIED = 'unspecified'

  def __init__(self, alias: str = ''):
    """Creates an unconstrained, unassigned alignment with the given alias."""
    super().__init__(alias)
    # Both sides and both contexts start out unconstrained.
    self.left = exp.Expression.ANY
    self.right = exp.Expression.ANY
    self.preceding = exp.Expression.ANY
    self.following = exp.Expression.ANY
    # Context is not anchored to the sequence boundaries by default.
    self.from_bos = False
    self.to_eos = False
    self.operation = op.Operation.COMMON.unassigned
    self.priority = 0
    # Must follow the operation assignment: the default cost is read from it.
    self.applied_cost = self.operation.base_cost
    self.source = Alignment.Source.UNSPECIFIED

  def _side_str(self, side: exp.Expression) -> str:
    """Returns the text of a single-item side, else the side's own str()."""
    is_single_item = not side.is_any() and len(side) == 1
    return side.item(0).text if is_single_item else str(side)

  def _context_str(self, context: exp.Expression) -> str:
    """Returns `<left>:<right>` for an Alignment context that is not any.

    Any other context yields an empty string.
    """
    if not context.is_any() and isinstance(context, Alignment):
      left_text = self._side_str(context.left)
      right_text = self._side_str(context.right)
      return '%s:%s' % (left_text, right_text)
    return ''

  def _pre_str(self) -> str:
    """Returns the bracketed preceding context followed by a space.

    The beginning-of-sequence text is put in front of the context when
    `from_bos` is set. The result is empty when the context has no text.
    """
    context_text = self._context_str(self.preceding)
    bos_text = exp.Atomic.CTRL.bos.text if self.from_bos else ''
    if not context_text:
      return ''
    return '[%s%s] ' % (bos_text, context_text)

  def _fol_str(self) -> str:
    """Returns the bracketed following context preceded by a space.

    The end-of-sequence text is put after the context when `to_eos` is set.
    The result is empty when the context has no text.
    """
    context_text = self._context_str(self.following)
    eos_text = exp.Atomic.CTRL.eos.text if self.to_eos else ''
    if not context_text:
      return ''
    return ' [%s%s]' % (context_text, eos_text)

  def __str__(self):
    """Returns the inspection string of the alignment."""
    # Only an assigned operation is shown.
    operation_text = ''
    if self.operation.is_assigned():
      operation_text = ', %s' % str(self.operation)
    pieces = (
        self._pre_str(),
        self._side_str(self.left),
        self._side_str(self.right),
        self._fol_str(),
        operation_text,
    )
    return '(%s%s:%s%s%s)' % pieces

  def tsv_row(self) -> str:
    """Returns alias, left text, right text, match and cost joined by tabs."""
    columns = (
        self.alias,
        ''.join(item.text for item in self.left),
        ''.join(item.text for item in self.right),
        str(self.operation.match),
        str(self.applied_cost),
    )
    return '\t'.join(columns)

  def _set_side(self, side: exp.Expression.OR_SYMBOL) -> exp.Expression:
    """Returns `side` if it is an Expression, else reads it as an Atomic."""
    if isinstance(side, exp.Expression):
      return side
    return exp.Atomic.read(side)

  @classmethod
  def simple(
      cls,
      left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
      right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
  ) -> 'Alignment':
    """Builds an alignment with only its left and right sides set.

    Args:
      left: Left side; a non-Expression value is read as an Atomic.
      right: Right side; a non-Expression value is read as an Atomic.

    Returns:
      A new alignment with every other attribute left at its default.
    """
    result = cls()
    result.left = result._set_side(left)
    result.right = result._set_side(right)
    return result
154+
155+
# @classmethod
156+
# def constant(
157+
# cls,
158+
# alias: str = '',
159+
# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
160+
# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
161+
# operation: op.Operation = op.Operation.COMMON.unassigned,
162+
# ) -> 'Alignment':
163+
# alignment = cls(alias)
164+
# alignment.left = alignment._set_side(left)
165+
# alignment.right = alignment._set_side(right)
166+
# alignment.operation = operation
167+
# alignment.source = Alignment.Source.CONSTANT
168+
# return alignment
169+
170+
# TODO: Expand context to allow Cat and Or of alignments.
171+
# Eg. a rule with `preceding=((vowel_grapheme:any) | (any:vowel_phoneme)))`
172+
# will apply if it's preceded by an alignment that has a vowel grapheme on the
173+
# left side or a vowel phoneme on the right side, regardless of what they are
174+
# aligned with.
175+
# def _set_context(
176+
# self, left: exp.Expression.OR_SYMBOL, right: exp.Expression.OR_SYMBOL
177+
# ) -> 'Alignment':
178+
# if left.is_any() and right.is_any():
179+
# return Alignment.ANY
180+
# return Alignment.simple(left, right)
181+
182+
# @classmethod
183+
# def rule(
184+
# cls,
185+
# alias: str = '',
186+
# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
187+
# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
188+
# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
189+
# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
190+
# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
191+
# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
192+
# from_bos: bool = False,
193+
# to_eos: bool = False,
194+
# operation: op.Operation = op.Operation.COMMON.alignable,
195+
# priority: int = 0,
196+
# applied_cost: Union[float, ty.Nothing] = ty.UNSPECIFIED,
197+
# source: Source = Source.UNSPECIFIED,
198+
# ) -> 'Alignment':
199+
# rule = cls()
200+
# rule.alias = alias
201+
# rule.left = rule._set_side(left)
202+
# rule.right = rule._set_side(right)
203+
# rule.preceding = rule._set_context(preceding_left, preceding_right)
204+
# rule.following = rule._set_context(following_left, following_right)
205+
# rule.from_bos = from_bos
206+
# rule.to_eos = to_eos
207+
# rule.operation = operation
208+
# rule.priority = priority
209+
# if isinstance(applied_cost, float):
210+
# rule.applied_cost = applied_cost
211+
# else:
212+
# rule.applied_cost = rule.operation.base_cost
213+
# rule.source = source
214+
# return rule
215+
216+
# @classmethod
217+
# def deletion(
218+
# cls,
219+
# alias: str = '',
220+
# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
221+
# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
222+
# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
223+
# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
224+
# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
225+
# from_bos: bool = False,
226+
# to_eos: bool = False,
227+
# operation: op.Operation = op.Operation.COMMON.deletion,
228+
# priority: int = 0,
229+
# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED,
230+
# source: Source = Source.UNSPECIFIED,
231+
# ) -> 'Alignment':
232+
# return cls.rule(
233+
# alias,
234+
# left,
235+
# exp.Atomic.CTRL.eps,
236+
# preceding_left,
237+
# preceding_right,
238+
# following_left,
239+
# following_right,
240+
# from_bos,
241+
# to_eos,
242+
# operation,
243+
# priority,
244+
# applied_cost,
245+
# source,
246+
# )
247+
248+
# @classmethod
249+
# def insertion(
250+
# cls,
251+
# alias: str = '',
252+
# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
253+
# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
254+
# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
255+
# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
256+
# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
257+
# from_bos: bool = False,
258+
# to_eos: bool = False,
259+
# operation: op.Operation = op.Operation.COMMON.insertion,
260+
# priority: int = 0,
261+
# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED,
262+
# source: Source = Source.UNSPECIFIED,
263+
# ) -> 'Alignment':
264+
# return cls.rule(
265+
# alias,
266+
# exp.Atomic.CTRL.eps,
267+
# right,
268+
# preceding_left,
269+
# preceding_right,
270+
# following_left,
271+
# following_right,
272+
# from_bos,
273+
# to_eos,
274+
# operation,
275+
# priority,
276+
# applied_cost,
277+
# source,
278+
# )
279+
280+
# @classmethod
281+
# def interchangeable(
282+
# cls,
283+
# alias: str = '',
284+
# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
285+
# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
286+
# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
287+
# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
288+
# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
289+
# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
290+
# from_bos: bool = False,
291+
# to_eos: bool = False,
292+
# operation: op.Operation = op.Operation.COMMON.interchangeable,
293+
# priority: int = 0,
294+
# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED,
295+
# source: Source = Source.UNSPECIFIED,
296+
# ) -> tuple['Alignment', 'Alignment']:
297+
# common = (
298+
# preceding_left,
299+
# preceding_right,
300+
# following_left,
301+
# following_right,
302+
# from_bos,
303+
# to_eos,
304+
# operation,
305+
# priority,
306+
# applied_cost,
307+
# source,
308+
# )
309+
# left_to_right = cls.rule(alias + '_l2r', left, right, *common)
310+
# right_to_left = cls.rule(alias + '_r2l', right, left, *common)
311+
# return left_to_right, right_to_left
312+
313+
# def is_any(self) -> bool:
314+
# return self.left.is_any() and self.right.is_any()
315+
316+
# def is_eps(self) -> bool:
317+
# return self.left.is_eps() and self.right.is_eps()
318+
319+
# def is_nor(self) -> bool:
320+
# return self.left.is_nor() and self.right.is_nor()
321+
322+
# def _copy_context(
323+
# self, context: exp.Expression
324+
# ) -> tuple[exp.Expression, exp.Expression]:
325+
# if isinstance(context, Alignment):
326+
# return context.left.copy(), context.right.copy()
327+
# return exp.Expression.ANY, exp.Expression.ANY
328+
329+
# def copy(self) -> 'Alignment':
330+
# if (
331+
# self == Alignment.ANY
332+
# or self == Alignment.EPSILON
333+
# or self == Alignment.ERROR
334+
# ):
335+
# return self
336+
# return Alignment.rule(
337+
# self.alias,
338+
# self.left.copy(),
339+
# self.right.copy(),
340+
# *self._copy_context(self.preceding),
341+
# *self._copy_context(self.following),
342+
# self.from_bos,
343+
# self.to_eos,
344+
# self.operation,
345+
# self.priority,
346+
# self.applied_cost,
347+
# self.source,
348+
# )
349+
350+
351+
# Alignment.ANY = Alignment.constant('any')
352+
# Alignment.EPSILON = Alignment.constant(
353+
# 'empty', exp.Atomic.CTRL.eps, exp.Atomic.CTRL.eps
354+
# )
355+
# Alignment.ERROR = Alignment.constant(
356+
# 'error',
357+
# exp.Atomic.CTRL.nor, exp.Atomic.CTRL.nor, op.Operation.COMMON.error
358+
# )

0 commit comments

Comments
 (0)