|
| 1 | +# Copyright 2024 Nisaba Authors. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +"""Alignment class for defining a relation between two expressions. |
| 16 | +
|
| 17 | +Attributes: |
| 18 | + alias: Alias of the Expression. |
| 19 | + left: Left side of the Alignment. |
| 20 | + right: Right side of the Alignment. |
| 21 | + preceding: Preceding context. |
| 22 | + following: Following context. |
| 23 | + from_bos: If True, preceding context starts from the beginning of the |
| 24 | + sequence. |
| 25 | + to_eos: If True, the following context ends at the end of the sequence. |
| 26 | + operation: Operation that represents the relation between the sides of the |
| 27 | + alignment. |
| 28 | + priority: Priority of the alignment. When the applied costs of multiple |
| 29 | + rules are equal, the rule with the highest priority will be applied. |
| 30 | + applied_cost: Cost of the alignment when it's applied in context. |
| 31 | + source: Source of the alignment. |
| 32 | +
|
| 33 | +For inspection and debugging purposes, alignments are represented as strings in |
| 34 | + the following format: |
| 35 | +
|
| 36 | +([<preceding context>] <input>:<output> [<following context>], operation) |
| 37 | +
|
| 38 | +Alignments can be defined in an inventory as a set of rules to build grammars, |
| 39 | + or to assess the structural correspondence of two expressions. |
| 40 | +
|
| 41 | +Example: |
| 42 | + `([grapheme:nasal] a:b [grapheme:vowel], alignable (0.00))` means that |
| 43 | + expression `a` is rewritten as expression `b` with `alignable` operation with |
| 44 | + 0 cost when it's preceded by a grapheme that corresponds to a nasal and |
| 45 | + followed by a grapheme that corresponds to a vowel. |
| 46 | +
|
| 47 | +Alignment sources: |
| 48 | + ALIGNER: Alignments from an aligner output that doesn't correspond to a |
| 49 | + predefined rule. Eg. identity or token boundary alignments. |
| 50 | + CONSTANT: Alignment class constants. |
| 51 | + ENGLISH: English alignables. |
| 52 | + FOREIGN: Alignables for foreign languages other than English. |
| 53 | + LEXICON: Alignables that come from a lexicon that will be prioritised over |
| 54 | + other rules. Eg. frequent affixes or high profile entity names. |
| 55 | + NATIVE: Alignables for the native language. |
| 56 | + SPELLOUT: Alignables for spelled out letters. |
| 57 | + UNSPECIFIED: Alignments from an unspecified source. |
| 58 | +""" |
| 59 | + |
| 60 | +import enum |
| 61 | +# from typing import Union |
| 62 | +from nisaba.scripts.natural_translit.utils import expression as exp |
| 63 | +from nisaba.scripts.natural_translit.utils import operation as op |
| 64 | +# from nisaba.scripts.natural_translit.utils |
| 65 | +# import type_op as ty |
| 66 | + |
| 67 | + |
class Alignment(exp.Expression):
  """An Expression that pairs a left side with a right side.

  A new instance starts with `exp.Expression.ANY` on both sides and in both
  contexts, the `unassigned` common operation, priority 0, the base cost of
  its operation as applied cost, and `Source.UNSPECIFIED` as source.

  The string form is
  `([<preceding>] <left>:<right> [<following>], <operation>)`, where the
  context parts and the operation part are left out when they are empty or
  unassigned.
  """

  class Source(enum.StrEnum):
    """Where an alignment comes from."""

    ALIGNER = 'aligner'
    CONSTANT = 'constant'
    ENGLISH = 'english'
    FOREIGN = 'foreign'
    LEXICON = 'lexicon'
    NATIVE = 'native'
    SPELLOUT = 'spellout'
    UNSPECIFIED = 'unspecified'

  def __init__(self, alias: str = ''):
    """Creates an unconstrained alignment with the given alias."""
    super().__init__(alias)
    # Sides and contexts all start out as the ANY expression.
    self.left = exp.Expression.ANY
    self.right = exp.Expression.ANY
    self.preceding = exp.Expression.ANY
    self.following = exp.Expression.ANY
    self.from_bos = False
    self.to_eos = False
    self.operation = op.Operation.COMMON.unassigned
    self.priority = 0
    # The applied cost defaults to the base cost of the operation set above.
    self.applied_cost = self.operation.base_cost
    self.source = Alignment.Source.UNSPECIFIED

  def _side_str(self, side: exp.Expression) -> str:
    """Returns the text of a single-item side, otherwise `str(side)`."""
    if not side.is_any() and len(side) == 1:
      return side.item(0).text
    return str(side)

  def _context_str(self, context: exp.Expression) -> str:
    """Returns `<left>:<right>` for an Alignment context, otherwise ''."""
    if not context.is_any() and isinstance(context, Alignment):
      left_text = self._side_str(context.left)
      right_text = self._side_str(context.right)
      return f'{left_text}:{right_text}'
    return ''

  def _pre_str(self) -> str:
    """Returns the bracketed preceding context, or '' if there is none."""
    context = self._context_str(self.preceding)
    bos_mark = exp.Atomic.CTRL.bos.text if self.from_bos else ''
    if not context:
      return ''
    return f'[{bos_mark}{context}] '

  def _fol_str(self) -> str:
    """Returns the bracketed following context, or '' if there is none."""
    context = self._context_str(self.following)
    eos_mark = exp.Atomic.CTRL.eos.text if self.to_eos else ''
    if not context:
      return ''
    return f' [{context}{eos_mark}]'

  def __str__(self):
    """Returns the inspection string of the alignment.

    The operation is appended after a comma only when
    `self.operation.is_assigned()` is true.
    """
    op_part = ''
    if self.operation.is_assigned():
      op_part = ', ' + str(self.operation)
    pre = self._pre_str()
    left_text = self._side_str(self.left)
    right_text = self._side_str(self.right)
    fol = self._fol_str()
    return f'({pre}{left_text}:{right_text}{fol}{op_part})'

  def tsv_row(self) -> str:
    """Returns the alignment as one tab-separated row.

    The columns are: alias, concatenated item texts of the left side,
    concatenated item texts of the right side, `operation.match` and the
    applied cost.
    """
    columns = (
        self.alias,
        ''.join(item.text for item in self.left),
        ''.join(item.text for item in self.right),
        str(self.operation.match),
        str(self.applied_cost),
    )
    return '\t'.join(columns)

  def _set_side(self, side: exp.Expression.OR_SYMBOL) -> exp.Expression:
    """Returns `side` as is if it's an Expression, else `Atomic.read(side)`."""
    return side if isinstance(side, exp.Expression) else exp.Atomic.read(side)

  @classmethod
  def simple(
      cls,
      left: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
      right: exp.Expression.OR_SYMBOL = exp.Expression.ANY,
  ) -> 'Alignment':
    """Builds an alignment from two sides, leaving all other fields default.

    Args:
      left: Left side; non-Expression values go through `exp.Atomic.read`.
      right: Right side; non-Expression values go through `exp.Atomic.read`.

    Returns:
      A new instance of `cls` with only `left` and `right` set.
    """
    result = cls()
    result.left = result._set_side(left)
    result.right = result._set_side(right)
    return result
| 154 | + |
| 155 | +# @classmethod |
| 156 | +# def constant( |
| 157 | +# cls, |
| 158 | +# alias: str = '', |
| 159 | +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 160 | +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 161 | +# operation: op.Operation = op.Operation.COMMON.unassigned, |
| 162 | +# ) -> 'Alignment': |
| 163 | +# alignment = cls(alias) |
| 164 | +# alignment.left = alignment._set_side(left) |
| 165 | +# alignment.right = alignment._set_side(right) |
| 166 | +# alignment.operation = operation |
| 167 | +# alignment.source = Alignment.Source.CONSTANT |
| 168 | +# return alignment |
| 169 | + |
| 170 | + # TODO: Expand context to allow Cat and Or of alignments. |
| 171 | + # Eg. a rule with `preceding=((vowel_grapheme:any) | (any:vowel_phoneme)))` |
| 172 | + # will apply if it's preceded by an alignment that has a vowel grapheme on the |
| 173 | + # left side or a vowel phoneme on the right side, regardless of what they are |
| 174 | + # aligned with. |
| 175 | +# def _set_context( |
| 176 | +# self, left: exp.Expression.OR_SYMBOL, right: exp.Expression.OR_SYMBOL |
| 177 | +# ) -> 'Alignment': |
| 178 | +# if left.is_any() and right.is_any(): |
| 179 | +# return Alignment.ANY |
| 180 | +# return Alignment.simple(left, right) |
| 181 | + |
| 182 | +# @classmethod |
| 183 | +# def rule( |
| 184 | +# cls, |
| 185 | +# alias: str = '', |
| 186 | +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 187 | +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 188 | +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 189 | +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 190 | +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 191 | +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 192 | +# from_bos: bool = False, |
| 193 | +# to_eos: bool = False, |
| 194 | +# operation: op.Operation = op.Operation.COMMON.alignable, |
| 195 | +# priority: int = 0, |
| 196 | +# applied_cost: Union[float, ty.Nothing] = ty.UNSPECIFIED, |
| 197 | +# source: Source = Source.UNSPECIFIED, |
| 198 | +# ) -> 'Alignment': |
| 199 | +# rule = cls() |
| 200 | +# rule.alias = alias |
| 201 | +# rule.left = rule._set_side(left) |
| 202 | +# rule.right = rule._set_side(right) |
| 203 | +# rule.preceding = rule._set_context(preceding_left, preceding_right) |
| 204 | +# rule.following = rule._set_context(following_left, following_right) |
| 205 | +# rule.from_bos = from_bos |
| 206 | +# rule.to_eos = to_eos |
| 207 | +# rule.operation = operation |
| 208 | +# rule.priority = priority |
| 209 | +# if isinstance(applied_cost, float): |
| 210 | +# rule.applied_cost = applied_cost |
| 211 | +# else: |
| 212 | +# rule.applied_cost = rule.operation.base_cost |
| 213 | +# rule.source = source |
| 214 | +# return rule |
| 215 | + |
| 216 | +# @classmethod |
| 217 | +# def deletion( |
| 218 | +# cls, |
| 219 | +# alias: str = '', |
| 220 | +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 221 | +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 222 | +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 223 | +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 224 | +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 225 | +# from_bos: bool = False, |
| 226 | +# to_eos: bool = False, |
| 227 | +# operation: op.Operation = op.Operation.COMMON.deletion, |
| 228 | +# priority: int = 0, |
| 229 | +# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED, |
| 230 | +# source: Source = Source.UNSPECIFIED, |
| 231 | +# ) -> 'Alignment': |
| 232 | +# return cls.rule( |
| 233 | +# alias, |
| 234 | +# left, |
| 235 | +# exp.Atomic.CTRL.eps, |
| 236 | +# preceding_left, |
| 237 | +# preceding_right, |
| 238 | +# following_left, |
| 239 | +# following_right, |
| 240 | +# from_bos, |
| 241 | +# to_eos, |
| 242 | +# operation, |
| 243 | +# priority, |
| 244 | +# applied_cost, |
| 245 | +# source, |
| 246 | +# ) |
| 247 | + |
| 248 | +# @classmethod |
| 249 | +# def insertion( |
| 250 | +# cls, |
| 251 | +# alias: str = '', |
| 252 | +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 253 | +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 254 | +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 255 | +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 256 | +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 257 | +# from_bos: bool = False, |
| 258 | +# to_eos: bool = False, |
| 259 | +# operation: op.Operation = op.Operation.COMMON.insertion, |
| 260 | +# priority: int = 0, |
| 261 | +# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED, |
| 262 | +# source: Source = Source.UNSPECIFIED, |
| 263 | +# ) -> 'Alignment': |
| 264 | +# return cls.rule( |
| 265 | +# alias, |
| 266 | +# exp.Atomic.CTRL.eps, |
| 267 | +# right, |
| 268 | +# preceding_left, |
| 269 | +# preceding_right, |
| 270 | +# following_left, |
| 271 | +# following_right, |
| 272 | +# from_bos, |
| 273 | +# to_eos, |
| 274 | +# operation, |
| 275 | +# priority, |
| 276 | +# applied_cost, |
| 277 | +# source, |
| 278 | +# ) |
| 279 | + |
| 280 | +# @classmethod |
| 281 | +# def interchangeable( |
| 282 | +# cls, |
| 283 | +# alias: str = '', |
| 284 | +# left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 285 | +# right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 286 | +# preceding_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 287 | +# preceding_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 288 | +# following_left: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 289 | +# following_right: exp.Expression.OR_SYMBOL = exp.Expression.ANY, |
| 290 | +# from_bos: bool = False, |
| 291 | +# to_eos: bool = False, |
| 292 | +# operation: op.Operation = op.Operation.COMMON.interchangeable, |
| 293 | +# priority: int = 0, |
| 294 | +# applied_cost: Union[int, float, ty.Nothing] = ty.UNSPECIFIED, |
| 295 | +# source: Source = Source.UNSPECIFIED, |
| 296 | +# ) -> tuple['Alignment', 'Alignment']: |
| 297 | +# common = ( |
| 298 | +# preceding_left, |
| 299 | +# preceding_right, |
| 300 | +# following_left, |
| 301 | +# following_right, |
| 302 | +# from_bos, |
| 303 | +# to_eos, |
| 304 | +# operation, |
| 305 | +# priority, |
| 306 | +# applied_cost, |
| 307 | +# source, |
| 308 | +# ) |
| 309 | +# left_to_right = cls.rule(alias + '_l2r', left, right, *common) |
| 310 | +# right_to_left = cls.rule(alias + '_r2l', right, left, *common) |
| 311 | +# return left_to_right, right_to_left |
| 312 | + |
| 313 | +# def is_any(self) -> bool: |
| 314 | +# return self.left.is_any() and self.right.is_any() |
| 315 | + |
| 316 | +# def is_eps(self) -> bool: |
| 317 | +# return self.left.is_eps() and self.right.is_eps() |
| 318 | + |
| 319 | +# def is_nor(self) -> bool: |
| 320 | +# return self.left.is_nor() and self.right.is_nor() |
| 321 | + |
| 322 | +# def _copy_context( |
| 323 | +# self, context: exp.Expression |
| 324 | +# ) -> tuple[exp.Expression, exp.Expression]: |
| 325 | +# if isinstance(context, Alignment): |
| 326 | +# return context.left.copy(), context.right.copy() |
| 327 | +# return exp.Expression.ANY, exp.Expression.ANY |
| 328 | + |
| 329 | +# def copy(self) -> 'Alignment': |
| 330 | +# if ( |
| 331 | +# self == Alignment.ANY |
| 332 | +# or self == Alignment.EPSILON |
| 333 | +# or self == Alignment.ERROR |
| 334 | +# ): |
| 335 | +# return self |
| 336 | +# return Alignment.rule( |
| 337 | +# self.alias, |
| 338 | +# self.left.copy(), |
| 339 | +# self.right.copy(), |
| 340 | +# *self._copy_context(self.preceding), |
| 341 | +# *self._copy_context(self.following), |
| 342 | +# self.from_bos, |
| 343 | +# self.to_eos, |
| 344 | +# self.operation, |
| 345 | +# self.priority, |
| 346 | +# self.applied_cost, |
| 347 | +# self.source, |
| 348 | +# ) |
| 349 | + |
| 350 | + |
| 351 | +# Alignment.ANY = Alignment.constant('any') |
| 352 | +# Alignment.EPSILON = Alignment.constant( |
| 353 | +# 'empty', exp.Atomic.CTRL.eps, exp.Atomic.CTRL.eps |
| 354 | +# ) |
| 355 | +# Alignment.ERROR = Alignment.constant( |
| 356 | +# 'error', |
| 357 | +# exp.Atomic.CTRL.nor, exp.Atomic.CTRL.nor, op.Operation.COMMON.error |
| 358 | +# ) |
0 commit comments