Skip to content

Commit f6bc1a4

Browse files
committed
Merge branch 'develop'
2 parents aaec893 + 74a9207 commit f6bc1a4

8 files changed

Lines changed: 405 additions & 83 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ OldSqueakyCleanText/
170170
snap.py
171171
CLAUDE.md
172172
SPECS.md
173+
docs/plans/
173174
squeakycleantext-explorer.html
174175
ralph-loop-prompt.md
175176
.claude/

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ dependencies = [
3333
"huggingface-hub>=1.4.0",
3434
"numpy>=2.0.0",
3535
"presidio_anonymizer>=2.2.355",
36+
"regex>=2024.0.0",
3637
]
3738

3839
[project.urls]

sct/config.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
'DUTCH': 'rhnfzl/xlm-roberta-large-conll02-dutch-onnx',
3030
'GERMAN': 'rhnfzl/xlm-roberta-large-conll03-german-onnx',
3131
'SPANISH': 'rhnfzl/xlm-roberta-large-conll02-spanish-onnx',
32+
# FR/PT/IT: no dedicated ONNX model available; map to MULTILINGUAL fallback
33+
'FRENCH': 'rhnfzl/wikineural-multilingual-ner-onnx',
34+
'PORTUGUESE': 'rhnfzl/wikineural-multilingual-ner-onnx',
35+
'ITALIAN': 'rhnfzl/wikineural-multilingual-ner-onnx',
3236
'MULTILINGUAL': 'rhnfzl/wikineural-multilingual-ner-onnx',
3337
}
3438

@@ -37,9 +41,26 @@
3741
'DUTCH': 'FacebookAI/xlm-roberta-large-finetuned-conll02-dutch',
3842
'GERMAN': 'FacebookAI/xlm-roberta-large-finetuned-conll03-german',
3943
'SPANISH': 'FacebookAI/xlm-roberta-large-finetuned-conll02-spanish',
44+
# FR/PT/IT: no dedicated Torch model available; map to MULTILINGUAL fallback
45+
'FRENCH': 'Babelscape/wikineural-multilingual-ner',
46+
'PORTUGUESE': 'Babelscape/wikineural-multilingual-ner',
47+
'ITALIAN': 'Babelscape/wikineural-multilingual-ner',
4048
'MULTILINGUAL': 'Babelscape/wikineural-multilingual-ner',
4149
}
4250

51+
# NER ensemble: ordered model keys to run per input language.
52+
# Keys must exist in the resolved ner_models dict.
53+
# NL/DE/ES get a 3-model ensemble (lang-specific + English CoNLL + Multilingual);
# EN gets a 2-model ensemble (English CoNLL + Multilingual, since its
# lang-specific model is the English CoNLL model itself).
54+
# All other languages (FR, PT, IT, etc.) fall back to NER_ENSEMBLE_DEFAULT_KEYS.
55+
DEFAULT_NER_ENSEMBLE: dict[str, tuple] = {
56+
'ENGLISH': ('ENGLISH', 'MULTILINGUAL'),
57+
'DUTCH': ('DUTCH', 'ENGLISH', 'MULTILINGUAL'),
58+
'GERMAN': ('GERMAN', 'ENGLISH', 'MULTILINGUAL'),
59+
'SPANISH': ('SPANISH', 'ENGLISH', 'MULTILINGUAL'),
60+
}
61+
# Fallback for any language not in DEFAULT_NER_ENSEMBLE (FR, PT, IT, etc.)
62+
NER_ENSEMBLE_DEFAULT_KEYS: tuple = ('MULTILINGUAL', 'ENGLISH')
63+
4364

4465
@dataclass(frozen=True)
4566
class TextCleanerConfig:
@@ -100,6 +121,7 @@ class TextCleanerConfig:
100121
# NER settings
101122
positional_tags: Tuple[str, ...] = ('PER', 'LOC', 'ORG', 'MISC')
102123
ner_confidence_threshold: float = 0.85
124+
ner_batch_size: int = 8
103125
language: Optional[str] = None
104126

105127
# NER backend selection
@@ -111,6 +133,16 @@ class TextCleanerConfig:
111133
# Missing keys are filled from DEFAULT_NER_MODELS.
112134
ner_models: Optional[Mapping[str, str]] = None
113135

136+
# Ensemble routing: maps each input language to an ordered tuple of model keys
137+
# to run. Keys must exist in the resolved ner_models dict.
138+
# If None, DEFAULT_NER_ENSEMBLE is used (with NER_ENSEMBLE_DEFAULT_KEYS fallback).
139+
ner_ensemble: Optional[Mapping[str, Tuple[str, ...]]] = None
140+
141+
# Fallback keys for any language not listed in ner_ensemble (e.g. FR, PT, IT).
142+
# If None, NER_ENSEMBLE_DEFAULT_KEYS is used. Pass an empty tuple to skip
143+
# ensemble inference for unmapped languages.
144+
ner_ensemble_default_keys: Optional[Tuple[str, ...]] = None
145+
114146
# Deprecated: positional tuple (English, Dutch, German, Spanish, Multilingual).
115147
# Use ner_models dict instead. Kept for backward compatibility.
116148
ner_models_list: Tuple[str, ...] = (
@@ -131,6 +163,10 @@ class TextCleanerConfig:
131163
gliner_label_map: Optional[Mapping[str, str]] = None
132164
gliner_threshold: float = 0.4
133165

166+
# Plugin: user-provided pipeline steps, each callable (text: str) -> str.
167+
# Appended after all built-in steps in _init_pipeline().
168+
custom_pipeline_steps: Tuple = ()
169+
134170
# Language extensibility — extend Lingua detection, stopwords, dates, NER
135171
extra_languages: Tuple[str, ...] = ()
136172
custom_stopwords: Optional[Mapping[str, frozenset]] = None
@@ -145,6 +181,13 @@ def __post_init__(self):
145181
object.__setattr__(self, 'positional_tags', tuple(self.positional_tags))
146182
if isinstance(self.ner_models_list, list):
147183
object.__setattr__(self, 'ner_models_list', tuple(self.ner_models_list))
184+
if isinstance(self.custom_pipeline_steps, list):
185+
object.__setattr__(self, 'custom_pipeline_steps', tuple(self.custom_pipeline_steps))
186+
for step in self.custom_pipeline_steps:
187+
if not callable(step):
188+
raise ValueError(
189+
f"custom_pipeline_steps must contain callables, got: {type(step)!r}"
190+
)
148191

149192
# --- NER backend validation ---
150193
if self.ner_backend not in VALID_NER_BACKENDS:
@@ -153,6 +196,12 @@ def __post_init__(self):
153196
f"got: {self.ner_backend!r}"
154197
)
155198

199+
# ner_batch_size must be positive
200+
if self.ner_batch_size <= 0:
201+
raise ValueError(
202+
f"ner_batch_size must be >= 1, got {self.ner_batch_size}"
203+
)
204+
156205
# GLiNER fields required for gliner/ensemble backends
157206
needs_gliner = self.ner_backend in ('gliner', 'ensemble_onnx', 'ensemble_torch')
158207
if needs_gliner:
@@ -204,6 +253,12 @@ def __post_init__(self):
204253
)
205254
else:
206255
# No dict provided — derive from ner_models_list (may be default or custom)
256+
import warnings
257+
warnings.warn(
258+
"ner_models_list is deprecated. Use ner_models dict instead.",
259+
DeprecationWarning,
260+
stacklevel=2,
261+
)
207262
models_dict = dict(zip(LANG_KEYS, self.ner_models_list))
208263
object.__setattr__(self, 'ner_models', MappingProxyType(models_dict))
209264

sct/sct.py

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""
22
Comprehensive text cleaning and preprocessing pipeline.
33
"""
4+
import asyncio
45
import logging
56
import os
6-
import threading
77
from concurrent.futures import ThreadPoolExecutor
88
from typing import List, Tuple, Optional
99

@@ -12,12 +12,6 @@
1212

1313
logger = logging.getLogger(__name__)
1414

15-
# Thread-local storage for passing detected language to pipeline steps
16-
# that need it (e.g. fuzzy date replacement). Each thread running
17-
# _process_single sets its own value, so there is no cross-thread leaking.
18-
_thread_ctx = threading.local()
19-
20-
2115
class TextCleaner:
2216

2317
def __init__(self, cfg: Optional[TextCleanerConfig] = None):
@@ -84,16 +78,25 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
8478
ner_backend=self.cfg.ner_backend,
8579
gliner_config=gliner_config,
8680
torch_model_names=torch_model_names,
81+
ner_batch_size=self.cfg.ner_batch_size,
82+
ensemble_models=dict(self.cfg.ner_ensemble) if self.cfg.ner_ensemble is not None else None,
83+
ensemble_default_keys=tuple(self.cfg.ner_ensemble_default_keys) if self.cfg.ner_ensemble_default_keys is not None else None,
8784
)
8885
else:
8986
self.GeneralNER = None
9087

9188
self.batch_size = 8
9289
self._pipeline = []
90+
self._post_fuzzy_pipeline = []
9391
self._init_pipeline()
9492

9593
def _init_pipeline(self):
96-
"""Build the pipeline steps list based on config."""
94+
"""Build the pipeline steps lists based on config.
95+
96+
Steps that require language context (fuzzy date replacement) are split
97+
into pre- and post-fuzzy lists. _process_single calls them in order:
98+
self._pipeline → fuzzy (explicit, with ctx) → self._post_fuzzy_pipeline.
99+
"""
97100
cfg = self.cfg
98101

99102
if cfg.check_fix_bad_unicode:
@@ -110,22 +113,25 @@ def _init_pipeline(self):
110113
self._pipeline.append(self._replace_emails)
111114
if cfg.check_replace_dates:
112115
self._pipeline.append(self._replace_dates)
113-
if cfg.check_fuzzy_replace_dates:
114-
self._pipeline.append(self._fuzzy_replace_dates)
116+
# fuzzy_replace_dates runs here (between replace_dates and replace_years)
117+
# but is called explicitly in _process_single with language context.
115118
if cfg.check_replace_years:
116-
self._pipeline.append(self._replace_years)
119+
self._post_fuzzy_pipeline.append(self._replace_years)
117120
if cfg.check_replace_phone_numbers:
118-
self._pipeline.append(self._replace_phone_numbers)
121+
self._post_fuzzy_pipeline.append(self._replace_phone_numbers)
119122
if cfg.check_replace_numbers:
120-
self._pipeline.append(self._replace_numbers)
123+
self._post_fuzzy_pipeline.append(self._replace_numbers)
121124
if cfg.check_replace_currency_symbols:
122-
self._pipeline.append(self._replace_currency_symbols)
125+
self._post_fuzzy_pipeline.append(self._replace_currency_symbols)
123126
if cfg.check_remove_isolated_letters:
124-
self._pipeline.append(self._remove_isolated_letters)
127+
self._post_fuzzy_pipeline.append(self._remove_isolated_letters)
125128
if cfg.check_remove_isolated_special_symbols:
126-
self._pipeline.append(self._remove_isolated_special_symbols)
129+
self._post_fuzzy_pipeline.append(self._remove_isolated_special_symbols)
127130
if cfg.check_normalize_whitespace:
128-
self._pipeline.append(self._normalize_whitespace)
131+
self._post_fuzzy_pipeline.append(self._normalize_whitespace)
132+
# User-provided custom steps — appended after all built-in steps
133+
for step_fn in cfg.custom_pipeline_steps:
134+
self._post_fuzzy_pipeline.append(step_fn)
129135

130136
def _detect_language(self, text: str) -> Optional[str]:
131137
"""Detect language as a pure function (no instance mutation)."""
@@ -151,15 +157,30 @@ def _process_single(self, text: str) -> Tuple[str, Optional[str], Optional[str]]
151157
# Detect language (pure function, thread-safe)
152158
language = self._detect_language(text)
153159

154-
current_text = text
160+
# Pass language explicitly through pipeline context dict
161+
ctx = {"language": language}
155162

156-
# Store language in thread-local so pipeline steps can access it
157-
_thread_ctx.language = language
163+
current_text = text
158164

159-
# Apply non-NER pipeline steps
165+
# Pre-fuzzy pipeline steps (unicode fix → html → urls → emails → dates)
160166
for step in self._pipeline:
161167
current_text = step(current_text)
162168

169+
# Fuzzy date replacement — requires language context, called explicitly
170+
# to avoid thread-local; positioned between replace_dates and replace_years.
171+
if self.cfg.check_fuzzy_replace_dates:
172+
lang = ctx.get("language")
173+
current_text = self.ProcessDateTime.fuzzy_replace_dates(
174+
current_text,
175+
replace_with=self.cfg.replace_with_dates,
176+
score_cutoff=self.cfg.fuzzy_date_score_cutoff,
177+
language=lang,
178+
)
179+
180+
# Post-fuzzy pipeline steps (years → phones → numbers → symbols → whitespace)
181+
for step in self._post_fuzzy_pipeline:
182+
current_text = step(current_text)
183+
163184
# NER processing
164185
if self.cfg.check_ner_process and self.GeneralNER is not None:
165186
current_text = self.GeneralNER.ner_process(
@@ -217,6 +238,30 @@ def process_batch(self, texts: List[str], batch_size: int = None) -> List[Tuple[
217238

218239
return results
219240

241+
async def aprocess_batch(self, texts: List[str], batch_size: int = None) -> List[Tuple[str, Optional[str], Optional[str]]]:
242+
"""Async version of process_batch for use with asyncio-based frameworks (FastAPI, aiohttp).
243+
244+
Runs process_batch in a thread-pool executor so it doesn't block the event loop.
245+
"""
246+
loop = asyncio.get_running_loop()
247+
return await loop.run_in_executor(None, self.process_batch, texts, batch_size)
248+
249+
def warmup(self, languages: Optional[List[str]] = None) -> None:
250+
"""Pre-load NER models to avoid first-request latency.
251+
252+
Args:
253+
languages: Language names to pre-load (e.g. ``['ENGLISH', 'DUTCH']``).
254+
If ``None``, pre-loads models for all supported languages.
255+
"""
256+
if not self.cfg.check_ner_process or self.GeneralNER is None:
257+
return
258+
langs = languages or list(self.cfg.supported_languages)
259+
for lang in langs:
260+
try:
261+
self.GeneralNER.load_language(lang)
262+
except Exception as e:
263+
logger.warning("warmup: skipping %s: %s", lang, e)
264+
220265
def process(self, text: str) -> Tuple[str, Optional[str], Optional[str]]:
221266
"""Process a single text. Maintains backward compatibility.
222267
@@ -248,12 +293,12 @@ def _replace_emails(self, text):
248293
def _replace_dates(self, text):
249294
return self.ProcessDateTime.replace_dates(text, replace_with=self.cfg.replace_with_dates)
250295

251-
def _fuzzy_replace_dates(self, text):
296+
def _fuzzy_replace_dates(self, text, language=None):
252297
return self.ProcessDateTime.fuzzy_replace_dates(
253298
text,
254299
replace_with=self.cfg.replace_with_dates,
255300
score_cutoff=self.cfg.fuzzy_date_score_cutoff,
256-
language=getattr(_thread_ctx, 'language', 'ENGLISH'),
301+
language=language,
257302
)
258303

259304
def _replace_years(self, text):

sct/utils/constants.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import re
66
import string
7+
import regex
78

89
CURRENCIES = {
910
"$": "USD",
@@ -100,10 +101,11 @@
100101
YEAR_REGEX = re.compile(r"\b(19|20)\d{2}\b")
101102

102103
# ---------------------------------------------------------------------------
103-
# Multilingual month name vocabulary (EN, NL, DE, ES)
104+
# Multilingual month name vocabulary (EN, NL, DE, ES, FR, PT, IT)
104105
# Used by DATE_REGEX (exact match) and fuzzy_replace_dates (RapidFuzz).
105106
# Keys are canonical lowercase forms; values are frozensets of language codes
106-
# (some month names are shared across languages, e.g. "juni" is NL and DE).
107+
# (some month names are shared across languages, e.g. "juni" is NL and DE,
108+
# "agosto" is ES/PT/IT, "marzo" is ES/IT, "novembre" is FR/IT).
107109
# ---------------------------------------------------------------------------
108110
MONTH_NAMES_MULTILINGUAL = {
109111
# English
@@ -130,13 +132,38 @@
130132
"mär": frozenset({"de"}), "mrz": frozenset({"de"}), "dez": frozenset({"de"}),
131133
# Spanish
132134
"enero": frozenset({"es"}), "febrero": frozenset({"es"}),
133-
"marzo": frozenset({"es"}), "abril": frozenset({"es"}),
135+
"marzo": frozenset({"es", "it"}), "abril": frozenset({"es", "pt"}),
134136
"mayo": frozenset({"es"}), "junio": frozenset({"es"}),
135-
"julio": frozenset({"es"}), "agosto": frozenset({"es"}),
137+
"julio": frozenset({"es"}), "agosto": frozenset({"es", "pt", "it"}),
136138
"septiembre": frozenset({"es"}), "octubre": frozenset({"es"}),
137139
"noviembre": frozenset({"es"}), "diciembre": frozenset({"es"}),
138140
"ene": frozenset({"es"}), "abr": frozenset({"es"}),
139141
"ago": frozenset({"es"}), "dic": frozenset({"es"}),
142+
# French
143+
"janvier": frozenset({"fr"}), "février": frozenset({"fr"}),
144+
"mars": frozenset({"fr"}), "avril": frozenset({"fr"}),
145+
"juin": frozenset({"fr"}), "juillet": frozenset({"fr"}),
146+
"août": frozenset({"fr"}), "septembre": frozenset({"fr"}),
147+
"octobre": frozenset({"fr"}), "novembre": frozenset({"fr", "it"}),
148+
"décembre": frozenset({"fr"}),
149+
"fév": frozenset({"fr"}), "avr": frozenset({"fr"}),
150+
"aoû": frozenset({"fr"}), "déc": frozenset({"fr"}),
151+
# Portuguese
152+
"janeiro": frozenset({"pt"}), "fevereiro": frozenset({"pt"}),
153+
"março": frozenset({"pt"}), "maio": frozenset({"pt"}),
154+
"junho": frozenset({"pt"}), "julho": frozenset({"pt"}),
155+
"setembro": frozenset({"pt"}), "outubro": frozenset({"pt"}),
156+
"novembro": frozenset({"pt"}), "dezembro": frozenset({"pt"}),
157+
"set": frozenset({"pt"}), "out": frozenset({"pt"}),
158+
# Italian
159+
"gennaio": frozenset({"it"}), "febbraio": frozenset({"it"}),
160+
"aprile": frozenset({"it"}), "maggio": frozenset({"it"}),
161+
"giugno": frozenset({"it"}), "luglio": frozenset({"it"}),
162+
"settembre": frozenset({"it"}), "ottobre": frozenset({"it"}),
163+
"dicembre": frozenset({"it"}),
164+
"gen": frozenset({"it"}), "mag": frozenset({"it"}),
165+
"giu": frozenset({"it"}), "lug": frozenset({"it"}),
166+
"ott": frozenset({"it"}),
140167
}
141168

142169
# Flat set for quick membership check (lowercase)
@@ -158,6 +185,9 @@
158185
"DUTCH": "nl",
159186
"GERMAN": "de",
160187
"SPANISH": "es",
188+
"FRENCH": "fr",
189+
"PORTUGUESE": "pt",
190+
"ITALIAN": "it",
161191
}
162192

163193
# Per-language fuzzy vocabulary (full names only, len > 4)
@@ -302,8 +332,17 @@ def build_fuzzy_vocabulary(month_names_dict=None, lang_to_vocab=None):
302332
# Pre-compiled punctuation pattern (was re-compiled on every call in special.py)
303333
PUNCTUATION_REGEX = re.compile('[' + re.escape(string.punctuation) + ']')
304334

305-
# Sentence boundary pattern for NER text splitting (legacy, kept for backward compat)
306-
SENTENCE_BOUNDARY_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[^\d])')
335+
# Sentence boundary pattern for NER text splitting
336+
# Used by ner.py split_text() as preferred split point before CHUNK_DELIMITERS.
337+
# Uses the `regex` library (drop-in for `re`) to support variable-length lookbehind.
338+
# (?<![A-Z][a-z]{0,3}\.) — do NOT split after abbreviation (Dr., Mr., Ltd., Corp.)
339+
# [A-Z][a-z]{0,3}\. covers: "Dr.", "Mr.", "Ltd.", "U." etc.
340+
# (?<=[.!?]) — must follow sentence-ending punctuation
341+
# \s+ — one or more whitespace
342+
# (?=[A-Z]) — must be followed by uppercase (sentence start)
343+
SENTENCE_BOUNDARY_PATTERN = regex.compile(
344+
r"(?<![A-Z][a-z]{0,3}\.)(?<=[.!?])\s+(?=[A-Z])"
345+
)
307346

308347
# Ordered delimiter hierarchy for text chunking (inspired by semchunk).
309348
# Tried from coarsest to finest; first delimiter that produces a split wins.

0 commit comments

Comments
 (0)