Skip to content

Commit f6bc1a4

Browse files
committed
Merge branch 'develop'
2 parents aaec893 + 74a9207 commit f6bc1a4

8 files changed

Lines changed: 405 additions & 83 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ OldSqueakyCleanText/
170170
snap.py
171171
CLAUDE.md
172172
SPECS.md
173+
docs/plans/
173174
squeakycleantext-explorer.html
174175
ralph-loop-prompt.md
175176
.claude/

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ dependencies = [
3333
"huggingface-hub>=1.4.0",
3434
"numpy>=2.0.0",
3535
"presidio_anonymizer>=2.2.355",
36+
"regex>=2024.0.0",
3637
]
3738

3839
[project.urls]

sct/config.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
'DUTCH': 'rhnfzl/xlm-roberta-large-conll02-dutch-onnx',
3030
'GERMAN': 'rhnfzl/xlm-roberta-large-conll03-german-onnx',
3131
'SPANISH': 'rhnfzl/xlm-roberta-large-conll02-spanish-onnx',
32+
# FR/PT/IT: no dedicated ONNX model available; map to MULTILINGUAL fallback
33+
'FRENCH': 'rhnfzl/wikineural-multilingual-ner-onnx',
34+
'PORTUGUESE': 'rhnfzl/wikineural-multilingual-ner-onnx',
35+
'ITALIAN': 'rhnfzl/wikineural-multilingual-ner-onnx',
3236
'MULTILINGUAL': 'rhnfzl/wikineural-multilingual-ner-onnx',
3337
}
3438

@@ -37,9 +41,26 @@
3741
'DUTCH': 'FacebookAI/xlm-roberta-large-finetuned-conll02-dutch',
3842
'GERMAN': 'FacebookAI/xlm-roberta-large-finetuned-conll03-german',
3943
'SPANISH': 'FacebookAI/xlm-roberta-large-finetuned-conll02-spanish',
44+
# FR/PT/IT: no dedicated Torch model available; map to MULTILINGUAL fallback
45+
'FRENCH': 'Babelscape/wikineural-multilingual-ner',
46+
'PORTUGUESE': 'Babelscape/wikineural-multilingual-ner',
47+
'ITALIAN': 'Babelscape/wikineural-multilingual-ner',
4048
'MULTILINGUAL': 'Babelscape/wikineural-multilingual-ner',
4149
}
4250

51+
# NER ensemble: ordered model keys to run per input language.
52+
# Keys must exist in the resolved ner_models dict.
53+
# NL/DE/ES get a 3-model ensemble (lang-specific + English CoNLL + Multilingual);
# EN gets a 2-model ensemble (English CoNLL + Multilingual, since its
# lang-specific model is the English CoNLL model itself).
54+
# All other languages (FR, PT, IT, etc.) fall back to NER_ENSEMBLE_DEFAULT_KEYS.
55+
DEFAULT_NER_ENSEMBLE: dict[str, tuple] = {
56+
'ENGLISH': ('ENGLISH', 'MULTILINGUAL'),
57+
'DUTCH': ('DUTCH', 'ENGLISH', 'MULTILINGUAL'),
58+
'GERMAN': ('GERMAN', 'ENGLISH', 'MULTILINGUAL'),
59+
'SPANISH': ('SPANISH', 'ENGLISH', 'MULTILINGUAL'),
60+
}
61+
# Fallback for any language not in DEFAULT_NER_ENSEMBLE (FR, PT, IT, etc.)
62+
NER_ENSEMBLE_DEFAULT_KEYS: tuple = ('MULTILINGUAL', 'ENGLISH')
63+
4364

4465
@dataclass(frozen=True)
4566
class TextCleanerConfig:
@@ -100,6 +121,7 @@ class TextCleanerConfig:
100121
# NER settings
101122
positional_tags: Tuple[str, ...] = ('PER', 'LOC', 'ORG', 'MISC')
102123
ner_confidence_threshold: float = 0.85
124+
ner_batch_size: int = 8
103125
language: Optional[str] = None
104126

105127
# NER backend selection
@@ -111,6 +133,16 @@ class TextCleanerConfig:
111133
# Missing keys are filled from DEFAULT_NER_MODELS.
112134
ner_models: Optional[Mapping[str, str]] = None
113135

136+
# Ensemble routing: maps each input language to an ordered tuple of model keys
137+
# to run. Keys must exist in the resolved ner_models dict.
138+
# If None, DEFAULT_NER_ENSEMBLE is used (with NER_ENSEMBLE_DEFAULT_KEYS fallback).
139+
ner_ensemble: Optional[Mapping[str, Tuple[str, ...]]] = None
140+
141+
# Fallback keys for any language not listed in ner_ensemble (e.g. FR, PT, IT).
142+
# If None, NER_ENSEMBLE_DEFAULT_KEYS is used. Pass an empty tuple to skip
143+
# ensemble inference for unmapped languages.
144+
ner_ensemble_default_keys: Optional[Tuple[str, ...]] = None
145+
114146
# Deprecated: positional tuple (English, Dutch, German, Spanish, Multilingual).
115147
# Use ner_models dict instead. Kept for backward compatibility.
116148
ner_models_list: Tuple[str, ...] = (
@@ -131,6 +163,10 @@ class TextCleanerConfig:
131163
gliner_label_map: Optional[Mapping[str, str]] = None
132164
gliner_threshold: float = 0.4
133165

166+
# Plugin: user-provided pipeline steps, each callable (text: str) -> str.
167+
# Appended after all built-in steps in _init_pipeline().
168+
custom_pipeline_steps: Tuple = ()
169+
134170
# Language extensibility — extend Lingua detection, stopwords, dates, NER
135171
extra_languages: Tuple[str, ...] = ()
136172
custom_stopwords: Optional[Mapping[str, frozenset]] = None
@@ -145,6 +181,13 @@ def __post_init__(self):
145181
object.__setattr__(self, 'positional_tags', tuple(self.positional_tags))
146182
if isinstance(self.ner_models_list, list):
147183
object.__setattr__(self, 'ner_models_list', tuple(self.ner_models_list))
184+
if isinstance(self.custom_pipeline_steps, list):
185+
object.__setattr__(self, 'custom_pipeline_steps', tuple(self.custom_pipeline_steps))
186+
for step in self.custom_pipeline_steps:
187+
if not callable(step):
188+
raise ValueError(
189+
f"custom_pipeline_steps must contain callables, got: {type(step)!r}"
190+
)
148191

149192
# --- NER backend validation ---
150193
if self.ner_backend not in VALID_NER_BACKENDS:
@@ -153,6 +196,12 @@ def __post_init__(self):
153196
f"got: {self.ner_backend!r}"
154197
)
155198

199+
# ner_batch_size must be positive
200+
if self.ner_batch_size <= 0:
201+
raise ValueError(
202+
f"ner_batch_size must be >= 1, got {self.ner_batch_size}"
203+
)
204+
156205
# GLiNER fields required for gliner/ensemble backends
157206
needs_gliner = self.ner_backend in ('gliner', 'ensemble_onnx', 'ensemble_torch')
158207
if needs_gliner:
@@ -204,6 +253,12 @@ def __post_init__(self):
204253
)
205254
else:
206255
# No dict provided — derive from ner_models_list (may be default or custom)
256+
import warnings
257+
warnings.warn(
258+
"ner_models_list is deprecated. Use ner_models dict instead.",
259+
DeprecationWarning,
260+
stacklevel=2,
261+
)
207262
models_dict = dict(zip(LANG_KEYS, self.ner_models_list))
208263
object.__setattr__(self, 'ner_models', MappingProxyType(models_dict))
209264

sct/sct.py

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""
22
Comprehensive text cleaning and preprocessing pipeline.
33
"""
4+
import asyncio
45
import logging
56
import os
6-
import threading
77
from concurrent.futures import ThreadPoolExecutor
88
from typing import List, Tuple, Optional
99

@@ -12,12 +12,6 @@
1212

1313
logger = logging.getLogger(__name__)
1414

15-
# Thread-local storage for passing detected language to pipeline steps
16-
# that need it (e.g. fuzzy date replacement). Each thread running
17-
# _process_single sets its own value, so there is no cross-thread leaking.
18-
_thread_ctx = threading.local()
19-
20-
2115
class TextCleaner:
2216

2317
def __init__(self, cfg: Optional[TextCleanerConfig] = None):
@@ -84,16 +78,25 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
8478
ner_backend=self.cfg.ner_backend,
8579
gliner_config=gliner_config,
8680
torch_model_names=torch_model_names,
81+
ner_batch_size=self.cfg.ner_batch_size,
82+
ensemble_models=dict(self.cfg.ner_ensemble) if self.cfg.ner_ensemble is not None else None,
83+
ensemble_default_keys=tuple(self.cfg.ner_ensemble_default_keys) if self.cfg.ner_ensemble_default_keys is not None else None,
8784
)
8885
else:
8986
self.GeneralNER = None
9087

9188
self.batch_size = 8
9289
self._pipeline = []
90+
self._post_fuzzy_pipeline = []
9391
self._init_pipeline()
9492

9593
def _init_pipeline(self):
96-
"""Build the pipeline steps list based on config."""
94+
"""Build the pipeline steps lists based on config.
95+
96+
Steps that require language context (fuzzy date replacement) are split
97+
into pre- and post-fuzzy lists. _process_single calls them in order:
98+
self._pipeline → fuzzy (explicit, with ctx) → self._post_fuzzy_pipeline.
99+
"""
97100
cfg = self.cfg
98101

99102
if cfg.check_fix_bad_unicode:
@@ -110,22 +113,25 @@ def _init_pipeline(self):
110113
self._pipeline.append(self._replace_emails)
111114
if cfg.check_replace_dates:
112115
self._pipeline.append(self._replace_dates)
113-
if cfg.check_fuzzy_replace_dates:
114-
self._pipeline.append(self._fuzzy_replace_dates)
116+
# fuzzy_replace_dates runs here (between replace_dates and replace_years)
117+
# but is called explicitly in _process_single with language context.
115118
if cfg.check_replace_years:
116-
self._pipeline.append(self._replace_years)
119+
self._post_fuzzy_pipeline.append(self._replace_years)
117120
if cfg.check_replace_phone_numbers:
118-
self._pipeline.append(self._replace_phone_numbers)
121+
self._post_fuzzy_pipeline.append(self._replace_phone_numbers)
119122
if cfg.check_replace_numbers:
120-
self._pipeline.append(self._replace_numbers)
123+
self._post_fuzzy_pipeline.append(self._replace_numbers)
121124
if cfg.check_replace_currency_symbols:
122-
self._pipeline.append(self._replace_currency_symbols)
125+
self._post_fuzzy_pipeline.append(self._replace_currency_symbols)
123126
if cfg.check_remove_isolated_letters:
124-
self._pipeline.append(self._remove_isolated_letters)
127+
self._post_fuzzy_pipeline.append(self._remove_isolated_letters)
125128
if cfg.check_remove_isolated_special_symbols:
126-
self._pipeline.append(self._remove_isolated_special_symbols)
129+
self._post_fuzzy_pipeline.append(self._remove_isolated_special_symbols)
127130
if cfg.check_normalize_whitespace:
128-
self._pipeline.append(self._normalize_whitespace)
131+
self._post_fuzzy_pipeline.append(self._normalize_whitespace)
132+
# User-provided custom steps — appended after all built-in steps
133+
for step_fn in cfg.custom_pipeline_steps:
134+
self._post_fuzzy_pipeline.append(step_fn)
129135

130136
def _detect_language(self, text: str) -> Optional[str]:
131137
"""Detect language as a pure function (no instance mutation)."""
@@ -151,15 +157,30 @@ def _process_single(self, text: str) -> Tuple[str, Optional[str], Optional[str]]
151157
# Detect language (pure function, thread-safe)
152158
language = self._detect_language(text)
153159

154-
current_text = text
160+
# Pass language explicitly through pipeline context dict
161+
ctx = {"language": language}
155162

156-
# Store language in thread-local so pipeline steps can access it
157-
_thread_ctx.language = language
163+
current_text = text
158164

159-
# Apply non-NER pipeline steps
165+
# Pre-fuzzy pipeline steps (unicode fix → html → urls → emails → dates)
160166
for step in self._pipeline:
161167
current_text = step(current_text)
162168

169+
# Fuzzy date replacement — requires language context, called explicitly
170+
# to avoid thread-local; positioned between replace_dates and replace_years.
171+
if self.cfg.check_fuzzy_replace_dates:
172+
lang = ctx.get("language")
173+
current_text = self.ProcessDateTime.fuzzy_replace_dates(
174+
current_text,
175+
replace_with=self.cfg.replace_with_dates,
176+
score_cutoff=self.cfg.fuzzy_date_score_cutoff,
177+
language=lang,
178+
)
179+
180+
# Post-fuzzy pipeline steps (years → phones → numbers → symbols → whitespace)
181+
for step in self._post_fuzzy_pipeline:
182+
current_text = step(current_text)
183+
163184
# NER processing
164185
if self.cfg.check_ner_process and self.GeneralNER is not None:
165186
current_text = self.GeneralNER.ner_process(
@@ -217,6 +238,30 @@ def process_batch(self, texts: List[str], batch_size: int = None) -> List[Tuple[
217238

218239
return results
219240

241+
async def aprocess_batch(self, texts: List[str], batch_size: int = None) -> List[Tuple[str, Optional[str], Optional[str]]]:
242+
"""Async version of process_batch for use with asyncio-based frameworks (FastAPI, aiohttp).
243+
244+
Runs process_batch in a thread-pool executor so it doesn't block the event loop.
245+
"""
246+
loop = asyncio.get_running_loop()
247+
return await loop.run_in_executor(None, self.process_batch, texts, batch_size)
248+
249+
def warmup(self, languages: Optional[List[str]] = None) -> None:
250+
"""Pre-load NER models to avoid first-request latency.
251+
252+
Args:
253+
languages: Language names to pre-load (e.g. ``['ENGLISH', 'DUTCH']``).
254+
If ``None``, pre-loads models for all supported languages.
255+
"""
256+
if not self.cfg.check_ner_process or self.GeneralNER is None:
257+
return
258+
langs = languages or list(self.cfg.supported_languages)
259+
for lang in langs:
260+
try:
261+
self.GeneralNER.load_language(lang)
262+
except Exception as e:
263+
logger.warning("warmup: skipping %s: %s", lang, e)
264+
220265
def process(self, text: str) -> Tuple[str, Optional[str], Optional[str]]:
221266
"""Process a single text. Maintains backward compatibility.
222267
@@ -248,12 +293,12 @@ def _replace_emails(self, text):
248293
def _replace_dates(self, text):
249294
return self.ProcessDateTime.replace_dates(text, replace_with=self.cfg.replace_with_dates)
250295

251-
def _fuzzy_replace_dates(self, text):
296+
def _fuzzy_replace_dates(self, text, language=None):
252297
return self.ProcessDateTime.fuzzy_replace_dates(
253298
text,
254299
replace_with=self.cfg.replace_with_dates,
255300
score_cutoff=self.cfg.fuzzy_date_score_cutoff,
256-
language=getattr(_thread_ctx, 'language', 'ENGLISH'),
301+
language=language,
257302
)
258303

259304
def _replace_years(self, text):

sct/utils/constants.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import re
66
import string
7+
import regex
78

89
CURRENCIES = {
910
"$": "USD",
@@ -100,10 +101,11 @@
100101
YEAR_REGEX = re.compile(r"\b(19|20)\d{2}\b")
101102

102103
# ---------------------------------------------------------------------------
103-
# Multilingual month name vocabulary (EN, NL, DE, ES)
104+
# Multilingual month name vocabulary (EN, NL, DE, ES, FR, PT, IT)
104105
# Used by DATE_REGEX (exact match) and fuzzy_replace_dates (RapidFuzz).
105106
# Keys are canonical lowercase forms; values are frozensets of language codes
106-
# (some month names are shared across languages, e.g. "juni" is NL and DE).
107+
# (some month names are shared across languages, e.g. "juni" is NL and DE,
108+
# "agosto" is ES/PT/IT, "marzo" is ES/IT, "novembre" is FR/IT).
107109
# ---------------------------------------------------------------------------
108110
MONTH_NAMES_MULTILINGUAL = {
109111
# English
@@ -130,13 +132,38 @@
130132
"mär": frozenset({"de"}), "mrz": frozenset({"de"}), "dez": frozenset({"de"}),
131133
# Spanish
132134
"enero": frozenset({"es"}), "febrero": frozenset({"es"}),
133-
"marzo": frozenset({"es"}), "abril": frozenset({"es"}),
135+
"marzo": frozenset({"es", "it"}), "abril": frozenset({"es", "pt"}),
134136
"mayo": frozenset({"es"}), "junio": frozenset({"es"}),
135-
"julio": frozenset({"es"}), "agosto": frozenset({"es"}),
137+
"julio": frozenset({"es"}), "agosto": frozenset({"es", "pt", "it"}),
136138
"septiembre": frozenset({"es"}), "octubre": frozenset({"es"}),
137139
"noviembre": frozenset({"es"}), "diciembre": frozenset({"es"}),
138140
"ene": frozenset({"es"}), "abr": frozenset({"es"}),
139141
"ago": frozenset({"es"}), "dic": frozenset({"es"}),
142+
# French
143+
"janvier": frozenset({"fr"}), "février": frozenset({"fr"}),
144+
"mars": frozenset({"fr"}), "avril": frozenset({"fr"}),
145+
"juin": frozenset({"fr"}), "juillet": frozenset({"fr"}),
146+
"août": frozenset({"fr"}), "septembre": frozenset({"fr"}),
147+
"octobre": frozenset({"fr"}), "novembre": frozenset({"fr", "it"}),
148+
"décembre": frozenset({"fr"}),
149+
"fév": frozenset({"fr"}), "avr": frozenset({"fr"}),
150+
"aoû": frozenset({"fr"}), "déc": frozenset({"fr"}),
151+
# Portuguese
152+
"janeiro": frozenset({"pt"}), "fevereiro": frozenset({"pt"}),
153+
"março": frozenset({"pt"}), "maio": frozenset({"pt"}),
154+
"junho": frozenset({"pt"}), "julho": frozenset({"pt"}),
155+
"setembro": frozenset({"pt"}), "outubro": frozenset({"pt"}),
156+
"novembro": frozenset({"pt"}), "dezembro": frozenset({"pt"}),
157+
"set": frozenset({"pt"}), "out": frozenset({"pt"}),
158+
# Italian
159+
"gennaio": frozenset({"it"}), "febbraio": frozenset({"it"}),
160+
"aprile": frozenset({"it"}), "maggio": frozenset({"it"}),
161+
"giugno": frozenset({"it"}), "luglio": frozenset({"it"}),
162+
"settembre": frozenset({"it"}), "ottobre": frozenset({"it"}),
163+
"dicembre": frozenset({"it"}),
164+
"gen": frozenset({"it"}), "mag": frozenset({"it"}),
165+
"giu": frozenset({"it"}), "lug": frozenset({"it"}),
166+
"ott": frozenset({"it"}),
140167
}
141168

142169
# Flat set for quick membership check (lowercase)
@@ -158,6 +185,9 @@
158185
"DUTCH": "nl",
159186
"GERMAN": "de",
160187
"SPANISH": "es",
188+
"FRENCH": "fr",
189+
"PORTUGUESE": "pt",
190+
"ITALIAN": "it",
161191
}
162192

163193
# Per-language fuzzy vocabulary (full names only, len > 4)
@@ -302,8 +332,17 @@ def build_fuzzy_vocabulary(month_names_dict=None, lang_to_vocab=None):
302332
# Pre-compiled punctuation pattern (was re-compiled on every call in special.py)
303333
PUNCTUATION_REGEX = re.compile('[' + re.escape(string.punctuation) + ']')
304334

305-
# Sentence boundary pattern for NER text splitting (legacy, kept for backward compat)
306-
SENTENCE_BOUNDARY_PATTERN = re.compile(r'(?<=[.!?])\s+(?=[^\d])')
335+
# Sentence boundary pattern for NER text splitting
336+
# Used by ner.py split_text() as preferred split point before CHUNK_DELIMITERS.
337+
# Uses the `regex` library (drop-in for `re`) to support variable-length lookbehind.
338+
# (?<![A-Z][a-z]{0,3}\.) — do NOT split after abbreviation (Dr., Mr., Ltd., Corp.)
339+
# [A-Z][a-z]{0,3}\. covers: "Dr.", "Mr.", "Ltd.", "U." etc.
340+
# (?<=[.!?]) — must follow sentence-ending punctuation
341+
# \s+ — one or more whitespace
342+
# (?=[A-Z]) — must be followed by uppercase (sentence start)
343+
SENTENCE_BOUNDARY_PATTERN = regex.compile(
344+
r"(?<![A-Z][a-z]{0,3}\.)(?<=[.!?])\s+(?=[A-Z])"
345+
)
307346

308347
# Ordered delimiter hierarchy for text chunking (inspired by semchunk).
309348
# Tried from coarsest to finest; first delimiter that produces a split wins.

0 commit comments

Comments
 (0)