Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ SPECS.md
docs/plans/
docs/GLINER_GAP_ANALYSIS.md
docs/V060_PLAN.md
docs/FUNCTIONAL_TEST_REPORT.md
squeakycleantext-explorer.html
ralph-loop-prompt.md
.claude/
Expand Down
2 changes: 1 addition & 1 deletion sct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sct.utils.anonymization_map import AnonymizationMap, MapEntry
from sct.utils.process_result import ProcessResult

__version__ = "0.6.0"
__version__ = "0.6.1"
__all__ = [
"TextCleaner", "TextCleanerConfig",
"PII_LABELS", "PII_LABEL_MAP",
Expand Down
5 changes: 2 additions & 3 deletions sct/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
VALID_NER_BACKENDS = frozenset({
'onnx', 'torch', 'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner',
})
GLINER_BACKENDS = frozenset({'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner'})

DEFAULT_NER_MODELS: dict[str, str] = {
'ENGLISH': 'rhnfzl/xlm-roberta-large-conll03-english-onnx',
Expand Down Expand Up @@ -320,9 +321,7 @@ def __post_init__(self):
)

# GLiNER fields required for gliner/ensemble backends
needs_gliner = self.ner_backend in (
'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner',
)
needs_gliner = self.ner_backend in GLINER_BACKENDS
if needs_gliner:
if not self.gliner_model:
raise ValueError(
Expand Down
19 changes: 5 additions & 14 deletions sct/sct.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Dict, List, Optional

from sct.config import TextCleanerConfig, _config_from_module_globals
from sct.config import TextCleanerConfig, GLINER_BACKENDS, _config_from_module_globals
from sct.utils import constants, contact, datetime, ner, normtext, resources, special, stopwords
from sct.utils.anonymization_map import AnonymizationMap
from sct.utils.process_result import ProcessResult
Expand Down Expand Up @@ -71,9 +71,7 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
if self.cfg.check_ner_process:
# Build GLiNER config dict (if needed)
gliner_config = None
needs_gliner = self.cfg.ner_backend in (
'gliner', 'ensemble_onnx', 'ensemble_torch', 'presidio_gliner',
)
needs_gliner = self.cfg.ner_backend in GLINER_BACKENDS
if needs_gliner:
gliner_config = {
'model': self.cfg.gliner_model,
Expand Down Expand Up @@ -109,10 +107,8 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
else None
),
replacement_mode=self.cfg.replacement_mode,
synthetic_replacer=self._synthetic_replacer,
)
else:
pass # self.GeneralNER already initialized to None above

# GLiClass document-level pre-classification (optional, lazy-loaded)
self._gliclass: Any = None
if self.cfg.check_classify_document:
Expand All @@ -126,7 +122,6 @@ def __init__(self, cfg: Optional[TextCleanerConfig] = None):
onnx=self.cfg.gliclass_onnx,
)

self.batch_size = 8
self._pipeline: List[Callable[[str], str]] = []
self._post_fuzzy_pipeline: List[Callable[[str], str]] = []
self._init_pipeline()
Expand Down Expand Up @@ -227,9 +222,6 @@ def _process_single(self, text: str) -> ProcessResult:
# Detect language (pure function, thread-safe)
language = self._detect_language(text)

# Pass language explicitly through pipeline context dict
ctx = {"language": language}

current_text = text

# Pre-fuzzy pipeline steps (unicode fix → html → urls → emails → dates)
Expand All @@ -239,12 +231,11 @@ def _process_single(self, text: str) -> ProcessResult:
# Fuzzy date replacement — requires language context, called explicitly
# to avoid thread-local; positioned between replace_dates and replace_years.
if self.cfg.check_fuzzy_replace_dates:
lang = ctx.get("language")
current_text = self.ProcessDateTime.fuzzy_replace_dates(
current_text,
replace_with=self.cfg.replace_with_dates,
score_cutoff=self.cfg.fuzzy_date_score_cutoff,
language=lang,
language=language,
)

# Post-fuzzy pipeline steps (years → phones → numbers → symbols → whitespace)
Expand All @@ -255,7 +246,7 @@ def _process_single(self, text: str) -> ProcessResult:
if self.cfg.check_ner_process and self.GeneralNER is not None:
current_text = self.GeneralNER.ner_process(
current_text,
positional_tags=list(self.cfg.positional_tags),
positional_tags=self.cfg.positional_tags,
ner_confidence_threshold=self.cfg.ner_confidence_threshold,
language=language,
anon_map=anon_map,
Expand Down
49 changes: 16 additions & 33 deletions sct/utils/gliclass_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,21 @@ def __init__(
self._onnx = onnx
self._pipeline = None

if onnx:
self._init_onnx(model_id)
else:
self._init_pytorch(model_id)
self._init_model(model_id)

logger.info(
"Loaded GLiClass model: %s (onnx=%s, labels=%d)",
model_id, onnx, len(self.labels),
)

def _init_pytorch(self, model_id: str) -> None:
"""Load GLiClass model via PyTorch (gliclass package)."""
def _init_model(self, model_id: str) -> None:
"""Load GLiClass model via gliclass package.

Note: gliclass does not yet expose a native ONNX loader, so the
``onnx`` flag is recorded but both paths use the same PyTorch-backed
``GLiClassModel.from_pretrained``. When gliclass adds ONNX support,
this method should branch on ``self._onnx``.
"""
try:
from gliclass import GLiClassModel, ZeroShotClassificationPipeline # noqa: S404
from transformers import AutoTokenizer
Expand All @@ -62,26 +65,6 @@ def _init_pytorch(self, model_id: str) -> None:
device='cpu',
)

def _init_onnx(self, model_id: str) -> None:
"""Load GLiClass model via ONNX Runtime (torch-free)."""
try:
from gliclass import GLiClassModel, ZeroShotClassificationPipeline # noqa: S404
from transformers import AutoTokenizer
except ImportError:
raise ImportError(
"gliclass + onnxruntime are required for ONNX GLiClass backend. "
"Install with: pip install squeakycleantext[classify] squeakycleantext[classify-onnx]"
)

model = GLiClassModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
self._pipeline = ZeroShotClassificationPipeline(
model=model,
tokenizer=tokenizer,
classification_type=self.classification_type,
device='cpu',
)

def classify(self, text: str) -> List[Dict[str, float]]:
"""Classify text against configured labels.

Expand All @@ -94,16 +77,16 @@ def classify(self, text: str) -> List[Dict[str, float]]:

result = self._pipeline(
text,
candidate_labels=self.labels,
labels=self.labels,
)

# Pipeline returns {"sequence": ..., "labels": [...], "scores": [...]}
# Pipeline returns list[list[dict]] — one list per input text,
# each containing {"label": str, "score": float} dicts.
classifications = []
labels = result.get('labels', [])
scores = result.get('scores', [])
for label, score in zip(labels, scores):
if score >= self.threshold:
classifications.append({'label': label, 'score': score})
entries = result[0] if result else []
for entry in entries:
if entry['score'] >= self.threshold:
classifications.append({'label': entry['label'], 'score': entry['score']})

classifications.sort(key=lambda x: x['score'], reverse=True)
return classifications
Expand Down
17 changes: 13 additions & 4 deletions sct/utils/gliner_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,19 @@ def __init__(
"Install with: pip install squeakycleantext[gliner]"
)
if onnx:
self.model = GLiNER.from_pretrained(
model_id, load_onnx_model=True, load_tokenizer=True,
)
logger.info("Loaded GLiNER model in ONNX mode: %s", model_id)
try:
self.model = GLiNER.from_pretrained(
model_id, load_onnx_model=True, load_tokenizer=True,
)
logger.info("Loaded GLiNER model in ONNX mode: %s", model_id)
except FileNotFoundError:
logger.warning(
"ONNX model not found for %s (GLiNER issue #314: most models "
"don't ship model.onnx at repo root). Falling back to PyTorch.",
model_id,
)
self.model = GLiNER.from_pretrained(model_id)
self._onnx = False
else:
self.model = GLiNER.from_pretrained(model_id)
if device == 'cuda' and not onnx:
Expand Down
64 changes: 35 additions & 29 deletions sct/utils/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,26 @@
import threading
from collections import defaultdict
import logging
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Sequence, Union
from pathlib import Path

import onnxruntime as ort

from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import RecognizerResult

from typing import NamedTuple as _NamedTuple

from sct.utils import constants
from sct.utils.anonymization_map import AnonymizationMap
from sct.utils.onnx_pipeline import load_onnx_ner_model
from sct.config import DEFAULT_NER_MODELS, DEFAULT_NER_ENSEMBLE, NER_ENSEMBLE_DEFAULT_KEYS, LANG_KEYS


class AnonymizeResult(_NamedTuple):
    """Typed, immutable container for the outcome of text anonymization.

    The anonymized string is exposed through the ``text`` attribute.
    """

    # Text produced by the anonymization step.
    text: str

ort.set_default_logger_severity(3) # Silence ONNX Runtime warnings

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -53,7 +60,8 @@ def __init__(self, cache_dir: Optional[Path] = None, device: Optional[str] = Non
ner_batch_size: int = 8,
ensemble_models: Optional[Dict] = None,
ensemble_default_keys: Optional[tuple] = None,
replacement_mode: str = 'placeholder'):
replacement_mode: str = 'placeholder',
synthetic_replacer=None):
"""Initialize NER processor.

Args:
Expand All @@ -67,11 +75,12 @@ def __init__(self, cache_dir: Optional[Path] = None, device: Optional[str] = Non
Required when ner_backend involves GLiNER.
torch_model_names: Language-keyed dict of PyTorch model repo IDs.
Required when ner_backend involves torch.
synthetic_replacer: Shared SyntheticReplacer instance (from TextCleaner).
"""
self._ner_backend = ner_backend
self._ner_batch_size = ner_batch_size
self._replacement_mode = replacement_mode
self._synthetic_replacer = None # Lazy-loaded when replacement_mode='synthetic'
self._synthetic_replacer = synthetic_replacer
self._gliner_pipe = None
self._ensemble_models: Dict[str, tuple] = ensemble_models if ensemble_models is not None else DEFAULT_NER_ENSEMBLE
self._ensemble_default_keys: tuple = (
Expand Down Expand Up @@ -195,11 +204,12 @@ def _init_presidio_gliner(self, gliner_config):
try:
from presidio_analyzer.predefined_recognizers import GLiNERRecognizer # noqa: S404
gliner_recognizer = GLiNERRecognizer(
model_path=gliner_config['model'],
model_name=gliner_config['model'],
supported_entities=[
label.upper()
for label in gliner_config.get('labels', ['person', 'organization', 'location'])
],
threshold=gliner_config.get('threshold', 0.4),
)
self._analyzer.registry.add_recognizer(gliner_recognizer)
except ImportError:
Expand All @@ -208,13 +218,6 @@ def _init_presidio_gliner(self, gliner_config):
"Install with: pip install presidio-analyzer gliner"
)

def _get_synthetic_replacer(self):
"""Lazily initialize the SyntheticReplacer."""
if self._synthetic_replacer is None:
from sct.utils.synthetic import SyntheticReplacer
self._synthetic_replacer = SyntheticReplacer()
return self._synthetic_replacer

def _get_ensemble_keys(self, language: str) -> tuple:
"""Return ordered model keys to run for the given language."""
return self._ensemble_models.get(language, self._ensemble_default_keys)
Expand Down Expand Up @@ -313,9 +316,8 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
return self._anonymize_reversible(text, filtered_data, anon_map)

if replacement_mode == 'synthetic':
replacer = self._get_synthetic_replacer()
result_text = replacer.generate_for_entities(text, filtered_data)
return type('AnonymizeResult', (), {'text': result_text})()
result_text = self._synthetic_replacer.generate_for_entities(text, filtered_data)
return AnonymizeResult(text=result_text)

has_custom = any(
items['entity_group'] not in ENTITY_TYPE_MAP
Expand All @@ -328,7 +330,7 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
for items in sorted_data:
tag = ENTITY_TYPE_MAP.get(items['entity_group'], items['entity_group'])
text = text[:items['start']] + f"<{tag}>" + text[items['end']:]
return type('AnonymizeResult', (), {'text': text})()
return AnonymizeResult(text=text)
else:
# Standard entities only: use Presidio (existing behavior)
analyzer_result = []
Expand All @@ -348,13 +350,14 @@ def anonymize_text(self, text, filtered_data, replacement_mode='placeholder',
if 0 <= entry.start < text_length and 0 < entry.end <= text_length
]

return self.engine.anonymize(text=text, analyzer_results=analyzer_result)
engine_result = self.engine.anonymize(text=text, analyzer_results=analyzer_result)
return AnonymizeResult(text=engine_result.text)

def _anonymize_reversible(self, text, filtered_data, anon_map=None):
"""Replace entities with indexed placeholders and populate the map.

Uses right-to-left replacement to preserve character offsets.
Returns an AnonymizeResult-like object with ``.text`` attribute.
Returns an ``AnonymizeResult`` with the anonymized ``.text``.
"""
if anon_map is None:
anon_map = AnonymizationMap()
Expand All @@ -373,7 +376,7 @@ def _anonymize_reversible(self, text, filtered_data, anon_map=None):
)
text = text[:item['start']] + placeholder + text[item['end']:]

return type('AnonymizeResult', (), {'text': text, 'anon_map': anon_map})()
return AnonymizeResult(text=text)

def ner_ensemble(self, ner_results, t):
"""Apply ensemble voting across multiple model results.
Expand Down Expand Up @@ -440,7 +443,7 @@ def _simple_chunk(self, text: str, max_tokens: int = 384) -> List[str]:
def ner_process(
self,
text: str,
positional_tags: Optional[List[str]] = None,
positional_tags: Optional[Sequence[str]] = None,
ner_confidence_threshold: Optional[float] = None,
language: Optional[str] = None,
anon_map: Optional['AnonymizationMap'] = None,
Expand Down Expand Up @@ -474,6 +477,16 @@ def ner_process(
if not chunks:
return text

# Pre-compute GLiNER-only tag set (constant across chunks)
gliner_all_tags = None
if self._ner_backend == 'gliner' and self._gliner_pipe:
gliner_all_tags = set(positional_tags)
gliner_all_tags.update(self._gliner_pipe.label_map.values())
gliner_all_tags.update(
label.upper() for label in self._gliner_pipe.labels
if label not in self._gliner_pipe.label_map
)

# --- Inference + ensemble per chunk ---
ner_clean_text = []
for chunk in chunks:
Expand All @@ -486,7 +499,7 @@ def ner_process(
model_name = self._model_names.get(key, key)
model_lock = self._get_lock(model_name)
with model_lock:
batch = self._get_pipeline(key)([chunk])
batch = self._pipelines[key]([chunk])
ner_results.extend(self.ner_data(batch[0], positional_tags))

# GLiNER backend
Expand All @@ -495,15 +508,8 @@ def ner_process(
gliner_lock = self._get_lock('gliner')
with gliner_lock:
gliner_batch = self._gliner_pipe([chunk])
if self._ner_backend == 'gliner':
# GLiNER-only: include all mapped entity types
all_tags = set(positional_tags)
all_tags.update(self._gliner_pipe.label_map.values())
all_tags.update(
label.upper() for label in self._gliner_pipe.labels
if label not in self._gliner_pipe.label_map
)
ner_results.extend(self.ner_data(gliner_batch[0], all_tags))
if gliner_all_tags is not None:
ner_results.extend(self.ner_data(gliner_batch[0], gliner_all_tags))
else:
# Ensemble: filter to positional_tags only
ner_results.extend(self.ner_data(gliner_batch[0], positional_tags))
Expand Down
Loading