feat: added context string selectors
percevalw committed May 19, 2024
1 parent b756307 commit cb7d1ec
Showing 5 changed files with 218 additions and 85 deletions.
1 change: 1 addition & 0 deletions changelog.md
@@ -14,6 +14,7 @@
- Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter
- Added a `filter_expr` parameter to scorers to filter the documents to score
- Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity
- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow more flexible definitions of the window around the trigger spans

### Changed

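For reference, here is a minimal sketch of how such a context-string window can be declared on an assign pattern. The field names (`source`, `regex`, `assign`, `name`, `window`) follow the pattern models shown in `models.py` below; the concrete pattern, regexes and pipeline setup are purely illustrative and are not part of this commit.

```python
import edsnlp

nlp = edsnlp.blank("eds")
nlp.add_pipe("eds.sentences")  # sentence boundaries are needed for "sent" windows
nlp.add_pipe(
    "eds.contextual_matcher",
    config=dict(
        patterns=dict(
            source="diabetes",        # illustrative pattern, not from this commit
            regex=r"diab[eè]te",
            assign=[
                dict(
                    name="type",
                    regex=r"type\s+(\d)",
                    # New in this commit: the window can be a context string
                    # combining a word window and a sentence window.
                    window="words[0:5] | sent[0:1]",
                ),
            ],
        ),
    ),
)

doc = nlp("Le patient présente un diabète de type 2.")
```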
47 changes: 4 additions & 43 deletions edsnlp/pipes/core/contextual_matcher/contextual_matcher.py
@@ -1,8 +1,7 @@
import copy
import re
import warnings
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Optional, Union

from confit import VisibleDeprecationWarning
from loguru import logger
@@ -21,30 +20,6 @@
from .models import FullConfig, SingleAssignModel, SingleConfig


@lru_cache(64)
def get_window(
    doclike: Union[Doc, Span], window: Tuple[int, int], limit_to_sentence: bool
):
    """
    Generate a window around the first parameter
    """
    start_limit = doclike.sent.start if limit_to_sentence else 0
    end_limit = doclike.sent.end if limit_to_sentence else len(doclike.doc)

    start = (
        max(doclike.start + window[0], start_limit)
        if window and window[0] is not None
        else start_limit
    )
    end = (
        min(doclike.end + window[1], end_limit)
        if window and window[0] is not None
        else end_limit
    )

    return doclike.doc[start:end]


class ContextualMatcher(BaseNERComponent):
"""
Allows additional matching in the surrounding context of the main match group,
@@ -252,23 +227,15 @@ def filter_one(self, span: Span) -> Span:
        source = span.label_
        to_keep = True
        for exclude in self.patterns[source].exclude:
            snippet = get_window(
                doclike=span,
                window=exclude.window,
                limit_to_sentence=exclude.limit_to_sentence,
            )
            snippet = exclude.window(span)

            if next(exclude.matcher(snippet, as_spans=True), None) is not None:
                to_keep = False
                logger.trace(f"Entity {span} was filtered out")
                break

        for include in self.patterns[source].include:
            snippet = get_window(
                doclike=span,
                window=include.window,
                limit_to_sentence=include.limit_to_sentence,
            )
            snippet = include.window(span)

            if next(include.matcher(snippet, as_spans=True), None) is None:
                to_keep = False
@@ -308,13 +275,7 @@ def assign_one(self, span: Span) -> Span:
        for assign in self.patterns[source].assign:
            assign: SingleAssignModel
            window = assign.window
            limit_to_sentence = assign.limit_to_sentence

            snippet = get_window(
                doclike=span,
                window=window,
                limit_to_sentence=limit_to_sentence,
            )
            snippet = window(span)

            matcher: RegexMatcher = assign.matcher
            if matcher is not None:
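In short, the component no longer computes windows itself: after validation, each exclude/include/assign pattern carries a callable context in its `window` field, and the snippet is obtained with `window(span)`. A condensed sketch of the resulting filtering logic (a rewrite for illustration, not the component's actual method):

```python
from spacy.tokens import Span


def keep_entity(span: Span, pattern) -> bool:
    """Drop the entity if an `exclude` regex matches in its context window,
    or if an `include` regex is missing from its context window."""
    for exclude in pattern.exclude:
        snippet = exclude.window(span)  # was: get_window(span, exclude.window, ...)
        if next(exclude.matcher(snippet, as_spans=True), None) is not None:
            return False
    for include in pattern.include:
        snippet = include.window(span)
        if next(include.matcher(snippet, as_spans=True), None) is None:
            return False
    return True
```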
95 changes: 57 additions & 38 deletions edsnlp/pipes/core/contextual_matcher/models.py
@@ -1,37 +1,14 @@
import re
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, List, Optional, Union

import regex
from pydantic import BaseModel, Extra, validator

from edsnlp.matchers.utils import ListOrStr
from edsnlp.utils.span_getters import SpanGetterArg
from edsnlp.utils.span_getters import Context, SentenceContext, SpanGetterArg
from edsnlp.utils.typing import AsList

Flags = Union[re.RegexFlag, int]
Window = Union[
    Tuple[int, int],
    List[int],
    int,
]


def normalize_window(cls, v):
    if v is None:
        return v
    if isinstance(v, list):
        assert (
            len(v) == 2
        ), "`window` should be a tuple/list of two integer, or a single integer"
        v = tuple(v)
    if isinstance(v, int):
        assert v != 0, "The provided `window` should not be 0"
        if v < 0:
            return (v, 0)
        if v > 0:
            return (0, v)
    assert v[0] < v[1], "The provided `window` should contain at least 1 token"
    return v


class AssignDict(dict):
@@ -101,9 +78,10 @@ class SingleExcludeModel(BaseModel):
    ----------
    regex: ListOrStr
        A single Regex or a list of Regexes
    window: Optional[Window]
    window: Optional[Context]
        Size of the context to use (in number of words). You can provide the window as:
        - A [context string][context-string]
        - A positive integer, in this case the used context will be taken **after**
          the extraction
        - A negative integer, in this case the used context will be taken **before**
@@ -121,8 +99,8 @@
"""

regex: ListOrStr = []
window: Optional[Window] = None
limit_to_sentence: Optional[bool] = True
limit_to_sentence: Optional[bool] = None
window: Optional[Context] = None
regex_flags: Optional[Flags] = None
regex_attr: Optional[str] = None
matcher: Optional[Any] = None
@@ -133,7 +111,20 @@ def exclude_regex_validation(cls, v):
            v = [v]
        return v

    _normalize_window = validator("window", allow_reuse=True)(normalize_window)
    @validator("limit_to_sentence", pre=True, always=True)
    def backward_compat_auto_limit_to_sentence(cls, v, values):
        if (
            isinstance(values.get("window"), (type(None), int, tuple, list))
            and v is None
        ):
            v = True
        return v

    @validator("window", always=True)
    def backward_compat_intersect_sentence(cls, v, values):
        if values.get("limit_to_sentence"):
            v = v & SentenceContext(0, 0)
        return v


class SingleIncludeModel(BaseModel):
@@ -146,9 +137,10 @@
    ----------
    regex: ListOrStr
        A single Regex or a list of Regexes
    window: Optional[Window]
    window: Optional[Context]
        Size of the context to use (in number of words). You can provide the window as:
        - A [context string][context-string]
        - A positive integer, in this case the used context will be taken **after**
          the extraction
        - A negative integer, in this case the used context will be taken **before**
@@ -166,8 +158,8 @@
"""

regex: ListOrStr = []
window: Optional[Window] = None
limit_to_sentence: Optional[bool] = True
limit_to_sentence: Optional[bool] = None
window: Optional[Context] = None
regex_flags: Optional[Flags] = None
regex_attr: Optional[str] = None
matcher: Optional[Any] = None
@@ -178,7 +170,20 @@ def exclude_regex_validation(cls, v):
            v = [v]
        return v

    _normalize_window = validator("window", allow_reuse=True)(normalize_window)
    @validator("limit_to_sentence", pre=True, always=True)
    def backward_compat_auto_limit_to_sentence(cls, v, values):
        if (
            isinstance(values.get("window"), (type(None), int, tuple, list))
            and v is None
        ):
            v = True
        return v

    @validator("window", always=True)
    def backward_compat_intersect_sentence(cls, v, values):
        if values.get("limit_to_sentence"):
            v = v & SentenceContext(0, 0)
        return v


class ExcludeModel(AsList[SingleExcludeModel]):
@@ -204,9 +209,10 @@ class SingleAssignModel(BaseModel):
    ----------
    name: ListOrStr
        A name (string)
    window: Optional[Window]
    window: Optional[Context]
        Size of the context to use (in number of words). You can provide the window as:
        - A [context string][context-string]
        - A positive integer, in this case the used context will be taken **after**
          the extraction
        - A negative integer, in this case the used context will be taken **before**
@@ -217,7 +223,7 @@
    span_getter: Optional[SpanGetterArg]
        A span getter to pick the assigned spans from already extracted entities
        in the doc.
    regex: Optional[Window]
    regex: Optional[Context]
        A dictionary where keys are labels and values are **Regexes with a single
        capturing group**
    replace_entity: Optional[bool]
@@ -235,8 +241,8 @@
    name: str
    regex: Optional[str] = None
    span_getter: Optional[SpanGetterArg] = None
    window: Optional[Window] = None
    limit_to_sentence: Optional[bool] = True
    limit_to_sentence: Optional[bool] = None
    window: Optional[Context] = None
    regex_flags: Optional[Flags] = None
    regex_attr: Optional[str] = None
    replace_entity: bool = False
@@ -259,7 +265,20 @@ def check_single_regex_group(cls, pat):

        return pat

    _normalize_window = validator("window", allow_reuse=True)(normalize_window)
    @validator("limit_to_sentence", pre=True, always=True)
    def backward_compat_auto_limit_to_sentence(cls, v, values):
        if (
            isinstance(values.get("window"), (type(None), int, tuple, list))
            and v is None
        ):
            v = True
        return v

    @validator("window", always=True)
    def backward_compat_intersect_sentence(cls, v, values):
        if values.get("limit_to_sentence"):
            v = v & SentenceContext(0, 0)
        return v


class AssignModel(AsList[SingleAssignModel]):
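Finally, the validator pair repeated in each model above is what keeps old configurations working: when `window` is given in the legacy form (`None`, an integer, or a tuple/list) and `limit_to_sentence` is left unset, `limit_to_sentence` defaults to `True`, and the validated context is then intersected with the sentence via `SentenceContext(0, 0)`. A small sketch of the two styles, assuming the models can be instantiated directly as shown (field names are taken from the diff; the coercion of integers and strings into `Context` objects is handled by the new `Context` type):

```python
from edsnlp.pipes.core.contextual_matcher.models import SingleAssignModel

# Legacy style: integer window, limit_to_sentence unset -> defaults to True,
# so the context is clipped to the enclosing sentence.
legacy = SingleAssignModel(name="type", regex=r"type (\d)", window=-5)

# New style: a context string; sentence clipping only happens if
# limit_to_sentence=True is passed explicitly.
modern = SingleAssignModel(
    name="type",
    regex=r"type (\d)",
    window="words[0:5] | sent[0:1]",
)
```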