aphp · percevalw · May 18, 2024 · May 18, 2024 · May 18, 2024 · May 18, 2024
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -7,7 +7,8 @@ on:
     branches: [master]
 
 jobs:
-  Linting:
+  linting:
+    name: Linting
     if: github.event_name == 'pull_request'
     runs-on: ubuntu-latest
     steps:
@@ -22,7 +23,8 @@ jobs:
         with:
           extra_args: --color=always --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
 
-  Pytest:
+  pytest:
+    name: Pytest
     runs-on: ubuntu-latest
     strategy:
       fail-fast: true
@@ -37,11 +39,18 @@ jobs:
           path: ~/.data/
           key: resources
 
-      - name: Cache pip
-        uses: actions/cache@v3
+      # - name: Cache pip
+      #   uses: actions/cache@v3
+      #   with:
+      #     path: ~/.cache/pip
+      #     key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip
+
+      - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
+        shell: bash
+
+      - uses: hynek/setup-cached-uv@v1
         with:
-          path: ~/.cache/pip
-          key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip
+          cache-suffix: -tests-${{ matrix.python-version }}-${{ env.WEEK }}
 
       - name: Set up Java
         uses: actions/setup-java@v2
@@ -53,26 +62,31 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
 
       - name: Install dependencies
         run: |
-          pip install --upgrade pip
-          pip install pipx
-          pipx install poetry
-          pip install -e '.[dev,setup]'
+          uv venv
+          source .venv/bin/activate
+          uv pip install -e '.[dev,setup]' pytest-xdist poetry pip
 
       - name: Test with Pytest on Python ${{ matrix.python-version }}
         env:
           UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }}
-        run: coverage run -m pytest --ignore tests/test_docs.py
+        run: |
+          source .venv/bin/activate
+          coverage run -m pytest --ignore tests/test_docs.py # -n auto
+          # coverage combine
+          # mv .coverage .coverage.${{ matrix.python-version }}
         if: matrix.python-version != '3.9'
 
       - name: Test with Pytest on Python ${{ matrix.python-version }}
         env:
           UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }}
-        run: coverage run -m pytest
+        run: |
+          source .venv/bin/activate
+          coverage run -m pytest # -n auto
+          # coverage combine
+          # mv .coverage .coverage.${{ matrix.python-version }}
         if: matrix.python-version == '3.9'
 
       - name: Upload coverage data
@@ -82,8 +96,9 @@ jobs:
           path: .coverage.*
           if-no-files-found: ignore
 
-  Coverage:
-    needs: Pytest
+  coverage:
+    name: Coverage
+    needs: pytest
     uses: aphp/foldedtensor/.github/workflows/coverage.yml@main
     with:
       base-branch: master
@@ -92,34 +107,56 @@ jobs:
       coverage-badge: coverage.svg
       coverage-branch: coverage
 
-  Documentation:
+  documentation:
+    name: Documentation
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
+
     - uses: actions/setup-python@v4
       with:
         python-version: "3.9"
         cache: 'pip'
+
+    - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
+      shell: bash
+
+    - uses: hynek/setup-cached-uv@v1
+      with:
+        cache-suffix: -docs-3.9-${{ env.WEEK }}  # no matrix in this job: use the Python version pinned in setup-python above
+
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        pip install '.[dev]'
+        uv venv
+        uv pip install '.[dev]'
+
     - name: Build documentation
       run: |
+        source .venv/bin/activate
         mkdocs build --clean
 
-  Installation:
+  simple-installation:
+    name: Simple installation
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         python-version: ["3.7", "3.8", "3.9"]
     steps:
       - uses: actions/checkout@v2
+
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
-          cache: 'pip'
+
+      - run: echo WEEK=$(date +%V) >>$GITHUB_ENV
+        shell: bash
+
+      - uses: hynek/setup-cached-uv@v1
+        with:
+          cache-suffix: -simple-install-${{ matrix.python-version }}-${{ env.WEEK }}
+
       - name: Install library
         run: |
-          pip install .
+          uv venv
+          uv pip install .
diff --git a/changelog.md b/changelog.md
@@ -9,6 +9,12 @@
 - Window stride can now be disabled (i.e., stride = window) during training in the `eds.transformer` component by `training_stride = False`
 - Added a new `eds.ner_overlap_scorer` to evaluate matches between two lists of entities, counting true when the dice overlap is above a given threshold
 - `edsnlp.load` now accepts EDS-NLP models from the huggingface hub 🤗 !
+- New `python -m edsnlp.package` command to package a model for the huggingface hub or pypi-like registries
+- Expose the default patterns of `eds.negation`, `eds.hypothesis`, `eds.family`, `eds.history` and `eds.reported_speech` under an `eds.negation.default_patterns` attribute
+- Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter
+- Added a `filter_expr` parameter to scorers to filter the documents to score
+- Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity
+- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans
 
 ### Changed
 

diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css
@@ -166,3 +166,20 @@ body, input {
 .md-typeset code a:not(.md-annotation__index) {
     border-bottom: 1px dashed var(--md-typeset-a-color);
 }
+
+.doc-param-details .subdoc {
+    padding: 0;
+    box-shadow: none;
+    border-color: var(--md-typeset-table-color);
+}
+
+.doc-param-details .subdoc > div > div > div > table {
+    padding: 0;
+    box-shadow: none;
+    border: none;
+}
+
+.doc-param-details .subdoc > summary {
+    margin: 0;
+    font-weight: normal;
+}
diff --git a/docs/pipes/core/contextual-matcher.md b/docs/pipes/core/contextual-matcher.md
@@ -206,74 +206,6 @@ Let us see what we can get from this pipeline with a few examples
 
 However, most of the configuration is provided in the `patterns` key, as a **pattern dictionary** or a **list of pattern dictionaries**
 
-## The pattern dictionary
-
-### Description
-
-A patterr is a nested dictionary with the following keys:
-
-=== "`source`"
-
-    A label describing the pattern
-
-=== "`regex`"
-
-    A single Regex or a list of Regexes
-
-=== "`regex_attr`"
-
-    An attributes to overwrite the given `attr` when matching with Regexes.
-
-=== "`terms`"
-
-    A single term or a list of terms (for exact matches)
-
-=== "`exclude`"
-
-    A dictionary (or list of dictionaries) to define exclusion rules. Exclusion rules are given as Regexes, and if a
-    match is found in the surrounding context of an extraction, the extraction is removed. Each dictionary should have the following keys:
-
-    === "`window`"
-
-        Size of the context to use (in number of words). You can provide the window as:
-
-        - A positive integer, in this case the used context will be taken **after** the extraction
-        - A negative integer, in this case the used context will be taken **before** the extraction
-        - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction
-
-    === "`regex`"
-
-        A single Regex or a list of Regexes.
-
-=== "`assign`"
-
-    A dictionary to refine the extraction. Similarily to the `exclude` key, you can provide a dictionary to
-    use on the context **before** and **after** the extraction.
-
-    === "`name`"
-
-        A name (string)
-
-    === "`window`"
-
-        Size of the context to use (in number of words). You can provide the window as:
-
-        - A positive integer, in this case the used context will be taken **after** the extraction
-        - A negative integer, in this case the used context will be taken **before** the extraction
-        - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction
-
-    === "`regex`"
-
-        A dictionary where keys are labels and values are **Regexes with a single capturing group**
-
-    === "`replace_entity`"
-
-        If set to `True`, the match from the corresponding assign key will be used as entity, instead of the main match. See [this paragraph][the-replace_entity-parameter]
-
-    === "`reduce_mode`"
-
-        Set how multiple assign matches are handled. See the documentation of the [`reduce_mode` parameter][the-reduce_mode-parameter]
-
 ### A full pattern dictionary example
 
 ```python
@@ -300,6 +232,8 @@ dict(
             regex=r"(neonatal)",
             expand_entity=True,
             window=3,
+            # keep the extraction only if neonatal is found
+            required=True,
         ),
         dict(
             name="trans",

diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py
@@ -944,7 +944,7 @@ def package(
         isolation: bool = True,
         skip_build_dependency_check: bool = False,
     ):
-        from edsnlp.utils.package import package
+        from edsnlp.package import package
 
         return package(
             pipeline=self,

diff --git a/edsnlp/core/registries.py b/edsnlp/core/registries.py
@@ -3,6 +3,7 @@
 from collections import defaultdict
 from dataclasses import dataclass
 from functools import wraps
+from itertools import chain
 from typing import Any, Callable, Dict, Iterable, Optional, Sequence
 from weakref import WeakKeyDictionary
 
@@ -231,7 +232,12 @@ def check_and_return():
         if func is None and self.entry_points:
             # Update entry points in case packages lookup paths have changed
             available_entry_points = defaultdict(list)
-            for ep in importlib_metadata.entry_points():
+            eps = importlib_metadata.entry_points()
+            for ep in (
+                chain.from_iterable(dict(eps).values())
+                if isinstance(eps, dict)
+                else eps
+            ):
                 available_entry_points[ep.group].append(ep)
             catalogue.AVAILABLE_ENTRY_POINTS.update(available_entry_points)
             # Otherwise, step 3

diff --git a/edsnlp/matchers/regex.py b/edsnlp/matchers/regex.py
@@ -1,6 +1,6 @@
 import re
 from bisect import bisect_left, bisect_right
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 from loguru import logger
 from spacy.tokens import Doc, Span
@@ -465,7 +465,7 @@ def __call__(
         doclike: Union[Doc, Span],
         as_spans=False,
         return_groupdict=False,
-    ) -> Union[Span, Tuple[Span, Dict[str, Any]]]:
+    ) -> Iterator[Union[Span, Tuple[Span, Dict[str, Any]]]]:
         """
         Performs matching. Yields matches.