diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9df3974b7..5d8a20d8c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,9 +7,11 @@ on: branches: [master] env: - # UV_INDEX_STRATEGY: "unsafe-first-match" - # UV_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" + UV_INDEX_STRATEGY: "unsafe-first-match" + UV_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" PIP_EXTRA_INDEX_URL: "https://download.pytorch.org/whl/cpu" + UV_SYSTEM_PYTHON: 1 + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} jobs: linting: @@ -22,7 +24,7 @@ jobs: # requites to grab the history of the PR fetch-depth: 0 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: cache: 'pip' @@ -46,19 +48,6 @@ jobs: path: ~/.data/ key: resources - # - name: Cache pip - # uses: actions/cache@v3 - # with: - # path: ~/.cache/pip - # key: ${{ runner.os }}-python-${{ matrix.python-version }}-pip - - - run: echo WEEK=$(date +%V) >>$GITHUB_ENV - shell: bash - - # - uses: hynek/setup-cached-uv@v1 - # with: - # cache-suffix: -tests-${{ matrix.python-version }}-${{ env.WEEK }} - - name: Set up Java uses: actions/setup-java@v2 with: @@ -72,48 +61,28 @@ jobs: cache: 'pip' - name: Install dependencies - run: | - pip install poetry - pip install -e '.[dev]' pytest-xdist pip + run: pip install -e ".[dev]" if: matrix.python-version != '3.9' && matrix.python-version != '3.10' && matrix.python-version != '3.11' && matrix.python-version != '3.12' -# uv venv -# source .venv/bin/activate -# uv pip install -e '.[dev]' pytest-xdist pip - name: Install dependencies - run: | - pip install poetry - pip install -e '.[dev,setup]' pytest-xdist pip + run: pip install -e ".[dev,setup]" if: matrix.python-version == '3.9' -# uv venv -# source .venv/bin/activate -# uv pip install -e '.[dev]' pytest-xdist pip - name: Install dependencies - run: | - pip install poetry - pip install -e '.[dev-no-ml]' pytest-xdist pip # skip ML tests for 3.10 and 3.11 + run: pip install -e ".[dev-no-ml]" if: matrix.python-version == '3.10' || matrix.python-version == '3.11' || matrix.python-version == '3.12' - name: Test with Pytest on Python ${{ matrix.python-version }} env: UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }} - run: | - coverage run -m pytest --ignore tests/test_docs.py # -n auto - # coverage combine - # mv .coverage .coverage.${{ matrix.python-version }} -# source .venv/bin/activate + run: coverage run -m pytest --ignore tests/test_docs.py if: matrix.python-version != '3.9' - name: Test with Pytest on Python ${{ matrix.python-version }} env: UMLS_API_KEY: ${{ secrets.UMLS_API_KEY }} - run: | - coverage run -m pytest # -n auto - # coverage combine - # mv .coverage .coverage.${{ matrix.python-version }} -# source .venv/bin/activate + run: coverage run -m pytest if: matrix.python-version == '3.9' - name: Upload coverage data @@ -137,33 +106,72 @@ jobs: documentation: name: Documentation - runs-on: ubuntu-22.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: 3.9 cache: 'pip' - - run: echo WEEK=$(date +%V) >>$GITHUB_ENV - shell: bash + - name: Install dependencies + run: pip install -e ".[docs]" - # - uses: hynek/setup-cached-uv@v1 - # with: - # cache-suffix: -docs-${{ matrix.python-version }}-${{ env.WEEK }} + - name: Set up Git + run: | + git config user.name ${{ github.actor }} + git config user.email ${{ github.actor }}@users.noreply.github.com + echo Current 
branch: $BRANCH_NAME - - name: Install dependencies + - name: Build documentation run: | - pip install '.[docs]' -# uv venv -# uv pip install '.[docs]' + mike deploy --no-redirect --rebase --update-aliases $BRANCH_NAME latest + mike set-default $BRANCH_NAME + - name: Put content of gh-pages to public folder + run: rm -rf public && mkdir public && git archive gh-pages | tar -x -C ./public/ - - name: Build documentation + - name: Set up Vercel + run: npm install --global vercel@latest + + - name: Pull Vercel environment + run: vercel pull --yes --environment=preview --token=${{ secrets.VERCEL_TOKEN }} + + - name: Create new vercel project linked to this branch + run: vercel project add edsnlp-$BRANCH_NAME --token=${{ secrets.VERCEL_TOKEN }} + + - name: Link public folder to the (maybe) new vercel project + run: vercel link --cwd public --project edsnlp-$BRANCH_NAME --yes --token=${{ secrets.VERCEL_TOKEN }} + + - name: Deploy to Vercel + run: vercel deploy public/ --yes --token=${{ secrets.VERCEL_TOKEN }} --archive=tgz --prod > deployment-url.txt + + - name: Post the documentation link + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - mkdocs build --clean -# source .venv/bin/activate + URL=https://edsnlp-$BRANCH_NAME.vercel.app/ + COMMENT_BODY="## Docs preview URL\n\n$URL\n\n" + HEADER="Authorization: token $GITHUB_TOKEN" + PR_COMMENTS_URL="https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" + + # Fetch existing comments to find if one from this workflow already exists + COMMENTS=$(curl -s -H "$HEADER" "$PR_COMMENTS_URL") + COMMENT_ID=$(echo "$COMMENTS" | jq -r '.[] | select(.user.login == "github-actions[bot]" and (.body | startswith("## Docs preview URL"))) | .id') + + # Check if we have a comment ID, if so, update it, otherwise create a new one + if [[ "$COMMENT_ID" ]]; then + # Update existing comment + curl -s -X PATCH -H "$HEADER" -H "Content-Type: application/json" -d "{\"body\": \"$COMMENT_BODY\"}" "https://api.github.com/repos/${{ github.repository }}/issues/comments/$COMMENT_ID" + else + # Post new comment + curl -s -X POST -H "$HEADER" -H "Content-Type: application/json" -d "{\"body\": \"$COMMENT_BODY\"}" "$PR_COMMENTS_URL" + fi + + if [ $status -ne 0 ]; then + exit $status + fi simple-installation: name: Simple installation @@ -175,7 +183,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: 'pip' @@ -183,13 +191,7 @@ jobs: - run: echo WEEK=$(date +%V) >>$GITHUB_ENV shell: bash - # - uses: hynek/setup-cached-uv@v1 - # with: - # cache-suffix: -simple-install-${{ matrix.python-version }}-${{ env.WEEK }} - - name: Install library run: | pip install ".[ml]" pytest pytest tests/pipelines/test_pipelines.py -# uv venv -# uv pip install . diff --git a/.gitignore b/.gitignore index bf0d160a0..3432f0519 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ site/ *.cpp *.so *.c +public/ # Unit test / coverage reports htmlcov/ @@ -71,3 +72,4 @@ _build/ docs/reference docs/changelog.md docs/contributing.md +.vercel diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfaa2cd70..efdf4e9ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: # ruff - repo: https://github.com/charliermarsh/ruff-pre-commit # Ruff version. 
- rev: 'v0.6.4' + rev: 'v0.9.6' hooks: - id: ruff args: ['--config', 'pyproject.toml', '--fix', '--show-fixes'] diff --git a/changelog.md b/changelog.md index d1121ef2d..836100648 100644 --- a/changelog.md +++ b/changelog.md @@ -5,13 +5,24 @@ ### Added - Support for numpy>2.0, and formal support for Python 3.11 and Python 3.12 +- Expose the defaults patterns of `eds.negation`, `eds.hypothesis`, `eds.family`, `eds.history` and `eds.reported_speech` under a `eds.negation.default_patterns` attribute +- Added a `context_getter` SpanGetter argument to the `eds.matcher` class to only retrieve entities inside the spans returned by the getter +- Added a `filter_expr` parameter to scorers to filter the documents to score +- Added a new `required` field to `eds.contextual_matcher` assign patterns to only match if the required field has been found, and an `include` parameter (similar to `exclude`) to search for required patterns without assigning them to the entity +- Added context strings (e.g., "words[0:5] | sent[0:1]") to the `eds.contextual_matcher` component to allow for more complex patterns in the selection of the window around the trigger spans. +- Include and exclude patterns in the contextual matcher now dismiss matches that occur inside the anchor pattern (e.g. "anti" exclude pattern for anchor pattern "antibiotics" will not match the "anti" part of "antibiotics") +- Pull Requests will now build a public accessible preview of the docs + +### Changed +- Improve the contextual matcher documentation. ### Fixed - `edsnlp.package` now correctly detect if a project uses an old-style poetry pyproject or a PEP621 pyproject.toml. - PEP621 projects containing nested directories (e.g., "my_project/pipes/foo.py") are now supported. - Try several paths to find current pip executable -- Compatibility with Optuna 4.3.0 +- The parameter "value_extract" of `eds.score` now correctly handles lists of patterns. 
+- "Zero variance error" when computing param tuning importance are now catched and converted as a warning ## v0.16.0 (2025-0.3-26) diff --git a/docs/assets/fragments/alcohol-examples.md b/docs/assets/fragments/alcohol-examples.md index 5a5a81279..f563040a3 100644 --- a/docs/assets/fragments/alcohol-examples.md +++ b/docs/assets/fragments/alcohol-examples.md @@ -73,12 +73,12 @@ spans = doc.spans["alcohol"] spans - # Out: [Alcoolism non sevré] + # Out: [Alcoolisme non sevré] span = spans[0] - span._.detailed_status - # Out: None # "sevré" is negated, so no "ABTINENCE" status + span._.detailed_status # "sevré" is negated, so no "ABTINENCE" status + # Out: None ``` @@ -90,7 +90,7 @@ spans = doc.spans["alcohol"] spans - # Out: [Alcool: 0] + # Out: [Alcool] span = spans[0] diff --git a/docs/assets/fragments/peptic-ulcer-disease-examples.md b/docs/assets/fragments/peptic-ulcer-disease-examples.md index c2a7ac52f..2f1a793ee 100644 --- a/docs/assets/fragments/peptic-ulcer-disease-examples.md +++ b/docs/assets/fragments/peptic-ulcer-disease-examples.md @@ -41,7 +41,7 @@ spans = doc.spans["peptic_ulcer_disease"] spans - # Out: [ulcères] + # Out: [gastrique: blabla blabla blabla blabla blabla quelques ulcères] span = spans[0] diff --git a/docs/assets/fragments/tobacco-examples.md b/docs/assets/fragments/tobacco-examples.md index 80f65ff62..8e7d29c94 100644 --- a/docs/assets/fragments/tobacco-examples.md +++ b/docs/assets/fragments/tobacco-examples.md @@ -66,7 +66,7 @@ spans = doc.spans["tobacco"] spans - # Out: [Tabac: 0] + # Out: [Tabac] span = spans[0] @@ -77,7 +77,7 @@ # Out: True span._.assigned - # Out: {'zero_after': [0]} + # Out: {'zero_after': 0} ``` diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index ef6c443e4..da8fe9706 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -190,3 +190,20 @@ a.discrete-link { font-size: 1rem; align-content: center; } + +.doc-param-details .subdoc { + padding: 0; + box-shadow: none; + border-color: var(--md-typeset-table-color); +} + +.doc-param-details .subdoc > div > div > div> table { + padding: 0; + box-shadow: none; + border: none; +} + +.doc-param-details .subdoc > summary { + margin: 0; + font-weight: normal; +} diff --git a/docs/pipes/core/contextual-matcher.md b/docs/pipes/core/contextual-matcher.md index 30f4afb25..9f81cbb6b 100644 --- a/docs/pipes/core/contextual-matcher.md +++ b/docs/pipes/core/contextual-matcher.md @@ -1,148 +1,12 @@ - # Contextual Matcher {: #edsnlp.pipes.core.contextual_matcher.factory.create_component } -During feature extraction, it may be necessary to search for additional patterns in their neighborhood, namely: - -- patterns to discard irrelevant entities -- patterns to enrich these entities and store some information - -For example, to extract mentions of non-benign cancers, we need to discard all extractions that mention "benin" in their immediate neighborhood. -Although such a filtering is feasible using a regular expression, it essentially requires modifying each of the regular expressions. - -The ContextualMatcher allows to perform this extraction in a clear and concise way. - -## The configuration file - -The whole ContextualMatcher pipeline component is basically defined as a list of **pattern dictionaries**. -Let us see step by step how to build such a list using the example stated just above. - -### a. Finding mentions of cancer - -To do this, we can build either a set of `terms` or a set of `regex`. 
`terms` will be used to search for exact matches in the text. While less flexible, -it is faster than using regex. In our case we could use the following lists (which are of course absolutely not exhaustives): - -```python -terms = [ - "cancer", - "tumeur", -] - -regex = [ - r"adeno(carcinom|[\s-]?k)", - "neoplas", - "melanom", -] -``` - -Maybe we want to exclude mentions of benign cancers: - -```python -benign = "benign|benin" -``` - -### b. Find mention of a *stage* and extract its value +EDS-NLP provides simple pattern matchers like `eds.matcher` to extract regular expressions, specific phrases, or perform lexical similarity matching on documents. However, certain use cases require examining the context around matched entities to filter out irrelevant matches or enrich them with additional information. For example, to extract mentions of malignant cancers, we need to exclude matches that have “benin” mentioned nearby : `eds.contextual_matcher` was built to address such needs. -For this we will forge a RegEx with one capturing group (basically a pattern enclosed in parentheses): +## Example -```python -stage = "stade (I{1,3}V?|[1234])" -``` +The following example demonstrates how to configure and use `eds.contextual_matcher` to extract mentions of solid cancers and lymphomas, while filtering out irrelevant mentions (e.g., benign tumors) and enriching entities with contextual information such as stage or metastasis status. -This will extract stage between 1 and 4 - -We can add a second regex to try to capture if the cancer is in a metastasis stage or not: - -```python -metastase = "(metasta)" -``` - -### c. The complete configuration - -We can now put everything together: - -```python -cancer = dict( - source="Cancer solide", - regex=regex, - terms=terms, - regex_attr="NORM", - exclude=dict( - regex=benign, - window=3, - ), - assign=[ - dict( - name="stage", - regex=stage, - window=(-10, 10), - replace_entity=False, - reduce_mode=None, - ), - dict( - name="metastase", - regex=metastase, - window=10, - replace_entity=False, - reduce_mode="keep_last", - ), - ], -) -``` - -Here the configuration consists of a single dictionary. We might want to also include lymphoma in the matcher: - -```python -lymphome = dict( - source="Lymphome", - regex=["lymphom", "lymphangio"], - regex_attr="NORM", - exclude=dict( - regex=["hodgkin"], # (1) - window=3, - ), -) -``` - -1. We are excluding "Lymphome de Hodgkin" here - -In this case, the configuration can be concatenated in a list: - -```python -patterns = [cancer, lymphome] -``` - -## Available parameters for more flexibility - -3 main parameters can be used to refine how entities will be formed - -### The `include_assigned` parameter - -Following the previous example, you might want your extracted entities to **include**, if found, the cancer stage and the metastasis status. This can be achieved by setting `include_assigned=True` in the pipe configuration. - -For instance, from the sentence "Le patient a un cancer au stade 3", the extracted entity will be: - -- "cancer" if `include_assigned=False` -- "cancer au stade 3" if `include_assigned=True` - -### The `reduce_mode` parameter - -It may happen that an assignment matches more than once. For instance, in the (nonsensical) sentence "Le patient a un cancer au stade 3 et au stade 4", both "stade 3" and "stade 4" will be matched by the `stage` assign key. Depending on your use case, you may want to keep all the extractions, or just one. 
- -- If `reduce_mode=None` (default), all extractions are kept in a list -- If `reduce_mode="keep_first"`, only the extraction closest to the main matched entity will be kept (in this case, it would be "stade 3" since it is the closest to "cancer") -- If `reduce_mode=="keep_last"`, only the furthest extraction is kept. - -### The `replace_entity` parameter - -This parameter can be se to `True` **only for a single assign key per dictionary**. This limitation comes from the purpose of this parameter: If set to `True`, the corresponding `assign` key will be returned as the entity, instead of the match itself. For clarity, let's take the same sentence "Le patient a un cancer au stade 3" as an example: - -- if `replace_entity=True` in the `stage` assign key, then the extracted entity will be "stade 3" instead of "cancer" -- if `replace_entity=False` for every assign key, the returned entity will be, as expected, "cancer" - -**Please notice** that with `replace_entity` set to True, if the correponding assign key matches nothing, the entity will be discarded. - - -## Examples +Let's dive in with the full code example: ```python import edsnlp, edsnlp.pipes as eds @@ -153,14 +17,71 @@ nlp.add_pipe(eds.sentences()) nlp.add_pipe(eds.normalizer()) nlp.add_pipe( eds.contextual_matcher( - patterns=patterns, + patterns=[ + dict( + terms=["cancer", "tumeur"], # (1)! + regex=[r"adeno(carcinom|[\s-]?k)", "neoplas", "melanom"], # (2)! + regex_attr="NORM", # (3)! + exclude=dict( + regex="benign|benin", # (4)! + window=3, # (5)! + ), + assign=[ + dict( + name="stage", # (6)! + regex="stade (I{1,3}V?|[1234])", # (7)! + window="words[-10:10]", # (8)! + replace_entity=False, # (9)! + reduce_mode=None, # (10)! + ), + dict( + name="metastase", # (11)! + regex="(metasta)", # (12)! + window=10, # (13)! + replace_entity=False, # (14)! + reduce_mode="keep_last", # (15)! + ), + ], + source="Cancer solide", # (16)! + ), + dict( + regex=["lymphom", "lymphangio"], # (17)! + regex_attr="NORM", # (18)! + exclude=dict( + regex=["hodgkin"], # (19)! + window=3, # (20)! + ), + source="Lymphome", # (21)! + ), + ], label="cancer", ), ) ``` -Let us see what we can get from this pipeline with a few examples - +1. Exact match terms (faster than regex, but less flexible) +2. Regex for flexible matching +3. Apply regex on normalized text +4. Regex to exclude benign mentions +5. Window size for exclusion check +6. Extract cancer stage +7. Stage regex pattern +8. Window range for stage extraction. Visit the documentation of [ContextWindow][edsnlp.utils.span_getters.ContextWindow] for more information about this syntax. +9. Do not use these matches as replacement for the anchor (default behavior) +10. Keep all matches +11. Detect metastasis +12. Regex for metastasis detection +13. Window size for detection +14. Keep main entity +15. Keep furthest extraction +16. Optional source label for solid tumor. This can be useful to know which pattern matched the entity. +17. Regex patterns for lymphoma +18. Apply regex on normalized text +19. Exclude Hodgkin lymphoma +20. Window size for exclusion +21. Optional source label for lymphoma. This can be useful to know which pattern matched the entity. 
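The `window` values above (annotations 8 and 13) accept several forms: a plain integer, a `(start, end)` tuple, or a context string such as `"words[-10:10]"`. Below is a minimal sketch of the same `stage` assign pattern written with the combined context string quoted in the changelog (`"words[0:5] | sent[0:1]"`); the precise semantics of combining word and sentence windows are defined by [ContextWindow][edsnlp.utils.span_getters.ContextWindow], so this fragment is illustrative only:

```python
# Hypothetical variant of the "stage" assign pattern using a context string
# that combines a word window with the anchor's sentence (syntax taken from
# the changelog example; check the ContextWindow documentation for details).
dict(
    name="stage",
    regex="stade (I{1,3}V?|[1234])",
    window="words[0:5] | sent[0:1]",
    replace_entity=False,
    reduce_mode=None,
)
```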
+ +Let's explore some examples using this pipeline: === "Simple match" @@ -181,7 +102,7 @@ Let us see what we can get from this pipeline with a few examples === "Exclusion rule" - Let us check that when a *benign* mention is present, the extraction is excluded: + Check exclusion with a benign mention: ```python txt = "Le patient a eu un cancer relativement bénin il y a 5 ans" @@ -193,135 +114,51 @@ Let us see what we can get from this pipeline with a few examples === "Extracting additional infos" - All informations extracted from the provided `assign` configuration can be found in the `assigned` attribute - under the form of a dictionary: + Additional information extracted via `assign` configurations is available in the `assigned` attribute: ```python txt = "Le patient a eu un cancer de stade 3." doc = nlp(txt) - doc.ents[0]._.assigned - # Out: {'stage': '3'} + doc.ents[0]._.assigned # (1)! + # Out: {'stage': ['3']} ``` -However, most of the configuration is provided in the `patterns` key, as a **pattern dictionary** or a **list of pattern dictionaries** - -## The pattern dictionary - -### Description + 1. We get a list for 'stage' because `reduce_mode` is set to `None` (default). If you want to keep only the first or last match, set `reduce_mode="keep_first"` or `reduce_mode="keep_last"`. -A patterr is a nested dictionary with the following keys: +## Better control over the final extracted entities -=== "`source`" +Three main parameters refine how entities are extracted: - A label describing the pattern +#### `include_assigned` -=== "`regex`" +Following the previous example, if you want extracted entities to include the cancer stage or metastasis status (if found), set `include_assigned=True` in the pipe configuration. - A single Regex or a list of Regexes +For instance, from the sentence "Le patient a un cancer au stade 3": -=== "`regex_attr`" +- If `include_assigned=False`, the extracted entity is "cancer" +- If `include_assigned=True`, the extracted entity is "cancer au stade 3" - An attributes to overwrite the given `attr` when matching with Regexes. +#### `reduce_mode` -=== "`terms`" +Sometimes, an assignment matches multiple times. For example, in the sentence "Le patient a un cancer au stade 3 et au stade 4", both "stade 3" and "stade 4" match the `stage` key. Depending on your use case: - A single term or a list of terms (for exact matches) +- `reduce_mode=None` (default): Keeps all matched extractions in a list +- `reduce_mode="keep_first"`: Keeps only the extraction closest to the main matched entity ("stade 3" in this case) +- `reduce_mode="keep_last"`: Keeps only the furthest extraction -=== "`exclude`" +#### `replace_entity` - A dictionary (or list of dictionaries) to define exclusion rules. Exclusion rules are given as Regexes, and if a - match is found in the surrounding context of an extraction, the extraction is removed. Each dictionary should have the following keys: +This parameter can be set to `True` **for only one assign key per dictionary**. If set to `True`, the matched assignment replaces the main entity. - === "`window`" +Example using "Le patient a un cancer au stade 3": - Size of the context to use (in number of words). 
You can provide the window as: +- With `replace_entity=True` for the `stage` key, the entity extracted is "stade 3" +- With `replace_entity=False`, the entity extracted remains "cancer" - - A positive integer, in this case the used context will be taken **after** the extraction - - A negative integer, in this case the used context will be taken **before** the extraction - - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction +**Note**: With `replace_entity=True`, if the corresponding assign key matches nothing, the entity is discarded. - === "`regex`" - - A single Regex or a list of Regexes. - -=== "`assign`" - - A dictionary to refine the extraction. Similarily to the `exclude` key, you can provide a dictionary to - use on the context **before** and **after** the extraction. - - === "`name`" - - A name (string) - - === "`window`" - - Size of the context to use (in number of words). You can provide the window as: - - - A positive integer, in this case the used context will be taken **after** the extraction - - A negative integer, in this case the used context will be taken **before** the extraction - - A tuple of integers `(start, end)`, in this case the used context will be the snippet from `start` tokens before the extraction to `end` tokens after the extraction - - === "`regex`" - - A dictionary where keys are labels and values are **Regexes with a single capturing group** - - === "`replace_entity`" - - If set to `True`, the match from the corresponding assign key will be used as entity, instead of the main match. See [this paragraph][the-replace_entity-parameter] - - === "`reduce_mode`" - - Set how multiple assign matches are handled. See the documentation of the [`reduce_mode` parameter][the-reduce_mode-parameter] - -### A full pattern dictionary example - -```python -dict( - source="AVC", - regex=[ - "accidents? vasculaires? cerebr", - ], - terms="avc", - regex_attr="NORM", - exclude=[ - dict( - regex=["service"], - window=3, - ), - dict( - regex=[" a "], - window=-2, - ), - ], - assign=[ - dict( - name="neo", - regex=r"(neonatal)", - expand_entity=True, - window=3, - ), - dict( - name="trans", - regex="(transitoire)", - expand_entity=True, - window=3, - ), - dict( - name="hemo", - regex=r"(hemorragique)", - expand_entity=True, - window=3, - ), - dict( - name="risk", - regex=r"(risque)", - expand_entity=False, - window=-3, - ), - ], -) -``` +The primary configuration is provided in the `patterns` key as either a **pattern dictionary** or a **list of pattern dictionaries**. ::: edsnlp.pipes.core.contextual_matcher.factory.create_component options: @@ -329,4 +166,4 @@ dict( ## Authors and citation -The `eds.matcher` pipeline component was developed by AP-HP's Data Science team. +The `eds.contextual_matcher` pipeline component was developed by AP-HP's Data Science team. 
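Because `include_assigned` and `replace_entity` are easy to confuse, here is a minimal sketch of how `include_assigned` plays out on the documentation's example sentence. It reuses the `stage` assign pattern from the example above; the expected entity boundaries follow from the explanation in this section, but treat the snippet as an illustration rather than a verbatim excerpt of the official docs:

```python
import edsnlp, edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.sentences())
nlp.add_pipe(eds.normalizer())
nlp.add_pipe(
    eds.contextual_matcher(
        patterns=dict(
            source="Cancer solide",
            terms=["cancer", "tumeur"],
            regex_attr="NORM",
            assign=[
                dict(
                    name="stage",
                    regex="stade (I{1,3}V?|[1234])",
                    window="words[-10:10]",
                    replace_entity=False,  # True would return "stade 3" as the entity instead
                    reduce_mode="keep_first",
                ),
            ],
        ),
        include_assigned=True,  # expand the entity to include the matched stage
        label="cancer",
    ),
)

doc = nlp("Le patient a un cancer au stade 3")
# With include_assigned=True the entity should span "cancer au stade 3";
# with the default include_assigned=False it would only span "cancer".
print(doc.ents[0].text)
```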
diff --git a/docs/tutorials/detecting-dates.md b/docs/tutorials/detecting-dates.md index e3b576828..e61f3e3c8 100644 --- a/docs/tutorials/detecting-dates.md +++ b/docs/tutorials/detecting-dates.md @@ -160,10 +160,10 @@ for i, date in enumerate(doc.spans["dates"]): note_datetime=note_datetime, infer_from_context=False, tz=None ), ) - # Out: 0 - 12 avril - None - # Out: 1 - il y a trois jours - 1999-08-24 00:00:00 - # Out: 2 - l'année dernière - 1998-08-27 00:00:00 - # Out: 3 - mai 1995 - None +# Out: 0 - 12 avril - None +# Out: 1 - il y a trois jours - 1999-08-24 00:00:00 +# Out: 2 - l'année dernière - 1998-08-27 00:00:00 +# Out: 3 - mai 1995 - None for i, date in enumerate(doc.spans["dates"]): @@ -179,17 +179,17 @@ for i, date in enumerate(doc.spans["dates"]): default_day=15, ), ) - # Out: 0 - 12 avril - 1999-04-12T00:00:00 - # Out: 1 - il y a trois jours - 1999-08-24 00:00:00 - # Out: 2 - l'année dernière - 1998-08-27 00:00:00 - # Out: 3 - mai 1995 - 1995-05-15T00:00:00 +# Out: 0 - 12 avril - 1999-04-12 00:00:00 +# Out: 1 - il y a trois jours - 1999-08-24 00:00:00 +# Out: 2 - l'année dernière - 1998-08-27 00:00:00 +# Out: 3 - mai 1995 - 1995-05-15 00:00:00 ``` As a first heuristic, let's consider that an entity can be linked to a date if the two are in the same sentence. In the case where multiple dates are present, we'll select the closest one. ```python title="utils.py" -from edsnlp.tokens import Span +from spacy.tokens import Span from typing import List, Optional @@ -219,9 +219,8 @@ def get_event_date(ent: Span) -> Optional[Span]: We can apply this simple function: -```{ .python .no-check } +```python import edsnlp, edsnlp.pipes as eds -from utils import get_event_date from datetime import datetime nlp = edsnlp.blank("eds") @@ -247,7 +246,9 @@ for ent in doc.ents: if ent.label_ != "admission": continue date = get_event_date(ent) - print(f"{ent.text:<20}{date.text:<20}{date._.date.to_datetime(now).strftime('%d/%m/%Y'):<15}{date._.date.to_duration(now)}") + print( + f"{ent.text:<20}{date.text:<20}{date._.date.to_datetime(now).strftime('%d/%m/%Y'):<15}{date._.date.to_duration(now)}" + ) # Out: admis 12 avril 12/04/2023 21 weeks 4 days 6 hours 3 minutes 26 seconds # Out: pris en charge l'année dernière 10/09/2022 -1 year ``` diff --git a/docs/tutorials/reason.md b/docs/tutorials/reason.md index e87e155f4..bc06eaef5 100644 --- a/docs/tutorials/reason.md +++ b/docs/tutorials/reason.md @@ -63,8 +63,7 @@ reason._.is_reason ```python # ↑ Omitted code above ↑ -entities = reason._.ents_reason # (1) -for e in entities: +for e in reason._.ents_reason: # (1) print( "Entity:", e.text, diff --git a/docs/utilities/tests/blocs.md b/docs/utilities/tests/blocs.md index 42a6edb0a..19f81e1b7 100644 --- a/docs/utilities/tests/blocs.md +++ b/docs/utilities/tests/blocs.md @@ -1,6 +1,6 @@ # Testing Code Blocs -We created a utility that scans through markdown files, extracts code blocs and executes them to check that everything is indeed functional. +We created a utility that scans through the documentation, extracts code blocs and executes them to check that everything is indeed functional. There is more! Whenever the utility comes across an example (denoted by `# Out: `, see example below), an `assert` statement is dynamically added to the snippet to check that the output matches. 
@@ -22,12 +22,12 @@ v = a assert repr(v) == "1" ``` -We can disable code checking for a specific code bloc by adding `` above it: +We can disable code checking for a specific code bloc by adding a `.no-check` class to the code bloc: ````md -```{ .python .no-check } +```python { .no-check } test = undeclared_function(42) ``` ```` -See the [dedicated reference][edsnlp.utils.blocs.check_md_file] for more information +Visit the source code of [test_docs.py](https://github.com/aphp/edsnlp/blob/master/tests/test_docs.py) for more information. diff --git a/edsnlp/matchers/regex.py b/edsnlp/matchers/regex.py index 681788535..4c1921238 100644 --- a/edsnlp/matchers/regex.py +++ b/edsnlp/matchers/regex.py @@ -1,6 +1,6 @@ import re from bisect import bisect_left, bisect_right -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from loguru import logger from spacy.tokens import Doc, Span @@ -465,7 +465,7 @@ def __call__( doclike: Union[Doc, Span], as_spans=False, return_groupdict=False, - ) -> Union[Span, Tuple[Span, Dict[str, Any]]]: + ) -> Iterator[Union[Span, Tuple[Span, Dict[str, Any]]]]: """ Performs matching. Yields matches. diff --git a/edsnlp/package.py b/edsnlp/package.py index 3c0c837d0..60dc4b8b1 100644 --- a/edsnlp/package.py +++ b/edsnlp/package.py @@ -337,11 +337,9 @@ def __init__( poetry = pyproject["tool"]["poetry"] # Extract packages - poetry_bin_path = ( - subprocess.run(["which", "poetry"], stdout=subprocess.PIPE) - .stdout.decode() - .strip() - ) + poetry_bin_path = shutil.which("poetry") + if poetry_bin_path is None: + raise RuntimeError("Poetry is not installed or not found in PATH.") python_executable = Path(poetry_bin_path).read_text().split("\n")[0][2:] result = subprocess.run( [ @@ -407,9 +405,9 @@ def __init__( pass if "version" in constraint: dep_version = constraint.pop("version") - assert not dep_version.startswith( - "^" - ), "Packaging models with ^ dependencies is not supported" + assert not dep_version.startswith("^"), ( + "Packaging models with ^ dependencies is not supported" + ) dep += ( "" if dep_version == "*" @@ -421,9 +419,9 @@ def __init__( dep += f"; {constraint.pop('markers')}" except KeyError: pass - assert ( - not constraint - ), f"Unsupported constraints for dependency {dep_name}: {constraint}" + assert not constraint, ( + f"Unsupported constraints for dependency {dep_name}: {constraint}" + ) if dep_name == "python": new_pyproject["project"]["requires-python"] = dep.replace( "python", "" diff --git a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py index 88db08e41..a036b2808 100644 --- a/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py +++ b/edsnlp/pipes/core/contextual_matcher/contextual_matcher.py @@ -1,38 +1,22 @@ import re import warnings -from collections import defaultdict -from functools import lru_cache -from operator import attrgetter -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Generator, Iterable, Optional, Union -from confit import VisibleDeprecationWarning +from confit import VisibleDeprecationWarning, validate_arguments from loguru import logger from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.matchers.phrase import EDSPhraseMatcher from edsnlp.matchers.regex import RegexMatcher, create_span -from edsnlp.matchers.utils import get_text from edsnlp.pipes.base import BaseNERComponent, SpanSetterArg -from 
edsnlp.utils.collections import flatten_once -from edsnlp.utils.typing import cast +from edsnlp.utils.doc_to_text import get_text +from edsnlp.utils.span_getters import get_spans -from . import models - - -@lru_cache(64) -def get_window( - doclike: Union[Doc, Span], window: Tuple[int, int], limit_to_sentence: bool -): - start_limit = doclike.sent.start if limit_to_sentence else 0 - end_limit = doclike.sent.end if limit_to_sentence else len(doclike.doc) - - start = max(doclike.start + window[0], start_limit) - end = min(doclike.end + window[1], end_limit) - - return doclike.doc[start:end] +from .models import FullConfig, SingleAssignModel, SingleConfig +@validate_arguments() class ContextualMatcher(BaseNERComponent): """ Allows additional matching in the surrounding context of the main match group, @@ -44,8 +28,13 @@ class ContextualMatcher(BaseNERComponent): spaCy `Language` object. name : Optional[str] The name of the pipe - patterns : Union[Dict[str, Any], List[Dict[str, Any]]] - The configuration dictionary + patterns : FullConfig + ??? subdoc "The patterns to match" + + ::: edsnlp.pipes.core.contextual_matcher.models.SingleConfig + options: + only_parameters: "no-header" + show_toc: false assign_as_span : bool Whether to store eventual extractions defined via the `assign` key as Spans or as string @@ -75,7 +64,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: Optional[str] = "contextual_matcher", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]], + patterns: FullConfig, assign_as_span: bool = False, alignment_mode: str = "expand", attr: str = "NORM", @@ -87,7 +76,7 @@ def __init__( label: Optional[str] = None, span_setter: SpanSetterArg = {"ents": True}, ): - if label is None and label_name is not None: + if label is None and label_name is not None: # pragma: no cover warnings.warn( "`label_name` is deprecated, use `label` instead.", VisibleDeprecationWarning, @@ -104,136 +93,103 @@ def __init__( self.ignore_excluded = ignore_excluded self.ignore_space_tokens = ignore_space_tokens self.alignment_mode = alignment_mode - self.regex_flags = regex_flags + self.regex_flags: Union[re.RegexFlag, int] = regex_flags self.include_assigned = include_assigned - # Configuration parsing - patterns = cast(models.FullConfig, patterns) - self.patterns = {pattern.source: pattern for pattern in patterns} - - # Matchers for the anchors - self.phrase_matcher = EDSPhraseMatcher( - nlp.vocab, - attr=attr, - ignore_excluded=ignore_excluded, - ignore_space_tokens=ignore_space_tokens, - ) - self.regex_matcher = RegexMatcher( - attr=attr, - flags=regex_flags, - ignore_excluded=ignore_excluded, - ignore_space_tokens=ignore_space_tokens, - alignment_mode=alignment_mode, - ) - - self.phrase_matcher.build_patterns( - nlp=nlp, - terms={ - source: { - "patterns": p.terms, - } - for source, p in self.patterns.items() - }, - ) - self.regex_matcher.build_patterns( - regex={ - source: { - "regex": p.regex, - "attr": p.regex_attr, - "flags": p.regex_flags, + for pattern in patterns: + phrase_matcher = EDSPhraseMatcher( + nlp.vocab, + attr=attr, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + ) + phrase_matcher.build_patterns( + nlp=nlp, + terms={ + "terms": { + "patterns": pattern.terms, + } + }, + ) + pattern.phrase_matcher = phrase_matcher + + regex_matcher = RegexMatcher( + attr=attr, + flags=regex_flags, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + alignment_mode=alignment_mode, + ) + regex_matcher.build_patterns( + regex={ + "regex": { 
+ "regex": pattern.regex, + "attr": pattern.regex_attr, + "flags": pattern.regex_flags, + } } - for source, p in self.patterns.items() - } - ) - - self.exclude_matchers = defaultdict( - list - ) # Will contain all the exclusion matchers - self.assign_matchers = defaultdict(list) # Will contain all the assign matchers - - # Will contain the reduce mode (for each source and assign matcher) - self.reduce_mode = {} - - # Will contain the name of the assign matcher from which - # entity will be replaced (for each source) - self.replace_key = {} - - for source, p in self.patterns.items(): - p = p.model_dump() - - for exclude in p["exclude"]: - exclude_matcher = RegexMatcher( - attr=exclude["regex_attr"] or p["regex_attr"] or self.attr, - flags=exclude["regex_flags"] - or p["regex_flags"] - or self.regex_flags, - ignore_excluded=ignore_excluded, - ignore_space_tokens=ignore_space_tokens, - alignment_mode="expand", - ) - - exclude_matcher.build_patterns(regex={"exclude": exclude["regex"]}) - - self.exclude_matchers[source].append( - dict( - matcher=exclude_matcher, - window=exclude["window"], - limit_to_sentence=exclude["limit_to_sentence"], + ) + pattern.regex_matcher = regex_matcher + + for exclude in pattern.exclude: + if exclude.regex is not None: + matcher = RegexMatcher( + attr=exclude.regex_attr or pattern.regex_attr or self.attr, + flags=exclude.regex_flags + or pattern.regex_flags + or self.regex_flags, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + alignment_mode="expand", ) - ) + matcher.build_patterns(regex={"exclude": exclude.regex}) + exclude.regex_matcher = matcher + + for include in pattern.include: + if include.regex is not None: + matcher = RegexMatcher( + attr=include.regex_attr or pattern.regex_attr or self.attr, + flags=include.regex_flags + or pattern.regex_flags + or self.regex_flags, + ignore_excluded=ignore_excluded, + ignore_space_tokens=ignore_space_tokens, + alignment_mode="expand", + ) + matcher.build_patterns(regex={"include": include.regex}) + include.regex_matcher = matcher - replace_key = None + # replace_key = None - for assign in p["assign"]: - assign_matcher = RegexMatcher( - attr=assign["regex_attr"] or p["regex_attr"] or self.attr, - flags=assign["regex_flags"] or p["regex_flags"] or self.regex_flags, + for assign in pattern.assign: + assign.regex_matcher = RegexMatcher( + attr=assign.regex_attr or pattern.regex_attr or self.attr, + flags=assign.regex_flags or pattern.regex_flags or self.regex_flags, ignore_excluded=ignore_excluded, ignore_space_tokens=ignore_space_tokens, alignment_mode=alignment_mode, span_from_group=True, ) - - assign_matcher.build_patterns( - regex={assign["name"]: assign["regex"]}, + assign.regex_matcher.build_patterns( + regex={assign.name: assign.regex}, ) - self.assign_matchers[source].append( - dict( - name=assign["name"], - matcher=assign_matcher, - window=assign["window"], - limit_to_sentence=assign["limit_to_sentence"], - replace_entity=assign["replace_entity"], - reduce_mode=assign["reduce_mode"], - ) - ) - - if assign["replace_entity"]: - # We know that there is only one assign name - # with `replace_entity==True` - # from PyDantic validation - replace_key = assign["name"] - - self.replace_key[source] = replace_key - - self.reduce_mode[source] = { - d["name"]: d["reduce_mode"] for d in self.assign_matchers[source] - } - - self.set_extensions() + self.patterns = patterns def set_extensions(self) -> None: + """ + Define the extensions used by the component + """ super().set_extensions() if not 
Span.has_extension("assigned"): Span.set_extension("assigned", default=dict()) if not Span.has_extension("source"): Span.set_extension("source", default=None) - def filter_one(self, span: Span) -> Span: + def filter_one(self, span: Span, pattern) -> Optional[Span]: """ - Filter extracted entity based on the "exclusion filter" mentioned - in the configuration + Filter extracted entity based on the exclusion and inclusion filters of + the configuration. Parameters ---------- @@ -245,32 +201,51 @@ def filter_one(self, span: Span) -> Span: Optional[Span] None if the span was filtered, the span else """ - source = span.label_ to_keep = True - for matcher in self.exclude_matchers[source]: - window = matcher["window"] - limit_to_sentence = matcher["limit_to_sentence"] - snippet = get_window( - doclike=span, - window=window, - limit_to_sentence=limit_to_sentence, - ) + for exclude in pattern.exclude: + snippet = exclude.window(span) if ( - next( - matcher["matcher"](snippet, as_spans=True), - None, + exclude.regex_matcher is not None + and any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in exclude.regex_matcher(snippet, as_spans=True) + ) + or exclude.span_getter is not None + and any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in get_spans(snippet, exclude.span_getter) + ) + ): + to_keep = False + break + + for include in pattern.include: + snippet = include.window(span) + + if ( + include.regex_matcher is not None + and not any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in include.regex_matcher(snippet, as_spans=True) + ) + or include.span_getter is not None + and not any( + # check that it isn't inside in the anchor span + not (s.start >= span.start and s.end <= span.end) + for s in get_spans(snippet, include.span_getter) ) - is not None ): to_keep = False - logger.trace(f"Entity {span} was filtered out") break if to_keep: return span - def assign_one(self, span: Span) -> Span: + def assign_one(self, span: Span, pattern) -> Iterable[Span]: """ Get additional information in the context of each entity. 
This function will populate two custom attributes: @@ -285,159 +260,164 @@ def assign_one(self, span: Span) -> Span: Returns ------- - Span - Span with additional information + List[Span] + Spans with additional information """ - - if span is None: - yield from [] - return - - source = span.label_ - assigned_dict = models.AssignDict(reduce_mode=self.reduce_mode[source]) replace_key = None - for matcher in self.assign_matchers[source]: - attr = self.patterns[source].regex_attr or matcher["matcher"].default_attr - window = matcher["window"] - limit_to_sentence = matcher["limit_to_sentence"] - replace_entity = matcher["replace_entity"] # Boolean - - snippet = get_window( - doclike=span, - window=window, - limit_to_sentence=limit_to_sentence, - ) - - # Getting the matches - assigned_list = list(matcher["matcher"].match(snippet)) - - assigned_list = [ - (span, span, matcher["matcher"].regex[0][0]) - if not match.groups() - else ( - span, - create_span( - doclike=snippet, - start_char=match.start(0), - end_char=match.end(0), - key=matcher["matcher"].regex[0][0], - attr=matcher["matcher"].regex[0][2], - alignment_mode=matcher["matcher"].regex[0][5], - ignore_excluded=matcher["matcher"].regex[0][3], - ignore_space_tokens=matcher["matcher"].regex[0][4], - ), - matcher["matcher"].regex[0][0], - ) - for (span, match) in assigned_list - ] - - # assigned_list now contains tuples with - # - the first element being the span extracted from the group - # - the second element being the full match + # Assigned matches is a list of tuples, each containing: + # - the span matched by the "assign" regex (or returned by the span getter) + # - the span corresponding to the match group of the regex (or the full match, + # ie same as above) + assigned_dict = {} + reduce_modes = {} + attrs = {} + + for assign in pattern.assign: + assign: SingleAssignModel + window = assign.window + snippet = window(span) + reduce_modes[assign.name] = assign.reduce_mode + matcher: RegexMatcher = assign.regex_matcher + attrs[assign.name] = matcher.regex[0][2] + if matcher is not None: + # Getting the matches + matches = list(matcher.match(snippet)) + assigned = [ + (matched_span, matched_span) + if not re_match.groups() + else ( + matched_span, + create_span( + doclike=snippet, + start_char=re_match.start(0), + end_char=re_match.end(0), + key=matcher.regex[0][0], + attr=matcher.regex[0][2], + alignment_mode=matcher.regex[0][5], + ignore_excluded=matcher.regex[0][3], + ignore_space_tokens=matcher.regex[0][4], + ), + # matcher.regex[0][0], + ) + for (matched_span, re_match) in matches + ] + if assign.span_getter is not None: + assigned = [ + (matched_span, matched_span) + for matched_span in get_spans(snippet, assign.span_getter) + # if matched_span.start >= snippet.start + # and matched_span.end <= snippet.end + ] + + if assign.required and not assigned: + logger.trace(f"Entity {span} was filtered out") + return [] - if not assigned_list: # No match was found + if len(assigned) == 0: continue - for assigned in assigned_list: - if assigned is None: - continue - if replace_entity: - replace_key = assigned[2] - - # Using he overrid `__setitem__` method from AssignDict here: - assigned_dict[assigned[2]] = { - "span": assigned[1], # Full span - "value_span": assigned[0], # Span of the group - "value_text": get_text( - assigned[0], - attr=attr, - ignore_excluded=self.ignore_excluded, - ), # Text of the group - } - logger.trace(f"Assign key {matcher['name']} matched on entity {span}") - if replace_key is None and self.replace_key[source] is 
not None: - # There should have been a replacement, but none was found - # So we discard the entity - return + if assign.replace_entity: + replace_key = assign.name + if assign.reduce_mode == "keep_first": # closest + assigned = [min(assigned, key=lambda e: abs(e[0].start - span.start))] + elif assign.reduce_mode == "keep_last": + assigned = [max(assigned, key=lambda e: abs(e[0].start - span.start))] - # Entity replacement - if replace_key is not None: - replacables = assigned_dict[replace_key]["span"] - kept_ents = ( - replacables if isinstance(replacables, list) else [replacables] - ).copy() + assigned_dict[assign.name] = assigned - if self.include_assigned: - # We look for the closest - closest = min( - kept_ents, - key=lambda e: abs(e.start - span.start), + # Several cases: + # 1. should_have_replacement and include_assigned is True + # -> pick closest assigned span where replace = True + # -> + if replace_key is not None: + replacements = sorted( + assigned_dict[replace_key], + key=lambda e: abs(e[0].start - span.start), + ) + assigned_dict[replace_key] = replacements + + ext = { + n: None + if reduce_modes[n] is not None and len(g) == 0 + else [s[0] for s in g][slice(None) if reduce_modes[n] is None else 0] + if self.assign_as_span + else [ + get_text( + s[0], + attr=attrs[n], + ignore_excluded=self.ignore_excluded, + ignore_space_tokens=self.ignore_space_tokens, ) - kept_ents.remove(closest) - - expandables = list( - flatten_once( - [ - a["span"] - for k, a in assigned_dict.items() - if k != replace_key - ] - ) - ) + [span, closest] + for s in g + ][slice(None) if reduce_modes[n] is None else 0] + for n, g in assigned_dict.items() + } - closest = Span( + if replace_key is None: + if self.include_assigned: + merged = [span, *(x[1] for name, g in assigned_dict.items() for x in g)] + span = Span( span.doc, - min(expandables, key=attrgetter("start")).start, - max(expandables, key=attrgetter("end")).end, + min(s.start for s in merged), + max(s.end for s in merged), span.label_, ) - - kept_ents.append(closest) - kept_ents.sort(key=attrgetter("start")) - - for replaced in kept_ents: - # Propagating attributes from the anchor - replaced._.source = source - replaced.label_ = self.label - + span._.source = pattern.source + span.label_ = self.label + span._.assigned = ext + new_spans = [span] else: - # Entity expansion - expandables = [ - s - for s in flatten_once([a["span"] for a in assigned_dict.values()]) - if s is not None - ] - - if self.include_assigned and expandables: + if self.include_assigned: + # we will merge spans from other assign groups + the main span + # to the closest "most central" assign span. 
+ [closest_replacement, *rest_replacements] = assigned_dict[replace_key] + other_spans = [ + x[1] + for name, g in assigned_dict.items() + if name != replace_key + for x in g + ] + merged = [closest_replacement[1], span, *other_spans] span = Span( span.doc, - min(s.start for s in expandables + [span] if s is not None), - max(s.end for s in expandables + [span] if s is not None), + min(s.start for s in merged), + max(s.end for s in merged), span.label_, ) + new_spans = [span, *(s[1] for s in rest_replacements)] + else: + new_spans = [x[1] for x in assigned_dict[replace_key]] + for idx, span in enumerate(new_spans): + span._.source = pattern.source + span.label_ = self.label + span._.assigned = { + k: v[idx] if ((k == replace_key) and reduce_modes[k] is None) else v + for k, v in ext.items() + } - span._.source = source - span.label_ = self.label - kept_ents = [span] - - key = "value_span" if self.assign_as_span else "value_text" - - for idx, e in enumerate(kept_ents): - e._.assigned = { - k: v[key][idx] - if ((k == replace_key) and self.reduce_mode[source][k] is None) - else v[key] - for k, v in assigned_dict.items() - } + return new_spans - yield from kept_ents + def process_one(self, span: Span, pattern: SingleConfig): + """ + Processes one span, applying both the filters and the assignments - def process_one(self, span): - filtered = self.filter_one(span) - yield from self.assign_one(filtered) + Parameters + ---------- + span: Span + Span object + pattern: SingleConfig + + Yields + ------ + span: + Filtered spans, with optional assignments + """ + span = self.filter_one(span, pattern) + if span is not None: + yield from self.assign_one(span, pattern) - def process(self, doc: Doc) -> List[Span]: + def process(self, doc: Doc) -> Generator[Span, None, None]: """ Process the document, looking for named entities. @@ -452,12 +432,18 @@ def process(self, doc: Doc) -> List[Span]: List of detected spans. 
""" - matches = self.phrase_matcher(doc, as_spans=True) - regex_matches = list(self.regex_matcher(doc, as_spans=True)) - - spans = (*matches, *regex_matches) - for span in spans: - yield from self.process_one(span) + for pattern in self.patterns: + for span in ( + *pattern.phrase_matcher(doc, as_spans=True), + *pattern.regex_matcher(doc, as_spans=True), + *( + get_spans(doc, pattern.span_getter) + if pattern.span_getter is not None + else [] + ), + ): + spans = list(self.process_one(span, pattern)) + yield from spans def __call__(self, doc: Doc) -> Doc: """ diff --git a/edsnlp/pipes/core/contextual_matcher/factory.py b/edsnlp/pipes/core/contextual_matcher/factory.py index 38badbb6f..a75422ad5 100644 --- a/edsnlp/pipes/core/contextual_matcher/factory.py +++ b/edsnlp/pipes/core/contextual_matcher/factory.py @@ -15,6 +15,6 @@ ) create_component = registry.factory.register( - "eds.contextual-matcher", - deprecated=["contextual-matcher"], + "eds.contextual_matcher", + deprecated=["eds.contextual-matcher", "contextual-matcher"], )(ContextualMatcher) diff --git a/edsnlp/pipes/core/contextual_matcher/models.py b/edsnlp/pipes/core/contextual_matcher/models.py index 4d8a6e1c5..544108ff7 100644 --- a/edsnlp/pipes/core/contextual_matcher/models.py +++ b/edsnlp/pipes/core/contextual_matcher/models.py @@ -1,167 +1,205 @@ import re -from typing import List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Union -import pydantic import regex -from pydantic import BaseModel, Extra +from pydantic import BaseModel from edsnlp.matchers.utils import ListOrStr -from edsnlp.utils.typing import Validated, cast +from edsnlp.utils.span_getters import ( + ContextWindow, + SentenceContextWindow, + SpanGetterArg, +) +from edsnlp.utils.typing import AsList Flags = Union[re.RegexFlag, int] -Window = Union[ - Tuple[int, int], - List[int], - int, -] try: - from pydantic import field_validator + from pydantic import field_validator, model_validator def validator(x, allow_reuse=True, pre=False): return field_validator(x, mode="before" if pre else "after") + + def root_validator(allow_reuse=True, pre=False): + return model_validator(mode="before" if pre else "after") + + except ImportError: - from pydantic import validator - - -def normalize_window(cls, v): - if isinstance(v, list): - assert ( - len(v) == 2 - ), "`window` should be a tuple/list of two integer, or a single integer" - v = tuple(v) - if isinstance(v, int): - assert v != 0, "The provided `window` should not be 0" - if v < 0: - return (v, 0) - if v > 0: - return (0, v) - assert v[0] < v[1], "The provided `window` should contain at least 1 token" - return v - - -class AssignDict(dict): - """ - Custom dictionary that overrides the __setitem__ method - depending on the reduce_mode - """ - - def __init__(self, reduce_mode: dict): - super().__init__() - self.reduce_mode = reduce_mode - self._setitem_ = self.__setitem_options__() - - def __missing__(self, key): - return ( - { - "span": [], - "value_span": [], - "value_text": [], - } - if self.reduce_mode[key] is None - else {} + from pydantic import root_validator, validator + + +def validate_window(cls, values): + if isinstance(values.get("regex"), str): + values["regex"] = [values["regex"]] + window = values.get("window") + if window is None or isinstance(window, (int, tuple, list)): + values["limit_to_sentence"] = True + window = values.get("window") + if window is not None: + values["window"] = ContextWindow.validate(window) + if values.get("limit_to_sentence"): + values["window"] = ( 
+ SentenceContextWindow(0, 0) & values.get("window") + if window is not None + else SentenceContextWindow(0, 0) ) - - def __setitem__(self, key, value): - self._setitem_[self.reduce_mode[key]](key, value) - - def __setitem_options__(self): - def keep_list(key, value): - old_values = self.__getitem__(key) - value["span"] = old_values["span"] + [value["span"]] - value["value_span"] = old_values["value_span"] + [value["value_span"]] - value["value_text"] = old_values["value_text"] + [value["value_text"]] - - dict.__setitem__(self, key, value) - - def keep_first(key, value): - old_values = self.__getitem__(key) - if ( - old_values.get("span") is None - or value["span"].start <= old_values["span"].start - ): - dict.__setitem__(self, key, value) - - def keep_last(key, value): - old_values = self.__getitem__(key) - if ( - old_values.get("span") is None - or value["span"].start >= old_values["span"].start - ): - dict.__setitem__(self, key, value) - - return { - None: keep_list, - "keep_first": keep_first, - "keep_last": keep_last, - } + return values class SingleExcludeModel(BaseModel): + """ + A dictionary to define exclusion rules. Exclusion rules are given as Regexes, and + if a match is found in the surrounding context of an extraction, the extraction is + removed. Note that only take a match into account if it is not inside the anchor + span. + + Parameters + ---------- + regex : ListOrStr + A single Regex or a list of Regexes + regex_attr : Optional[str] + An attributes to overwrite the given `attr` when matching with Regexes. + regex_flags : re.RegexFlag + Regex flags + span_getter : Optional[SpanGetterArg] + A span getter to pick the assigned spans from already extracted entities. + window : Optional[ContextWindow] + Context window to search for patterns around the anchor. Defaults to "sent" ( + i.e. the sentence of the anchor span). + """ + + span_getter: Optional[SpanGetterArg] = None regex: ListOrStr = [] - window: Window - limit_to_sentence: Optional[bool] = True - regex_flags: Optional[Flags] = None regex_attr: Optional[str] = None + regex_flags: Union[re.RegexFlag, int] = None - @validator("regex") - def exclude_regex_validation(cls, v): - if isinstance(v, str): - v = [v] - return v + limit_to_sentence: Optional[bool] = None + window: Optional[ContextWindow] = None + regex_matcher: Optional[Any] = None - _normalize_window = validator("window", allow_reuse=True)(normalize_window) - if pydantic.VERSION < "2": - model_dump = BaseModel.dict + validate_window = root_validator(pre=True, allow_reuse=True)(validate_window) -class ExcludeModel(Validated): - @classmethod - def validate(cls, v, config=None): - if not isinstance(v, list): - v = [v] - return [cast(SingleExcludeModel, x) for x in v] +class SingleIncludeModel(BaseModel): + """ + A dictionary to define inclusion rules. Inclusion rules are given as Regexes, and + if a match isn't found in the surrounding context of an extraction, the extraction + is removed. Note that only take a match into account if it is not inside the anchor + span. + + Parameters + ---------- + regex : ListOrStr + A single Regex or a list of Regexes + regex_attr : Optional[str] + An attributes to overwrite the given `attr` when matching with Regexes. + regex_flags : re.RegexFlag + Regex flags + span_getter : Optional[SpanGetterArg] + A span getter to pick the assigned spans from already extracted entities. + window : Optional[ContextWindow] + Context window to search for patterns around the anchor. Defaults to "sent" ( + i.e. the sentence of the anchor span). 
+ """ + + span_getter: Optional[SpanGetterArg] = None + regex: ListOrStr = [] + regex_attr: Optional[str] = None + regex_flags: Union[re.RegexFlag, int] = None + + limit_to_sentence: Optional[bool] = None + window: Optional[ContextWindow] = None + + regex_matcher: Optional[Any] = None + + validate_window = root_validator(pre=True, allow_reuse=True)(validate_window) + - if pydantic.VERSION < "2": - model_dump = BaseModel.dict +class ExcludeModel(AsList[SingleExcludeModel]): + """ + A list of `SingleExcludeModel` objects. If a single config is passed, + it will be automatically converted to a list of a single element. + """ + + +class IncludeModel(AsList[SingleIncludeModel]): + """ + A list of `SingleIncludeModel` objects. If a single config is passed, + it will be automatically converted to a list of a single element. + """ class SingleAssignModel(BaseModel): + """ + A dictionary to refine the extraction. Similarly to the `exclude` key, you can + provide a dictionary to use on the context **before** and **after** the extraction. + + Parameters + ---------- + span_getter : Optional[SpanGetterArg] + A span getter to pick the assigned spans from already extracted entities + in the doc. + regex : ListOrStr + A single Regex or a list of Regexes + regex_attr : Optional[str] + An attributes to overwrite the given `attr` when matching with Regexes. + regex_flags : re.RegexFlag + Regex flags + window : Optional[ContextWindow] + Context window to search for patterns around the anchor. Defaults to "sent" ( + i.e. the sentence of the anchor span). + replace_entity : Optional[bool] + If set to `True`, the match from the corresponding assign key will be used as + entity, instead of the main match. + See [this paragraph][replace_entity] + reduce_mode : Optional[Flags] + Set how multiple assign matches are handled. See the documentation of the + [`reduce_mode` parameter][reduce_mode] + required : Optional[str] + If set to `True`, the assign key must match for the extraction to be kept. If + it does not match, the extraction is discarded. 
+    name : str
+        The name of the assign pattern (used as the key under which matches are
+        stored).
+    """
+
     name: str
-    regex: str
-    window: Window
-    limit_to_sentence: Optional[bool] = True
-    regex_flags: Optional[Flags] = None
+
+    span_getter: Optional[SpanGetterArg] = None
+    regex: ListOrStr = []
     regex_attr: Optional[str] = None
+    regex_flags: Union[re.RegexFlag, int] = None
+
+    limit_to_sentence: Optional[bool] = None
+    window: Optional[ContextWindow] = None
     replace_entity: bool = False
     reduce_mode: Optional[str] = None
+    required: Optional[bool] = False
 
-    @validator("regex")
+    regex_matcher: Optional[Any] = None
+
+    @validator("regex", allow_reuse=True)
     def check_single_regex_group(cls, pat):
-        compiled_pat = regex.compile(
-            pat
-        )  # Using regex to allow multiple fgroups with same name
-        n_groups = compiled_pat.groups
-        assert n_groups == 1, (
-            "The pattern {pat} should have only one capturing group, not {n_groups}"
-        ).format(
-            pat=pat,
-            n_groups=n_groups,
-        )
+        for single_pat in pat:
+            compiled_pat = regex.compile(
+                single_pat
+            )  # Using regex to allow multiple groups with the same name
+            n_groups = compiled_pat.groups
+            assert n_groups == 1, (
+                f"The pattern {single_pat} should have exactly one capturing group, "
+                f"not {n_groups}"
+            )
         return pat
 
-    _normalize_window = validator("window", allow_reuse=True)(normalize_window)
-    if pydantic.VERSION < "2":
-        model_dump = BaseModel.dict
+    validate_window = root_validator(pre=True, allow_reuse=True)(validate_window)
 
 
-class AssignModel(Validated):
-    @classmethod
-    def item_to_list(cls, v, config=None):
-        if not isinstance(v, list):
-            v = [v]
-        return [cast(SingleAssignModel, x) for x in v]
+class AssignModel(AsList[SingleAssignModel]):
+    """
+    A list of `SingleAssignModel` objects that should have at most
+    one element with `replace_entity=True`. If a single config is passed,
+    it will be automatically converted to a list of a single element.
+    """
 
     @classmethod
     def name_uniqueness(cls, v, config=None):
@@ -172,39 +210,86 @@ def name_uniqueness(cls, v, config=None):
 
     @classmethod
     def replace_uniqueness(cls, v, config=None):
         replace = [item for item in v if item.replace_entity]
-        assert (
-            len(replace) <= 1
-        ), "Only 1 assign element can be set with `replace_entity=True`"
+        assert len(replace) <= 1, (
+            "Only 1 assign element can be set with `replace_entity=True`"
+        )
         return v
 
     @classmethod
     def __get_validators__(cls):
-        yield cls.item_to_list
+        yield cls.validate
         yield cls.name_uniqueness
         yield cls.replace_uniqueness
 
-    if pydantic.VERSION < "2":
-        model_dump = BaseModel.dict
 
+if TYPE_CHECKING:
+    ExcludeModel = List[SingleExcludeModel]  # noqa: F811
+    IncludeModel = List[SingleIncludeModel]  # noqa: F811
+    AssignModel = List[SingleAssignModel]  # noqa: F811
 
-class SingleConfig(BaseModel, extra=Extra.forbid):
-    source: str
 
+class SingleConfig(BaseModel, extra="forbid"):
+    """
+    A single configuration for the contextual matcher.
+
+    Parameters
+    ----------
+    span_getter : Optional[SpanGetterArg]
+        A span getter to pick the assigned spans from already extracted entities
+        in the doc.
+    regex : ListOrStr
+        A single Regex or a list of Regexes
+    regex_attr : Optional[str]
+        An attribute to overwrite the given `attr` when matching with Regexes.
+    regex_flags : re.RegexFlag
+        Regex flags
+    terms : ListOrStr
+        A single term or a list of terms (for exact matches)
+    exclude : AsList[SingleExcludeModel]
+        ???
subdoc "One or more exclusion patterns" + + ::: edsnlp.pipes.core.contextual_matcher.models.SingleExcludeModel + options: + only_parameters: "no-header" + include : AsList[SingleIncludeModel] + ??? subdoc "One or more inclusion patterns" + + ::: edsnlp.pipes.core.contextual_matcher.models.SingleIncludeModel + options: + only_parameters: "no-header" + assign : AsList[SingleAssignModel] + ??? subdoc "One or more assignment patterns" + + ::: edsnlp.pipes.core.contextual_matcher.models.SingleAssignModel + options: + only_parameters: "no-header" + source : str + A label describing the pattern + + """ + + source: Optional[str] = None + + span_getter: Optional[SpanGetterArg] = None terms: ListOrStr = [] regex: ListOrStr = [] regex_attr: Optional[str] = None regex_flags: Union[re.RegexFlag, int] = None - exclude: Optional[ExcludeModel] = [] - assign: Optional[AssignModel] = [] - if pydantic.VERSION < "2": - model_dump = BaseModel.dict + exclude: ExcludeModel = [] + include: IncludeModel = [] + assign: AssignModel = [] -class FullConfig(Validated): - @classmethod - def pattern_to_list(cls, v, config=None): - if not isinstance(v, list): - v = [v] - return [cast(SingleConfig, item) for item in v] + regex_matcher: Optional[Any] = None + phrase_matcher: Optional[Any] = None + + +class FullConfig(AsList[SingleConfig]): + """ + A list of `SingleConfig` objects that should have distinct `source` fields. + If a single config is passed, it will be automatically converted to a list of + a single element. + """ @classmethod def source_uniqueness(cls, v, config=None): @@ -214,8 +299,9 @@ def source_uniqueness(cls, v, config=None): @classmethod def __get_validators__(cls): - yield cls.pattern_to_list + yield cls.validate yield cls.source_uniqueness - if pydantic.VERSION < "2": - model_dump = BaseModel.dict + +if TYPE_CHECKING: + FullConfig = List[SingleConfig] # noqa: F811 diff --git a/edsnlp/pipes/core/endlines/endlines.py b/edsnlp/pipes/core/endlines/endlines.py index b2ad186ac..2d91c1155 100644 --- a/edsnlp/pipes/core/endlines/endlines.py +++ b/edsnlp/pipes/core/endlines/endlines.py @@ -143,6 +143,10 @@ def __init__( self._read_model(model_path) + def set_extensions(self): + if not Token.has_extension("excluded"): + Token.set_extension("excluded", default=False) + def _read_model(self, end_lines_model: Optional[Union[str, EndLinesModel]]): """ Parameters diff --git a/edsnlp/pipes/core/matcher/matcher.py b/edsnlp/pipes/core/matcher/matcher.py index e824e9f9d..fe9874b31 100644 --- a/edsnlp/pipes/core/matcher/matcher.py +++ b/edsnlp/pipes/core/matcher/matcher.py @@ -9,6 +9,7 @@ from edsnlp.matchers.simstring import SimstringMatcher from edsnlp.matchers.utils import Patterns from edsnlp.pipes.base import BaseNERComponent, SpanSetterArg +from edsnlp.utils.span_getters import SpanGetterArg, get_spans class GenericMatcher(BaseNERComponent): @@ -102,6 +103,7 @@ def __init__( term_matcher: Literal["exact", "simstring"] = "exact", term_matcher_config: Dict[str, Any] = {}, span_setter: SpanSetterArg = {"ents": True}, + context_getter: Optional[SpanGetterArg] = None, ): super().__init__(nlp=nlp, name=name, span_setter=span_setter) @@ -114,6 +116,7 @@ def __init__( regex = regex or {} self.attr = attr + self.context_getter = context_getter if term_matcher == "exact": self.phrase_matcher = EDSPhraseMatcher( @@ -163,10 +166,16 @@ def process(self, doc: Doc) -> List[Span]: List of Spans returned by the matchers. 
""" - matches = self.phrase_matcher(doc, as_spans=True) - regex_matches = self.regex_matcher(doc, as_spans=True) - - spans = list(matches) + list(regex_matches) + contexts = ( + list(get_spans(doc, self.context_getter)) + if self.context_getter is not None + else [doc] + ) + spans: List[Span] = [] + for context in contexts: + matches = self.phrase_matcher(context, as_spans=True) + regex_matches = self.regex_matcher(context, as_spans=True) + spans.extend(list(matches) + list(regex_matches)) return spans diff --git a/edsnlp/pipes/misc/consultation_dates/consultation_dates.py b/edsnlp/pipes/misc/consultation_dates/consultation_dates.py index 2f62fd2a7..7f0cf8fe2 100644 --- a/edsnlp/pipes/misc/consultation_dates/consultation_dates.py +++ b/edsnlp/pipes/misc/consultation_dates/consultation_dates.py @@ -52,7 +52,7 @@ class ConsultationDatesMatcher(GenericMatcher): # Out: [Consultation du 03/10/2018] doc.spans["consultation_dates"][0]._.consultation_date.to_datetime() - # Out: DateTime(2018, 10, 3, 0, 0, 0) + # Out: 2018-10-03 00:00:00 ``` Extensions diff --git a/edsnlp/pipes/misc/dates/dates.py b/edsnlp/pipes/misc/dates/dates.py index 3b00719a8..bc9b5b764 100644 --- a/edsnlp/pipes/misc/dates/dates.py +++ b/edsnlp/pipes/misc/dates/dates.py @@ -41,7 +41,7 @@ class DatesMatcher(BaseNERComponent): | `relative` | `hier`, `la semaine dernière` | | `duration` | `pendant quatre jours` | - See the [tutorial](../../tutorials/detecting-dates.md) for a presentation of a + See the [tutorial](/tutorials/detecting-dates.md) for a presentation of a full pipeline featuring the `eds.dates` component. ## Usage @@ -67,7 +67,7 @@ class DatesMatcher(BaseNERComponent): # Out: [23 août 2021, il y a un an, mai 1995] dates[0]._.date.to_datetime() - # Out: 2021-08-23T00:00:00+02:00 + # Out: 2021-08-23 00:00:00 dates[1]._.date.to_datetime() # Out: None @@ -76,7 +76,7 @@ class DatesMatcher(BaseNERComponent): doc._.note_datetime = note_datetime dates[1]._.date.to_datetime() - # Out: 2020-08-27T00:00:00+02:00 + # Out: 2020-08-27 00:00:00+00:09 date_2_output = dates[2]._.date.to_datetime( note_datetime=note_datetime, @@ -85,7 +85,7 @@ class DatesMatcher(BaseNERComponent): default_day=15, ) date_2_output - # Out: 1995-05-15T00:00:00+02:00 + # Out: 1995-05-15 00:00:00+02:00 doc.spans["durations"] # Out: [pendant une semaine] @@ -260,7 +260,7 @@ def __init__( if on_ents_only: assert span_getter is None, ( - "Cannot use both `on_ents_only` and " "`span_getter`" + "Cannot use both `on_ents_only` and `span_getter`" ) def span_getter(doc): diff --git a/edsnlp/pipes/misc/dates/models.py b/edsnlp/pipes/misc/dates/models.py index 506af3f95..0c9470776 100644 --- a/edsnlp/pipes/misc/dates/models.py +++ b/edsnlp/pipes/misc/dates/models.py @@ -40,14 +40,11 @@ class Mode(str, Enum): DURATION = "duration" -class Period(BaseModel): +class Period(BaseModel, arbitrary_types_allowed=True): FROM: Optional[Span] = None UNTIL: Optional[Span] = None DURATION: Optional[Span] = None - class Config: - arbitrary_types_allowed = True - class BaseDate(BaseModel): mode: Mode = None diff --git a/edsnlp/pipes/misc/split/split.py b/edsnlp/pipes/misc/split/split.py index 17db63384..d26b11568 100644 --- a/edsnlp/pipes/misc/split/split.py +++ b/edsnlp/pipes/misc/split/split.py @@ -128,8 +128,7 @@ def __init__( stream = stream.map(eds.split(max_length=5, regex="\\n{2,}")) print(" | ".join(doc.text.strip() for doc in stream)) - # Out: - # Sentence 1 | This is another longer sentence | more than 5 words + # Out: Sentence 1 | This is another longer sentence | more 
than 5 words ``` Parameters diff --git a/edsnlp/pipes/misc/tables/tables.py b/edsnlp/pipes/misc/tables/tables.py index ede5a91b4..9d0dcc9ac 100644 --- a/edsnlp/pipes/misc/tables/tables.py +++ b/edsnlp/pipes/misc/tables/tables.py @@ -78,7 +78,7 @@ class TablesMatcher(BaseComponent): index=False, # set True to use the first column as index ) type(df) - # Out: pandas.core.frame.DataFrame + # Out: ``` The pandas DataFrame: diff --git a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py index ca543fa6e..a22e1c993 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/alcohol.py @@ -1,10 +1,11 @@ """`eds.alcohol` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.qualifiers.negation import NegationQualifier from ...disorders.base import DisorderMatcher @@ -59,7 +60,7 @@ class AlcoholMatcher(DisorderMatcher): ), ), ) - nlp.add_pipe(f"eds.alcohol") + nlp.add_pipe(eds.alcohol()) ``` Below are a few examples: @@ -72,7 +73,7 @@ class AlcoholMatcher(DisorderMatcher): The pipeline object name : Optional[str] The name of the component - patterns : Union[Dict[str, Any], List[Dict[str, Any]]] + patterns : FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -91,7 +92,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "alcohol", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label="alcohol", span_setter={"ents": True, "alcohol": True}, ): diff --git a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py index 38c795926..0d041722f 100644 --- a/edsnlp/pipes/ner/behaviors/alcohol/patterns.py +++ b/edsnlp/pipes/ner/behaviors/alcohol/patterns.py @@ -1,4 +1,4 @@ -default_patterns = dict( +default_pattern = dict( source="alcohol", regex=[ r"\balco[ol]", @@ -40,3 +40,4 @@ ), ], ) +default_patterns = [default_pattern] diff --git a/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py b/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py index 36c0701cc..3010478c0 100644 --- a/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py +++ b/edsnlp/pipes/ner/behaviors/tobacco/tobacco.py @@ -1,18 +1,20 @@ """`eds.tobacco` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Iterable, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig +from edsnlp.pipes.qualifiers.negation import NegationQualifier from edsnlp.utils.numbers import parse_digit -from ..alcohol.alcohol import AlcoholMatcher +from ...disorders.base import DisorderMatcher from .patterns import default_patterns -class TobaccoMatcher(AlcoholMatcher): +class TobaccoMatcher(DisorderMatcher): """ The `eds.tobacco` pipeline component extracts mentions of tobacco consumption. 
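Throughout these pipes, `patterns` is now validated as a `FullConfig`, i.e. an `AsList[SingleConfig]`: a single pattern dictionary is promoted to a one-element list, which is why `default_patterns` in `patterns.py` above becomes `[default_pattern]`. Below is a minimal sketch of that promotion, loosely adapted from the contextual-matcher tests later in this diff; the sample text and the illustrative `asa`-style pattern are assumptions for the example, not part of the change.

```python
import edsnlp
import edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.sentences())

# A single pattern dict (not wrapped in a list) is accepted: the FullConfig
# validator promotes it to a one-element list of SingleConfig objects.
nlp.add_pipe(
    eds.contextual_matcher(
        label="asa",
        patterns=dict(
            source="asa",
            regex=r"\basa\b ?:? ?([1-5])",  # illustrative pattern, not from the PR
            regex_attr="NORM",
            # nested exclude/include/assign dicts are promoted to lists as well
            exclude=dict(regex=r"ttt", window=-5),
        ),
    )
)

doc = nlp("Patient vu en consultation. Score ASA : 3.")
print([(ent.text, ent.label_) for ent in doc.ents])
```

Passing a list of such dictionaries behaves identically, and the same `AsList` promotion applies to the nested `exclude`, `include` and `assign` keys.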
@@ -80,7 +82,7 @@ class TobaccoMatcher(AlcoholMatcher): The pipeline object name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -99,7 +101,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "tobacco", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "tobacco", span_setter: SpanSetterArg = {"ents": True, "tobacco": True}, ): @@ -108,21 +110,34 @@ def __init__( name=name, patterns=patterns, label=label, + detailed_status_mapping={ + 1: None, + 2: "ABSTINENCE", + }, span_setter=span_setter, + include_assigned=True, ) + self.negation = NegationQualifier(nlp) - def process(self, doc: Doc) -> List[Span]: + def process(self, doc: Doc) -> Iterable[Span]: for span in super().process(doc): - if "secondhand" in span._.assigned.keys(): + if "stopped" in span._.assigned: + # using nlp(text) so that we don't assign negation flags on + # the original document + stopped = self.negation.process(span) + if not any(stopped_token.negation for stopped_token in stopped.tokens): + span._.status = 2 + if "zero_after" in span._.assigned: span._.negation = True - - elif "PA" in span._.assigned.keys(): + if "secondhand" in span._.assigned: + span._.negation = True + if "PA" in span._.assigned and ("stopped" not in span._.assigned): pa = parse_digit( span._.assigned["PA"], atttr="NORM", ignore_excluded=True, ) - if (pa == 0) and ("stopped" not in span._.assigned.keys()): + if pa == 0: span._.negation = True yield span diff --git a/edsnlp/pipes/ner/disorders/aids/aids.py b/edsnlp/pipes/ner/disorders/aids/aids.py index f12e7a911..2e5e24d2b 100644 --- a/edsnlp/pipes/ner/disorders/aids/aids.py +++ b/edsnlp/pipes/ner/disorders/aids/aids.py @@ -1,10 +1,11 @@ """`eds.aids` pipeline""" import itertools -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from edsnlp.pipes.qualifiers.hypothesis import HypothesisQualifier from edsnlp.pipes.qualifiers.hypothesis.factory import ( @@ -82,7 +83,7 @@ class AIDSMatcher(DisorderMatcher): The pipeline object name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -101,7 +102,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "aids", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "aids", span_setter: SpanSetterArg = {"ents": True, "aids": True}, ): diff --git a/edsnlp/pipes/ner/disorders/base.py b/edsnlp/pipes/ner/disorders/base.py index 06d25358d..dcceec302 100644 --- a/edsnlp/pipes/ner/disorders/base.py +++ b/edsnlp/pipes/ner/disorders/base.py @@ -1,11 +1,12 @@ import re -from typing import Any, Dict, List, Union +from typing import Dict, Union from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg from edsnlp.pipes.core.contextual_matcher import ContextualMatcher +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from 
edsnlp.utils.deprecation import deprecated_getter_factory from edsnlp.utils.filter import filter_spans @@ -20,7 +21,7 @@ class DisorderMatcher(ContextualMatcher): spaCy `Language` object. name : str The name of the pipe - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The configuration dictionary include_assigned : bool Whether to include (eventual) assign matches to the final entity @@ -44,7 +45,7 @@ def __init__( name: str, *, label: str, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]], + patterns: FullConfig, include_assigned: bool = True, ignore_excluded: bool = True, ignore_space_tokens: bool = True, diff --git a/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py b/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py index 6d562b8e0..2c60db414 100644 --- a/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py +++ b/edsnlp/pipes/ner/disorders/cerebrovascular_accident/cerebrovascular_accident.py @@ -1,11 +1,12 @@ """`eds.cerebrovascular_accident` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -69,7 +70,7 @@ class CerebrovascularAccidentMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -88,7 +89,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "cerebrovascular_accident", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "cerebrovascular_accident", span_setter: SpanSetterArg = {"ents": True, "cerebrovascular_accident": True}, ): diff --git a/edsnlp/pipes/ner/disorders/ckd/ckd.py b/edsnlp/pipes/ner/disorders/ckd/ckd.py index e4a4ca20d..3a0dc7f35 100644 --- a/edsnlp/pipes/ner/disorders/ckd/ckd.py +++ b/edsnlp/pipes/ner/disorders/ckd/ckd.py @@ -1,12 +1,13 @@ """`eds.ckd` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from loguru import logger from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -82,7 +83,7 @@ class CKDMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -101,7 +102,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "ckd", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "ckd", span_setter: SpanSetterArg = {"ents": True, "ckd": True}, ): diff --git a/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py b/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py index 2ad275336..42a7d30bb 100644 
--- a/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py +++ b/edsnlp/pipes/ner/disorders/congestive_heart_failure/congestive_heart_failure.py @@ -1,9 +1,10 @@ """`eds.congestive_heart_failure` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class CongestiveHeartFailureMatcher(DisorderMatcher): The pipeline object name : str, The name of the component - patterns : Optional[Dict[str, Any]] + patterns : FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "congestive_heart_failure", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "congestive_heart_failure", span_setter: SpanSetterArg = {"ents": True, "congestive_heart_failure": True}, ): diff --git a/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py b/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py index 1819c4a13..18547c891 100644 --- a/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py +++ b/edsnlp/pipes/ner/disorders/connective_tissue_disease/connective_tissue_disease.py @@ -1,11 +1,12 @@ """`eds.connective_tissue_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -66,7 +67,7 @@ class ConnectiveTissueDiseaseMatcher(DisorderMatcher): The pipeline object name : str The name of the component - patterns : Optional[Dict[str, Any]] + patterns : FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -85,7 +86,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "connective_tissue_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "connective_tissue_disease", span_setter: SpanSetterArg = {"ents": True, "connective_tissue_disease": True}, ): diff --git a/edsnlp/pipes/ner/disorders/copd/copd.py b/edsnlp/pipes/ner/disorders/copd/copd.py index 9ebc6fd00..f6a1e3a55 100644 --- a/edsnlp/pipes/ner/disorders/copd/copd.py +++ b/edsnlp/pipes/ner/disorders/copd/copd.py @@ -1,11 +1,12 @@ """`eds.copd` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from spacy.tokens import Doc from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -69,7 +70,7 @@ class COPDMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -88,7 +89,7 @@ 
def __init__( nlp: Optional[PipelineProtocol], name: str = "copd", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "copd", span_setter: SpanSetterArg = {"ents": True, "copd": True}, ): diff --git a/edsnlp/pipes/ner/disorders/dementia/dementia.py b/edsnlp/pipes/ner/disorders/dementia/dementia.py index a33c4824e..7de643880 100644 --- a/edsnlp/pipes/ner/disorders/dementia/dementia.py +++ b/edsnlp/pipes/ner/disorders/dementia/dementia.py @@ -1,9 +1,10 @@ """`eds.dementia` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class DementiaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "dementia", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "dementia", span_setter: SpanSetterArg = {"ents": True, "dementia": True}, ): diff --git a/edsnlp/pipes/ner/disorders/diabetes/diabetes.py b/edsnlp/pipes/ner/disorders/diabetes/diabetes.py index 1eef8a885..c32f2cb00 100644 --- a/edsnlp/pipes/ner/disorders/diabetes/diabetes.py +++ b/edsnlp/pipes/ner/disorders/diabetes/diabetes.py @@ -1,6 +1,6 @@ """`eds.diabetes` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span @@ -8,7 +8,7 @@ from edsnlp.matchers.regex import RegexMatcher from edsnlp.matchers.utils import get_text from edsnlp.pipes.base import SpanSetterArg -from edsnlp.pipes.core.contextual_matcher.contextual_matcher import get_window +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import COMPLICATIONS, default_patterns @@ -75,7 +75,7 @@ class DiabetesMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -94,7 +94,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "diabetes", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "diabetes", span_setter: SpanSetterArg = {"ents": True, "diabetes": True}, ): @@ -141,8 +141,7 @@ def has_far_complications(self, span: Span): Handles the common case where complications are listed as bullet points, sometimes fairly far from the anchor. 
""" - window = (0, 50) - context = get_window(span, window, limit_to_sentence=False) + context = span.doc[span.start : span.end + 50] if next(iter(self.complication_matcher(context)), None) is not None: return True return False diff --git a/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py b/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py index 7baa64f07..c391223f9 100644 --- a/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py +++ b/edsnlp/pipes/ner/disorders/hemiplegia/hemiplegia.py @@ -1,9 +1,10 @@ """`eds.hemiplegia` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class HemiplegiaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "hemiplegia", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "hemiplegia", span_setter: SpanSetterArg = {"ents": True, "hemiplegia": True}, ): diff --git a/edsnlp/pipes/ner/disorders/leukemia/leukemia.py b/edsnlp/pipes/ner/disorders/leukemia/leukemia.py index 7da1533cc..cc723eb56 100644 --- a/edsnlp/pipes/ner/disorders/leukemia/leukemia.py +++ b/edsnlp/pipes/ner/disorders/leukemia/leukemia.py @@ -1,9 +1,10 @@ """`eds.leukemia` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -63,7 +64,7 @@ class LeukemiaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -82,7 +83,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "leukemia", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "leukemia", span_setter: SpanSetterArg = {"ents": True, "leukemia": True}, ): diff --git a/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py b/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py index 1c5f1b76a..9e567c70d 100644 --- a/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py +++ b/edsnlp/pipes/ner/disorders/liver_disease/liver_disease.py @@ -1,11 +1,12 @@ """`eds.liver_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from ..base import DisorderMatcher from .patterns import default_patterns @@ -67,7 +68,7 @@ class LiverDiseaseMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + 
patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -86,7 +87,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "liver_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "liver_disease", span_setter: SpanSetterArg = {"ents": True, "liver_disease": True}, ): diff --git a/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py b/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py index b4e130dda..fc491621a 100644 --- a/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py +++ b/edsnlp/pipes/ner/disorders/lymphoma/lymphoma.py @@ -1,9 +1,10 @@ """`eds.lymphoma` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import Optional from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -67,7 +68,7 @@ class LymphomaMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -86,7 +87,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "lymphoma", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "lymphoma", span_setter: SpanSetterArg = {"ents": True, "lymphoma": True}, ): diff --git a/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py b/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py index 51f0f41db..3aed7d3b5 100644 --- a/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py +++ b/edsnlp/pipes/ner/disorders/myocardial_infarction/myocardial_infarction.py @@ -1,11 +1,12 @@ """`eds.myocardial_infarction` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -71,7 +72,7 @@ class MyocardialInfarctionMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -90,7 +91,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "myocardial_infarction", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "myocardial_infarction", span_setter: SpanSetterArg = {"ents": True, "myocardial_infarction": True}, ): diff --git a/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py b/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py index 3bd55440f..09076e14a 100644 --- a/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py +++ b/edsnlp/pipes/ner/disorders/peptic_ulcer_disease/peptic_ulcer_disease.py @@ -1,11 +1,12 @@ """`eds.peptic_ulcer_disease` pipeline""" -from typing import Any, 
Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -66,7 +67,7 @@ class PepticUlcerDiseaseMatcher(DisorderMatcher): The pipeline object name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -85,7 +86,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "peptic_ulcer_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "peptic_ulcer_disease", span_setter: SpanSetterArg = {"ents": True, "peptic_ulcer_disease": True}, ): diff --git a/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py b/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py index 99c0e9cbd..3eff5c6bc 100644 --- a/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py +++ b/edsnlp/pipes/ner/disorders/peripheral_vascular_disease/peripheral_vascular_disease.py @@ -1,11 +1,12 @@ """`eds.peripheral_vascular_disease` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.pipes.ner.disorders.base import DisorderMatcher from .patterns import default_patterns @@ -66,7 +67,7 @@ class PeripheralVascularDiseaseMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -86,7 +87,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = "peripheral_vascular_disease", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, label: str = "peripheral_vascular_disease", span_setter: SpanSetterArg = { "ents": True, diff --git a/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py b/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py index a577fd2c2..a0db70d42 100644 --- a/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py +++ b/edsnlp/pipes/ner/disorders/solid_tumor/solid_tumor.py @@ -1,11 +1,12 @@ """`eds.solid_tumor` pipeline""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional from spacy.tokens import Doc, Span from edsnlp.core import PipelineProtocol from edsnlp.pipes.base import SpanSetterArg +from edsnlp.pipes.core.contextual_matcher.models import FullConfig from edsnlp.utils.numbers import parse_digit from ..base import DisorderMatcher @@ -71,7 +72,7 @@ class SolidTumorMatcher(DisorderMatcher): The pipeline name : Optional[str] The name of the component - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] + patterns: FullConfig The patterns to use for matching label : str The label to use for the `Span` object and the extension @@ -95,7 +96,7 @@ def __init__( nlp: Optional[PipelineProtocol], name: str = 
"solid_tumor", *, - patterns: Union[Dict[str, Any], List[Dict[str, Any]]] = default_patterns, + patterns: FullConfig = default_patterns, use_tnm: bool = False, use_patterns_metastasis_ct_scan: bool = False, label: str = "solid_tumor", diff --git a/edsnlp/pipes/ner/scores/base_score.py b/edsnlp/pipes/ner/scores/base_score.py index 6ddb63f0a..494985c9f 100644 --- a/edsnlp/pipes/ner/scores/base_score.py +++ b/edsnlp/pipes/ner/scores/base_score.py @@ -7,6 +7,7 @@ from edsnlp.core import PipelineProtocol, registry from edsnlp.pipes.base import SpanSetterArg from edsnlp.pipes.core.contextual_matcher import ContextualMatcher +from edsnlp.utils.typing import AsList class SimpleScoreMatcher(ContextualMatcher): @@ -55,7 +56,7 @@ def __init__( *, regex: List[str] = None, attr: str = "NORM", - value_extract: Union[str, Dict[str, str], List[Dict[str, str]]] = None, + value_extract: Union[AsList[Dict[str, str]], str] = None, score_normalization: Union[str, Callable[[Union[str, None]], Any]] = None, window: int = 7, ignore_excluded: bool = False, @@ -79,14 +80,13 @@ def __init__( span_setter = {"ents": True, label: True} if isinstance(value_extract, str): - value_extract = dict( - name="value", - regex=value_extract, - window=window, - ) - - if isinstance(value_extract, dict): - value_extract = [value_extract] + value_extract = [ + dict( + name="value", + regex=value_extract, + window=window, + ) + ] value_exists = False for i, extract in enumerate(value_extract): diff --git a/edsnlp/pipes/qualifiers/family/family.py b/edsnlp/pipes/qualifiers/family/family.py index e979e71de..6f2150585 100644 --- a/edsnlp/pipes/qualifiers/family/family.py +++ b/edsnlp/pipes/qualifiers/family/family.py @@ -133,6 +133,8 @@ class FamilyContextQualifier(RuleBasedQualifier): The `eds.family` component was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/pipes/qualifiers/history/history.py b/edsnlp/pipes/qualifiers/history/history.py index 2dc8d56d8..8b753a2e9 100644 --- a/edsnlp/pipes/qualifiers/history/history.py +++ b/edsnlp/pipes/qualifiers/history/history.py @@ -83,9 +83,19 @@ class HistoryQualifier(RuleBasedQualifier): !!! info "Dates" - To take the most of the `eds.dates` component, you may add the ``note_datetime`` - context (cf. [Adding context][using-eds-nlps-helper-functions]). It allows the - component to compute the duration of absolute dates + To take the most of the `eds.dates` component, you may set a value for + `doc._.note_datetime`, either directly: + + ```python { .no-check } + doc = nlp.make_doc(text) + doc._.note_datetime = datetime.datetime(2022, 8, 28) + nlp(doc) + ``` + + or using a converter such as the + [`omop` converter][edsnlp.data.converters.OmopDict2DocConverter] + + It allows the component to compute the duration of absolute dates (e.g., le 28 août 2022/August 28, 2022). The ``birth_datetime`` context allows the component to exclude the birthdate from the extracted dates. @@ -197,6 +207,8 @@ class HistoryQualifier(RuleBasedQualifier): The `eds.history` component was developed by AP-HP's Data Science team. 
""" + default_patterns = patterns + history_limit: timedelta def __init__( diff --git a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py index 924d2cf63..156016a01 100644 --- a/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py +++ b/edsnlp/pipes/qualifiers/hypothesis/hypothesis.py @@ -168,6 +168,8 @@ class HypothesisQualifier(RuleBasedQualifier): The `eds.hypothesis` pipeline was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/pipes/qualifiers/negation/negation.py b/edsnlp/pipes/qualifiers/negation/negation.py index fb2c7878f..98c0a78a2 100644 --- a/edsnlp/pipes/qualifiers/negation/negation.py +++ b/edsnlp/pipes/qualifiers/negation/negation.py @@ -170,6 +170,8 @@ class NegationQualifier(RuleBasedQualifier): The `eds.negation` component was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py index 77b0cbe91..d87352e20 100644 --- a/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py +++ b/edsnlp/pipes/qualifiers/reported_speech/reported_speech.py @@ -134,6 +134,8 @@ class ReportedSpeechQualifier(RuleBasedQualifier): The `eds.reported_speech` component was developed by AP-HP's Data Science team. """ + default_patterns = patterns + def __init__( self, nlp: PipelineProtocol, diff --git a/edsnlp/tune.py b/edsnlp/tune.py index 1dd6188e3..d982ed3a3 100644 --- a/edsnlp/tune.py +++ b/edsnlp/tune.py @@ -5,6 +5,7 @@ import os import random import sys +import warnings from typing import Dict, List, Optional, Tuple, Union import joblib @@ -34,7 +35,7 @@ CHECKPOINT = "study.pkl" -class HyperparameterConfig(BaseModel): +class HyperparameterConfig(BaseModel, extra="forbid"): """ A configuration model for hyperparameters used in optimization or tuning processes. 
""" @@ -47,9 +48,6 @@ class HyperparameterConfig(BaseModel): log: Optional[bool] = None choices: Optional[List[Union[str, float, int, bool]]] = None - class Config: - extra = "forbid" - if pydantic.VERSION < "2": model_dump = BaseModel.dict @@ -162,11 +160,16 @@ def compute_importances(study, n=10): cumulative_importances = collections.defaultdict(float) for i in range(n): - importance_scores = get_param_importances( - study, - evaluator=FanovaImportanceEvaluator(seed=i), - target=lambda t: t.value, - ) + try: + importance_scores = get_param_importances( + study, + evaluator=FanovaImportanceEvaluator(seed=i), + target=lambda t: t.value, + ) + except RuntimeError as e: + if "zero total variance" in str(e): # pragma: no cover + warnings.warn("Zero total variance : skipping importance computation.") + continue for feature, importance in importance_scores.items(): cumulative_importances[feature] += importance @@ -360,7 +363,7 @@ def process_results( if key_phase_1 not in best_params.keys(): f.write(f" {key_phase_1}: {value_phase_1}\n") f.write("\nImportances:\n") - for key, value in importances.items(): + for key, value in importances.items(): # pragma: no cover f.write(f" {key}: {value}\n") write_final_config(output_dir, config_path, tuned_parameters, best_params) diff --git a/edsnlp/utils/bindings.py b/edsnlp/utils/bindings.py index 17933cd5d..d02e769a4 100644 --- a/edsnlp/utils/bindings.py +++ b/edsnlp/utils/bindings.py @@ -53,7 +53,7 @@ def make_binding_getter(attribute: Union[str, Binding]): f"def getter(span):\n" f" try:\n" f" return {path} == value\n" - f" except AttributeError:\n" + f" except (AttributeError, KeyError):\n" f" return False\n", ctx, ctx, @@ -66,7 +66,7 @@ def make_binding_getter(attribute: Union[str, Binding]): f"def getter(span):\n" f" try:\n" f" return {path}\n" - f" except AttributeError:\n" + f" except (AttributeError, KeyError):\n" f" return None\n", ctx, ctx, diff --git a/edsnlp/utils/lazy_module.py b/edsnlp/utils/lazy_module.py index 6378760b6..ef2b98e40 100644 --- a/edsnlp/utils/lazy_module.py +++ b/edsnlp/utils/lazy_module.py @@ -67,6 +67,7 @@ def __getattr__(name): ------- """ + imported_module_name = module_globals["__name__"] if name in module_paths: module_path, module_name = module_paths[name] result = getattr( @@ -80,7 +81,7 @@ def __getattr__(name): ) module_globals[name] = result return result - raise AttributeError(f"module {__name__} has no attribute {name}") + raise AttributeError(f"module {imported_module_name} has no attribute {name}") def __dir__(): """ diff --git a/edsnlp/utils/span_getters.py b/edsnlp/utils/span_getters.py index b1b8a156b..58166a305 100644 --- a/edsnlp/utils/span_getters.py +++ b/edsnlp/utils/span_getters.py @@ -1,3 +1,4 @@ +import abc from collections import defaultdict from typing import ( TYPE_CHECKING, @@ -12,6 +13,7 @@ Union, ) +import numpy as np from pydantic import NonNegativeInt from spacy.tokens import Doc, Span @@ -35,18 +37,34 @@ ] -def get_spans(doc, span_getter): +def get_spans(doclike, span_getter): if span_getter is None: - yield doc[:] + yield doclike[:] return if callable(span_getter): - yield from span_getter(doc) + yield from span_getter(doclike) return - for key, span_filter in span_getter.items(): - if key == "*": - candidates = (span for group in doc.spans.values() for span in group) + for k, span_filter in span_getter.items(): + if isinstance(doclike, Doc): + if k == "*": + candidates = (s for grp in doclike.spans.values() for s in grp) + else: + candidates = doclike.spans.get(k, ()) if k != "ents" else 
doclike.ents
         else:
-            candidates = doc.spans.get(key, ()) if key != "ents" else doc.ents
+            doc = doclike.doc
+            if k == "*":
+                candidates = (
+                    s
+                    for grp in doc.spans.values()
+                    for s in grp
+                    if not (s.end < doclike.start or s.start > doclike.end)
+                )
+            else:
+                candidates = (
+                    s
+                    for s in (doc.spans.get(k, ()) if k != "ents" else doc.ents)
+                    if not (s.end < doclike.start or s.start > doclike.end)
+                )
         if span_filter is True:
             yield from candidates
         else:
@@ -251,8 +269,9 @@ class make_span_context_getter:
     Parameters
     ----------
     context_words : Union[NonNegativeInt, Tuple[NonNegativeInt, NonNegativeInt]]
-        Minimum number of words to include on each side of the span. It could be asymmetric.
-        For example (5,2) will include 5 words before the start of the span and 2 after the end of the span
+        Minimum number of words to include on each side of the span. It could be
+        asymmetric. For example (5,2) will include 5 words before the start of the
+        span and 2 after the end of the span
     context_sents : Optional[
         Union[NonNegativeInt, Tuple[NonNegativeInt, NonNegativeInt]]
     ] = 1
@@ -264,7 +283,7 @@ class make_span_context_getter:
 
         By default, 0 if the document has no sentence annotations, 1 otherwise.
 
-    """  # noqa: E501
+    """
 
     def __init__(
         self,
@@ -284,9 +303,9 @@ def __init__(
             )
         else:
             self.context_sents_left, self.context_sents_right = context_sents
-        assert (
-            sum(context_sents) != 1
-        ), "Asymmetric sentence context should not be (0,1) or (1,0)"
+        assert sum(context_sents) != 1, (
+            "Asymmetric sentence context should not be (0,1) or (1,0)"
+        )
         self.span_getter = validate_span_getter(span_getter, optional=True)
 
     def __call__(self, span: Union[Doc, Span]) -> Union[Span, List[Span]]:
@@ -321,3 +340,210 @@ def __call__(self, span: Union[Doc, Span]) -> Union[Span, List[Span]]:
             end = max(end, max_end_sent)
 
         return span.doc[start:end]
+
+
+class ContextWindowMeta(abc.ABCMeta):
+    pass
+
+
+class ContextWindow(Validated, abc.ABC, metaclass=ContextWindowMeta):
+    """
+    A ContextWindow specifies how much additional context (such as sentences or words)
+    should be included relative to an anchor span. For example, one might define a
+    context window that extracts the sentence immediately preceding and following the
+    anchor span, or one that extends the span by a given number of words before and
+    after.
+
+    ContextWindow objects can be combined using logical operations to create more
+    complex context windows. For example, one can create a context window that includes
+    either words from a -10 to +10 range or words from the sentence.
+
+
+    Examples
+    --------
+
+    ```python
+    from confit import validate_arguments
+    from spacy.tokens import Span
+
+    import edsnlp
+    from edsnlp.utils.span_getters import ContextWindow
+
+
+    @validate_arguments
+    def apply_context(span: Span, ctx: ContextWindow):
+        # ctx will be parsed and cast as a ContextWindow
+        return ctx(span)
+
+
+    nlp = edsnlp.blank("eds")
+    nlp.add_pipe("eds.sentences")
+
+    doc = nlp("A first sentence. A second sentence, longer this time. A third.")
+    span = doc[5:6]  # "second"
+
+    # Will return a span with the 3 words before and after the span
+    # and words of the current sentence and the next sentence.
+    apply_context(span, "words[-3:3] | sents[0:1]").text
+    # Out: "sentence. A second sentence, longer this time. A third."
+
+    # Will return the span covering at most the -4 and +4 words
+    # around the span and the current sentence of the span.
+    apply_context(span, "words[-4:4] & sent").text
+    # Out: "A second sentence, longer this"
+    ```
+
+    !!! 
warning "Indexing" + + Unlike standard Python sequence slicing, `sents[0:0]` returns + the current sentence, not an empty span. + """ + + @abc.abstractmethod + def __call__(self, span: Span) -> Span: + pass + + # logical ops + def __and__(self, other: "ContextWindow"): + # fmt: off + return IntersectionContextWindow([ + *(self.contexts if isinstance(self, IntersectionContextWindow) else (self,)), # noqa: E501 + *(other.contexts if isinstance(other, IntersectionContextWindow) else (other,)) # noqa: E501 + ]) + # fmt: on + + def __or__(self, other: "ContextWindow"): + # fmt: off + return UnionContextWindow([ + *(self.contexts if isinstance(self, UnionContextWindow) else (self,)), + *(other.contexts if isinstance(other, UnionContextWindow) else (other,)) + ]) + # fmt: on + + @classmethod + def parse(cls, query): + try: + return eval( + query, + {}, + { + "words": WordContextWindow, + "sents": SentenceContextWindow, + "sent": SentenceContextWindow(0, 0), + }, + ) + except NameError: + raise ValueError( + "Only queries containing vars `words[before:after]`, " + "`sents[before:after]` and `sent` are allowed to " + f"define a context getter, got {query!r}" + ) + + @classmethod + def validate(cls, obj, config=None): + if isinstance(obj, cls): + return obj + if isinstance(obj, str): + return cls.parse(obj) + if isinstance(obj, tuple): + assert len(obj) == 2 + return WordContextWindow(*obj) + if isinstance(obj, int): + assert obj != 0, "The provided `window` should not be 0" + return WordContextWindow(obj, 0) if obj < 0 else WordContextWindow(0, obj) + raise ValueError(f"Invalid context: {obj}") + + +class LeafContextWindowMeta(ContextWindowMeta): + def __getitem__(cls, item) -> Span: + assert isinstance(item, slice) + before = item.start + after = item.stop + return cls(before, after) + + +class LeafContextWindow(ContextWindow, metaclass=LeafContextWindowMeta): + pass + + +class WordContextWindow(LeafContextWindow): + def __init__( + self, + before: Optional[int] = None, + after: Optional[int] = None, + ): + self.before = before + self.after = after + + def __call__(self, span): + start = span.start + self.before if self.before is not None else 0 + end = span.end + self.after if self.after is not None else len(span.doc) + return span.doc[max(0, start) : min(len(span.doc), end)] + + def __repr__(self): + return "words[{}:{}]".format(self.before, self.after) + + +class SentenceContextWindow(LeafContextWindow): + def __init__( + self, + before: Optional[int] = None, + after: Optional[int] = None, + ): + self.before = before + self.after = after + + def __call__(self, span): + sent_starts = span.doc.to_array("SENT_START") == 1 + sent_indices = sent_starts.cumsum() + sent_indices = sent_indices - sent_indices[span.start] + + start_idx = end_idx = None + if self.before is not None: + start = sent_starts & (sent_indices == self.before) + x = np.flatnonzero(start) + start_idx = x[-1] if len(x) else 0 + + if self.after is not None: + end = sent_starts & (sent_indices == self.after + 1) + x = np.flatnonzero(end) + end_idx = x[0] - 1 if len(x) else len(span.doc) + + return span.doc[start_idx:end_idx] + + def __repr__(self): + return "sents[{}:{}]".format(self.before, self.after) + + +class UnionContextWindow(ContextWindow): + def __init__( + self, + contexts: AsList[ContextWindow], + ): + self.contexts = contexts + + def __call__(self, span): + results = [context(span) for context in self.contexts] + min_word = min([span.start for span in results]) + max_word = max([span.end for span in results]) + return 
span.doc[min_word:max_word] + + def __repr__(self): + return " | ".join(repr(context) for context in self.contexts) + + +class IntersectionContextWindow(ContextWindow): + def __init__( + self, + contexts: AsList[ContextWindow], + ): + self.contexts = contexts + + def __call__(self, span): + results = [context(span) for context in self.contexts] + min_word = max([span.start for span in results]) + max_word = min([span.end for span in results]) + return span.doc[min_word:max_word] + + def __repr__(self): + return " & ".join(repr(context) for context in self.contexts) diff --git a/edsnlp/utils/typing.py b/edsnlp/utils/typing.py index 5dd675c21..d7f0eb9c7 100644 --- a/edsnlp/utils/typing.py +++ b/edsnlp/utils/typing.py @@ -29,7 +29,8 @@ def __get_pydantic_core_schema__(cls, source, handler): class MetaAsList(type): def __init__(cls, name, bases, dct): super().__init__(name, bases, dct) - cls.type_ = Any + type_ = next((base.type_ for base in bases if hasattr(base, "type_")), Any) + cls.type_ = type_ @functools.lru_cache(maxsize=None) def __getitem__(self, item): @@ -63,12 +64,9 @@ class AsList(Generic[T], metaclass=MetaAsList): if pydantic.VERSION < "2": def cast(type_, obj): - class Model(pydantic.BaseModel): + class Model(pydantic.BaseModel, arbitrary_types_allowed=True): __root__: type_ - class Config: - arbitrary_types_allowed = True - return Model(__root__=obj).__root__ else: from dataclasses import is_dataclass diff --git a/pyproject.toml b/pyproject.toml index 16a2bfcc8..e19115af4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,9 @@ dependencies = [ # thinc doesn't provide binaries for python<3.9 from 8.2.5 so we need to cap it ourself "thinc<8.2.5; python_version<'3.9'", "thinc>=8.2.5; python_version>='3.9'", + # blis>1.2.0 (dependency of thinc) doesn't provide binaries for python<3.10 so we need to cap it ourself + "blis<1.0.0; python_version<'3.9'", + "blis<1.2.1; python_version>='3.9' and python_version<'3.10'", "confit>=0.7.3", "tqdm", "umls-downloader>=0.1.1", @@ -54,6 +57,9 @@ dev-no-ml = [ "scikit-learn", + # Packaging + "poetry", + "edsnlp[docs-no-ml]", ] docs-no-ml = [ @@ -76,7 +82,10 @@ docs-no-ml = [ ] ml = [ "rich-logger>=0.3.1", - "torch>=1.13.0", + # TODO: uv doesn't seem to resolve torch correctly, unless we cap it ourself + "torch>=1.13.0,<2.0.0; python_version<'3.8'", + "torch>=1.13.0,<2.5.0; python_version<'3.9'", + "torch>=1.13.0; python_version>='3.9'", "foldedtensor>=0.4.0", "safetensors>=0.3.0; python_version>='3.8'", "safetensors>=0.3.0,<0.5.0; python_version<'3.8'", @@ -337,6 +346,8 @@ requires = [ "numpy==1.22.2; python_version>='3.8' and python_version<'3.9' and platform_machine=='loongarch64' and platform_python_implementation!='PyPy'", "numpy==1.22.2; python_version=='3.8' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", "numpy>=2.0; python_version>='3.9'", + "blis<1.0.0; python_version<'3.9'", + "blis<1.2.1; python_version>='3.9' and python_version<'3.10'", ] build-backend = "setuptools.build_meta" @@ -386,7 +397,7 @@ ignore-nested-functions = true ignore-nested-classes = true ignore-setters = true fail-under = 40 -exclude = ["setup.py", "docs", "build", "tests"] +exclude = ["setup.py", "docs", "build", "tests", "edsnlp/pipes/core/contextual_matcher/models.py"] verbose = 0 quiet = false whitelist-regex = [] @@ -430,6 +441,9 @@ include = ["edsnlp/*"] concurrency = ["multiprocessing", "thread"] parallel = true +[tool.uv.pip] +torch-backend = "auto" + [tool.cibuildwheel] skip = [ "*p36-*", # Skip Python 3.6 diff 
diff --git a/tests/conftest.py b/tests/conftest.py
index 54578d415..26b3306d9 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -27,6 +27,18 @@ pytest.importorskip("rich")
 
 
+def pytest_collection_modifyitems(items):
+    """Run test_docs* at the end"""
+    first_tests = []
+    last_tests = []
+    for item in items:
+        if item.name.startswith("test_code_blocks"):
+            last_tests.append(item)
+        else:
+            first_tests.append(item)
+    items[:] = first_tests + last_tests
+
+
 @fixture(scope="session", params=["eds", "fr"])
 def lang(request):
     return request.param
diff --git a/tests/pipelines/core/test_contextual_matcher.py b/tests/pipelines/core/test_contextual_matcher.py
index 7f4aaf6e7..e98da9b3a 100644
--- a/tests/pipelines/core/test_contextual_matcher.py
+++ b/tests/pipelines/core/test_contextual_matcher.py
@@ -1,8 +1,14 @@
+import os
+
 import pytest
 
+import edsnlp
+import edsnlp.pipes as eds
 from edsnlp.utils.examples import parse_example
 from edsnlp.utils.extensions import rgetattr
 
+os.environ["CONFIT_DEBUG"] = "1"
+
 EXAMPLES = [
     """
 Le patient présente une métastasis sur un cancer métastasé au stade 3 voire au stade 4.
@@ -156,7 +162,6 @@
 
 @pytest.mark.parametrize("params,example", list(zip(ALL_PARAMS, EXAMPLES)))
 def test_contextual(blank_nlp, params, example):
-
     include_assigned, replace_entity, reduce_mode_stage, reduce_mode_metastase = params
 
     blank_nlp.add_pipe(
@@ -225,9 +230,74 @@ def test_contextual(blank_nlp, params, example):
     assert len(doc.ents) == len(entities)
 
     for entity, ent in zip(entities, doc.ents):
-
         for modifier in entity.modifiers:
+            assert rgetattr(ent, modifier.key) == modifier.value, (
+                f"{modifier.key} labels don't match."
+            )
+
+
+def test_contextual_matcher_include(blank_nlp):
+    if not isinstance(blank_nlp, edsnlp.Pipeline):
+        pytest.skip("Only running for edsnlp.Pipeline")
+    blank_nlp.add_pipe(
+        eds.quantities(
+            span_setter=["sizes"],
+            quantities=["size"],
+        ),
+    )
+    blank_nlp.add_pipe(
+        eds.contextual_matcher(
+            name="tumor_size",
+            label="tumor_size",
+            assign_as_span=True,
+            patterns=[
+                dict(
+                    source="tumor_size",
+                    terms=["cancer", "tumeur"],
+                    regex_attr="NORM",
+                    include=dict(regex="mamm", window="sents[-1:1]"),
+                    assign=dict(
+                        name="size",
+                        span_getter="sizes",
+                        reduce_mode="first",
+                        required=True,
+                    ),
+                )
+            ],
+        ),
+    )
+    doc = blank_nlp("""\
+Bilan mammaire:
+La tumeur est de 3 cm.
+Tumeur au pied sans changement.
+Tumeur mammaire benigne.
+""")
+    assert len(doc.ents) == 1
+    ent = doc.ents[0]
+    assert ent.label_ == "tumor_size"
+    assert ent._.assigned["size"]._.value.cm == 3
+
+
+# Checks https://github.com/aphp/edsnlp/issues/394
+def test_contextual_matcher_exclude_outside():
+    import edsnlp
+    import edsnlp.pipes as eds
+
+    asa_pattern = r"\basa\b ?:? ?([1-5]|[A-Z]{1,3})"
+    exclude_asa_ttt = r"5"
+    asa = dict(
+        source="asa",
+        regex=asa_pattern,
+        regex_attr="NORM",
+        exclude=dict(regex=exclude_asa_ttt, window=-5),
+    )
+
+    nlp = edsnlp.blank("eds")
+    nlp.add_pipe(eds.sentences())
+    nlp.add_pipe(eds.contextual_matcher(patterns=[asa], label="asa"))
+
+    doc = nlp("ASA 5")
+    assert str(doc.ents) == "(ASA 5,)"
-            assert (
-                rgetattr(ent, modifier.key) == modifier.value
-            ), f"{modifier.key} labels don't match."
+ doc = nlp("5 ASA 5") + assert str(doc.ents) == "()" diff --git a/tests/pipelines/ner/test_score.py b/tests/pipelines/ner/test_score.py index 6d0aab203..2c278db74 100644 --- a/tests/pipelines/ner/test_score.py +++ b/tests/pipelines/ner/test_score.py @@ -1,6 +1,8 @@ import re -from edsnlp.pipelines.ner.scores import Score +import edsnlp +import edsnlp.pipes as eds +from edsnlp.pipes.ner.scores import Score from edsnlp.utils.examples import parse_example example = """ @@ -67,8 +69,28 @@ def testscore_normalization(raw_score: str): doc = testscore(doc) for entity, ent in zip(entities, doc.ents): - for modifier in entity.modifiers: - assert ( - getattr(ent._, modifier.key) == modifier.value - ), f"{modifier.key} labels don't match." + assert getattr(ent._, modifier.key) == modifier.value, ( + f"{modifier.key} labels don't match." + ) + + +def test_multi_value_extract(): + # dummy example, we have eds.quantities to extract sizes + nlp = edsnlp.blank("eds") + nlp.add_pipe( + eds.score( + name="taille", + regex=[r"taille"], + value_extract=[ + {"name": "value", "regex": r"(\d+)"}, + {"name": "unit", "regex": r"(cm|mm)"}, + ], + score_normalization=float, + label="taille", + ) + ) + doc = nlp("taille 12 cm") + assert len(doc.ents) == 1 + ent = doc.ents[0] + assert ent._.score_value == 12.0 diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 8c12941c0..9500b4640 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -1,3 +1,8 @@ +import pytest + +import edsnlp + + def test_pipelines(doc): assert len(doc.ents) == 3 patient, _, anomalie = doc.ents @@ -18,3 +23,10 @@ def test_import_all(): except (ImportError, AttributeError) as e: if "torch" in str(e): pass + + +def test_non_existing_pipe(): + with pytest.raises(AttributeError) as e: + getattr(edsnlp.pipes, "non_existing_pipe") + + assert str(e.value) == "module edsnlp.pipes has no attribute non_existing_pipe" diff --git a/tests/test_docs.py b/tests/test_docs.py index 5bc4ef1f9..24fe2e023 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,7 +1,13 @@ +import ast +import inspect +import re import sys +import textwrap import warnings +import catalogue import pytest +from spacy.tokens.underscore import Underscore pytest.importorskip("mkdocs") try: @@ -20,6 +26,8 @@ url_to_code = {} else: url_to_code = dict(extract_docs_code()) + # just to make sure something didn't go wrong + assert len(url_to_code) > 50 def printer(code: str) -> None: @@ -38,10 +46,97 @@ def printer(code: str) -> None: print("\n".join(lines)) +def insert_assert_statements(code): + line_table = [0] + for line in code.splitlines(keepends=True): + line_table.append(line_table[-1] + len(line)) + + tree = ast.parse(code) + replacements = [] + + for match in re.finditer( + r"^\s*#\s*Out\s*: (.*$(?:\n#\s.*$)*)", code, flags=re.MULTILINE + ): + lineno = code[: match.start()].count("\n") + for stmt in tree.body: + if stmt.end_lineno == lineno: + if isinstance(stmt, ast.Expr): + expected = textwrap.dedent(match.group(1)).replace("\n# ", "\n") + begin = line_table[stmt.lineno - 1] + if not (expected.startswith("'") or expected.startswith('"')): + expected = repr(expected) + end = match.end() + stmt_str = ast.unparse(stmt) + if stmt_str.startswith("print("): + stmt_str = stmt_str[len("print") :] + repl = f"""\ +value = {stmt_str} +assert {expected} == str(value) +""" + replacements.append((begin, end, repl)) + if isinstance(stmt, ast.For): + expected = textwrap.dedent(match.group(1)).split("\n# Out: ") + expected = 
+                    expected = [line.replace("\n# ", "\n") for line in expected]
+                    begin = line_table[stmt.lineno - 1]
+                    end = match.end()
+                    stmt_str = ast.unparse(stmt).replace("print", "assert_print")
+                    repl = f"""\
+printed = []
+{stmt_str}
+assert {expected} == printed
+"""
+                    replacements.append((begin, end, repl))
+
+    for begin, end, repl in reversed(replacements):
+        code = code[:begin] + repl + code[end:]
+
+    return code
+
+
+# TODO: once in a while, it can be interesting to run reset_imports for each code block,
+# instead of only once and tests should still pass, but it's way slower.
+@pytest.fixture(scope="module")
+def reset_imports():
+    """
+    Reset the imports for each test.
+    """
+    # 1. Clear registered functions to avoid using cached ones
+    for k, m in list(catalogue.REGISTRY.items()):
+        mod = inspect.getmodule(m)
+        if mod is not None and mod.__name__.startswith("edsnlp"):
+            del catalogue.REGISTRY[k]
+
+    # Let's ensure that we "bump" into every possible warnings:
+    # 2. Remove all modules that start with edsnlp, to reimport them
+    for k in list(sys.modules):
+        if k.split(".")[0] == "edsnlp":
+            del sys.modules[k]
+
+    # 3. Delete spacy extensions to avoid error when re-importing
+    Underscore.span_extensions.clear()
+    Underscore.doc_extensions.clear()
+    Underscore.token_extensions.clear()
+
+
 # Note the use of `str`, makes for pretty output
 @pytest.mark.parametrize("url", sorted(url_to_code.keys()), ids=str)
-def test_code_blocks(url):
-    raw = url_to_code[url]
+def test_code_blocks(url, tmpdir, reset_imports):
+    code = url_to_code[url]
+    code_with_asserts = """
+def assert_print(*args, sep=" ", end="\\n", file=None, flush=False):
+    printed.append((sep.join(map(str, args)) + end).rstrip('\\n'))
+
+""" + insert_assert_statements(code)
+    assert "# Out:" not in code_with_asserts, (
+        "Unparsed asserts in {url}:\n" + code_with_asserts
+    )
+    # We'll import test_code_blocks from here
+    sys.path.insert(0, str(tmpdir))
+    test_file = tmpdir.join("test_code_blocks.py")
+
+    # Clear all warnings
+    warnings.resetwarnings()
+
     try:
         with warnings.catch_warnings():
             warnings.simplefilter("error")
@@ -49,7 +144,18 @@ def test_code_blocks(url):
             warnings.filterwarnings(
                 message="__package__ != __spec__.parent", action="ignore"
             )
-            exec(raw, {"__MODULE__": "__main__"})
+            # First, forget test_code_blocks
+            sys.modules.pop("test_code_blocks", None)
+
+            # Then, reimport it, to let pytest do its assertion rewriting magic
+            test_file.write_text(code_with_asserts, encoding="utf-8")
+
+            import test_code_blocks  # noqa: F401
+
+            exec(
+                compile(code_with_asserts, test_file, "exec"),
+                {"__MODULE__": "__main__"},
+            )
     except Exception:
-        printer(raw)
+        printer(code_with_asserts)
         raise
diff --git a/tests/utils/test_span_getters.py b/tests/utils/test_span_getters.py
index 653514181..b401fb3bf 100644
--- a/tests/utils/test_span_getters.py
+++ b/tests/utils/test_span_getters.py
@@ -1,8 +1,17 @@
+import pytest
+from confit import validate_arguments
+
 import edsnlp
-from edsnlp.utils.span_getters import make_span_context_getter
+import edsnlp.pipes as eds
+from edsnlp.utils.span_getters import (
+    ContextWindow,
+    get_spans,
+    make_span_context_getter,
+    validate_span_setter,
+)
 
 
-def test_span_context_getter_symmetric(lang):
+def test_span_context_getter(lang):
     nlp = edsnlp.blank(lang)
     nlp.add_pipe("eds.normalizer")
     nlp.add_pipe("eds.sentences")
@@ -45,6 +54,28 @@ def test_span_context_getter_symmetric(lang):
     ]
 
 
+def test_span_getter_on_span():
+    nlp = edsnlp.blank("eds")
+    nlp.add_pipe(eds.sentences())
+    nlp.add_pipe(
+        eds.matcher(
terms={"animal": ["snake", "dog"]}, + span_setter=["ents", "animals"], + ) + ) + doc = nlp( + "There was a snake. " + "His friend was a dog. " + "He liked baking cakes. " + "But since he had no hands, he was a bad baker. " + ) + sents = list(doc.sents) + assert str(list(get_spans(sents[0], validate_span_setter("ents")))) == "[snake]" + assert str(list(get_spans(sents[0], validate_span_setter("animals")))) == "[snake]" + assert str(list(get_spans(doc[5:], validate_span_setter("animals")))) == "[dog]" + assert str(list(get_spans(doc[5:], validate_span_setter("*")))) == "[dog]" + + def test_span_context_getter_asymmetric(lang): nlp = edsnlp.blank(lang) nlp.add_pipe("eds.normalizer") @@ -98,3 +129,44 @@ def test_span_context_getter_asymmetric(lang): assert [span_getter(s).text for s in doc.ents] == [ "This is a sentence. This is another sentence with a kangaroo. This is a third one." # noqa: E501 ] + + +def test_context_getter_syntax(): + @validate_arguments + def get_snippet(span, context: ContextWindow): + return context(span) + + nlp = edsnlp.blank("eds") + nlp.add_pipe("eds.normalizer") + nlp.add_pipe("eds.sentences") + nlp.add_pipe("eds.matcher", config={"terms": {"dog": "dog"}}) + doc = nlp( + "There was a snake. " + "His friend was a dog. " + "He liked baking cakes. " + "But since he had no hands, he was a bad baker. " + ) + + assert ( + get_snippet(doc.ents[0], "words[-5:5]").text + == ". His friend was a dog. He liked baking cakes" + ) + + assert get_snippet(doc.ents[0], "words[-5:5] & sent").text == "His friend was a dog" + + assert ( + get_snippet(doc.ents[0], "words[-5:8] | sents[-1:1]").text + == "There was a snake. His friend was a dog. He liked baking cakes. " + "But since" + ) + + +def test_invalid_context_getter_syntax(): + @validate_arguments + def apply_context(context: ContextWindow): + pass + + apply_context("sents[-2:2]") + + with pytest.raises(ValueError): + apply_context("stuff[-2:2]")