Commit bd3d6d3

Merge branch 'dev'
2 parents 91bba2f + 3cce6fb

22 files changed (+750 −339 lines)

docs/contributing.rst

Lines changed: 6 additions & 1 deletion
@@ -36,4 +36,9 @@ the ``tests`` directory. We use ``pytest`` to test code, and also use
 ``hypothesis`` when applicable. If you open a patch, make sure that
 all tests are passing. In particular, do not rely on the CI, as it
 does not run time costly tests! Check for yourself locally, using
-``RENARD_TEST_ALL=1 python -m pytest tests``
+``RENARD_TEST_ALL=1 python -m pytest tests``. Note that there are
+specific tests and environment variables for optional dependencies such
+as *stanza* (``RENARD_TEST_STANZA_OPTDEP``). These must be explicitly
+set to ``1`` if you want to test optional dependencies, as
+``RENARD_TEST_ALL=1`` does not enable tests for these optional
+dependencies.
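The note above distinguishes ``RENARD_TEST_ALL`` from per-dependency flags like ``RENARD_TEST_STANZA_OPTDEP``. A minimal sketch of how such a gate typically behaves (the helper below is hypothetical, not Renard's actual test code):

```python
import os


def optdep_tests_enabled(var: str, environ=None) -> bool:
    """Return True only when `var` is explicitly set to "1".

    Hypothetical helper: setting RENARD_TEST_ALL=1 alone does not
    imply the optional-dependency flag.
    """
    environ = os.environ if environ is None else environ
    return environ.get(var) == "1"


# RENARD_TEST_ALL is set, but the stanza flag is not:
enabled = optdep_tests_enabled(
    "RENARD_TEST_STANZA_OPTDEP", environ={"RENARD_TEST_ALL": "1"}
)
# → False
```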

docs/extending.rst

Lines changed: 4 additions & 2 deletions
@@ -8,8 +8,10 @@ Creating new steps

 Usually, steps must implement at least four functions:

-- :meth:`.PipelineStep.__init__`: is used to pass options at step init time
-- :meth:`.PipelineStep.__call__`: is called at pipeline run time
+- :meth:`.PipelineStep.__init__`: is used to pass options at step init
+  time. Options passed at step init time should be valid for a
+  collection of texts, and not be text specific.
+- :meth:`.PipelineStep.__call__`: is called at pipeline run time.
 - :meth:`.PipelineStep.needs`: declares the set of information needed
   from the pipeline state by this step. Each returned string should be
   an attribute of :class:`.PipelineState`.
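The step contract described above can be sketched as a plain class. This is a simplified stand-in: a real step would subclass :class:`.PipelineStep`, and the ``production`` method shown here is assumed by symmetry with ``needs``, since the diff only names two of the four functions:

```python
from typing import Any, Dict, Set


class SentenceCountStep:
    """Schematic custom step (hypothetical, for illustration only)."""

    def __init__(self, delimiter: str = "."):
        # an option valid for a whole collection of texts, not text-specific
        self.delimiter = delimiter

    def needs(self) -> Set[str]:
        # pipeline-state attributes this step reads
        return {"text"}

    def production(self) -> Set[str]:
        # pipeline-state attributes this step writes
        return {"sentences_count"}

    def __call__(self, text: str, **kwargs) -> Dict[str, Any]:
        count = len([s for s in text.split(self.delimiter) if s.strip()])
        return {"sentences_count": count}


step = SentenceCountStep()
out = step("Alice met Bob. Bob left.")
# → {'sentences_count': 2}
```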

docs/pipeline.rst

Lines changed: 46 additions & 1 deletion
@@ -68,7 +68,7 @@ In that case, the ``tokens`` requirement is fulfilled at run time. If
 you don't pass the parameter, Renard will throw the following
 exception:

->>> ValueError: ["step 1 (NLTKNamedEntityRecognizer) has unsatisfied needs (needs : {'tokens'}, available : {'text'})"]
+>>> ValueError: ["step 1 (NLTKNamedEntityRecognizer) has unsatisfied needs. needs: {'tokens'}. available: {'text'}. missing: {'tokens'}."]


 For simplicity, one can use one of the preconfigured pipelines:

@@ -252,6 +252,51 @@ graph to a directory. Meanwhile,
 dynamic graph to the Gephi format.


+Custom Segmentation
+-------------------
+
+The ``dynamic_window`` parameter of
+:class:`.CoOccurrencesGraphExtractor` determines the segmentation of
+the dynamic networks, in number of interactions. In the example above,
+a new graph is created for every 20 interactions.
+
+While one can rely on the arguments of the pipeline's graph extractor
+to determine the dynamic window, Renard also allows specifying a
+custom segmentation of the text with the ``dynamic_blocks``
+argument. When running a pipeline, you can cut your text however you
+want and pass this argument in addition to the usual text:
+
+.. code-block:: python
+
+   from renard.pipeline import Pipeline
+   from renard.pipeline.tokenization import NLTKTokenizer
+   from renard.pipeline.ner import NLTKNamedEntityRecognizer
+   from renard.pipeline.character_unification import GraphRulesCharacterUnifier
+   from renard.pipeline.graph_extraction import CoOccurrencesGraphExtractor
+   from renard.utils import block_bounds
+
+   with open("./my_doc.txt") as f:
+       text = f.read()
+
+   # let's suppose the 'cut_into_chapters' function cuts the text into chapters
+   chapters = cut_into_chapters(text)
+
+   pipeline = Pipeline(
+       [
+           NLTKTokenizer(),
+           NLTKNamedEntityRecognizer(),
+           GraphRulesCharacterUnifier(),
+           CoOccurrencesGraphExtractor(co_occurrences_dist=25, dynamic=True),
+       ]
+   )
+
+   # the 'block_bounds' function automatically extracts the boundaries
+   # of your blocks of text
+   out = pipeline(text, dynamic_blocks=block_bounds(chapters))
+
+
 Multilingual Support
 ====================
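The documentation example above assumes a ``cut_into_chapters`` helper. A naive sketch of one is below; it is hypothetical and not part of Renard, and it assumes chapter headings are lines starting with ``CHAPTER``:

```python
from typing import List


def cut_into_chapters(text: str) -> List[str]:
    """Split a text on lines starting with 'CHAPTER'.

    Hypothetical helper: real texts need a more robust chapter detector.
    """
    chapters: List[List[str]] = []
    current: List[str] = []
    for line in text.splitlines():
        # a new heading closes the previous chapter (if any)
        if line.startswith("CHAPTER") and current:
            chapters.append(current)
            current = []
        current.append(line)
    if current:
        chapters.append(current)
    return ["\n".join(block) for block in chapters]


text = "CHAPTER 1\nAlice met Bob.\nCHAPTER 2\nBob left."
chapters = cut_into_chapters(text)
# → ['CHAPTER 1\nAlice met Bob.', 'CHAPTER 2\nBob left.']
```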

poetry.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ matplotlib = "^3.5.3"
 seqeval = "1.2.2"
 pandas = "^2.0.0"
 pytest = "^7.2.1"
-tibert = "^0.3.0"
+tibert = "^0.4.0"
 grimbert = "^0.1.0"
 datasets = "^2.16.1"

renard/graph_utils.py

Lines changed: 11 additions & 4 deletions
@@ -70,10 +70,17 @@ def graph_with_names(
     else:
         name_style_fn = name_style

-    return nx.relabel_nodes(
-        G,
-        {character: name_style_fn(character) for character in G.nodes()},  # type: ignore
-    )
+    mapping = {}
+    for character in G.nodes():
+        # NOTE: it is *possible* to have a graph where nodes are not
+        # characters (for example, simple strings). Therefore, we are
+        # lenient here
+        try:
+            mapping[character] = name_style_fn(character)
+        except AttributeError:
+            mapping[character] = character
+
+    return nx.relabel_nodes(G, mapping)


 def layout_with_names(
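The lenient mapping introduced in ``graph_with_names`` can be illustrated without networkx. The ``Character`` class and its ``longest_name`` method below are simplified stand-ins for Renard's actual types:

```python
class Character:
    """Simplified stand-in for Renard's Character type."""

    def __init__(self, names):
        self.names = names

    def longest_name(self) -> str:
        return max(self.names, key=len)


def lenient_mapping(nodes, name_style_fn):
    # mirror the leniency added in graph_with_names: nodes that are
    # not characters (e.g. plain strings) are kept as-is
    mapping = {}
    for node in nodes:
        try:
            mapping[node] = name_style_fn(node)
        except AttributeError:
            mapping[node] = node
    return mapping


liz = Character(["Liz", "Elizabeth Bennet"])
mapping = lenient_mapping([liz, "narrator"], lambda c: c.longest_name())
# → {liz: 'Elizabeth Bennet', 'narrator': 'narrator'}
```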

renard/ner_utils.py

Lines changed: 4 additions & 0 deletions
@@ -110,6 +110,10 @@ def __getitem__(self, index: Union[int, List[int]]) -> BatchEncoding:
         elt_context_mask = self._context_mask[index]
         for i in range(len(element)):
             w2t = batch.word_to_tokens(0, i)
+            # w2t can be None in case of truncation, which can happen
+            # if `element` is too long
+            if w2t is None:
+                continue
             mask_value = elt_context_mask[i]
             tokens_mask = [mask_value] * (w2t.end - w2t.start)
             batch["context_mask"][w2t.start : w2t.end] = tokens_mask
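The guard added above can be reproduced without ``transformers`` by faking the word-to-token lookup; in the real code, ``BatchEncoding.word_to_tokens`` returns ``None`` for words dropped by truncation:

```python
from typing import Optional, Tuple


def word_to_tokens(word_index: int, max_tokens: int = 4) -> Optional[Tuple[int, int]]:
    # fake lookup: each word maps to exactly one token, and everything
    # past max_tokens was dropped by truncation (None, mimicking
    # BatchEncoding.word_to_tokens)
    if word_index >= max_tokens:
        return None
    return (word_index, word_index + 1)


context_mask = [1, 1, 0, 0, 1, 1]  # one mask value per word
tokens_mask = [0] * 4

for i, mask_value in enumerate(context_mask):
    span = word_to_tokens(i)
    if span is None:  # word truncated away: skip it, as in the diff
        continue
    start, end = span
    tokens_mask[start:end] = [mask_value] * (end - start)
# → tokens_mask == [1, 1, 0, 0]
```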

renard/pipeline/character_unification.py

Lines changed: 14 additions & 4 deletions
@@ -61,6 +61,8 @@ def _assign_coreference_mentions(
     # we assign each chain to the character with the highest name
     # occurrence in it
     for chain in corefs:
+        if len(char_mentions) == 0:
+            break
         # determine the characters with the highest number of
         # occurrences
         occ_counter = {}

@@ -98,8 +100,13 @@ def __init__(self, min_appearances: int = 0) -> None:
            character for it to be valid
         """
         self.min_appearances = min_appearances
+        # a default value, will be set by _pipeline_init_
+        self.character_ner_tag = "PER"
         super().__init__()

+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
+        self.character_ner_tag = character_ner_tag
+
     def __call__(
         self,
         text: str,

@@ -112,7 +119,7 @@ def __call__(
         :param tokens:
         :param entities:
         """
-        persons = [e for e in entities if e.tag == "PER"]
+        persons = [e for e in entities if e.tag == self.character_ner_tag]

         characters = defaultdict(list)
         for entity in persons:

@@ -182,16 +189,19 @@ def __init__(
         self.additional_hypocorisms = additional_hypocorisms
         self.link_corefs_mentions = link_corefs_mentions
         self.ignore_lone_titles = ignore_lone_titles or set()
+        self.character_ner_tag = "PER"  # a default value, will be set by _pipeline_init_

         super().__init__()

-    def _pipeline_init_(self, lang: str, progress_reporter: ProgressReporter):
+    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
         self.hypocorism_gazetteer = HypocorismGazetteer(lang=lang)
         if not self.additional_hypocorisms is None:
             for name, nicknames in self.additional_hypocorisms:
                 self.hypocorism_gazetteer._add_hypocorism_(name, nicknames)

-        return super()._pipeline_init_(lang, progress_reporter)
+        self.character_ner_tag = character_ner_tag
+
+        return super()._pipeline_init_(lang, **kwargs)

@@ -201,7 +211,7 @@ def __call__(
     ) -> Dict[str, Any]:
         import networkx as nx

-        mentions = [m for m in entities if m.tag == "PER"]
+        mentions = [m for m in entities if m.tag == self.character_ner_tag]
         mentions_str = set(
             filter(
                 lambda m: not m in self.ignore_lone_titles,
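The recurring change in this file, replacing the hardcoded ``"PER"`` tag with a pipeline-provided ``character_ner_tag``, follows this pattern. This is a minimal sketch; the ``Entity`` type and the step body are simplified stand-ins for Renard's actual classes:

```python
from dataclasses import dataclass
from typing import List


@dataclass(frozen=True)
class Entity:
    """Simplified stand-in for Renard's NER entity type."""

    text: str
    tag: str


class NaiveCharacterUnifier:
    def __init__(self, min_appearances: int = 0):
        self.min_appearances = min_appearances
        # a default value, overridden by _pipeline_init_
        self.character_ner_tag = "PER"

    def _pipeline_init_(self, lang: str, character_ner_tag: str, **kwargs):
        # the pipeline, not the step, decides which NER tag marks characters
        self.character_ner_tag = character_ner_tag

    def __call__(self, entities: List[Entity]) -> List[Entity]:
        return [e for e in entities if e.tag == self.character_ner_tag]


step = NaiveCharacterUnifier()
# e.g. a NER model that tags persons as "PERS" instead of "PER"
step._pipeline_init_("eng", character_ner_tag="PERS")
persons = step([Entity("Alice", "PERS"), Entity("Paris", "LOC")])
# → [Entity(text='Alice', tag='PERS')]
```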

renard/pipeline/characters_extraction.py

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
+import sys
 import renard.pipeline.character_unification as cu

 print(
-    "[warning] the characters_extraction module is deprecated. Use character_unification instead."
+    "[warning] the characters_extraction module is deprecated. Use character_unification instead.",
+    file=sys.stderr,
 )

 Character = cu.Character
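The point of routing the deprecation notice to ``sys.stderr`` is that programs capturing stdout stay clean. A quick self-contained check of that behavior:

```python
import contextlib
import io
import sys


def deprecated_import_notice():
    # mirrors the shim above: the warning goes to stderr, not stdout
    print(
        "[warning] the characters_extraction module is deprecated. "
        "Use character_unification instead.",
        file=sys.stderr,
    )


buf = io.StringIO()
with contextlib.redirect_stdout(buf):
    deprecated_import_notice()
# stdout captured nothing; the notice went to stderr
```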

0 commit comments
