explosion · honnibal · Jun 10, 2023 · Jun 10, 2023 · Jun 10, 2023 · Jun 10, 2023
diff --git a/spacy/__init__.py b/spacy/__init__.py
@@ -32,6 +32,7 @@ def load(
     enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
+    pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
-    pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
+    pipe_instances: Dict[str, PipeCallable] = util.SimpleFrozenDict(),
-    pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
+    pipe_instances: Dict[str, PipeCallable] = util.SimpleFrozenDict(),
 ) -> Language:
     """Load a spaCy model from an installed package or a local path.
 
@@ -55,6 +56,7 @@ def load(
         enable=enable,
         exclude=exclude,
         config=config,
+        pipe_instances=pipe_instances,
     )
 
 

diff --git a/spacy/errors.py b/spacy/errors.py
@@ -219,6 +219,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
             "key attribute for vectors, configure it through Vectors(attr=) or "
             "'spacy init vectors --attr'")
+    W126 = ("Pipe instance '{name}' is being added with a vocab "
+            "instance that does not match other components. This is "
+            "usually an error.")
 
 
 class Errors(metaclass=ErrorsWithCodes):
@@ -981,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes):
              " 'min_length': {min_length}, 'max_length': {max_length}")
     E1054 = ("The text, including whitespace, must match between reference and "
              "predicted docs when training {component}.")
+    E1055 = ("Cannot create Language instance from config: missing pipeline components. The following components were added by instance (rather than config) via the 'Language.add_pipe_instance()' method, but are not present in the 'pipe_instances' variable: {names}")
 
 
 # Deprecated model shortcuts, only used in errors and warnings

diff --git a/spacy/language.py b/spacy/language.py
@@ -75,6 +75,9 @@
 # This is the base config for the [pretraining] block and currently not included
 # in the main config and only added via the 'init fill-config' command
 DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
+# Factory name indicating that the component wasn't constructed by a factory,
+# and was instead passed by instance
+INSTANCE_FACTORY_NAME = "__added_by_instance__"
 
 # Type variable for contexts piped with documents
 _AnyContext = TypeVar("_AnyContext")
@@ -768,9 +771,13 @@ def add_pipe(
         raw_config: Optional[Config] = None,
         validate: bool = True,
     ) -> PipeCallable:
-        """Add a component to the processing pipeline. Valid components are
-        callables that take a `Doc` object, modify it and return it. Only one
-        of before/after/first/last can be set. Default behaviour is "last".
+        """Add a component to the processing pipeline, by factory name and config.
+        Valid components are callables that take a `Doc` object, modify it, and return it.
+        Only one of before/after/first/last can be set. Default behaviour is "last".
+
+        Using this method tells spaCy how to construct your component, allowing you to load
+        your pipeline back using generic code. See 'Language.add_pipe_instance' to add a
+        component object instead, avoiding the need to create a factory.
 
         factory_name (str): Name of the component factory.
         name (str): Name of pipeline component. Overwrites existing
@@ -818,12 +825,61 @@ def add_pipe(
                 raw_config=raw_config,
                 validate=validate,
             )
-        pipe_index = self._get_pipe_index(before, after, first, last)
         self._pipe_meta[name] = self.get_factory_meta(factory_name)
+        pipe_index = self._get_pipe_index(before, after, first, last)
         self._components.insert(pipe_index, (name, pipe_component))
         self._link_components()
         return pipe_component
 
+    def add_pipe_instance(
+        self,
+        component: PipeCallable,
+        name: Optional[str] = None,
+        *,
+        before: Optional[Union[str, int]] = None,
+        after: Optional[Union[str, int]] = None,
+        first: Optional[bool] = None,
+        last: Optional[bool] = None,
+    ) -> PipeCallable:
+        """Add a component instance to the processing pipeline. Valid components
+        are callables that take a `Doc` object, modify it and return it. Only one
+        of before/after/first/last can be set. Default behaviour is "last".
+
+        A limitation of this method is that spaCy will not know how to reconstruct
+        your pipeline after you save it out (unlike the 'Language.add_pipe()' method,
+        where you provide a config and let spaCy construct the instance). See 'spacy.load'
+        for details of how to load back a pipeline with components added by instance.
+
+        component (Callable[[Doc], Doc]): The component to add.
+        name (str): Name of pipeline component. Overwrites existing
+            component.name attribute if available. If no name is set and
+            the component exposes no name attribute, component.__name__ is
+            used. An error is raised if a name already exists in the pipeline.
+        before (Union[str, int]): Name or index of the component to insert new
+            component directly before.
+        after (Union[str, int]): Name or index of the component to insert new
+            component directly after.
+        first (bool): If True, insert component first in the pipeline.
+        last (bool): If True, insert component last in the pipeline.
+        RETURNS (Callable[[Doc], Doc]): The pipeline component.
+
+        DOCS: https://spacy.io/api/language#add_pipe_instance
+        """
+        name = name if name is not None else getattr(component, "name")
+        if name is None:
+            raise ValueError("TODO error")
+        if name in self.component_names:
+            raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
+
+        # It would be possible to take arguments for the FactoryMeta here, but we'll then have
+        # a problem on deserialization: where will the data be coming from?
+        # I think if someone wants that, they should register a component function.
+        self._pipe_meta[name] = FactoryMeta(INSTANCE_FACTORY_NAME)
+        self._pipe_configs[name] = Config()
+        pipe_index = self._get_pipe_index(before, after, first, last)
+        self._components.insert(pipe_index, (name, component))
+        return component
+
     def _get_pipe_index(
         self,
         before: Optional[Union[str, int]] = None,
@@ -1735,6 +1791,7 @@ def from_config(
         meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
+        pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
     ) -> "Language":
         """Create the nlp object from a loaded config. Will set up the tokenizer
         and language data, add pipeline components etc. If no config is provided,
@@ -1810,6 +1867,11 @@ def from_config(
 
         # Warn about require_gpu usage in jupyter notebook
         warn_if_jupyter_cupy()
+        # If we've been passed pipe instances, check whether
+        # they have a Vocab instance, and if they do, use
+        # that one. This also performs some additional checks and
+        # warns if there's a mismatch.
+        vocab = _get_instantiated_vocab(vocab, pipe_instances)
 
         # Note that we don't load vectors here, instead they get loaded explicitly
         # inside stuff like the spacy train function. If we loaded them here,
@@ -1826,6 +1888,11 @@ def from_config(
         interpolated = filled.interpolate() if not filled.is_interpolated else filled
         pipeline = interpolated.get("components", {})
         sourced = util.get_sourced_components(interpolated)
+        # Check for components that aren't in the pipe_instances dict, aren't disabled,
+        # and aren't built by factory.
+        missing_components = _find_missing_components(pipeline, pipe_instances, exclude)
+        if missing_components:
+            raise ValueError(Errors.E1055.format(names=", ".join(missing_components)))
         # If components are loaded from a source (existing models), we cache
         # them here so they're only loaded once
         source_nlps = {}
@@ -1835,6 +1902,16 @@ def from_config(
             if pipe_name not in pipeline:
                 opts = ", ".join(pipeline.keys())
                 raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
+            if pipe_name in pipe_instances:
+                if pipe_name in exclude:
+                    continue
+                else:
+                    nlp.add_pipe_instance(pipe_instances[pipe_name])
+            # Is it important that we instantiate pipes that
+            # aren't excluded? It seems like we would want
+            # the exclude check above. I've left it how it
+            # is though, in case there's some sort of crazy
+            # load-bearing side-effects someone is relying on?
             pipe_cfg = util.copy_config(pipeline[pipe_name])
             raw_config = Config(filled["components"][pipe_name])
             if pipe_name not in exclude:
@@ -2337,3 +2414,45 @@ def step(self) -> None:
         if self.count >= self.chunk_size:
             self.count = 0
             self.send()
+
+
+def _get_instantiated_vocab(
+    vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any]
+) -> Union[bool, Vocab]:
+    vocab_instances = {}
+    for name, instance in pipe_instances.items():
+        if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab):
+            vocab_instances[name] = instance.vocab
+    if not vocab_instances:
+        return vocab
+    elif isinstance(vocab, Vocab):
+        for name, inst_voc in vocab_instances.items():
+            if inst_voc is not vocab:
+                warnings.warn(Warnings.W126.format(name=name))
+        return vocab
+    else:
+        resolved_vocab = None
+        for name, inst_voc in vocab_instances.items():
+            if resolved_vocab is None:
+                resolved_vocab = inst_voc
+            elif inst_voc is not resolved_vocab:
+                warnings.warn(Warnings.W126.format(name=name))
+        # We should guarantee a vocab from the logic above.
+        assert resolved_vocab is not None
+        return resolved_vocab
+
+
+def _find_missing_components(
+    pipeline: Dict[str, Dict[str, Any]],
+    pipe_instances: Dict[str, Any],
+    exclude: Iterable[str],
+) -> List[str]:
+    missing = []
+    for name, config in pipeline.items():
+        if (
+            config.get("factory") == INSTANCE_FACTORY_NAME
+            and name not in pipe_instances
+            and name not in exclude
+        ):
+            missing.append(name)
+    return missing
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
@@ -800,3 +800,40 @@ def bad_pipe(doc):
     nlp.add_pipe("test_component_bad_pipe")
     with pytest.raises(ValueError, match="instead of a Doc"):
         nlp("text")
+
+
+@pytest.mark.parametrize(
+    "components,kwargs,position",
+    [
+        (["t1", "t2"], {"before": "t1"}, 0),
+        (["t1", "t2"], {"after": "t1"}, 1),
+        (["t1", "t2"], {"after": "t1"}, 1),
+        (["t1", "t2"], {"first": True}, 0),
+        (["t1", "t2"], {"last": True}, 2),
+        (["t1", "t2"], {"last": False}, 2),
+        (["t1", "t2"], {"first": False}, ValueError),
+    ],
+)
+def test_add_pipe_instance(components, kwargs, position):
+    nlp = Language()
+    for name in components:
+        nlp.add_pipe("textcat", name=name)
+    pipe_names = list(nlp.pipe_names)
+    if isinstance(position, int):
+        result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs)
+        assert result is evil_component
+        pipe_names.insert(position, "new_component")
+        assert nlp.pipe_names == pipe_names
+    else:
+        with pytest.raises(ValueError):
+            result = nlp.add_pipe_instance(
+                evil_component, name="new_component", **kwargs
+            )
+
+
+def test_add_pipe_instance_to_bytes():
+    nlp = Language()
+    nlp.add_pipe("textcat", name="t1")
+    nlp.add_pipe("textcat", name="t2")
+    nlp.add_pipe_instance(evil_component, name="new_component")
+    b = nlp.to_bytes()
diff --git a/spacy/util.py b/spacy/util.py
@@ -438,6 +438,7 @@ def load_model(
     enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
+    pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from a package or data path.
 
@@ -449,6 +450,9 @@ def load_model(
     exclude (Union[str, Iterable[str]]):  Name(s) of pipeline component(s) to exclude.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
+    pipe_instances (Dict[str, Any]): Dictionary of components
+        to be added to the pipeline directly (not created from
+        config)
     RETURNS (Language): The loaded nlp object.
     """
     kwargs = {
@@ -457,6 +461,7 @@ def load_model(
         "enable": enable,
         "exclude": exclude,
         "config": config,
+        "pipe_instances": pipe_instances,
     }
     if isinstance(name, str):  # name or string path
         if name.startswith("blank:"):  # shortcut for blank model
@@ -480,6 +485,7 @@ def load_model_from_package(
     enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
+    pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from an installed package.
 
@@ -495,10 +501,13 @@ def load_model_from_package(
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
+    pipe_instances (Dict[str, Any]): Dictionary of components
+        to be added to the pipeline directly (not created from
+        config)
     RETURNS (Language): The loaded nlp object.
     """
     cls = importlib.import_module(name)
-    return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config)  # type: ignore[attr-defined]
+    return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config, pipe_instances=pipe_instances)  # type: ignore[attr-defined]
 
 
 def load_model_from_path(
@@ -510,6 +519,7 @@ def load_model_from_path(
     enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
+    pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
 ) -> "Language":
     """Load a model from a data directory path. Creates Language class with
     pipeline from config.cfg and then calls from_disk() with path.
@@ -527,6 +537,9 @@ def load_model_from_path(
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
+    pipe_instances (Dict[str, Any]): Dictionary of components
+        to be added to the pipeline directly (not created from
+        config)
     RETURNS (Language): The loaded nlp object.
     """
     if not model_path.exists():
@@ -543,6 +556,7 @@ def load_model_from_path(
         enable=enable,
         exclude=exclude,
         meta=meta,
+        pipe_instances=pipe_instances,
     )
     return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
 
@@ -557,6 +571,7 @@ def load_model_from_config(
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     auto_fill: bool = False,
     validate: bool = True,
+    pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
 ) -> "Language":
     """Create an nlp object from a config. Expects the full config file including
     a section "nlp" containing the settings for the nlp object.
@@ -574,6 +589,9 @@ def load_model_from_config(
         components won't be loaded.
     auto_fill (bool): Whether to auto-fill config with missing defaults.
     validate (bool): Whether to show config validation errors.
+    pipe_instances (Dict[str, Any]): Dictionary of components
+        to be added to the pipeline directly (not created from
+        config)
     RETURNS (Language): The loaded nlp object.
     """
     if "nlp" not in config:
@@ -593,6 +611,7 @@ def load_model_from_config(
         auto_fill=auto_fill,
         validate=validate,
         meta=meta,
+        pipe_instances=pipe_instances,
     )
     return nlp
 
@@ -656,6 +675,7 @@ def load_model_from_init_py(
     enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
     config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
+    pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
 ) -> "Language":
     """Helper function to use in the `load()` method of a model package's
     __init__.py.
@@ -671,6 +691,9 @@ def load_model_from_init_py(
         components won't be loaded.
     config (Dict[str, Any] / Config): Config overrides as nested dict or dict
         keyed by section values in dot notation.
+    pipe_instances (Dict[str, Any]): Dictionary of components
+        to be added to the pipeline directly (not created from
+        config)
     RETURNS (Language): The loaded nlp object.
     """
     model_path = Path(init_file).parent