From aa18bace892521e6a72b0720e75a955206f8dce8 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 26 Oct 2023 13:22:35 -0400 Subject: [PATCH 1/7] Pipelines Refactor - Initial Impl (#1287) --- src/deepsparse/v2/__init__.py | 21 ++++ src/deepsparse/v2/operators/__init__.py | 17 +++ src/deepsparse/v2/operators/operator.py | 90 ++++++++++++++++ src/deepsparse/v2/pipeline.py | 102 ++++++++++++++++++ src/deepsparse/v2/routers/__init__.py | 17 +++ src/deepsparse/v2/routers/router.py | 95 ++++++++++++++++ src/deepsparse/v2/schedulers/__init__.py | 18 ++++ src/deepsparse/v2/schedulers/scheduler.py | 63 +++++++++++ .../v2/schedulers/scheduler_group.py | 64 +++++++++++ src/deepsparse/v2/utils/__init__.py | 18 ++++ src/deepsparse/v2/utils/context.py | 42 ++++++++ src/deepsparse/v2/utils/types.py | 28 +++++ tests/deepsparse/v2/__init__.py | 0 tests/deepsparse/v2/test_basic_pipeline.py | 45 ++++++++ 14 files changed, 620 insertions(+) create mode 100644 src/deepsparse/v2/__init__.py create mode 100644 src/deepsparse/v2/operators/__init__.py create mode 100644 src/deepsparse/v2/operators/operator.py create mode 100644 src/deepsparse/v2/pipeline.py create mode 100644 src/deepsparse/v2/routers/__init__.py create mode 100644 src/deepsparse/v2/routers/router.py create mode 100644 src/deepsparse/v2/schedulers/__init__.py create mode 100644 src/deepsparse/v2/schedulers/scheduler.py create mode 100644 src/deepsparse/v2/schedulers/scheduler_group.py create mode 100644 src/deepsparse/v2/utils/__init__.py create mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 src/deepsparse/v2/utils/types.py create mode 100644 tests/deepsparse/v2/__init__.py create mode 100644 tests/deepsparse/v2/test_basic_pipeline.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..4a897be06f --- /dev/null +++ b/src/deepsparse/v2/__init__.py @@ -0,0 +1,21 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline import * +from .operators import * +from .routers import * +from .schedulers import * +from .utils import * diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py new file mode 100644 index 0000000000..8f7e6a169d --- /dev/null +++ b/src/deepsparse/v2/operators/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .operator import *
diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py
new file mode 100644
index 0000000000..30e1a48379
--- /dev/null
+++ b/src/deepsparse/v2/operators/operator.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Optional, Type
+
+from pydantic import BaseModel
+
+from deepsparse.v2.utils import Context, OperatorSchema
+
+
+__all__ = ["Operator"]
+
+
+class Operator(ABC):
+    """
+    Base operator class - can represent any part of an ML pipeline
+    """
+
+    # expected structured input and output types, to be defined by child classes
+    input_schema: Optional[Type[OperatorSchema]] = None
+    output_schema: Optional[Type[OperatorSchema]] = None
+
+    @abstractmethod
+    def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema:
+        """
+        :param inp: operator input, as the defined input schema if applicable
+        :param context: pipeline context of already run operators
+        :return: result of this operator as the defined output schema if applicable
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def has_input_schema(cls) -> bool:
+        """
+        :return: True if this class has a defined pydantic input schema
+        """
+        return issubclass(cls.input_schema, BaseModel)
+
+    @classmethod
+    def has_output_schema(cls) -> bool:
+        """
+        :return: True if this class has a defined pydantic output schema
+        """
+        return issubclass(cls.output_schema, BaseModel)
+
+    def __call__(
+        self,
+        *args,
+        context: Optional[Context] = None,
+        **kwargs,
+    ) -> OperatorSchema:
+        """
+        Parses inputs to this Operator and runs the run() method of this operator
+
+        :param args: an unnamed arg may only be provided
+            if it is of the type of the input_schema
+        :param context: pipeline context to pass to operator
+        :param kwargs: kwargs when not initializing from an instantiated schema
+        :return: operator output
+        """
+        if len(args) > 1:
+            raise ValueError(
+                f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}"
+            )
+
+        if len(args) == 1:
+            if self.input_schema is not None and isinstance(args[0], self.input_schema):
+                inference_input = args[0]
+            else:
+                raise ValueError(
+                    f"1 arg supplied to Operator {self.__class__.__name__} but was not "
+                    f"of expected type {self.input_schema}, found {type(args[0])}"
+                )
+        elif self.has_input_schema():
+            inference_input = self.input_schema(**kwargs)
+        else:
+            inference_input = kwargs
+        return self.run(inference_input, context=context)
diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py
new file mode 100644
index 0000000000..0ec580687d
--- /dev/null
+++ b/src/deepsparse/v2/pipeline.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List
+
+from pydantic import BaseModel, Field, PrivateAttr
+
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.routers import Router
+from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup
+
+
+__all__ = ["Pipeline"]
+
+
+class Pipeline(BaseModel):
+    """
+    Pipeline accepts a series of operators, schedulers, and a router which define
+    an end-to-end ML transformation.
+
+    Calling a pipeline runs these transformations
+    """
+
+    stages: List[Operator] = Field(
+        required=True,
+        description="In-order list of operators that make up this pipeline",
+    )
+    router: Router = Field(
+        default_factory=Router,
+        description="Router object to determine order and run the stages. "
+        "Defaults to the base Router object",
+    )
+    schedulers: List[OperatorScheduler] = Field(
+        default_factory=lambda: [OperatorScheduler()],
+        description="List of schedulers to run operators in order of priority",
+    )
+
+    _scheduler_group: SchedulerGroup = PrivateAttr()
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.validate()
+
+        # SchedulerGroup handles running all schedulers in order of priority
+        self._scheduler_group = SchedulerGroup(self.schedulers)
+
+    def __call__(self, *args, return_context: bool = False, **kwargs):
+        """
+        :param return_context: if True, returns tuple of the pipeline output
+            and entire context. Default False
+        :return: output of the pipeline stages run with the router for the given input
+        """
+        if len(args) > 1:
+            raise ValueError(
+                "Only 1 in-line argument may be supplied to Pipeline which "
+                f"must be a Schema, found: {len(args)}"
+            )
+        if args and kwargs:
+            raise ValueError(
+                "Pipeline can only run either a single in-line argument schema or a "
+                f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs"
+            )
+
+        pipeline_input = args[0] if args else kwargs
+        pipeline_output, context = self.router.run(
+            inp=pipeline_input,
+            operators=self.stages,
+            scheduler=self._scheduler_group,
+        )
+
+        if return_context:
+            return pipeline_output, context
+
+        return pipeline_output
+
+    def validate(self):
+        router_validation = self.router.validate(self.stages)
+
+        if router_validation is False:
+            # default error message
+            stage_types = [type(stage) for stage in self.stages]
+            raise ValueError(
+                f"Invalid Router: {type(self.router)} for stages: {stage_types}"
+            )
+        elif isinstance(router_validation, str):
+            raise ValueError(f"Invalid Router for stages: {router_validation}")
diff --git a/src/deepsparse/v2/routers/__init__.py b/src/deepsparse/v2/routers/__init__.py
new file mode 100644
index 0000000000..8718bedeb4
--- /dev/null
+++ b/src/deepsparse/v2/routers/__init__.py
@@ -0,0 +1,17 @@
+# flake8: noqa
+
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .router import *
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py
new file mode 100644
index 0000000000..284c348c10
--- /dev/null
+++ b/src/deepsparse/v2/routers/router.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Tuple, Union
+
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.schedulers import OperatorScheduler
+from deepsparse.v2.utils import Context, OperatorSchema
+
+
+__all__ = ["Router"]
+
+
+class Router:
+    """
+    Routers must implement a run method which runs a series of operators
+    for a pipeline for a given input. Base Router runs operators linearly
+    in a series
+    """
+
+    @staticmethod
+    def run(
+        inp: OperatorSchema,
+        operators: List[Operator],
+        scheduler: OperatorScheduler,
+    ) -> Tuple[OperatorSchema, Context]:
+        """
+        :param inp: input to the first operator of the series
+        :param operators: list of operators to run
+        :param scheduler: scheduler to submit operators to
+        :return: final output of the operators
+        """
+        context = Context()
+
+        # run operators linearly
+        operator_input = inp
+        for operator in operators:
+            output_future = scheduler.submit(
+                operator=operator, operator_input=operator_input, context=context
+            )
+
+            # wait for future to resolve
+            operator_output = output_future.result()
+
+            # update context
+            context.update(
+                operator=operator,
+                input=operator_input,
+                output=operator_output,
+            )
+
+            # previous output becomes next input
+            operator_input = operator_output
+
+        return operator_output, context
+
+    @staticmethod
+    def validate(operators: List[Operator]) -> Union[bool, str]:
+        """
+        :param operators: operators that this Router could potentially run over
+        :return: True if this Router can run this series of operators. Base Router
+            runs any series of operators that is non empty and whose input and output
+            schemas align. 
If not valid, either False or an error string will be + returned + """ + if len(operators) < 1: + return "No operators found" + + for idx in range(len(operators) - 1): + current_output_schema = operators[idx].output_schema + next_input_schema = operators[idx + 1].input_schema + + if current_output_schema is None or next_input_schema is None: + # if no input/output schema defined, assume operator can run + # without schema + continue + + if current_output_schema != next_input_schema: + return ( + f"Operator at idx {idx}: {type(operators[idx])} has invalid " + f"output schema {current_output_schema} for next operator " + f"{type(operators[idx + 1])} which requires {next_input_schema}" + ) diff --git a/src/deepsparse/v2/schedulers/__init__.py b/src/deepsparse/v2/schedulers/__init__.py new file mode 100644 index 0000000000..04c37077e1 --- /dev/null +++ b/src/deepsparse/v2/schedulers/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .scheduler import * +from .scheduler_group import * diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py new file mode 100644 index 0000000000..53f0c8f625 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future, ThreadPoolExecutor + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["OperatorScheduler"] + + +class OperatorScheduler: + """ + OperatorSchedulers should implement a `submit` function that asynchronously + runs an operator and its input and returns a Future. 
Priority of operators + to run and resources they are run on are deferred to specific OperatorScheduler + implementations + + Base OperatorScheduler behaves as a simple queue deferring to ThreadPoolExecutor + + :param max_workers: maximum number of threads to execute at once + """ + + def __init__(self, max_workers: int = 1): + self._threadpool = ThreadPoolExecutor(max_workers=max_workers) + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + if isinstance(operator_input, dict): + return self._threadpool.submit(operator, context=context, **operator_input) + return self._threadpool.submit(operator, operator_input, context=context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. + Base OperatorScheduler always returns True + """ + return True diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py new file mode 100644 index 0000000000..2f797b30c7 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future +from typing import List + +from deepsparse.v2.operators import Operator +from deepsparse.v2.schedulers.scheduler import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["SchedulerGroup"] + + +class SchedulerGroup(OperatorScheduler): + """ + Wrapper for a series of schedulers. Runs submitted operators on the first + scheduler that can process a given input + + :param schedulers: list of schedulers to pass operators to + """ + + def __init__(self, schedulers: List[OperatorScheduler]): + self.schedulers = schedulers + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + for scheduler in self.schedulers: + if scheduler.can_process(operator, operator_input): + return scheduler.submit(operator, operator_input, context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. 
+ SchedulerGroup always returns True + """ + return any( + scheduler.can_process(operator, operator_input) + for scheduler in self.schedulers + ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py new file mode 100644 index 0000000000..4f36eeb448 --- /dev/null +++ b/src/deepsparse/v2/utils/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import * +from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py new file mode 100644 index 0000000000..81fe26de61 --- /dev/null +++ b/src/deepsparse/v2/utils/context.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, List, NamedTuple + +from deepsparse.v2.utils.types import OperatorSchema + + +__all__ = ["Context"] + + +class StageInfo(NamedTuple): + operator: Callable + input: OperatorSchema + output: OperatorSchema + + +class Context: + """ + Context contains the full history of operators and their inputs and outputs + in a pipeline + """ + + def __init__(self): + self.stages_executed: List[StageInfo] = [] + + def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): + self.stages_executed.append( + StageInfo(operator=operator, input=input, output=output) + ) diff --git a/src/deepsparse/v2/utils/types.py b/src/deepsparse/v2/utils/types.py new file mode 100644 index 0000000000..3e4b974453 --- /dev/null +++ b/src/deepsparse/v2/utils/types.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Types to support deepsparse pipelines +""" + +from typing import Any, Dict, Union + +from pydantic import BaseModel + + +__all__ = ["OperatorSchema"] + + +# Operator inputs and outputs may either be a pydantic base model or a dict of kwargs +OperatorSchema = Union[BaseModel, Dict[str, Any]] diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py new file mode 100644 index 0000000000..d39bc61c8c --- /dev/null +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -0,0 +1,45 @@ +""" +Simple example and test of a dummy pipeline +""" + +from pydantic import BaseModel + +from deepsparse.v2 import Pipeline +from deepsparse.v2.operators import Operator +from deepsparse.v2.routers import Router +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +class IntSchema(BaseModel): + value: int + + +class AddOneOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 1) + + +class AddTwoOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 2) + + +AddThreePipeline = Pipeline( + stages=[AddOneOperator(), AddTwoOperator()], + router=Router(), + schedulers=[OperatorScheduler()], +) + + +def test_run_simple_pipeline(): + pipeline_input = IntSchema(value=5) + pipeline_output = AddThreePipeline(pipeline_input) + + assert pipeline_output.value == 8 From 4e5f2af1da99b5ecd7e76ead5a056734e88e0af1 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 31 Oct 2023 16:24:10 -0400 Subject: [PATCH 2/7] [Pipeline Refactor] Additional functionality, engine operator, linear router and image classification pipeline/operators/example (#1325) * initial functionality and working example with image classification * remove testing image * update args * initial functionality and working example with image classification * remove testing image * pr comments * defines schemas for operators and test * add image classification test, PR comments * fix input/output handling in pipeline and operator base classes to be more generic; remove context * add additional operator input message * typo fix --- src/deepsparse/v2/__init__.py | 2 +- .../v2/image_classification/__init__.py | 20 +++ .../v2/image_classification/pipeline.py | 62 ++++++++ .../postprocess_operator.py | 81 ++++++++++ .../preprocess_operator.py | 149 ++++++++++++++++++ .../v2/operators/engine_operator.py | 133 ++++++++++++++++ src/deepsparse/v2/operators/operator.py | 92 +++++++---- src/deepsparse/v2/pipeline.py | 130 ++++++++------- src/deepsparse/v2/routers/router.py | 88 ++++++----- src/deepsparse/v2/schedulers/scheduler.py | 14 +- .../v2/schedulers/scheduler_group.py | 16 +- src/deepsparse/v2/utils/__init__.py | 1 - src/deepsparse/v2/utils/context.py | 42 ----- tests/deepsparse/v2/__init__.py | 13 ++ tests/deepsparse/v2/test_basic_pipeline.py | 31 +++- .../v2/test_image_classification.py | 39 +++++ 16 files changed, 709 insertions(+), 204 deletions(-) create mode 100644 src/deepsparse/v2/image_classification/__init__.py create mode 100644 src/deepsparse/v2/image_classification/pipeline.py create mode 100644 src/deepsparse/v2/image_classification/postprocess_operator.py create mode 
100644 src/deepsparse/v2/image_classification/preprocess_operator.py create mode 100644 src/deepsparse/v2/operators/engine_operator.py delete mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 tests/deepsparse/v2/test_image_classification.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py index 4a897be06f..29fcd4126c 100644 --- a/src/deepsparse/v2/__init__.py +++ b/src/deepsparse/v2/__init__.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .pipeline import * from .operators import * +from .pipeline import * from .routers import * from .schedulers import * from .utils import * diff --git a/src/deepsparse/v2/image_classification/__init__.py b/src/deepsparse/v2/image_classification/__init__.py new file mode 100644 index 0000000000..8668227df7 --- /dev/null +++ b/src/deepsparse/v2/image_classification/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .postprocess_operator import * +from .preprocess_operator import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/image_classification/pipeline.py b/src/deepsparse/v2/image_classification/pipeline.py new file mode 100644 index 0000000000..3d7887a701 --- /dev/null +++ b/src/deepsparse/v2/image_classification/pipeline.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import logging
+import warnings
+from typing import Dict, Optional, Tuple, Union
+
+from deepsparse.v2.image_classification.postprocess_operator import (
+    ImageClassificationPostProcess,
+)
+from deepsparse.v2.image_classification.preprocess_operator import (
+    ImageClassificationPreProcess,
+)
+from deepsparse.v2.operators.engine_operator import EngineOperator
+from deepsparse.v2.pipeline import Pipeline
+from deepsparse.v2.routers.router import LinearRouter
+from deepsparse.v2.schedulers.scheduler import OperatorScheduler
+
+
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["ImageClassificationPipeline"]
+
+
+class ImageClassificationPipeline(Pipeline):
+    def __init__(
+        self,
+        model_path: str,
+        engine_kwargs: Optional[Dict] = None,
+        class_names: Union[None, str, Dict[str, str]] = None,
+        image_size: Optional[Tuple[int]] = None,
+        top_k: int = 1,
+    ):
+        if not engine_kwargs:
+            engine_kwargs = {}
+            engine_kwargs["model_path"] = model_path
+        elif engine_kwargs.get("model_path") != model_path:
+            warnings.warn(f"Updating engine_kwargs to include {model_path}")
+            engine_kwargs["model_path"] = model_path
+
+        engine = EngineOperator(**engine_kwargs)
+        preprocess = ImageClassificationPreProcess(
+            model_path=engine.model_path, image_size=image_size
+        )
+        postprocess = ImageClassificationPostProcess(
+            top_k=top_k, class_names=class_names
+        )
+
+        ops = [preprocess, engine, postprocess]
+        router = LinearRouter(end_route=len(ops))
+        scheduler = [OperatorScheduler()]
+        super().__init__(ops=ops, router=router, schedulers=scheduler)
diff --git a/src/deepsparse/v2/image_classification/postprocess_operator.py b/src/deepsparse/v2/image_classification/postprocess_operator.py
new file mode 100644
index 0000000000..9231113368
--- /dev/null
+++ b/src/deepsparse/v2/image_classification/postprocess_operator.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from typing import Dict, List, Union
+
+import numpy
+from pydantic import BaseModel, Field
+
+from deepsparse.v2.operators import Operator
+
+
+class ImageClassificationOutput(BaseModel):
+    """
+    Output model for image classification
+    """
+
+    labels: List[Union[int, str, List[int], List[str]]] = Field(
+        description="List of labels, one for each prediction"
+    )
+    scores: List[Union[float, List[float]]] = Field(
+        description="List of scores, one for each prediction"
+    )
+
+
+__all__ = ["ImageClassificationPostProcess"]
+
+
+class ImageClassificationPostProcess(Operator):
+    """
+    Image Classification post-processing Operator. This Operator is responsible for
+    processing outputs from the engine and returning the classification results to
+    the user, using the ImageClassificationOutput structure.
+ """ + + input_schema = None + output_schema = ImageClassificationOutput + + def __init__( + self, top_k: int = 1, class_names: Union[None, str, Dict[str, str]] = None + ): + self.top_k = top_k + if isinstance(class_names, str) and class_names.endswith(".json"): + self._class_names = json.load(open(class_names)) + elif isinstance(class_names, dict): + self._class_names = class_names + else: + self._class_names = None + + def run(self, inp: "EngineOperatorOutputs", **kwargs) -> Dict: # noqa: F821 + labels, scores = [], [] + inp = inp.engine_outputs + for prediction_batch in inp[0]: + label = (-prediction_batch).argsort()[: self.top_k] + score = prediction_batch[label] + labels.append(label) + scores.append(score.tolist()) + + if self._class_names is not None: + labels = numpy.vectorize(self._class_names.__getitem__)(labels) + labels = labels.tolist() + + if isinstance(labels[0], numpy.ndarray): + labels = [label.tolist() for label in labels] + + if len(labels) == 1: + labels = labels[0] + scores = scores[0] + + return {"scores": scores, "labels": labels} diff --git a/src/deepsparse/v2/image_classification/preprocess_operator.py b/src/deepsparse/v2/image_classification/preprocess_operator.py new file mode 100644 index 0000000000..9b4517a44c --- /dev/null +++ b/src/deepsparse/v2/image_classification/preprocess_operator.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Tuple + +import numpy +import onnx +from PIL import Image +from torchvision import transforms + +from deepsparse.image_classification.constants import ( + IMAGENET_RGB_MEANS, + IMAGENET_RGB_STDS, +) +from deepsparse.pipelines.computer_vision import ComputerVisionSchema +from deepsparse.v2.operators import Operator + + +class ImageClassificationInput(ComputerVisionSchema): + """ + Input model for image classification + """ + + +__all__ = ["ImageClassificationPreProcess"] + + +class ImageClassificationPreProcess(Operator): + """ + Image Classification pre-processing operator. This Operator is expected to process + the user inputs and prepare them for the engine. Inputs to this Operator are + expected to follow the ImageClassificationInput schema. 
+    """
+
+    input_schema = ImageClassificationInput
+    output_schema = None
+
+    def __init__(self, model_path: str, image_size: Optional[Tuple[int]] = None):
+        self.model_path = model_path
+        self._image_size = image_size or self._infer_image_size()
+        non_rand_resize_scale = 256.0 / 224.0  # standard used
+        self._pre_normalization_transforms = transforms.Compose(
+            [
+                transforms.Resize(
+                    tuple(
+                        [
+                            round(non_rand_resize_scale * size)
+                            for size in self._image_size
+                        ]
+                    )
+                ),
+                transforms.CenterCrop(self._image_size),
+            ]
+        )
+
+    def run(self, inp: ImageClassificationInput, **kwargs) -> Dict:
+        """
+        Pre-Process the Inputs for DeepSparse Engine
+
+        :param inp: input model
+        :return: list of preprocessed numpy arrays
+        """
+
+        if isinstance(inp.images, numpy.ndarray):
+            image_batch = inp.images
+        else:
+            if isinstance(inp.images, str):
+                inp.images = [inp.images]
+
+            image_batch = list(map(self._preprocess_image, inp.images))
+
+            # build batch
+            image_batch = numpy.stack(image_batch, axis=0)
+
+        original_dtype = image_batch.dtype
+        image_batch = numpy.ascontiguousarray(image_batch, dtype=numpy.float32)
+
+        if original_dtype == numpy.uint8:
+            image_batch /= 255
+            # normalize entire batch
+            image_batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1))
+            image_batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1))
+
+        return {"engine_inputs": [image_batch]}
+
+    def _preprocess_image(self, image) -> numpy.ndarray:
+        if isinstance(image, List):
+            # image given as raw list
+            image = numpy.asarray(image)
+            if image.dtype == numpy.float32:
+                # image is already processed, append and continue
+                return image
+            # assume raw image input
+            # put image in PIL format for torchvision processing
+            image = image.astype(numpy.uint8)
+            if image.shape[0] < image.shape[-1]:
+                # put channel last
+                image = numpy.einsum("cwh->whc", image)
+            image = Image.fromarray(image)
+        elif isinstance(image, str):
+            # load image from string filepath
+            image = Image.open(image).convert("RGB")
+        elif isinstance(image, numpy.ndarray):
+            image = image.astype(numpy.uint8)
+            if image.shape[0] < image.shape[-1]:
+                # put channel last
+                image = numpy.einsum("cwh->whc", image)
+            image = Image.fromarray(image)
+
+        if not isinstance(image, Image.Image):
+            raise ValueError(
+                f"inputs to {self.__class__.__name__} must be a string image "
+                "file path(s), a list representing a raw image, "
+                "PIL.Image.Image object(s), or a numpy array representing "
+                f"the entire pre-processed batch. Found {type(image)}"
+            )
+
+        # apply resize and center crop
+        image = self._pre_normalization_transforms(image)
+        image_numpy = numpy.array(image)
+        image.close()
+
+        # make channel first dimension
+        image_numpy = image_numpy.transpose(2, 0, 1)
+        return image_numpy
+
+    def _infer_image_size(self) -> Tuple[int, ...]:
+        """
+        Infer and return the expected shape of the input tensor
+
+        :return: The expected shape of the input tensor from onnx graph
+        """
+        onnx_model = onnx.load(self.model_path)
+        input_tensor = onnx_model.graph.input[0]
+        return (
+            input_tensor.type.tensor_type.shape.dim[2].dim_value,
+            input_tensor.type.tensor_type.shape.dim[3].dim_value,
+        )
diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py
new file mode 100644
index 0000000000..aac94a7697
--- /dev/null
+++ b/src/deepsparse/v2/operators/engine_operator.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+from deepsparse import Context, Engine, MultiModelEngine, Scheduler
+from deepsparse.benchmark import ORTEngine
+from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs
+from deepsparse.v2.operators import Operator
+
+
+DEEPSPARSE_ENGINE = "deepsparse"
+ORT_ENGINE = "onnxruntime"
+
+SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE]
+
+__all__ = ["EngineOperator"]
+
+
+class EngineOperatorInputs(BaseModel):
+    engine_inputs: List = Field(description="engine_inputs")
+
+
+class EngineOperatorOutputs(BaseModel):
+    engine_outputs: List = Field(description="engine outputs")
+
+
+class EngineOperator(Operator):
+    input_schema = EngineOperatorInputs
+    output_schema = EngineOperatorOutputs
+
+    def __init__(
+        self,
+        model_path: str,
+        engine_type: str = DEEPSPARSE_ENGINE,
+        batch_size: Optional[int] = 1,
+        num_cores: int = None,
+        num_streams: int = None,
+        scheduler: Scheduler = None,
+        input_shapes: List[List[int]] = None,
+        engine_context: Optional[Context] = None,
+    ):
+
+        self._batch_size = batch_size
+        self.model_path = model_to_path(model_path)
+        self.engine_context = engine_context
+
+        if self.engine_context is not None:
+            num_cores = num_cores or self.engine_context.num_cores
+            if self.engine_context.num_cores != num_cores:
+                raise ValueError(
+                    f"num_cores mismatch. Expected {self.engine_context.num_cores} "
+                    f"from passed context, but got {num_cores} while "
+                    f"instantiating Pipeline"
+                )
+
+        engine_args = dict(
+            batch_size=self._batch_size,
+            num_cores=num_cores,
+            input_shapes=input_shapes,
+        )
+        if engine_type.lower() == DEEPSPARSE_ENGINE:
+            engine_args["scheduler"] = scheduler
+            engine_args["num_streams"] = num_streams
+
+        self.engine = self._create_engine(self.model_path, engine_type, engine_args)
+
+    def _create_engine(
+        self, onnx_file_path: str, engine_type: str, engine_args: Dict
+    ) -> Union[Engine, MultiModelEngine, ORTEngine]:
+        """
+        Create an inference engine for a given ONNX model
+
+        :param onnx_file_path: path to ONNX model file
+        :param engine_type: type of engine to create.
+        :param engine_args: arguments to pass to engine constructor
+        :param context: context to use for engine
+        :return: inference engine
+        """
+        engine_type = engine_type.lower()
+
+        if engine_type == DEEPSPARSE_ENGINE:
+            if self.engine_context is not None and isinstance(
+                self.engine_context, Context
+            ):
+                engine_args.pop("num_cores", None)
+                engine_args.pop("scheduler", None)
+                engine_args.pop("num_streams", None)
+                engine_args["context"] = self.engine_context
+                return MultiModelEngine(
+                    model=onnx_file_path,
+                    **engine_args,
+                )
+            engine_args.pop("cache_output_bools", None)
+            return Engine(onnx_file_path, **engine_args)
+
+        if engine_type == ORT_ENGINE:
+            return ORTEngine(onnx_file_path, **engine_args)
+
+        raise ValueError(
+            f"Unknown engine_type {engine_type}. Supported values include: "
+            f"{SUPPORTED_PIPELINE_ENGINES}"
+        )
+
+    def run(self, inp: EngineOperatorInputs) -> Dict:
+        inp = inp.engine_inputs
+        batches, orig_batch_size = self.expand_inputs(engine_inputs=inp)
+        batches_outputs = list(map(self.engine, batches))
+        engine_outputs = self.condense_inputs(
+            batch_outputs=batches_outputs, orig_batch_size=orig_batch_size
+        )
+        return {"engine_outputs": engine_outputs}
+
+    def expand_inputs(self, **kwargs):
+        return split_engine_inputs(kwargs["engine_inputs"], self._batch_size)
+
+    def condense_inputs(self, **kwargs):
+        batch_outputs = kwargs["batch_outputs"]
+        orig_batch_size = kwargs["orig_batch_size"]
+        return join_engine_outputs(batch_outputs, orig_batch_size)
diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py
index 30e1a48379..c3a3e28b78 100644
--- a/src/deepsparse/v2/operators/operator.py
+++ b/src/deepsparse/v2/operators/operator.py
@@ -13,39 +13,32 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Optional, Type
+from typing import Any, Optional, Type
 
 from pydantic import BaseModel
 
-from deepsparse.v2.utils import Context, OperatorSchema
-
 
 __all__ = ["Operator"]
 
 
 class Operator(ABC):
     """
-    Base operator class - can represent any part of an ML pipeline
+    Base operator class - an operator should be defined for each atomic, functional
+    part of the pipeline.
     """
 
     # expected structured input and output types, to be defined by child classes
-    input_schema: Optional[Type[OperatorSchema]] = None
-    output_schema: Optional[Type[OperatorSchema]] = None
-
-    @abstractmethod
-    def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema:
-        """
-        :param inp: operator input, as the defined input schema if applicable
-        :param context: pipeline context of already run operators
-        :return: result of this operator as the defined output schema if applicable
-        """
-        raise NotImplementedError
+    input_schema: Optional[Type[BaseModel]] = None
+    output_schema: Optional[Type[BaseModel]] = None
 
     @classmethod
     def has_input_schema(cls) -> bool:
         """
         :return: True if this class has a defined pydantic input schema
         """
+        if not cls.input_schema:
+            return False
+
         return issubclass(cls.input_schema, BaseModel)
 
     @classmethod
@@ -53,38 +46,73 @@ def has_output_schema(cls) -> bool:
         """
         :return: True if this class has a defined pydantic output schema
         """
+        if not cls.output_schema:
+            return False
+
         return issubclass(cls.output_schema, BaseModel)
 
     def __call__(
         self,
         *args,
-        context: Optional[Context] = None,
         **kwargs,
-    ) -> OperatorSchema:
+    ) -> Any:
         """
         Parses inputs to this Operator and runs the run() method of this operator
 
-        :param args: an unnamed arg may only be provided
-            if it is of the type of the input_schema
+        :param args: an unnamed arg may only be provided if it is of the type of the
+            input_schema
         :param context: pipeline context to pass to operator
         :param kwargs: kwargs when not initializing from an instantiated schema
        :return: operator output
         """
-        if len(args) > 1:
-            raise ValueError(
-                f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}"
-            )
-
-        if len(args) == 1:
-            if self.input_schema is not None and isinstance(args[0], self.input_schema):
+        if self.has_input_schema():
+            if len(args) > 1:
+                raise ValueError(
+                    f"The operator requires an {self.input_schema}. Too many arguments "
+                    "provided."
+                )
+            elif args and isinstance(args[0], self.input_schema):
                 inference_input = args[0]
+            elif kwargs:
+                inference_input = self.input_schema(**kwargs)
             else:
                 raise ValueError(
-                    f"1 arg supplied to Operator {self.__class__.__name__} but was not "
-                    f"of expected type {self.input_schema}, found {type(args[0])}"
+                    "Can't resolve inputs. The values for the schema must be provided "
+                    "in the form of a dictionary or an instance of the input_schema "
+                    "object"
                 )
-        elif self.has_input_schema():
-            inference_input = self.input_schema(**kwargs)
+
+            run_output = self.run(inference_input)
         else:
-            inference_input = kwargs
-        return self.run(inference_input, context=context)
+            run_output = self.run(*args, **kwargs)
+
+        if self.has_output_schema():
+            return self.output_schema(**run_output)
+        return run_output
+
+    @abstractmethod
+    def run(self, *args, **kwargs) -> Any:
+        """
+        :param inp: operator input, as the defined input schema if applicable
+        :param context: pipeline context of already run operators
+        :return: result of this operator as the defined output schema if applicable
+        """
+        raise NotImplementedError
+
+    def expand_inputs(self, **kwargs):
+        """
+        Generic function to handle expanding values.
+        """
+        raise NotImplementedError
+
+    def condense_inputs(self, **kwargs):
+        """
+        Generic function to handle condensing values.
+        """
+        raise NotImplementedError
+
+    def yaml(self):
+        pass
+
+    def json(self):
+        pass
diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py
index 0ec580687d..e58f8a5191 100644
--- a/src/deepsparse/v2/pipeline.py
+++ b/src/deepsparse/v2/pipeline.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 
 
-from typing import List
-
-from pydantic import BaseModel, Field, PrivateAttr
+from typing import Dict, List, Union
 
 from deepsparse.v2.operators import Operator
 from deepsparse.v2.routers import Router
@@ -25,78 +23,90 @@
 __all__ = ["Pipeline"]
 
 
-class Pipeline(BaseModel):
+class Pipeline(Operator):
     """
-    Pipeline accepts a series of operators, schedulers, and a router which define
-    an end-to-end ML transformation.
+    Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline
+    will use the router to run through all the defined operators. The operators should
+    be implemented using the Operator class and each implemented Operator should be
+    responsible for a functional component of the pipeline. The flow of inputs/outputs
+    between the operators and the steps in the pipeline should be defined by the router
+    (based on the Router class), which dictates the next operator in the pipeline.
+    Execution of the operators will be handled by the provided schedulers.
+
+    :param ops: Operators to run within the pipeline. Can either be a list of operators
+        or dictionary of operators.
+    :param router: A Router which dictates the next operator to call.
+    :param schedulers: A list of schedulers to run operators.
 
-    Calling a pipeline runs these transformations
     """
 
-    stages: List[Operator] = Field(
-        required=True,
-        description="In-order list of operators that make up this pipeline",
-    )
-    router: Router = Field(
-        default_factory=Router,
-        description="Router object to determine order and run the stages. "
-        "Defaults to the base Router object",
-    )
-    schedulers: List[OperatorScheduler] = Field(
-        default_factory=lambda: [OperatorScheduler()],
-        description="List of schedulers to run operators in order of priority",
-    )
-
-    _scheduler_group: SchedulerGroup = PrivateAttr()
-
-    class Config:
-        arbitrary_types_allowed = True
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(
+        self,
+        ops: Union[Dict[str, Operator], List[Operator]],
+        router: Router,
+        schedulers: List[OperatorScheduler],
+    ):
+        self.ops = ops
+        self.router = router
+        self.schedulers = schedulers
 
         self.validate()
 
         # SchedulerGroup handles running all schedulers in order of priority
         self._scheduler_group = SchedulerGroup(self.schedulers)
 
-    def __call__(self, *args, return_context: bool = False, **kwargs):
+    def run(self, *args, **kwargs):
+        """
+        Run through the operators using the provided router and scheduler. The
+        input to a given operator is the output of the previous operator.
+
+        :param args: input to the first operator, of any type that operator expects
+        :param kwargs: alternatively, kwargs used to construct the first operator's
+            input schema
+        """
+        next_step = self.router.START_ROUTE
+        operator_output = None
+        while next_step != self.router.END_ROUTE:
+            # Either a dictionary key or valid index
+            operator = self.ops[next_step]
+            if next_step == self.router.START_ROUTE:
+                output_future = self._scheduler_group.submit(
+                    *args, operator=operator, **kwargs
+                )
+            else:
+                if isinstance(operator_output, dict):
+                    output_future = self._scheduler_group.submit(
+                        operator=operator, **operator_output
+                    )
+                else:
+                    output_future = self._scheduler_group.submit(
+                        operator_output, operator=operator
+                    )
+
+            # wait for future to resolve
+            operator_output = output_future.result()
+            next_step = self.router.next(next_step, self.ops)
+        return operator_output
+
+    def __call__(self, *args, **kwargs):
         """
-        :param return_context: if True, returns tuple of the pipeline output
-            and entire context. Default False
-        :return: output of the pipeline stages run with the router for the given input
+        :return: output of the pipeline operators run with the router for the given
+            input
         """
-        if len(args) > 1:
-            raise ValueError(
-                "Only 1 in-line argument may be supplied to Pipeline which "
-                f"must be a Schema, found: {len(args)}"
-            )
-        if args and kwargs:
-            raise ValueError(
-                "Pipeline can only run either a single in-line argument schema or a "
-                f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs"
-            )
-
-        pipeline_input = args[0] if args else kwargs
-        pipeline_output, context = self.router.run(
-            inp=pipeline_input,
-            operators=self.stages,
-            scheduler=self._scheduler_group,
-        )
-
-        if return_context:
-            return pipeline_output, context
-
-        return pipeline_output
+        return self.run(*args, **kwargs)
 
     def validate(self):
+        """
+        Validate the compatibility of the router and the operators provided.
+        """
+        router_validation = self.router.validate(self.ops)
 
-        router_validation = self.router.validate(self.stages)
         if router_validation is False:
             # default error message
-            stage_types = [type(stage) for stage in self.stages]
-            raise ValueError(
-                f"Invalid Router: {type(self.router)} for stages: {stage_types}"
-            )
+            op_types = [type(op) for op in self.ops]
+            raise ValueError(f"Invalid Router: {type(self.router)} for ops: {op_types}")
         elif isinstance(router_validation, str):
-            raise ValueError(f"Invalid Router for stages: {router_validation}")
+            raise ValueError(f"Invalid Router for operators: {router_validation}")
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py
index 284c348c10..6050803b5e 100644
--- a/src/deepsparse/v2/routers/router.py
+++ b/src/deepsparse/v2/routers/router.py
@@ -13,61 +13,70 @@
 # limitations under the License.
 
 
-from typing import List, Tuple, Union
+import logging
+from abc import abstractmethod
+from typing import Dict, List, Union
 
 from deepsparse.v2.operators import Operator
-from deepsparse.v2.schedulers import OperatorScheduler
-from deepsparse.v2.utils import Context, OperatorSchema
 
 
-__all__ = ["Router"]
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["Router", "LinearRouter"]
 
 
 class Router:
     """
-    Routers must implement a run method which runs a series of operators
-    for a pipeline for a given input. Base Router runs operators linearly
-    in a series
+    Routers dictate the next operator to run. Each Router must implement a next
+    function, which dictates the index or key of the next operator to run.
+
+    :param start_route: the start index or key of the router
+    :param end_route: the end index or key of the router
+
     """
 
-    @staticmethod
-    def run(
-        inp: OperatorSchema,
-        operators: List[Operator],
-        scheduler: OperatorScheduler,
-    ) -> Tuple[OperatorSchema, Context]:
+    def __init__(self, end_route: Union[str, int], start_route: Union[str, int]):
+        self.START_ROUTE = start_route
+        self.END_ROUTE = end_route
+
+    @abstractmethod
+    def next(
+        self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]]
+    ) -> Union[str, int]:
         """
-        :param inp: input to the first operator of the series
-        :param operators: list of operators to run
-        :param scheduler: scheduler to submit operators to
-        :return: final output of the operators
+        Determines the index or dictionary key for the next operator which should run.
+
+        :param past: the previous index or key. This should uniquely determine the next
+            operator to run
+        :param ops: list or dictionary of operators
+        :returns: the next index or dictionary key for the next operator to run
         """
-        context = Context()
+        raise NotImplementedError
+
+    def yaml(self):
+        pass
 
-        # run operators linearly
-        operator_input = inp
-        for operator in operators:
-            output_future = scheduler.submit(
-                operator=operator, operator_input=operator_input, context=context
-            )
+    def json(self):
+        pass
 
-            # wait for future to resolve
-            operator_output = output_future.result()
 
-            # update context
-            context.update(
-                operator=operator,
-                input=operator_input,
-                output=operator_output,
-            )
+class LinearRouter(Router):
+    """
+    LinearRouter runs a list of Operators in sequential order. end_route should
+    be the length of the list and the start_route should be the start index.
+    
+ """ - # previous output becomes next input - operator_input = operator_output + def __init__(self, end_route: int, start_route: int = 0): + super().__init__(end_route=end_route, start_route=start_route) - return operator_output, context + def next(self, past: int, ops: List[Operator]) -> int: + new_index = past + 1 + if new_index < self.END_ROUTE: + return new_index + return self.END_ROUTE @staticmethod - def validate(operators: List[Operator]) -> Union[bool, str]: + def validate(operators: List[Operator]) -> bool: """ :param operators: operators that this Router could potentially run over :return: True if this Router can run this series of operators. Base Router @@ -76,7 +85,8 @@ def validate(operators: List[Operator]) -> Union[bool, str]: returned """ if len(operators) < 1: - return "No operators found" + _LOGGER.info("No operators provided") + return False for idx in range(len(operators) - 1): current_output_schema = operators[idx].output_schema @@ -88,8 +98,10 @@ def validate(operators: List[Operator]) -> Union[bool, str]: continue if current_output_schema != next_input_schema: - return ( + _LOGGER.info( f"Operator at idx {idx}: {type(operators[idx])} has invalid " f"output schema {current_output_schema} for next operator " f"{type(operators[idx + 1])} which requires {next_input_schema}" ) + return False + return True diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 53f0c8f625..7d4f249444 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -16,7 +16,6 @@ from concurrent.futures import Future, ThreadPoolExecutor from deepsparse.v2.operators import Operator -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["OperatorScheduler"] @@ -37,23 +36,16 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: + def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ - if isinstance(operator_input, dict): - return self._threadpool.submit(operator, context=context, **operator_input) - return self._threadpool.submit(operator, operator_input, context=context) + return self._threadpool.submit(operator, *args, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 2f797b30c7..7f00a3c17c 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -18,7 +18,6 @@ from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["SchedulerGroup"] @@ -35,12 +34,7 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: 
+ def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator @@ -48,10 +42,10 @@ def submit( :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(operator, operator_input): - return scheduler.submit(operator, operator_input, context) + if scheduler.can_process(*args, operator=operator, **kwargs): + return scheduler.submit(*args, operator=operator, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check @@ -59,6 +53,6 @@ def can_process(self, operator: Operator, operator_input: OperatorSchema) -> boo SchedulerGroup always returns True """ return any( - scheduler.can_process(operator, operator_input) + scheduler.can_process(*args, operator=operator, **kwargs) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py index 4f36eeb448..a36d8e92ec 100644 --- a/src/deepsparse/v2/utils/__init__.py +++ b/src/deepsparse/v2/utils/__init__.py @@ -14,5 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .context import * from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py deleted file mode 100644 index 81fe26de61..0000000000 --- a/src/deepsparse/v2/utils/context.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Callable, List, NamedTuple - -from deepsparse.v2.utils.types import OperatorSchema - - -__all__ = ["Context"] - - -class StageInfo(NamedTuple): - operator: Callable - input: OperatorSchema - output: OperatorSchema - - -class Context: - """ - Context contains the full history of operators and their inputs and outputs - in a pipeline - """ - - def __init__(self): - self.stages_executed: List[StageInfo] = [] - - def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): - self.stages_executed.append( - StageInfo(operator=operator, input=input, output=output) - ) diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py index e69de29bb2..0c44f887a4 100644 --- a/tests/deepsparse/v2/__init__.py +++ b/tests/deepsparse/v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py index d39bc61c8c..9f85e4976e 100644 --- a/tests/deepsparse/v2/test_basic_pipeline.py +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -1,14 +1,29 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Simple example and test of a dummy pipeline """ +from typing import Dict + from pydantic import BaseModel from deepsparse.v2 import Pipeline from deepsparse.v2.operators import Operator -from deepsparse.v2.routers import Router +from deepsparse.v2.routers import LinearRouter from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema class IntSchema(BaseModel): @@ -19,21 +34,21 @@ class AddOneOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 1) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 1} class AddTwoOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 2) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 2} AddThreePipeline = Pipeline( - stages=[AddOneOperator(), AddTwoOperator()], - router=Router(), + ops=[AddOneOperator(), AddTwoOperator()], + router=LinearRouter(end_route=2), schedulers=[OperatorScheduler()], ) diff --git a/tests/deepsparse/v2/test_image_classification.py b/tests/deepsparse/v2/test_image_classification.py new file mode 100644 index 0000000000..03e2807454 --- /dev/null +++ b/tests/deepsparse/v2/test_image_classification.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
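For reference, the basic pipeline test above drives the two add operators through the LinearRouter end to end. A minimal sketch of that invocation, assuming the IntSchema, AddOneOperator/AddTwoOperator, and AddThreePipeline definitions from test_basic_pipeline.py:

    # Sketch: running the refactored AddThreePipeline (assumed usage).
    pipeline_input = IntSchema(value=5)
    pipeline_output = AddThreePipeline(pipeline_input)
    # AddOneOperator runs first, then AddTwoOperator: 5 + 1 + 2 = 8
    assert pipeline_output.value == 8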
+ +import numpy + +import pytest +from deepsparse.v2.image_classification import ImageClassificationPipeline +from deepsparse.v2.image_classification.preprocess_operator import ( + ImageClassificationInput, +) +from tests.deepsparse.pipelines.data_helpers import computer_vision + + +@pytest.fixture +def get_images(): + batch_size = 2 + images = computer_vision(batch_size=batch_size) + return images.get("images") + + +def test_image_classification(get_images): + model_path = ( + "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned95-none" + ) + pipeline = ImageClassificationPipeline(model_path=model_path) + output = pipeline(ImageClassificationInput(images=get_images)) + assert output.labels == [[207], [670]] + assert numpy.allclose(output.scores, [[21.85], [17.33]], atol=0.01) From 59fb5872f8c81584f1826c69289f05b7d06160ad Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 1 Nov 2023 10:53:10 -0400 Subject: [PATCH 3/7] [v2] EngineOperator updates to make continuous batching easier (#1371) * [v2] EngineOperator updates to make continuous batching easier * test fixes --- .../v2/operators/engine_operator.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index aac94a7697..2c61755df9 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import Dict, List, Optional, Union from pydantic import BaseModel, Field @@ -32,6 +33,13 @@ class EngineOperatorInputs(BaseModel): engine_inputs: List = Field(description="engine_inputs") + engine: Optional[Engine] = Field( + description="override the engine to run forward pass with", + default=None, + ) + + class Config: + arbitrary_types_allowed = True class EngineOperatorOutputs(BaseModel): @@ -76,21 +84,33 @@ def __init__( engine_args["scheduler"] = scheduler engine_args["num_streams"] = num_streams - self.engine = self._create_engine(self.model_path, engine_type, engine_args) + self._engine_args = engine_args + self._engine_type = engine_type + + self.engine = self.create_engine() + + @property + def batch_size(self) -> int: + """ + :return: the batch size this engine operator is compiled at + """ + return self._batch_size - def _create_engine( - self, onnx_file_path: str, engine_type: str, engine_args: Dict + def create_engine( + self, + **kwargs, ) -> Union[Engine, MultiModelEngine, ORTEngine]: """ Create an inference engine for a given ONNX model - :param onnx_file_path: path to ONNX model file - :param engine_type: type of engine to create. 
-        :param engine_args: arguments to pass to engine constructor
-        :param context: context to use for engine
+        :param kwargs: overrides to engine_args used as kwargs for engine
+            constructor/compilation
         :return: inference engine
         """
-        engine_type = engine_type.lower()
+        onnx_file_path = self.model_path
+        engine_args = deepcopy(self._engine_args)
+        engine_args.update(kwargs)
+        engine_type = self._engine_type.lower()
 
         if engine_type == DEEPSPARSE_ENGINE:
             if self.engine_context is not None and isinstance(
@@ -116,6 +136,12 @@ def run(self, inp: EngineOperatorInputs) -> Dict:
+        if inp.engine:
+            # run with custom engine, do not split/join since custom engine
+            # may run at any batch size, returning here as code below has a
+            # planned refactor
+            engine_outputs = inp.engine(inp.engine_inputs)
+            return {"engine_outputs": engine_outputs}
         inp = inp.engine_inputs
         batches, orig_batch_size = self.expand_inputs(engine_inputs=inp)
         batches_outputs = list(map(self.engine, batches))

From d54ef26aa8885160478771f16fcd1b150589af53 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 2 Nov 2023 20:47:25 -0400
Subject: [PATCH 4/7] [Pipeline Refactor] Update routes, text generation initial functionality (#1348)

* initial functionality and working example with image classification

* remove testing image

* rebase fixes

* initial functionality and working example with image classification

* text gen

* updates func

* prompt inference, initial functionality

* remove image; update state docstring

* Fix typo

* add todo for split/join

* remove context, clean-up args, remove prefill_preprocess_operator

* fix docstrings
---
 src/deepsparse/v2/operators/__init__.py       |   1 -
 .../v2/operators/engine_operator.py           |  18 +-
 src/deepsparse/v2/operators/operator.py       |  30 ++-
 src/deepsparse/v2/pipeline.py                 |  70 ++++--
 src/deepsparse/v2/routers/router.py           |  57 ++++-
 src/deepsparse/v2/schedulers/scheduler.py     |  23 +-
 .../v2/schedulers/scheduler_group.py          |  35 ++-
 src/deepsparse/v2/text_generation/__init__.py |  24 ++
 .../autoregressive_preprocess_operator.py     | 100 ++++++++
 .../v2/text_generation/compile_logits.py      |  43 ++++
 .../v2/text_generation/kv_cache_operator.py   |  70 ++++++
 .../multi_engine_prefill_operator.py          | 135 +++++++++++
 .../v2/text_generation/nl_engine_operator.py  | 191 ++++++++++++++++
 src/deepsparse/v2/text_generation/pipeline.py | 213 ++++++++++++++++++
 .../v2/text_generation/prep_for_prefill.py    |  57 +++++
 .../v2/text_generation/process_inputs.py      | 121 ++++++++++
 src/deepsparse/v2/utils/__init__.py           |   2 +-
 src/deepsparse/v2/utils/state.py              |  64 ++++++
 tests/deepsparse/v2/test_basic_pipeline.py    |   4 +-
 19 files changed, 1203 insertions(+), 55 deletions(-)
 create mode 100644 src/deepsparse/v2/text_generation/__init__.py
 create mode 100644 src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/compile_logits.py
 create mode 100644 src/deepsparse/v2/text_generation/kv_cache_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/nl_engine_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/pipeline.py
 create mode 100644 src/deepsparse/v2/text_generation/prep_for_prefill.py
 create mode 100644 src/deepsparse/v2/text_generation/process_inputs.py
 create mode 100644 src/deepsparse/v2/utils/state.py

diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py
index 8f7e6a169d..9d1a9812ac 100644
--- a/src/deepsparse/v2/operators/__init__.py
+++ b/src/deepsparse/v2/operators/__init__.py
@@ -13,5 +13,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from .operator import *
diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py
index 2c61755df9..b7d920a686 100644
--- a/src/deepsparse/v2/operators/engine_operator.py
+++ b/src/deepsparse/v2/operators/engine_operator.py
@@ -17,7 +17,8 @@
 
 from pydantic import BaseModel, Field
 
-from deepsparse import Context, Engine, MultiModelEngine, Scheduler
+from deepsparse import Context as EngineContext
+from deepsparse import Engine, MultiModelEngine, Scheduler
 from deepsparse.benchmark import ORTEngine
 from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs
 from deepsparse.v2.operators import Operator
@@ -54,16 +55,15 @@ def __init__(
         self,
         model_path: str,
         engine_type: str = DEEPSPARSE_ENGINE,
-        batch_size: Optional[int] = 1,
         num_cores: int = None,
         num_streams: int = None,
         scheduler: Scheduler = None,
         input_shapes: List[List[int]] = None,
-        engine_context: Optional[Context] = None,
+        engine_context: Optional[EngineContext] = None,
+        engine_kwargs: Dict = None,
     ):
-
-        self._batch_size = batch_size
         self.model_path = model_to_path(model_path)
+        self._batch_size = 1
         self.engine_context = engine_context
 
         if self.engine_context is not None:
@@ -87,7 +87,7 @@ def __init__(
         self._engine_args = engine_args
         self._engine_type = engine_type
 
-        self.engine = self.create_engine()
+        self.engine = self.create_engine(**(engine_kwargs or {}))
 
     @property
     def batch_size(self) -> int:
@@ -114,12 +114,12 @@ def create_engine(
 
         if engine_type == DEEPSPARSE_ENGINE:
             if self.engine_context is not None and isinstance(
-                self.engine_context, Context
+                self.engine_context, EngineContext
             ):
                 engine_args.pop("num_cores", None)
                 engine_args.pop("scheduler", None)
                 engine_args.pop("num_streams", None)
-                engine_args["context"] = self.engien_context
+                engine_args["context"] = self.engine_context
                 return MultiModelEngine(
                     model=onnx_file_path,
                     **engine_args,
@@ -135,7 +135,7 @@ def create_engine(
             f"{SUPPORTED_PIPELINE_ENGINES}"
         )
 
-    def run(self, inp: EngineOperatorInputs) -> Dict:
+    def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict:
         if inp.engine:
             # run with custom engine, do not split/join since custom engine
             # may run at any batch size, returning here as code below has a
diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py
index c3a3e28b78..b3963d8223 100644
--- a/src/deepsparse/v2/operators/operator.py
+++ b/src/deepsparse/v2/operators/operator.py
@@ -17,6 +17,8 @@
 
 from pydantic import BaseModel
 
+from deepsparse.v2.utils import InferenceState, PipelineState
+
 
 __all__ = ["Operator"]
 
@@ -54,6 +56,8 @@ def has_output_schema(cls) -> bool:
     def __call__(
         self,
         *args,
+        inference_state: InferenceState,
+        pipeline_state: PipelineState,
         **kwargs,
     ) -> Any:
         """
@@ -61,7 +65,9 @@ def __call__(
 
         :param args: an unnamed arg may only be provided
             if it is of the type of the input_schema
-        :param context: pipeline context to pass to operator
+        :param inference_state: inference_state for the pipeline.
+        :param pipeline_state: pipeline_state for the pipeline. The values in the state
+            are created during pipeline creation and are read-only during inference.
:param kwargs: kwargs when not initializing from an instantiated schema
         :return: operator output
         """
@@ -81,10 +87,18 @@ def __call__(
                 "in the form of a dictionary or an instance of the input_schema"
                 "object"
             )
-
-        run_output = self.run(inference_input)
+            run_output = self.run(
+                inference_input,
+                inference_state=inference_state,
+                pipeline_state=pipeline_state,
+            )
         else:
-            run_output = self.run(*args, **kwargs)
+            run_output = self.run(
+                *args,
+                inference_state=inference_state,
+                pipeline_state=pipeline_state,
+                **kwargs,
+            )
 
         if self.has_output_schema():
             return self.output_schema(**run_output)
@@ -93,12 +107,16 @@ def __call__(
     @abstractmethod
     def run(self, *args, **kwargs) -> Any:
         """
-        :param inp: operator input, as the defined input schema if applicable
-        :param context: pipeline context of already run operators
         :return: result of this operator as the defined output schema if applicable
         """
         raise NotImplementedError
 
+    def can_operate(self, inp: Any) -> bool:
+        """
+        Whether or not the given operator can run, based on input
+        """
+        return True
+
     def expand_inputs(self, **kwargs):
         """
         Generic function to handle expanding values.
diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py
index e58f8a5191..0a8c8b2f93 100644
--- a/src/deepsparse/v2/pipeline.py
+++ b/src/deepsparse/v2/pipeline.py
@@ -18,6 +18,7 @@
 from deepsparse.v2.operators import Operator
 from deepsparse.v2.routers import Router
 from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup
+from deepsparse.v2.utils import InferenceState, PipelineState
 
 
 __all__ = ["Pipeline"]
@@ -27,7 +28,7 @@ class Pipeline(Operator):
     """
     Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline
     will use the router to run through all the defined operators. The operators should
-    be implemented using the Operator class and each implemented Operator should be
+    be implemented using the Operator class and each implemented operator should be
     responsible for a functional component of the pipelines. The flow of inputs/outputs
     between the operators and the steps in the pipeline should be defined by the router,
     (based off of the Router class), which dictates the next operator in the pipeline.
@@ -37,6 +38,7 @@ class Pipeline(Operator):
        or dictionary of operators.
     :param router: A Router which dictates the next operator to call.
     :param schedulers: A list of schedulers to run operators.
+    :param pipeline_state: pipeline_state created during pipeline initialization
 
     """
 
     def __init__(
         self,
         ops: Union[Dict[str, Operator], List[Operator]],
         router: Router,
         schedulers: List[OperatorScheduler],
+        pipeline_state: PipelineState = None,
     ):
         self.ops = ops
         self.router = router
         self.schedulers = schedulers
+        self.pipeline_state = pipeline_state
         self.validate()
 
         # SchedulerGroup handles running all schedulers in order of priority
         self._scheduler_group = SchedulerGroup(self.schedulers)
 
-    def run(self, *args, **kwargs):
+    def run(
+        self,
+        *args,
+        inference_state: InferenceState,
+        pipeline_state: PipelineState,
+        **kwargs,
+    ):
         """
-        Run through the operators using the provided router and scheduler. Update the
-        context to reflect each step of the router. The input to a given operator is the
-        output of the previous operator.
-
-        :param inp: input to the operator. expected to be of any type that is
-            expected by the operator.
-        :param context: context to store the current the inputs, outputs, and operator
-            for each step of the router.
+ Run through the operators using the provided router and scheduler. + The input to a given operator is the output of the previous operator. + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. """ next_step = self.router.START_ROUTE operator_output = None + while next_step != self.router.END_ROUTE: # Either a dictionary key or valid index operator = self.ops[next_step] if next_step == self.router.START_ROUTE: output_future = self._scheduler_group.submit( - *args, operator=operator, **kwargs + *args, + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **kwargs, ) else: if isinstance(operator_output, dict): output_future = self._scheduler_group.submit( - operator=operator, **operator_output + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **operator_output, ) else: output_future = self._scheduler_group.submit( - operator_output, operator=operator + operator_output, + inference_state=inference_state, + pipeline_state=pipeline_state, + operator=operator, ) - # wait for future to resolve operator_output = output_future.result() - next_step = self.router.next(next_step, self.ops) + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + return operator_output def __call__(self, *args, **kwargs): """ + Consolidate any provided inference_state or pipeline_state objects and pass + any other operator inputs to run(). + :return: output of the pipeline operators ran with the router for the given - input + input """ + if kwargs.get("inference_state"): + inference_state = kwargs.pop("inference_state") + else: + inference_state = InferenceState() + inference_state.create_state({}) + + if "pipeline_state" in kwargs: + self.pipeline_state = kwargs.get("pipeline_state") + + kwargs["inference_state"] = inference_state + kwargs["pipeline_state"] = self.pipeline_state + return self.run(*args, **kwargs) def validate(self): diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6050803b5e..d1110d4ca7 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -15,14 +15,14 @@ import logging from abc import abstractmethod -from typing import Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import Operator _LOGGER = logging.getLogger(__name__) -__all__ = ["Router", "LinearRouter"] +__all__ = ["Router", "LinearRouter", "GraphRouter"] class Router: @@ -32,23 +32,34 @@ class Router: :param start_route: the start index or key of the router :param end_route: the end index or key of the router + :param route: the route that the router has to traverse through """ - def __init__(self, end_route: Union[str, int], start_route: Union[str, int]): + def __init__( + self, + end_route: Union[str, int], + start_route: Union[str, int], + route: Optional[Dict] = None, + ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.route = route @abstractmethod def next( - self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]] + self, + past: Union[str, int], + ops: Optional[Union[List[Operator], Dict[str, Operator]]], + inp: Optional[Any], ) -> Union[str, int]: """ 
Determines the index or dictionary key for the next operator which should run.
 
         :param past: the previous index or key. This should uniquely determine the next
-            operator to run
+            operator to run
         :param ops: list or dictionary of operators
+        :param inp: operator input
         :returns: the next index or dictionary key for the next operator to run
         """
         raise NotImplementedError
@@ -69,7 +80,9 @@ class LinearRouter(Router):
     def __init__(self, end_route: int, start_route: int = 0):
         super().__init__(end_route=end_route, start_route=start_route)
 
-    def next(self, past: int, ops: List[Operator]) -> int:
+    def next(
+        self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None
+    ) -> int:
         new_index = past + 1
         if new_index < self.END_ROUTE:
             return new_index
         return self.END_ROUTE
@@ -105,3 +118,35 @@ def validate(operators: List[Operator]) -> bool:
             )
             return False
     return True
+
+
+class GraphRouter(Router):
+    """
+    Router for a DAG. Expects graphs to be presented in the form of a dictionary, where
+    keys are the nodes of the graph and the values are the connected nodes. For
+    nodes with multiple output edges, all the nodes will be visited and the first node
+    where `can_operate` returns True will run. Paths should be deterministic.
+    """
+
+    def __init__(self, end_route: str, start_route: str, route: Dict):
+        super().__init__(end_route=end_route, start_route=start_route, route=route)
+
+    def next(
+        self,
+        past: str,
+        ops: Dict[str, Operator],
+        inp: Any,
+    ) -> str:
+        node = past
+        if isinstance(self.route[node], str):
+            return self.route[node]
+        else:
+            for neighbour_node in self.route[node]:
+                neighbour_node_op = ops[neighbour_node]
+                if neighbour_node_op.can_operate(inp):
+                    return neighbour_node
+        raise ValueError("Cannot operate on any of the nodes")
+
+    @staticmethod
+    def validate(ops) -> bool:
+        pass
diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py
index 7d4f249444..78a58e3389 100644
--- a/src/deepsparse/v2/schedulers/scheduler.py
+++ b/src/deepsparse/v2/schedulers/scheduler.py
@@ -36,19 +36,30 @@ class OperatorScheduler:
     def __init__(self, max_workers: int = 1):
         self._threadpool = ThreadPoolExecutor(max_workers=max_workers)
 
-    def submit(self, *args, operator: Operator, **kwargs) -> Future:
+    def submit(
+        self,
+        *args,
+        operator: Operator,
+        **kwargs,
+    ) -> Future:
         """
         :param operator: operator to run
-        :param operator_input: input schema to the operator
-        :param context: context of already run operators
         :return: future referencing the asynchronously run output of the operator
         """
-        return self._threadpool.submit(operator, *args, **kwargs)
+        return self._threadpool.submit(
+            operator,
+            *args,
+            **kwargs,
+        )
 
-    def can_process(self, *args, operator: Operator, **kwargs) -> bool:
+    def can_process(
+        self,
+        *args,
+        operator: Operator,
+        **kwargs,
+    ) -> bool:
         """
         :param operator: operator to check
-        :param operator_input: operator_input to check
         :return: True if this Operator can process the given operator and input.
Base OperatorScheduler always returns True """ diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 7f00a3c17c..40b5695f22 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -34,25 +34,44 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit(self, *args, operator: Operator, **kwargs) -> Future: + def submit( + self, + *args, + operator: Operator, + **kwargs, + ) -> Future: """ :param operator: operator to run - :param operator_input: input schema to the operator - :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(*args, operator=operator, **kwargs): - return scheduler.submit(*args, operator=operator, **kwargs) + if scheduler.can_process( + *args, + operator=operator, + **kwargs, + ): + return scheduler.submit( + *args, + operator=operator, + **kwargs, + ) - def can_process(self, *args, operator: Operator, **kwargs) -> bool: + def can_process( + self, + *args, + operator: Operator, + **kwargs, + ) -> bool: """ :param operator: operator to check - :param operator_input: operator_input to check :return: True if this Operator can process the given operator and input. SchedulerGroup always returns True """ return any( - scheduler.can_process(*args, operator=operator, **kwargs) + scheduler.can_process( + *args, + operator=operator, + **kwargs, + ) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py new file mode 100644 index 0000000000..37ac88d02f --- /dev/null +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# flake8: noqa +from .autoregressive_preprocess_operator import * +from .compile_logits import * +from .kv_cache_operator import * +from .multi_engine_prefill_operator import * +from .nl_engine_operator import * +from .prep_for_prefill import * +from .process_inputs import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py new file mode 100644 index 0000000000..cfe7cb531b --- /dev/null +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any
+
+import numpy
+
+from deepsparse.transformers.utils.helpers import create_causal_mask
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.utils import PipelineState
+
+
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["AutoRegressiveOperatorPreprocess"]
+
+
+class AutoRegressiveOperatorPreprocess(Operator):
+    def __init__(self, sequence_length: int, prompt_sequence_length: int):
+        """
+        Prepare the tokens for the single-token engine. This requires creating the
+        attention mask, positions, and causal mask. The output contains these three
+        arrays to be passed into the single-token engine.
+        """
+        self.sequence_length = sequence_length
+        self.prompt_sequence_length = prompt_sequence_length
+        self.set_capacity = False
+
+        _LOGGER.warn(
+            "This operator requires the PipelineState to be set up with the "
+            "onnx_input_names_no_cache attribute set from the NLEngineOperator."
+        )
+
+    def can_operate(self, inp: Any) -> bool:
+        """
+        Can run this Operator if the number of tokens left to process is greater than
+        0 but less than self.prompt_sequence_length.
+        """
+        tokens = inp.get("tokens")
+        kv_cache = inp.get("kv_cache")
+
+        remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens
+        if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length:
+            return True
+        return False
+
+    def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs):
+
+        if not self.set_capacity:
+            self.set_capacity = True
+            kv_cache.set_capacity(self.sequence_length - 1)
+
+        num_total_processed_tokens = kv_cache.total_num_processed_tokens
+        new_token = tokens[num_total_processed_tokens]
+        engine_input_names = pipeline_state.current_state.get(
+            "onnx_input_names_no_cache"
+        )
+
+        # padding is added to left, so attention mask is 1s from the
+        # right up to the number of total tokens (prompt + generated)
+        attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
+        num_attention_entries_to_unmask = min(
+            num_total_processed_tokens + 1, self.sequence_length
+        )  # cap by seq len
+        attention_mask[:, -num_attention_entries_to_unmask:] = 1
+        positions = numpy.array([[num_total_processed_tokens]], dtype=numpy.int64)
+        input_ids = numpy.array([[new_token]])
+        causal_mask = create_causal_mask(input_ids, attention_mask)
+
+        engine_inputs_map = dict(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            causal_mask=causal_mask,
+            positions=positions,
+        )
+
+        engine_inputs = [engine_inputs_map[name] for name in engine_input_names]
+
+        return {
+            "engine_inputs": engine_inputs,
+            "kv_cache": kv_cache,
+            "tokens": tokens,
+        }
diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py
new file mode 100644
index 0000000000..55c87d791d
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/compile_logits.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 - present /
Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompilePromptLogits"] + + +class CompilePromptLogits(Operator): + """ + Combine the prompt logits. Currently relying on the inference state to store the + prompt logits for each token or multi-token batch processed. This operator will + take prompt logits from each iteration run and update the inference state. + """ + + def run(self, logits, inference_state: InferenceState, **kwargs): + logit_type = "prompt_logits" + + if inference_state.current_state.get(logit_type) is not None: + current_logits = inference_state.current_state.get(logit_type).copy() + current_logits.append(logits) + else: + current_logits = [logits] + + state_update = {logit_type: current_logits} + return { + "kv_cache": kwargs.get("kv_cache"), + "tokens": kwargs.get("tokens"), + }, state_update diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py new file mode 100644 index 0000000000..0b232402b3 --- /dev/null +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
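CompilePromptLogits above relies on the tuple-return contract introduced in this patch: when an operator returns (output, state_update), Pipeline.run applies the update to the shared InferenceState after the operator's future resolves. A minimal sketch of that contract, assuming the InferenceState API (create_state / update_state / current_state) added in src/deepsparse/v2/utils/state.py:

    import numpy

    from deepsparse.v2.utils import InferenceState

    # placeholder logits standing in for two prompt-processing iterations
    logits_step_0 = numpy.ones((1, 4, 32))
    logits_step_1 = numpy.ones((1, 4, 32))

    # fresh state, as Pipeline.__call__ creates when none is provided
    inference_state = InferenceState()
    inference_state.create_state({})

    # an operator returns (output, state_update); the pipeline applies the update
    output, state_update = {"tokens": [1, 2, 3]}, {"prompt_logits": [logits_step_0]}
    inference_state.update_state(state_update)

    # a later iteration appends to the stored list, as CompilePromptLogits does
    current_logits = inference_state.current_state.get("prompt_logits").copy()
    current_logits.append(logits_step_1)
    inference_state.update_state({"prompt_logits": current_logits})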
+ +from typing import Any + +from pydantic import BaseModel, Field + +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import ( + initialize_kv_cache_state, + prepends_bos_token, +) +from deepsparse.v2.operators import Operator + + +__all__ = ["KVCacheCreator"] + + +class KVCacheCreatorOutput(BaseModel): + kv_cache: Any = Field(description="KV Cache Created") # DecoderKVCache + + +class KVCacheCreatorInput(BaseModel): + cache_shape: Any = Field(description="shape") + kv_cache_data_type: Any = Field(description="data type") + output_names: Any = Field(description="output names") + + +class KVCacheCreator(Operator): + input_schema = KVCacheCreatorInput + output_schema = KVCacheCreatorOutput + + def __init__( + self, + tokenizer, + sequence_length: int, + prompt_sequence_length: int, + internal_kv_cache: bool, + ): + self.tokenizer = tokenizer + self.prompt_sequence_length = prompt_sequence_length + self.internal_kv_cache = internal_kv_cache + self.sequence_length = sequence_length + + def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs): + kv_cache_state = initialize_kv_cache_state( + cache_shape=cache_shape, + kv_cache_data_type=kv_cache_data_type, + output_names=output_names, + length=self.sequence_length - self.prompt_sequence_length, + empty=bool(self.internal_kv_cache), + ) + + kv_cache = DecoderKVCache(self.internal_kv_cache) + kv_cache.setup( + state=kv_cache_state, + freeze_first_position=prepends_bos_token(self.tokenizer), + ) + return {"kv_cache": kv_cache} diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py new file mode 100644 index 0000000000..41ee830a8a --- /dev/null +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from enum import Enum +from typing import Any + +import numpy + +from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["MultiEnginePrefill"] + + +class OnnxInputNames(Enum): + INPUT_IDS = "input_ids" + ATTN_MASK = "attention_mask" + CAUSAL_MASK = "causal_mask" + POSITIONS = "positions" + + +# NOTE: A possible clean-up could involve combining this Operator and the +# autoregressive_preprocess_operator + + +class MultiEnginePrefill(Operator): + def __init__(self, prompt_sequence_length, sequence_length): + """ + Prepare the tokens for the multi-token engine. This requires creating the + attention mask, positions, and causal mask. The output contains these three + arrays to be passed into the multi-token engine. 
+        """
+        self.prompt_sequence_length = prompt_sequence_length
+        self.sequence_length = sequence_length
+        self.cases = {
+            OnnxInputNames.ATTN_MASK.value: self._case_attn_mask,
+            OnnxInputNames.POSITIONS.value: self._case_positions,
+        }
+        _LOGGER.warn(
+            "This operator requires the PipelineState to be set up with the "
+            "onnx_input_names_no_cache attribute set from the NLEngineOperator."
+        )
+
+    def can_operate(self, inp: Any):
+        """
+        Can only run if the number of prompt tokens left to process is greater than
+        or equal to self.prompt_sequence_length.
+        """
+        kv_cache = inp.get("kv_cache")
+        tokens = inp.get("tokens")
+
+        if len(tokens) < self.prompt_sequence_length:
+            return False
+
+        if (
+            len(tokens) - kv_cache.total_num_processed_tokens
+            >= self.prompt_sequence_length
+        ):
+            return True
+        return False
+
+    def _case_attn_mask(self, num_total_processed_tokens: int):
+        # create an empty attention mask
+        engine_input = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
+        # calculate the number of entries in attention mask that should be set to 1
+        num_attention_entries_to_unmask = min(
+            num_total_processed_tokens + self.prompt_sequence_length,
+            self.sequence_length,
+        )
+        engine_input[:, -num_attention_entries_to_unmask:] = 1
+        return engine_input
+
+    def _case_positions(self, num_total_processed_tokens: int):
+        return (
+            numpy.arange(
+                num_total_processed_tokens,
+                num_total_processed_tokens + self.prompt_sequence_length,
+            )
+            .reshape(1, -1)
+            .astype(numpy.int64)
+        )
+
+    def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs):
+
+        onnx_input_names_no_cache = pipeline_state.current_state.get(
+            "onnx_input_names_no_cache"
+        )
+
+        num_total_processed_tokens = kv_cache.total_num_processed_tokens
+        start = num_total_processed_tokens
+        end = start + self.prompt_sequence_length
+        token_batch = tokens[start:end]
+
+        engine_inputs = []
+        for name in onnx_input_names_no_cache:
+            if name == OnnxInputNames.INPUT_IDS.value:
+                engine_input = numpy.array([token_batch])
+            elif (
+                name == OnnxInputNames.ATTN_MASK.value
+                or name == OnnxInputNames.POSITIONS.value
+            ):
+                engine_input = self.cases[name](num_total_processed_tokens)
+            elif name == OnnxInputNames.CAUSAL_MASK.value:
+                continue
+
+            engine_inputs.append(engine_input)
+
+        if OnnxInputNames.CAUSAL_MASK.value in onnx_input_names_no_cache:
+            causal_mask = create_causal_mask(
+                input_ids=engine_inputs[0],
+                attention_mask=engine_inputs[1],
+            )
+            engine_inputs.append(causal_mask)
+
+        return {
+            "engine_inputs": engine_inputs,
+            "kv_cache": kv_cache,
+            "tokens": tokens,
+        }
diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py
new file mode 100644
index 0000000000..6c1ad1966e
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
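The mask and position arithmetic in the MultiEnginePrefill operator above can be checked by hand. A small worked sketch, with toy sizes (sequence_length=8, prompt_sequence_length=4) chosen purely for illustration, after four tokens have already been processed:

    import numpy

    sequence_length, prompt_sequence_length = 8, 4
    num_total_processed_tokens = 4

    # _case_attn_mask: unmask entries from the right covering processed
    # tokens plus the incoming prompt window, capped at sequence_length
    attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
    num_entries_to_unmask = min(
        num_total_processed_tokens + prompt_sequence_length, sequence_length
    )
    attention_mask[:, -num_entries_to_unmask:] = 1
    # attention_mask -> [[1, 1, 1, 1, 1, 1, 1, 1]]

    # _case_positions: absolute positions of the incoming prompt window
    positions = (
        numpy.arange(
            num_total_processed_tokens,
            num_total_processed_tokens + prompt_sequence_length,
        )
        .reshape(1, -1)
        .astype(numpy.int64)
    )
    # positions -> [[4, 5, 6, 7]]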
+ +import copy +import os +from typing import Any, List, Tuple + +from pydantic import BaseModel, Field + +from deepsparse.utils.onnx import ( + CACHE_INPUT_PREFIX, + overwrite_onnx_model_inputs_for_kv_cache_models, +) +from deepsparse.v2.operators.engine_operator import ( + DEEPSPARSE_ENGINE, + EngineOperator, + EngineOperatorInputs, +) + + +__all__ = ["NLEngineOperator"] + + +class NlEngineInput(BaseModel): + engine_inputs: List = Field(description="engine inputs") + kv_cache: Any = Field(description="kv_cache object") + tokens: List = Field(description="tokens") + + +class NLEngineOperator(EngineOperator): + + """ + Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. + Specific updates to engine attributes are made through this operator, as well + as updating the kv_cache. This Operator is used for both the single-token and + multi-token case. + """ + + input_schema = NlEngineInput + output_schema = None + + def __init__( + self, + sequence_length: int, + input_ids_length: int, + internal_kv_cache: bool = False, + **kwargs, + ): + + self.kv_cache_data_type = None + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + sequence_length=sequence_length, + input_ids_length=input_ids_length, + ) + + engine_kwargs = kwargs.get("engine_kwargs", {}) + if kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE: + if "WAND_OPT_FLAGS" not in os.environ: + os.environ["WAND_OPT_FLAGS"] = "default,~pyramids" + + if any(output_indices_to_be_cached): + self.kv_cache_data_type = kv_cache_data_type + if ( + internal_kv_cache + and kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE + ): + engine_kwargs["cached_outputs"] = output_indices_to_be_cached + + kwargs["engine_kwargs"] = engine_kwargs + kwargs["model_path"] = onnx_file_path + super().__init__(**kwargs) + + self.input_ids_length = input_ids_length + + def run(self, inp: NlEngineInput, **kwargs) -> Any: + engine_input = inp.engine_inputs + kv_cache = inp.kv_cache + + inputs = self._add_kv_cache_to_input(engine_input, kv_cache) + if bool(kv_cache.engine_internal_cache): + # conventionally, before dispatching + # inputs to the engine, we validate them + # if val_inp=True. However, in this case + # we want to pass the empty kv cache inputs + # (batch_size=0) to the engine. 
Therefore, + # we skip the validation + out = self.engine._eng_net.execute_list_out( + inputs, kv_cache.engine_internal_cache + ) + else: + # run the engine without the LIB.kv_cache object + out = ( + super() + .run(EngineOperatorInputs(engine_inputs=inputs), **kwargs) + .get("engine_outputs") + ) + + logits, *kv_cache_state = out + self._update_kv_cache( + kv_cache_state=kv_cache_state, + input_ids_len=self.input_ids_length, + kv_cache=kv_cache, + ) + + output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + return output + + def _add_kv_cache_to_input(self, engine_input, kv_cache): + kv_cache_state = copy.copy(kv_cache.cached_inputs) + + for idx, input_name in enumerate(self.onnx_input_names_no_cache): + kv_cache_state[input_name] = engine_input[idx] + + new_inp = [kv_cache_state[name] for name in self.engine.input_names] + return new_inp + + def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): + if bool(kv_cache.engine_internal_cache): + kv_cache.total_num_processed_tokens += input_ids_len + return + + kv_cache_state = { + name: array + for name, array in zip(self.onnx_input_names_cached, kv_cache_state) + } + + kv_cache.update( + state=kv_cache_state, + input_ids_len=input_ids_len, + ) + + @property + def onnx_input_names_no_cache(self) -> List[str]: + """ + :return: The input names for the onnx model, excluding + the potential kv cache inputs + """ + return [ + name + for name in self.engine.input_names + if not name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def onnx_input_names_cached(self) -> List[str]: + """ + :return: The cached input names for the onnx model + """ + return [ + name + for name in self.engine.input_names + if name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def cache_shape(self) -> Tuple[int, int, int, int]: + """ + :return: The shape of the kv cache inputs + for the onnx model. The shape is + (batch_size, num_heads, sequence_length, hidden_size) + """ + cache_engine_input_index = next( + i + for i, name in enumerate(self.engine.input_names) + if CACHE_INPUT_PREFIX in name + ) + return self.engine.input_shapes[cache_engine_input_index] + + @property + def output_names(self) -> List[str]: + """ + :return: The output names for the onnx model + """ + return self.engine.output_names diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py new file mode 100644 index 0000000000..9878aa0061 --- /dev/null +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -0,0 +1,213 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
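The onnx_input_names_no_cache / onnx_input_names_cached properties above partition the engine's input names by the kv-cache prefix. A minimal sketch of that partitioning; the "past_key_values" prefix value and the sample input names are illustrative assumptions, not taken from this diff:

    # Sketch: splitting engine input names the way NLEngineOperator's
    # properties do, using an assumed cache prefix and name list.
    CACHE_INPUT_PREFIX = "past_key_values"  # assumed value for illustration

    input_names = [
        "input_ids",
        "attention_mask",
        "causal_mask",
        "positions",
        "past_key_values.0.key",
        "past_key_values.0.value",
    ]

    no_cache = [n for n in input_names if not n.startswith(CACHE_INPUT_PREFIX)]
    cached = [n for n in input_names if n.startswith(CACHE_INPUT_PREFIX)]
    # no_cache -> ["input_ids", "attention_mask", "causal_mask", "positions"]
    # cached   -> ["past_key_values.0.key", "past_key_values.0.value"]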
+
+from typing import Dict
+
+from deepsparse.transformers.utils.helpers import process_generation_config
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.pipeline import Pipeline
+from deepsparse.v2.routers import GraphRouter
+from deepsparse.v2.schedulers import OperatorScheduler
+from deepsparse.v2.text_generation import (
+    AutoRegressiveOperatorPreprocess,
+    CompilePromptLogits,
+    KVCacheCreator,
+    MultiEnginePrefill,
+    NLEngineOperator,
+    PrepareforPrefill,
+    ProcessInputsTextGeneration,
+)
+from deepsparse.v2.utils import PipelineState
+
+
+class TextGenerationPipeline(Pipeline):
+    def __init__(
+        self,
+        model_path: str,
+        prompt_sequence_length: int = 16,
+        sequence_length: int = 1024,
+        internal_kv_cache: bool = True,
+        force_max_tokens: bool = False,
+        generation_config=None,
+        engine_kwargs: Dict = None,
+    ):
+
+        pipeline_state = PipelineState()
+        pipeline_state_vals = {}
+
+        # TODO: The code below will be replaced with a transformers set-up Operator.
+        self.tokenizer = None
+        model_path = self.setup_onnx_file_path(model_path, sequence_length)
+        self.tokenizer.padding_side = "left"
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        if not engine_kwargs:
+            engine_kwargs = {}
+        engine_kwargs["model_path"] = model_path
+
+        if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime":
+            internal_kv_cache = False
+
+        single_engine_operator = NLEngineOperator(
+            sequence_length=sequence_length,
+            internal_kv_cache=internal_kv_cache,
+            input_ids_length=1,
+            **engine_kwargs,
+        )
+
+        multi_engine_operator = NLEngineOperator(
+            sequence_length=sequence_length,
+            internal_kv_cache=internal_kv_cache,
+            input_ids_length=prompt_sequence_length,
+            **engine_kwargs,
+        )
+
+        # NOTE: Currently using pipeline state. Can swap to simply pass in the
+        # attributes to the specific Operators that need them, as class attributes.
+        pipeline_state_vals[
+            "onnx_input_names_no_cache"
+        ] = single_engine_operator.onnx_input_names_no_cache
+        pipeline_state_vals["cache_shape"] = single_engine_operator.cache_shape
+        pipeline_state_vals["output_names"] = single_engine_operator.output_names
+        pipeline_state_vals[
+            "kv_cache_data_type"
+        ] = single_engine_operator.kv_cache_data_type
+        pipeline_state.create_state(pipeline_state_vals)
+
+        process_inputs = ProcessInputsTextGeneration(
+            generation_config=process_generation_config(generation_config),
+            sequence_length=sequence_length,
+            tokenizer=self.tokenizer,
+        )
+
+        kv_cache_creator = KVCacheCreator(
+            sequence_length=sequence_length,
+            tokenizer=self.tokenizer,
+            prompt_sequence_length=prompt_sequence_length,
+            internal_kv_cache=internal_kv_cache,
+        )
+
+        # NOTE: Can also have the KVCacheCreator be initialized inside this Operator.
+        # Relies on pipeline state variables set up above (can be swapped to be class
+        # attributes instead of using the state).
+ engine_inputs_for_prefill = PrepareforPrefill(kv_cache_creator=kv_cache_creator) + + multi_engine_prefill = MultiEnginePrefill( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + compile_prompt_logits = CompilePromptLogits() + """ + prep_for_single_engine = PrepareforSingleEngine( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + """ + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, + ) + final_step = FinalStep() + + ops = { + "process_input": process_inputs, + "single_engine": single_engine_operator, + "multi_engine": multi_engine_operator, + "kv_cache_creator": kv_cache_creator, + "prepare_prefill": engine_inputs_for_prefill, + "multi_engine_prefill": multi_engine_prefill, + "compile_logits": compile_prompt_logits, + "autoregressive_preprocess": autoregressive_preprocess, + "final_step": final_step, + } + + routes = { + "process_input": "prepare_prefill", + "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], + "multi_engine_prefill": "multi_engine", + "multi_engine": "compile_logits", + "compile_logits": [ + "multi_engine_prefill", + "autoregressive_preprocess", + "final_step", + ], + "autoregressive_preprocess": "single_engine", + "single_engine": "compile_logits", + "final_step": "STOP", + } + + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state + ) + + # TODO: Move to be part of a generic transformers set-up Operator. + def setup_onnx_file_path(self, model_path, sequence_length) -> str: + import logging + + import transformers + from transformers import AutoTokenizer + + from deepsparse.transformers.helpers import get_deployment_path + + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + self.config = transformers.PretrainedConfig.from_pretrained( + deployment_path, + finetuning_task=self.task if hasattr(self, "task") else None, + ) + hf_logger.setLevel(hf_logger_level) + + self._trust_remote_code = False + self.tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=self._trust_remote_code, + model_max_length=sequence_length, + ) + + if not self.config or not self.tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." + ) + return onnx_path + + +# NOTE: This is a dummy last step which will be removed. Used as a final step +# for the current routes. 
+class FinalStep(Operator):
+    def can_operate(self, *args, **kwargs):
+        return True
+
+    def run(self, *args, **kwargs):
+        import numpy
+
+        inference_state = kwargs.get("inference_state")
+        prompt_logits = inference_state.current_state.get("prompt_logits")
+        return numpy.concatenate(prompt_logits, axis=1)
diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py
new file mode 100644
index 0000000000..2f9eb15797
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any
+
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.utils import PipelineState
+
+
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["PrepareforPrefill"]
+
+
+class PrepareforPrefill(Operator):
+    def __init__(self, kv_cache_creator: Operator):
+        """
+        Operator before prefill. Responsible for creating the kv_cache based on engine
+        variables. Currently, this operator expects that the kv_cache_creator is
+        provided during initialization and then uses pipeline_state to run the
+        kv_cache_operator.
+        """
+        # NOTE: Alternatively, we can initialize the kv_cache_creator operator here,
+        # instead of at the pipeline level.
+        self.kv_cache_creator = kv_cache_creator
+
+        _LOGGER.warn(
+            "This operator requires the PipelineState to be set up with the "
+            "cache_shape, output_names, and kv_cache_data_type attributes set "
+            "from the NLEngineOperator"
+        )
+
+    def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs):
+        # NOTE: Can potentially just be class attributes instead of relying on
+        # pipeline state.
+        cache_shape = pipeline_state.current_state.get("cache_shape")
+        data_type = pipeline_state.current_state.get("kv_cache_data_type")
+        output_names = pipeline_state.current_state.get("output_names")
+
+        kv_cache = self.kv_cache_creator.run(
+            cache_shape=cache_shape,
+            kv_cache_data_type=data_type,
+            output_names=output_names,
+        ).get("kv_cache")
+        return {"tokens": tokens, "kv_cache": kv_cache}
diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py
new file mode 100644
index 0000000000..528dcee0b7
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/process_inputs.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+from typing import Dict, Union
+
+import transformers
+
+from deepsparse.transformers.pipelines.text_generation import TextGenerationInput
+from deepsparse.transformers.utils.helpers import (
+    check_and_return_generation_config,
+    override_config,
+    repeat_inputs,
+)
+from deepsparse.v2.operators import Operator
+
+
+class GenerationDefaults:
+    num_return_sequences = 1
+    max_length = 1024
+    max_new_tokens = None
+    output_scores = False
+    top_k = 0
+    top_p = 0.0
+    repetition_penalty = 0.0
+    do_sample = False
+    temperature = 1.0
+
+
+__all__ = ["ProcessInputsTextGeneration"]
+
+
+class ProcessInputsTextGeneration(Operator):
+    """
+    Input processing operator. Responsible for tokenizing the input, handling the
+    generation_config (if provided), updating the inference_state for later use,
+    and returning the tokens for prompt inference. The expected input is defined by
+    the input_schema, which for this operator is TextGenerationInput.
+    """
+
+    input_schema = TextGenerationInput
+
+    def __init__(
+        self,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        generation_config: Union[
+            str, pathlib.Path, Dict, transformers.GenerationConfig
+        ],
+        sequence_length: int,
+    ):
+        self.generation_config = generation_config
+        self.tokenizer = tokenizer
+        self.sequence_length = sequence_length
+
+    def run(self, inp: TextGenerationInput, **kwargs):
+        generation_config = check_and_return_generation_config(
+            self.generation_config, inp.generation_config, GenerationDefaults()
+        )
+
+        generation_config = override_config(inp.generation_kwargs, generation_config)
+
+        original_inputs = inp.sequences
+        if generation_config.num_return_sequences > 1:
+            if isinstance(inp.sequences, str):
+                inp.sequences = [inp.sequences]
+            inp.sequences = repeat_inputs(
+                inp.sequences, generation_config.num_return_sequences
+            )
+
+        if inp.fixed_sequences_length:
+            # to enforce a fixed sequence length, we need to
+            # truncate the input to the maximum sequence length
+            # and/or pad it to the maximum sequence length
+            truncate, padding = True, "max_length"
+        else:
+            # otherwise, we do not need to truncate the input
+            # and we can pad it to the longest sequence
+            # in the batch (so that the engine can process multiple inputs
+            # at once)
+            truncate, padding = False, "longest"
+
+        input_tokens = self.tokenizer(
+            inp.sequences,
+            return_tensors="np",
+            max_length=self.sequence_length,
+            padding=padding,
+            truncation=truncate,
+        )
+
+        input_ids = input_tokens["input_ids"]
+        attention_mask = input_tokens["attention_mask"]
+
+        inference_state_update = dict(
+            prompts=original_inputs,
+            streaming=inp.streaming,
+            generation_config=generation_config,
+            include_prompt_logits=inp.include_prompt_logits,
+            callback=inp.callback,
+            stop=inp.stop,
+            top_p=generation_config.top_p,
+            top_k=generation_config.top_k,
+            presence_penalty=inp.presence_penalty,
+            frequency_penalty=generation_config.repetition_penalty,
+        )
+
+        # TODO: move this step to prep_for_prefill and add attention mask to the output
+        # this will allow us to split/join more easily when processing multiple prompts
+        # in parallel
+        tokens = input_ids[attention_mask.nonzero()].tolist()
+        return {"tokens": tokens}, inference_state_update
diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py
index a36d8e92ec..358405d7af 100644
--- a/src/deepsparse/v2/utils/__init__.py
+++ b/src/deepsparse/v2/utils/__init__.py
@@ -13,5 +13,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from .state import *
 from .types import *
diff --git a/src/deepsparse/v2/utils/state.py b/src/deepsparse/v2/utils/state.py
new file mode 100644
index 0000000000..b54b890acf
--- /dev/null
+++ b/src/deepsparse/v2/utils/state.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from abc import ABC
+from typing import Any, Union
+
+
+__all__ = ["State", "PipelineState", "InferenceState"]
+
+
+class State(ABC):
+    """
+    Abstract class to store pipeline-level and inference-level state variables
+    which are generated by some Operator and required by some other Operator.
+    """
+
+    def __init__(self):
+        self._current_state = None
+
+    @property
+    def current_state(self):
+        return self._current_state
+
+
+class PipelineState(State):
+    """
+    Created during pipeline initialization. Pipeline state values are read-only
+    during inference.
+    """
+
+    def create_state(self, new_state: dict):
+        if self._current_state:
+            raise ValueError("State creation is only allowed during initialization.")
+        self._current_state = new_state
+
+
+class InferenceState(State):
+    """
+    Inference state, created during every inference run.
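+
+    Illustrative lifecycle (a sketch assuming the API defined in this module;
+    the values are made up):
+
+        state = InferenceState()
+        state.create_state({"generated_tokens": [0]})
+        state.update_value("generated_tokens", [0, 1])
+        state.update_state({"streaming": False})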
+    """
+
+    def create_state(self, new_state: dict):
+        if self._current_state:
+            warnings.warn("Current state already exists, overriding.")
+        self._current_state = new_state
+
+    def update_value(self, attribute: str, value: Union[str, int, list]):
+        # membership check (rather than truthiness) so falsy values such as
+        # empty lists remain valid, updatable state attributes
+        if attribute not in self._current_state:
+            raise ValueError(f"{attribute} is not a valid state attribute")
+        self._current_state[attribute] = value
+
+    def update_state(self, value: Any):
+        self._current_state.update(value)
diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py
index 9f85e4976e..bedddd537a 100644
--- a/tests/deepsparse/v2/test_basic_pipeline.py
+++ b/tests/deepsparse/v2/test_basic_pipeline.py
@@ -34,7 +34,7 @@ class AddOneOperator(Operator):
     input_schema = IntSchema
     output_schema = IntSchema
 
-    def run(self, inp: IntSchema) -> Dict:
+    def run(self, inp: IntSchema, **kwargs) -> Dict:
         return {"value": inp.value + 1}
 
 
@@ -42,7 +42,7 @@ class AddTwoOperator(Operator):
     input_schema = IntSchema
     output_schema = IntSchema
 
-    def run(self, inp: IntSchema) -> Dict:
+    def run(self, inp: IntSchema, **kwargs) -> Dict:
         return {"value": inp.value + 2}
 

From 5ee36d6aca7c6d510765c433f6c31a1751bad58b Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 3 Nov 2023 11:15:00 -0400
Subject: [PATCH 5/7] [Pipeline Refactor] Additional Operators, Route update and completed generation functionality (#1356)

* initial functionality and working example with image classification

* remove testing image

* rebase fixes

* initial functionality and working example with image classification

* text gen

* updates func

* prompt inference, initial functionality

* remove image; update state docstring

* Fix typo

* add todo for split/join

* remove context, clean-up args, remove prefill_preprocess_operator

* fix docstrings

* initial functionality and working example with image classification

* updates func

* prompt inference, initial functionality

* finish generation operators and update routes

* further breakdown operators

* add operators

* fix can_operate condition

* update can_operate to not rely on the inference_state

* rebase + update

* fix condition

* fix capacity setting again

* typo fixes
---
 .../v2/operators/engine_operator.py           |   3 +
 src/deepsparse/v2/text_generation/__init__.py |   7 +
 .../autoregressive_preprocess_operator.py     |  20 ++-
 .../compile_generated_tokens.py               |  56 +++++++
 .../v2/text_generation/compile_generations.py |  55 +++++++
 .../v2/text_generation/compile_logits.py      |   6 +
 .../v2/text_generation/generate_new_token.py  |  90 +++++++++++
 .../multi_engine_prefill_operator.py          |   1 +
 .../v2/text_generation/nl_engine_operator.py  |   8 +-
 src/deepsparse/v2/text_generation/pipeline.py |  61 ++++----
 .../v2/text_generation/prep_for_generation.py | 140 ++++++++++++++++++
 .../v2/text_generation/process_inputs.py      |   2 +-
 .../v2/text_generation/process_outputs.py     |  88 +++++++++++
 .../v2/text_generation/token_generator.py     |  30 ++++
 14 files changed, 529 insertions(+), 38 deletions(-)
 create mode 100644 src/deepsparse/v2/text_generation/compile_generated_tokens.py
 create mode 100644 src/deepsparse/v2/text_generation/compile_generations.py
 create mode 100644 src/deepsparse/v2/text_generation/generate_new_token.py
 create mode 100644 src/deepsparse/v2/text_generation/prep_for_generation.py
 create mode 100644 src/deepsparse/v2/text_generation/process_outputs.py
 create mode 100644 src/deepsparse/v2/text_generation/token_generator.py

diff --git a/src/deepsparse/v2/operators/engine_operator.py
b/src/deepsparse/v2/operators/engine_operator.py index b7d920a686..c2fc562c63 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -87,6 +87,9 @@ def __init__( self._engine_args = engine_args self._engine_type = engine_type + if not engine_kwargs: + engine_kwargs = {} + self.engine = self.create_engine(**engine_kwargs) @property diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 37ac88d02f..21cd7e2acd 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -13,12 +13,19 @@ # limitations under the License. # flake8: noqa from .autoregressive_preprocess_operator import * +from .compile_generated_tokens import * +from .compile_generations import * from .compile_logits import * +from .generate_new_token import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * from .prep_for_prefill import * from .process_inputs import * +from .process_outputs import * +from .token_generator import * # isort:skip +from .prep_for_generation import * # isort:skip + from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py index cfe7cb531b..6e97412e43 100644 --- a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -36,7 +36,6 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int): """ self.sequence_length = sequence_length self.prompt_sequence_length = prompt_sequence_length - self.set_capacity = False _LOGGER.warn( "This operator requires the PipelineState to be set-up with the " @@ -51,16 +50,19 @@ def can_operate(self, inp: Any) -> bool: tokens = inp.get("tokens") kv_cache = inp.get("kv_cache") + if inp.get("in_generation"): + return True + remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens - if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length: + can_process = ( + remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length + ) + if can_process and inp.get("in_generation") is None: return True return False def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): - - if not self.set_capacity: - self.set_capacity = True - kv_cache.set_capacity(self.sequence_length - 1) + kv_cache.set_capacity(self.sequence_length - 1) num_total_processed_tokens = kv_cache.total_num_processed_tokens new_token = tokens[num_total_processed_tokens] @@ -88,13 +90,9 @@ def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwarg engine_inputs = [engine_inputs_map[name] for name in engine_input_names] - onnx_input_names_no_cache = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - engine_inputs = [engine_inputs_map[name] for name in onnx_input_names_no_cache] - return { "engine_inputs": engine_inputs, "kv_cache": kv_cache, "tokens": tokens, + "in_generation": kwargs.get("in_generation"), } diff --git a/src/deepsparse/v2/text_generation/compile_generated_tokens.py b/src/deepsparse/v2/text_generation/compile_generated_tokens.py new file mode 100644 index 0000000000..c87436ab3a --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generated_tokens.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGeneratedTokens"] + + +class CompileGeneratedTokens(Operator): + def run( + self, + new_token, + logits, + finish_reason, + kv_cache, + tokens, + inference_state: InferenceState, + **kwargs, + ): + in_generation = True + + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + generated_tokens.append(new_token) + generated_logits.append(logits) + finished_reason.append(finish_reason) + + if finish_reason is not None: + in_generation = False + + state_update = { # TODO: check if necessary + "finished_reason": finished_reason, + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + } + + output = { + "tokens": tokens, + "kv_cache": kv_cache, + "in_generation": in_generation, + } + return output, state_update diff --git a/src/deepsparse/v2/text_generation/compile_generations.py b/src/deepsparse/v2/text_generation/compile_generations.py new file mode 100644 index 0000000000..ed8297ac01 --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generations.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
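+
+# Illustrative sketch (the shapes are assumed; a vocab size of 32 is made up):
+# the CompileGenerations operator below concatenates per-step generated logits
+# along the sequence axis, e.g.:
+#
+#   import numpy
+#   steps = [numpy.zeros((1, 1, 32)), numpy.zeros((1, 1, 32))]
+#   numpy.concatenate(steps, axis=1).shape  # -> (1, 2, 32)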
+from typing import Any + +import numpy +from pydantic import BaseModel, Field + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGenerations", "CompileGenerationsOutput"] + + +class CompileGenerationsOutput(BaseModel): + generated_tokens: Any = Field(description="generated_tokens") + generated_logits: Any = Field(description="generated_logits") + finished_reason: Any = Field(description="finished_reason") + + +class CompileGenerations(Operator): + output_schema = CompileGenerationsOutput + + def can_operate(self, inp: Any): + if inp.get("in_generation") is False: + return True + return False + + def run(self, inference_state: InferenceState, **kwargs): + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + if len(finished_reason) == 0: + finished_reason.append(FinishReason.LENGTH) + + generated_tokens = numpy.array([generated_tokens]) + generated_logits = numpy.concatenate(generated_logits, axis=1) + return { + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py index 55c87d791d..21bd50e03e 100644 --- a/src/deepsparse/v2/text_generation/compile_logits.py +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any from deepsparse.v2.operators import Operator from deepsparse.v2.utils import InferenceState @@ -27,6 +28,11 @@ class CompilePromptLogits(Operator): take prompt logits from each iteration run and update the inference state. """ + def can_operate(self, inp: Any): + if inp.get("in_generation") is None: + return True + return False + def run(self, logits, inference_state: InferenceState, **kwargs): logit_type = "prompt_logits" diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py new file mode 100644 index 0000000000..33ab546e39 --- /dev/null +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
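+
+# Illustrative note (a sketch of the stop-token normalization used below):
+# non-whitespace decodings are stripped before comparison, e.g.
+#
+#   decoded = " stop "
+#   decoded if decoded.isspace() else decoded.strip()  # -> "stop"
+#
+# so stop=["stop"] matches, while a pure-whitespace decoding such as "\n"
+# is compared as-is.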
+from typing import Any, Sequence, Union + +import transformers + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["GenerateNewTokenOperator"] + + +class GenerateNewTokenOperator(Operator): + def __init__( + self, tokenizer: transformers.PreTrainedTokenizerBase, force_max_tokens: bool + ): + self.force_max_tokens = force_max_tokens + self.tokenizer = tokenizer + + def can_operate(self, inp: Any): + if inp.get("in_generation"): + return True + return False + + def run(self, logits, kv_cache, inference_state: InferenceState, **kwargs): + token_generator = inference_state.current_state.get("token_generator") + token = token_generator.generate(logits=logits[0, -1, :]) + finish_reason = None + + callback = inference_state.current_state.get("callback") + stop = inference_state.current_state.get("stop") + + if token == self.tokenizer.eos_token_id and not self.force_max_tokens: + finish_reason = FinishReason.STOP + + if self._stop_token_generated(token, stop_tokens=stop): + print( + "Stop token %s generated. Stopping generation." + % self.tokenizer.decode(token) + ) + finish_reason = FinishReason.STOP + + if callback is not None and callback(token) is False: + print( + "callback %s returned False, stopping generation." + % callback.__qualname__ + ) + finish_reason = FinishReason.CALLBACK + + max_tokens = inference_state.current_state.get("max_tokens") + if len(inference_state.current_state.get("generated_tokens")) + 1 == max_tokens: + finish_reason = inference_state.current_state.get("length_finish_reason") + + state_update = { + "token_generator": token_generator, + } + + new_generation = { + "logits": logits, + "new_token": token, + "finish_reason": finish_reason, + } + output = {"tokens": token_generator.tokens, "kv_cache": kv_cache} + output.update(new_generation) + return output, state_update + + def _stop_token_generated( + self, token, stop_tokens: Union[None, str, Sequence[str]] + ) -> bool: + if stop_tokens is None: + return False + + decoded_token = self.tokenizer.decode(token) + decoded_token = ( + decoded_token if decoded_token.isspace() else decoded_token.strip() + ) + return decoded_token in stop_tokens diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py index 41ee830a8a..9a885c2355 100644 --- a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -97,6 +97,7 @@ def _case_positions(self, num_total_processed_tokens: int): ) def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + kv_cache.set_capacity(self.sequence_length - self.prompt_sequence_length) onnx_input_names_no_cache = pipeline_state.current_state.get( "onnx_input_names_no_cache" diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 6c1ad1966e..0bd9098a40 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -36,6 +36,7 @@ class NlEngineInput(BaseModel): engine_inputs: List = Field(description="engine inputs") kv_cache: Any = Field(description="kv_cache object") tokens: List = Field(description="tokens") + in_generation: bool = Field(description="in_generation", default=None) class NLEngineOperator(EngineOperator): @@ 
-119,7 +120,12 @@ def run(self, inp: NlEngineInput, **kwargs) -> Any: kv_cache=kv_cache, ) - output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + output = { + "logits": logits, + "kv_cache": kv_cache, + "tokens": inp.tokens, + "in_generation": inp.in_generation, + } return output def _add_kv_cache_to_input(self, engine_input, kv_cache): diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 9878aa0061..49826b8af7 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,18 +15,23 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config -from deepsparse.v2.operators import Operator from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, + CompileGeneratedTokens, + CompileGenerations, CompilePromptLogits, + GenerateNewTokenOperator, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, PrepareforPrefill, + PrepareGeneration, ProcessInputsTextGeneration, + ProcessOutputs, + TokenGeneratorOperator, ) from deepsparse.v2.utils import PipelineState @@ -109,17 +114,23 @@ def __init__( sequence_length=sequence_length, ) compile_prompt_logits = CompilePromptLogits() - """ - prep_for_single_engine = PrepareforSingleEngine( - prompt_sequence_length=prompt_sequence_length, + + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, ) - """ - autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + token_generator = TokenGeneratorOperator() + prep_for_generation = PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=prompt_sequence_length, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=force_max_tokens ) - final_step = FinalStep() + process_output = ProcessOutputs(tokenizer=self.tokenizer) + compile_generations = CompileGenerations() + compile_generated_tokens = CompileGeneratedTokens() ops = { "process_input": process_inputs, @@ -130,7 +141,11 @@ def __init__( "multi_engine_prefill": multi_engine_prefill, "compile_logits": compile_prompt_logits, "autoregressive_preprocess": autoregressive_preprocess, - "final_step": final_step, + "prep_for_generation": prep_for_generation, + "generate_new_token": generate_new_token, + "process_outputs": process_output, + "compile_generations": compile_generations, + "compile_generated_tokens": compile_generated_tokens, } routes = { @@ -140,12 +155,22 @@ def __init__( "multi_engine": "compile_logits", "compile_logits": [ "multi_engine_prefill", + "prep_for_generation", "autoregressive_preprocess", - "final_step", ], "autoregressive_preprocess": "single_engine", - "single_engine": "compile_logits", - "final_step": "STOP", + "single_engine": [ + "compile_logits", + "generate_new_token", + ], + "prep_for_generation": "autoregressive_preprocess", + "generate_new_token": "compile_generated_tokens", + "compile_generated_tokens": [ + "autoregressive_preprocess", + "compile_generations", + ], + "compile_generations": "process_outputs", + "process_outputs": "STOP", } router = GraphRouter( @@ -197,17 +222,3 @@ def setup_onnx_file_path(self, model_path, sequence_length) -> str: "See `tokenizer` and `config` arguments for details." 
) return onnx_path - - -# NOTE: This is a dummy last step which will be removed. Used as a final step -# for the current routes. -class FinalStep(Operator): - def can_operate(self, *args, **kwargs): - return True - - def run(self, *args, **kwargs): - import numpy - - inference_state = kwargs.get("inference_state") - prompt_logits = inference_state.current_state.get("prompt_logits") - return numpy.concatenate(prompt_logits, axis=1) diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py new file mode 100644 index 0000000000..544af43980 --- /dev/null +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any + +import numpy + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation import TokenGeneratorOperator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["PrepareGeneration"] + + +class PrepareGeneration(Operator): + def __init__( + self, + token_generator: TokenGeneratorOperator, + prompt_sequence_length: int, + sequence_length: int, + ): + self.prompt_sequence_length = prompt_sequence_length + self.sequence_length = sequence_length + self.token_generator_creator = token_generator + + def can_operate(self, inp: Any): + kv_cache = inp.get("kv_cache") + tokens = inp.get("tokens") + + # If the number of prompt tokens is greater than what we've processed, + # don't start generation. Should be equal when started as all prompt logits + # should be accounted for and we should have updated the kv_cache for the single + # token engine. + if len(tokens) == kv_cache.total_num_processed_tokens: + return True + return False + + @staticmethod + def set_generated_length( + max_length: int, + prompt_tokens_length: int, + sequence_length: int, + prompt_sequence_length: int, + max_new_tokens: int, + finish_reason_choices: "FinishReason", # noqa + ): + """ + Determine the length of the generated tokens. The hard cap on the total number + of tokens is based on the sequence length. If max_length is provided and is less + than the sequence length, it will be used to cap the total number of tokens + generated. If it is not provided, the max_new_tokens attribute will be used and + also capped by the sequence length. 
+
+        :param max_length: max_length attribute, provided as input during inference
+        :param prompt_tokens_length: the number of prompt tokens used as part of the
+            generated output
+        :param sequence_length: the sequence length used for the pipeline
+        :param prompt_sequence_length: the prompt sequence length used for the pipeline
+        :param max_new_tokens: the max_new_tokens attribute, which may be provided
+            as part of the input during inference
+        """
+        if max_length:
+            # if max_length provided, use that to cap total tokens generated
+            max_tokens = max_length
+            finish_reason = finish_reason_choices.LENGTH
+        else:
+            # if not provided, max tokens is based on max_new_tokens + prompt tokens
+            max_tokens = (
+                min(max_new_tokens, sequence_length - prompt_sequence_length)
+                + prompt_tokens_length
+            )
+            finish_reason = finish_reason_choices.MAX_NEW_TOKENS
+
+        # hard model/pipeline cap
+        return (
+            (sequence_length, finish_reason_choices.CAPACITY)
+            if sequence_length < max_tokens
+            else (max_tokens, finish_reason)
+        )
+
+    def run(
+        self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs
+    ):
+        prompt_logits = inference_state.current_state.get("prompt_logits")
+        prompt_logits = numpy.concatenate(prompt_logits, axis=1)
+        # TODO: clean this up so we don't have to keep writing current_state
+        # everywhere
+
+        generation_config = inference_state.current_state.get("generation_config")
+        include_prompt_logits = inference_state.current_state.get(
+            "include_prompt_logits"
+        )
+
+        token_generator_creator_output = self.token_generator_creator.run(
+            logits_shape=prompt_logits[0, -1, :].shape,
+            deterministic=not generation_config.do_sample,
+            sampling_temperature=generation_config.temperature,
+            tokens=tokens,
+            **inference_state.current_state,
+        )
+        token_generator = token_generator_creator_output.get("token_generator")
+        token_generator.generate(prompt_logits[0, -1, :])
+
+        max_tokens, length_finish_reason = PrepareGeneration.set_generated_length(
+            max_length=generation_config.max_length,
+            prompt_tokens_length=1,
+            max_new_tokens=generation_config.max_new_tokens,
+            sequence_length=self.sequence_length,
+            prompt_sequence_length=self.prompt_sequence_length,
+            finish_reason_choices=FinishReason,
+        )
+        state_update = {
+            "max_tokens": max_tokens,
+            "length_finish_reason": length_finish_reason,
+            "generated_tokens": [token_generator.tokens[-1]],
+            "generated_logits": [prompt_logits]
+            if include_prompt_logits
+            else [numpy.expand_dims(prompt_logits[:, -1, :], 0)],
+            "finished_reason": [],
+            "token_generator": token_generator,
+        }
+
+        output = {
+            "tokens": token_generator.tokens,
+            "kv_cache": kv_cache,
+            "in_generation": True,
+        }
+        return output, state_update
diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py
index 528dcee0b7..e57e402983 100644
--- a/src/deepsparse/v2/text_generation/process_inputs.py
+++ b/src/deepsparse/v2/text_generation/process_inputs.py
@@ -28,7 +28,7 @@
 
 class GenerationDefaults:
     num_return_sequences = 1
-    max_length = 1024
+    max_length = 100
     max_new_tokens = None
     output_scores = False
     top_k = 0
diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py
new file mode 100644
index 0000000000..ca1cf78521
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/process_outputs.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +from typing import Optional + +import numpy + +from deepsparse.transformers.pipelines.text_generation import ( + FinishReason, + GeneratedText, + TextGenerationOutput, +) +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput +from deepsparse.v2.utils import InferenceState + + +class ProcessOutputs(Operator): + output_schema = TextGenerationOutput + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def _create_generated_text_output( + self, + sequence: str, + finish_reason: Optional[FinishReason] = None, + logits: Optional[numpy.array] = None, + ): + if finish_reason: + return GeneratedText( + text=sequence, + score=logits, + finished=True, + finished_reason=finish_reason.value, + ) + return GeneratedText( + text=sequence, + score=logits, + finished=False, + ) + + def run( + self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + ): + generation_config = inference_state.current_state.get("generation_config") + generated_tokens = inp.generated_tokens + generated_logits = ( + inp.generated_logits if generation_config.output_scores else None + ) + finished_reason = inp.finished_reason + sequences = self.tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + + finished_reason = [f for f in finished_reason if f] + + if generated_logits is not None: + generations = list( + map( + self._create_generated_text_output, + sequences, + finished_reason, + generated_logits, + ) + ) + else: + generations = list( + map(self._create_generated_text_output, sequences, finished_reason) + ) + outputs = dict( + created=datetime.datetime.now(), + prompts=inference_state.current_state.get("prompts"), + generations=generations, + ) + + return outputs diff --git a/src/deepsparse/v2/text_generation/token_generator.py b/src/deepsparse/v2/text_generation/token_generator.py new file mode 100644 index 0000000000..9148d71cc8 --- /dev/null +++ b/src/deepsparse/v2/text_generation/token_generator.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
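+
+# Illustrative usage sketch (the shapes and values below are made up):
+#
+#   op = TokenGeneratorOperator()
+#   out = op.run(logits_shape=(32,), deterministic=True, tokens=[1, 2],
+#                sampling_temperature=1.0)
+#   # out["token_generator"].generate(logits=...) then yields the next token
+#   # id, as done in GenerateNewTokenOperator above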
+from deepsparse.transformers.utils.token_generator import TokenGenerator +from deepsparse.v2.operators import Operator + + +__all__ = ["TokenGeneratorOperator"] + + +class TokenGeneratorOperator(Operator): + def run(self, logits_shape, deterministic, tokens, sampling_temperature, **kwargs): + token_generator = TokenGenerator( + logits_shape=logits_shape, + deterministic=deterministic, + tokens=tokens, + sampling_temperature=sampling_temperature, + **kwargs, + ) + return {"token_generator": token_generator} From 7f3eb12e25ff8ee57caa55c9112e3ed9c610026b Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 16:10:18 +0000 Subject: [PATCH 6/7] initial commit --- src/deepsparse/transformers/helpers.py | 38 ++++++----- src/deepsparse/utils/onnx.py | 8 +-- src/deepsparse/v2/text_generation/pipeline.py | 68 ++++--------------- 3 files changed, 36 insertions(+), 78 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d5fc5ed438..1e733ec1bb 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -30,6 +30,8 @@ from onnx import ModelProto from deepsparse.log import get_main_logger +from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model +from sparsezoo import Model from deepsparse.utils.onnx import ( _MODEL_DIR_ONNX_NAME, model_to_path, @@ -39,6 +41,7 @@ __all__ = [ + "setup_transformers_pipeline", "get_deployment_path", "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", @@ -55,6 +58,7 @@ def setup_transformers_pipeline( sequence_length: int, tokenizer_padding_side: str = "left", engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, ) -> Tuple[ str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] ]: @@ -66,30 +70,27 @@ def setup_transformers_pipeline( :param tokenizer_padding_side: The side to pad on for the tokenizer, either "left" or "right" :param engine_kwargs: The kwargs to pass to the engine + :param onnx_model_name: The name of the onnx model to be loaded. + If not specified, defaults are used (see setup_onnx_file_path) :return The model path, config, tokenizer, and engine kwargs """ - model_path, config, tokenizer = fetch_onnx_file_path(model_path, sequence_length) + model_path, config, tokenizer = setup_onnx_file_path( + model_path, sequence_length, onnx_model_name + ) tokenizer.padding_side = tokenizer_padding_side if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token engine_kwargs = engine_kwargs or {} - if engine_kwargs.get("model_path"): - raise ValueError( - "The engine kwargs already specify " - f"a model path: {engine_kwargs['model_path']}, " - f"but a model path was also provided: {model_path}. " - "Please only provide one." - ) engine_kwargs["model_path"] = model_path return model_path, config, tokenizer, engine_kwargs -def fetch_onnx_file_path( +def setup_onnx_file_path( model_path: str, sequence_length: int, - task: Optional[str] = None, + onnx_model_name: Optional[str] = None, ) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: """ Parses ONNX model from the `model_path` provided. It additionally @@ -97,17 +98,18 @@ def fetch_onnx_file_path( derived from the `model_path` provided. :param model_path: path to the model to be parsed :param sequence_length: maximum sequence length of the model + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. 
If not specified, the default ONNX model + name will be used (refer to `get_deployment_path` for details) :return: file path to the processed ONNX file for the engine to compile """ - deployment_path, onnx_path = get_deployment_path(model_path) + deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) hf_logger = logging.getLogger("transformers") hf_logger_level = hf_logger.level hf_logger.setLevel(logging.ERROR) - config = transformers.PretrainedConfig.from_pretrained( - deployment_path, finetuning_task=task - ) + config = transformers.PretrainedConfig.from_pretrained(deployment_path) hf_logger.setLevel(hf_logger_level) trust_remote_code = False @@ -145,13 +147,13 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if MODEL_ONNX_NAME not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{MODEL_ONNX_NAME} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, MODEL_ONNX_NAME)}" ) - return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + return model_path, os.path.join(model_path, MODEL_ONNX_NAME) elif model_path.startswith("zoo:") or model_path.startswith("hf:"): onnx_model_path = model_to_path(model_path) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index ae0913ffd7..35d932c75d 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "_MODEL_DIR_ONNX_NAME", + "MODEL_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -_MODEL_DIR_ONNX_NAME = "model.onnx" +MODEL_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment.path # default to the main onnx file for the model - model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path + model = model.deployment.get_file(MODEL_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -161,7 +161,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / _MODEL_DIR_ONNX_NAME) + return str(model_path / MODEL_ONNX_NAME) return model diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..fdb31f1c6c 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict +from typing import Dict, Optional +from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter @@ -45,23 +46,20 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, - engine_kwargs: Dict = None, + engine_kwargs: Optional[Dict] = None, ): + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, sequence_length, engine_kwargs=engine_kwargs + ) pipeline_state = PipelineState() pipeline_state_vals = {} - # TODO: The code below will be replaced with a transformers set-up Operator. - self.tokenizer = None - model_path = self.setup_onnx_file_path(model_path, sequence_length) - self.tokenizer.padding_side = "left" - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - if not engine_kwargs: - engine_kwargs = {} - engine_kwargs["model_path"] = model_path - if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False @@ -80,7 +78,7 @@ def __init__( ) # NOTE: Currently using pipeline state. Can swap to simply pass in the - # attributes to the specific Operator that neeed them, as class attributes. + # attributes to the specific Operator that need them, as class attributes. pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_engine_operator.onnx_input_names_no_cache @@ -180,45 +178,3 @@ def __init__( super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) - - # TODO: Move to be part of a generic transformers set-up Operator. - def setup_onnx_file_path(self, model_path, sequence_length) -> str: - import logging - - import transformers - from transformers import AutoTokenizer - - from deepsparse.transformers.helpers import get_deployment_path - - """ - Parses ONNX model from the `model_path` provided. It additionally - creates config and tokenizer objects from the `deployment path`, - derived from the `model_path` provided. - - :return: file path to the processed ONNX file for the engine to compile - """ - deployment_path, onnx_path = get_deployment_path(model_path) - - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self._trust_remote_code = False - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=sequence_length, - ) - - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." 
- ) - return onnx_path From 0901a01dea3e09856df98bf6605cb18d92b3f19f Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 16:15:41 +0000 Subject: [PATCH 7/7] ready for reviews --- src/deepsparse/transformers/helpers.py | 25 ++++++++++++------------- src/deepsparse/utils/onnx.py | 4 ++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 1e733ec1bb..25a179c5d0 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -30,18 +30,11 @@ from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model -from sparsezoo import Model -from deepsparse.utils.onnx import ( - _MODEL_DIR_ONNX_NAME, - model_to_path, - truncate_onnx_model, -) +from deepsparse.utils.onnx import MODEL_ONNX_NAME, model_to_path, truncate_onnx_model from sparsezoo.utils import save_onnx __all__ = [ - "setup_transformers_pipeline", "get_deployment_path", "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", @@ -128,7 +121,9 @@ def setup_onnx_file_path( return onnx_path, config, tokenizer -def get_deployment_path(model_path: str) -> Tuple[str, str]: +def get_deployment_path( + model_path: str, onnx_model_name: Optional[str] = None +) -> Tuple[str, str]: """ Returns the path to the deployment directory for the given model path and the path to the mandatory @@ -137,9 +132,13 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. If not specified, the default ONNX model + name will be used. :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ + onnx_model_name = onnx_model_name or MODEL_ONNX_NAME if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -147,13 +146,13 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if MODEL_ONNX_NAME not in model_files: + if onnx_model_name not in model_files: raise ValueError( - f"{MODEL_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{model_path}. 
Be sure that an export of the model is written to " - f"{os.path.join(model_path, MODEL_ONNX_NAME)}" + f"{os.path.join(model_path, onnx_model_name)}" ) - return model_path, os.path.join(model_path, MODEL_ONNX_NAME) + return model_path, os.path.join(model_path, onnx_model_name) elif model_path.startswith("zoo:") or model_path.startswith("hf:"): onnx_model_path = model_to_path(model_path) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 35d932c75d..e4b41f3286 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -143,10 +143,10 @@ def model_to_path(model: Union[str, Model, File]) -> str: from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) if not os.path.isfile(onnx_path): raise ValueError( - f"Could not find the ONNX model file '{_MODEL_DIR_ONNX_NAME}' in the " + f"Could not find the ONNX model file '{MODEL_ONNX_NAME}' in the " f"Hugging Face Hub repository located at {deployment_path}. Please " f"ensure the model has been correctly exported to ONNX format and " f"exists in the repository."
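+# Illustrative usage (a sketch; the stub below is hypothetical):
+#
+#   model_to_path("hf:org/repo")
+#   # downloads a snapshot of the Hugging Face repo and returns
+#   # <snapshot_dir>/model.onnx, raising the ValueError above when the
+#   # ONNX file is absent from the snapshot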