From aa18bace892521e6a72b0720e75a955206f8dce8 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 26 Oct 2023 13:22:35 -0400 Subject: [PATCH 1/7] Pipelines Refactor - Initial Impl (#1287) --- src/deepsparse/v2/__init__.py | 21 ++++ src/deepsparse/v2/operators/__init__.py | 17 +++ src/deepsparse/v2/operators/operator.py | 90 ++++++++++++++++ src/deepsparse/v2/pipeline.py | 102 ++++++++++++++++++ src/deepsparse/v2/routers/__init__.py | 17 +++ src/deepsparse/v2/routers/router.py | 95 ++++++++++++++++ src/deepsparse/v2/schedulers/__init__.py | 18 ++++ src/deepsparse/v2/schedulers/scheduler.py | 63 +++++++++++ .../v2/schedulers/scheduler_group.py | 64 +++++++++++ src/deepsparse/v2/utils/__init__.py | 18 ++++ src/deepsparse/v2/utils/context.py | 42 ++++++++ src/deepsparse/v2/utils/types.py | 28 +++++ tests/deepsparse/v2/__init__.py | 0 tests/deepsparse/v2/test_basic_pipeline.py | 45 ++++++++ 14 files changed, 620 insertions(+) create mode 100644 src/deepsparse/v2/__init__.py create mode 100644 src/deepsparse/v2/operators/__init__.py create mode 100644 src/deepsparse/v2/operators/operator.py create mode 100644 src/deepsparse/v2/pipeline.py create mode 100644 src/deepsparse/v2/routers/__init__.py create mode 100644 src/deepsparse/v2/routers/router.py create mode 100644 src/deepsparse/v2/schedulers/__init__.py create mode 100644 src/deepsparse/v2/schedulers/scheduler.py create mode 100644 src/deepsparse/v2/schedulers/scheduler_group.py create mode 100644 src/deepsparse/v2/utils/__init__.py create mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 src/deepsparse/v2/utils/types.py create mode 100644 tests/deepsparse/v2/__init__.py create mode 100644 tests/deepsparse/v2/test_basic_pipeline.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..4a897be06f --- /dev/null +++ b/src/deepsparse/v2/__init__.py @@ -0,0 +1,21 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pipeline import * +from .operators import * +from .routers import * +from .schedulers import * +from .utils import * diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py new file mode 100644 index 0000000000..8f7e6a169d --- /dev/null +++ b/src/deepsparse/v2/operators/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .operator import *
diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py
new file mode 100644
index 0000000000..30e1a48379
--- /dev/null
+++ b/src/deepsparse/v2/operators/operator.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Optional, Type
+
+from pydantic import BaseModel
+
+from deepsparse.v2.utils import Context, OperatorSchema
+
+
+__all__ = ["Operator"]
+
+
+class Operator(ABC):
+    """
+    Base operator class - can represent any part of an ML pipeline
+    """
+
+    # expected structured input and output types, to be defined by child classes
+    input_schema: Optional[Type[OperatorSchema]] = None
+    output_schema: Optional[Type[OperatorSchema]] = None
+
+    @abstractmethod
+    def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema:
+        """
+        :param inp: operator input, as the defined input schema if applicable
+        :param context: pipeline context of already run operators
+        :return: result of this operator as the defined output schema if applicable
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def has_input_schema(cls) -> bool:
+        """
+        :return: True if this class has a defined pydantic input schema
+        """
+        return issubclass(cls.input_schema, BaseModel)
+
+    @classmethod
+    def has_output_schema(cls) -> bool:
+        """
+        :return: True if this class has a defined pydantic output schema
+        """
+        return issubclass(cls.output_schema, BaseModel)
+
+    def __call__(
+        self,
+        *args,
+        context: Optional[Context] = None,
+        **kwargs,
+    ) -> OperatorSchema:
+        """
+        Parses inputs to this Operator and runs the run() method of this operator
+
+        :param args: an unnamed arg may only be provided
+            if it is of the type of the input_schema
+        :param context: pipeline context to pass to operator
+        :param kwargs: kwargs when not initializing from an instantiated schema
+        :return: operator output
+        """
+        if len(args) > 1:
+            raise ValueError(
+                f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}"
+            )
+
+        if len(args) == 1:
+            if self.input_schema is not None and isinstance(args[0], self.input_schema):
+                inference_input = args[0]
+            else:
+                raise ValueError(
+                    f"1 arg supplied to Operator {self.__class__.__name__} but was not "
+                    f"of expected type {self.input_schema}, found {type(args[0])}"
+                )
+        elif self.has_input_schema():
+            inference_input = self.input_schema(**kwargs)
+        else:
+            inference_input = kwargs
+        return self.run(inference_input, context=context)
diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py
new file mode 100644
index 0000000000..0ec580687d
--- /dev/null
+++ b/src/deepsparse/v2/pipeline.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List
+
+from pydantic import BaseModel, Field, PrivateAttr
+
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.routers import Router
+from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup
+
+
+__all__ = ["Pipeline"]
+
+
+class Pipeline(BaseModel):
+    """
+    Pipeline accepts a series of operators, schedulers, and a router which define
+    an end-to-end ML transformation.
+
+    Calling a pipeline runs these transformations
+    """
+
+    stages: List[Operator] = Field(
+        required=True,
+        description="In-order list of operators that make up this pipeline",
+    )
+    router: Router = Field(
+        default_factory=Router,
+        description="Router object to determine order and run the stages. "
+        "Defaults to the base Router object",
+    )
+    schedulers: List[OperatorScheduler] = Field(
+        default_factory=lambda: [OperatorScheduler()],
+        description="List of schedulers to run operators in order of priority",
+    )
+
+    _scheduler_group: SchedulerGroup = PrivateAttr()
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.validate()
+
+        # SchedulerGroup handles running all schedulers in order of priority
+        self._scheduler_group = SchedulerGroup(self.schedulers)
+
+    def __call__(self, *args, return_context: bool = False, **kwargs):
+        """
+        :param return_context: if True, returns tuple of the pipeline output
+            and entire context. Default False
+        :return: output of the pipeline stages run with the router for the given input
+        """
+        if len(args) > 1:
+            raise ValueError(
+                "Only 1 in-line argument may be supplied to Pipeline which "
+                f"must be a Schema, found: {len(args)}"
+            )
+        if args and kwargs:
+            raise ValueError(
+                "Pipeline can only run either a single in-line argument schema or a "
+                f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs"
+            )
+
+        pipeline_input = args[0] if args else kwargs
+        pipeline_output, context = self.router.run(
+            inp=pipeline_input,
+            operators=self.stages,
+            scheduler=self._scheduler_group,
+        )
+
+        if return_context:
+            return pipeline_output, context
+
+        return pipeline_output
+
+    def validate(self):
+        router_validation = self.router.validate(self.stages)
+
+        if router_validation is False:
+            # default error message
+            stage_types = [type(stage) for stage in self.stages]
+            raise ValueError(
+                f"Invalid Router: {type(self.router)} for stages: {stage_types}"
+            )
+        elif isinstance(router_validation, str):
+            raise ValueError(f"Invalid Router for stages: {router_validation}")
diff --git a/src/deepsparse/v2/routers/__init__.py b/src/deepsparse/v2/routers/__init__.py
new file mode 100644
index 0000000000..8718bedeb4
--- /dev/null
+++ b/src/deepsparse/v2/routers/__init__.py
@@ -0,0 +1,17 @@
+# flake8: noqa
+
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .router import *
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py
new file mode 100644
index 0000000000..284c348c10
--- /dev/null
+++ b/src/deepsparse/v2/routers/router.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Tuple, Union
+
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.schedulers import OperatorScheduler
+from deepsparse.v2.utils import Context, OperatorSchema
+
+
+__all__ = ["Router"]
+
+
+class Router:
+    """
+    Routers must implement a run method which runs a series of operators
+    for a pipeline for a given input. Base Router runs operators linearly
+    in a series
+    """
+
+    @staticmethod
+    def run(
+        inp: OperatorSchema,
+        operators: List[Operator],
+        scheduler: OperatorScheduler,
+    ) -> Tuple[OperatorSchema, Context]:
+        """
+        :param inp: input to the first operator of the series
+        :param operators: list of operators to run
+        :param scheduler: scheduler to submit operators to
+        :return: final output of the operators
+        """
+        context = Context()
+
+        # run operators linearly
+        operator_input = inp
+        for operator in operators:
+            output_future = scheduler.submit(
+                operator=operator, operator_input=operator_input, context=context
+            )
+
+            # wait for future to resolve
+            operator_output = output_future.result()
+
+            # update context
+            context.update(
+                operator=operator,
+                input=operator_input,
+                output=operator_output,
+            )
+
+            # previous output becomes next input
+            operator_input = operator_output
+
+        return operator_output, context
+
+    @staticmethod
+    def validate(operators: List[Operator]) -> Union[bool, str]:
+        """
+        :param operators: operators that this Router could potentially run over
+        :return: True if this Router can run this series of operators. Base Router
+            runs any series of operators that is non empty and whose input and output
+            schemas align. 
If not valid, either False or an error string will be + returned + """ + if len(operators) < 1: + return "No operators found" + + for idx in range(len(operators) - 1): + current_output_schema = operators[idx].output_schema + next_input_schema = operators[idx + 1].input_schema + + if current_output_schema is None or next_input_schema is None: + # if no input/output schema defined, assume operator can run + # without schema + continue + + if current_output_schema != next_input_schema: + return ( + f"Operator at idx {idx}: {type(operators[idx])} has invalid " + f"output schema {current_output_schema} for next operator " + f"{type(operators[idx + 1])} which requires {next_input_schema}" + ) diff --git a/src/deepsparse/v2/schedulers/__init__.py b/src/deepsparse/v2/schedulers/__init__.py new file mode 100644 index 0000000000..04c37077e1 --- /dev/null +++ b/src/deepsparse/v2/schedulers/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .scheduler import * +from .scheduler_group import * diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py new file mode 100644 index 0000000000..53f0c8f625 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future, ThreadPoolExecutor + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["OperatorScheduler"] + + +class OperatorScheduler: + """ + OperatorSchedulers should implement a `submit` function that asynchronously + runs an operator and its input and returns a Future. 
Priority of operators + to run and resources they are run on are deferred to specific OperatorScheduler + implementations + + Base OperatorScheduler behaves as a simple queue deferring to ThreadPoolExecutor + + :param max_workers: maximum number of threads to execute at once + """ + + def __init__(self, max_workers: int = 1): + self._threadpool = ThreadPoolExecutor(max_workers=max_workers) + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + if isinstance(operator_input, dict): + return self._threadpool.submit(operator, context=context, **operator_input) + return self._threadpool.submit(operator, operator_input, context=context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. + Base OperatorScheduler always returns True + """ + return True diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py new file mode 100644 index 0000000000..2f797b30c7 --- /dev/null +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from concurrent.futures import Future +from typing import List + +from deepsparse.v2.operators import Operator +from deepsparse.v2.schedulers.scheduler import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +__all__ = ["SchedulerGroup"] + + +class SchedulerGroup(OperatorScheduler): + """ + Wrapper for a series of schedulers. Runs submitted operators on the first + scheduler that can process a given input + + :param schedulers: list of schedulers to pass operators to + """ + + def __init__(self, schedulers: List[OperatorScheduler]): + self.schedulers = schedulers + + def submit( + self, + operator: Operator, + operator_input: OperatorSchema, + context: Context, + ) -> Future: + """ + :param operator: operator to run + :param operator_input: input schema to the operator + :param context: context of already run operators + :return: future referencing the asynchronously run output of the operator + """ + for scheduler in self.schedulers: + if scheduler.can_process(operator, operator_input): + return scheduler.submit(operator, operator_input, context) + + def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + """ + :param operator: operator to check + :param operator_input: operator_input to check + :return: True if this Operator can process the given operator and input. 
+ SchedulerGroup always returns True + """ + return any( + scheduler.can_process(operator, operator_input) + for scheduler in self.schedulers + ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py new file mode 100644 index 0000000000..4f36eeb448 --- /dev/null +++ b/src/deepsparse/v2/utils/__init__.py @@ -0,0 +1,18 @@ +# flake8: noqa + +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import * +from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py new file mode 100644 index 0000000000..81fe26de61 --- /dev/null +++ b/src/deepsparse/v2/utils/context.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, List, NamedTuple + +from deepsparse.v2.utils.types import OperatorSchema + + +__all__ = ["Context"] + + +class StageInfo(NamedTuple): + operator: Callable + input: OperatorSchema + output: OperatorSchema + + +class Context: + """ + Context contains the full history of operators and their inputs and outputs + in a pipeline + """ + + def __init__(self): + self.stages_executed: List[StageInfo] = [] + + def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): + self.stages_executed.append( + StageInfo(operator=operator, input=input, output=output) + ) diff --git a/src/deepsparse/v2/utils/types.py b/src/deepsparse/v2/utils/types.py new file mode 100644 index 0000000000..3e4b974453 --- /dev/null +++ b/src/deepsparse/v2/utils/types.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Types to support deepsparse pipelines +""" + +from typing import Any, Dict, Union + +from pydantic import BaseModel + + +__all__ = ["OperatorSchema"] + + +# Operator inputs and outputs may either be a pydantic base model or a dict of kwargs +OperatorSchema = Union[BaseModel, Dict[str, Any]] diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py new file mode 100644 index 0000000000..d39bc61c8c --- /dev/null +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -0,0 +1,45 @@ +""" +Simple example and test of a dummy pipeline +""" + +from pydantic import BaseModel + +from deepsparse.v2 import Pipeline +from deepsparse.v2.operators import Operator +from deepsparse.v2.routers import Router +from deepsparse.v2.schedulers import OperatorScheduler +from deepsparse.v2.utils import Context, OperatorSchema + + +class IntSchema(BaseModel): + value: int + + +class AddOneOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 1) + + +class AddTwoOperator(Operator): + input_schema = IntSchema + output_schema = IntSchema + + def run(self, inp: IntSchema, context: Context) -> OperatorSchema: + return IntSchema(value=inp.value + 2) + + +AddThreePipeline = Pipeline( + stages=[AddOneOperator(), AddTwoOperator()], + router=Router(), + schedulers=[OperatorScheduler()], +) + + +def test_run_simple_pipeline(): + pipeline_input = IntSchema(value=5) + pipeline_output = AddThreePipeline(pipeline_input) + + assert pipeline_output.value == 8 From 4e5f2af1da99b5ecd7e76ead5a056734e88e0af1 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 31 Oct 2023 16:24:10 -0400 Subject: [PATCH 2/7] [Pipeline Refactor] Additional functionality, engine operator, linear router and image classification pipeline/operators/example (#1325) * initial functionality and working example with image classification * remove testing image * update args * initial functionality and working example with image classification * remove testing image * pr comments * defines schemas for operators and test * add image classification test, PR comments * fix input/output handling in pipeline and operator base classes to be more generic; remove context * add additional operator input message * typo fix --- src/deepsparse/v2/__init__.py | 2 +- .../v2/image_classification/__init__.py | 20 +++ .../v2/image_classification/pipeline.py | 62 ++++++++ .../postprocess_operator.py | 81 ++++++++++ .../preprocess_operator.py | 149 ++++++++++++++++++ .../v2/operators/engine_operator.py | 133 ++++++++++++++++ src/deepsparse/v2/operators/operator.py | 92 +++++++---- src/deepsparse/v2/pipeline.py | 130 ++++++++------- src/deepsparse/v2/routers/router.py | 88 ++++++----- src/deepsparse/v2/schedulers/scheduler.py | 14 +- .../v2/schedulers/scheduler_group.py | 16 +- src/deepsparse/v2/utils/__init__.py | 1 - src/deepsparse/v2/utils/context.py | 42 ----- tests/deepsparse/v2/__init__.py | 13 ++ tests/deepsparse/v2/test_basic_pipeline.py | 31 +++- .../v2/test_image_classification.py | 39 +++++ 16 files changed, 709 insertions(+), 204 deletions(-) create mode 100644 src/deepsparse/v2/image_classification/__init__.py create mode 100644 src/deepsparse/v2/image_classification/pipeline.py create mode 100644 src/deepsparse/v2/image_classification/postprocess_operator.py create mode 
100644 src/deepsparse/v2/image_classification/preprocess_operator.py create mode 100644 src/deepsparse/v2/operators/engine_operator.py delete mode 100644 src/deepsparse/v2/utils/context.py create mode 100644 tests/deepsparse/v2/test_image_classification.py diff --git a/src/deepsparse/v2/__init__.py b/src/deepsparse/v2/__init__.py index 4a897be06f..29fcd4126c 100644 --- a/src/deepsparse/v2/__init__.py +++ b/src/deepsparse/v2/__init__.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .pipeline import * from .operators import * +from .pipeline import * from .routers import * from .schedulers import * from .utils import * diff --git a/src/deepsparse/v2/image_classification/__init__.py b/src/deepsparse/v2/image_classification/__init__.py new file mode 100644 index 0000000000..8668227df7 --- /dev/null +++ b/src/deepsparse/v2/image_classification/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# flake8: noqa +from .postprocess_operator import * +from .preprocess_operator import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/image_classification/pipeline.py b/src/deepsparse/v2/image_classification/pipeline.py new file mode 100644 index 0000000000..3d7887a701 --- /dev/null +++ b/src/deepsparse/v2/image_classification/pipeline.py @@ -0,0 +1,62 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import logging
+import warnings
+from typing import Dict, Optional, Tuple, Union
+
+from deepsparse.v2.image_classification.postprocess_operator import (
+    ImageClassificationPostProcess,
+)
+from deepsparse.v2.image_classification.preprocess_operator import (
+    ImageClassificationPreProcess,
+)
+from deepsparse.v2.operators.engine_operator import EngineOperator
+from deepsparse.v2.pipeline import Pipeline
+from deepsparse.v2.routers.router import LinearRouter
+from deepsparse.v2.schedulers.scheduler import OperatorScheduler
+
+
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["ImageClassificationPipeline"]
+
+
+class ImageClassificationPipeline(Pipeline):
+    def __init__(
+        self,
+        model_path: str,
+        engine_kwargs: Optional[Dict] = None,
+        class_names: Union[None, str, Dict[str, str]] = None,
+        image_size: Optional[Tuple[int]] = None,
+        top_k: int = 1,
+    ):
+        if not engine_kwargs:
+            engine_kwargs = {}
+            engine_kwargs["model_path"] = model_path
+        elif engine_kwargs.get("model_path") != model_path:
+            warnings.warn(f"Updating engine_kwargs to include {model_path}")
+            engine_kwargs["model_path"] = model_path
+
+        engine = EngineOperator(**engine_kwargs)
+        preprocess = ImageClassificationPreProcess(
+            model_path=engine.model_path, image_size=image_size
+        )
+        postprocess = ImageClassificationPostProcess(
+            top_k=top_k, class_names=class_names
+        )
+
+        ops = [preprocess, engine, postprocess]
+        router = LinearRouter(end_route=len(ops))
+        scheduler = [OperatorScheduler()]
+        super().__init__(ops=ops, router=router, schedulers=scheduler)
diff --git a/src/deepsparse/v2/image_classification/postprocess_operator.py b/src/deepsparse/v2/image_classification/postprocess_operator.py
new file mode 100644
index 0000000000..9231113368
--- /dev/null
+++ b/src/deepsparse/v2/image_classification/postprocess_operator.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from typing import Dict, List, Union
+
+import numpy
+from pydantic import BaseModel, Field
+
+from deepsparse.v2.operators import Operator
+
+
+class ImageClassificationOutput(BaseModel):
+    """
+    Output model for image classification
+    """
+
+    labels: List[Union[int, str, List[int], List[str]]] = Field(
+        description="List of labels, one for each prediction"
+    )
+    scores: List[Union[float, List[float]]] = Field(
+        description="List of scores, one for each prediction"
+    )
+
+
+__all__ = ["ImageClassificationPostProcess"]
+
+
+class ImageClassificationPostProcess(Operator):
+    """
+    Image Classification post-processing Operator. This Operator is responsible for
+    processing outputs from the engine and returning the classification results to
+    the user, using the ImageClassificationOutput structure.
+ """ + + input_schema = None + output_schema = ImageClassificationOutput + + def __init__( + self, top_k: int = 1, class_names: Union[None, str, Dict[str, str]] = None + ): + self.top_k = top_k + if isinstance(class_names, str) and class_names.endswith(".json"): + self._class_names = json.load(open(class_names)) + elif isinstance(class_names, dict): + self._class_names = class_names + else: + self._class_names = None + + def run(self, inp: "EngineOperatorOutputs", **kwargs) -> Dict: # noqa: F821 + labels, scores = [], [] + inp = inp.engine_outputs + for prediction_batch in inp[0]: + label = (-prediction_batch).argsort()[: self.top_k] + score = prediction_batch[label] + labels.append(label) + scores.append(score.tolist()) + + if self._class_names is not None: + labels = numpy.vectorize(self._class_names.__getitem__)(labels) + labels = labels.tolist() + + if isinstance(labels[0], numpy.ndarray): + labels = [label.tolist() for label in labels] + + if len(labels) == 1: + labels = labels[0] + scores = scores[0] + + return {"scores": scores, "labels": labels} diff --git a/src/deepsparse/v2/image_classification/preprocess_operator.py b/src/deepsparse/v2/image_classification/preprocess_operator.py new file mode 100644 index 0000000000..9b4517a44c --- /dev/null +++ b/src/deepsparse/v2/image_classification/preprocess_operator.py @@ -0,0 +1,149 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Tuple + +import numpy +import onnx +from PIL import Image +from torchvision import transforms + +from deepsparse.image_classification.constants import ( + IMAGENET_RGB_MEANS, + IMAGENET_RGB_STDS, +) +from deepsparse.pipelines.computer_vision import ComputerVisionSchema +from deepsparse.v2.operators import Operator + + +class ImageClassificationInput(ComputerVisionSchema): + """ + Input model for image classification + """ + + +__all__ = ["ImageClassificationPreProcess"] + + +class ImageClassificationPreProcess(Operator): + """ + Image Classification pre-processing operator. This Operator is expected to process + the user inputs and prepare them for the engine. Inputs to this Operator are + expected to follow the ImageClassificationInput schema. 
+    """
+
+    input_schema = ImageClassificationInput
+    output_schema = None
+
+    def __init__(self, model_path: str, image_size: Optional[Tuple[int]] = None):
+        self.model_path = model_path
+        self._image_size = image_size or self._infer_image_size()
+        non_rand_resize_scale = 256.0 / 224.0  # standard used
+        self._pre_normalization_transforms = transforms.Compose(
+            [
+                transforms.Resize(
+                    tuple(
+                        [
+                            round(non_rand_resize_scale * size)
+                            for size in self._image_size
+                        ]
+                    )
+                ),
+                transforms.CenterCrop(self._image_size),
+            ]
+        )
+
+    def run(self, inp: ImageClassificationInput, **kwargs) -> Dict:
+        """
+        Pre-Process the Inputs for DeepSparse Engine
+
+        :param inp: input model
+        :return: list of preprocessed numpy arrays
+        """
+
+        if isinstance(inp.images, numpy.ndarray):
+            image_batch = inp.images
+        else:
+            if isinstance(inp.images, str):
+                inp.images = [inp.images]
+
+            image_batch = list(map(self._preprocess_image, inp.images))
+
+            # build batch
+            image_batch = numpy.stack(image_batch, axis=0)
+
+        original_dtype = image_batch.dtype
+        image_batch = numpy.ascontiguousarray(image_batch, dtype=numpy.float32)
+
+        if original_dtype == numpy.uint8:
+            image_batch /= 255
+            # normalize entire batch
+            image_batch -= numpy.asarray(IMAGENET_RGB_MEANS).reshape((-1, 3, 1, 1))
+            image_batch /= numpy.asarray(IMAGENET_RGB_STDS).reshape((-1, 3, 1, 1))
+
+        return {"engine_inputs": [image_batch]}
+
+    def _preprocess_image(self, image) -> numpy.ndarray:
+        if isinstance(image, List):
+            # image given as raw list
+            image = numpy.asarray(image)
+            if image.dtype == numpy.float32:
+                # image is already processed, append and continue
+                return image
+            # assume raw image input
+            # put image in PIL format for torchvision processing
+            image = image.astype(numpy.uint8)
+            if image.shape[0] < image.shape[-1]:
+                # put channel last
+                image = numpy.einsum("cwh->whc", image)
+            image = Image.fromarray(image)
+        elif isinstance(image, str):
+            # load image from string filepath
+            image = Image.open(image).convert("RGB")
+        elif isinstance(image, numpy.ndarray):
+            image = image.astype(numpy.uint8)
+            if image.shape[0] < image.shape[-1]:
+                # put channel last
+                image = numpy.einsum("cwh->whc", image)
+            image = Image.fromarray(image)
+
+        if not isinstance(image, Image.Image):
+            raise ValueError(
+                f"inputs to {self.__class__.__name__} must be a string image "
+                "file path(s), a list representing a raw image, "
+                "PIL.Image.Image object(s), or a numpy array representing "
+                f"the entire pre-processed batch. Found {type(image)}"
+            )
+
+        # apply resize and center crop
+        image = self._pre_normalization_transforms(image)
+        image_numpy = numpy.array(image)
+        image.close()
+
+        # make channel first dimension
+        image_numpy = image_numpy.transpose(2, 0, 1)
+        return image_numpy
+
+    def _infer_image_size(self) -> Tuple[int, ...]:
+        """
+        Infer and return the expected shape of the input tensor
+
+        :return: The expected shape of the input tensor from onnx graph
+        """
+        onnx_model = onnx.load(self.model_path)
+        input_tensor = onnx_model.graph.input[0]
+        return (
+            input_tensor.type.tensor_type.shape.dim[2].dim_value,
+            input_tensor.type.tensor_type.shape.dim[3].dim_value,
+        )
diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py
new file mode 100644
index 0000000000..aac94a7697
--- /dev/null
+++ b/src/deepsparse/v2/operators/engine_operator.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+from deepsparse import Context, Engine, MultiModelEngine, Scheduler
+from deepsparse.benchmark import ORTEngine
+from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs
+from deepsparse.v2.operators import Operator
+
+
+DEEPSPARSE_ENGINE = "deepsparse"
+ORT_ENGINE = "onnxruntime"
+
+SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE]
+
+__all__ = ["EngineOperator"]
+
+
+class EngineOperatorInputs(BaseModel):
+    engine_inputs: List = Field(description="engine_inputs")
+
+
+class EngineOperatorOutputs(BaseModel):
+    engine_outputs: List = Field(description="engine outputs")
+
+
+class EngineOperator(Operator):
+    input_schema = EngineOperatorInputs
+    output_schema = EngineOperatorOutputs
+
+    def __init__(
+        self,
+        model_path: str,
+        engine_type: str = DEEPSPARSE_ENGINE,
+        batch_size: Optional[int] = 1,
+        num_cores: int = None,
+        num_streams: int = None,
+        scheduler: Scheduler = None,
+        input_shapes: List[List[int]] = None,
+        engine_context: Optional[Context] = None,
+    ):
+
+        self._batch_size = batch_size
+        self.model_path = model_to_path(model_path)
+        self.engine_context = engine_context
+
+        if self.engine_context is not None:
+            num_cores = num_cores or self.engine_context.num_cores
+            if self.engine_context.num_cores != num_cores:
+                raise ValueError(
+                    f"num_cores mismatch. Expected {self.engine_context.num_cores} "
+                    f"from passed context, but got {num_cores} while "
+                    f"instantiating Pipeline"
+                )
+
+        engine_args = dict(
+            batch_size=self._batch_size,
+            num_cores=num_cores,
+            input_shapes=input_shapes,
+        )
+        if engine_type.lower() == DEEPSPARSE_ENGINE:
+            engine_args["scheduler"] = scheduler
+            engine_args["num_streams"] = num_streams
+
+        self.engine = self._create_engine(self.model_path, engine_type, engine_args)
+
+    def _create_engine(
+        self, onnx_file_path: str, engine_type: str, engine_args: Dict
+    ) -> Union[Engine, MultiModelEngine, ORTEngine]:
+        """
+        Create an inference engine for a given ONNX model
+
+        :param onnx_file_path: path to ONNX model file
+        :param engine_type: type of engine to create.
+        :param engine_args: arguments to pass to engine constructor
+        :param context: context to use for engine
+        :return: inference engine
+        """
+        engine_type = engine_type.lower()
+
+        if engine_type == DEEPSPARSE_ENGINE:
+            if self.engine_context is not None and isinstance(
+                self.engine_context, Context
+            ):
+                engine_args.pop("num_cores", None)
+                engine_args.pop("scheduler", None)
+                engine_args.pop("num_streams", None)
+                engine_args["context"] = self.engine_context
+                return MultiModelEngine(
+                    model=onnx_file_path,
+                    **engine_args,
+                )
+            engine_args.pop("cache_output_bools", None)
+            return Engine(onnx_file_path, **engine_args)
+
+        if engine_type == ORT_ENGINE:
+            return ORTEngine(onnx_file_path, **engine_args)
+
+        raise ValueError(
+            f"Unknown engine_type {engine_type}. Supported values include: "
+            f"{SUPPORTED_PIPELINE_ENGINES}"
+        )
+
+    def run(self, inp: EngineOperatorInputs) -> Dict:
+        inp = inp.engine_inputs
+        batches, orig_batch_size = self.expand_inputs(engine_inputs=inp)
+        batches_outputs = list(map(self.engine, batches))
+        engine_outputs = self.condense_inputs(
+            batch_outputs=batches_outputs, orig_batch_size=orig_batch_size
+        )
+        return {"engine_outputs": engine_outputs}
+
+    def expand_inputs(self, **kwargs):
+        return split_engine_inputs(kwargs["engine_inputs"], self._batch_size)
+
+    def condense_inputs(self, **kwargs):
+        batch_outputs = kwargs["batch_outputs"]
+        orig_batch_size = kwargs["orig_batch_size"]
+        return join_engine_outputs(batch_outputs, orig_batch_size)
diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py
index 30e1a48379..c3a3e28b78 100644
--- a/src/deepsparse/v2/operators/operator.py
+++ b/src/deepsparse/v2/operators/operator.py
@@ -13,39 +13,32 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Optional, Type
+from typing import Any, Optional, Type
 
 from pydantic import BaseModel
 
-from deepsparse.v2.utils import Context, OperatorSchema
-
 
 __all__ = ["Operator"]
 
 
 class Operator(ABC):
     """
-    Base operator class - can represent any part of an ML pipeline
+    Base operator class - an operator should be defined for each atomic, functional
+    part of the pipeline.
     """
 
     # expected structured input and output types, to be defined by child classes
-    input_schema: Optional[Type[OperatorSchema]] = None
-    output_schema: Optional[Type[OperatorSchema]] = None
-
-    @abstractmethod
-    def run(self, inp: OperatorSchema, context: Context) -> OperatorSchema:
-        """
-        :param inp: operator input, as the defined input schema if applicable
-        :param context: pipeline context of already run operators
-        :return: result of this operator as the defined output schema if applicable
-        """
-        raise NotImplementedError
+    input_schema: Optional[Type[BaseModel]] = None
+    output_schema: Optional[Type[BaseModel]] = None
 
     @classmethod
     def has_input_schema(cls) -> bool:
         """
         :return: True if this class has a defined pydantic input schema
         """
+        if not cls.input_schema:
+            return False
+
         return issubclass(cls.input_schema, BaseModel)
 
     @classmethod
@@ -53,38 +46,73 @@ def has_output_schema(cls) -> bool:
         """
         :return: True if this class has a defined pydantic output schema
         """
+        if not cls.output_schema:
+            return False
+
         return issubclass(cls.output_schema, BaseModel)
 
     def __call__(
         self,
         *args,
-        context: Optional[Context] = None,
         **kwargs,
-    ) -> OperatorSchema:
+    ) -> Any:
         """
         Parses inputs to this Operator and runs the run() method of this operator
 
-        :param args: an unnamed arg may only be provided
-            if it is of the type of the input_schema
+        :param args: an unnamed arg may only be provided if it is of the type of the
+            input_schema
         :param context: pipeline context to pass to operator
         :param kwargs: kwargs when not initializing from an instantiated schema
        :return: operator output
         """
-        if len(args) > 1:
-            raise ValueError(
-                f"Only 1 unnamed arg may be supplied to an Operator, found {len(args)}"
-            )
-
-        if len(args) == 1:
-            if self.input_schema is not None and isinstance(args[0], self.input_schema):
+        if self.has_input_schema():
+            if len(args) > 1:
+                raise ValueError(
+                    f"The operator requires an {self.input_schema}. Too many arguments "
+                    "provided."
+                )
+            elif args and isinstance(args[0], self.input_schema):
                 inference_input = args[0]
+            elif kwargs:
+                inference_input = self.input_schema(**kwargs)
             else:
                 raise ValueError(
-                    f"1 arg supplied to Operator {self.__class__.__name__} but was not "
-                    f"of expected type {self.input_schema}, found {type(args[0])}"
+                    "Can't resolve inputs. The values for the schema must be provided "
+                    "in the form of a dictionary or an instance of the input_schema "
+                    "object"
                 )
-        elif self.has_input_schema():
-            inference_input = self.input_schema(**kwargs)
+
+            run_output = self.run(inference_input)
         else:
-            inference_input = kwargs
-        return self.run(inference_input, context=context)
+            run_output = self.run(*args, **kwargs)
+
+        if self.has_output_schema():
+            return self.output_schema(**run_output)
+        return run_output
+
+    @abstractmethod
+    def run(self, *args, **kwargs) -> Any:
+        """
+        :param inp: operator input, as the defined input schema if applicable
+        :param context: pipeline context of already run operators
+        :return: result of this operator as the defined output schema if applicable
+        """
+        raise NotImplementedError
+
+    def expand_inputs(self, **kwargs):
+        """
+        Generic function to handle expanding values.
+        """
+        raise NotImplementedError
+
+    def condense_inputs(self, **kwargs):
+        """
+        Generic function to handle condensing values.
+        """
+        raise NotImplementedError
+
+    def yaml(self):
+        pass
+
+    def json(self):
+        pass
diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py
index 0ec580687d..e58f8a5191 100644
--- a/src/deepsparse/v2/pipeline.py
+++ b/src/deepsparse/v2/pipeline.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 
 
-from typing import List
-
-from pydantic import BaseModel, Field, PrivateAttr
+from typing import Dict, List, Union
 
 from deepsparse.v2.operators import Operator
 from deepsparse.v2.routers import Router
@@ -25,78 +23,90 @@
 __all__ = ["Pipeline"]
 
 
-class Pipeline(BaseModel):
+class Pipeline(Operator):
     """
-    Pipeline accepts a series of operators, schedulers, and a router which define
-    an end-to-end ML transformation.
+    Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline
+    will use the router to run through all the defined operators. The operators should
+    be implemented using the Operator class and each implemented Operator should be
+    responsible for a functional component of the pipeline. The flow of inputs/outputs
+    between the operators and the steps in the pipeline should be defined by the router
+    (based on the Router class), which dictates the next operator in the pipeline.
+    Execution of the operators will be handled by the provided schedulers.
+
+    :param ops: Operators to run within the pipeline. Can either be a list of operators
+        or dictionary of operators.
+    :param router: A Router which dictates the next operator to call.
+    :param schedulers: A list of schedulers to run operators.
 
-    Calling a pipeline runs these transformations
     """
 
-    stages: List[Operator] = Field(
-        required=True,
-        description="In-order list of operators that make up this pipeline",
-    )
-    router: Router = Field(
-        default_factory=Router,
-        description="Router object to determine order and run the stages. "
-        "Defaults to the base Router object",
-    )
-    schedulers: List[OperatorScheduler] = Field(
-        default_factory=lambda: [OperatorScheduler()],
-        description="List of schedulers to run operators in order of priority",
-    )
-
-    _scheduler_group: SchedulerGroup = PrivateAttr()
-
-    class Config:
-        arbitrary_types_allowed = True
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(
+        self,
+        ops: Union[Dict[str, Operator], List[Operator]],
+        router: Router,
+        schedulers: List[OperatorScheduler],
+    ):
+        self.ops = ops
+        self.router = router
+        self.schedulers = schedulers
 
         self.validate()
 
         # SchedulerGroup handles running all schedulers in order of priority
         self._scheduler_group = SchedulerGroup(self.schedulers)
 
-    def __call__(self, *args, return_context: bool = False, **kwargs):
+    def run(self, *args, **kwargs):
+        """
+        Run through the operators using the provided router and scheduler. The
+        input to a given operator is the output of the previous operator.
+
+        :param args: input to the first operator, of any type that operator expects
+        :param kwargs: alternatively, kwargs used to construct the first operator's
+            input schema
+        """
+        next_step = self.router.START_ROUTE
+        operator_output = None
+        while next_step != self.router.END_ROUTE:
+            # Either a dictionary key or valid index
+            operator = self.ops[next_step]
+            if next_step == self.router.START_ROUTE:
+                output_future = self._scheduler_group.submit(
+                    *args, operator=operator, **kwargs
+                )
+            else:
+                if isinstance(operator_output, dict):
+                    output_future = self._scheduler_group.submit(
+                        operator=operator, **operator_output
+                    )
+                else:
+                    output_future = self._scheduler_group.submit(
+                        operator_output, operator=operator
+                    )
+
+            # wait for future to resolve
+            operator_output = output_future.result()
+            next_step = self.router.next(next_step, self.ops)
+        return operator_output
+
+    def __call__(self, *args, **kwargs):
         """
-        :param return_context: if True, returns tuple of the pipeline output
-            and entire context. Default False
-        :return: output of the pipeline stages run with the router for the given input
+        :return: output of the pipeline operators run with the router for the given
+            input
         """
-        if len(args) > 1:
-            raise ValueError(
-                "Only 1 in-line argument may be supplied to Pipeline which "
-                f"must be a Schema, found: {len(args)}"
-            )
-        if args and kwargs:
-            raise ValueError(
-                "Pipeline can only run either a single in-line argument schema or a "
-                f"series of kwargs, found {len(args)} args and {len(kwargs)} kwargs"
-            )
-
-        pipeline_input = args[0] if args else kwargs
-        pipeline_output, context = self.router.run(
-            inp=pipeline_input,
-            operators=self.stages,
-            scheduler=self._scheduler_group,
-        )
-
-        if return_context:
-            return pipeline_output, context
-
-        return pipeline_output
+        return self.run(*args, **kwargs)
 
     def validate(self):
+        """
+        Validate the compatibility of the router and the operators provided.
+        """
+        router_validation = self.router.validate(self.ops)
 
-        router_validation = self.router.validate(self.stages)
         if router_validation is False:
             # default error message
-            stage_types = [type(stage) for stage in self.stages]
-            raise ValueError(
-                f"Invalid Router: {type(self.router)} for stages: {stage_types}"
-            )
+            op_types = [type(op) for op in self.ops]
+            raise ValueError(f"Invalid Router: {type(self.router)} for ops: {op_types}")
         elif isinstance(router_validation, str):
-            raise ValueError(f"Invalid Router for stages: {router_validation}")
+            raise ValueError(f"Invalid Router for operators: {router_validation}")
diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py
index 284c348c10..6050803b5e 100644
--- a/src/deepsparse/v2/routers/router.py
+++ b/src/deepsparse/v2/routers/router.py
@@ -13,61 +13,70 @@
 # limitations under the License.
 
 
-from typing import List, Tuple, Union
+import logging
+from abc import abstractmethod
+from typing import Dict, List, Union
 
 from deepsparse.v2.operators import Operator
-from deepsparse.v2.schedulers import OperatorScheduler
-from deepsparse.v2.utils import Context, OperatorSchema
 
 
-__all__ = ["Router"]
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["Router", "LinearRouter"]
 
 
 class Router:
     """
-    Routers must implement a run method which runs a series of operators
-    for a pipeline for a given input. Base Router runs operators linearly
-    in a series
+    Routers dictate the next operator to run. Each Router must implement a next
+    function, which dictates the index or key of the next operator to run.
+
+    :param start_route: the start index or key of the router
+    :param end_route: the end index or key of the router
+
     """
 
-    @staticmethod
-    def run(
-        inp: OperatorSchema,
-        operators: List[Operator],
-        scheduler: OperatorScheduler,
-    ) -> Tuple[OperatorSchema, Context]:
+    def __init__(self, end_route: Union[str, int], start_route: Union[str, int]):
+        self.START_ROUTE = start_route
+        self.END_ROUTE = end_route
+
+    @abstractmethod
+    def next(
+        self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]]
+    ) -> Union[str, int]:
         """
-        :param inp: input to the first operator of the series
-        :param operators: list of operators to run
-        :param scheduler: scheduler to submit operators to
-        :return: final output of the operators
+        Determines the index or dictionary key for the next operator which should run.
+
+        :param past: the previous index or key. This should uniquely determine the next
+            operator to run
+        :param ops: list or dictionary of operators
+        :returns: the next index or dictionary key for the next operator to run
         """
-        context = Context()
+        raise NotImplementedError
+
+    def yaml(self):
+        pass
 
-        # run operators linearly
-        operator_input = inp
-        for operator in operators:
-            output_future = scheduler.submit(
-                operator=operator, operator_input=operator_input, context=context
-            )
+    def json(self):
+        pass
 
-            # wait for future to resolve
-            operator_output = output_future.result()
 
-            # update context
-            context.update(
-                operator=operator,
-                input=operator_input,
-                output=operator_output,
-            )
+class LinearRouter(Router):
+    """
+    LinearRouter runs a list of Operators in sequential order. end_route should
+    be the length of the list and the start_route should be the start index.
+    
+ """ - # previous output becomes next input - operator_input = operator_output + def __init__(self, end_route: int, start_route: int = 0): + super().__init__(end_route=end_route, start_route=start_route) - return operator_output, context + def next(self, past: int, ops: List[Operator]) -> int: + new_index = past + 1 + if new_index < self.END_ROUTE: + return new_index + return self.END_ROUTE @staticmethod - def validate(operators: List[Operator]) -> Union[bool, str]: + def validate(operators: List[Operator]) -> bool: """ :param operators: operators that this Router could potentially run over :return: True if this Router can run this series of operators. Base Router @@ -76,7 +85,8 @@ def validate(operators: List[Operator]) -> Union[bool, str]: returned """ if len(operators) < 1: - return "No operators found" + _LOGGER.info("No operators provided") + return False for idx in range(len(operators) - 1): current_output_schema = operators[idx].output_schema @@ -88,8 +98,10 @@ def validate(operators: List[Operator]) -> Union[bool, str]: continue if current_output_schema != next_input_schema: - return ( + _LOGGER.info( f"Operator at idx {idx}: {type(operators[idx])} has invalid " f"output schema {current_output_schema} for next operator " f"{type(operators[idx + 1])} which requires {next_input_schema}" ) + return False + return True diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py index 53f0c8f625..7d4f249444 100644 --- a/src/deepsparse/v2/schedulers/scheduler.py +++ b/src/deepsparse/v2/schedulers/scheduler.py @@ -16,7 +16,6 @@ from concurrent.futures import Future, ThreadPoolExecutor from deepsparse.v2.operators import Operator -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["OperatorScheduler"] @@ -37,23 +36,16 @@ class OperatorScheduler: def __init__(self, max_workers: int = 1): self._threadpool = ThreadPoolExecutor(max_workers=max_workers) - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: + def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ - if isinstance(operator_input, dict): - return self._threadpool.submit(operator, context=context, **operator_input) - return self._threadpool.submit(operator, operator_input, context=context) + return self._threadpool.submit(operator, *args, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 2f797b30c7..7f00a3c17c 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -18,7 +18,6 @@ from deepsparse.v2.operators import Operator from deepsparse.v2.schedulers.scheduler import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema __all__ = ["SchedulerGroup"] @@ -35,12 +34,7 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit( - self, - operator: Operator, - operator_input: OperatorSchema, - context: Context, - ) -> Future: 
+ def submit(self, *args, operator: Operator, **kwargs) -> Future: """ :param operator: operator to run :param operator_input: input schema to the operator @@ -48,10 +42,10 @@ def submit( :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(operator, operator_input): - return scheduler.submit(operator, operator_input, context) + if scheduler.can_process(*args, operator=operator, **kwargs): + return scheduler.submit(*args, operator=operator, **kwargs) - def can_process(self, operator: Operator, operator_input: OperatorSchema) -> bool: + def can_process(self, *args, operator: Operator, **kwargs) -> bool: """ :param operator: operator to check :param operator_input: operator_input to check @@ -59,6 +53,6 @@ def can_process(self, operator: Operator, operator_input: OperatorSchema) -> boo SchedulerGroup always returns True """ return any( - scheduler.can_process(operator, operator_input) + scheduler.can_process(*args, operator=operator, **kwargs) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py index 4f36eeb448..a36d8e92ec 100644 --- a/src/deepsparse/v2/utils/__init__.py +++ b/src/deepsparse/v2/utils/__init__.py @@ -14,5 +14,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .context import * from .types import * diff --git a/src/deepsparse/v2/utils/context.py b/src/deepsparse/v2/utils/context.py deleted file mode 100644 index 81fe26de61..0000000000 --- a/src/deepsparse/v2/utils/context.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Callable, List, NamedTuple - -from deepsparse.v2.utils.types import OperatorSchema - - -__all__ = ["Context"] - - -class StageInfo(NamedTuple): - operator: Callable - input: OperatorSchema - output: OperatorSchema - - -class Context: - """ - Context contains the full history of operators and their inputs and outputs - in a pipeline - """ - - def __init__(self): - self.stages_executed: List[StageInfo] = [] - - def update(self, operator: Callable, input: OperatorSchema, output: OperatorSchema): - self.stages_executed.append( - StageInfo(operator=operator, input=input, output=output) - ) diff --git a/tests/deepsparse/v2/__init__.py b/tests/deepsparse/v2/__init__.py index e69de29bb2..0c44f887a4 100644 --- a/tests/deepsparse/v2/__init__.py +++ b/tests/deepsparse/v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py index d39bc61c8c..9f85e4976e 100644 --- a/tests/deepsparse/v2/test_basic_pipeline.py +++ b/tests/deepsparse/v2/test_basic_pipeline.py @@ -1,14 +1,29 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Simple example and test of a dummy pipeline """ +from typing import Dict + from pydantic import BaseModel from deepsparse.v2 import Pipeline from deepsparse.v2.operators import Operator -from deepsparse.v2.routers import Router +from deepsparse.v2.routers import LinearRouter from deepsparse.v2.schedulers import OperatorScheduler -from deepsparse.v2.utils import Context, OperatorSchema class IntSchema(BaseModel): @@ -19,21 +34,21 @@ class AddOneOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 1) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 1} class AddTwoOperator(Operator): input_schema = IntSchema output_schema = IntSchema - def run(self, inp: IntSchema, context: Context) -> OperatorSchema: - return IntSchema(value=inp.value + 2) + def run(self, inp: IntSchema) -> Dict: + return {"value": inp.value + 2} AddThreePipeline = Pipeline( - stages=[AddOneOperator(), AddTwoOperator()], - router=Router(), + ops=[AddOneOperator(), AddTwoOperator()], + router=LinearRouter(end_route=2), schedulers=[OperatorScheduler()], ) diff --git a/tests/deepsparse/v2/test_image_classification.py b/tests/deepsparse/v2/test_image_classification.py new file mode 100644 index 0000000000..03e2807454 --- /dev/null +++ b/tests/deepsparse/v2/test_image_classification.py @@ -0,0 +1,39 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
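For reference, the basic pipeline test above drives the two add operators through the LinearRouter end to end. A minimal sketch of that invocation, assuming the IntSchema, AddOneOperator/AddTwoOperator, and AddThreePipeline definitions from test_basic_pipeline.py:

    # Sketch: running the refactored AddThreePipeline (assumed usage).
    pipeline_input = IntSchema(value=5)
    pipeline_output = AddThreePipeline(pipeline_input)
    # AddOneOperator runs first, then AddTwoOperator: 5 + 1 + 2 = 8
    assert pipeline_output.value == 8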
+ +import numpy + +import pytest +from deepsparse.v2.image_classification import ImageClassificationPipeline +from deepsparse.v2.image_classification.preprocess_operator import ( + ImageClassificationInput, +) +from tests.deepsparse.pipelines.data_helpers import computer_vision + + +@pytest.fixture +def get_images(): + batch_size = 2 + images = computer_vision(batch_size=batch_size) + return images.get("images") + + +def test_image_classification(get_images): + model_path = ( + "zoo:cv/classification/resnet_v1-50/pytorch/sparseml/imagenet/pruned95-none" + ) + pipeline = ImageClassificationPipeline(model_path=model_path) + output = pipeline(ImageClassificationInput(images=get_images)) + assert output.labels == [[207], [670]] + assert numpy.allclose(output.scores, [[21.85], [17.33]], atol=0.01) From 59fb5872f8c81584f1826c69289f05b7d06160ad Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 1 Nov 2023 10:53:10 -0400 Subject: [PATCH 3/7] [v2] EngineOperator updates to make continuous batching easier (#1371) * [v2] EngineOperator updates to make continuous batching easier * test fixes --- .../v2/operators/engine_operator.py | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py index aac94a7697..2c61755df9 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy from typing import Dict, List, Optional, Union from pydantic import BaseModel, Field @@ -32,6 +33,13 @@ class EngineOperatorInputs(BaseModel): engine_inputs: List = Field(description="engine_inputs") + engine: Optional[Engine] = Field( + description="override the engine to run forward pass with", + default=None, + ) + + class Config: + arbitrary_types_allowed = True class EngineOperatorOutputs(BaseModel): @@ -76,21 +84,33 @@ def __init__( engine_args["scheduler"] = scheduler engine_args["num_streams"] = num_streams - self.engine = self._create_engine(self.model_path, engine_type, engine_args) + self._engine_args = engine_args + self._engine_type = engine_type + + self.engine = self.create_engine() + + @property + def batch_size(self) -> int: + """ + :return: the batch size this engine operator is compiled at + """ + return self._batch_size - def _create_engine( - self, onnx_file_path: str, engine_type: str, engine_args: Dict + def create_engine( + self, + **kwargs, ) -> Union[Engine, MultiModelEngine, ORTEngine]: """ Create an inference engine for a given ONNX model - :param onnx_file_path: path to ONNX model file - :param engine_type: type of engine to create. 
-        :param engine_args: arguments to pass to engine constructor
-        :param context: context to use for engine
+        :param kwargs: overrides to engine_args used as kwargs for engine
+            constructor/compilation
         :return: inference engine
         """
-        engine_type = engine_type.lower()
+        onnx_file_path = self.model_path
+        engine_args = deepcopy(self._engine_args)
+        engine_args.update(kwargs)
+        engine_type = self._engine_type.lower()
 
         if engine_type == DEEPSPARSE_ENGINE:
             if self.engine_context is not None and isinstance(
@@ -116,6 +136,12 @@ def run(self, inp: EngineOperatorInputs) -> Dict:
+        if inp.engine:
+            # run with custom engine, do not split/join since custom engine
+            # may run at any batch size, returning here as code below has a
+            # planned refactor
+            engine_outputs = inp.engine(inp.engine_inputs)
+            return {"engine_outputs": engine_outputs}
         inp = inp.engine_inputs
         batches, orig_batch_size = self.expand_inputs(engine_inputs=inp)
         batches_outputs = list(map(self.engine, batches))

From d54ef26aa8885160478771f16fcd1b150589af53 Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Thu, 2 Nov 2023 20:47:25 -0400
Subject: [PATCH 4/7] [Pipeline Refactor] Update routes, text generation initial functionality (#1348)

* initial functionality and working example with image classification

* remove testing image

* rebase fixes

* initial functionality and working example with image classification

* text gen

* updates func

* prompt inference, initial functionality

* remove image; update state docstring

* Fix typo

* add todo for split/join

* remove context, clean-up args, remove prefill_preprocess_operator

* fix docstrings
---
 src/deepsparse/v2/operators/__init__.py       |   1 -
 .../v2/operators/engine_operator.py           |  18 +-
 src/deepsparse/v2/operators/operator.py       |  30 ++-
 src/deepsparse/v2/pipeline.py                 |  70 ++++--
 src/deepsparse/v2/routers/router.py           |  57 ++++-
 src/deepsparse/v2/schedulers/scheduler.py     |  23 +-
 .../v2/schedulers/scheduler_group.py          |  35 ++-
 src/deepsparse/v2/text_generation/__init__.py |  24 ++
 .../autoregressive_preprocess_operator.py     | 100 ++++++++
 .../v2/text_generation/compile_logits.py      |  43 ++++
 .../v2/text_generation/kv_cache_operator.py   |  70 ++++++
 .../multi_engine_prefill_operator.py          | 135 +++++++++++
 .../v2/text_generation/nl_engine_operator.py  | 191 ++++++++++++++++
 src/deepsparse/v2/text_generation/pipeline.py | 213 ++++++++++++++++++
 .../v2/text_generation/prep_for_prefill.py    |  57 +++++
 .../v2/text_generation/process_inputs.py      | 121 ++++++++++
 src/deepsparse/v2/utils/__init__.py           |   2 +-
 src/deepsparse/v2/utils/state.py              |  64 ++++++
 tests/deepsparse/v2/test_basic_pipeline.py    |   4 +-
 19 files changed, 1203 insertions(+), 55 deletions(-)
 create mode 100644 src/deepsparse/v2/text_generation/__init__.py
 create mode 100644 src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/compile_logits.py
 create mode 100644 src/deepsparse/v2/text_generation/kv_cache_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/nl_engine_operator.py
 create mode 100644 src/deepsparse/v2/text_generation/pipeline.py
 create mode 100644 src/deepsparse/v2/text_generation/prep_for_prefill.py
 create mode 100644 src/deepsparse/v2/text_generation/process_inputs.py
 create mode 100644 src/deepsparse/v2/utils/state.py

diff --git a/src/deepsparse/v2/operators/__init__.py b/src/deepsparse/v2/operators/__init__.py
index 8f7e6a169d..9d1a9812ac 100644
--- a/src/deepsparse/v2/operators/__init__.py
+++ b/src/deepsparse/v2/operators/__init__.py
@@ -13,5 +13,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from .operator import *
diff --git a/src/deepsparse/v2/operators/engine_operator.py b/src/deepsparse/v2/operators/engine_operator.py
index 2c61755df9..b7d920a686 100644
--- a/src/deepsparse/v2/operators/engine_operator.py
+++ b/src/deepsparse/v2/operators/engine_operator.py
@@ -17,7 +17,8 @@
 
 from pydantic import BaseModel, Field
 
-from deepsparse import Context, Engine, MultiModelEngine, Scheduler
+from deepsparse import Context as EngineContext
+from deepsparse import Engine, MultiModelEngine, Scheduler
 from deepsparse.benchmark import ORTEngine
 from deepsparse.utils import join_engine_outputs, model_to_path, split_engine_inputs
 from deepsparse.v2.operators import Operator
@@ -54,16 +55,15 @@ def __init__(
         self,
         model_path: str,
         engine_type: str = DEEPSPARSE_ENGINE,
-        batch_size: Optional[int] = 1,
         num_cores: int = None,
         num_streams: int = None,
         scheduler: Scheduler = None,
         input_shapes: List[List[int]] = None,
-        engine_context: Optional[Context] = None,
+        engine_context: Optional[EngineContext] = None,
+        engine_kwargs: Dict = None,
     ):
-
-        self._batch_size = batch_size
         self.model_path = model_to_path(model_path)
+        self._batch_size = 1
         self.engine_context = engine_context
 
         if self.engine_context is not None:
@@ -87,7 +87,7 @@ def __init__(
         self._engine_args = engine_args
         self._engine_type = engine_type
 
-        self.engine = self.create_engine()
+        self.engine = self.create_engine(**(engine_kwargs or {}))
 
     @property
     def batch_size(self) -> int:
@@ -114,12 +114,12 @@ def create_engine(
 
         if engine_type == DEEPSPARSE_ENGINE:
             if self.engine_context is not None and isinstance(
-                self.engine_context, Context
+                self.engine_context, EngineContext
             ):
                 engine_args.pop("num_cores", None)
                 engine_args.pop("scheduler", None)
                 engine_args.pop("num_streams", None)
-                engine_args["context"] = self.engien_context
+                engine_args["context"] = self.engine_context
                 return MultiModelEngine(
                     model=onnx_file_path,
                     **engine_args,
@@ -135,7 +135,7 @@ def create_engine(
             f"{SUPPORTED_PIPELINE_ENGINES}"
         )
 
-    def run(self, inp: EngineOperatorInputs) -> Dict:
+    def run(self, inp: EngineOperatorInputs, **kwargs) -> Dict:
         if inp.engine:
             # run with custom engine, do not split/join since custom engine
             # may run at any batch size, returning here as code below has a
diff --git a/src/deepsparse/v2/operators/operator.py b/src/deepsparse/v2/operators/operator.py
index c3a3e28b78..b3963d8223 100644
--- a/src/deepsparse/v2/operators/operator.py
+++ b/src/deepsparse/v2/operators/operator.py
@@ -17,6 +17,8 @@
 
 from pydantic import BaseModel
 
+from deepsparse.v2.utils import InferenceState, PipelineState
+
 
 __all__ = ["Operator"]
 
@@ -54,6 +56,8 @@ def has_output_schema(cls) -> bool:
     def __call__(
         self,
         *args,
+        inference_state: InferenceState,
+        pipeline_state: PipelineState,
         **kwargs,
     ) -> Any:
         """
@@ -61,7 +65,9 @@ def __call__(
 
         :param args: an unnamed arg may only be provided
             if it is of the type of the input_schema
-        :param context: pipeline context to pass to operator
+        :param inference_state: inference_state for the pipeline.
+        :param pipeline_state: pipeline_state for the pipeline. The values in the state
+            are created during pipeline creation and are read-only during inference.
:param kwargs: kwargs when not initializing from an instantiated schema
         :return: operator output
         """
@@ -81,10 +87,18 @@ def __call__(
                 "in the form of a dictionary or an instance of the input_schema"
                 "object"
             )
-
-        run_output = self.run(inference_input)
+            run_output = self.run(
+                inference_input,
+                inference_state=inference_state,
+                pipeline_state=pipeline_state,
+            )
         else:
-            run_output = self.run(*args, **kwargs)
+            run_output = self.run(
+                *args,
+                inference_state=inference_state,
+                pipeline_state=pipeline_state,
+                **kwargs,
+            )
 
         if self.has_output_schema():
             return self.output_schema(**run_output)
@@ -93,12 +107,16 @@ def __call__(
     @abstractmethod
     def run(self, *args, **kwargs) -> Any:
         """
-        :param inp: operator input, as the defined input schema if applicable
-        :param context: pipeline context of already run operators
         :return: result of this operator as the defined output schema if applicable
         """
         raise NotImplementedError
 
+    def can_operate(self, inp: Any) -> bool:
+        """
+        Whether or not the given operator can run, based on input
+        """
+        return True
+
     def expand_inputs(self, **kwargs):
         """
         Generic function to handle expanding values.
diff --git a/src/deepsparse/v2/pipeline.py b/src/deepsparse/v2/pipeline.py
index e58f8a5191..0a8c8b2f93 100644
--- a/src/deepsparse/v2/pipeline.py
+++ b/src/deepsparse/v2/pipeline.py
@@ -18,6 +18,7 @@
 from deepsparse.v2.operators import Operator
 from deepsparse.v2.routers import Router
 from deepsparse.v2.schedulers import OperatorScheduler, SchedulerGroup
+from deepsparse.v2.utils import InferenceState, PipelineState
 
 
 __all__ = ["Pipeline"]
@@ -27,7 +28,7 @@ class Pipeline(Operator):
     """
     Pipeline accepts a series of operators, schedulers, and a router. Calling a pipeline
     will use the router to run through all the defined operators. The operators should
-    be implemented using the Operator class and each implemented Operator should be
+    be implemented using the Operator class and each implemented operator should be
     responsible for a functional component of the pipelines. The flow of inputs/outputs
     between the operators and the steps in the pipeline should be defined by the router,
     (based off of the Router class), which dictates the next operator in the pipeline.
@@ -37,6 +38,7 @@ class Pipeline(Operator):
        or dictionary of operators.
     :param router: A Router which dictates the next operator to call.
     :param schedulers: A list of schedulers to run operators.
+    :param pipeline_state: pipeline_state created during pipeline initialization
 
     """
 
     def __init__(
         self,
         ops: Union[Dict[str, Operator], List[Operator]],
         router: Router,
         schedulers: List[OperatorScheduler],
+        pipeline_state: PipelineState = None,
     ):
         self.ops = ops
         self.router = router
         self.schedulers = schedulers
+        self.pipeline_state = pipeline_state
         self.validate()
 
         # SchedulerGroup handles running all schedulers in order of priority
         self._scheduler_group = SchedulerGroup(self.schedulers)
 
-    def run(self, *args, **kwargs):
+    def run(
+        self,
+        *args,
+        inference_state: InferenceState,
+        pipeline_state: PipelineState,
+        **kwargs,
+    ):
         """
-        Run through the operators using the provided router and scheduler. Update the
-        context to reflect each step of the router. The input to a given operator is the
-        output of the previous operator.
-
-        :param inp: input to the operator. expected to be of any type that is
-            expected by the operator.
-        :param context: context to store the current the inputs, outputs, and operator
-            for each step of the router.
+ Run through the operators using the provided router and scheduler. + The input to a given operator is the output of the previous operator. + :param inference_state: inference_state for the pipeline. + :param pipeline_state: pipeline_state for the pipeline. The values in the state + are created during pipeline creation and are read-only during inference. """ next_step = self.router.START_ROUTE operator_output = None + while next_step != self.router.END_ROUTE: # Either a dictionary key or valid index operator = self.ops[next_step] if next_step == self.router.START_ROUTE: output_future = self._scheduler_group.submit( - *args, operator=operator, **kwargs + *args, + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **kwargs, ) else: if isinstance(operator_output, dict): output_future = self._scheduler_group.submit( - operator=operator, **operator_output + inference_state=inference_state, + operator=operator, + pipeline_state=pipeline_state, + **operator_output, ) else: output_future = self._scheduler_group.submit( - operator_output, operator=operator + operator_output, + inference_state=inference_state, + pipeline_state=pipeline_state, + operator=operator, ) - # wait for future to resolve operator_output = output_future.result() - next_step = self.router.next(next_step, self.ops) + if isinstance(operator_output, tuple): + state_update = operator_output[-1] + operator_output = operator_output[0] + inference_state.update_state(state_update) + + next_step = self.router.next(next_step, self.ops, operator_output) + return operator_output def __call__(self, *args, **kwargs): """ + Consolidate any provided inference_state or pipeline_state objects and pass + any other operator inputs to run(). + :return: output of the pipeline operators ran with the router for the given - input + input """ + if kwargs.get("inference_state"): + inference_state = kwargs.pop("inference_state") + else: + inference_state = InferenceState() + inference_state.create_state({}) + + if "pipeline_state" in kwargs: + self.pipeline_state = kwargs.get("pipeline_state") + + kwargs["inference_state"] = inference_state + kwargs["pipeline_state"] = self.pipeline_state + return self.run(*args, **kwargs) def validate(self): diff --git a/src/deepsparse/v2/routers/router.py b/src/deepsparse/v2/routers/router.py index 6050803b5e..d1110d4ca7 100644 --- a/src/deepsparse/v2/routers/router.py +++ b/src/deepsparse/v2/routers/router.py @@ -15,14 +15,14 @@ import logging from abc import abstractmethod -from typing import Dict, List, Union +from typing import Any, Dict, List, Optional, Union from deepsparse.v2.operators import Operator _LOGGER = logging.getLogger(__name__) -__all__ = ["Router", "LinearRouter"] +__all__ = ["Router", "LinearRouter", "GraphRouter"] class Router: @@ -32,23 +32,34 @@ class Router: :param start_route: the start index or key of the router :param end_route: the end index or key of the router + :param route: the route that the router has to traverse through """ - def __init__(self, end_route: Union[str, int], start_route: Union[str, int]): + def __init__( + self, + end_route: Union[str, int], + start_route: Union[str, int], + route: Optional[Dict] = None, + ): self.START_ROUTE = start_route self.END_ROUTE = end_route + self.route = route @abstractmethod def next( - self, past: Union[str, int], ops: Union[List[Operator], Dict[str, Operator]] + self, + past: Union[str, int], + ops: Optional[Union[List[Operator], Dict[str, Operator]]], + inp: Optional[Any], ) -> Union[str, int]: """ 
Determines the index or dictionary key for the next operator which should run.
 
         :param past: the previous index or key. This should uniquely determine the next
-            operator to run
+            operator to run
         :param ops: list or dictionary of operators
+        :param inp: operator input
         :returns: the next index or dictionary key for the next operator to run
         """
         raise NotImplementedError
@@ -69,7 +80,9 @@ class LinearRouter(Router):
     def __init__(self, end_route: int, start_route: int = 0):
         super().__init__(end_route=end_route, start_route=start_route)
 
-    def next(self, past: int, ops: List[Operator]) -> int:
+    def next(
+        self, past: int, ops: Optional[List[Operator]] = None, inp: Optional[Any] = None
+    ) -> int:
         new_index = past + 1
         if new_index < self.END_ROUTE:
             return new_index
         return self.END_ROUTE
@@ -105,3 +118,35 @@ def validate(operators: List[Operator]) -> bool:
             )
             return False
     return True
+
+
+class GraphRouter(Router):
+    """
+    Router for a DAG. Expects graphs to be presented in the form of a dictionary, where
+    keys are the nodes of the graph and the values are the connected nodes. For
+    nodes with multiple output edges, all the nodes will be visited and the first node
+    where `can_operate` returns True will run. Paths should be deterministic.
+    """
+
+    def __init__(self, end_route: str, start_route: str, route: Dict):
+        super().__init__(end_route=end_route, start_route=start_route, route=route)
+
+    def next(
+        self,
+        past: str,
+        ops: Dict[str, Operator],
+        inp: Any,
+    ) -> str:
+        node = past
+        if isinstance(self.route[node], str):
+            return self.route[node]
+        else:
+            for neighbour_node in self.route[node]:
+                neighbour_node_op = ops[neighbour_node]
+                if neighbour_node_op.can_operate(inp):
+                    return neighbour_node
+        raise ValueError("Cannot operate on any of the nodes")
+
+    @staticmethod
+    def validate(ops) -> bool:
+        pass
diff --git a/src/deepsparse/v2/schedulers/scheduler.py b/src/deepsparse/v2/schedulers/scheduler.py
index 7d4f249444..78a58e3389 100644
--- a/src/deepsparse/v2/schedulers/scheduler.py
+++ b/src/deepsparse/v2/schedulers/scheduler.py
@@ -36,19 +36,30 @@ class OperatorScheduler:
     def __init__(self, max_workers: int = 1):
         self._threadpool = ThreadPoolExecutor(max_workers=max_workers)
 
-    def submit(self, *args, operator: Operator, **kwargs) -> Future:
+    def submit(
+        self,
+        *args,
+        operator: Operator,
+        **kwargs,
+    ) -> Future:
         """
         :param operator: operator to run
-        :param operator_input: input schema to the operator
-        :param context: context of already run operators
         :return: future referencing the asynchronously run output of the operator
         """
-        return self._threadpool.submit(operator, *args, **kwargs)
+        return self._threadpool.submit(
+            operator,
+            *args,
+            **kwargs,
+        )
 
-    def can_process(self, *args, operator: Operator, **kwargs) -> bool:
+    def can_process(
+        self,
+        *args,
+        operator: Operator,
+        **kwargs,
+    ) -> bool:
         """
         :param operator: operator to check
-        :param operator_input: operator_input to check
         :return: True if this Operator can process the given operator and input.
Base OperatorScheduler always returns True """ diff --git a/src/deepsparse/v2/schedulers/scheduler_group.py b/src/deepsparse/v2/schedulers/scheduler_group.py index 7f00a3c17c..40b5695f22 100644 --- a/src/deepsparse/v2/schedulers/scheduler_group.py +++ b/src/deepsparse/v2/schedulers/scheduler_group.py @@ -34,25 +34,44 @@ class SchedulerGroup(OperatorScheduler): def __init__(self, schedulers: List[OperatorScheduler]): self.schedulers = schedulers - def submit(self, *args, operator: Operator, **kwargs) -> Future: + def submit( + self, + *args, + operator: Operator, + **kwargs, + ) -> Future: """ :param operator: operator to run - :param operator_input: input schema to the operator - :param context: context of already run operators :return: future referencing the asynchronously run output of the operator """ for scheduler in self.schedulers: - if scheduler.can_process(*args, operator=operator, **kwargs): - return scheduler.submit(*args, operator=operator, **kwargs) + if scheduler.can_process( + *args, + operator=operator, + **kwargs, + ): + return scheduler.submit( + *args, + operator=operator, + **kwargs, + ) - def can_process(self, *args, operator: Operator, **kwargs) -> bool: + def can_process( + self, + *args, + operator: Operator, + **kwargs, + ) -> bool: """ :param operator: operator to check - :param operator_input: operator_input to check :return: True if this Operator can process the given operator and input. SchedulerGroup always returns True """ return any( - scheduler.can_process(*args, operator=operator, **kwargs) + scheduler.can_process( + *args, + operator=operator, + **kwargs, + ) for scheduler in self.schedulers ) diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py new file mode 100644 index 0000000000..37ac88d02f --- /dev/null +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# flake8: noqa +from .autoregressive_preprocess_operator import * +from .compile_logits import * +from .kv_cache_operator import * +from .multi_engine_prefill_operator import * +from .nl_engine_operator import * +from .prep_for_prefill import * +from .process_inputs import * + + +from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py new file mode 100644 index 0000000000..cfe7cb531b --- /dev/null +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any
+
+import numpy
+
+from deepsparse.transformers.utils.helpers import create_causal_mask
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.utils import PipelineState
+
+
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["AutoRegressiveOperatorPreprocess"]
+
+
+class AutoRegressiveOperatorPreprocess(Operator):
+    def __init__(self, sequence_length: int, prompt_sequence_length: int):
+        """
+        Prepare the tokens for the single-token engine. This requires creating the
+        attention mask, positions, and causal mask. The output contains these three
+        arrays to be passed into the single-token engine.
+        """
+        self.sequence_length = sequence_length
+        self.prompt_sequence_length = prompt_sequence_length
+        self.set_capacity = False
+
+        _LOGGER.warn(
+            "This operator requires the PipelineState to be set up with the "
+            "onnx_input_names_no_cache attribute set from the NLEngineOperator."
+        )
+
+    def can_operate(self, inp: Any) -> bool:
+        """
+        Can run this Operator if the number of tokens left to process is greater than
+        0 but less than self.prompt_sequence_length.
+        """
+        tokens = inp.get("tokens")
+        kv_cache = inp.get("kv_cache")
+
+        remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens
+        if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length:
+            return True
+        return False
+
+    def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs):
+
+        if not self.set_capacity:
+            self.set_capacity = True
+            kv_cache.set_capacity(self.sequence_length - 1)
+
+        num_total_processed_tokens = kv_cache.total_num_processed_tokens
+        new_token = tokens[num_total_processed_tokens]
+        engine_input_names = pipeline_state.current_state.get(
+            "onnx_input_names_no_cache"
+        )
+
+        # padding is added to left, so attention mask is 1s from the
+        # right up to the number of total tokens (prompt + generated)
+        attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
+        num_attention_entries_to_unmask = min(
+            num_total_processed_tokens + 1, self.sequence_length
+        )  # cap by seq len
+        attention_mask[:, -num_attention_entries_to_unmask:] = 1
+        positions = numpy.array([[num_total_processed_tokens]], dtype=numpy.int64)
+        input_ids = numpy.array([[new_token]])
+        causal_mask = create_causal_mask(input_ids, attention_mask)
+
+        engine_inputs_map = dict(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            causal_mask=causal_mask,
+            positions=positions,
+        )
+
+        engine_inputs = [engine_inputs_map[name] for name in engine_input_names]
+
+        return {
+            "engine_inputs": engine_inputs,
+            "kv_cache": kv_cache,
+            "tokens": tokens,
+        }
diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py
new file mode 100644
index 0000000000..55c87d791d
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/compile_logits.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 - present /
Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompilePromptLogits"] + + +class CompilePromptLogits(Operator): + """ + Combine the prompt logits. Currently relying on the inference state to store the + prompt logits for each token or multi-token batch processed. This operator will + take prompt logits from each iteration run and update the inference state. + """ + + def run(self, logits, inference_state: InferenceState, **kwargs): + logit_type = "prompt_logits" + + if inference_state.current_state.get(logit_type) is not None: + current_logits = inference_state.current_state.get(logit_type).copy() + current_logits.append(logits) + else: + current_logits = [logits] + + state_update = {logit_type: current_logits} + return { + "kv_cache": kwargs.get("kv_cache"), + "tokens": kwargs.get("tokens"), + }, state_update diff --git a/src/deepsparse/v2/text_generation/kv_cache_operator.py b/src/deepsparse/v2/text_generation/kv_cache_operator.py new file mode 100644 index 0000000000..0b232402b3 --- /dev/null +++ b/src/deepsparse/v2/text_generation/kv_cache_operator.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
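CompilePromptLogits above relies on the tuple-return contract introduced in this patch: when an operator returns (output, state_update), Pipeline.run applies the update to the shared InferenceState after the operator's future resolves. A minimal sketch of that contract, assuming the InferenceState API (create_state / update_state / current_state) added in src/deepsparse/v2/utils/state.py:

    import numpy

    from deepsparse.v2.utils import InferenceState

    # placeholder logits standing in for two prompt-processing iterations
    logits_step_0 = numpy.ones((1, 4, 32))
    logits_step_1 = numpy.ones((1, 4, 32))

    # fresh state, as Pipeline.__call__ creates when none is provided
    inference_state = InferenceState()
    inference_state.create_state({})

    # an operator returns (output, state_update); the pipeline applies the update
    output, state_update = {"tokens": [1, 2, 3]}, {"prompt_logits": [logits_step_0]}
    inference_state.update_state(state_update)

    # a later iteration appends to the stored list, as CompilePromptLogits does
    current_logits = inference_state.current_state.get("prompt_logits").copy()
    current_logits.append(logits_step_1)
    inference_state.update_state({"prompt_logits": current_logits})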
+ +from typing import Any + +from pydantic import BaseModel, Field + +from deepsparse.transformers.utils import DecoderKVCache +from deepsparse.transformers.utils.helpers import ( + initialize_kv_cache_state, + prepends_bos_token, +) +from deepsparse.v2.operators import Operator + + +__all__ = ["KVCacheCreator"] + + +class KVCacheCreatorOutput(BaseModel): + kv_cache: Any = Field(description="KV Cache Created") # DecoderKVCache + + +class KVCacheCreatorInput(BaseModel): + cache_shape: Any = Field(description="shape") + kv_cache_data_type: Any = Field(description="data type") + output_names: Any = Field(description="output names") + + +class KVCacheCreator(Operator): + input_schema = KVCacheCreatorInput + output_schema = KVCacheCreatorOutput + + def __init__( + self, + tokenizer, + sequence_length: int, + prompt_sequence_length: int, + internal_kv_cache: bool, + ): + self.tokenizer = tokenizer + self.prompt_sequence_length = prompt_sequence_length + self.internal_kv_cache = internal_kv_cache + self.sequence_length = sequence_length + + def run(self, cache_shape, kv_cache_data_type: str, output_names: list, **kwargs): + kv_cache_state = initialize_kv_cache_state( + cache_shape=cache_shape, + kv_cache_data_type=kv_cache_data_type, + output_names=output_names, + length=self.sequence_length - self.prompt_sequence_length, + empty=bool(self.internal_kv_cache), + ) + + kv_cache = DecoderKVCache(self.internal_kv_cache) + kv_cache.setup( + state=kv_cache_state, + freeze_first_position=prepends_bos_token(self.tokenizer), + ) + return {"kv_cache": kv_cache} diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py new file mode 100644 index 0000000000..41ee830a8a --- /dev/null +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from enum import Enum +from typing import Any + +import numpy + +from deepsparse.transformers.utils.helpers import create_causal_mask +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import PipelineState + + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["MultiEnginePrefill"] + + +class OnnxInputNames(Enum): + INPUT_IDS = "input_ids" + ATTN_MASK = "attention_mask" + CAUSAL_MASK = "causal_mask" + POSITIONS = "positions" + + +# NOTE: A possible clean-up could involve combining this Operator and the +# autoregressive_preprocess_operator + + +class MultiEnginePrefill(Operator): + def __init__(self, prompt_sequence_length, sequence_length): + """ + Prepare the tokens for the multi-token engine. This requires creating the + attention mask, positions, and causal mask. The output contains these three + arrays to be passed into the multi-token engine. 
+        """
+        self.prompt_sequence_length = prompt_sequence_length
+        self.sequence_length = sequence_length
+        self.cases = {
+            OnnxInputNames.ATTN_MASK.value: self._case_attn_mask,
+            OnnxInputNames.POSITIONS.value: self._case_positions,
+        }
+        _LOGGER.warn(
+            "This operator requires the PipelineState to be set up with the "
+            "onnx_input_names_no_cache attribute set from the NLEngineOperator."
+        )
+
+    def can_operate(self, inp: Any):
+        """
+        Can only run if the number of prompt tokens left to process is greater than
+        or equal to self.prompt_sequence_length.
+        """
+        kv_cache = inp.get("kv_cache")
+        tokens = inp.get("tokens")
+
+        if len(tokens) < self.prompt_sequence_length:
+            return False
+
+        if (
+            len(tokens) - kv_cache.total_num_processed_tokens
+            >= self.prompt_sequence_length
+        ):
+            return True
+        return False
+
+    def _case_attn_mask(self, num_total_processed_tokens: int):
+        # create an empty attention mask
+        engine_input = numpy.zeros((1, self.sequence_length), dtype=numpy.int64)
+        # calculate the number of entries in attention mask that should be set to 1
+        num_attention_entries_to_unmask = min(
+            num_total_processed_tokens + self.prompt_sequence_length,
+            self.sequence_length,
+        )
+        engine_input[:, -num_attention_entries_to_unmask:] = 1
+        return engine_input
+
+    def _case_positions(self, num_total_processed_tokens: int):
+        return (
+            numpy.arange(
+                num_total_processed_tokens,
+                num_total_processed_tokens + self.prompt_sequence_length,
+            )
+            .reshape(1, -1)
+            .astype(numpy.int64)
+        )
+
+    def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs):
+
+        onnx_input_names_no_cache = pipeline_state.current_state.get(
+            "onnx_input_names_no_cache"
+        )
+
+        num_total_processed_tokens = kv_cache.total_num_processed_tokens
+        start = num_total_processed_tokens
+        end = start + self.prompt_sequence_length
+        token_batch = tokens[start:end]
+
+        engine_inputs = []
+        for name in onnx_input_names_no_cache:
+            if name == OnnxInputNames.INPUT_IDS.value:
+                engine_input = numpy.array([token_batch])
+            elif (
+                name == OnnxInputNames.ATTN_MASK.value
+                or name == OnnxInputNames.POSITIONS.value
+            ):
+                engine_input = self.cases[name](num_total_processed_tokens)
+            elif name == OnnxInputNames.CAUSAL_MASK.value:
+                continue
+
+            engine_inputs.append(engine_input)
+
+        if OnnxInputNames.CAUSAL_MASK.value in onnx_input_names_no_cache:
+            causal_mask = create_causal_mask(
+                input_ids=engine_inputs[0],
+                attention_mask=engine_inputs[1],
+            )
+            engine_inputs.append(causal_mask)
+
+        return {
+            "engine_inputs": engine_inputs,
+            "kv_cache": kv_cache,
+            "tokens": tokens,
+        }
diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py
new file mode 100644
index 0000000000..6c1ad1966e
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py
@@ -0,0 +1,191 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
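The mask and position arithmetic in the MultiEnginePrefill operator above can be checked by hand. A small worked sketch, with toy sizes (sequence_length=8, prompt_sequence_length=4) chosen purely for illustration, after four tokens have already been processed:

    import numpy

    sequence_length, prompt_sequence_length = 8, 4
    num_total_processed_tokens = 4

    # _case_attn_mask: unmask entries from the right covering processed
    # tokens plus the incoming prompt window, capped at sequence_length
    attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
    num_entries_to_unmask = min(
        num_total_processed_tokens + prompt_sequence_length, sequence_length
    )
    attention_mask[:, -num_entries_to_unmask:] = 1
    # attention_mask -> [[1, 1, 1, 1, 1, 1, 1, 1]]

    # _case_positions: absolute positions of the incoming prompt window
    positions = (
        numpy.arange(
            num_total_processed_tokens,
            num_total_processed_tokens + prompt_sequence_length,
        )
        .reshape(1, -1)
        .astype(numpy.int64)
    )
    # positions -> [[4, 5, 6, 7]]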
+ +import copy +import os +from typing import Any, List, Tuple + +from pydantic import BaseModel, Field + +from deepsparse.utils.onnx import ( + CACHE_INPUT_PREFIX, + overwrite_onnx_model_inputs_for_kv_cache_models, +) +from deepsparse.v2.operators.engine_operator import ( + DEEPSPARSE_ENGINE, + EngineOperator, + EngineOperatorInputs, +) + + +__all__ = ["NLEngineOperator"] + + +class NlEngineInput(BaseModel): + engine_inputs: List = Field(description="engine inputs") + kv_cache: Any = Field(description="kv_cache object") + tokens: List = Field(description="tokens") + + +class NLEngineOperator(EngineOperator): + + """ + Operator for the NL Decoder Engine. This Operator inherits from the EngineOperator. + Specific updates to engine attributes are made through this operator, as well + as updating the kv_cache. This Operator is used for both the single-token and + multi-token case. + """ + + input_schema = NlEngineInput + output_schema = None + + def __init__( + self, + sequence_length: int, + input_ids_length: int, + internal_kv_cache: bool = False, + **kwargs, + ): + + self.kv_cache_data_type = None + ( + onnx_file_path, + output_indices_to_be_cached, + kv_cache_data_type, + ) = overwrite_onnx_model_inputs_for_kv_cache_models( + onnx_file_path=kwargs.get("model_path"), + batch_size=kwargs.get("batch_size", 1), + sequence_length=sequence_length, + input_ids_length=input_ids_length, + ) + + engine_kwargs = kwargs.get("engine_kwargs", {}) + if kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE: + if "WAND_OPT_FLAGS" not in os.environ: + os.environ["WAND_OPT_FLAGS"] = "default,~pyramids" + + if any(output_indices_to_be_cached): + self.kv_cache_data_type = kv_cache_data_type + if ( + internal_kv_cache + and kwargs.get("engine_type", DEEPSPARSE_ENGINE) == DEEPSPARSE_ENGINE + ): + engine_kwargs["cached_outputs"] = output_indices_to_be_cached + + kwargs["engine_kwargs"] = engine_kwargs + kwargs["model_path"] = onnx_file_path + super().__init__(**kwargs) + + self.input_ids_length = input_ids_length + + def run(self, inp: NlEngineInput, **kwargs) -> Any: + engine_input = inp.engine_inputs + kv_cache = inp.kv_cache + + inputs = self._add_kv_cache_to_input(engine_input, kv_cache) + if bool(kv_cache.engine_internal_cache): + # conventionally, before dispatching + # inputs to the engine, we validate them + # if val_inp=True. However, in this case + # we want to pass the empty kv cache inputs + # (batch_size=0) to the engine. 
Therefore, + # we skip the validation + out = self.engine._eng_net.execute_list_out( + inputs, kv_cache.engine_internal_cache + ) + else: + # run the engine without the LIB.kv_cache object + out = ( + super() + .run(EngineOperatorInputs(engine_inputs=inputs), **kwargs) + .get("engine_outputs") + ) + + logits, *kv_cache_state = out + self._update_kv_cache( + kv_cache_state=kv_cache_state, + input_ids_len=self.input_ids_length, + kv_cache=kv_cache, + ) + + output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + return output + + def _add_kv_cache_to_input(self, engine_input, kv_cache): + kv_cache_state = copy.copy(kv_cache.cached_inputs) + + for idx, input_name in enumerate(self.onnx_input_names_no_cache): + kv_cache_state[input_name] = engine_input[idx] + + new_inp = [kv_cache_state[name] for name in self.engine.input_names] + return new_inp + + def _update_kv_cache(self, kv_cache_state, input_ids_len, kv_cache): + if bool(kv_cache.engine_internal_cache): + kv_cache.total_num_processed_tokens += input_ids_len + return + + kv_cache_state = { + name: array + for name, array in zip(self.onnx_input_names_cached, kv_cache_state) + } + + kv_cache.update( + state=kv_cache_state, + input_ids_len=input_ids_len, + ) + + @property + def onnx_input_names_no_cache(self) -> List[str]: + """ + :return: The input names for the onnx model, excluding + the potential kv cache inputs + """ + return [ + name + for name in self.engine.input_names + if not name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def onnx_input_names_cached(self) -> List[str]: + """ + :return: The cached input names for the onnx model + """ + return [ + name + for name in self.engine.input_names + if name.startswith(CACHE_INPUT_PREFIX) + ] + + @property + def cache_shape(self) -> Tuple[int, int, int, int]: + """ + :return: The shape of the kv cache inputs + for the onnx model. The shape is + (batch_size, num_heads, sequence_length, hidden_size) + """ + cache_engine_input_index = next( + i + for i, name in enumerate(self.engine.input_names) + if CACHE_INPUT_PREFIX in name + ) + return self.engine.input_shapes[cache_engine_input_index] + + @property + def output_names(self) -> List[str]: + """ + :return: The output names for the onnx model + """ + return self.engine.output_names diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py new file mode 100644 index 0000000000..9878aa0061 --- /dev/null +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -0,0 +1,213 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
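The onnx_input_names_no_cache / onnx_input_names_cached properties above partition the engine's input names by the kv-cache prefix. A minimal sketch of that partitioning; the "past_key_values" prefix value and the sample input names are illustrative assumptions, not taken from this diff:

    # Sketch: splitting engine input names the way NLEngineOperator's
    # properties do, using an assumed cache prefix and name list.
    CACHE_INPUT_PREFIX = "past_key_values"  # assumed value for illustration

    input_names = [
        "input_ids",
        "attention_mask",
        "causal_mask",
        "positions",
        "past_key_values.0.key",
        "past_key_values.0.value",
    ]

    no_cache = [n for n in input_names if not n.startswith(CACHE_INPUT_PREFIX)]
    cached = [n for n in input_names if n.startswith(CACHE_INPUT_PREFIX)]
    # no_cache -> ["input_ids", "attention_mask", "causal_mask", "positions"]
    # cached   -> ["past_key_values.0.key", "past_key_values.0.value"]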
+
+from typing import Dict
+
+from deepsparse.transformers.utils.helpers import process_generation_config
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.pipeline import Pipeline
+from deepsparse.v2.routers import GraphRouter
+from deepsparse.v2.schedulers import OperatorScheduler
+from deepsparse.v2.text_generation import (
+    AutoRegressiveOperatorPreprocess,
+    CompilePromptLogits,
+    KVCacheCreator,
+    MultiEnginePrefill,
+    NLEngineOperator,
+    PrepareforPrefill,
+    ProcessInputsTextGeneration,
+)
+from deepsparse.v2.utils import PipelineState
+
+
+class TextGenerationPipeline(Pipeline):
+    def __init__(
+        self,
+        model_path: str,
+        prompt_sequence_length: int = 16,
+        sequence_length: int = 1024,
+        internal_kv_cache: bool = True,
+        force_max_tokens: bool = False,
+        generation_config=None,
+        engine_kwargs: Dict = None,
+    ):
+
+        pipeline_state = PipelineState()
+        pipeline_state_vals = {}
+
+        # TODO: The code below will be replaced with a transformers set-up Operator.
+        self.tokenizer = None
+        model_path = self.setup_onnx_file_path(model_path, sequence_length)
+        self.tokenizer.padding_side = "left"
+        if not self.tokenizer.pad_token:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        if not engine_kwargs:
+            engine_kwargs = {}
+        engine_kwargs["model_path"] = model_path
+
+        if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime":
+            internal_kv_cache = False
+
+        single_engine_operator = NLEngineOperator(
+            sequence_length=sequence_length,
+            internal_kv_cache=internal_kv_cache,
+            input_ids_length=1,
+            **engine_kwargs,
+        )
+
+        multi_engine_operator = NLEngineOperator(
+            sequence_length=sequence_length,
+            internal_kv_cache=internal_kv_cache,
+            input_ids_length=prompt_sequence_length,
+            **engine_kwargs,
+        )
+
+        # NOTE: Currently using pipeline state. Can swap to simply pass in the
+        # attributes to the specific Operators that need them, as class attributes.
+        pipeline_state_vals[
+            "onnx_input_names_no_cache"
+        ] = single_engine_operator.onnx_input_names_no_cache
+        pipeline_state_vals["cache_shape"] = single_engine_operator.cache_shape
+        pipeline_state_vals["output_names"] = single_engine_operator.output_names
+        pipeline_state_vals[
+            "kv_cache_data_type"
+        ] = single_engine_operator.kv_cache_data_type
+        pipeline_state.create_state(pipeline_state_vals)
+
+        process_inputs = ProcessInputsTextGeneration(
+            generation_config=process_generation_config(generation_config),
+            sequence_length=sequence_length,
+            tokenizer=self.tokenizer,
+        )
+
+        kv_cache_creator = KVCacheCreator(
+            sequence_length=sequence_length,
+            tokenizer=self.tokenizer,
+            prompt_sequence_length=prompt_sequence_length,
+            internal_kv_cache=internal_kv_cache,
+        )
+
+        # NOTE: Can also have the KVCacheCreator be initialized inside this Operator.
+        # Relies on pipeline state variables set up above (can be swapped to be class
+        # attributes instead of using the state).
+ engine_inputs_for_prefill = PrepareforPrefill(kv_cache_creator=kv_cache_creator) + + multi_engine_prefill = MultiEnginePrefill( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + compile_prompt_logits = CompilePromptLogits() + """ + prep_for_single_engine = PrepareforSingleEngine( + prompt_sequence_length=prompt_sequence_length, + sequence_length=sequence_length, + ) + """ + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, + ) + final_step = FinalStep() + + ops = { + "process_input": process_inputs, + "single_engine": single_engine_operator, + "multi_engine": multi_engine_operator, + "kv_cache_creator": kv_cache_creator, + "prepare_prefill": engine_inputs_for_prefill, + "multi_engine_prefill": multi_engine_prefill, + "compile_logits": compile_prompt_logits, + "autoregressive_preprocess": autoregressive_preprocess, + "final_step": final_step, + } + + routes = { + "process_input": "prepare_prefill", + "prepare_prefill": ["multi_engine_prefill", "autoregressive_preprocess"], + "multi_engine_prefill": "multi_engine", + "multi_engine": "compile_logits", + "compile_logits": [ + "multi_engine_prefill", + "autoregressive_preprocess", + "final_step", + ], + "autoregressive_preprocess": "single_engine", + "single_engine": "compile_logits", + "final_step": "STOP", + } + + router = GraphRouter( + end_route="STOP", start_route="process_input", route=routes + ) + scheduler = [OperatorScheduler()] + super().__init__( + ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state + ) + + # TODO: Move to be part of a generic transformers set-up Operator. + def setup_onnx_file_path(self, model_path, sequence_length) -> str: + import logging + + import transformers + from transformers import AutoTokenizer + + from deepsparse.transformers.helpers import get_deployment_path + + """ + Parses ONNX model from the `model_path` provided. It additionally + creates config and tokenizer objects from the `deployment path`, + derived from the `model_path` provided. + + :return: file path to the processed ONNX file for the engine to compile + """ + deployment_path, onnx_path = get_deployment_path(model_path) + + hf_logger = logging.getLogger("transformers") + hf_logger_level = hf_logger.level + hf_logger.setLevel(logging.ERROR) + self.config = transformers.PretrainedConfig.from_pretrained( + deployment_path, + finetuning_task=self.task if hasattr(self, "task") else None, + ) + hf_logger.setLevel(hf_logger_level) + + self._trust_remote_code = False + self.tokenizer = AutoTokenizer.from_pretrained( + deployment_path, + trust_remote_code=self._trust_remote_code, + model_max_length=sequence_length, + ) + + if not self.config or not self.tokenizer: + raise RuntimeError( + "Invalid config or tokenizer provided. Please provide " + "paths to the files or ensure they exist in the `model_path` provided. " + "See `tokenizer` and `config` arguments for details." + ) + return onnx_path + + +# NOTE: This is a dummy last step which will be removed. Used as a final step +# for the current routes. 
+class FinalStep(Operator):
+    def can_operate(self, *args, **kwargs):
+        return True
+
+    def run(self, *args, **kwargs):
+        import numpy
+
+        inference_state = kwargs.get("inference_state")
+        prompt_logits = inference_state.current_state.get("prompt_logits")
+        return numpy.concatenate(prompt_logits, axis=1)
diff --git a/src/deepsparse/v2/text_generation/prep_for_prefill.py b/src/deepsparse/v2/text_generation/prep_for_prefill.py
new file mode 100644
index 0000000000..2f9eb15797
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/prep_for_prefill.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any
+
+from deepsparse.v2.operators import Operator
+from deepsparse.v2.utils import PipelineState
+
+
+_LOGGER = logging.getLogger(__name__)
+
+__all__ = ["PrepareforPrefill"]
+
+
+class PrepareforPrefill(Operator):
+    def __init__(self, kv_cache_creator: Operator):
+        """
+        Operator before prefill. Responsible for creating the kv_cache based on engine
+        variables. Currently, this operator expects that the kv_cache_creator is
+        provided during initialization and then uses pipeline_state to run the
+        kv_cache_operator.
+        """
+        # NOTE: Alternatively, we can initialize the kv_cache_creator operator here,
+        # instead of at the pipeline level.
+        self.kv_cache_creator = kv_cache_creator
+
+        _LOGGER.warn(
+            "This operator requires the PipelineState to be set up with the "
+            "cache_shape, output_names, and kv_cache_data_type attributes set "
+            "from the NLEngineOperator"
+        )
+
+    def run(self, tokens: Any, pipeline_state: PipelineState, **kwargs):
+        # NOTE: Can potentially just be class attributes instead of relying on
+        # pipeline state.
+        cache_shape = pipeline_state.current_state.get("cache_shape")
+        data_type = pipeline_state.current_state.get("kv_cache_data_type")
+        output_names = pipeline_state.current_state.get("output_names")
+
+        kv_cache = self.kv_cache_creator.run(
+            cache_shape=cache_shape,
+            kv_cache_data_type=data_type,
+            output_names=output_names,
+        ).get("kv_cache")
+        return {"tokens": tokens, "kv_cache": kv_cache}
diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py
new file mode 100644
index 0000000000..528dcee0b7
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/process_inputs.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+from typing import Dict, Union
+
+import transformers
+
+from deepsparse.transformers.pipelines.text_generation import TextGenerationInput
+from deepsparse.transformers.utils.helpers import (
+    check_and_return_generation_config,
+    override_config,
+    repeat_inputs,
+)
+from deepsparse.v2.operators import Operator
+
+
+class GenerationDefaults:
+    num_return_sequences = 1
+    max_length = 1024
+    max_new_tokens = None
+    output_scores = False
+    top_k = 0
+    top_p = 0.0
+    repetition_penalty = 0.0
+    do_sample = False
+    temperature = 1.0
+
+
+__all__ = ["ProcessInputsTextGeneration"]
+
+
+class ProcessInputsTextGeneration(Operator):
+    """
+    Input processing operator. Responsible for tokenizing the input, handling the
+    generation_config (if provided), updating the inference_state for later use,
+    and returning the tokens for prompt inference. The expected input is defined by
+    the input_schema, which for this operator is TextGenerationInput.
+    """
+
+    input_schema = TextGenerationInput
+
+    def __init__(
+        self,
+        tokenizer: transformers.PreTrainedTokenizerBase,
+        generation_config: Union[
+            str, pathlib.Path, Dict, transformers.GenerationConfig
+        ],
+        sequence_length: int,
+    ):
+        self.generation_config = generation_config
+        self.tokenizer = tokenizer
+        self.sequence_length = sequence_length
+
+    def run(self, inp: TextGenerationInput, **kwargs):
+        generation_config = check_and_return_generation_config(
+            self.generation_config, inp.generation_config, GenerationDefaults()
+        )
+
+        generation_config = override_config(inp.generation_kwargs, generation_config)
+
+        original_inputs = inp.sequences
+        if generation_config.num_return_sequences > 1:
+            if isinstance(inp.sequences, str):
+                inp.sequences = [inp.sequences]
+            inp.sequences = repeat_inputs(
+                inp.sequences, generation_config.num_return_sequences
+            )
+
+        if inp.fixed_sequences_length:
+            # to enforce a fixed sequence length, we need to
+            # truncate the input to the maximum sequence length
+            # and/or pad it to the maximum sequence length
+            truncate, padding = True, "max_length"
+        else:
+            # otherwise, we do not need to truncate the input
+            # and we can pad it to the longest sequence
+            # in the batch (so that the engine can process multiple inputs
+            # at once)
+            truncate, padding = False, "longest"
+
+        input_tokens = self.tokenizer(
+            inp.sequences,
+            return_tensors="np",
+            max_length=self.sequence_length,
+            padding=padding,
+            truncation=truncate,
+        )
+
+        input_ids = input_tokens["input_ids"]
+        attention_mask = input_tokens["attention_mask"]
+
+        inference_state_update = dict(
+            prompts=original_inputs,
+            streaming=inp.streaming,
+            generation_config=generation_config,
+            include_prompt_logits=inp.include_prompt_logits,
+            callback=inp.callback,
+            stop=inp.stop,
+            top_p=generation_config.top_p,
+            top_k=generation_config.top_k,
+            presence_penalty=inp.presence_penalty,
+            frequency_penalty=generation_config.repetition_penalty,
+        )
+
+        # TODO: move this step to prep_for_prefill and add attention mask to the output
+        # this will allow us to split/join more easily when processing multiple prompts
+        # in parallel
+        tokens = input_ids[attention_mask.nonzero()].tolist()
+        return {"tokens": tokens}, inference_state_update
diff --git a/src/deepsparse/v2/utils/__init__.py b/src/deepsparse/v2/utils/__init__.py
index a36d8e92ec..358405d7af 100644
--- a/src/deepsparse/v2/utils/__init__.py
+++ b/src/deepsparse/v2/utils/__init__.py
@@ -13,5 +13,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from .state import *
 from .types import *
diff --git a/src/deepsparse/v2/utils/state.py b/src/deepsparse/v2/utils/state.py
new file mode 100644
index 0000000000..b54b890acf
--- /dev/null
+++ b/src/deepsparse/v2/utils/state.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from abc import ABC
+from typing import Any, Union
+
+
+__all__ = ["State", "PipelineState", "InferenceState"]
+
+
+class State(ABC):
+    """
+    Abstract class to store pipeline-level and inference-level state variables
+    which are generated by some Operator and required by some other Operator.
+    """
+
+    def __init__(self):
+        self._current_state = None
+
+    @property
+    def current_state(self):
+        return self._current_state
+
+
+class PipelineState(State):
+    """
+    Created during pipeline initialization. Pipeline state values are read-only
+    during inference.
+    """
+
+    def create_state(self, new_state: dict):
+        if self._current_state:
+            raise ValueError("State creation is only allowed during initialization.")
+        self._current_state = new_state
+
+
+class InferenceState(State):
+    """
+    Inference state, created during every inference run.
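+
+    Illustrative lifecycle (a sketch assuming the API defined in this module;
+    the values are made up):
+
+        state = InferenceState()
+        state.create_state({"generated_tokens": [0]})
+        state.update_value("generated_tokens", [0, 1])
+        state.update_state({"streaming": False})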
+    """
+
+    def create_state(self, new_state: dict):
+        if self._current_state:
+            warnings.warn("Current state already exists, overriding.")
+        self._current_state = new_state
+
+    def update_value(self, attribute: str, value: Union[str, int, list]):
+        # membership check (rather than truthiness) so falsy values such as
+        # empty lists remain valid, updatable state attributes
+        if attribute not in self._current_state:
+            raise ValueError(f"{attribute} is not a valid state attribute")
+        self._current_state[attribute] = value
+
+    def update_state(self, value: Any):
+        self._current_state.update(value)
diff --git a/tests/deepsparse/v2/test_basic_pipeline.py b/tests/deepsparse/v2/test_basic_pipeline.py
index 9f85e4976e..bedddd537a 100644
--- a/tests/deepsparse/v2/test_basic_pipeline.py
+++ b/tests/deepsparse/v2/test_basic_pipeline.py
@@ -34,7 +34,7 @@ class AddOneOperator(Operator):
     input_schema = IntSchema
     output_schema = IntSchema
 
-    def run(self, inp: IntSchema) -> Dict:
+    def run(self, inp: IntSchema, **kwargs) -> Dict:
         return {"value": inp.value + 1}
 
 
@@ -42,7 +42,7 @@ class AddTwoOperator(Operator):
     input_schema = IntSchema
     output_schema = IntSchema
 
-    def run(self, inp: IntSchema) -> Dict:
+    def run(self, inp: IntSchema, **kwargs) -> Dict:
         return {"value": inp.value + 2}
 

From 5ee36d6aca7c6d510765c433f6c31a1751bad58b Mon Sep 17 00:00:00 2001
From: Dipika Sikka
Date: Fri, 3 Nov 2023 11:15:00 -0400
Subject: [PATCH 5/7] [Pipeline Refactor] Additional Operators, Route update and completed generation functionality (#1356)

* initial functionality and working example with image classification

* remove testing image

* rebase fixes

* initial functionality and working example with image classification

* text gen

* updates func

* prompt inference, initial functionality

* remove image; update state docstring

* Fix typo

* add todo for split/join

* remove context, clean-up args, remove prefill_preprocess_operator

* fix docstrings

* initial functionality and working example with image classification

* updates func

* prompt inference, initial functionality

* finish generation operators and update routes

* further breakdown operators

* add operators

* fix can_operate condition

* update can_operate to not rely on the inference_state

* rebase + update

* fix condition

* fix capacity setting again

* typo fixes
---
 .../v2/operators/engine_operator.py           |   3 +
 src/deepsparse/v2/text_generation/__init__.py |   7 +
 .../autoregressive_preprocess_operator.py     |  20 ++-
 .../compile_generated_tokens.py               |  56 +++++++
 .../v2/text_generation/compile_generations.py |  55 +++++++
 .../v2/text_generation/compile_logits.py      |   6 +
 .../v2/text_generation/generate_new_token.py  |  90 +++++++++++
 .../multi_engine_prefill_operator.py          |   1 +
 .../v2/text_generation/nl_engine_operator.py  |   8 +-
 src/deepsparse/v2/text_generation/pipeline.py |  61 ++++----
 .../v2/text_generation/prep_for_generation.py | 140 ++++++++++++++++++
 .../v2/text_generation/process_inputs.py      |   2 +-
 .../v2/text_generation/process_outputs.py     |  88 +++++++++++
 .../v2/text_generation/token_generator.py     |  30 ++++
 14 files changed, 529 insertions(+), 38 deletions(-)
 create mode 100644 src/deepsparse/v2/text_generation/compile_generated_tokens.py
 create mode 100644 src/deepsparse/v2/text_generation/compile_generations.py
 create mode 100644 src/deepsparse/v2/text_generation/generate_new_token.py
 create mode 100644 src/deepsparse/v2/text_generation/prep_for_generation.py
 create mode 100644 src/deepsparse/v2/text_generation/process_outputs.py
 create mode 100644 src/deepsparse/v2/text_generation/token_generator.py

diff --git a/src/deepsparse/v2/operators/engine_operator.py
b/src/deepsparse/v2/operators/engine_operator.py index b7d920a686..c2fc562c63 100644 --- a/src/deepsparse/v2/operators/engine_operator.py +++ b/src/deepsparse/v2/operators/engine_operator.py @@ -87,6 +87,9 @@ def __init__( self._engine_args = engine_args self._engine_type = engine_type + if not engine_kwargs: + engine_kwargs = {} + self.engine = self.create_engine(**engine_kwargs) @property diff --git a/src/deepsparse/v2/text_generation/__init__.py b/src/deepsparse/v2/text_generation/__init__.py index 37ac88d02f..21cd7e2acd 100644 --- a/src/deepsparse/v2/text_generation/__init__.py +++ b/src/deepsparse/v2/text_generation/__init__.py @@ -13,12 +13,19 @@ # limitations under the License. # flake8: noqa from .autoregressive_preprocess_operator import * +from .compile_generated_tokens import * +from .compile_generations import * from .compile_logits import * +from .generate_new_token import * from .kv_cache_operator import * from .multi_engine_prefill_operator import * from .nl_engine_operator import * from .prep_for_prefill import * from .process_inputs import * +from .process_outputs import * +from .token_generator import * # isort:skip +from .prep_for_generation import * # isort:skip + from .pipeline import * # isort:skip diff --git a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py index cfe7cb531b..6e97412e43 100644 --- a/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py +++ b/src/deepsparse/v2/text_generation/autoregressive_preprocess_operator.py @@ -36,7 +36,6 @@ def __init__(self, sequence_length: int, prompt_sequence_length: int): """ self.sequence_length = sequence_length self.prompt_sequence_length = prompt_sequence_length - self.set_capacity = False _LOGGER.warn( "This operator requires the PipelineState to be set-up with the " @@ -51,16 +50,19 @@ def can_operate(self, inp: Any) -> bool: tokens = inp.get("tokens") kv_cache = inp.get("kv_cache") + if inp.get("in_generation"): + return True + remaining_tokens = len(tokens) - kv_cache.total_num_processed_tokens - if remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length: + can_process = ( + remaining_tokens > 0 and remaining_tokens < self.prompt_sequence_length + ) + if can_process and inp.get("in_generation") is None: return True return False def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): - - if not self.set_capacity: - self.set_capacity = True - kv_cache.set_capacity(self.sequence_length - 1) + kv_cache.set_capacity(self.sequence_length - 1) num_total_processed_tokens = kv_cache.total_num_processed_tokens new_token = tokens[num_total_processed_tokens] @@ -88,13 +90,9 @@ def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwarg engine_inputs = [engine_inputs_map[name] for name in engine_input_names] - onnx_input_names_no_cache = pipeline_state.current_state.get( - "onnx_input_names_no_cache" - ) - engine_inputs = [engine_inputs_map[name] for name in onnx_input_names_no_cache] - return { "engine_inputs": engine_inputs, "kv_cache": kv_cache, "tokens": tokens, + "in_generation": kwargs.get("in_generation"), } diff --git a/src/deepsparse/v2/text_generation/compile_generated_tokens.py b/src/deepsparse/v2/text_generation/compile_generated_tokens.py new file mode 100644 index 0000000000..c87436ab3a --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generated_tokens.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGeneratedTokens"] + + +class CompileGeneratedTokens(Operator): + def run( + self, + new_token, + logits, + finish_reason, + kv_cache, + tokens, + inference_state: InferenceState, + **kwargs, + ): + in_generation = True + + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + generated_tokens.append(new_token) + generated_logits.append(logits) + finished_reason.append(finish_reason) + + if finish_reason is not None: + in_generation = False + + state_update = { # TODO: check if necessary + "finished_reason": finished_reason, + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + } + + output = { + "tokens": tokens, + "kv_cache": kv_cache, + "in_generation": in_generation, + } + return output, state_update diff --git a/src/deepsparse/v2/text_generation/compile_generations.py b/src/deepsparse/v2/text_generation/compile_generations.py new file mode 100644 index 0000000000..ed8297ac01 --- /dev/null +++ b/src/deepsparse/v2/text_generation/compile_generations.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
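+
+# Illustrative sketch (the shapes are assumed; a vocab size of 32 is made up):
+# the CompileGenerations operator below concatenates per-step generated logits
+# along the sequence axis, e.g.:
+#
+#   import numpy
+#   steps = [numpy.zeros((1, 1, 32)), numpy.zeros((1, 1, 32))]
+#   numpy.concatenate(steps, axis=1).shape  # -> (1, 2, 32)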
+from typing import Any + +import numpy +from pydantic import BaseModel, Field + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["CompileGenerations", "CompileGenerationsOutput"] + + +class CompileGenerationsOutput(BaseModel): + generated_tokens: Any = Field(description="generated_tokens") + generated_logits: Any = Field(description="generated_logits") + finished_reason: Any = Field(description="finished_reason") + + +class CompileGenerations(Operator): + output_schema = CompileGenerationsOutput + + def can_operate(self, inp: Any): + if inp.get("in_generation") is False: + return True + return False + + def run(self, inference_state: InferenceState, **kwargs): + generated_tokens = inference_state.current_state.get("generated_tokens") + generated_logits = inference_state.current_state.get("generated_logits") + finished_reason = inference_state.current_state.get("finished_reason") + + if len(finished_reason) == 0: + finished_reason.append(FinishReason.LENGTH) + + generated_tokens = numpy.array([generated_tokens]) + generated_logits = numpy.concatenate(generated_logits, axis=1) + return { + "generated_tokens": generated_tokens, + "generated_logits": generated_logits, + "finished_reason": finished_reason, + } diff --git a/src/deepsparse/v2/text_generation/compile_logits.py b/src/deepsparse/v2/text_generation/compile_logits.py index 55c87d791d..21bd50e03e 100644 --- a/src/deepsparse/v2/text_generation/compile_logits.py +++ b/src/deepsparse/v2/text_generation/compile_logits.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Any from deepsparse.v2.operators import Operator from deepsparse.v2.utils import InferenceState @@ -27,6 +28,11 @@ class CompilePromptLogits(Operator): take prompt logits from each iteration run and update the inference state. """ + def can_operate(self, inp: Any): + if inp.get("in_generation") is None: + return True + return False + def run(self, logits, inference_state: InferenceState, **kwargs): logit_type = "prompt_logits" diff --git a/src/deepsparse/v2/text_generation/generate_new_token.py b/src/deepsparse/v2/text_generation/generate_new_token.py new file mode 100644 index 0000000000..33ab546e39 --- /dev/null +++ b/src/deepsparse/v2/text_generation/generate_new_token.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
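+
+# Illustrative note (a sketch of the stop-token normalization used below):
+# non-whitespace decodings are stripped before comparison, e.g.
+#
+#   decoded = " stop "
+#   decoded if decoded.isspace() else decoded.strip()  # -> "stop"
+#
+# so stop=["stop"] matches, while a pure-whitespace decoding such as "\n"
+# is compared as-is.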
+from typing import Any, Sequence, Union + +import transformers + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["GenerateNewTokenOperator"] + + +class GenerateNewTokenOperator(Operator): + def __init__( + self, tokenizer: transformers.PreTrainedTokenizerBase, force_max_tokens: bool + ): + self.force_max_tokens = force_max_tokens + self.tokenizer = tokenizer + + def can_operate(self, inp: Any): + if inp.get("in_generation"): + return True + return False + + def run(self, logits, kv_cache, inference_state: InferenceState, **kwargs): + token_generator = inference_state.current_state.get("token_generator") + token = token_generator.generate(logits=logits[0, -1, :]) + finish_reason = None + + callback = inference_state.current_state.get("callback") + stop = inference_state.current_state.get("stop") + + if token == self.tokenizer.eos_token_id and not self.force_max_tokens: + finish_reason = FinishReason.STOP + + if self._stop_token_generated(token, stop_tokens=stop): + print( + "Stop token %s generated. Stopping generation." + % self.tokenizer.decode(token) + ) + finish_reason = FinishReason.STOP + + if callback is not None and callback(token) is False: + print( + "callback %s returned False, stopping generation." + % callback.__qualname__ + ) + finish_reason = FinishReason.CALLBACK + + max_tokens = inference_state.current_state.get("max_tokens") + if len(inference_state.current_state.get("generated_tokens")) + 1 == max_tokens: + finish_reason = inference_state.current_state.get("length_finish_reason") + + state_update = { + "token_generator": token_generator, + } + + new_generation = { + "logits": logits, + "new_token": token, + "finish_reason": finish_reason, + } + output = {"tokens": token_generator.tokens, "kv_cache": kv_cache} + output.update(new_generation) + return output, state_update + + def _stop_token_generated( + self, token, stop_tokens: Union[None, str, Sequence[str]] + ) -> bool: + if stop_tokens is None: + return False + + decoded_token = self.tokenizer.decode(token) + decoded_token = ( + decoded_token if decoded_token.isspace() else decoded_token.strip() + ) + return decoded_token in stop_tokens diff --git a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py index 41ee830a8a..9a885c2355 100644 --- a/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py +++ b/src/deepsparse/v2/text_generation/multi_engine_prefill_operator.py @@ -97,6 +97,7 @@ def _case_positions(self, num_total_processed_tokens: int): ) def run(self, tokens: Any, kv_cache: Any, pipeline_state: PipelineState, **kwargs): + kv_cache.set_capacity(self.sequence_length - self.prompt_sequence_length) onnx_input_names_no_cache = pipeline_state.current_state.get( "onnx_input_names_no_cache" diff --git a/src/deepsparse/v2/text_generation/nl_engine_operator.py b/src/deepsparse/v2/text_generation/nl_engine_operator.py index 6c1ad1966e..0bd9098a40 100644 --- a/src/deepsparse/v2/text_generation/nl_engine_operator.py +++ b/src/deepsparse/v2/text_generation/nl_engine_operator.py @@ -36,6 +36,7 @@ class NlEngineInput(BaseModel): engine_inputs: List = Field(description="engine inputs") kv_cache: Any = Field(description="kv_cache object") tokens: List = Field(description="tokens") + in_generation: bool = Field(description="in_generation", default=None) class NLEngineOperator(EngineOperator): @@ 
-119,7 +120,12 @@ def run(self, inp: NlEngineInput, **kwargs) -> Any: kv_cache=kv_cache, ) - output = {"logits": logits, "kv_cache": kv_cache, "tokens": inp.tokens} + output = { + "logits": logits, + "kv_cache": kv_cache, + "tokens": inp.tokens, + "in_generation": inp.in_generation, + } return output def _add_kv_cache_to_input(self, engine_input, kv_cache): diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 9878aa0061..49826b8af7 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -15,18 +15,23 @@ from typing import Dict from deepsparse.transformers.utils.helpers import process_generation_config -from deepsparse.v2.operators import Operator from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter from deepsparse.v2.schedulers import OperatorScheduler from deepsparse.v2.text_generation import ( AutoRegressiveOperatorPreprocess, + CompileGeneratedTokens, + CompileGenerations, CompilePromptLogits, + GenerateNewTokenOperator, KVCacheCreator, MultiEnginePrefill, NLEngineOperator, PrepareforPrefill, + PrepareGeneration, ProcessInputsTextGeneration, + ProcessOutputs, + TokenGeneratorOperator, ) from deepsparse.v2.utils import PipelineState @@ -109,17 +114,23 @@ def __init__( sequence_length=sequence_length, ) compile_prompt_logits = CompilePromptLogits() - """ - prep_for_single_engine = PrepareforSingleEngine( - prompt_sequence_length=prompt_sequence_length, + + autoregressive_preprocess = AutoRegressiveOperatorPreprocess( sequence_length=sequence_length, + prompt_sequence_length=prompt_sequence_length, ) - """ - autoregressive_preprocess = AutoRegressiveOperatorPreprocess( + token_generator = TokenGeneratorOperator() + prep_for_generation = PrepareGeneration( sequence_length=sequence_length, prompt_sequence_length=prompt_sequence_length, + token_generator=token_generator, + ) + generate_new_token = GenerateNewTokenOperator( + tokenizer=self.tokenizer, force_max_tokens=force_max_tokens ) - final_step = FinalStep() + process_output = ProcessOutputs(tokenizer=self.tokenizer) + compile_generations = CompileGenerations() + compile_generated_tokens = CompileGeneratedTokens() ops = { "process_input": process_inputs, @@ -130,7 +141,11 @@ def __init__( "multi_engine_prefill": multi_engine_prefill, "compile_logits": compile_prompt_logits, "autoregressive_preprocess": autoregressive_preprocess, - "final_step": final_step, + "prep_for_generation": prep_for_generation, + "generate_new_token": generate_new_token, + "process_outputs": process_output, + "compile_generations": compile_generations, + "compile_generated_tokens": compile_generated_tokens, } routes = { @@ -140,12 +155,22 @@ def __init__( "multi_engine": "compile_logits", "compile_logits": [ "multi_engine_prefill", + "prep_for_generation", "autoregressive_preprocess", - "final_step", ], "autoregressive_preprocess": "single_engine", - "single_engine": "compile_logits", - "final_step": "STOP", + "single_engine": [ + "compile_logits", + "generate_new_token", + ], + "prep_for_generation": "autoregressive_preprocess", + "generate_new_token": "compile_generated_tokens", + "compile_generated_tokens": [ + "autoregressive_preprocess", + "compile_generations", + ], + "compile_generations": "process_outputs", + "process_outputs": "STOP", } router = GraphRouter( @@ -197,17 +222,3 @@ def setup_onnx_file_path(self, model_path, sequence_length) -> str: "See `tokenizer` and `config` arguments for details." 
) return onnx_path - - -# NOTE: This is a dummy last step which will be removed. Used as a final step -# for the current routes. -class FinalStep(Operator): - def can_operate(self, *args, **kwargs): - return True - - def run(self, *args, **kwargs): - import numpy - - inference_state = kwargs.get("inference_state") - prompt_logits = inference_state.current_state.get("prompt_logits") - return numpy.concatenate(prompt_logits, axis=1) diff --git a/src/deepsparse/v2/text_generation/prep_for_generation.py b/src/deepsparse/v2/text_generation/prep_for_generation.py new file mode 100644 index 0000000000..544af43980 --- /dev/null +++ b/src/deepsparse/v2/text_generation/prep_for_generation.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any + +import numpy + +from deepsparse.transformers.pipelines.text_generation import FinishReason +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation import TokenGeneratorOperator +from deepsparse.v2.utils import InferenceState + + +__all__ = ["PrepareGeneration"] + + +class PrepareGeneration(Operator): + def __init__( + self, + token_generator: TokenGeneratorOperator, + prompt_sequence_length: int, + sequence_length: int, + ): + self.prompt_sequence_length = prompt_sequence_length + self.sequence_length = sequence_length + self.token_generator_creator = token_generator + + def can_operate(self, inp: Any): + kv_cache = inp.get("kv_cache") + tokens = inp.get("tokens") + + # If the number of prompt tokens is greater than what we've processed, + # don't start generation. Should be equal when started as all prompt logits + # should be accounted for and we should have updated the kv_cache for the single + # token engine. + if len(tokens) == kv_cache.total_num_processed_tokens: + return True + return False + + @staticmethod + def set_generated_length( + max_length: int, + prompt_tokens_length: int, + sequence_length: int, + prompt_sequence_length: int, + max_new_tokens: int, + finish_reason_choices: "FinishReason", # noqa + ): + """ + Determine the length of the generated tokens. The hard cap on the total number + of tokens is based on the sequence length. If max_length is provided and is less + than the sequence length, it will be used to cap the total number of tokens + generated. If it is not provided, the max_new_tokens attribute will be used and + also capped by the sequence length. 
+
+        :param max_length: max_length attribute, provided as input during inference
+        :param prompt_tokens_length: the number of prompt tokens used as part of the
+            generated output
+        :param sequence_length: the sequence length used for the pipeline
+        :param prompt_sequence_length: the prompt sequence length used for the pipeline
+        :param max_new_tokens: the max_new_tokens attribute, which may be provided
+            as part of the input during inference
+        """
+        if max_length:
+            # if max_length provided, use that to cap total tokens generated
+            max_tokens = max_length
+            finish_reason = finish_reason_choices.LENGTH
+        else:
+            # if not provided, max tokens is based on max_new_tokens + prompt tokens
+            max_tokens = (
+                min(max_new_tokens, sequence_length - prompt_sequence_length)
+                + prompt_tokens_length
+            )
+            finish_reason = finish_reason_choices.MAX_NEW_TOKENS
+
+        # hard model/pipeline cap
+        return (
+            (sequence_length, finish_reason_choices.CAPACITY)
+            if sequence_length < max_tokens
+            else (max_tokens, finish_reason)
+        )
+
+    def run(
+        self, tokens: Any, kv_cache: Any, inference_state: InferenceState, **kwargs
+    ):
+        prompt_logits = inference_state.current_state.get("prompt_logits")
+        prompt_logits = numpy.concatenate(prompt_logits, axis=1)
+        # TODO: clean this up so we don't have to keep writing current_state
+        # everywhere
+
+        generation_config = inference_state.current_state.get("generation_config")
+        include_prompt_logits = inference_state.current_state.get(
+            "include_prompt_logits"
+        )
+
+        token_generator_creator_output = self.token_generator_creator.run(
+            logits_shape=prompt_logits[0, -1, :].shape,
+            deterministic=not generation_config.do_sample,
+            sampling_temperature=generation_config.temperature,
+            tokens=tokens,
+            **inference_state.current_state,
+        )
+        token_generator = token_generator_creator_output.get("token_generator")
+        token_generator.generate(prompt_logits[0, -1, :])
+
+        max_tokens, length_finish_reason = PrepareGeneration.set_generated_length(
+            max_length=generation_config.max_length,
+            prompt_tokens_length=1,
+            max_new_tokens=generation_config.max_new_tokens,
+            sequence_length=self.sequence_length,
+            prompt_sequence_length=self.prompt_sequence_length,
+            finish_reason_choices=FinishReason,
+        )
+        state_update = {
+            "max_tokens": max_tokens,
+            "length_finish_reason": length_finish_reason,
+            "generated_tokens": [token_generator.tokens[-1]],
+            "generated_logits": [prompt_logits]
+            if include_prompt_logits
+            else [numpy.expand_dims(prompt_logits[:, -1, :], 0)],
+            "finished_reason": [],
+            "token_generator": token_generator,
+        }
+
+        output = {
+            "tokens": token_generator.tokens,
+            "kv_cache": kv_cache,
+            "in_generation": True,
+        }
+        return output, state_update
diff --git a/src/deepsparse/v2/text_generation/process_inputs.py b/src/deepsparse/v2/text_generation/process_inputs.py
index 528dcee0b7..e57e402983 100644
--- a/src/deepsparse/v2/text_generation/process_inputs.py
+++ b/src/deepsparse/v2/text_generation/process_inputs.py
@@ -28,7 +28,7 @@
 
 class GenerationDefaults:
     num_return_sequences = 1
-    max_length = 1024
+    max_length = 100
     max_new_tokens = None
     output_scores = False
     top_k = 0
diff --git a/src/deepsparse/v2/text_generation/process_outputs.py b/src/deepsparse/v2/text_generation/process_outputs.py
new file mode 100644
index 0000000000..ca1cf78521
--- /dev/null
+++ b/src/deepsparse/v2/text_generation/process_outputs.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +from typing import Optional + +import numpy + +from deepsparse.transformers.pipelines.text_generation import ( + FinishReason, + GeneratedText, + TextGenerationOutput, +) +from deepsparse.v2.operators import Operator +from deepsparse.v2.text_generation.compile_generations import CompileGenerationsOutput +from deepsparse.v2.utils import InferenceState + + +class ProcessOutputs(Operator): + output_schema = TextGenerationOutput + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def _create_generated_text_output( + self, + sequence: str, + finish_reason: Optional[FinishReason] = None, + logits: Optional[numpy.array] = None, + ): + if finish_reason: + return GeneratedText( + text=sequence, + score=logits, + finished=True, + finished_reason=finish_reason.value, + ) + return GeneratedText( + text=sequence, + score=logits, + finished=False, + ) + + def run( + self, inp: CompileGenerationsOutput, inference_state: InferenceState, **kwargs + ): + generation_config = inference_state.current_state.get("generation_config") + generated_tokens = inp.generated_tokens + generated_logits = ( + inp.generated_logits if generation_config.output_scores else None + ) + finished_reason = inp.finished_reason + sequences = self.tokenizer.batch_decode( + generated_tokens, skip_special_tokens=True + ) + + finished_reason = [f for f in finished_reason if f] + + if generated_logits is not None: + generations = list( + map( + self._create_generated_text_output, + sequences, + finished_reason, + generated_logits, + ) + ) + else: + generations = list( + map(self._create_generated_text_output, sequences, finished_reason) + ) + outputs = dict( + created=datetime.datetime.now(), + prompts=inference_state.current_state.get("prompts"), + generations=generations, + ) + + return outputs diff --git a/src/deepsparse/v2/text_generation/token_generator.py b/src/deepsparse/v2/text_generation/token_generator.py new file mode 100644 index 0000000000..9148d71cc8 --- /dev/null +++ b/src/deepsparse/v2/text_generation/token_generator.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
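+
+# Illustrative usage sketch (the shapes and values below are made up):
+#
+#   op = TokenGeneratorOperator()
+#   out = op.run(logits_shape=(32,), deterministic=True, tokens=[1, 2],
+#                sampling_temperature=1.0)
+#   # out["token_generator"].generate(logits=...) then yields the next token
+#   # id, as done in GenerateNewTokenOperator above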
+from deepsparse.transformers.utils.token_generator import TokenGenerator +from deepsparse.v2.operators import Operator + + +__all__ = ["TokenGeneratorOperator"] + + +class TokenGeneratorOperator(Operator): + def run(self, logits_shape, deterministic, tokens, sampling_temperature, **kwargs): + token_generator = TokenGenerator( + logits_shape=logits_shape, + deterministic=deterministic, + tokens=tokens, + sampling_temperature=sampling_temperature, + **kwargs, + ) + return {"token_generator": token_generator} From 7f3eb12e25ff8ee57caa55c9112e3ed9c610026b Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 16:10:18 +0000 Subject: [PATCH 6/7] initial commit --- src/deepsparse/transformers/helpers.py | 38 ++++++----- src/deepsparse/utils/onnx.py | 8 +-- src/deepsparse/v2/text_generation/pipeline.py | 68 ++++--------------- 3 files changed, 36 insertions(+), 78 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d5fc5ed438..1e733ec1bb 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -30,6 +30,8 @@ from onnx import ModelProto from deepsparse.log import get_main_logger +from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model +from sparsezoo import Model from deepsparse.utils.onnx import ( _MODEL_DIR_ONNX_NAME, model_to_path, @@ -39,6 +41,7 @@ __all__ = [ + "setup_transformers_pipeline", "get_deployment_path", "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", @@ -55,6 +58,7 @@ def setup_transformers_pipeline( sequence_length: int, tokenizer_padding_side: str = "left", engine_kwargs: Optional[Dict] = None, + onnx_model_name: Optional[str] = None, ) -> Tuple[ str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer, Dict[str, Any] ]: @@ -66,30 +70,27 @@ def setup_transformers_pipeline( :param tokenizer_padding_side: The side to pad on for the tokenizer, either "left" or "right" :param engine_kwargs: The kwargs to pass to the engine + :param onnx_model_name: The name of the onnx model to be loaded. + If not specified, defaults are used (see setup_onnx_file_path) :return The model path, config, tokenizer, and engine kwargs """ - model_path, config, tokenizer = fetch_onnx_file_path(model_path, sequence_length) + model_path, config, tokenizer = setup_onnx_file_path( + model_path, sequence_length, onnx_model_name + ) tokenizer.padding_side = tokenizer_padding_side if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token engine_kwargs = engine_kwargs or {} - if engine_kwargs.get("model_path"): - raise ValueError( - "The engine kwargs already specify " - f"a model path: {engine_kwargs['model_path']}, " - f"but a model path was also provided: {model_path}. " - "Please only provide one." - ) engine_kwargs["model_path"] = model_path return model_path, config, tokenizer, engine_kwargs -def fetch_onnx_file_path( +def setup_onnx_file_path( model_path: str, sequence_length: int, - task: Optional[str] = None, + onnx_model_name: Optional[str] = None, ) -> Tuple[str, transformers.PretrainedConfig, transformers.PreTrainedTokenizer]: """ Parses ONNX model from the `model_path` provided. It additionally @@ -97,17 +98,18 @@ def fetch_onnx_file_path( derived from the `model_path` provided. :param model_path: path to the model to be parsed :param sequence_length: maximum sequence length of the model + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. 
If not specified, the default ONNX model + name will be used (refer to `get_deployment_path` for details) :return: file path to the processed ONNX file for the engine to compile """ - deployment_path, onnx_path = get_deployment_path(model_path) + deployment_path, onnx_path = get_deployment_path(model_path, onnx_model_name) hf_logger = logging.getLogger("transformers") hf_logger_level = hf_logger.level hf_logger.setLevel(logging.ERROR) - config = transformers.PretrainedConfig.from_pretrained( - deployment_path, finetuning_task=task - ) + config = transformers.PretrainedConfig.from_pretrained(deployment_path) hf_logger.setLevel(hf_logger_level) trust_remote_code = False @@ -145,13 +147,13 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if MODEL_ONNX_NAME not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{MODEL_ONNX_NAME} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, MODEL_ONNX_NAME)}" ) - return model_path, os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + return model_path, os.path.join(model_path, MODEL_ONNX_NAME) elif model_path.startswith("zoo:") or model_path.startswith("hf:"): onnx_model_path = model_to_path(model_path) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index ae0913ffd7..35d932c75d 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -56,12 +56,12 @@ "has_model_kv_cache", "CACHE_INPUT_PREFIX", "CACHE_OUTPUT_PREFIX", - "_MODEL_DIR_ONNX_NAME", + "MODEL_ONNX_NAME", ] _LOGGER = logging.getLogger(__name__) -_MODEL_DIR_ONNX_NAME = "model.onnx" +MODEL_ONNX_NAME = "model.onnx" CACHE_INPUT_PREFIX = "past_key_values" CACHE_OUTPUT_PREFIX = "present" @@ -132,7 +132,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model.deployment.path # default to the main onnx file for the model - model = model.deployment.get_file(_MODEL_DIR_ONNX_NAME).path + model = model.deployment.get_file(MODEL_ONNX_NAME).path elif File is not object and isinstance(model, File): # get the downloaded_path -- will auto download if not on local system @@ -161,7 +161,7 @@ def model_to_path(model: Union[str, Model, File]) -> str: model_path = Path(model) if model_path.is_dir(): - return str(model_path / _MODEL_DIR_ONNX_NAME) + return str(model_path / MODEL_ONNX_NAME) return model diff --git a/src/deepsparse/v2/text_generation/pipeline.py b/src/deepsparse/v2/text_generation/pipeline.py index 49826b8af7..fdb31f1c6c 100644 --- a/src/deepsparse/v2/text_generation/pipeline.py +++ b/src/deepsparse/v2/text_generation/pipeline.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Dict +from typing import Dict, Optional +from deepsparse.transformers.helpers import setup_transformers_pipeline from deepsparse.transformers.utils.helpers import process_generation_config from deepsparse.v2.pipeline import Pipeline from deepsparse.v2.routers import GraphRouter @@ -45,23 +46,20 @@ def __init__( internal_kv_cache: bool = True, force_max_tokens: bool = False, generation_config=None, - engine_kwargs: Dict = None, + engine_kwargs: Optional[Dict] = None, ): + ( + self.model_path, + self.config, + self.tokenizer, + engine_kwargs, + ) = setup_transformers_pipeline( + model_path, sequence_length, engine_kwargs=engine_kwargs + ) pipeline_state = PipelineState() pipeline_state_vals = {} - # TODO: The code below will be replaced with a transformers set-up Operator. - self.tokenizer = None - model_path = self.setup_onnx_file_path(model_path, sequence_length) - self.tokenizer.padding_side = "left" - if not self.tokenizer.pad_token: - self.tokenizer.pad_token = self.tokenizer.eos_token - - if not engine_kwargs: - engine_kwargs = {} - engine_kwargs["model_path"] = model_path - if internal_kv_cache and engine_kwargs.get("engine_type") == "onnxruntime": internal_kv_cache = False @@ -80,7 +78,7 @@ def __init__( ) # NOTE: Currently using pipeline state. Can swap to simply pass in the - # attributes to the specific Operator that neeed them, as class attributes. + # attributes to the specific Operator that need them, as class attributes. pipeline_state_vals[ "onnx_input_names_no_cache" ] = single_engine_operator.onnx_input_names_no_cache @@ -180,45 +178,3 @@ def __init__( super().__init__( ops=ops, router=router, schedulers=scheduler, pipeline_state=pipeline_state ) - - # TODO: Move to be part of a generic transformers set-up Operator. - def setup_onnx_file_path(self, model_path, sequence_length) -> str: - import logging - - import transformers - from transformers import AutoTokenizer - - from deepsparse.transformers.helpers import get_deployment_path - - """ - Parses ONNX model from the `model_path` provided. It additionally - creates config and tokenizer objects from the `deployment path`, - derived from the `model_path` provided. - - :return: file path to the processed ONNX file for the engine to compile - """ - deployment_path, onnx_path = get_deployment_path(model_path) - - hf_logger = logging.getLogger("transformers") - hf_logger_level = hf_logger.level - hf_logger.setLevel(logging.ERROR) - self.config = transformers.PretrainedConfig.from_pretrained( - deployment_path, - finetuning_task=self.task if hasattr(self, "task") else None, - ) - hf_logger.setLevel(hf_logger_level) - - self._trust_remote_code = False - self.tokenizer = AutoTokenizer.from_pretrained( - deployment_path, - trust_remote_code=self._trust_remote_code, - model_max_length=sequence_length, - ) - - if not self.config or not self.tokenizer: - raise RuntimeError( - "Invalid config or tokenizer provided. Please provide " - "paths to the files or ensure they exist in the `model_path` provided. " - "See `tokenizer` and `config` arguments for details." 
- ) - return onnx_path From 0901a01dea3e09856df98bf6605cb18d92b3f19f Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Mon, 18 Dec 2023 16:15:41 +0000 Subject: [PATCH 7/7] ready for reviews --- src/deepsparse/transformers/helpers.py | 25 ++++++++++++------------- src/deepsparse/utils/onnx.py | 4 ++-- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 1e733ec1bb..25a179c5d0 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -30,18 +30,11 @@ from onnx import ModelProto from deepsparse.log import get_main_logger -from deepsparse.utils.onnx import MODEL_ONNX_NAME, truncate_onnx_model -from sparsezoo import Model -from deepsparse.utils.onnx import ( - _MODEL_DIR_ONNX_NAME, - model_to_path, - truncate_onnx_model, -) +from deepsparse.utils.onnx import MODEL_ONNX_NAME, model_to_path, truncate_onnx_model from sparsezoo.utils import save_onnx __all__ = [ - "setup_transformers_pipeline", "get_deployment_path", "setup_transformers_pipeline", "overwrite_transformer_onnx_model_inputs", @@ -128,7 +121,9 @@ def setup_onnx_file_path( return onnx_path, config, tokenizer -def get_deployment_path(model_path: str) -> Tuple[str, str]: +def get_deployment_path( + model_path: str, onnx_model_name: Optional[str] = None +) -> Tuple[str, str]: """ Returns the path to the deployment directory for the given model path and the path to the mandatory @@ -137,9 +132,13 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: for running the transformers model in the deepsparse pipeline :param model_path: path to model directory, sparsezoo stub, or ONNX file + :param onnx_model_name: optionally, the precise name of the ONNX model + of interest may be specified. If not specified, the default ONNX model + name will be used. :return: path to the deployment directory and path to the ONNX file inside the deployment directory """ + onnx_model_name = onnx_model_name or MODEL_ONNX_NAME if os.path.isfile(model_path): # return the parent directory of the ONNX file return os.path.dirname(model_path), model_path @@ -147,13 +146,13 @@ def get_deployment_path(model_path: str) -> Tuple[str, str]: if os.path.isdir(model_path): model_files = os.listdir(model_path) - if MODEL_ONNX_NAME not in model_files: + if onnx_model_name not in model_files: raise ValueError( - f"{MODEL_ONNX_NAME} not found in transformers model directory " + f"{onnx_model_name} not found in transformers model directory " f"{model_path}. 
Be sure that an export of the model is written to " - f"{os.path.join(model_path, MODEL_ONNX_NAME)}" + f"{os.path.join(model_path, onnx_model_name)}" ) - return model_path, os.path.join(model_path, MODEL_ONNX_NAME) + return model_path, os.path.join(model_path, onnx_model_name) elif model_path.startswith("zoo:") or model_path.startswith("hf:"): onnx_model_path = model_to_path(model_path) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 35d932c75d..e4b41f3286 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -143,10 +143,10 @@ def model_to_path(model: Union[str, Model, File]) -> str: from huggingface_hub import snapshot_download deployment_path = snapshot_download(repo_id=model.replace("hf:", "", 1)) - onnx_path = os.path.join(deployment_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(deployment_path, MODEL_ONNX_NAME) if not os.path.isfile(onnx_path): raise ValueError( - f"Could not find the ONNX model file '{_MODEL_DIR_ONNX_NAME}' in the " + f"Could not find the ONNX model file '{MODEL_ONNX_NAME}' in the " f"Hugging Face Hub repository located at {deployment_path}. Please " f"ensure the model has been correctly exported to ONNX format and " f"exists in the repository."
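+# Illustrative usage (a sketch; the stub below is hypothetical):
+#
+#   model_to_path("hf:org/repo")
+#   # downloads a snapshot of the Hugging Face repo and returns
+#   # <snapshot_dir>/model.onnx, raising the ValueError above when the
+#   # ONNX file is absent from the snapshot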