src/python/library/tritonclient/http/__init__.py

# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

try:
    from geventhttpclient import HTTPClient
    from geventhttpclient.url import URL
    import gevent
    import gevent.pool
    from urllib.parse import quote, quote_plus
    import rapidjson as json
    import numpy as np
    import struct
    import gzip, zlib
    import base64
except ModuleNotFoundError as error:
    raise RuntimeError(
        'The installation does not include http support. Specify \'http\' or \'all\' while installing the tritonclient package to include the support'
    ) from error

from tritonclient.utils import *


def _get_error(response):
    """
    Returns the InferenceServerException object if response
    indicates the error. If no error then return None
    """
    if response.status_code != 200:
        error_response = json.loads(response.read())
        return InferenceServerException(msg=error_response["error"])
    else:
        return None


def _raise_if_error(response):
    """
    Raise InferenceServerException if received non-Success
    response from the server
    """
    error = _get_error(response)
    if error is not None:
        raise error


def _get_query_string(query_params):
    params = []
    for key, value in query_params.items():
        if isinstance(value, list):
            for item in value:
                params.append("%s=%s" %
                              (quote_plus(key), quote_plus(str(item))))
        else:
            params.append("%s=%s" % (quote_plus(key), quote_plus(str(value))))
    if params:
        return "&".join(params)
    return ''


def _get_inference_request(inputs, request_id, outputs, sequence_id,
                           sequence_start, sequence_end, priority, timeout):
    infer_request = {}
    parameters = {}
    if request_id != "":
        infer_request['id'] = request_id
    if sequence_id != 0 and sequence_id != "":
        parameters['sequence_id'] = sequence_id
        parameters['sequence_start'] = sequence_start
        parameters['sequence_end'] = sequence_end
    if priority != 0:
        parameters['priority'] = priority
    if timeout is not None:
        parameters['timeout'] = timeout

    infer_request['inputs'] = [
        this_input._get_tensor() for this_input in inputs
    ]
    if outputs:
        infer_request['outputs'] = [
            this_output._get_tensor() for this_output in outputs
        ]
    else:
        # no outputs specified so set 'binary_data_output' True in the
        # request so that all outputs are returned in binary format
        parameters['binary_data_output'] = True

    if parameters:
        infer_request['parameters'] = parameters

    request_body = json.dumps(infer_request)
    json_size = len(request_body)
    binary_data = None
    for input_tensor in inputs:
        raw_data = input_tensor._get_binary_data()
        if raw_data is not None:
            if binary_data is not None:
                binary_data += raw_data
            else:
                binary_data = raw_data

    if binary_data is not None:
        request_body = struct.pack(
            '{}s{}s'.format(len(request_body), len(binary_data)),
            request_body.encode(), binary_data)
        return request_body, json_size

    return request_body, None


class InferenceServerClient:
    """An InferenceServerClient object is used to perform any kind of
    communication with the InferenceServer using http protocol. None
    of the methods are thread safe. The object is intended to be used
    by a single thread and simultaneously calling different methods
    with different threads is not supported and will cause undefined
    behavior.

    Parameters
    ----------
    url : str
        The inference server name, port and optional base path 
        in the following format: host:port/<base-path>, e.g.
        'localhost:8000'.

    verbose : bool
        If True generate verbose output. Default value is False.
    concurrency : int
        The number of connections to create for this client.
        Default value is 1.
    connection_timeout : float
        The timeout value for the connection. Default value
        is 60.0 sec.
    network_timeout : float
        The timeout value for the network. Default value is
        60.0 sec
    max_greenlets : int
        Determines the maximum allowed number of worker greenlets
        for handling asynchronous inference requests. Default value
        is None, which means there will be no restriction on the
        number of greenlets created.
    ssl : bool
        If True, channels the requests to encrypted https scheme.
        Some improper settings may cause connection to prematurely
        terminate with an unsuccessful handshake. See
        `ssl_context_factory` option for using secure default
        settings. Default value for this option is False.
    ssl_options : dict
        Any options supported by `ssl.wrap_socket` specified as
        dictionary. The argument is ignored if 'ssl' is specified
        False.
    ssl_context_factory : SSLContext callable
        It must be a callbable that returns a SSLContext. Set to
        `gevent.ssl.create_default_context` to use contexts with
        secure default settings. This should most likely resolve
        connection issues in a secure way. The default value for
        this option is None which directly wraps the socket with
        the options provided via `ssl_options`. The argument is
        ignored if 'ssl' is specified False.
    insecure : bool
        If True, then does not match the host name with the certificate.
        Default value is False. The argument is ignored if 'ssl' is
        specified False.

    Raises
        ------
        Exception
            If unable to create a client.

    """

    def __init__(self,
                 url,
                 verbose=False,
                 concurrency=1,
                 connection_timeout=60.0,
                 network_timeout=60.0,
                 max_greenlets=None,
                 ssl=False,
                 ssl_options=None,
                 ssl_context_factory=None,
                 insecure=False):
        if url.startswith("http://") or url.startswith("https://"):
            raise_error("url should not include the scheme")
        scheme = "https://" if ssl else "http://"
        self._parsed_url = URL(scheme + url)
        self._base_uri = self._parsed_url.request_uri.rstrip('/')
        self._client_stub = HTTPClient.from_url(
            self._parsed_url,
            concurrency=concurrency,
            connection_timeout=connection_timeout,
            network_timeout=network_timeout,
            ssl_options=ssl_options,
            ssl_context_factory=ssl_context_factory,
            insecure=insecure)
        self._pool = gevent.pool.Pool(max_greenlets)
        self._verbose = verbose

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()

    def __del__(self):
        self.close()

    def close(self):
        """Close the client. Any future calls to server
        will result in an Error.

        """
        self._pool.join()
        self._client_stub.close()

    def _get(self, request_uri, headers, query_params):
        """Issues the GET request to the server

         Parameters
        ----------
        request_uri: str
            The request URI to be used in GET request.
        headers: dict
            Additional HTTP headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        geventhttpclient.response.HTTPSocketPoolResponse
            The response from server.
        """
        self._validate_headers(headers)

        if self._base_uri is not None:
            request_uri = self._base_uri + "/" + request_uri

        if query_params is not None:
            request_uri = request_uri + "?" + _get_query_string(query_params)

        if self._verbose:
            print("GET {}, headers {}".format(request_uri, headers))

        if headers is not None:
            response = self._client_stub.get(request_uri, headers=headers)
        else:
            response = self._client_stub.get(request_uri)

        if self._verbose:
            print(response)

        return response

    def _post(self, request_uri, request_body, headers, query_params):
        """Issues the POST request to the server

        Parameters
        ----------
        request_uri: str
            The request URI to be used in POST request.
        request_body: str
            The body of the request
        headers: dict
            Additional HTTP headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        geventhttpclient.response.HTTPSocketPoolResponse
            The response from server.
        """
        self._validate_headers(headers)

        if self._base_uri is not None:
            request_uri = self._base_uri + "/" + request_uri

        if query_params is not None:
            request_uri = request_uri + "?" + _get_query_string(query_params)

        if self._verbose:
            print("POST {}, headers {}\n{}".format(request_uri, headers,
                                                   request_body))

        if headers is not None:
            response = self._client_stub.post(request_uri=request_uri,
                                              body=request_body,
                                              headers=headers)
        else:
            response = self._client_stub.post(request_uri=request_uri,
                                              body=request_body)

        if self._verbose:
            print(response)

        return response

    def _validate_headers(self, headers):
        """Checks for any unsupported HTTP headers before processing a request.

        Parameters
        ----------
        headers: dict
            HTTP headers to validate before processing the request.

        Raises
        ------
        InferenceServerException
            If an unsupported HTTP header is included in a request.
        """
        if not headers:
            return

        # HTTP headers are case-insensitive, so force lowercase for comparison
        headers_lowercase = {k.lower(): v for k, v in headers.items()}
        # The python client lirary (and geventhttpclient) do not encode request
        # data based on "Transfer-Encoding" header, so reject this header if
        # included. Other libraries may do this encoding under the hood.
        # The python client library does expose special arguments to support
        # some "Content-Encoding" headers.
        if "transfer-encoding" in headers_lowercase:
            raise_error("Unsupported HTTP header: 'Transfer-Encoding' is not "
                        "supported in the Python client library. Use raw HTTP "
                        "request libraries or the C++ client instead for this "
                        "header.")

    def is_server_live(self, headers=None, query_params=None):
        """Contact the inference server and get liveness.

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        bool
            True if server is live, False if server is not live.

        Raises
        ------
        Exception
            If unable to get liveness.

        """

        request_uri = "v2/health/live"
        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)

        return response.status_code == 200

    def is_server_ready(self, headers=None, query_params=None):
        """Contact the inference server and get readiness.

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        bool
            True if server is ready, False if server is not ready.

        Raises
        ------
        Exception
            If unable to get readiness.

        """
        request_uri = "v2/health/ready"
        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)

        return response.status_code == 200

    def is_model_ready(self,
                       model_name,
                       model_version="",
                       headers=None,
                       query_params=None):
        """Contact the inference server and get the readiness of specified model.

        Parameters
        ----------
        model_name: str
            The name of the model to check for readiness.
        model_version: str
            The version of the model to check for readiness. The default value
            is an empty string which means then the server will choose a version
            based on the model and internal policy.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        bool
            True if the model is ready, False if not ready.

        Raises
        ------
        Exception
            If unable to get model readiness.

        """
        if type(model_version) != str:
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/ready".format(
                quote(model_name), model_version)
        else:
            request_uri = "v2/models/{}/ready".format(quote(model_name))

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)

        return response.status_code == 200

    def get_server_metadata(self, headers=None, query_params=None):
        """Contact the inference server and get its metadata.

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        dict
            The JSON dict holding the metadata.

        Raises
        ------
        InferenceServerException
            If unable to get server metadata.

        """
        request_uri = "v2"
        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_model_metadata(self,
                           model_name,
                           model_version="",
                           headers=None,
                           query_params=None):
        """Contact the inference server and get the metadata for specified model.

        Parameters
        ----------
        model_name: str
            The name of the model
        model_version: str
            The version of the model to get metadata. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the metadata.

        Raises
        ------
        InferenceServerException
            If unable to get model metadata.

        """
        if type(model_version) != str:
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}".format(
                quote(model_name), model_version)
        else:
            request_uri = "v2/models/{}".format(quote(model_name))

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_model_config(self,
                         model_name,
                         model_version="",
                         headers=None,
                         query_params=None):
        """Contact the inference server and get the configuration for specified model.

        Parameters
        ----------
        model_name: str
            The name of the model
        model_version: str
            The version of the model to get configuration. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the model config.

        Raises
        ------
        InferenceServerException
            If unable to get model configuration.

        """
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/config".format(
                quote(model_name), model_version)
        else:
            request_uri = "v2/models/{}/config".format(quote(model_name))

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_model_repository_index(self, headers=None, query_params=None):
        """Get the index of model repository contents

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the model repository index.

        Raises
        ------
        InferenceServerException
            If unable to get the repository index.

        """
        request_uri = "v2/repository/index"
        response = self._post(request_uri=request_uri,
                              request_body="",
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def load_model(self,
                   model_name,
                   headers=None,
                   query_params=None,
                   config=None,
                   files=None):
        """Request the inference server to load or reload specified model.

        Parameters
        ----------
        model_name : str
            The name of the model to be loaded.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.
        config: str
            Optional JSON representation of a model config provided for
            the load request, if provided, this config will be used for
            loading the model.
        files: dict
            Optional dictionary specifying file path (with "file:" prefix) in
            the override model directory to the file content as bytes.
            The files will form the model directory that the model will be
            loaded from. If specified, 'config' must be provided to be
            the model configuration of the override model directory.

        Raises
        ------
        InferenceServerException
            If unable to load the model.

        """
        request_uri = "v2/repository/models/{}/load".format(quote(model_name))
        load_request = {}
        if config is not None:
            if "parameters" not in load_request:
                load_request["parameters"] = {}
            load_request["parameters"]["config"] = config
        if files is not None:
            for path, content in files.items():
                if "parameters" not in load_request:
                    load_request["parameters"] = {}
                load_request["parameters"][path] = base64.b64encode(content)
        response = self._post(request_uri=request_uri,
                              request_body=json.dumps(load_request),
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)
        if self._verbose:
            print("Loaded model '{}'".format(model_name))

    def unload_model(self,
                     model_name,
                     headers=None,
                     query_params=None,
                     unload_dependents=False):
        """Request the inference server to unload specified model.

        Parameters
        ----------
        model_name : str
            The name of the model to be unloaded.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction
        unload_dependents : bool
            Whether the dependents of the model should also be unloaded.

        Raises
        ------
        InferenceServerException
            If unable to unload the model.

        """
        request_uri = "v2/repository/models/{}/unload".format(quote(model_name))
        unload_request = {
            "parameters": {
                "unload_dependents": unload_dependents
            }
        }
        response = self._post(request_uri=request_uri,
                              request_body=json.dumps(unload_request),
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)
        if self._verbose:
            print("Loaded model '{}'".format(model_name))

    def get_inference_statistics(self,
                                 model_name="",
                                 model_version="",
                                 headers=None,
                                 query_params=None):
        """Get the inference statistics for the specified model name and
        version.

        Parameters
        ----------
        model_name : str
            The name of the model to get statistics. The default value is
            an empty string, which means statistics of all models will
            be returned.
        model_version: str
            The version of the model to get inference statistics. The
            default value is an empty string which means then the server
            will return the statistics of all available model versions.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the model inference statistics.

        Raises
        ------
        InferenceServerException
            If unable to get the model inference statistics.

        """

        if model_name != "":
            if type(model_version) != str:
                raise_error("model version must be a string")
            if model_version != "":
                request_uri = "v2/models/{}/versions/{}/stats".format(
                    quote(model_name), model_version)
            else:
                request_uri = "v2/models/{}/stats".format(quote(model_name))
        else:
            request_uri = "v2/models/stats"

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def update_trace_settings(self,
                              model_name=None,
                              settings={},
                              headers=None,
                              query_params=None):
        """Update the trace settings for the specified model name, or
        global trace settings if model name is not given.
        Returns the trace settings after the update.

        Parameters
        ----------
        model_name : str
            The name of the model to update trace settings. Specifying None or
            empty string will update the global trace settings.
            The default value is None.
        settings: dict
            The new trace setting values. Only the settings listed will be
            updated. If a trace setting is listed in the dictionary with
            a value of 'None', that setting will be cleared.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the updated trace settings.

        Raises
        ------
        InferenceServerException
            If unable to update the trace settings.

        """

        if (model_name is not None) and (model_name != ""):
            request_uri = "v2/models/{}/trace/setting".format(quote(model_name))
        else:
            request_uri = "v2/trace/setting"

        response = self._post(request_uri=request_uri,
                              request_body=json.dumps(settings),
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_trace_settings(self,
                           model_name=None,
                           headers=None,
                           query_params=None):
        """Get the trace settings for the specified model name, or global trace
        settings if model name is not given

        Parameters
        ----------
        model_name : str
            The name of the model to get trace settings. Specifying None or
            empty string will return the global trace settings.
            The default value is None.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the trace settings.

        Raises
        ------
        InferenceServerException
            If unable to get the trace settings.

        """

        if (model_name is not None) and (model_name != ""):
            request_uri = "v2/models/{}/trace/setting".format(quote(model_name))
        else:
            request_uri = "v2/trace/setting"

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def update_log_settings(self, settings, headers=None, query_params=None):
        """Update the global log settings of the Triton server.
        Parameters
        ----------
        settings: dict
            The new log setting values. Only the settings listed will be
            updated.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction
        Returns
        -------
        dict
            The JSON dict holding the updated log settings.
        Raises
        ------
        InferenceServerException
            If unable to update the log settings.
        """
        request_uri = "v2/logging"
        response = self._post(request_uri=request_uri,
                              request_body=json.dumps(settings),
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_log_settings(self, headers=None, query_params=None):
        """Get the global log settings for the Triton server
        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction
        Returns
        -------
        dict
            The JSON dict holding the log settings.
        Raises
        ------
        InferenceServerException
            If unable to get the log settings.
        """

        request_uri = "v2/logging"

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_system_shared_memory_status(self,
                                        region_name="",
                                        headers=None,
                                        query_params=None):
        """Request system shared memory status from the server.

        Parameters
        ----------
        region_name : str
            The name of the region to query status. The default
            value is an empty string, which means that the status
            of all active system shared memory will be returned.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding system shared memory status.

        Raises
        ------
        InferenceServerException
            If unable to get the status of specified shared memory.

        """
        if region_name != "":
            request_uri = "v2/systemsharedmemory/region/{}/status".format(
                quote(region_name))
        else:
            request_uri = "v2/systemsharedmemory/status"

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def register_system_shared_memory(self,
                                      name,
                                      key,
                                      byte_size,
                                      offset=0,
                                      headers=None,
                                      query_params=None):
        """Request the server to register a system shared memory with the
        following specification.

        Parameters
        ----------
        name : str
            The name of the region to register.
        key : str
            The key of the underlying memory object that contains the
            system shared memory region.
        byte_size : int
            The size of the system shared memory region, in bytes.
        offset : int
            Offset, in bytes, within the underlying memory object to
            the start of the system shared memory region. The default
            value is zero.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to register the specified system shared memory.

        """
        request_uri = "v2/systemsharedmemory/region/{}/register".format(
            quote(name))

        register_request = {
            'key': key,
            'offset': offset,
            'byte_size': byte_size
        }
        request_body = json.dumps(register_request)

        response = self._post(request_uri=request_uri,
                              request_body=request_body,
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)
        if self._verbose:
            print("Registered system shared memory with name '{}'".format(name))

    def unregister_system_shared_memory(self,
                                        name="",
                                        headers=None,
                                        query_params=None):
        """Request the server to unregister a system shared memory with the
        specified name.

        Parameters
        ----------
        name : str
            The name of the region to unregister. The default value is empty
            string which means all the system shared memory regions will be
            unregistered.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to unregister the specified system shared memory region.

        """
        if name != "":
            request_uri = "v2/systemsharedmemory/region/{}/unregister".format(
                quote(name))
        else:
            request_uri = "v2/systemsharedmemory/unregister"

        response = self._post(request_uri=request_uri,
                              request_body="",
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)
        if self._verbose:
            if name != "":
                print("Unregistered system shared memory with name '{}'".format(
                    name))
            else:
                print("Unregistered all system shared memory regions")

    def get_cuda_shared_memory_status(self,
                                      region_name="",
                                      headers=None,
                                      query_params=None):
        """Request cuda shared memory status from the server.

        Parameters
        ----------
        region_name : str
            The name of the region to query status. The default
            value is an empty string, which means that the status
            of all active cuda shared memory will be returned.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding cuda shared memory status.

        Raises
        ------
        InferenceServerException
            If unable to get the status of specified shared memory.

        """
        if region_name != "":
            request_uri = "v2/cudasharedmemory/region/{}/status".format(
                quote(region_name))
        else:
            request_uri = "v2/cudasharedmemory/status"

        response = self._get(request_uri=request_uri,
                             headers=headers,
                             query_params=query_params)
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def register_cuda_shared_memory(self,
                                    name,
                                    raw_handle,
                                    device_id,
                                    byte_size,
                                    headers=None,
                                    query_params=None):
        """Request the server to register a system shared memory with the
        following specification.

        Parameters
        ----------
        name : str
            The name of the region to register.
        raw_handle : bytes
            The raw serialized cudaIPC handle in base64 encoding.
        device_id : int
            The GPU device ID on which the cudaIPC handle was created.
        byte_size : int
            The size of the cuda shared memory region, in bytes.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to register the specified cuda shared memory.

        """
        request_uri = "v2/cudasharedmemory/region/{}/register".format(
            quote(name))

        register_request = {
            'raw_handle': {
                'b64': raw_handle
            },
            'device_id': device_id,
            'byte_size': byte_size
        }
        request_body = json.dumps(register_request)

        response = self._post(request_uri=request_uri,
                              request_body=request_body,
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)
        if self._verbose:
            print("Registered cuda shared memory with name '{}'".format(name))

    def unregister_cuda_shared_memory(self,
                                      name="",
                                      headers=None,
                                      query_params=None):
        """Request the server to unregister a cuda shared memory with the
        specified name.

        Parameters
        ----------
        name : str
            The name of the region to unregister. The default value is empty
            string which means all the cuda shared memory regions will be
            unregistered.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to unregister the specified cuda shared memory region.

        """
        if name != "":
            request_uri = "v2/cudasharedmemory/region/{}/unregister".format(
                quote(name))
        else:
            request_uri = "v2/cudasharedmemory/unregister"

        response = self._post(request_uri=request_uri,
                              request_body="",
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)
        if self._verbose:
            if name != "":
                print("Unregistered cuda shared memory with name '{}'".format(
                    name))
            else:
                print("Unregistered all cuda shared memory regions")

    @staticmethod
    def generate_request_body(inputs,
                              outputs=None,
                              request_id="",
                              sequence_id=0,
                              sequence_start=False,
                              sequence_end=False,
                              priority=0,
                              timeout=None):
        """Generate a request body for inference using the supplied 'inputs'
        requesting the outputs specified by 'outputs'.

        Parameters
        ----------
        inputs : list
            A list of InferInput objects, each describing data for a input
            tensor required by the model.
        outputs : list
            A list of InferRequestedOutput objects, each describing how the output
            data must be returned. If not specified all outputs produced
            by the model will be returned using default settings.
        request_id: str
            Optional identifier for the request. If specified will be returned
            in the response. Default value is an empty string which means no
            request_id will be used.
        sequence_id : int or str
            The unique identifier for the sequence being represented by the
            object. A value of 0 or "" means that the request does not
            belong to a sequence. Default is 0.
        sequence_start: bool
            Indicates whether the request being added marks the start of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        sequence_end: bool
            Indicates whether the request being added marks the end of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        priority : int
            Indicates the priority of the request. Priority value zero
            indicates that the default priority level should be used
            (i.e. same behavior as not specifying the priority parameter).
            Lower value priorities indicate higher priority levels. Thus
            the highest priority level is indicated by setting the parameter
            to 1, the next highest is 2, etc. If not provided, the server
            will handle the request using default setting for the model.
        timeout : int
            The timeout value for the request, in microseconds. If the request
            cannot be completed within the time the server can take a
            model-specific action such as terminating the request. If not
            provided, the server will handle the request using default setting
            for the model.

        Returns
        -------
        Bytes
            The request body of the inference.
        Int
            The byte size of the inference request header in the request body.
            Returns None if the whole request body constitutes the request header.
            

        Raises
        ------
        InferenceServerException
            If server fails to perform inference.
        """
        return _get_inference_request(inputs=inputs,
                                      request_id=request_id,
                                      outputs=outputs,
                                      sequence_id=sequence_id,
                                      sequence_start=sequence_start,
                                      sequence_end=sequence_end,
                                      priority=priority,
                                      timeout=timeout)

    @staticmethod
    def parse_response_body(response_body,
                            verbose=False,
                            header_length=None,
                            content_encoding=None):
        """Generate a InferResult object from the given 'response_body'

        Parameters
        ----------
        response_body : bytes
            The inference response from the server
        verbose : bool
            If True generate verbose output. Default value is False.
        header_length : int
            The length of the inference header if the header does not occupy
            the whole response body. Default value is None.
        content_encoding : string
            The encoding of the response body if it is compressed.
            Default value is None.
        
        Returns
        -------
        InferResult
            The InferResult object generated from the response body
        """
        return InferResult.from_response_body(response_body, verbose,
                                              header_length, content_encoding)

    def infer(self,
              model_name,
              inputs,
              model_version="",
              outputs=None,
              request_id="",
              sequence_id=0,
              sequence_start=False,
              sequence_end=False,
              priority=0,
              timeout=None,
              headers=None,
              query_params=None,
              request_compression_algorithm=None,
              response_compression_algorithm=None):
        """Run synchronous inference using the supplied 'inputs' requesting
        the outputs specified by 'outputs'.

        Parameters
        ----------
        model_name: str
            The name of the model to run inference.
        inputs : list
            A list of InferInput objects, each describing data for a input
            tensor required by the model.
        model_version: str
            The version of the model to run inference. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        outputs : list
            A list of InferRequestedOutput objects, each describing how the output
            data must be returned. If not specified all outputs produced
            by the model will be returned using default settings.
        request_id: str
            Optional identifier for the request. If specified will be returned
            in the response. Default value is an empty string which means no
            request_id will be used.
        sequence_id : int
            The unique identifier for the sequence being represented by the
            object. Default value is 0 which means that the request does not
            belong to a sequence.
        sequence_start: bool
            Indicates whether the request being added marks the start of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        sequence_end: bool
            Indicates whether the request being added marks the end of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        priority : int
            Indicates the priority of the request. Priority value zero
            indicates that the default priority level should be used
            (i.e. same behavior as not specifying the priority parameter).
            Lower value priorities indicate higher priority levels. Thus
            the highest priority level is indicated by setting the parameter
            to 1, the next highest is 2, etc. If not provided, the server
            will handle the request using default setting for the model.
        timeout : int
            The timeout value for the request, in microseconds. If the request
            cannot be completed within the time the server can take a
            model-specific action such as terminating the request. If not
            provided, the server will handle the request using default setting
            for the model.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.
        request_compression_algorithm : str
            Optional HTTP compression algorithm to use for the request body on client side.
            Currently supports "deflate", "gzip" and None. By default, no
            compression is used.
        response_compression_algorithm : str
            Optional HTTP compression algorithm to request for the response body.
            Note that the response may not be compressed if the server does not
            support the specified algorithm. Currently supports "deflate",
            "gzip" and None. By default, no compression is requested.

        Returns
        -------
        InferResult
            The object holding the result of the inference.

        Raises
        ------
        InferenceServerException
            If server fails to perform inference.
        """

        request_body, json_size = _get_inference_request(
            inputs=inputs,
            request_id=request_id,
            outputs=outputs,
            sequence_id=sequence_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end,
            priority=priority,
            timeout=timeout)

        if request_compression_algorithm == "gzip":
            if headers is None:
                headers = {}
            headers["Content-Encoding"] = "gzip"
            request_body = gzip.compress(request_body)
        elif request_compression_algorithm == 'deflate':
            if headers is None:
                headers = {}
            headers["Content-Encoding"] = "deflate"
            # "Content-Encoding: deflate" actually means compressing in zlib structure
            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
            request_body = zlib.compress(request_body)

        if response_compression_algorithm == "gzip":
            if headers is None:
                headers = {}
            headers["Accept-Encoding"] = "gzip"
        elif response_compression_algorithm == 'deflate':
            if headers is None:
                headers = {}
            headers["Accept-Encoding"] = "deflate"

        if json_size is not None:
            if headers is None:
                headers = {}
            headers["Inference-Header-Content-Length"] = json_size

        if type(model_version) != str:
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/infer".format(
                quote(model_name), model_version)
        else:
            request_uri = "v2/models/{}/infer".format(quote(model_name))

        response = self._post(request_uri=request_uri,
                              request_body=request_body,
                              headers=headers,
                              query_params=query_params)
        _raise_if_error(response)

        return InferResult(response, self._verbose)

    def async_infer(self,
                    model_name,
                    inputs,
                    model_version="",
                    outputs=None,
                    request_id="",
                    sequence_id=0,
                    sequence_start=False,
                    sequence_end=False,
                    priority=0,
                    timeout=None,
                    headers=None,
                    query_params=None,
                    request_compression_algorithm=None,
                    response_compression_algorithm=None):
        """Run asynchronous inference using the supplied 'inputs' requesting
        the outputs specified by 'outputs'. Even though this call is
        non-blocking, however, the actual number of concurrent requests to
        the server will be limited by the 'concurrency' parameter specified
        while creating this client. In other words, if the inflight
        async_infer exceeds the specified 'concurrency', the delivery of
        the exceeding request(s) to server will be blocked till the slot is
        made available by retrieving the results of previously issued requests.

        Parameters
        ----------
        model_name: str
            The name of the model to run inference.
        inputs : list
            A list of InferInput objects, each describing data for a input
            tensor required by the model.
        model_version: str
            The version of the model to run inference. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        outputs : list
            A list of InferRequestedOutput objects, each describing how the output
            data must be returned. If not specified all outputs produced
            by the model will be returned using default settings.
        request_id: str
            Optional identifier for the request. If specified will be returned
            in the response. Default value is 'None' which means no request_id
            will be used.
        sequence_id : int
            The unique identifier for the sequence being represented by the
            object. Default value is 0 which means that the request does not
            belong to a sequence.
        sequence_start: bool
            Indicates whether the request being added marks the start of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        sequence_end: bool
            Indicates whether the request being added marks the end of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        priority : int
            Indicates the priority of the request. Priority value zero
            indicates that the default priority level should be used
            (i.e. same behavior as not specifying the priority parameter).
            Lower value priorities indicate higher priority levels. Thus
            the highest priority level is indicated by setting the parameter
            to 1, the next highest is 2, etc. If not provided, the server
            will handle the request using default setting for the model.
        timeout : int
            The timeout value for the request, in microseconds. If the request
            cannot be completed within the time the server can take a
            model-specific action such as terminating the request. If not
            provided, the server will handle the request using default setting
            for the model.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction.
        request_compression_algorithm : str
            Optional HTTP compression algorithm to use for the request body on client side.
            Currently supports "deflate", "gzip" and None. By default, no
            compression is used.
        response_compression_algorithm : str
            Optional HTTP compression algorithm to request for the response body.
            Note that the response may not be compressed if the server does not
            support the specified algorithm. Currently supports "deflate",
            "gzip" and None. By default, no compression is requested.

        Returns
        -------
        InferAsyncRequest object
            The handle to the asynchronous inference request.

        Raises
        ------
        InferenceServerException
            If server fails to issue inference.
        """

        def wrapped_post(request_uri, request_body, headers, query_params):
            return self._post(request_uri, request_body, headers, query_params)

        request_body, json_size = _get_inference_request(
            inputs=inputs,
            request_id=request_id,
            outputs=outputs,
            sequence_id=sequence_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end,
            priority=priority,
            timeout=timeout)

        if request_compression_algorithm == "gzip":
            if headers is None:
                headers = {}
            headers["Content-Encoding"] = "gzip"
            request_body = gzip.compress(request_body)
        elif request_compression_algorithm == 'deflate':
            if headers is None:
                headers = {}
            headers["Content-Encoding"] = "deflate"
            # "Content-Encoding: deflate" actually means compressing in zlib structure
            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
            request_body = zlib.compress(request_body)

        if response_compression_algorithm == "gzip":
            if headers is None:
                headers = {}
            headers["Accept-Encoding"] = "gzip"
        elif response_compression_algorithm == 'deflate':
            if headers is None:
                headers = {}
            headers["Accept-Encoding"] = "deflate"

        if json_size is not None:
            if headers is None:
                headers = {}
            headers["Inference-Header-Content-Length"] = json_size

        if type(model_version) != str:
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/infer".format(
                quote(model_name), model_version)
        else:
            request_uri = "v2/models/{}/infer".format(quote(model_name))

        g = self._pool.apply_async(
            wrapped_post, (request_uri, request_body, headers, query_params))

        # Schedule the greenlet to run in this loop iteration
        g.start()

        # Relinquish control to greenlet loop. Using non-zero
        # value to ensure the control is transferred to the
        # event loop.
        gevent.sleep(0.01)

        if self._verbose:
            verbose_message = "Sent request"
            if request_id != "":
                verbose_message = verbose_message + " '{}'".format(request_id)
            print(verbose_message)

        return InferAsyncRequest(g, self._verbose)


class InferAsyncRequest:
    """An object of InferAsyncRequest class is used to describe
    a handle to an ongoing asynchronous inference request.

    Parameters
    ----------
    greenlet : gevent.Greenlet
        The greenlet object which will provide the results.
        For further details about greenlets refer
        http://www.gevent.org/api/gevent.greenlet.html.

    verbose : bool
        If True generate verbose output. Default value is False.
    """

    def __init__(self, greenlet, verbose=False):
        self._greenlet = greenlet
        self._verbose = verbose

    def get_result(self, block=True, timeout=None):
        """Get the results of the associated asynchronous inference.
        Parameters
        ----------
        block : bool
            If block is True, the function will wait till the
            corresponding response is received from the server.
            Default value is True.
        timeout : int
            The maximum wait time for the function. This setting is
            ignored if the block is set False. Default is None,
            which means the function will block indefinitely till
            the corresponding response is received.

        Returns
        -------
        InferResult
            The object holding the result of the async inference.

        Raises
        ------
        InferenceServerException
            If server fails to perform inference or failed to respond
            within specified timeout.
        """

        try:
            response = self._greenlet.get(block=block, timeout=timeout)
        except gevent.Timeout as e:
            raise_error("failed to obtain inference response")

        _raise_if_error(response)
        return InferResult(response, self._verbose)


class InferInput:
    """An object of InferInput class is used to describe
    input tensor for an inference request.

    Parameters
    ----------
    name : str
        The name of input whose data will be described by this object
    shape : list
        The shape of the associated input.
    datatype : str
        The datatype of the associated input.
    """

    def __init__(self, name, shape, datatype):
        self._name = name
        self._shape = shape
        self._datatype = datatype
        self._parameters = {}
        self._data = None
        self._raw_data = None

    def name(self):
        """Get the name of input associated with this object.

        Returns
        -------
        str
            The name of input
        """
        return self._name

    def datatype(self):
        """Get the datatype of input associated with this object.

        Returns
        -------
        str
            The datatype of input
        """
        return self._datatype

    def shape(self):
        """Get the shape of input associated with this object.

        Returns
        -------
        list
            The shape of input
        """
        return self._shape

    def set_shape(self, shape):
        """Set the shape of input.

        Parameters
        ----------
        shape : list
            The shape of the associated input.
        """
        self._shape = shape

    def set_data_from_numpy(self, input_tensor, binary_data=True):
        """Set the tensor data from the specified numpy array for
        input associated with this object.

        Parameters
        ----------
        input_tensor : numpy array
            The tensor data in numpy array format
        binary_data : bool
            Indicates whether to set data for the input in binary format
            or explicit tensor within JSON. The default value is True,
            which means the data will be delivered as binary data in the
            HTTP body after the JSON object.

        Raises
        ------
        InferenceServerException
            If failed to set data for the tensor.
        """
        if not isinstance(input_tensor, (np.ndarray,)):
            raise_error("input_tensor must be a numpy array")
        # DLIS-3986: Special handling for bfloat16 until Numpy officially supports it
        if self._datatype == "BF16":
            if input_tensor.dtype != triton_to_np_dtype(self._datatype):
                raise_error(
                    "got unexpected datatype {} from numpy array, expected {} for BF16 type"
                    .format(input_tensor.dtype,
                            triton_to_np_dtype(self._datatype)))
        else:
            dtype = np_to_triton_dtype(input_tensor.dtype)
            if self._datatype != dtype:
                raise_error(
                    "got unexpected datatype {} from numpy array, expected {}".
                    format(dtype, self._datatype))
        valid_shape = True
        if len(self._shape) != len(input_tensor.shape):
            valid_shape = False
        else:
            for i in range(len(self._shape)):
                if self._shape[i] != input_tensor.shape[i]:
                    valid_shape = False
        if not valid_shape:
            raise_error(
                "got unexpected numpy array shape [{}], expected [{}]".format(
                    str(input_tensor.shape)[1:-1],
                    str(self._shape)[1:-1]))

        self._parameters.pop('shared_memory_region', None)
        self._parameters.pop('shared_memory_byte_size', None)
        self._parameters.pop('shared_memory_offset', None)

        if not binary_data:
            self._parameters.pop('binary_data_size', None)
            self._raw_data = None
            if self._datatype == "BF16":
                raise_error(
                    "BF16 inputs must be sent as binary data over HTTP. Please set binary_data=True"
                )
            if self._datatype == "BYTES":
                self._data = []
                try:
                    if input_tensor.size > 0:
                        for obj in np.nditer(input_tensor,
                                             flags=["refs_ok"],
                                             order='C'):
                            # We need to convert the object to string using utf-8,
                            # if we want to use the binary_data=False. JSON requires
                            # the input to be a UTF-8 string.
                            if input_tensor.dtype == np.object_:
                                if type(obj.item()) == bytes:
                                    self._data.append(
                                        str(obj.item(), encoding='utf-8'))
                                else:
                                    self._data.append(str(obj.item()))
                            else:
                                self._data.append(
                                    str(obj.item(), encoding='utf-8'))
                except UnicodeDecodeError:
                    raise_error(
                        f'Failed to encode "{obj.item()}" using UTF-8. Please use binary_data=True, if'
                        ' you want to pass a byte array.')
            else:
                self._data = [val.item() for val in input_tensor.flatten()]
        else:
            self._data = None
            if self._datatype == "BYTES":
                serialized_output = serialize_byte_tensor(input_tensor)
                if serialized_output.size > 0:
                    self._raw_data = serialized_output.item()
                else:
                    self._raw_data = b''
            elif self._datatype == "BF16":
                serialized_output = serialize_bf16_tensor(input_tensor)
                if serialized_output.size > 0:
                    self._raw_data = serialized_output.item()
                else:
                    self._raw_data = b''
            else:
                self._raw_data = input_tensor.tobytes()
            self._parameters['binary_data_size'] = len(self._raw_data)

    def set_shared_memory(self, region_name, byte_size, offset=0):
        """Set the tensor data from the specified shared memory region.

        Parameters
        ----------
        region_name : str
            The name of the shared memory region holding tensor data.
        byte_size : int
            The size of the shared memory region holding tensor data.
        offset : int
            The offset, in bytes, into the region where the data for
            the tensor starts. The default value is 0.

        """
        self._data = None
        self._raw_data = None
        self._parameters.pop('binary_data_size', None)

        self._parameters['shared_memory_region'] = region_name
        self._parameters['shared_memory_byte_size'] = byte_size
        if offset != 0:
            self._parameters['shared_memory_offset'] = offset

    def _get_binary_data(self):
        """Returns the raw binary data if available

        Returns
        -------
        bytes
            The raw data for the input tensor
        """
        return self._raw_data

    def _get_tensor(self):
        """Retrieve the underlying input as json dict.

        Returns
        -------
        dict
            The underlying tensor specification as dict
        """
        tensor = {
            'name': self._name,
            'shape': self._shape,
            'datatype': self._datatype
        }
        if self._parameters:
            tensor['parameters'] = self._parameters

        if self._parameters.get('shared_memory_region') is None and \
                self._raw_data is None:
            if self._data is not None:
                tensor['data'] = self._data
        return tensor


class InferRequestedOutput:
    """An object of InferRequestedOutput class is used to describe a
    requested output tensor for an inference request.

    Parameters
    ----------
    name : str
        The name of output tensor to associate with this object.
    binary_data : bool
        Indicates whether to return result data for the output in
        binary format or explicit tensor within JSON. The default
        value is True, which means the data will be delivered as
        binary data in the HTTP body after JSON object. This field
        will be unset if shared memory is set for the output.
    class_count : int
        The number of classifications to be requested. The default
        value is 0 which means the classification results are not
        requested.
    """

    def __init__(self, name, binary_data=True, class_count=0):
        self._name = name
        self._parameters = {}
        if class_count != 0:
            self._parameters['classification'] = class_count
        self._binary = binary_data
        self._parameters['binary_data'] = binary_data

    def name(self):
        """Get the name of output associated with this object.

        Returns
        -------
        str
            The name of output
        """
        return self._name

    def set_shared_memory(self, region_name, byte_size, offset=0):
        """Marks the output to return the inference result in
        specified shared memory region.

        Parameters
        ----------
        region_name : str
            The name of the shared memory region to hold tensor data.
        byte_size : int
            The size of the shared memory region to hold tensor data.
        offset : int
            The offset, in bytes, into the region where the data for
            the tensor starts. The default value is 0.

        """
        if 'classification' in self._parameters:
            raise_error("shared memory can't be set on classification output")
        if self._binary:
            self._parameters['binary_data'] = False

        self._parameters['shared_memory_region'] = region_name
        self._parameters['shared_memory_byte_size'] = byte_size
        if offset != 0:
            self._parameters['shared_memory_offset'] = offset

    def unset_shared_memory(self):
        """Clears the shared memory option set by the last call to
        InferRequestedOutput.set_shared_memory(). After call to this
        function requested output will no longer be returned in a
        shared memory region.
        """

        self._parameters['binary_data'] = self._binary
        self._parameters.pop('shared_memory_region', None)
        self._parameters.pop('shared_memory_byte_size', None)
        self._parameters.pop('shared_memory_offset', None)

    def _get_tensor(self):
        """Retrieve the underlying input as json dict.

        Returns
        -------
        dict
            The underlying tensor as a dict
        """
        tensor = {'name': self._name}
        if self._parameters:
            tensor['parameters'] = self._parameters
        return tensor


class InferResult:
    """An object of InferResult class holds the response of
    an inference request and provide methods to retrieve
    inference results.

    Parameters
    ----------
    response : geventhttpclient.response.HTTPSocketPoolResponse
        The inference response from the server
    verbose : bool
        If True generate verbose output. Default value is False.
    """

    def __init__(self, response, verbose):
        header_length = response.get('Inference-Header-Content-Length')

        # Internal class that simulate the interface of 'response'
        class DecompressedResponse:

            def __init__(self, decompressed_data):
                self.decompressed_data_ = decompressed_data
                self.offset_ = 0

            def read(self, length=-1):
                if length == -1:
                    return self.decompressed_data_[self.offset_:]
                else:
                    prev_offset = self.offset_
                    self.offset_ += length
                    return self.decompressed_data_[prev_offset:self.offset_]

        content_encoding = response.get('Content-Encoding')
        if content_encoding is not None:
            if content_encoding == "gzip":
                response = DecompressedResponse(gzip.decompress(
                    response.read()))
            elif content_encoding == 'deflate':
                response = DecompressedResponse(zlib.decompress(
                    response.read()))
        if header_length is None:
            content = response.read()
            if verbose:
                print(content)
            try:
                self._result = json.loads(content)
            except UnicodeDecodeError as e:
                raise_error(
                    f'Failed to encode using UTF-8. Please use binary_data=True, if'
                    f' you want to pass a byte array. UnicodeError: {e}')
        else:
            header_length = int(header_length)
            content = response.read(length=header_length)
            if verbose:
                print(content)
            self._result = json.loads(content)

            # Maps the output name to the index in buffer for quick retrieval
            self._output_name_to_buffer_map = {}
            # Read the remaining data off the response body.
            self._buffer = response.read()
            buffer_index = 0
            for output in self._result['outputs']:
                parameters = output.get("parameters")
                if parameters is not None:
                    this_data_size = parameters.get("binary_data_size")
                    if this_data_size is not None:
                        self._output_name_to_buffer_map[
                            output['name']] = buffer_index
                        buffer_index = buffer_index + this_data_size

    @classmethod
    def from_response_body(cls,
                           response_body,
                           verbose=False,
                           header_length=None,
                           content_encoding=None):
        """A class method to construct InferResult object
        from a given 'response_body'.

        Parameters
        ----------
        response_body : bytes
            The inference response from the server
        verbose : bool
            If True generate verbose output. Default value is False.
        header_length : int
            The length of the inference header if the header does not occupy
            the whole response body. Default value is None.
        content_encoding : string
            The encoding of the response body if it is compressed.
            Default value is None.
        
        Returns
        -------
        InferResult
            The InferResult object generated from the response body
        """

        # Internal class that simulate the interface of 'response'
        class Response:

            def __init__(self, response_body, header_length, content_encoding):
                self.response_body_ = response_body
                self.offset_ = 0
                self.parameters_ = {
                    'Inference-Header-Content-Length': header_length,
                    'Content-Encoding': content_encoding
                }

            def get(self, key):
                return self.parameters_.get(key)

            def read(self, length=-1):
                if length == -1:
                    return self.response_body_[self.offset_:]
                else:
                    prev_offset = self.offset_
                    self.offset_ += length
                    return self.response_body_[prev_offset:self.offset_]

        return cls(Response(response_body, header_length, content_encoding),
                   verbose)

    def as_numpy(self, name):
        """Get the tensor data for output associated with this object
        in numpy format

        Parameters
        ----------
        name : str
            The name of the output tensor whose result is to be retrieved.

        Returns
        -------
        numpy array
            The numpy array containing the response data for the tensor or
            None if the data for specified tensor name is not found.
        """
        if self._result.get('outputs') is not None:
            for output in self._result['outputs']:
                if output['name'] == name:
                    datatype = output['datatype']
                    has_binary_data = False
                    parameters = output.get("parameters")
                    if parameters is not None:
                        this_data_size = parameters.get("binary_data_size")
                        if this_data_size is not None:
                            has_binary_data = True
                            if this_data_size != 0:
                                start_index = self._output_name_to_buffer_map[
                                    name]
                                end_index = start_index + this_data_size
                                if datatype == 'BYTES':
                                    # String results contain a 4-byte string length
                                    # followed by the actual string characters. Hence,
                                    # need to decode the raw bytes to convert into
                                    # array elements.
                                    np_array = deserialize_bytes_tensor(
                                        self._buffer[start_index:end_index])
                                elif datatype == "BF16":
                                    np_array = deserialize_bf16_tensor(
                                        self._buffer[start_index:end_index])
                                else:
                                    np_array = np.frombuffer(
                                        self._buffer[start_index:end_index],
                                        dtype=triton_to_np_dtype(datatype))
                            else:
                                np_array = np.empty(0)
                    if not has_binary_data:
                        np_array = np.array(output['data'],
                                            dtype=triton_to_np_dtype(datatype))
                    np_array = np_array.reshape(output['shape'])
                    return np_array
        return None

    def get_output(self, name):
        """Retrieves the output tensor corresponding to the named ouput.

        Parameters
        ----------
        name : str
            The name of the tensor for which Output is to be
            retrieved.

        Returns
        -------
        Dict
            If an output tensor with specified name is present in
            the infer resonse then returns it as a json dict,
            otherwise returns None.
        """
        for output in self._result['outputs']:
            if output['name'] == name:
                return output

        return None

    def get_response(self):
        """Retrieves the complete response

        Returns
        -------
        dict
            The underlying response dict.
        """
        return self._result