diff --git a/torch2trt/calibration.py b/torch2trt/calibration.py index 7506ea4e..b09b7193 100644 --- a/torch2trt/calibration.py +++ b/torch2trt/calibration.py @@ -51,7 +51,7 @@ def get_batch(self, *args, **kwargs): buffer[i].copy_(tensor) self.count += 1 - + return [int(buf.data_ptr()) for buf in self.buffers] else: return [] diff --git a/torch2trt/converters/AdaptiveAvgPool2d.py b/torch2trt/converters/AdaptiveAvgPool2d.py index 41ad141d..cd7c2b79 100644 --- a/torch2trt/converters/AdaptiveAvgPool2d.py +++ b/torch2trt/converters/AdaptiveAvgPool2d.py @@ -16,7 +16,8 @@ def convert_AdaptiveAvgPool2d(ctx): stride = (input_trt.shape[-2] // output_size[-2], input_trt.shape[-1] // output_size[-1]) - kernel_size = stride + kernel_size = (input_trt.shape[-2] - (output_size[-2] - 1) * stride[-2], input_trt.shape[-1] - (output_size[-1] - 1) * stride[-1]) + layer = ctx.network.add_pooling( input=input_trt, type=trt.PoolingType.AVERAGE, window_size=kernel_size) layer.stride = stride diff --git a/torch2trt/converters/AdaptiveAvgPool3d.py b/torch2trt/converters/AdaptiveAvgPool3d.py index 6c1b4302..9fd5b6dd 100644 --- a/torch2trt/converters/AdaptiveAvgPool3d.py +++ b/torch2trt/converters/AdaptiveAvgPool3d.py @@ -22,7 +22,8 @@ def convert_AdaptiveAvgPool3d(ctx): input_trt.shape[-1] // output_size[-1], ) - kernel_size = stride + kernel_size = (input_trt.shape[-3] - (output_size[-3] - 1) * stride[-3], input_trt.shape[-2] - (output_size[-2] - 1) * stride[-2], input_trt.shape[-1] - (output_size[-1] - 1) * stride[-1]) + layer = ctx.network.add_pooling_nd( input=input_trt, type=trt.PoolingType.AVERAGE, diff --git a/torch2trt/converters/__init__.py b/torch2trt/converters/__init__.py index 5ffe38b4..1c779a25 100644 --- a/torch2trt/converters/__init__.py +++ b/torch2trt/converters/__init__.py @@ -4,7 +4,6 @@ # supported converters will override dummy converters -from .AdaptiveAvgPool2d import * from .BatchNorm1d import * from .BatchNorm2d import * from .clone import * @@ -38,6 +37,7 @@ from .instance_norm import * from .interpolate import * from .layer_norm import * +from .matmul import * from .max import * from .max_pool1d import * from .max_pool2d import * diff --git a/torch2trt/converters/interpolate.py b/torch2trt/converters/interpolate.py index 949570d6..cc90d8ca 100644 --- a/torch2trt/converters/interpolate.py +++ b/torch2trt/converters/interpolate.py @@ -94,6 +94,47 @@ def convert_interpolate_trt7(ctx): output._trt = layer.get_output(0) +@tensorrt_converter('torch.nn.functional.interpolate', enabled=trt_version() >= '8.0') +@tensorrt_converter('torch.nn.functional.upsample', enabled=trt_version() >= '8.0') +def convert_interpolate_trt8(ctx): + #parse args + input = get_arg(ctx, 'input', pos=0, default=None) + size = get_arg(ctx, 'size', pos=1, default=None) + scale_factor=get_arg(ctx, 'scale_factor', pos=2, default=None) + mode = get_arg(ctx, 'mode', pos=3, default='nearest') + align_corners = get_arg(ctx, 'align_corners', pos=4, default=None) + + input_dim = input.dim() - 2 + + input_trt = add_missing_trt_tensors(ctx.network, [input])[0] + output = ctx.method_return + layer = ctx.network.add_resize(input=input_trt) + + shape = size + if shape != None: + if isinstance(shape, collections.Sequence): + shape = [input.size(1)] + list(shape) + else: + shape = [input.size(1)] + [shape] * input_dim + + layer.shape = shape + + scales = scale_factor + if scales != None: + if not isinstance(scales, collections.Sequence): + scales = [scales] * input_dim + layer.scales = [1] + list(scales) + + resize_mode = mode + if resize_mode.lower() in ["linear","bilinear","trilinear"]: + layer.resize_mode = trt.ResizeMode.LINEAR + else: + layer.resize_mode=trt.ResizeMode.NEAREST + + if align_corners: + layer.coordinate_transformation = trt.ResizeCoordinateTransformation.ALIGN_CORNERS + + output._trt = layer.get_output(0) class Interpolate(torch.nn.Module): def __init__(self, size=None,scale_factor=None, mode=None, align_corners=None): diff --git a/torch2trt/converters/matmul.py b/torch2trt/converters/matmul.py new file mode 100644 index 00000000..fda01951 --- /dev/null +++ b/torch2trt/converters/matmul.py @@ -0,0 +1,38 @@ +from torch2trt.torch2trt import * +from torch2trt.module_test import add_module_test + + +@tensorrt_converter('torch.matmul') +@tensorrt_converter('torch.mm') +@tensorrt_converter('torch.bmm') +def convert_matmul(ctx): + input_a = ctx.method_args[0] + input_b = ctx.method_args[1] + input_a_trt, input_b_trt = trt_(ctx.network, input_a, input_b) + output = ctx.method_return + layer = ctx.network.add_matrix_multiply(input_a_trt, trt.MatrixOperation.NONE, input_b_trt, trt.MatrixOperation.NONE) + output._trt = layer.get_output(0) + + +class MatMul(torch.nn.Module): + def __init__(self): + super(MatMul, self).__init__() + + def forward(self, x, y): + return torch.matmul(x, y) + +@add_module_test(torch.float32, torch.device('cuda'), [(1, 3, 4), (1, 4, 5)]) +def test_matmul_basic(): + return MatMul() + + +class BatchMatMul(torch.nn.Module): + def __init__(self): + super(BatchMatMul, self).__init__() + + def forward(self, x, y): + return torch.bmm(x, y) + +@add_module_test(torch.float32, torch.device('cuda'), [(10, 3, 4), (10, 4, 5)], max_batch_size=10) +def test_batchmatmul_basic(): + return BatchMatMul() \ No newline at end of file diff --git a/torch2trt/converters/mod.py b/torch2trt/converters/mod.py index 6cf69435..2bead5fa 100644 --- a/torch2trt/converters/mod.py +++ b/torch2trt/converters/mod.py @@ -64,7 +64,7 @@ def __init__(self): super(ModAssign, self).__init__() def forward(self, x, y): - x %= y + x = x % y return x diff --git a/torch2trt/plugins/group_norm.cpp b/torch2trt/plugins/group_norm.cpp index ccc7b51d..3247553a 100644 --- a/torch2trt/plugins/group_norm.cpp +++ b/torch2trt/plugins/group_norm.cpp @@ -112,19 +112,19 @@ class GroupNormPlugin : public IPluginV2 { return data_str.str(); } - const char* getPluginType() const override { + const char* getPluginType() const noexcept override { return "group_norm"; }; - const char* getPluginVersion() const override { + const char* getPluginVersion() const noexcept override { return "1"; } - int getNbOutputs() const override { + int getNbOutputs() const noexcept override { return 1; } - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override { + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override { Dims dims; dims.nbDims = inputs->nbDims; @@ -135,8 +135,8 @@ class GroupNormPlugin : public IPluginV2 { return dims; } - bool supportsFormat(DataType type, PluginFormat format) const override { - if (format != PluginFormat::kNCHW) { + bool supportsFormat(DataType type, PluginFormat format) const noexcept override { + if (format != PluginFormat::kLINEAR) { return false; } if (type == DataType::kINT32 || type == DataType::kINT8) { @@ -146,7 +146,7 @@ class GroupNormPlugin : public IPluginV2 { } void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, - int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override { + int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override { // set data type if (type == DataType::kFLOAT) { @@ -170,7 +170,7 @@ class GroupNormPlugin : public IPluginV2 { } } - int initialize() override { + int initialize() noexcept override { // set device tensor_options = tensor_options.device(c10::kCUDA); @@ -188,11 +188,11 @@ class GroupNormPlugin : public IPluginV2 { return 0; } - void terminate() override {} + void terminate() noexcept override {} - size_t getWorkspaceSize(int maxBatchSize) const override { return 0; } - - int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override { + size_t getWorkspaceSize(int maxBatchSize) const noexcept override { return 0; } + int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept override { // get input / output dimensions std::vector batch_input_sizes = input_sizes; std::vector batch_output_sizes = output_sizes; @@ -235,25 +235,25 @@ class GroupNormPlugin : public IPluginV2 { } - size_t getSerializationSize() const override { + size_t getSerializationSize() const noexcept override { return serializeToString().size(); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const noexcept override { std::string data = serializeToString(); size_t size = getSerializationSize(); data.copy((char *) buffer, size); } - void destroy() override {} + void destroy() noexcept override {} - IPluginV2* clone() const override { + IPluginV2* clone() const noexcept override { return new GroupNormPlugin(num_groups, weight, bias, eps); } - void setPluginNamespace(const char* pluginNamespace) override {} + void setPluginNamespace(const char* pluginNamespace) noexcept override {} - const char *getPluginNamespace() const override { + const char *getPluginNamespace() const noexcept override { return "torch2trt"; } @@ -263,26 +263,26 @@ class GroupNormPluginCreator : public IPluginCreator { public: GroupNormPluginCreator() {} - const char *getPluginNamespace() const override { + const char *getPluginNamespace() const noexcept override { return "torch2trt"; } - const char *getPluginName() const override { + const char *getPluginName() const noexcept override { return "group_norm"; } - const char *getPluginVersion() const override { + const char *getPluginVersion() const noexcept override { return "1"; } - IPluginV2 *deserializePlugin(const char *name, const void *data, size_t length) override { + IPluginV2 *deserializePlugin(const char *name, const void *data, size_t length) noexcept override { return new GroupNormPlugin((const char*) data, length); } - void setPluginNamespace(const char *N) override {} - const PluginFieldCollection *getFieldNames() override { return nullptr; } + void setPluginNamespace(const char *N) noexcept override {} + const PluginFieldCollection *getFieldNames() noexcept override { return nullptr; } - IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) override { return nullptr; } + IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) noexcept override { return nullptr; } }; diff --git a/torch2trt/plugins/interpolate.cpp b/torch2trt/plugins/interpolate.cpp index cf463d0f..cc06a922 100644 --- a/torch2trt/plugins/interpolate.cpp +++ b/torch2trt/plugins/interpolate.cpp @@ -103,19 +103,19 @@ class InterpolatePlugin : public IPluginV2 { return data_str.str(); } - const char* getPluginType() const override { + const char* getPluginType() const noexcept override { return "interpolate"; }; - const char* getPluginVersion() const override { + const char* getPluginVersion() const noexcept override { return "1"; } - int getNbOutputs() const override { + int getNbOutputs() const noexcept override { return 1; } - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override { + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) noexcept override { Dims dims; dims.nbDims = inputs->nbDims; @@ -127,8 +127,8 @@ class InterpolatePlugin : public IPluginV2 { return dims; } - bool supportsFormat(DataType type, PluginFormat format) const override { - if (format != PluginFormat::kNCHW) { + bool supportsFormat(DataType type, PluginFormat format) const noexcept override { + if (format != PluginFormat::kLINEAR) { return false; } if (type == DataType::kINT32 || type == DataType::kINT8) { @@ -138,7 +138,7 @@ class InterpolatePlugin : public IPluginV2 { } void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, - int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override { + int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) noexcept override { // set data type if (type == DataType::kFLOAT) { @@ -162,7 +162,7 @@ class InterpolatePlugin : public IPluginV2 { } } - int initialize() override { + int initialize() noexcept override { // set device tensor_options = tensor_options.device(c10::kCUDA); @@ -176,11 +176,11 @@ class InterpolatePlugin : public IPluginV2 { return 0; } - void terminate() override {} + void terminate() noexcept override {} - size_t getWorkspaceSize(int maxBatchSize) const override { return 0; } - - int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override { + size_t getWorkspaceSize(int maxBatchSize) const noexcept override { return 0; } + int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept override { // get input / output dimensions std::vector batch_input_sizes = input_sizes; std::vector batch_output_sizes = output_sizes; @@ -227,25 +227,25 @@ class InterpolatePlugin : public IPluginV2 { return 0; } - size_t getSerializationSize() const override { + size_t getSerializationSize() const noexcept override { return serializeToString().size(); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const noexcept override { std::string data = serializeToString(); size_t size = getSerializationSize(); data.copy((char *) buffer, size); } - void destroy() override {} + void destroy() noexcept override {} - IPluginV2* clone() const override { + IPluginV2* clone() const noexcept override { return new InterpolatePlugin(size, mode, align_corners); } - void setPluginNamespace(const char* pluginNamespace) override {} + void setPluginNamespace(const char* pluginNamespace) noexcept override {} - const char *getPluginNamespace() const override { + const char *getPluginNamespace() const noexcept override { return "torch2trt"; } @@ -255,26 +255,26 @@ class InterpolatePluginCreator : public IPluginCreator { public: InterpolatePluginCreator() {} - const char *getPluginNamespace() const override { + const char *getPluginNamespace() const noexcept override { return "torch2trt"; } - const char *getPluginName() const override { + const char *getPluginName() const noexcept override { return "interpolate"; } - const char *getPluginVersion() const override { + const char *getPluginVersion() const noexcept override { return "1"; } - IPluginV2 *deserializePlugin(const char *name, const void *data, size_t length) override { + IPluginV2 *deserializePlugin(const char *name, const void *data, size_t length) noexcept override { return new InterpolatePlugin((const char*) data, length); } - void setPluginNamespace(const char *N) override {} - const PluginFieldCollection *getFieldNames() override { return nullptr; } + void setPluginNamespace(const char *N) noexcept override {} + const PluginFieldCollection *getFieldNames() noexcept override { return nullptr; } - IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) override { return nullptr; } + IPluginV2 *createPlugin(const char *name, const PluginFieldCollection *fc) noexcept override { return nullptr; } }; diff --git a/torch2trt/test.py b/torch2trt/test.py index 3ea7b509..82b312eb 100644 --- a/torch2trt/test.py +++ b/torch2trt/test.py @@ -142,7 +142,6 @@ def run(self): num_tests, num_success, num_tolerance, num_error, num_tolerance_psnr = 0, 0, 0, 0, 0 for test in MODULE_TESTS: - # filter by module name name = test.module_name() if not re.search(args.name, name): diff --git a/torch2trt/torch2trt.py b/torch2trt/torch2trt.py index 6db93412..0cbd8f5e 100644 --- a/torch2trt/torch2trt.py +++ b/torch2trt/torch2trt.py @@ -406,7 +406,7 @@ def add_inputs(self, torch_inputs, names=None): shape=tuple(torch_input.shape)[1:], dtype=torch_dtype_to_trt(torch_input.dtype), ) - trt_tensor.location = torch_device_to_trt(torch_input.device) + trt_tensor.location = trt.TensorLocation.DEVICE torch_input._trt = trt_tensor def mark_outputs(self, torch_outputs, names=None): @@ -417,7 +417,7 @@ def mark_outputs(self, torch_outputs, names=None): for i, torch_output in enumerate(torch_outputs): trt_tensor = torch_output._trt trt_tensor.name = names[i] - trt_tensor.location = torch_device_to_trt(torch_output.device) + trt_tensor.location = trt.TensorLocation.DEVICE trt_tensor.dtype = torch_dtype_to_trt(torch_output.dtype) self.network.mark_output(trt_tensor) @@ -501,10 +501,12 @@ def torch2trt(module, strict_type_constraints=False, keep_network=True, int8_mode=False, + int8_calibrator=None, int8_calib_dataset=None, int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM, int8_calib_batch_size=1, use_onnx=False, + onnx_file_path=None, **kwargs): # capture arguments to provide to context @@ -518,7 +520,7 @@ def torch2trt(module, logger = trt.Logger(log_level) builder = trt.Builder(logger) config = builder.create_builder_config() - + if isinstance(inputs, list): inputs = tuple(inputs) if not isinstance(inputs, tuple): @@ -535,14 +537,24 @@ def torch2trt(module, output_names = default_output_names(len(outputs)) if use_onnx: - - f = io.BytesIO() - torch.onnx.export(module, inputs, f, input_names=input_names, output_names=output_names) - f.seek(0) - onnx_bytes = f.read() network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) parser = trt.OnnxParser(network, logger) - parser.parse(onnx_bytes) + if onnx_file_path is not None: + print('\tBeginning ONNX file parsing.. path = ', onnx_file_path) + with open(onnx_file_path, 'rb') as onnx_model_file: + onnx_model = onnx_model_file.read() + if not parser.parse(onnx_model): + raise RuntimeError("Onnx model parsing from {} failed. Error: {}".format(onnx_model_file, parser.get_error(0).desc())) + else: + parser.parse(onnx_model) + print('\tEND ONNX file parsing.') + else: + f = io.BytesIO() + torch.onnx.export(module, inputs, f, input_names=input_names, output_names=output_names, opset_version=13) + f.seek(0) + onnx_bytes = f.read() + # parser = trt.OnnxParser(network, logger) + parser.parse(onnx_bytes) else: network = builder.create_network() @@ -555,17 +567,11 @@ def torch2trt(module, if not isinstance(outputs, tuple) and not isinstance(outputs, list): outputs = (outputs,) ctx.mark_outputs(outputs, output_names) - - # set max workspace size + + config = builder.create_builder_config() config.max_workspace_size = max_workspace_size - - if fp16_mode: - config.set_flag(trt.BuilderFlag.FP16) - + config.flags = fp16_mode << int(trt.BuilderFlag.FP16) | strict_type_constraints << int(trt.BuilderFlag.STRICT_TYPES) builder.max_batch_size = max_batch_size - - if strict_type_constraints: - config.set_flag(trt.BuilderFlag.STRICT_TYPES) if int8_mode: @@ -573,15 +579,19 @@ def torch2trt(module, if int8_calib_dataset is None: int8_calib_dataset = TensorBatchDataset(inputs_in) + config.set_flag(trt.BuilderFlag.FP16) config.set_flag(trt.BuilderFlag.INT8) - + #Making sure not to run calibration with QAT mode on if not 'qat_mode' in kwargs: # @TODO(jwelsh): Should we set batch_size=max_batch_size? Need to investigate memory consumption - calibrator = DatasetCalibrator( - inputs, int8_calib_dataset, batch_size=int8_calib_batch_size, algorithm=int8_calib_algorithm - ) - config.int8_calibrator = calibrator + if int8_calibrator is None: + calibrator = DatasetCalibrator( + inputs, int8_calib_dataset, batch_size=int8_calib_batch_size, algorithm=int8_calib_algorithm + ) + config.int8_calibrator = calibrator + else: + config.int8_calibrator = int8_calibrator engine = builder.build_engine(network, config)