157 changes: 157 additions & 0 deletions modelopt/onnx/quantization/autotune/__init__.py
@@ -0,0 +1,157 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pattern-Based Q/DQ Autotuning for ONNX Models.

This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement
in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based
region analysis to efficiently explore and optimize Q/DQ insertion strategies.

**Key Features:**

- **Automated Region Discovery**: Hierarchical decomposition of computation graphs into
LEAF and COMPOSITE regions with automatic pattern identification

- **Pattern-Based Optimization**: Groups structurally similar regions and optimizes them
together, making the process efficient and consistent

- **TensorRT Performance Measurement**: Direct integration with TensorRT Python API for
accurate latency profiling of each Q/DQ configuration

- **State Management**: Checkpoint/resume capability for long-running optimizations with
incremental state saving after each region

- **Pattern Cache**: Warm-start optimization using learned schemes from previous runs,
enabling transfer learning across models

**Core Components:**

Autotuner Classes:
- QDQAutotuner: Main autotuner with automatic hierarchical region discovery
- QDQAutotunerBase: Base class for custom region identification strategies

Region Management:
- Region: Hierarchical subgraph representation (nodes + children)
- RegionType: Enumeration (LEAF, COMPOSITE, ROOT)
- CombinedRegionSearch: Two-phase region discovery (partitioning + refinement)
- RegionPattern: Structural pattern analysis and matching for region grouping

Q/DQ Insertion Points:
- InsertionScheme: Collection of Q/DQ insertion points for a region pattern
- NodeInputInsertionPoint: Q/DQ insertion at specific node inputs
- ChildRegionInputInsertionPoint: Q/DQ insertion at child region input boundaries
- RegionOutputInsertionPoint: Q/DQ insertion at region output boundaries

Configuration & State:
- Config: Autotuning parameters (quant type, thresholds, verbosity)
- PatternCache: Top-performing schemes indexed by pattern (warm-start)
- PatternSchemes: Scheme collection and measurement results for a pattern

Benchmarking:
- Benchmark: Abstract base class for model benchmarking
- TensorRTPyBenchmark: Benchmark using TensorRT Python API (recommended)
- TrtExecBenchmark: Benchmark using trtexec command-line tool (legacy)

**Quick Start:**

>>> from modelopt.onnx.quantization.autotune import QDQAutotuner, Config
>>> import onnx
>>> # Load model and initialize autotuner
>>> model = onnx.load("model.onnx")
>>> autotuner = QDQAutotuner(model)
>>> # Configure autotuning parameters
>>> config = Config(default_quant_type="int8")
>>> autotuner.initialize(config)
>>> # Generate and test Q/DQ schemes
>>> # (see workflows.region_pattern_autotuning_workflow for complete example)

**Command-Line Interface:**

The package can be run directly as a module:

$ python -m modelopt.onnx.quantization.autotune --model model.onnx --output ./output
$ python -m modelopt.onnx.quantization.autotune --model model.onnx --quant-type fp8

**See Also:**

- workflows.region_pattern_autotuning_workflow: Complete end-to-end optimization
- QDQAutotuner: Main autotuner class documentation
- RegionPattern: Pattern matching and signature computation
"""

# Autotuner classes
from .autotuner import QDQAutotuner, QDQAutotunerBase

# Benchmark classes
from .benchmark import Benchmark, TensorRTPyBenchmark, TrtExecBenchmark

# Core data structures
from .common import (
AutotunerError,
AutotunerNotInitializedError,
Config,
InsertionScheme,
InvalidSchemeError,
PatternCache,
PatternSchemes,
Region,
RegionError,
RegionType,
)

# Insertion points (from dedicated module)
from .insertion_points import (
ChildRegionInputInsertionPoint,
NodeInputInsertionPoint,
RegionOutputInsertionPoint,
ResolvedInsertionPoint,
)

# Pattern analysis
from .region_pattern import RegionPattern

# Region search
from .region_search import CombinedRegionSearch

# Public API
__all__ = [
# Exceptions
"AutotunerError",
"AutotunerNotInitializedError",
# Benchmark classes
"Benchmark",
"TensorRTPyBenchmark",
"TrtExecBenchmark",
# Configuration and state
"Config",
# Q/DQ insertion
"InsertionScheme",
"InvalidSchemeError",
"NodeInputInsertionPoint",
"ChildRegionInputInsertionPoint",
"RegionOutputInsertionPoint",
"ResolvedInsertionPoint",
# Main autotuner classes
"QDQAutotuner",
"QDQAutotunerBase",
# Region classes
"Region",
"RegionError",
"RegionPattern",
"RegionType",
"PatternCache",
"PatternSchemes",
"CombinedRegionSearch",
]
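
The names re-exported above are enough for a minimal error-handling sketch around the Quick Start flow. This uses only the `QDQAutotuner(model)` / `initialize(config)` calls shown in the module docstring; treating `AutotunerError` as the catch-all base for the package's exceptions is an assumption.

    import onnx

    from modelopt.onnx.quantization.autotune import AutotunerError, Config, QDQAutotuner

    # Load the FP32 model and set up the autotuner, as in the Quick Start above.
    model = onnx.load("model.onnx")
    autotuner = QDQAutotuner(model)

    try:
        # default_quant_type is the only Config parameter shown in the docstring;
        # other thresholds/verbosity settings are left at their defaults here.
        autotuner.initialize(Config(default_quant_type="int8"))
    except AutotunerError as err:  # assumed base class for this package's errors
        raise SystemExit(f"Autotuner setup failed: {err}")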
91 changes: 91 additions & 0 deletions modelopt/onnx/quantization/autotune/__main__.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ONNX Q/DQ Autotuning Command-Line Interface.

This module provides a command-line interface for automated Q/DQ (Quantize/Dequantize)
optimization of ONNX models. It uses pattern-based region analysis and TensorRT performance
measurement to find optimal Q/DQ insertion points that minimize inference latency.

**Usage Examples:**

# Basic usage - automatic region discovery and optimization
python -m modelopt.onnx.quantization.autotune --model model.onnx

# Use FP8 quantization instead of the default INT8
python -m modelopt.onnx.quantization.autotune --model model.onnx --quant-type fp8

# Warm-start from pattern cache (transfer learning)
python -m modelopt.onnx.quantization.autotune \\
--model model.onnx \\
--pattern-cache ./output/pattern_cache.yaml

# Import patterns from pre-quantized baseline model
python -m modelopt.onnx.quantization.autotune \\
--model model.onnx \\
--qdq-baseline quantized_baseline.onnx

# Full example with all optimization options
python -m modelopt.onnx.quantization.autotune \\
--model model.onnx \\
--schemes-per-region 50 \\
--pattern-cache pattern_cache.yaml \\
--qdq-baseline baseline.onnx \\
--output ./results \\
--quant-type int8 \\
--verbose

# Use custom TensorRT plugins for model-specific operations
python -m modelopt.onnx.quantization.autotune \\
--model model.onnx \\
--plugin-libraries /path/to/plugin1.so /path/to/plugin2.so

**Output Files:**

output_dir/
├── autotuner_state.yaml # Checkpoint for resume capability
├── baseline.onnx # Unquantized baseline model
├── optimized_final.onnx # Final optimized model with Q/DQ
├── logs/ # TensorRT build logs per scheme
│ ├── baseline.log
│ ├── region_*_scheme_*.log
│ └── final.log
└── region_models/ # Best model per region
└── region_*_level_*.onnx
"""

import sys

from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune


def main():
"""Command-line entry point for ONNX Q/DQ autotuning.

Parses command-line arguments and executes the autotuning workflow.

Returns:
Exit code from run_autotune (0 for success, non-zero for errors)
"""
parser = get_autotune_parser()
args = parser.parse_args()

# Run autotuning
return run_autotune(args)


if __name__ == "__main__":
sys.exit(main())
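
The same entry points can also be driven programmatically rather than from the shell. A minimal sketch, assuming `get_autotune_parser()` returns a standard `argparse`-style parser (so `parse_args` accepts an explicit argument list) and using only flags that appear in the usage examples above:

    from modelopt.onnx.quantization.autotune.cli import get_autotune_parser, run_autotune

    # Build the same parser the CLI uses, but pass argv explicitly instead of
    # reading sys.argv. Flags are taken from the usage examples in the docstring.
    parser = get_autotune_parser()
    args = parser.parse_args(
        ["--model", "model.onnx", "--output", "./results", "--quant-type", "int8"]
    )

    exit_code = run_autotune(args)  # 0 on success, non-zero on error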