
Commit 55b9106

Add changes for 0.33.1 release
Signed-off-by: Keval Morabia <[email protected]>
1 parent 7a27f2a commit 55b9106

10 files changed: 64 additions & 28 deletions


CHANGELOG.rst

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,7 @@
 Model Optimizer Changelog (Linux)
 =================================

-0.33 (2025-07-xx)
+0.33 (2025-07-14)
 ^^^^^^^^^^^^^^^^^

 **Backward Breaking Changes**
@@ -20,7 +20,8 @@ Model Optimizer Changelog (Linux)
 - Add per node calibration support in ONNX quantization.
 - ModelOpt now supports quantization of tensor-parallel sharded Huggingface transformer models. This requires ``transformers>=4.52.0``.
 - Support quantization of FSDP2 wrapped models and add FSDP2 support in the ``llm_qat`` example.
-- Add NeMo 2 Simplified Flow examples for quantization aware training/distillation (QAT/QAD), speculative decoding, pruning & distilllation.
+- Add NeMo 2 Simplified Flow examples for quantization aware training/distillation (QAT/QAD), speculative decoding, pruning & distillation.
+- Fix a Qwen3 MOE model export issue.

 0.31 (2025-06-04)
 ^^^^^^^^^^^^^^^^^

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 3 additions & 1 deletion
@@ -410,14 +410,16 @@ def get_onnx_bytes_and_metadata(
     )
     with torch.inference_mode(), autocast, quantizer_context:
         if not dynamo_export or Version(torch.__version__) >= Version("2.6"):
+            additional_kwargs = {}
+            if not dynamo_export and Version(torch.__version__) >= Version("2.8"):
+                additional_kwargs["dynamic_axes"] = dynamic_axes
             torch.onnx.export(
                 model,
                 dummy_input,
                 onnx_save_path,
                 input_names=input_names,
                 output_names=output_names,
                 opset_version=onnx_opset,
-                dynamic_axes=dynamic_axes,
                 dynamo=dynamo_export,
             )
         else:  # torch < 2.6 with dynamo export
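
For context, this change builds the export kwargs dynamically so that dynamic_axes is only supplied where the exporter expects it. Below is a minimal standalone sketch of the same version-gated pattern, assuming torch >= 2.6 (for the dynamo flag) and using a throwaway linear model and file name rather than ModelOpt's real inputs. The hunk does not show where additional_kwargs is consumed; here it is simply splatted into the export call as an assumption.

import torch
from packaging.version import Version

# Hypothetical stand-ins for the model and dummy input used by ModelOpt.
model = torch.nn.Linear(16, 8)
dummy_input = torch.randn(2, 16)

additional_kwargs = {}
# Mirror the diff: only attach dynamic_axes on torch >= 2.8 for the non-dynamo path.
if Version(torch.__version__) >= Version("2.8"):
    additional_kwargs["dynamic_axes"] = {"input": {0: "batch"}}

torch.onnx.export(
    model,
    (dummy_input,),
    "tiny_linear.onnx",
    input_names=["input"],
    output_names=["output"],
    opset_version=17,
    dynamo=False,
    **additional_kwargs,
)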

modelopt/torch/export/unified_export_hf.py

Lines changed: 30 additions & 0 deletions
@@ -17,6 +17,7 @@

 import collections.abc
 import json
+import re
 import tempfile
 import warnings
 from collections import defaultdict
@@ -97,7 +98,12 @@ def _output_hook(module, input, output):
     handles = []
     model_type = type(model).__name__.lower()

+    fused_linears = {}
+    module_names = set()
+
     for name, module in model.named_modules():
+        module_names.add(name)
+
         # For MoE models update pre_quant_scale to average pre_quant_scale amongst experts
         if is_moe(module) and ("awq" in quantization_format):
             # update_experts_avg_prequant_scale(module)
@@ -151,6 +157,7 @@ def _output_hook(module, input, output):
         ]:
             # Fuse modules that have the same input
             preprocess_linear_fusion(modules)
+            fused_linears[modules[0].name] = [module.name for module in modules]

     # Fuse layernorms
     if (
@@ -161,6 +168,29 @@ def _output_hook(module, input, output):
         # Pre quant scale of modules is already updated to avg_pre_quant_scale
         fuse_prequant_layernorm(output_to_layernorm[tensor], modules)

+    # The dummy forward may not be able to activate all the experts.
+    # Process experts by naming rules like experts.0, experts.1, etc.
+    for name, modules_fused in fused_linears.items():
+        if re.search(r"experts?\.\d+", name):
+            expert_id = 0
+            while True:
+                new_expert_name = re.sub(r"(experts?\.)\d+", rf"\g<1>{expert_id}", name, count=1)
+                if new_expert_name in fused_linears:
+                    expert_id += 1
+                    continue
+                if new_expert_name not in module_names:
+                    break
+
+                new_expert_modules = []
+                for name_fused in modules_fused:
+                    new_expert_name = re.sub(r"(experts?\.)\d+", rf"\g<1>{expert_id}", name_fused)
+                    assert new_expert_name in module_names
+                    new_expert_modules.append(model.get_submodule(new_expert_name))
+
+                preprocess_linear_fusion(new_expert_modules)
+
+                expert_id += 1
+

 def _export_hf_checkpoint(
     model: nn.Module, dtype: torch.dtype | None = None
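
The new loop walks expert-numbered module names so that experts the dummy forward never activated still receive the same linear fusion as the expert that was traced. The following self-contained sketch shows only the name-iteration logic; the module names are made up, and preprocess_linear_fusion is replaced with a print, since only the re handling is being illustrated.

import re

# Hypothetical module names: the dummy forward only routed tokens to expert 0,
# so only its projection was fused; experts 1 and 2 still need the same treatment.
module_names = {
    "layers.0.mlp.experts.0.gate_proj",
    "layers.0.mlp.experts.1.gate_proj",
    "layers.0.mlp.experts.2.gate_proj",
}
fused_linears = {"layers.0.mlp.experts.0.gate_proj"}

name = "layers.0.mlp.experts.0.gate_proj"
expert_id = 0
while True:
    # Rewrite the expert index in the fused name, e.g. "experts.0" -> "experts.2".
    candidate = re.sub(r"(experts?\.)\d+", rf"\g<1>{expert_id}", name, count=1)
    if candidate in fused_linears:
        expert_id += 1
        continue
    if candidate not in module_names:
        break
    print("would fuse:", candidate)  # the real code calls preprocess_linear_fusion here
    expert_id += 1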

modelopt/torch/nas/modules/utils.py

Lines changed: 2 additions & 2 deletions
@@ -40,9 +40,9 @@ def get_sliced_tensor_by_slices(
     tensor_sliced = tensor
     for i, _ in enumerate(slices):
         if sum(not isinstance(s, slice) for s in slices) < 2:
-            tensor_sliced = tensor_sliced[slices]
+            tensor_sliced = tensor_sliced[tuple(slices)]
             break
-        tensor_sliced = tensor_sliced[slices[: i + 1]]
+        tensor_sliced = tensor_sliced[tuple(slices[: i + 1])]
         slices[i] = slice(None)  # replace with a vanilla slice ("[:]") for next slicing iteration

     # return sliced, contiguous tensor
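
The tuple() wrapping matters because PyTorch interprets a tuple index as one slice per dimension, whereas a bare Python list is treated as advanced indexing and is rejected or deprecated on recent releases. A tiny sketch, independent of ModelOpt, of the behavior the fix relies on:

import torch

t = torch.arange(24).reshape(4, 6)
slices = [slice(0, 2), slice(1, 4)]

# A tuple applies one slice per dimension, giving the expected 2x3 view.
print(t[tuple(slices)].shape)  # torch.Size([2, 3])
# Indexing with the bare list (t[slices]) is not per-dimension slicing and fails
# or warns on recent PyTorch releases, hence the tuple() conversion in the fix.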

modelopt/torch/quantization/qtensor/base_qtensor.py

Lines changed: 1 addition & 1 deletion
@@ -173,7 +173,7 @@ def pack_real_quantize_weight(module, force_quantize: bool = False):

     with SequentialQuantizer.convert_to_single_quantizer(module), torch.no_grad():
         for _, m in module.named_modules():
-            if hasattr(m, "weight") and m.weight.is_meta:
+            if hasattr(m, "weight") and (m.weight is None or m.weight.is_meta):
                 continue
             if (
                 hasattr(m, "weight_quantizer")
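
The added m.weight is None check covers modules that register a weight attribute without materializing it. A quick sketch of the guard against stock modules, nn.LayerNorm with elementwise_affine=False being one such case:

import torch.nn as nn

for m in [nn.Linear(8, 8), nn.LayerNorm(8, elementwise_affine=False)]:
    # Without the `is None` check, LayerNorm's weight (registered as None here)
    # would make `m.weight.is_meta` raise an AttributeError.
    if hasattr(m, "weight") and (m.weight is None or m.weight.is_meta):
        print(type(m).__name__, "skipped: no materialized weight")
    else:
        print(type(m).__name__, "weight shape:", tuple(m.weight.shape))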

modelopt/torch/quantization/tensor_quant.py

Lines changed: 8 additions & 1 deletion
@@ -25,7 +25,6 @@
 import modelopt.torch.quantization.triton as triton_kernel

 from .config import QuantizerAttributeConfig
-from .export_onnx import export_fp4, export_fp8, export_int8, export_mxfp8
 from .extensions import get_cuda_ext, get_cuda_ext_fp8, get_cuda_ext_mx

 mx_format_map = {
@@ -325,6 +324,8 @@ def symbolic(
         trt_high_precision_dtype=None,
     ):
         """ONNX symbolic function."""
+        from .export_onnx import export_int8
+
         return export_int8(
             g, inputs, amax, num_bits, unsigned, narrow_range, trt_high_precision_dtype
         )
@@ -395,6 +396,8 @@ class ScaledE4M3Function(Function):
     @symbolic_helper.parse_args("v", "t", "t", "i", "i", "s")
     def symbolic(g, inputs, amax=None, bias=None, E=4, M=3, trt_high_precision_dtype=None):  # noqa: N803
         """ONNX symbolic function."""
+        from .export_onnx import export_fp8
+
         return export_fp8(g, inputs, amax, trt_high_precision_dtype)

     @staticmethod
@@ -475,6 +478,8 @@ def symbolic(
         onnx_quantizer_type="dynamic",
     ):
         """ONNX symbolic function."""
+        from .export_onnx import export_fp4, export_mxfp8
+
         if num_bits == (2, 1) and scale_bits == (4, 3):
             return export_fp4(
                 g,
@@ -643,6 +648,8 @@ def symbolic(
         trt_high_precision_dtype=None,
     ):
         """ONNX symbolic function."""
+        from .export_onnx import export_int8
+
         return export_int8(
             g, inputs, amax, num_bits, unsigned, narrow_range, trt_high_precision_dtype
         )
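
Moving the export_onnx imports into the symbolic functions is a deferred-import pattern: the ONNX export helpers are only resolved when a symbolic function actually runs, so importing the quantization module stays cheap. A generic sketch of the idea, using json as a stand-in dependency rather than ModelOpt's export helpers:

def serialize(payload):
    """Illustrative only; json stands in for the ONNX export helpers."""
    import json  # resolved on first call, not when this module is imported

    return json.dumps(payload)

print(serialize({"bits": 8}))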

setup.py

Lines changed: 6 additions & 5 deletions
@@ -23,7 +23,7 @@
 # Package configuration ############################################################################
 name = "nvidia-modelopt"
 version = os.environ.get(
-    "SETUPTOOLS_SCM_PRETEND_VERSION", "0.33.0" if platform.system() == "Linux" else "0.27.0"
+    "SETUPTOOLS_SCM_PRETEND_VERSION", "0.33.1" if platform.system() == "Linux" else "0.27.0"
 )
 packages = setuptools.find_namespace_packages(include=["modelopt*"])
 package_dir = {"": "."}
@@ -56,11 +56,13 @@
         "cppimport",
         "cupy-cuda12x; platform_machine != 'aarch64' and platform_system != 'Darwin'",
         "ml_dtypes",  # for bfloat16 conversion
-        "onnx>=1.18.0",
         "onnx-graphsurgeon",
+        "onnx>=1.18.0",
+        "onnxconverter-common",
         "onnxruntime~=1.22.0 ; platform_machine == 'aarch64' or platform_system == 'Darwin'",
         "onnxruntime-gpu~=1.22.0 ; platform_machine != 'aarch64' and platform_system != 'Darwin' and platform_system != 'Windows'",  # noqa: E501
         "onnxruntime-gpu==1.20.0; platform_system == 'Windows'",
+        "onnxscript",  # For test_onnx_dynamo_export unit test
         "onnxsim ; python_version < '3.12' and platform_machine != 'aarch64'",
         "polygraphy>=0.49.22",
     ],
@@ -82,13 +84,12 @@
     # testing
     "dev-test": [
         "coverage",
-        "onnxscript",  # For test_onnx_dynamo_export unit test
         "pytest",
         "pytest-cov",
         "pytest-timeout",
         "timm",
-        "tox",
-        "tox-current-env>=0.0.12",  # Incompatible with tox==4.18.0
+        "tox>4.18",
+        "tox-current-env>=0.0.12",
     ],
     # docs
     "dev-docs": [

tests/_test_utils/torch_model/transformers_models.py

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@
     LlamaConfig,
     LlamaForCausalLM,
     T5Config,
-    T5Model,
+    T5ForConditionalGeneration,
     T5Tokenizer,
 )

@@ -50,7 +50,7 @@ def get_tiny_llama(**config_kwargs) -> LlamaForCausalLM:
     return tiny_llama


-def get_tiny_t5(**config_kwargs) -> T5Model:
+def get_tiny_t5(**config_kwargs) -> T5ForConditionalGeneration:
     kwargs = {
         "vocab_size": 32,
         "d_model": 32,
@@ -63,7 +63,7 @@ def get_tiny_t5(**config_kwargs) -> T5Model:
         "decoder_start_token_id": 0,
     }
     kwargs.update(**config_kwargs)
-    t5_model = T5Model(T5Config(**kwargs))
+    t5_model = T5ForConditionalGeneration(T5Config(**kwargs))

     return t5_model
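
Switching the helper to T5ForConditionalGeneration gives the tiny test model an LM head, so it can be driven end to end (for example via generate), which the bare T5Model encoder-decoder stack does not offer. A rough sketch of such a tiny model follows; the config fields beyond vocab_size, d_model, and decoder_start_token_id are filled in only to make the snippet self-contained and may differ from the test helper's values.

import torch
from transformers import T5Config, T5ForConditionalGeneration

config = T5Config(
    vocab_size=32,
    d_model=32,
    d_ff=32,
    d_kv=8,
    num_layers=2,
    num_heads=2,
    decoder_start_token_id=0,
)
model = T5ForConditionalGeneration(config)

input_ids = torch.randint(0, 32, (1, 8))
# generate() works because the LM-head variant is used.
out = model.generate(input_ids, max_new_tokens=4)
print(out.shape)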

tests/unit/torch/quantization/test_quant_rnn.py

Lines changed: 1 addition & 1 deletion
@@ -211,7 +211,7 @@ def test_fake_quant_per_channel(self, original_cls, bidirectional):

         out1 = quant_rnn_object(test_input)[0]
         out2 = rnn_object_original(test_input)[0]
-        assert torch.allclose(out1, out2)
+        assert torch.allclose(out1, out2, atol=1e-5)

     @pytest.mark.parametrize(
         ("original_cls", "bidirectional"),

tests/unit/torch/trace/test_symbol.py

Lines changed: 7 additions & 12 deletions
@@ -19,14 +19,6 @@
 from modelopt.torch.trace import RobustTracer, Symbol, SymMap
 from modelopt.torch.trace.modules.nn import get_conv_sym_info, get_linear_sym_info

-try:
-    import megatron  # noqa: F401
-    import transformer_engine  # noqa: F401
-
-    SKIP = True
-except ImportError:
-    SKIP = False
-

 def test_symbol_cls():
     sym = Symbol(elastic_dims={1, 2}, cl_type=Symbol.CLType.INCOMING)
@@ -117,11 +109,7 @@ def assert_num_symbols():
     assert_num_symbols()


-@pytest.mark.skipif(SKIP, reason="This cpu unit test will fail on GPU with Megatron/TE installed!")
 def test_sym_map_registry():
-    # NOTE: If running with transformer_engine or megatron-core installed, this test will fail.
-    # Ignoring this error for now, as it will only be there if running CPU tests on a GPU machine
-    # with the above packages installed.
     mods_in_registry = {
         nn.Linear,
         nn.BatchNorm1d,
@@ -151,6 +139,13 @@ def test_sym_map_registry():
     except ImportError:
         pass

+    try:
+        from megatron.core.models.gpt import GPTModel
+
+        mods_in_registry.add(GPTModel)
+    except ImportError:
+        pass
+
     not_a_leaf = {nn.Sequential}
     dependent_registry = set()
