Skip to content
53 changes: 30 additions & 23 deletions modelopt/torch/quantization/model_calib.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,10 @@
from tqdm import tqdm

from modelopt.torch.opt.searcher import ForwardLoop
from modelopt.torch.quantization.utils import LayerActivationCollector
from modelopt.torch.quantization.utils.activation_collector import LayerActivationCollector
from modelopt.torch.utils import print_rank_0
from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState
from modelopt.torch.utils.network import (
bind_forward_method,
get_decoder_layers,
unpatch_forward_method,
)
from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method
from modelopt.torch.utils.perf import get_used_gpu_mem_fraction

from .calib import MseCalibrator, NVFP4MSECalibrator
Expand Down Expand Up @@ -1838,31 +1834,42 @@ def sequential_calibrate(
calib_func: Callable,
**calib_kwargs,
):
"""Sequential calibration - a sequential layer-by-layer calibration algorithm."""
"""Sequential calibration - a sequential layer-by-layer calibration algorithm.

Runs the full model forward per layer but patches decoder layers with a
skip / run / capture strategy so that inter-layer logic in parent modules
(e.g. mask construction) executes naturally without model-specific hooks.
"""
if forward_loop is None:
raise ValueError("forward_loop must not be None for sequential calibration.")
raise ValueError(
"forward_loop must not be None for sequential calibration. "
"Please provide a valid forward_loop callable."
)

transformer_layers = get_decoder_layers(model)
if transformer_layers is None:
transformer_layers = LayerActivationCollector.get_decoder_layers(model)
if transformer_layers is None or len(transformer_layers) == 0:
raise ValueError(
"Could not find transformer layers in model'. "
"Could not find transformer layers in model. "
"Sequential calibration requires a model with identifiable transformer layers."
)

print_rank_0(f"Sequential calibration: Found {len(transformer_layers)} transformer layers")

gettr = LayerActivationCollector(model)
input_getter = LayerActivationCollector(model)
input_getter._patch_all_layers(decoder_layers=transformer_layers)

for layer in transformer_layers:
# Get updated input activations to the current layer
layer_inputs = gettr.get_input_activations(layer, forward_loop)
try:
for layer_idx, layer in enumerate(transformer_layers):
print_rank_0(f"Calibrating layer {layer_idx}")
layer_inputs = input_getter.get_input_activations(layer, forward_loop)

# Define a forward loop for the current layer
def _layer_forward_loop(m, _inputs=layer_inputs):
for args, kwargs_input in _inputs:
m(*args, **kwargs_input)
def _layer_forward_loop(m, _inputs=layer_inputs):
for args, kwargs_input in _inputs:
m(*args, **kwargs_input)

# Call calibration function
calib_func(layer, _layer_forward_loop, **calib_kwargs)
del layer_inputs
torch.cuda.empty_cache()
calib_func(layer, _layer_forward_loop, **calib_kwargs)
Comment on lines +1864 to +1872
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Reject empty captures before invoking calib_func.

If forward_loop yields no calibration batches, layer_inputs is empty and we silently run calib_func with a no-op replay loop. That makes sequential calibration look successful while leaving the layer uncalibrated.

Suggested guard
         for layer_idx, layer in enumerate(transformer_layers):
             print_rank_0(f"Calibrating layer {layer_idx}")
             layer_inputs = input_getter.get_input_activations(layer, forward_loop)
+            if not layer_inputs:
+                raise RuntimeError(
+                    f"No calibration inputs were captured for layer {layer_idx}. "
+                    "Please ensure `forward_loop` executes at least one forward pass."
+                )
 
             def _layer_forward_loop(m, _inputs=layer_inputs):
                 for args, kwargs_input in _inputs:
                     m(*args, **kwargs_input)
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/quantization/model_calib.py` around lines 1864 - 1872, The
loop calls calib_func even when input_getter.get_input_activations(...) returns
an empty list, causing silent no-op calibration; before constructing
_layer_forward_loop and calling calib_func for each layer (symbols:
transformer_layers, input_getter.get_input_activations, forward_loop,
layer_inputs, _layer_forward_loop, calib_func), check whether layer_inputs is
empty and if so reject it (either raise an exception or skip with a clear
error/warning and do not call calib_func) so layers with no captured batches are
not treated as successfully calibrated.


del layer_inputs
torch.cuda.empty_cache()
finally:
input_getter._unpatch_all_layers()
49 changes: 48 additions & 1 deletion modelopt/torch/quantization/plugins/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
else:
weight_dequant = None

from ..utils import replace_function, sync_moe_expert_amax
from ..utils import LayerActivationCollector, replace_function, sync_moe_expert_amax
from .attention import register_attention_for_kv_quant
from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear, _QuantFunctionalMixin

Expand Down Expand Up @@ -1216,6 +1216,42 @@ def _is_supported_hf_model(model):
return isinstance(model, tuple(supported_models))


def is_nemotron_h_model(model: nn.Module) -> bool:
    """Return ``True`` when *model* exposes a Nemotron-H style decoder stack.

    Delegates to :func:`get_nemotron_h_decoder_layers`; a non-``None`` result
    means the model matched the Nemotron-H layout.
    """
    layers = get_nemotron_h_decoder_layers(model)
    return layers is not None


def get_nemotron_h_decoder_layers(model: nn.Module) -> nn.ModuleList | None:
    """Return the Nemotron-H decoder layer list, or ``None`` if *model* does not match.

    A Nemotron-H model keeps its decoder stack at ``model.backbone.layers``,
    and each layer carries a ``block_type`` attribute; both traits are checked.
    """
    if not _is_supported_hf_model(model):
        return None

    if hasattr(model, "backbone") and hasattr(model.backbone, "layers"):
        candidate = model.backbone.layers
        # Require at least one layer and the Nemotron-H marker attribute.
        if len(candidate) and hasattr(candidate[0], "block_type"):
            return candidate

    return None


def is_homogeneous_hf_model(model: nn.Module) -> bool:
    """Return ``True`` when *model* has a non-empty decoder stack of a single layer class.

    Nemotron-H models are excluded up front: their stacks mix block types and
    are handled by the dedicated Nemotron-H discoverer instead.
    """
    if is_nemotron_h_model(model):
        return False

    layers = get_homogeneous_hf_decoder_layers(model)
    if layers is None or not len(layers):
        return False

    # Homogeneous means every layer shares the class of the first one.
    first_cls = type(layers[0])
    return all(type(layer) is first_cls for layer in layers)


def get_homogeneous_hf_decoder_layers(model: nn.Module) -> nn.ModuleList | None:
    """Return the standard HF decoder layer list (``model.model.layers``), or ``None``.

    Only models accepted by ``_is_supported_hf_model`` are inspected; anything
    else, or a model missing the conventional ``model.layers`` attribute chain,
    yields ``None``.
    """
    if not _is_supported_hf_model(model):
        return None

    if hasattr(model, "model") and hasattr(model.model, "layers"):
        return model.model.layers

    return None


@contextmanager
def setup_model_for_gradient_checkpointing(model: nn.Module):
use_cache = None
Expand Down Expand Up @@ -1269,6 +1305,17 @@ def _is_param_grad_enabled_for_auto_quantize(pname, model):
_is_param_grad_enabled_for_auto_quantize,
)

# Order matters: more specific predicates must be registered first because
# the first matching entry wins. Nemotron-H must precede the generic
# homogeneous HF discoverer (which explicitly rejects Nemotron-H).
LayerActivationCollector.register_decoder_layer_support(
is_nemotron_h_model, get_nemotron_h_decoder_layers
)

LayerActivationCollector.register_decoder_layer_support(
is_homogeneous_hf_model, get_homogeneous_hf_decoder_layers
)
Comment on lines +1311 to +1317
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not cover the pattern for Megatron models, which have model.decoder.layers. A possible follow-up is to add similar support for mcore (Megatron-Core) models.


CUSTOM_MODEL_PLUGINS.update(
[
register_falcon_linears_on_the_fly,
Expand Down
34 changes: 34 additions & 0 deletions modelopt/torch/quantization/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quantization utilities."""

from .core_utils import * # noqa: F401,F403
from .activation_collector import LayerActivationCollector # noqa: F401

__all__ = [
"EXPORT_MODE",
"convert_quantization_axis_to_reduce_axis",
"export_torch_mode",
"is_quantized",
"is_quantized_column_parallel_linear",
"is_quantized_linear",
"is_quantized_row_parallel_linear",
"reduce_amax",
"reduce_sum",
"replace_function",
"update_quant_cfg_with_kv_cache_quant",
"weight_attr_names",
]
Comment on lines +18 to +34
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Make the package exports explicit here.

This hunk is the source of both CI failures: Ruff cannot resolve the names referenced from __all__ through the star import, and mypy does not see LayerActivationCollector as exported because it is missing from __all__. Keep the star import if you still need the broader runtime surface, but explicitly import the symbols referenced in __all__ and add LayerActivationCollector there.

One minimal way to satisfy both tools without shrinking the current runtime surface
 from .core_utils import *  # noqa: F401,F403
-from .activation_collector import LayerActivationCollector  # noqa: F401
+from .core_utils import (
+    EXPORT_MODE,
+    convert_quantization_axis_to_reduce_axis,
+    export_torch_mode,
+    is_quantized,
+    is_quantized_column_parallel_linear,
+    is_quantized_linear,
+    is_quantized_row_parallel_linear,
+    reduce_amax,
+    reduce_sum,
+    replace_function,
+    update_quant_cfg_with_kv_cache_quant,
+    weight_attr_names,
+)
+from .activation_collector import LayerActivationCollector
 
 __all__ = [
     "EXPORT_MODE",
     "convert_quantization_axis_to_reduce_axis",
     "export_torch_mode",
@@
     "replace_function",
     "update_quant_cfg_with_kv_cache_quant",
     "weight_attr_names",
+    "LayerActivationCollector",
 ]

As per coding guidelines, **/*.py: Use ruff linter for Python code (configured in pyproject.toml) and Use mypy for type checking on Python code (configured in pyproject.toml).

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/torch/quantization/utils/__init__.py` around lines 18 - 34, Add
explicit imports for every name listed in __all__ (e.g., EXPORT_MODE,
convert_quantization_axis_to_reduce_axis, export_torch_mode, is_quantized,
is_quantized_column_parallel_linear, is_quantized_linear,
is_quantized_row_parallel_linear, reduce_amax, reduce_sum, replace_function,
update_quant_cfg_with_kv_cache_quant, weight_attr_names) at the top of
modelopt/torch/quantization/utils/__init__.py (you may keep the existing from
.core_utils import * for runtime surface), and include LayerActivationCollector
in the __all__ list so mypy and Ruff can resolve the exports; ensure the
explicit imports reference the correct module paths (e.g., from
.activation_collector import LayerActivationCollector) and update the __all__
array to contain all those symbol names.

Loading
Loading