-
Notifications
You must be signed in to change notification settings - Fork 292
Sequential calibrate refactor #982
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1a4d650
99b5134
97a6d4b
437a4ae
0cba938
9e989c4
0df2540
d4c07a7
537902b
ef44d05
54b2a6a
c18d109
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,14 +28,10 @@ | |
| from tqdm import tqdm | ||
|
|
||
| from modelopt.torch.opt.searcher import ForwardLoop | ||
| from modelopt.torch.quantization.utils import LayerActivationCollector | ||
| from modelopt.torch.quantization.utils.activation_collector import LayerActivationCollector | ||
| from modelopt.torch.utils import print_rank_0 | ||
| from modelopt.torch.utils.distributed import DistributedProcessGroup, ParallelState | ||
| from modelopt.torch.utils.network import ( | ||
| bind_forward_method, | ||
| get_decoder_layers, | ||
| unpatch_forward_method, | ||
| ) | ||
| from modelopt.torch.utils.network import bind_forward_method, unpatch_forward_method | ||
| from modelopt.torch.utils.perf import get_used_gpu_mem_fraction | ||
|
|
||
| from .calib import MseCalibrator, NVFP4MSECalibrator | ||
|
|
@@ -1838,31 +1834,42 @@ def sequential_calibrate( | |
| calib_func: Callable, | ||
| **calib_kwargs, | ||
| ): | ||
| """Sequential calibration - a sequential layer-by-layer calibration algorithm.""" | ||
| """Sequential calibration - a sequential layer-by-layer calibration algorithm. | ||
|
|
||
| Runs the full model forward per layer but patches decoder layers with a | ||
| skip / run / capture strategy so that inter-layer logic in parent modules | ||
| (e.g. mask construction) executes naturally without model-specific hooks. | ||
| """ | ||
| if forward_loop is None: | ||
| raise ValueError("forward_loop must not be None for sequential calibration.") | ||
| raise ValueError( | ||
| "forward_loop must not be None for sequential calibration. " | ||
| "Please provide a valid forward_loop callable." | ||
| ) | ||
|
|
||
| transformer_layers = get_decoder_layers(model) | ||
| if transformer_layers is None: | ||
| transformer_layers = LayerActivationCollector.get_decoder_layers(model) | ||
| if transformer_layers is None or len(transformer_layers) == 0: | ||
| raise ValueError( | ||
| "Could not find transformer layers in model'. " | ||
| "Could not find transformer layers in model. " | ||
| "Sequential calibration requires a model with identifiable transformer layers." | ||
| ) | ||
|
|
||
| print_rank_0(f"Sequential calibration: Found {len(transformer_layers)} transformer layers") | ||
|
|
||
| gettr = LayerActivationCollector(model) | ||
| input_getter = LayerActivationCollector(model) | ||
| input_getter._patch_all_layers(decoder_layers=transformer_layers) | ||
|
|
||
| for layer in transformer_layers: | ||
| # Get updated input activations to the current layer | ||
| layer_inputs = gettr.get_input_activations(layer, forward_loop) | ||
| try: | ||
| for layer_idx, layer in enumerate(transformer_layers): | ||
| print_rank_0(f"Calibrating layer {layer_idx}") | ||
| layer_inputs = input_getter.get_input_activations(layer, forward_loop) | ||
|
|
||
| # Define a forward loop for the current layer | ||
| def _layer_forward_loop(m, _inputs=layer_inputs): | ||
| for args, kwargs_input in _inputs: | ||
| m(*args, **kwargs_input) | ||
| def _layer_forward_loop(m, _inputs=layer_inputs): | ||
| for args, kwargs_input in _inputs: | ||
| m(*args, **kwargs_input) | ||
|
|
||
| # Call calibration function | ||
| calib_func(layer, _layer_forward_loop, **calib_kwargs) | ||
| del layer_inputs | ||
| torch.cuda.empty_cache() | ||
| calib_func(layer, _layer_forward_loop, **calib_kwargs) | ||
|
Comment on lines
+1864
to
+1872
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Reject empty captures before invoking `calib_func`. If `forward_loop` yields no batches, `layer_inputs` will be empty and `_layer_forward_loop` silently becomes a no-op, so the layer is never calibrated. Suggested guard:

for layer_idx, layer in enumerate(transformer_layers):
print_rank_0(f"Calibrating layer {layer_idx}")
layer_inputs = input_getter.get_input_activations(layer, forward_loop)
+ if not layer_inputs:
+ raise RuntimeError(
+ f"No calibration inputs were captured for layer {layer_idx}. "
+ "Please ensure `forward_loop` executes at least one forward pass."
+ )
def _layer_forward_loop(m, _inputs=layer_inputs):
for args, kwargs_input in _inputs:
m(*args, **kwargs_input)🤖 Prompt for AI Agents |
||
|
|
||
| del layer_inputs | ||
| torch.cuda.empty_cache() | ||
| finally: | ||
| input_getter._unpatch_all_layers() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,7 +56,7 @@ | |
| else: | ||
| weight_dequant = None | ||
|
|
||
| from ..utils import replace_function, sync_moe_expert_amax | ||
| from ..utils import LayerActivationCollector, replace_function, sync_moe_expert_amax | ||
| from .attention import register_attention_for_kv_quant | ||
| from .custom import CUSTOM_MODEL_PLUGINS, _ParallelLinear, _QuantFunctionalMixin | ||
|
|
||
|
|
@@ -1216,6 +1216,42 @@ def _is_supported_hf_model(model): | |
| return isinstance(model, tuple(supported_models)) | ||
|
|
||
|
|
||
| def is_nemotron_h_model(model: nn.Module) -> bool: | ||
| return get_nemotron_h_decoder_layers(model) is not None | ||
|
|
||
|
|
||
| def get_nemotron_h_decoder_layers(model: nn.Module) -> nn.ModuleList | None: | ||
| if not _is_supported_hf_model(model): | ||
| return None | ||
|
|
||
| if hasattr(model, "backbone") and hasattr(model.backbone, "layers"): | ||
| layers = model.backbone.layers | ||
| if len(layers) > 0 and hasattr(layers[0], "block_type"): | ||
| return layers | ||
|
|
||
| return None | ||
|
|
||
|
|
||
| def is_homogeneous_hf_model(model: nn.Module) -> bool: | ||
| if is_nemotron_h_model(model): | ||
| return False | ||
| decoder_layers = get_homogeneous_hf_decoder_layers(model) | ||
| if decoder_layers is None or len(decoder_layers) == 0: | ||
| return False | ||
| layer_classes = {type(layer) for layer in decoder_layers} | ||
| return len(layer_classes) == 1 | ||
|
|
||
|
|
||
| def get_homogeneous_hf_decoder_layers(model: nn.Module) -> nn.ModuleList | None: | ||
| if not _is_supported_hf_model(model): | ||
| return None | ||
|
|
||
| if hasattr(model, "model") and hasattr(model.model, "layers"): | ||
| return model.model.layers | ||
|
|
||
| return None | ||
|
|
||
|
|
||
| @contextmanager | ||
| def setup_model_for_gradient_checkpointing(model: nn.Module): | ||
| use_cache = None | ||
|
|
@@ -1269,6 +1305,17 @@ def _is_param_grad_enabled_for_auto_quantize(pname, model): | |
| _is_param_grad_enabled_for_auto_quantize, | ||
| ) | ||
|
|
||
| # Order matters: more specific predicates must be registered first because | ||
| # the first matching entry wins. Nemotron-H must precede the generic | ||
| # homogeneous HF discoverer (which explicitly rejects Nemotron-H). | ||
| LayerActivationCollector.register_decoder_layer_support( | ||
| is_nemotron_h_model, get_nemotron_h_decoder_layers | ||
| ) | ||
|
|
||
| LayerActivationCollector.register_decoder_layer_support( | ||
| is_homogeneous_hf_model, get_homogeneous_hf_decoder_layers | ||
| ) | ||
|
Comment on lines
+1311
to
+1317
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does not cover the pattern for Megatron models, which have |
||
|
|
||
| CUSTOM_MODEL_PLUGINS.update( | ||
| [ | ||
| register_falcon_linears_on_the_fly, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """Quantization utilities.""" | ||
|
|
||
| from .core_utils import * # noqa: F401,F403 | ||
| from .activation_collector import LayerActivationCollector # noqa: F401 | ||
|
|
||
| __all__ = [ | ||
| "EXPORT_MODE", | ||
| "convert_quantization_axis_to_reduce_axis", | ||
| "export_torch_mode", | ||
| "is_quantized", | ||
| "is_quantized_column_parallel_linear", | ||
| "is_quantized_linear", | ||
| "is_quantized_row_parallel_linear", | ||
| "reduce_amax", | ||
| "reduce_sum", | ||
| "replace_function", | ||
| "update_quant_cfg_with_kv_cache_quant", | ||
| "weight_attr_names", | ||
| ] | ||
|
Comment on lines
+18
to
+34
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Make the package exports explicit here. This hunk is the source of both CI failures: Ruff cannot resolve the names referenced from `__all__` through the star import. One minimal way to satisfy both tools without shrinking the current runtime surface:

from .core_utils import * # noqa: F401,F403
-from .activation_collector import LayerActivationCollector # noqa: F401
+from .core_utils import (
+ EXPORT_MODE,
+ convert_quantization_axis_to_reduce_axis,
+ export_torch_mode,
+ is_quantized,
+ is_quantized_column_parallel_linear,
+ is_quantized_linear,
+ is_quantized_row_parallel_linear,
+ reduce_amax,
+ reduce_sum,
+ replace_function,
+ update_quant_cfg_with_kv_cache_quant,
+ weight_attr_names,
+)
+from .activation_collector import LayerActivationCollector
__all__ = [
"EXPORT_MODE",
"convert_quantization_axis_to_reduce_axis",
"export_torch_mode",
@@
"replace_function",
"update_quant_cfg_with_kv_cache_quant",
"weight_attr_names",
+ "LayerActivationCollector",
]As per coding guidelines, 🤖 Prompt for AI Agents |
||
Uh oh!
There was an error while loading. Please reload this page.