2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -40,7 +40,7 @@
"--no-cov",
],
"evenBetterToml.schema.enabled": false, // disable toml/json schema since we have custom fields
"python.analysis.extraPaths": [
"cursorpyright.analysis.extraPaths": [
"./tests/" // add tests to python path just like pytest does in pyproject.toml
],
"git.alwaysSignOff": true,
3 changes: 2 additions & 1 deletion CHANGELOG.rst
@@ -14,6 +14,7 @@ NVIDIA Model Optimizer Changelog (Linux)
- Add support for parallel draft heads in Eagle speculative decoding.
- Add support to enable a custom emulated quantization backend. See :meth:`register_quant_backend <modelopt.torch.quantization.nn.modules.tensor_quantizer.register_quant_backend>` for more details. See an example in ``tests/unit/torch/quantization/test_custom_backend.py``.
- Add ``examples/llm_qad`` for QAD training with Megatron-LM.
- Add support for ``params`` constraint based automatic neural architecture search in Minitron pruning (``mcore_minitron``) as an alternative to manual pruning using ``export_config``. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details on its usage.
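
  A minimal usage sketch for the new ``params`` constraint (the model, ``forward_loop``, and the 6e9 target below are illustrative assumptions, not part of this change):

  ```python
  import modelopt.torch.prune as mtp

  # Assumes a Megatron-Core model and a calibration forward loop have been set up
  # as in examples/pruning; the 6B-parameter target is an illustrative value.
  pruned_model, _ = mtp.prune(
      model,
      mode="mcore_minitron",
      constraints={"params": 6e9},  # target parameter count (assumed value format)
      dummy_input=None,  # not used by mcore_minitron
      config={"forward_loop": forward_loop},
  )
  ```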

**Deprecations**

@@ -80,7 +81,7 @@ NVIDIA Model Optimizer Changelog (Linux)

**Documentation**

- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
- Add general guidelines for Minitron pruning and distillation. See `pruning guidelines <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
- Add an example for exporting a QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md <https://github.com/NVIDIA/Model-Optimizer/blob/79ef31bc7269ba4da0cfab446da5b64509cbfcef/examples/llm_qat/README.md#qlora-deployment>`_ for more details.

0.37 (2025-10-08)
188 changes: 134 additions & 54 deletions examples/pruning/README.md

Large diffs are not rendered by default.

126 changes: 11 additions & 115 deletions modelopt/torch/nas/plugins/megatron.py
@@ -18,26 +18,18 @@
import types
from abc import ABC
from collections.abc import Callable, Sequence
from typing import Any

import torch
import torch.nn as nn
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
from megatron.core.models.gpt import GPTModel
from megatron.core.parallel_state import (
get_data_parallel_group,
get_pipeline_model_parallel_group,
get_tensor_model_parallel_group,
is_pipeline_first_stage,
is_pipeline_last_stage,
)
from megatron.core.parallel_state import is_pipeline_first_stage, is_pipeline_last_stage
from megatron.core.tensor_parallel.layers import (
ColumnParallelLinear,
RowParallelLinear,
VocabParallelEmbedding,
)
from megatron.core.transformer import MegatronModule
from megatron.core.transformer.attention import SelfAttention
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.mlp import MLP
@@ -51,29 +43,14 @@
from modelopt.torch.nas.modules import DynamicModuleList
from modelopt.torch.opt.dynamic import DynamicModule
from modelopt.torch.opt.hparam import HPType
from modelopt.torch.opt.searcher import ConstraintsDict
from modelopt.torch.trace import Symbol
from modelopt.torch.utils import distributed as dist
from modelopt.torch.utils import (
get_module_device,
make_divisible,
param_num_from_forward,
print_rank_0,
random,
)
from modelopt.torch.utils import make_divisible

from ..algorithms import (
MODULE_TYPE_TO_CONSTRAINTS_FUNC,
ConstraintEvalFunc,
ConstraintInterpolator,
ConstraintsFunc,
ConstraintsRes,
)
from ..hparams.concat import build_concat_hp
from ..modules import _DynamicLayerNorm
from ..modules.utils import get_sliced_tensor, get_sliced_tensor_by_slices
from ..registry import DMRegistry
from ..search_space import SampleFunc
from ..traced_hp import TracedHp

SUPPORTED_MODELS = {GPTModel: "megatron.core.models.gpt.GPTModel"}
@@ -634,7 +611,6 @@ def modify(

def _export_reinit_token_dispatcher(self) -> None:
"""Reinitialize the token dispatcher after pruning."""
print_rank_0("Reinitializing token dispatcher after pruning")
if hasattr(moe_utils, "get_default_model_comm_pgs"):
model_comm_pgs = moe_utils.get_default_model_comm_pgs()
else:
@@ -1045,27 +1021,30 @@ def modify(
*,
hidden_size_divisor: int = 1,
ffn_hidden_size_divisor: int = 1,
mamba_num_heads_divisor: int = 1,
mamba_head_dim_divisor: int = 1,
num_moe_experts_divisor: int = 1,
num_layers_divisor: int = 1,
):
"""Modify the dynamic choices of the module according to provided keyword arguments.

Args:
hidden_size_divisor: The divisor of the hidden_size.
ffn_hidden_size_divisor: The divisor of the mlp ffn_hidden_size.
mamba_num_heads_divisor: The divisor of the mamba num_heads.
mamba_head_dim_divisor: The divisor of the mamba head_dim.
num_moe_experts_divisor: The divisor of the number of MoE experts.
num_layers_divisor: The divisor of the number of layers.
"""
hp = self.get_hparam("hidden_size")
choices = {int(make_divisible(c, hidden_size_divisor)) for c in hp.choices} # type: ignore[arg-type]
hp.choices = list(set(hp.choices) & choices | {hp.original})
for hp_name, divisor in [
("hidden_size", hidden_size_divisor),
("num_layers", num_layers_divisor),
]:
hp = self.get_hparam(hp_name)
choices = {int(make_divisible(c, divisor)) for c in hp.choices} # type: ignore[arg-type]
hp.choices = list(set(hp.choices) & choices | {hp.original})

for layer in self.decoder.layers:
layer.modify(
ffn_hidden_size_divisor=ffn_hidden_size_divisor,
mamba_num_heads_divisor=mamba_num_heads_divisor,
mamba_head_dim_divisor=mamba_head_dim_divisor,
num_moe_experts_divisor=num_moe_experts_divisor,
)
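
For reference, a standalone sketch (not from this diff) of the divisor-based restriction applied to hparam choices in `modify` above; `_make_divisible` is a simplified stand-in for `modelopt.torch.utils.make_divisible` and is assumed here to round down:

```python
def _make_divisible(val: int, divisor: int) -> int:
    # Simplified stand-in: snap a value down to the nearest multiple of divisor.
    return (val // divisor) * divisor

choices = [4, 8, 12, 16, 20, 24, 28, 32]  # candidate hidden_size choices
original, divisor = 32, 8

# Mirror the loop above: map every choice onto a divisible value, keep only the
# original choices that survive the intersection, and always retain the original.
divisible = {_make_divisible(c, divisor) for c in choices}
restricted = sorted(set(choices) & divisible | {original})
print(restricted)  # [8, 16, 24, 32]
```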
@@ -1084,86 +1063,3 @@ def export(self) -> torch.nn.Module:
).export()
self.output_layer.export()
return super().export()


class MegatronConstraintsFunc(ConstraintsFunc):
"""A Functor class to check if sub-net satisfied all provided constraints.

We intentionally expose some attributes like `limits` s.t. we can modify it manually.
"""

_sample_points_dict: dict[tuple[str, ...], dict[str, SampleFunc]] = {
("params",): {"min": min, "centroid": random.centroid, "max": max},
}

def __init__(
self,
model: MegatronModule,
constraints: ConstraintsDict,
dummy_input: Any | tuple[Any, ...],
deployment: dict | None = None,
fast_eval: bool = True,
):
"""Initialize with additional data parallel group info from megatron."""
for key in constraints:
if key != "params":
raise ValueError("Only params constraints is supported for MegatronModule!")

self.model = model
self.dummy_input = dummy_input
self.deployment = deployment
self._fast_eval = fast_eval

# Getting data parallel group for
self.dp_group = get_data_parallel_group()

# initialize latency interpolator
keys_for_interpolation = ("params",)
if ConstraintsFunc.is_configurable(self.model, "depth"):
keys_for_interpolation += ("flops_min_depth",)
self._latency_interpolator = ConstraintInterpolator(
self.model,
points_funcs={k: self.constraint_eval_funcs[k] for k in keys_for_interpolation},
value_func=self._get_true_latency,
)
# set fast/regular mode for latency interpolator
self._latency_interpolator.collect_mode = not self.fast_eval

# set limit at the end with setter to use sanity checks on constraints
self._limits = {}
self.limits = constraints

@property
def constraint_eval_funcs(self) -> dict[str, ConstraintEvalFunc]:
"""Get constraint eval fns."""
return {
"params": self._get_params,
}

def _get_params(self, _: ConstraintsRes | None = None) -> float:
"""Get number of model parameters from forward pass."""
params = param_num_from_forward(self.model, args=self.dummy_input, unit=1.0)
reduced_params = torch.Tensor([params]).to(device=get_module_device(self.model))
torch.distributed.all_reduce(reduced_params, group=get_pipeline_model_parallel_group())
torch.distributed.all_reduce(reduced_params, group=get_tensor_model_parallel_group())
return reduced_params.item()

def _get_flops(self, _: ConstraintsRes | None = None) -> float:
"""Get inference FLOPs."""
raise NotImplementedError

def _get_flops_min_depth(self, _: ConstraintsRes | None = None) -> float:
"""Get inference FLOPs with depth set to minimum."""
raise NotImplementedError

def _get_true_latency(self, _: ConstraintsRes | None = None) -> float:
"""Get true inference latency."""
raise NotImplementedError

def _get_latency(self, precomputed: ConstraintsRes | None = None) -> float:
"""Get inference latency from interpolator."""
raise NotImplementedError


# Clear the mapping and reinsert.
MODULE_TYPE_TO_CONSTRAINTS_FUNC[MegatronModule] = MegatronConstraintsFunc
4 changes: 1 addition & 3 deletions modelopt/torch/nas/search_space.py
@@ -135,9 +135,7 @@ def sort_parameters(self, hps_to_sort: set[str] | None = None, verbose: bool = False
hps_to_sort: A set of hparam names to sort. If not provided or empty, all hparams will be sorted.
verbose: Whether to print the search space and hparam importances.
"""
print_rank_0("Sorting parameters...")
if verbose:
self.print_summary()
print_rank_0("\nSorting parameters...")

# get config and set to max
config = self.config()
3 changes: 2 additions & 1 deletion modelopt/torch/opt/searcher.py
@@ -35,7 +35,7 @@
import torch.nn as nn

from modelopt.torch.utils import distributed as dist
from modelopt.torch.utils import no_stdout, run_forward_loop
from modelopt.torch.utils import no_stdout, print_rank_0, run_forward_loop

LimitsTuple = tuple[float, float]
ConstraintsDict = dict[str, str | float | dict | None]
@@ -212,6 +212,7 @@ def construct_forward_loop(
return None

def forward_loop_with_silence_check(m: nn.Module) -> None:
print_rank_0("Running forward loop...")
with no_stdout() if silent else nullcontext():
if data_loader is not None:
run_forward_loop(
2 changes: 0 additions & 2 deletions modelopt/torch/prune/__init__.py
@@ -19,8 +19,6 @@
simplifies the overall workflow to accommodate the simpler nature of pruning algorithms.
"""

# nas is a required - so let's check if it's available
import modelopt.torch.nas
from modelopt.torch.utils import import_plugin

from . import fastnas, gradnas, plugins