
Commit d346d9c

Merge branch 'NVIDIA:main' into fused_out_correction

2 parents: 89bbeb7 + 838345e
File tree

30 files changed: +950 -385 lines

.github/workflows/trigger-ci.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -42,6 +42,7 @@ jobs:
           || github.actor == 'kocchop'
           || github.actor == 'youngeunkwon0405'
           || github.actor == 'KshitijLakhani'
+          || github.actor == 'jberchtold-nvidia'
         )
     steps:
       - name: Check if comment is issued by authorized person
```

3rdparty/cudnn-frontend

Submodule cudnn-frontend updated 43 files

examples/pytorch/comm_gemm_overlap/README.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -16,7 +16,7 @@
 Forward and backward passes with layer weights distributed over all GPUs in a single node.
 
 ```bash
-$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) ln_mlp_with_overlap.py
+$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_overlap.py
 
 # Sample output on 8x H100s:
 # [rank0:node0] |-- Created tensor-parallel group: [0, 1, 2, 3, 4, 5, 6, 7]
@@ -70,7 +70,7 @@ Uses `torch.nn.parallel.DistributedDataParallel` for replicatin the model across
 groups in a single node.
 
 ```bash
-$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) ln_mlp_overlap.py --num-replicas 2
+$ torchrun --nnodes=1 --nproc-per-node=$(nvidia-smi -L | wc -l) te_layer_with_overlap.py --num-replicas 2
 
 # Sample output on 8x H100s:
 # [rank0:node0] |-- Created tensor-parallel group: [0, 1, 2, 3]
````
Lines changed: 15 additions & 0 deletions

```diff
@@ -0,0 +1,15 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+
+set -xe
+
+: ${TE_PATH:=/opt/transformerengine}
+
+pip install -r $TE_PATH/examples/jax/encoder/requirements.txt
+
+# Make encoder tests to have run-to-run deterministic to have the stable CI results
+export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
+pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multigpu_encoder.py
+pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_model_parallel_encoder.py
+pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py
```
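The new script exports `--xla_gpu_deterministic_ops` so the multi-GPU encoder tests give stable, run-to-run reproducible CI results. The same effect can be had from inside a Python session instead of a shell script; a minimal sketch, using only the flag name shown in the script above (everything else is illustrative):

```python
# Minimal sketch: set the same determinism flag the QA script exports, from
# Python, before JAX is first imported (XLA reads XLA_FLAGS at startup).
import os

os.environ["XLA_FLAGS"] = (
    os.environ.get("XLA_FLAGS", "") + " --xla_gpu_deterministic_ops"
)

import jax  # noqa: E402 -- must come after XLA_FLAGS is set to take effect

print(jax.devices())
```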

qa/L0_jax_unittest/test.sh

Lines changed: 1 addition & 2 deletions

```diff
@@ -20,5 +20,4 @@ pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/mnist
 
 # Make encoder tests to have run-to-run deterministic to have the stable CI results
 export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
-pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder --ignore=$TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py
-pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_multiprocessing_encoder.py
+pytest -c $TE_PATH/tests/jax/pytest.ini -v $TE_PATH/examples/jax/encoder/test_single_gpu_encoder.py
```

qa/L0_pytorch_unittest/test.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,6 @@ pytest -v -s $TE_PATH/tests/pytorch/test_deferred_init.py
 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py
 PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_cuda_graphs.py
 pytest -v -s $TE_PATH/tests/pytorch/test_jit.py
-NVTE_TORCH_COMPILE=0 NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 pytest -o log_cli=true --log-cli-level=INFO -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
 pytest -v -s $TE_PATH/tests/pytorch/test_fused_rope.py
 pytest -v -s $TE_PATH/tests/pytorch/test_float8tensor.py
 pytest -v -s $TE_PATH/tests/pytorch/test_torch_save_load.py
@@ -22,3 +21,4 @@ pytest -v -s $TE_PATH/tests/pytorch/test_fused_optimizer.py
 pytest -v -s $TE_PATH/tests/pytorch/test_multi_tensor.py
 pytest -v -s $TE_PATH/tests/pytorch/test_fusible_ops.py
 pytest -v -s $TE_PATH/tests/pytorch/test_permutation.py
+NVTE_TORCH_COMPILE=0 NVTE_DEBUG=1 NVTE_DEBUG_LEVEL=1 pytest -o log_cli=true --log-cli-level=INFO -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn.py
```

qa/L1_pytorch_distributed_unittest/test.sh

Lines changed: 1 addition & 0 deletions

```diff
@@ -11,4 +11,5 @@ pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py
 pytest -v -s $TE_PATH/tests/pytorch/distributed/test_comm_gemm_overlap.py
 pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops.py
 pytest -v -s $TE_PATH/tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py
+pytest -v -s $TE_PATH/tests/pytorch/distributed/test_torch_fsdp2.py
 pytest -v -s $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
```

tests/jax/conftest.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -20,7 +20,7 @@ def clear_live_arrays():
 
 
 @pytest.fixture(autouse=True, scope="module")
-def enable_fused_attn():
+def enable_fused_attn_after_hopper():
     """
     Enable fused attn for hopper+ arch.
     Fused attn kernels on pre-hopper arch are not deterministic.
```
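The hunk only renames the fixture and shows its docstring; the body is not part of the diff. As a rough sketch of what a module-scoped autouse fixture with this contract could look like (the `NVTE_FUSED_ATTN` environment toggle and the architecture check are assumptions, not taken from the diff):

```python
import os

import pytest


def _is_hopper_or_newer() -> bool:
    """Hypothetical stand-in for the fixture's real architecture check."""
    return True  # assume Hopper+ so the sketch enables fused attention


@pytest.fixture(autouse=True, scope="module")
def enable_fused_attn_after_hopper():
    """
    Enable fused attn for hopper+ arch.
    Fused attn kernels on pre-hopper arch are not deterministic.
    """
    if _is_hopper_or_newer():
        os.environ["NVTE_FUSED_ATTN"] = "1"  # toggle assumed for illustration
    yield
    os.environ.pop("NVTE_FUSED_ATTN", None)  # clean up after the module's tests
```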

tests/jax/test_distributed_fused_attn.py

Lines changed: 2 additions & 4 deletions

```diff
@@ -20,7 +20,6 @@
 from utils import (
     make_causal_mask,
     make_self_mask,
-    assert_tree_like_allclose,
     assert_allclose,
     print_debug_tensor_stats,
 )
@@ -32,7 +31,6 @@
     AttnMaskType,
     QKVLayout,
     QKVFormat,
-    get_qkv_format,
     reorder_causal_load_balancing,
     inverse_reorder_causal_load_balancing,
     CPStrategy,
@@ -421,7 +419,7 @@ def impl_test_contex_parallel_attn(
         dropout_prob = 0.0
         is_training = True
         dp_size, cp_size, tp_size = mesh_shape
-        qkv_format = get_qkv_format(qkv_layout)
+        qkv_format = qkv_layout.get_qkv_format()
 
         batch, seqlen, num_head, hidden = data_shape
 
@@ -503,7 +501,7 @@ def grad_func(func, *args, **kwargs):
         # Gradient is small, use a gradient multiplier to amplify the gradient
         _, max_seq_len, num_heads, _ = data_shape
         gradient_multiplier = max_seq_len * num_heads
-        if attn_mask_type in [AttnMaskType.CAUSAL_MASK, AttnMaskType.CAUSAL_BOTTOM_RIGHT_MASK]:
+        if attn_mask_type.is_causal():
             gradient_multiplier /= 10
         ret_valid = func(*args, **kwargs)
         return (jnp.mean(ret_valid, dtype=jnp.float32) * gradient_multiplier).astype(dtype)
```
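Two of the hunks replace call sites rather than behavior: the free function `get_qkv_format(qkv_layout)` becomes the method `qkv_layout.get_qkv_format()`, and a membership test against a list of mask types becomes `attn_mask_type.is_causal()`. A minimal, self-contained sketch of that refactor pattern; the member lists are trimmed and the bodies are illustrative, not Transformer Engine's actual definitions:

```python
# Sketch of moving helpers onto the enums themselves (illustrative only).
from enum import Enum


class QKVFormat(Enum):
    BSHD = "bshd"
    THD = "thd"


class QKVLayout(Enum):
    BS3HD = "bs3hd"
    T3HD = "t3hd"

    def get_qkv_format(self) -> QKVFormat:
        # Derive the format from the layout name instead of an external lookup.
        return QKVFormat.THD if self.name.startswith("T") else QKVFormat.BSHD


class AttnMaskType(Enum):
    NO_MASK = "no_mask"
    CAUSAL_MASK = "causal"
    CAUSAL_BOTTOM_RIGHT_MASK = "causal_bottom_right"

    def is_causal(self) -> bool:
        # Replaces `mask in [CAUSAL_MASK, CAUSAL_BOTTOM_RIGHT_MASK]` at call sites.
        return self in (
            AttnMaskType.CAUSAL_MASK,
            AttnMaskType.CAUSAL_BOTTOM_RIGHT_MASK,
        )


assert QKVLayout.T3HD.get_qkv_format() is QKVFormat.THD
assert AttnMaskType.CAUSAL_BOTTOM_RIGHT_MASK.is_causal()
```

Attaching the predicate to the enum keeps call sites short, and any future causal-mask variant only needs to be registered in one place.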
