Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions cvs/lib/rccl_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,10 +548,11 @@ def rccl_regression(
for node in vpc_node_list:
host_file_params = f'{host_file_params}{node} slots={proc_per_node}\n'

cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt'
hosts_file_path = f'/tmp/rccl_hosts_file_{os.environ.get("USER", "cvs")}.txt'
cmd = f'rm -f {hosts_file_path}'
shdl.exec(cmd)

cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt'
cmd = f'echo "{host_file_params}" > {hosts_file_path}'
shdl.exec(cmd)

# Determine PML (Point-to-Point Messaging Layer) based on user config or auto-detection
Expand Down Expand Up @@ -596,7 +597,7 @@ def rccl_regression(
cmd = f'''{mpi_dir}/bin/mpirun \
--allow-run-as-root \
-np {no_of_global_ranks} \
--hostfile /tmp/rccl_hosts_file.txt \
--hostfile {hosts_file_path} \
--bind-to numa \
{ucx_params} \
--mca btl ^vader,openib \
Expand Down Expand Up @@ -722,10 +723,11 @@ def rccl_perf(
for node in vpc_node_list:
host_file_params = f'{host_file_params}' + f'{node} slots={proc_per_node}\n'

cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt'
hosts_file_path = f'/tmp/rccl_hosts_file_{os.environ.get("USER", "cvs")}.txt'
cmd = f'rm -f {hosts_file_path}'
shdl.exec(cmd)

cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt'
cmd = f'echo "{host_file_params}" > {hosts_file_path}'
shdl.exec(cmd)

# Determine PML (Point-to-Point Messaging Layer) based on user config or auto-detection
Expand Down Expand Up @@ -774,7 +776,7 @@ def rccl_perf(
# Build mpirun command
cmd = f'''{mpi_dir}/bin/mpirun --np {no_of_global_ranks} \
--allow-run-as-root \
--hostfile /tmp/rccl_hosts_file.txt \
--hostfile {hosts_file_path} \
--bind-to numa \
{ucx_params} \
--mca btl ^vader,openib \
Expand Down
6 changes: 5 additions & 1 deletion cvs/lib/verify_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
'crash': 'crashed|Traceback|cut here|Bug:|Call Trace|RIP:|end trace|amdgpu: Fatal error|segfault|show_stack|dump_stack|fault ',
'test_fail': 'Test failure',
'fault': 'no-retry page fault|Illegal register access|PROTECTION_FAULT_STATUS',
'driver': 'Queue preemption failed for queue|Failed to evict process queues|Runlist is getting oversubscribed|No more SDMA queue to allocate|Expect reduced ROCm performance|amdgpu: process pid',
# Note: 'Runlist is getting oversubscribed' and 'Expect reduced ROCm performance'
# are amdgpu kernel info-level messages (not errors). They fire routinely on
# large multi-rank RCCL runs whenever HSA queue count exceeds the runlist
# size, even when the run itself is healthy. Excluded from failure matching.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add links to the reference documentation in the comment for future reference.

'driver': 'Queue preemption failed for queue|Failed to evict process queues|No more SDMA queue to allocate|amdgpu: process pid',
'hardware': 'hardware error|hardware fail|ras error|uncorrectable|correctable err',
'network': 'NIC Link is Down|link is down|ib_uverb|CQE|queue catastrophic|CQ error',
}
Expand Down
6 changes: 5 additions & 1 deletion cvs/tests/rccl/rccl_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,11 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective):
phdl.exec(f'sudo echo "End of Test {rccl_collective}" | sudo tee /dev/kmsg')

end_time = phdl.exec('date +"%a %b %e %H:%M"')
verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True)
# Bound the dmesg scan to this test's own start_time..end_time window.
# till_end_flag=True scans from start_time to the end of the dmesg
# buffer, which causes earlier-test kernel events (e.g. a scatter_perf
# segfault) to repeatedly fail every subsequent parametrized test.
verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=False)

# Get new cluster snapshot and compare ..
if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):
Expand Down