diff --git a/cvs/lib/rccl_lib.py b/cvs/lib/rccl_lib.py index 0d57618b..21b3b8b1 100644 --- a/cvs/lib/rccl_lib.py +++ b/cvs/lib/rccl_lib.py @@ -548,10 +548,11 @@ def rccl_regression( for node in vpc_node_list: host_file_params = f'{host_file_params}{node} slots={proc_per_node}\n' - cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt' + hosts_file_path = f'/tmp/rccl_hosts_file_{os.environ.get("USER", "cvs")}.txt' + cmd = f'rm -f {hosts_file_path}' shdl.exec(cmd) - cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt' + cmd = f'echo "{host_file_params}" > {hosts_file_path}' shdl.exec(cmd) # Determine PML (Point-to-Point Messaging Layer) based on user config or auto-detection @@ -596,7 +597,7 @@ def rccl_regression( cmd = f'''{mpi_dir}/bin/mpirun \ --allow-run-as-root \ -np {no_of_global_ranks} \ - --hostfile /tmp/rccl_hosts_file.txt \ + --hostfile {hosts_file_path} \ --bind-to numa \ {ucx_params} \ --mca btl ^vader,openib \ @@ -722,10 +723,11 @@ def rccl_perf( for node in vpc_node_list: host_file_params = f'{host_file_params}' + f'{node} slots={proc_per_node}\n' - cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt' + hosts_file_path = f'/tmp/rccl_hosts_file_{os.environ.get("USER", "cvs")}.txt' + cmd = f'rm -f {hosts_file_path}' shdl.exec(cmd) - cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt' + cmd = f'echo "{host_file_params}" > {hosts_file_path}' shdl.exec(cmd) # Determine PML (Point-to-Point Messaging Layer) based on user config or auto-detection @@ -774,7 +776,7 @@ def rccl_perf( # Build mpirun command cmd = f'''{mpi_dir}/bin/mpirun --np {no_of_global_ranks} \ --allow-run-as-root \ - --hostfile /tmp/rccl_hosts_file.txt \ + --hostfile {hosts_file_path} \ --bind-to numa \ {ucx_params} \ --mca btl ^vader,openib \ diff --git a/cvs/lib/verify_lib.py b/cvs/lib/verify_lib.py index 95074ac1..7614b6ac 100644 --- a/cvs/lib/verify_lib.py +++ b/cvs/lib/verify_lib.py @@ -17,7 +17,11 @@ 'crash': 'crashed|Traceback|cut here|Bug:|Call Trace|RIP:|end trace|amdgpu: Fatal error|segfault|show_stack|dump_stack|fault ', 'test_fail': 'Test failure', 'fault': 'no-retry page fault|Illegal register access|PROTECTION_FAULT_STATUS', - 'driver': 'Queue preemption failed for queue|Failed to evict process queues|Runlist is getting oversubscribed|No more SDMA queue to allocate|Expect reduced ROCm performance|amdgpu: process pid', + # Note: 'Runlist is getting oversubscribed' and 'Expect reduced ROCm performance' + # are amdgpu kernel info-level messages (not errors). They fire routinely on + # large multi-rank RCCL runs whenever HSA queue count exceeds the runlist + # size, even when the run itself is healthy. Excluded from failure matching. + 'driver': 'Queue preemption failed for queue|Failed to evict process queues|No more SDMA queue to allocate|amdgpu: process pid', 'hardware': 'hardware error|hardware fail|ras error|uncorrectable|correctable err', 'network': 'NIC Link is Down|link is down|ib_uverb|CQE|queue catastrophic|CQ error', } diff --git a/cvs/tests/rccl/rccl_perf.py b/cvs/tests/rccl/rccl_perf.py index 122a294c..b26ebcd1 100644 --- a/cvs/tests/rccl/rccl_perf.py +++ b/cvs/tests/rccl/rccl_perf.py @@ -328,7 +328,11 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective): phdl.exec(f'sudo echo "End of Test {rccl_collective}" | sudo tee /dev/kmsg') end_time = phdl.exec('date +"%a %b %e %H:%M"') - verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True) + # Bound dmesg scan to this test's own start..end window (per-test). + # till_end_flag=True scans from start_time to the end of the dmesg + # buffer, which causes earlier-test kernel events (e.g. a scatter_perf + # segfault) to repeatedly fail every subsequent parametrized test. + verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=False) # Get new cluster snapshot and compare .. if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):