ROCm · speriaswamy-amd · May 18, 2026 · May 14, 2026 · cijohnson · May 18, 2026
diff --git a/cvs/lib/rccl_lib.py b/cvs/lib/rccl_lib.py
@@ -548,10 +548,11 @@ def rccl_regression(
     for node in vpc_node_list:
         host_file_params = f'{host_file_params}{node} slots={proc_per_node}\n'
 
-    cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt'
+    hosts_file_path = f'/tmp/rccl_hosts_file_{os.environ.get("USER", "cvs")}.txt'
+    cmd = f'rm -f {hosts_file_path}'
     shdl.exec(cmd)
 
-    cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt'
+    cmd = f'echo "{host_file_params}" > {hosts_file_path}'
     shdl.exec(cmd)
 
     # Determine PML (Point-to-Point Messaging Layer) based on user config or auto-detection
@@ -596,7 +597,7 @@ def rccl_regression(
     cmd = f'''{mpi_dir}/bin/mpirun \
         --allow-run-as-root \
         -np {no_of_global_ranks} \
-        --hostfile /tmp/rccl_hosts_file.txt \
+        --hostfile {hosts_file_path} \
         --bind-to numa \
         {ucx_params} \
         --mca btl ^vader,openib \
@@ -722,10 +723,11 @@ def rccl_perf(
     for node in vpc_node_list:
         host_file_params = f'{host_file_params}' + f'{node} slots={proc_per_node}\n'
 
-    cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt'
+    hosts_file_path = f'/tmp/rccl_hosts_file_{os.environ.get("USER", "cvs")}.txt'
+    cmd = f'rm -f {hosts_file_path}'
     shdl.exec(cmd)
 
-    cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt'
+    cmd = f'echo "{host_file_params}" > {hosts_file_path}'
     shdl.exec(cmd)
 
     # Determine PML (Point-to-Point Messaging Layer) based on user config or auto-detection
@@ -774,7 +776,7 @@ def rccl_perf(
         # Build mpirun command
         cmd = f'''{mpi_dir}/bin/mpirun --np {no_of_global_ranks} \
         --allow-run-as-root \
-        --hostfile /tmp/rccl_hosts_file.txt \
+        --hostfile {hosts_file_path} \
         --bind-to numa \
         {ucx_params} \
         --mca btl ^vader,openib \

diff --git a/cvs/lib/verify_lib.py b/cvs/lib/verify_lib.py
@@ -17,7 +17,11 @@
     'crash': 'crashed|Traceback|cut here|Bug:|Call Trace|RIP:|end trace|amdgpu: Fatal error|segfault|show_stack|dump_stack|fault ',
     'test_fail': 'Test failure',
     'fault': 'no-retry page fault|Illegal register access|PROTECTION_FAULT_STATUS',
-    'driver': 'Queue preemption failed for queue|Failed to evict process queues|Runlist is getting oversubscribed|No more SDMA queue to allocate|Expect reduced ROCm performance|amdgpu: process pid',
+    # Note: 'Runlist is getting oversubscribed' and 'Expect reduced ROCm performance'
+    # are amdgpu kernel info-level messages, not errors. They fire routinely on
+    # large multi-rank RCCL runs whenever the HSA queue count exceeds the runlist
+    # size, even when the run itself is healthy, so they are excluded from failure matching.
+    'driver': 'Queue preemption failed for queue|Failed to evict process queues|No more SDMA queue to allocate|amdgpu: process pid',
     'hardware': 'hardware error|hardware fail|ras error|uncorrectable|correctable err',
     'network': 'NIC Link is Down|link is down|ib_uverb|CQE|queue catastrophic|CQ error',
 }

diff --git a/cvs/tests/rccl/rccl_perf.py b/cvs/tests/rccl/rccl_perf.py
@@ -328,7 +328,11 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective):
     phdl.exec(f'sudo echo "End of Test {rccl_collective}" | sudo tee /dev/kmsg')
 
     end_time = phdl.exec('date +"%a %b %e %H:%M"')
-    verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True)
+    # Bound dmesg scan to this test's own start..end window (per-test).
+    # till_end_flag=True scans from start_time to the end of the dmesg
+    # buffer, which causes earlier-test kernel events (e.g. a scatter_perf
+    # segfault) to repeatedly fail every subsequent parametrized test.
+    verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=False)
 
     # Get new cluster snapshot and compare ..
     if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):