From a46d6b46870e0e2536fcbbefd4e9e12330682f88 Mon Sep 17 00:00:00 2001 From: Joseph <162703152+josephnef@users.noreply.github.com> Date: Mon, 1 Jun 2026 09:45:44 +0300 Subject: [PATCH] tests/regress.py: prevent WiFiDriverDemo process leaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Long-running Popens spawned by run_cell can outlive the script if either (a) an outer `timeout` / SIGKILL hits Python before run_cell's finally block executes, or (b) an unhandled exception aborts the run. The orphaned child reparents to init (PPID=1) and keeps the USB device claimed, causing kernel-side `usbfs: process N (WiFiDriverDemo) did not claim interface 0 before use` spam and preventing `aircrack-ng/88XXau` from binding in the VM on subsequent cells. A leaked process from yesterday's debugging session ran for 2:58 hours against the 8812AU and wedged the chip for the entire investigation — every "can't set config #1, error -32" symptom was downstream of that. Defense in depth (two independent layers): 1. atexit + SIGTERM/SIGINT/SIGHUP handler walks a module-level _ACTIVE_LOCAL_PROCS set and SIGKILLs every Popen that's still running. Re-raises the signal at default disposition so the exit code reflects how the script died. 2. preexec_fn for each spawned Popen calls prctl(PR_SET_PDEATHSIG, SIGKILL) + os.setsid(). The PDEATHSIG asks the kernel to kill the child the moment its parent dies — catches the pathological SIGKILL-from-outer-timeout case where Python has zero opportunity to run cleanup code. Replaces the previous `start_new_session=True` (which only handled the setsid half). Wired into _spawn_devourer_rx, _spawn_devourer_tx, _spawn_sniffer (all local Popens). _spawn_kernel_rx / _spawn_kernel_tx remain untouched — those run remotely via ssh and die naturally when the ssh tunnel breaks. _install_cleanup_handlers() is called once at the top of main(). 
Verified: - `kill -KILL $(python_pid)` mid-spawn: PR_SET_PDEATHSIG kills the WiFiDriverDemo child within ~ms (verified by `ps --ppid`). - `kill -TERM $(python_pid)` mid-spawn: signal handler runs _kill_all_local_procs, child gone before harness exit. - Full 4-cell matrix at ch100: zero leaks after the run (`ps -ef | grep WiFiDriver | wc -l` == 0). Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/regress.py | 113 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 9 deletions(-) diff --git a/tests/regress.py b/tests/regress.py index ea9969e..11840a1 100755 --- a/tests/regress.py +++ b/tests/regress.py @@ -48,6 +48,8 @@ from __future__ import annotations import argparse +import atexit +import ctypes import dataclasses import glob import os @@ -61,6 +63,96 @@ from pathlib import Path from typing import Optional + +# --------------------------------------------------------------------------- +# Process leak prevention. +# +# Long-running Popens (WiFiDriverDemo, WiFiDriverTxDemo, kernel-side tcpdump) +# can outlive this script if: +# 1. an outer `timeout` / SIGKILL hits Python before the run_cell finally +# block executes (the leaked child reparents to init with PPID=1); +# 2. an unhandled exception propagates out of run_cell before _terminate +# runs on the in-flight Popen. +# +# A leaked WiFiDriverDemo keeps the USB device claimed, which manifests +# in the kernel as `usbfs: process N (WiFiDriverDemo) did not claim +# interface 0 before use` spam and prevents `aircrack-ng/88XXau` from +# binding in the VM — exactly what bit us during the 2026-05-30 / -31 +# 8821AU 5GHz UNII-2 investigation. +# +# Mitigation layers (defense in depth): +# A) Track every local Popen in `_ACTIVE_LOCAL_PROCS`; SIGTERM/SIGINT/ +# atexit walks the set and SIGKILLs survivors. 
+#   B) Child preexec uses prctl(PR_SET_PDEATHSIG, SIGKILL) so the kernel
+#      kills the child the moment its parent (this process) dies, even
+#      if Python had no chance to run cleanup (the SIGKILL-from-outer-
+#      timeout case).
+# ---------------------------------------------------------------------------
+
+
+_ACTIVE_LOCAL_PROCS: set[subprocess.Popen] = set()
+
+
+PR_SET_PDEATHSIG = 1
+
+# libc handle used by _child_preexec. Resolved once here, in the parent, so
+# the forked child does not have to dlopen() between fork and exec. None
+# when libc.so.6 cannot be loaded; layer (B) is then skipped and only
+# layer (A) applies.
+try:
+    _LIBC: Optional[ctypes.CDLL] = ctypes.CDLL("libc.so.6", use_errno=True)
+except OSError:
+    _LIBC = None
+
+
+def _child_preexec() -> None:
+    """Run in the child after fork, before exec: arm a parent-death SIGKILL.
+
+    Asks the kernel to send SIGKILL to this child the moment its parent
+    (the regress.py process) dies. Belt-and-braces against orphaned
+    WiFiDriverDemo processes when Python is killed by an outer `timeout` /
+    SIGKILL that can't be caught. Then moves the child into its own session.
+
+    Both steps are best-effort: failures are swallowed so the spawn still
+    proceeds, leaving layer (A) as the only protection.
+
+    Caveats, per prctl(2):
+      * the setting is cleared when exec'ing a set-user-ID / set-group-ID
+        binary or one with file capabilities, so such a child is not
+        covered (NOTE(review): check how tcpdump is installed);
+      * "parent" is the thread that forked, not the whole process;
+      * a parent that dies between fork() and the prctl() call below is
+        not noticed.
+    """
+    if _LIBC is not None:
+        try:
+            _LIBC.prctl(PR_SET_PDEATHSIG, int(signal.SIGKILL), 0, 0, 0)
+        except Exception:
+            # Best-effort — if prctl unavailable, fall back to layer (A).
+            pass
+    # Also put the child in its own session so killpg targets work cleanly.
+    try:
+        os.setsid()
+    except OSError:
+        pass
+
+
+def _register_local_proc(proc: subprocess.Popen) -> subprocess.Popen:
+    """Track `proc` for the last-resort sweep and return it unchanged."""
+    _ACTIVE_LOCAL_PROCS.add(proc)
+    return proc
+
+
+def _unregister_local_proc(proc: subprocess.Popen) -> None:
+    """Stop tracking `proc`; a no-op if it was never registered."""
+    _ACTIVE_LOCAL_PROCS.discard(proc)
+
+
+def _kill_all_local_procs() -> None:
+    """Last-resort sweep of every tracked Popen. Best effort — don't raise.
+
+    Each tracked child that is still running gets SIGKILL delivered to its
+    process group. Popen.kill() on the child alone is used instead when the
+    group cannot be resolved or signalled, or when the child is still in
+    this process's own group (setsid() failed in _child_preexec): killpg
+    would then SIGKILL the harness and everything else in its group.
+    Every tracked Popen is untracked; the children are not waited for.
+    """
+    own_pgid = os.getpgrp()
+    for proc in list(_ACTIVE_LOCAL_PROCS):
+        _ACTIVE_LOCAL_PROCS.discard(proc)
+        if proc.poll() is not None:
+            continue
+        try:
+            pgid = os.getpgid(proc.pid)
+        except OSError:
+            pgid = None
+        group_killed = False
+        if pgid is not None and pgid != own_pgid:
+            try:
+                os.killpg(pgid, signal.SIGKILL)
+                group_killed = True
+            except OSError:
+                pass
+        if not group_killed:
+            try:
+                proc.kill()
+            except OSError:
+                pass
+
+
+def _install_cleanup_handlers() -> None:
+    atexit.register(_kill_all_local_procs)
+
+    def _handler(signum, _frame):
+        _kill_all_local_procs()
+        # Re-raise the signal at default disposition so the exit code
+        # reflects how we died.
+ signal.signal(signum, signal.SIG_DFL) + os.kill(os.getpid(), signum) + + signal.signal(signal.SIGTERM, _handler) + signal.signal(signal.SIGINT, _handler) + signal.signal(signal.SIGHUP, _handler) + # Source MAC of the canonical beacon — must match txdemo/main.cpp and the # `` matcher in demo/main.cpp. tcpdump filter and scapy # injector both use this. @@ -533,11 +625,11 @@ def _spawn_devourer_rx( devourer_root: Path, dut: Dut, channel: int, log_path: Path ) -> subprocess.Popen: fh = open(log_path, "w") - return subprocess.Popen( + return _register_local_proc(subprocess.Popen( [str(devourer_root / "build" / "WiFiDriverDemo")], env=_devourer_env(dut, channel), - stdout=fh, stderr=subprocess.STDOUT, start_new_session=True, - ) + stdout=fh, stderr=subprocess.STDOUT, preexec_fn=_child_preexec, + )) def _spawn_devourer_tx( @@ -545,11 +637,11 @@ def _spawn_devourer_tx( encoding: Optional[dict] = None, ) -> subprocess.Popen: fh = open(log_path, "w") - return subprocess.Popen( + return _register_local_proc(subprocess.Popen( [str(devourer_root / "build" / "WiFiDriverTxDemo")], env=_devourer_env(dut, channel, tx_encoding=encoding), - stdout=fh, stderr=subprocess.STDOUT, start_new_session=True, - ) + stdout=fh, stderr=subprocess.STDOUT, preexec_fn=_child_preexec, + )) def _spawn_kernel_rx( @@ -632,11 +724,11 @@ def _spawn_sniffer( sa_str = ":".join( f"{b:02x}" for b in bytes.fromhex(CANONICAL_SA.replace(":", "")) ) - return subprocess.Popen( + return _register_local_proc(subprocess.Popen( ["tcpdump", "-i", iface, "-w", str(pcap_path), "-U", "-nn", f"ether src {sa_str}"], - stdout=fh, stderr=subprocess.STDOUT, start_new_session=True, - ) + stdout=fh, stderr=subprocess.STDOUT, preexec_fn=_child_preexec, + )) def _summarise_sniffer_pcap(pcap_path: Path) -> str: @@ -683,6 +775,7 @@ def _summarise_sniffer_pcap(pcap_path: Path) -> str: def _terminate(proc: subprocess.Popen, grace: float = 2.0) -> None: if proc.poll() is not None: + _unregister_local_proc(proc) return try: 
os.killpg(os.getpgid(proc.pid), signal.SIGINT) @@ -693,6 +786,7 @@ def _terminate(proc: subprocess.Popen, grace: float = 2.0) -> None: except ProcessLookupError: pass proc.wait() + _unregister_local_proc(proc) # --------------------------------------------------------------------------- @@ -1243,6 +1337,7 @@ def emit_markdown( def main(): + _install_cleanup_handlers() ap = argparse.ArgumentParser( description="Cross-driver regression matrix for devourer.", )