tests : add check on the max nb of TDs that can be run in parralel #340

Merged
merged 2 commits on Mar 21, 2025
32 changes: 22 additions & 10 deletions tests/lib/Qemu.py
@@ -629,33 +629,45 @@ def run_and_wait(self):
self.run()
QemuMonitor(self)

def communicate(self):
def communicate(self, timeout=60):
"""
Wait for qemu to exit
"""
self.out, self.err = self.proc.communicate(timeout=60)
self.out, self.err = self.proc.communicate(timeout=timeout)
if self.proc.returncode != 0:
print(self.err.decode())
return self.out, self.err

def stop(self):
def shutdown(self):
"""
Stop qemu process
Send shutdown command to the VM
Do not wait for the VM to exit
Return false if the VM is already terminated
"""
if self.proc is None:
return
return False
if self.proc.returncode is not None:
return
return False

try:
mon = QemuMonitor(self)
mon.powerdown()
except Exception as e:
pass

# self.proc.returncode == None -> not yet terminated
return True

def stop(self):
"""
Stop qemu process
"""
if not self.shutdown():
return

try:
# try to shutdown the VM properly, this is important to avoid
# rootfs corruption if we want to run the guest again
# catch exception and ignore it since we are stopping .... no need to fail the test
mon = QemuMonitor(self)
mon.powerdown()

self.communicate()
return
except Exception as e:
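QemuMonitor.powerdown() is not part of this diff. For context, a graceful ACPI powerdown over QEMU's QMP socket typically looks like the minimal sketch below; the socket path and function name are illustrative assumptions, not the repository's actual QemuMonitor API, and qemu must have been started with a QMP server (e.g. -qmp unix:/tmp/qmp.sock,server,nowait).

import json
import socket

def qmp_powerdown(sock_path='/tmp/qmp.sock'):  # assumed QMP socket path
    """Ask the guest to power down gracefully (ACPI powerdown) via QMP."""
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
        s.connect(sock_path)
        f = s.makefile('rw')
        f.readline()                                         # QMP greeting banner
        f.write(json.dumps({'execute': 'qmp_capabilities'}) + '\n')
        f.flush()
        f.readline()                                         # capabilities reply
        f.write(json.dumps({'execute': 'system_powerdown'}) + '\n')
        f.flush()
        f.readline()                                         # powerdown reply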
10 changes: 10 additions & 0 deletions tests/lib/util.py
@@ -45,6 +45,16 @@ def tcp_port_available():
return port

def get_max_td_vms():
"""
MKTME encryption engine is used both for legacy MKTME operation and TDX operation
The key space is partitioned in 3 ranges:
- first key
- shared keys
- TDX keys
So if we have 128 keys and we decide to split this range into 2 equal sets (in the BIOS),
the TDX key space will only have 63 keys instead of 64.
The number of TDX private keys can be read from the IA32_MKTME_KEYID_PARTITIONING MSR (0x87)
"""
cmd = ['rdmsr', '0x87']
rc = subprocess.run(cmd, capture_output=True)
assert rc.returncode == 0, "Failed getting max td vms"
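The hunk above is truncated. For reference, here is a minimal sketch of how the MSR value can be turned into a TD count, assuming the documented IA32_MKTME_KEYID_PARTITIONING layout (bits 31:0 = number of MKTME KeyIDs, bits 63:32 = number of TDX private KeyIDs); the parsing shown is illustrative and may differ from the repository's actual implementation.

import subprocess

def get_max_td_vms():
    """Return the number of TDX private KeyIDs, i.e. the max number of TDs."""
    # rdmsr prints the 64-bit MSR value in hex; needs root and the msr kernel module
    rc = subprocess.run(['rdmsr', '0x87'], capture_output=True)
    assert rc.returncode == 0, "Failed getting max td vms"
    msr = int(rc.stdout.strip(), 16)
    # bits 31:0  -> number of legacy MKTME (shared) KeyIDs
    # bits 63:32 -> number of TDX private KeyIDs
    return msr >> 32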
48 changes: 47 additions & 1 deletion tests/tests/stress/test_stress_resources.py
@@ -17,6 +17,7 @@
import subprocess
import time
import multiprocessing
import pytest

import Qemu
import util
@@ -70,15 +71,47 @@ def test_stress_max_vcpus(qm):

qm.stop()

def check_qemu_fail_to_start(qm, error_msg=None):
try:
_, err = qm.communicate(timeout=5)
except:
# if timeout, that means the QEMU is running fine
# try to connect with ssh to make sure the TD is running fine
try:
ssh = Qemu.QemuSSH(qm)
except:
# the qemu is running but we cannot connect to SSH
# we consider that the check is OK
qm.stop()
return
pytest.fail('The TD is running !')
if error_msg:
assert error_msg in err.decode()

def test_stress_max_guests():
"""
Test max guests (No Intel Case ID)

There is a limit on the number of TDs that can be run in parallel.
This limit can be due to several factors, but the most prevalent factor
is the number of keys the CPU can allocate to TDs.
In fact, TDX takes advantage of an existing CPU feature called MK-TME
(Multi-Key Total Memory Encryption) to encrypt the VM memory. It enables
the CPU to encrypt each TD's memory with a unique Advanced Encryption Standard (AES) key.
MK-TME offers a number of keys and this key space is partitioned into 2 sets:
Shared (VMM) and Private (TDX). The number of keys in the Private space defines the
maximum number of TDs we can run in parallel.

This test verifies that we can run TDs up to this limit and that any new TD creation
is refused by qemu gracefully.
"""

# get max number of TD VMs we can create (max - current)
max_td_vms = util.get_max_td_vms() - util.get_current_td_vms()
assert max_td_vms > 0, "No available space for TD VMs"

print(f'The limit number of TDs is : {max_td_vms}')

qm = [None] * max_td_vms

# initialize machines
@@ -95,8 +128,21 @@ def test_stress_max_guests():
print("Waiting for machine %d" % (i))
ssh = Qemu.QemuSSH(qm[i])

# try to run a new TD
# expect qemu quit immediately with a specific error message
with Qemu.QemuMachine() as one_more:
one_more.run()
check_qemu_fail_to_start(one_more, error_msg="KVM_TDX_INIT_VM failed: No space left on device")

# stop all machines
for i in range(max_td_vms):
print("Stopping machine %d" % (i))
qm[i].stop()
qm[i].shutdown()

# wait for all machines to exit
for i in range(max_td_vms):
print("Stopping machine %d" % (i))
try:
qm[i].communicate()
except:
pass
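util.get_current_td_vms() is used above but is not part of this diff. One plausible way to count TDs that are already running is to count qemu processes launched with the TDX guest object; this sketch is purely illustrative and the helper's real implementation may differ.

import subprocess

def get_current_td_vms():
    """Rough count of already-running TD guests (illustrative only)."""
    # pgrep -f matches the full command line, -c prints the match count;
    # it exits with 1 when nothing matches, which we treat as zero.
    rc = subprocess.run(['pgrep', '-f', '-c', 'qemu.*tdx-guest'],
                        capture_output=True)
    return int(rc.stdout.strip()) if rc.returncode == 0 else 0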