From 0080fa4af48f38ef95a915057f0ab03306a55873 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Mon, 8 Sep 2025 09:18:36 -0600 Subject: [PATCH] v2: set memory.oom.group => OOMPolicy in systemd We are interested in using memory.oom.cgroup, but need it to be set systemd because of [1], so let's set it. There are a few caveats, in no particular order: A. systemd does not allow OOMPolicy to be set on units that have already started, so we must do this in Apply() instead of Set(). B. As the comment suggests, OOMPolicy has three states (continue, stop, kill), where kill maps to memory.oom.group=1, and continue maps to =0. However, the bit about `runc update` doesn't quite make sense: the values will only ever be expressed in terms of memory.oom.group, so we only need to map the continue and kill values, which have direct mappings. Note that `runc update` here doesn't make sense anyway: because of (A), we cannot update these values. Perhaps we should reject these updates since systemd will? (Or maybe we try to update and just error out, in the event that systemd eventually allows this? The kernel allows updating it, the reason the systemd semantics have diverged is unclear.) C. systemd only gained support for setting OOMPolicy on scopes in versions >= 253; versions before this will fail. So, let's add a bit allowing the setup of OOMPolicy to Apply(), and ignore it in Set() -> genV2ResourcesProperties() -> unifiedResToSystemdProps(). [1]: This arguably is more important than the debug-level warning would suggest: if someone does the equivalent of a `systemctl daemon-reload`, systemd will reset our manually-via-cgroupfs set value to 0, because we did not explicitly set it in the service / scope definition, meaning that individual tasks will not actually oom the whole cgroup when they oom. Co-authored-by: Ethan Adams Signed-off-by: Tycho Andersen --- systemd/systemd_test.go | 103 ++++++++++++++++++++++++++++++++++++++++ systemd/v2.go | 27 ++++++++--- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/systemd/systemd_test.go b/systemd/systemd_test.go index f770ab1..4d536a3 100644 --- a/systemd/systemd_test.go +++ b/systemd/systemd_test.go @@ -1,8 +1,10 @@ package systemd import ( + "context" "os" "reflect" + "strconv" "testing" systemdDbus "github.com/coreos/go-systemd/v22/dbus" @@ -157,6 +159,12 @@ func TestUnifiedResToSystemdProps(t *testing.T) { newProp("CPUWeight", uint64(1000)), }, }, + { + name: "memory.oom.group handled by Apply method", + res: map[string]string{ + "memory.oom.group": "1", + }, + }, } for _, tc := range testCases { @@ -236,3 +244,98 @@ func TestAddCPUQuota(t *testing.T) { }) } } + +func TestOOMPolicyApply(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is required") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + testCases := []struct { + name string + oomGroupValue string + expectedPolicy string + expectError bool + }{ + { + name: "memory.oom.group=0 sets OOMPolicy=continue", + oomGroupValue: "0", + expectedPolicy: "continue", + expectError: false, + }, + { + name: "memory.oom.group=1 sets OOMPolicy=kill", + oomGroupValue: "1", + expectedPolicy: "kill", + expectError: false, + }, + { + name: "invalid memory.oom.group value", + oomGroupValue: "invalid", + expectError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + config := &cgroups.Cgroup{ + Name: "test-oom-policy-" + strconv.FormatInt(int64(os.Getpid()), 10), + Resources: &cgroups.Resources{ + Unified: map[string]string{ + "memory.oom.group": tc.oomGroupValue, + }, + }, + } + + manager, err := NewUnifiedManager(config, "") + if err != nil { + t.Fatalf("Failed to create manager: %v", err) + } + defer func() { + _ = manager.Destroy() + }() + + err = manager.Apply(-1) + if tc.expectError { + if err == nil { + t.Fatal("Expected error but got none") + } + return + } + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + unitName := getUnitName(config) + conn, err := systemdDbus.NewSystemdConnectionContext(context.Background()) + if err != nil { + t.Fatalf("Failed to connect to systemd: %v", err) + } + defer conn.Close() + + properties, err := conn.GetUnitPropertiesContext(context.Background(), unitName) + if err != nil { + t.Fatalf("Failed to get unit properties: %v", err) + } + + oomPolicyValue, exists := properties["OOMPolicy"] + if !exists { + t.Fatal("OOMPolicy property not found") + } + + oomPolicyStr, ok := oomPolicyValue.(string) + if !ok { + t.Fatalf("OOMPolicy value is not a string: %T", oomPolicyValue) + } + + if oomPolicyStr != tc.expectedPolicy { + t.Errorf("Expected OOMPolicy=%s, got %s", tc.expectedPolicy, oomPolicyStr) + } + }) + } +} diff --git a/systemd/v2.go b/systemd/v2.go index c2f2e87..e480f98 100644 --- a/systemd/v2.go +++ b/systemd/v2.go @@ -180,13 +180,8 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props newProp("TasksMax", num)) case "memory.oom.group": - // Setting this to 1 is roughly equivalent to OOMPolicy=kill - // (as per systemd.service(5) and - // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), - // but it's not clear what to do if it is unset or set - // to 0 in runc update, as there are two other possible - // values for OOMPolicy (continue/stop). - fallthrough + // This was set before the unit started, so no need to + // warn about it here. default: // Ignore the unknown resource here -- will still be @@ -327,6 +322,24 @@ func (m *UnifiedManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) + if c.Resources != nil && c.Resources.Unified != nil { + if v, ok := c.Resources.Unified["memory.oom.group"]; ok { + value, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return fmt.Errorf("unified resource %q value conversion error: %w", "memory.oom.group", err) + } + + switch value { + case 0: + properties = append(properties, newProp("OOMPolicy", "continue")) + case 1: + properties = append(properties, newProp("OOMPolicy", "kill")) + default: + logrus.Debugf("don't know how to convert memory.oom.group=%d; skipping (will still be applied to cgroupfs)", value) + } + } + } + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) }