From c851475eef64fae899bc8b3c8097a05859af819f Mon Sep 17 00:00:00 2001 From: Felix Moessbauer Date: Wed, 27 Apr 2022 16:31:01 +0200 Subject: [PATCH 1/3] add parser to discover available GPU partitions This patch adds a parser to parse the output of nvidia-smi mig -lgip to get the possible GPU partitions. Signed-off-by: Felix Moessbauer --- partition_gpu/Dockerfile | 2 +- partition_gpu/nvidia_smi_parser.go | 69 ++++++++++++++++++++++ partition_gpu/nvidia_smi_parser_test.go | 77 +++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 partition_gpu/nvidia_smi_parser.go create mode 100644 partition_gpu/nvidia_smi_parser_test.go diff --git a/partition_gpu/Dockerfile b/partition_gpu/Dockerfile index f390051a8..70c52a57c 100644 --- a/partition_gpu/Dockerfile +++ b/partition_gpu/Dockerfile @@ -15,7 +15,7 @@ FROM golang:1.15 as builder WORKDIR /go/src/github.com/GoogleCloudPlatform/container-engine-accelerators COPY . . -RUN go build -o gpu_partitioner partition_gpu/partition_gpu.go +RUN go build -o gpu_partitioner partition_gpu/partition_gpu.go partition_gpu/nvidia_smi_parser.go RUN chmod a+x /go/src/github.com/GoogleCloudPlatform/container-engine-accelerators/gpu_partitioner FROM gcr.io/distroless/base-debian10 diff --git a/partition_gpu/nvidia_smi_parser.go b/partition_gpu/nvidia_smi_parser.go new file mode 100644 index 000000000..3c658e711 --- /dev/null +++ b/partition_gpu/nvidia_smi_parser.go @@ -0,0 +1,69 @@ +// Copyright 2022 Siemens AG. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+import (
+	"errors"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/golang/glog"
+)
+
+// GPUPerInstanceProfiles maps a GPU index to the MIG profiles it offers.
+type GPUPerInstanceProfiles = map[int]GPUAvailableProfiles
+
+// GPUProfile is one MIG profile: its profile ID and max instance count.
+type GPUProfile struct {
+	id    int
+	total int
+}
+
+// GPUAvailableProfiles indexes the MIG profiles of one GPU by name.
+type GPUAvailableProfiles struct {
+	byname map[string]GPUProfile
+}
+
+// migProfileRE matches a profile row of `nvidia-smi mig -lgip`, capturing
+// GPU index, profile name, profile ID, free count and total count.
+var migProfileRE = regexp.MustCompile(`^\|\s+(\d+)\s+MIG\s+([^\s]+)\s+(\d+)\s+(\d+)\/(\d+).*\|$`)
+
+// ParseMIGAvailableProfiles parses `nvidia-smi mig -lgip` output into the
+// profiles of each GPU. Only GPU 0 is supported; other indices are an error.
+func ParseMIGAvailableProfiles(lgipOutput string) (GPUPerInstanceProfiles, error) {
+	profiles := make(GPUPerInstanceProfiles)
+	for _, line := range strings.Split(lgipOutput, "\n") {
+		m := migProfileRE.FindStringSubmatch(strings.TrimSpace(line))
+		if m == nil {
+			continue
+		}
+		glog.Infof("found profile: gpu: %s, profile: %-10s, id: %3s, free: %2s, total: %2s\n", m[1], m[2], m[3], m[4], m[5])
+		gpuID, gpuErr := strconv.Atoi(m[1])
+		profileID, idErr := strconv.Atoi(m[3])
+		total, totalErr := strconv.Atoi(m[5])
+		if gpuErr != nil || idErr != nil || totalErr != nil {
+			return nil, errors.New("numeric field out of range in nvidia-smi output: " + line)
+		}
+		if gpuID != 0 {
+			return nil, errors.New("multi-gpu systems are not supported yet")
+		}
+		if profiles[gpuID].byname == nil {
+			profiles[gpuID] = GPUAvailableProfiles{byname: make(map[string]GPUProfile)}
+		}
+		profiles[gpuID].byname[m[2]] = GPUProfile{id: profileID, total: total}
+	}
+	return profiles, nil
+}
diff --git a/partition_gpu/nvidia_smi_parser_test.go b/partition_gpu/nvidia_smi_parser_test.go
new file mode 100644
index 000000000..e909713fd
--- /dev/null
+++ b/partition_gpu/nvidia_smi_parser_test.go
@@ -0,0 +1,77 @@
+// Copyright 2022 Siemens AG. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "reflect" + "strings" + "testing" +) + +var PROFILES_A30 = GPUAvailableProfiles{ + byname: map[string]GPUProfile{ + "1g.6gb": { + id: 14, + total: 4, + }, + "1g.6gb+me": { + id: 21, + total: 1, + }, + "2g.12gb": { + id: 5, + total: 2, + }, + "4g.24gb": { + id: 0, + total: 1, + }, + }, +} + +var SMIOUTPUT_A30 string = strings.TrimSpace(` ++-----------------------------------------------------------------------------+ +| GPU instance profiles: | +| GPU Name ID Instances Memory P2P SM DEC ENC | +| Free/Total GiB CE JPEG OFA | +|=============================================================================| +| 0 MIG 1g.6gb 14 4/4 5.81 No 14 1 0 | +| 1 0 0 | ++-----------------------------------------------------------------------------+ +| 0 MIG 1g.6gb+me 21 1/1 5.81 No 14 1 0 | +| 1 1 1 | ++-----------------------------------------------------------------------------+ +| 0 MIG 2g.12gb 5 2/2 11.69 No 28 2 0 | +| 2 0 0 | ++-----------------------------------------------------------------------------+ +| 0 MIG 4g.24gb 0 1/1 23.44 No 56 4 0 | +| 4 1 1 | ++-----------------------------------------------------------------------------+ +`) + +func Test_parseA30Config(t *testing.T) { + got, err := ParseMIGAvailableProfiles(SMIOUTPUT_A30) + if err != nil { + t.Errorf("ParseMIGAvailableInstances() error = %v", err) + } + + if len(got) != 1 { + t.Errorf("ParseMIGAvailableInstances() len(res) = %v, expected = 1", len(got)) + } + + if !reflect.DeepEqual(got[0], PROFILES_A30) { + t.Errorf("ParseMIGAvailableInstances() got = %v, expected 
= %v", got[0], PROFILES_A30) + } +} From 859b5f7deadfd5e1be0c0b4ccbec18398a4e16e7 Mon Sep 17 00:00:00 2001 From: Felix Moessbauer Date: Wed, 27 Apr 2022 17:20:32 +0200 Subject: [PATCH 2/3] dynamically discover supported profiles of GPU This patch replaces the static discovery and mapping of GPU profiles (and sizes) by a dynamic discovery. By that, the plugin supports any partitionable GPUs. Signed-off-by: Felix Moessbauer --- partition_gpu/partition_gpu.go | 63 +++++++++++++---------------- partition_gpu/partition_gpu_test.go | 27 ++++++++++++- 2 files changed, 53 insertions(+), 37 deletions(-) diff --git a/partition_gpu/partition_gpu.go b/partition_gpu/partition_gpu.go index 0b6fd064c..adfd7d0c6 100644 --- a/partition_gpu/partition_gpu.go +++ b/partition_gpu/partition_gpu.go @@ -32,36 +32,6 @@ var ( gpuConfigFile = flag.String("gpu-config", "/etc/nvidia/gpu_config.json", "File with GPU configurations for device plugin") ) -var partitionSizeToProfileID = map[string]string{ - //nvidia-tesla-a100 - "1g.5gb": "19", - "2g.10gb": "14", - "3g.20gb": "9", - "4g.20gb": "5", - "7g.40gb": "0", - //nvidia-a100-80gb - "1g.10gb": "19", - "2g.20gb": "14", - "3g.40gb": "9", - "4g.40gb": "5", - "7g.80gb": "0", -} - -var partitionSizeMaxCount = map[string]int{ - //nvidia-tesla-a100 - "1g.5gb": 7, - "2g.10gb": 3, - "3g.20gb": 2, - "4g.20gb": 1, - "7g.40gb": 1, - //nvidia-a100-80gb - "1g.10gb": 7, - "2g.20gb": 3, - "3g.40gb": 2, - "4g.40gb": 1, - "7g.80gb": 1, -} - const SIGRTMIN = 34 // GPUConfig stores the settings used to configure the GPUs on a node. 
@@ -174,6 +144,30 @@ func rebootNode() error {
 	return syscall.Kill(1, SIGRTMIN+5)
 }
 
+// discoverPossibleGPUPartitions runs `nvidia-smi mig -lgip` and returns the
+// GPU instance profiles reported for GPU 0, indexed by profile name.
+func discoverPossibleGPUPartitions() (GPUAvailableProfiles, error) {
+	args := []string{"mig", "-lgip"}
+	glog.Infof("Running %s %s", *nvidiaSmiPath, strings.Join(args, " "))
+	out, err := exec.Command(*nvidiaSmiPath, args...).Output()
+	if err != nil {
+		return GPUAvailableProfiles{}, fmt.Errorf("failed to discover partitions, nvidia-smi output: %s, error: %w", string(out), err)
+	}
+	// Log before parsing so the raw output is available when parsing fails.
+	glog.Infof("Output:\n %s", string(out))
+
+	profiles, err := ParseMIGAvailableProfiles(string(out))
+	if err != nil {
+		return GPUAvailableProfiles{}, fmt.Errorf("failed to parse output of nvidia-smi: %w", err)
+	}
+	// Only the profiles of GPU 0 are used; report their absence explicitly.
+	gpu0, ok := profiles[0]
+	if !ok {
+		return GPUAvailableProfiles{}, fmt.Errorf("nvidia-smi reported no GPU instance profiles for GPU 0, output: %s", string(out))
+	}
+	return gpu0, nil
+}
+
 func cleanupAllGPUPartitions() error {
 	args := []string{"mig", "-dci"}
 	glog.Infof("Running %s %s", *nvidiaSmiPath, strings.Join(args, " "))
@@ -194,7 +188,12 @@ func cleanupAllGPUPartitions() error {
 }
 
 func createGPUPartitions(partitionSize string) error {
-	p, err := buildPartitionStr(partitionSize)
+	profiles, err := discoverPossibleGPUPartitions()
+	if err != nil {
+		return err
+	}
+
+	p, err := buildPartitionStr(partitionSize, profiles)
 	if err != nil {
 		return err
 	}
@@ -219,19 +218,19 @@ func createGPUPartitions(partitionSize string) error {
 
 }
 
-func buildPartitionStr(partitionSize string) (string, error) {
+func buildPartitionStr(partitionSize string, profiles GPUAvailableProfiles) (string, error) {
 	if partitionSize == "" {
 		return "", nil
 	}
 
-	p, ok := partitionSizeToProfileID[partitionSize]
+	p, ok := profiles.byname[partitionSize]
 	if !ok {
 		return "", fmt.Errorf("%s is not a valid partition size", partitionSize)
 	}
 
-	partitionStr := p
-	for i := 1; i < partitionSizeMaxCount[partitionSize]; i++ {
-		partitionStr += fmt.Sprintf(",%s", p)
+	partitionStr := fmt.Sprint(p.id)
+	for i := 1; i < p.total; i++ {
+		partitionStr += fmt.Sprintf(",%d", p.id)
 	}
 
 	return partitionStr, nil
diff --git
a/partition_gpu/partition_gpu_test.go b/partition_gpu/partition_gpu_test.go index a2adc8577..554bcd2b1 100644 --- a/partition_gpu/partition_gpu_test.go +++ b/partition_gpu/partition_gpu_test.go @@ -16,6 +16,31 @@ package main import "testing" +var PROFILES_A100 = GPUAvailableProfiles{ + byname: map[string]GPUProfile{ + "1g.5gb": { + id: 19, + total: 7, + }, + "2g.10gb": { + id: 14, + total: 3, + }, + "3g.20gb": { + id: 9, + total: 2, + }, + "4g.20gb": { + id: 5, + total: 1, + }, + "7g.40gb": { + id: 0, + total: 1, + }, + }, +} + func Test_buildPartitionStr(t *testing.T) { tests := []struct { name string @@ -50,7 +75,7 @@ func Test_buildPartitionStr(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := buildPartitionStr(tt.partitionSize) + got, err := buildPartitionStr(tt.partitionSize, PROFILES_A100) if (err != nil) != tt.wantErr { t.Errorf("buildPartitionStr() error = %v, wantErr %v", err, tt.wantErr) return From 1a9427b277e2ec4c17dfc009efe8d62fac00a39c Mon Sep 17 00:00:00 2001 From: Felix Moessbauer Date: Thu, 28 Apr 2022 09:30:30 +0200 Subject: [PATCH 3/3] remove partitioning checks only working for A100 This patch removes some sanity checks from nvidia_gpu that use hard-coded partition sizes. By that, we make the plugin compatible with other NVIDIA cards like the A30. Signed-off-by: Felix Moessbauer --- pkg/gpu/nvidia/mig/mig.go | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/pkg/gpu/nvidia/mig/mig.go b/pkg/gpu/nvidia/mig/mig.go index 0d19c0824..2b3cc8030 100644 --- a/pkg/gpu/nvidia/mig/mig.go +++ b/pkg/gpu/nvidia/mig/mig.go @@ -28,21 +28,6 @@ import ( const nvidiaDeviceRE = `^nvidia[0-9]*$` -// Max number of GPU partitions that can be created for each partition size. 
-// Source: https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning -var gpuPartitionSizeMaxCount = map[string]int{ - //nvidia-tesla-a100 - "1g.5gb": 7, - "2g.10gb": 3, - "3g.20gb": 2, - "7g.40gb": 1, - //nvidia-a100-80gb - "1g.10gb": 7, - "2g.20gb": 3, - "3g.40gb": 2, - "7g.80gb": 1, -} - // DeviceManager performs various management operations on mig devices. type DeviceManager struct { devDirectory string @@ -83,11 +68,6 @@ func (d *DeviceManager) Start(partitionSize string) error { return nil } - maxPartitionCount, ok := gpuPartitionSizeMaxCount[partitionSize] - if !ok { - return fmt.Errorf("%s is not a valid GPU partition size", partitionSize) - } - d.gpuPartitionSpecs = make(map[string][]pluginapi.DeviceSpec) nvidiaCapDir := path.Join(d.procDirectory, "driver/nvidia/capabilities") @@ -192,10 +172,6 @@ func (d *DeviceManager) Start(partitionSize string) error { } d.gpuPartitions[gpuInstanceID] = pluginapi.Device{ID: gpuInstanceID, Health: pluginapi.Healthy} } - - if numPartitions != maxPartitionCount { - return fmt.Errorf("Number of partitions (%d) for GPU %s does not match expected partition count (%d)", numPartitions, gpuID, maxPartitionCount) - } } numGPUs, err := d.discoverNumGPUs()