From 51272d000562531a8fe9f54b298a1c3003f2a4e0 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Thu, 8 Aug 2024 15:40:00 -0700 Subject: [PATCH] fetch current container runtime config through the command line Signed-off-by: Tariq Ibrahim add default runtime binary path to runtimes field of toolkit config toml Signed-off-by: Tariq Ibrahim --- .golangci.yml | 4 + cmd/nvidia-ctk/runtime/configure/configure.go | 72 ++++++-- internal/config/toml.go | 15 ++ pkg/config/engine/api.go | 6 + pkg/config/engine/containerd/config_v1.go | 38 +++++ pkg/config/engine/containerd/config_v2.go | 14 ++ pkg/config/engine/containerd/containerd.go | 11 ++ pkg/config/engine/crio/crio.go | 29 +++- pkg/config/engine/crio/crio_test.go | 4 +- pkg/config/engine/docker/docker.go | 27 +++ pkg/config/toml/source-cli.go | 44 +++++ pkg/config/toml/source.go | 13 ++ tools/container/container.go | 23 ++- tools/container/nvidia-toolkit/run.go | 19 ++- .../runtime/containerd/containerd.go | 27 ++- tools/container/runtime/crio/crio.go | 35 ++-- tools/container/runtime/docker/docker.go | 12 +- tools/container/runtime/runtime.go | 26 +-- tools/container/toolkit/toolkit.go | 157 ++++++++++++++++-- 19 files changed, 487 insertions(+), 89 deletions(-) create mode 100644 pkg/config/toml/source-cli.go diff --git a/.golangci.yml b/.golangci.yml index 825d94c3d..aecd7d7e2 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -18,6 +18,10 @@ linters: linters-settings: goimports: local-prefixes: github.com/NVIDIA/nvidia-container-toolkit + gosec: + excludes: + # TODO: Consider hardening security of command line invocations + - G204 issues: exclude: diff --git a/cmd/nvidia-ctk/runtime/configure/configure.go b/cmd/nvidia-ctk/runtime/configure/configure.go index 0b321d606..9e6130716 100644 --- a/cmd/nvidia-ctk/runtime/configure/configure.go +++ b/cmd/nvidia-ctk/runtime/configure/configure.go @@ -44,6 +44,14 @@ const ( defaultContainerdConfigFilePath = "/etc/containerd/config.toml" defaultCrioConfigFilePath = 
"/etc/crio/crio.conf" defaultDockerConfigFilePath = "/etc/docker/daemon.json" + + defaultConfigSource = configSourceCommand + configSourceCommand = "command" + configSourceFile = "file" + + runtimeContainerd = "containerd" + runtimeCrio = "crio" + runtimeDocker = "docker" ) type command struct { @@ -64,6 +72,7 @@ type config struct { dryRun bool runtime string configFilePath string + configSource string mode string hookFilePath string @@ -120,6 +129,12 @@ func (m command) build() *cli.Command { Usage: "the config mode for runtimes that support multiple configuration mechanisms", Destination: &config.mode, }, + &cli.StringFlag{ + Name: "config-source", + Usage: "the source to retrieve the container runtime configuration; one of [command, file]\"", + Destination: &config.configSource, + Value: defaultConfigSource, + }, &cli.StringFlag{ Name: "oci-hook-path", Usage: "the path to the OCI runtime hook to create if --config-mode=oci-hook is specified. If no path is specified, the generated hook is output to STDOUT.\n\tNote: The use of OCI hooks is deprecated.", @@ -174,14 +189,14 @@ func (m command) validateFlags(c *cli.Context, config *config) error { config.mode = "config-file" switch config.runtime { - case "containerd", "crio", "docker": + case runtimeContainerd, runtimeCrio, runtimeDocker: break default: return fmt.Errorf("unrecognized runtime '%v'", config.runtime) } switch config.runtime { - case "containerd", "crio": + case runtimeContainerd, runtimeCrio: if config.nvidiaRuntime.path == defaultNVIDIARuntimeExecutable { config.nvidiaRuntime.path = defaultNVIDIARuntimeExpecutablePath } @@ -190,7 +205,7 @@ func (m command) validateFlags(c *cli.Context, config *config) error { } } - if config.runtime != "containerd" && config.runtime != "docker" { + if config.runtime != runtimeContainerd && config.runtime != runtimeDocker { if config.cdi.enabled { m.logger.Warningf("Ignoring cdi.enabled flag for %v", config.runtime) } @@ -220,22 +235,27 @@ func (m command) 
configureWrapper(c *cli.Context, config *config) error { func (m command) configureConfigFile(c *cli.Context, config *config) error { configFilePath := config.resolveConfigFilePath() - var cfg engine.Interface var err error + configSource, err := config.resolveConfigSource() + if err != nil { + return err + } + + var cfg engine.Interface switch config.runtime { - case "containerd": + case runtimeContainerd: cfg, err = containerd.New( containerd.WithLogger(m.logger), containerd.WithPath(configFilePath), - containerd.WithConfigSource(toml.FromFile(configFilePath)), + containerd.WithConfigSource(configSource), ) - case "crio": + case runtimeCrio: cfg, err = crio.New( crio.WithLogger(m.logger), crio.WithPath(configFilePath), - crio.WithConfigSource(toml.FromFile(configFilePath)), + crio.WithConfigSource(configSource), ) - case "docker": + case runtimeDocker: cfg, err = docker.New( docker.WithLogger(m.logger), docker.WithPath(configFilePath), @@ -285,16 +305,40 @@ func (c *config) resolveConfigFilePath() string { return c.configFilePath } switch c.runtime { - case "containerd": + case runtimeContainerd: return defaultContainerdConfigFilePath - case "crio": + case runtimeCrio: return defaultCrioConfigFilePath - case "docker": + case runtimeDocker: return defaultDockerConfigFilePath } return "" } +// resolveConfigSource returns the default config source or the user provided config source +func (c *config) resolveConfigSource() (toml.Loader, error) { + switch c.configSource { + case configSourceCommand: + cmd := c.getConfigSourceCommand() + return toml.FromCommandLine(cmd), nil + case configSourceFile: + return toml.FromFile(c.configFilePath), nil + default: + return nil, fmt.Errorf("unrecognized config source: %s", c.configSource) + } +} + +// getConfigSourceCommand returns the default cli command to fetch the current runtime config +func (c *config) getConfigSourceCommand() []string { + switch c.runtime { + case runtimeContainerd: + return []string{"containerd", 
"config", "dump"} + case runtimeCrio: + return []string{"crio", "status", "config"} + } + return []string{} +} + // getOuputConfigPath returns the configured config path or "" if dry-run is enabled func (c *config) getOuputConfigPath() string { if c.dryRun { @@ -318,9 +362,9 @@ func enableCDI(config *config, cfg engine.Interface) error { return nil } switch config.runtime { - case "containerd": + case runtimeContainerd: cfg.Set("enable_cdi", true) - case "docker": + case runtimeDocker: cfg.Set("features", map[string]bool{"cdi": true}) default: return fmt.Errorf("enabling CDI in %s is not supported", config.runtime) diff --git a/internal/config/toml.go b/internal/config/toml.go index a1d37428d..60a641266 100644 --- a/internal/config/toml.go +++ b/internal/config/toml.go @@ -170,11 +170,26 @@ func (t *Toml) Get(key string) interface{} { return (*toml.Tree)(t).Get(key) } +// GetDefault returns the value for the specified key and falls back to the default value if the Get call fails +func (t *Toml) GetDefault(key string, def interface{}) interface{} { + val := t.Get(key) + if val == nil { + return def + } + return val +} + // Set sets the specified key to the specified value in the TOML config. func (t *Toml) Set(key string, value interface{}) { (*toml.Tree)(t).Set(key, value) } +// WriteTo encode the Tree as Toml and writes it to the writer w. +// Returns the number of bytes written in case of success, or an error if anything happened. +func (t *Toml) WriteTo(w io.Writer) (int64, error) { + return (*toml.Tree)(t).WriteTo(w) +} + // commentDefaults applies the required comments for default values to the Toml. 
func (t *Toml) commentDefaults() *Toml { asToml := (*toml.Tree)(t) diff --git a/pkg/config/engine/api.go b/pkg/config/engine/api.go index b074dadfa..449ceb484 100644 --- a/pkg/config/engine/api.go +++ b/pkg/config/engine/api.go @@ -23,4 +23,10 @@ type Interface interface { Set(string, interface{}) RemoveRuntime(string) error Save(string) (int64, error) + GetRuntimeConfig(string) (RuntimeConfig, error) +} + +// RuntimeConfig defines the interface to query container runtime handler configuration +type RuntimeConfig interface { + GetBinPath() string } diff --git a/pkg/config/engine/containerd/config_v1.go b/pkg/config/engine/containerd/config_v1.go index e94a22f5a..91c74e955 100644 --- a/pkg/config/engine/containerd/config_v1.go +++ b/pkg/config/engine/containerd/config_v1.go @@ -22,6 +22,7 @@ import ( "github.com/pelletier/go-toml" "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine" + cfgtoml "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/toml" ) // ConfigV1 represents a version 1 containerd config @@ -29,6 +30,19 @@ type ConfigV1 Config var _ engine.Interface = (*ConfigV1)(nil) +type ctrdCfgV1Runtime struct { + tree *cfgtoml.Tree +} + +var _ engine.RuntimeConfig = (*ctrdCfgV1Runtime)(nil) + +func (c *ctrdCfgV1Runtime) GetBinPath() string { + if binPath, ok := c.tree.GetPath([]string{"options", "BinaryName"}).(string); ok { + return binPath + } + return "" +} + // AddRuntime adds a runtime to the containerd config func (c *ConfigV1) AddRuntime(name string, path string, setAsDefault bool) error { if c == nil || c.Tree == nil { @@ -146,6 +160,18 @@ func (c *ConfigV1) RemoveRuntime(name string) error { return nil } +func (c *ConfigV1) GetRuntime(name string) (interface{}, error) { + if c == nil || c.Tree == nil { + return nil, fmt.Errorf("config is nil") + } + config := *c.Tree + runtimeData, ok := config.GetPath([]string{"plugins", "cri", "containerd", "runtimes", name}).(*toml.Tree) + if !ok { + return nil, fmt.Errorf("invalid toml object") + } + 
return runtimeData, nil +} + // SetOption sets the specified containerd option. func (c *ConfigV1) Set(key string, value interface{}) { config := *c.Tree @@ -157,3 +183,15 @@ func (c *ConfigV1) Set(key string, value interface{}) { func (c ConfigV1) Save(path string) (int64, error) { return (Config)(c).Save(path) } + +func (c *ConfigV1) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) { + if c == nil || c.Tree == nil { + return nil, fmt.Errorf("config is nil") + } + config := *c.Tree + runtimeData := config.GetSubtreeByPath([]string{"plugins", "cri", "containerd", "runtimes", name}) + + return &ctrdCfgV1Runtime{ + tree: runtimeData, + }, nil +} diff --git a/pkg/config/engine/containerd/config_v2.go b/pkg/config/engine/containerd/config_v2.go index 8f3e601f4..6c9678a82 100644 --- a/pkg/config/engine/containerd/config_v2.go +++ b/pkg/config/engine/containerd/config_v2.go @@ -19,9 +19,23 @@ package containerd import ( "fmt" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine" "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/toml" ) +type ctrdCfgV2Runtime struct { + tree *toml.Tree +} + +var _ engine.RuntimeConfig = (*ctrdCfgV2Runtime)(nil) + +func (c *ctrdCfgV2Runtime) GetBinPath() string { + if binPath, ok := c.tree.GetPath([]string{"options", "BinaryName"}).(string); ok { + return binPath + } + return "" +} + // AddRuntime adds a runtime to the containerd config func (c *Config) AddRuntime(name string, path string, setAsDefault bool) error { if c == nil || c.Tree == nil { diff --git a/pkg/config/engine/containerd/containerd.go b/pkg/config/engine/containerd/containerd.go index 92bf9fff8..991dd89be 100644 --- a/pkg/config/engine/containerd/containerd.go +++ b/pkg/config/engine/containerd/containerd.go @@ -98,3 +98,14 @@ func (c *Config) parseVersion(useLegacyConfig bool) (int, error) { return -1, fmt.Errorf("unsupported type for version field: %v", v) } } + +func (c *Config) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) { + 
if c == nil || c.Tree == nil { + return nil, fmt.Errorf("config is nil") + } + config := *c.Tree + runtimeData := config.GetSubtreeByPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name}) + return &ctrdCfgV2Runtime{ + tree: runtimeData, + }, nil +} diff --git a/pkg/config/engine/crio/crio.go b/pkg/config/engine/crio/crio.go index d243372da..d7b926886 100644 --- a/pkg/config/engine/crio/crio.go +++ b/pkg/config/engine/crio/crio.go @@ -30,6 +30,19 @@ type Config struct { Logger logger.Interface } +type crioRuntime struct { + tree *toml.Tree +} + +var _ engine.RuntimeConfig = (*crioRuntime)(nil) + +func (c *crioRuntime) GetBinPath() string { + if binaryPath, ok := c.tree.GetPath([]string{"runtime_path"}).(string); ok { + return binaryPath + } + return "" +} + var _ engine.Interface = (*Config)(nil) // New creates a cri-o config with the specified options @@ -65,11 +78,12 @@ func (c *Config) AddRuntime(name string, path string, setAsDefault bool) error { config := *c.Tree - // By default we extract the runtime options from the runc settings; if this does not exist we get the options from the default runtime specified in the config. - runtimeNamesForConfig := []string{"runc"} + // We prefer the runtime options of the default runtime specified in the config (if one is set); if these do not exist we fall back to the options from the runc settings. 
+ var runtimeNamesForConfig []string if name, ok := config.GetPath([]string{"crio", "runtime", "default_runtime"}).(string); ok && name != "" { runtimeNamesForConfig = append(runtimeNamesForConfig, name) } + runtimeNamesForConfig = append(runtimeNamesForConfig, "runc") for _, r := range runtimeNamesForConfig { if options, ok := config.GetPath([]string{"crio", "runtime", "runtimes", r}).(*toml.Tree); ok { c.Logger.Debugf("using options from runtime %v: %v", r, options.String()) @@ -129,3 +143,14 @@ func (c *Config) RemoveRuntime(name string) error { *c.Tree = config return nil } + +func (c *Config) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) { + if c == nil || c.Tree == nil { + return nil, fmt.Errorf("config is nil") + } + config := *c.Tree + runtimeData := config.GetSubtreeByPath([]string{"crio", "runtime", "runtimes", name}) + return &crioRuntime{ + tree: runtimeData, + }, nil +} diff --git a/pkg/config/engine/crio/crio_test.go b/pkg/config/engine/crio/crio_test.go index d2b81b9e8..84689194f 100644 --- a/pkg/config/engine/crio/crio_test.go +++ b/pkg/config/engine/crio/crio_test.go @@ -91,7 +91,7 @@ func TestAddRuntime(t *testing.T) { `, }, { - description: "options from runc take precedence over default runtime", + description: "options from runc DO NOT take precedence over default runtime", config: ` [crio] [crio.runtime] @@ -120,7 +120,7 @@ func TestAddRuntime(t *testing.T) { [crio.runtime.runtimes.test] runtime_path = "/usr/bin/test" runtime_type = "oci" - runc_option = "option" + default_option = "option" `, }, } diff --git a/pkg/config/engine/docker/docker.go b/pkg/config/engine/docker/docker.go index 45a96255d..a824e6897 100644 --- a/pkg/config/engine/docker/docker.go +++ b/pkg/config/engine/docker/docker.go @@ -18,6 +18,7 @@ package docker import ( "encoding/json" + "errors" "fmt" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" @@ -132,3 +133,29 @@ func (c Config) Save(path string) (int64, error) { n, err := 
config.Raw(path).Write(output) return int64(n), err } + +// GetRuntime returns the config entry for the named runtime from the "runtimes" section of the docker config +func (c *Config) GetRuntime(name string) (interface{}, error) { + if c == nil { + return nil, fmt.Errorf("config is nil") + } + + config := *c + + var runtimes map[string]interface{} + if _, exists := config["runtimes"]; exists { + runtimes = config["runtimes"].(map[string]interface{}) + } else { + return nil, fmt.Errorf("config is nil") + } + + if r, ok := runtimes[name]; ok { + return r, nil + } else { + return nil, fmt.Errorf("runtime %s not found", name) + } +} + +func (c *Config) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) { + return nil, errors.New("not implemented") +} diff --git a/pkg/config/toml/source-cli.go b/pkg/config/toml/source-cli.go new file mode 100644 index 000000000..7a2fa0c32 --- /dev/null +++ b/pkg/config/toml/source-cli.go @@ -0,0 +1,44 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package toml + +import ( + "bytes" + "fmt" + "os/exec" +) + +type tomlCliSource struct { + command string + args []string +} + +func (c tomlCliSource) Load() (*Tree, error) { + //nolint:gosec // Subprocess launched with a potential tainted input or cmd arguments + cmd := exec.Command(c.command, c.args...) 
+ + var outb bytes.Buffer + var errb bytes.Buffer + + cmd.Stdout = &outb + cmd.Stderr = &errb + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("failed to run command %v %v: %w", c.command, c.args, err) + } + + return LoadBytes(outb.Bytes()) +} diff --git a/pkg/config/toml/source.go b/pkg/config/toml/source.go index 2bd9191d4..f835ca8da 100644 --- a/pkg/config/toml/source.go +++ b/pkg/config/toml/source.go @@ -33,3 +33,16 @@ func FromFile(path string) Loader { } return tomlFile(path) } + +// FromCommandLine creates a TOML source from the output +// of a shell command specified via string slice. +// If an empty slice is passed an empty toml config is used. +func FromCommandLine(params []string) Loader { + if len(params) == 0 { + return Empty + } + return &tomlCliSource{ + command: params[0], + args: params[1:], + } +} diff --git a/tools/container/container.go b/tools/container/container.go index c2c50c5b6..9ead88753 100644 --- a/tools/container/container.go +++ b/tools/container/container.go @@ -36,13 +36,12 @@ const ( // Options defines the shared options for the CLIs to configure containers runtimes. 
type Options struct { - Config string - Socket string - RuntimeName string - RuntimeDir string - SetAsDefault bool - RestartMode string - HostRootMount string + Config string + Socket string + RuntimeName string + RuntimeDir string + SetAsDefault bool + RestartMode string } // ParseArgs parses the command line arguments to the CLI @@ -132,7 +131,7 @@ func (o Options) RevertConfig(cfg engine.Interface) error { } // Restart restarts the specified service -func (o Options) Restart(service string, withSignal func(string) error) error { +func (o Options) Restart(service, hostRoot string, withSignal func(string) error) error { switch o.RestartMode { case restartModeNone: logrus.Warningf("Skipping restart of %v due to --restart-mode=%v", service, o.RestartMode) @@ -140,19 +139,19 @@ func (o Options) Restart(service string, withSignal func(string) error) error { case restartModeSignal: return withSignal(o.Socket) case restartModeSystemd: - return o.SystemdRestart(service) + return o.SystemdRestart(service, hostRoot) } return fmt.Errorf("invalid restart mode specified: %v", o.RestartMode) } // SystemdRestart restarts the specified service using systemd -func (o Options) SystemdRestart(service string) error { +func (o Options) SystemdRestart(service, hostRoot string) error { var args []string var msg string - if o.HostRootMount != "" { + if hostRoot != "" { msg = " on host" - args = append(args, "chroot", o.HostRootMount) + args = append(args, "chroot", hostRoot) } args = append(args, "systemctl", "restart", service) diff --git a/tools/container/nvidia-toolkit/run.go b/tools/container/nvidia-toolkit/run.go index 65a807ec2..277be8d1f 100644 --- a/tools/container/nvidia-toolkit/run.go +++ b/tools/container/nvidia-toolkit/run.go @@ -22,8 +22,9 @@ const ( toolkitCommand = "toolkit" toolkitSubDir = "toolkit" - defaultRuntime = "docker" - defaultRuntimeArgs = "" + defaultRuntime = "docker" + defaultRuntimeArgs = "" + defaultHostRootMount = "/host" ) var availableRuntimes = 
map[string]struct{}{"docker": {}, "crio": {}, "containerd": {}} @@ -37,6 +38,7 @@ type options struct { runtime string runtimeArgs string root string + hostRoot string pidFile string toolkitOptions toolkit.Options @@ -114,6 +116,13 @@ func main() { Destination: &options.pidFile, EnvVars: []string{"TOOLKIT_PID_FILE", "PID_FILE"}, }, + &cli.StringFlag{ + Name: "host-root", + Usage: "Specify the path to the host root to be used when executing shell commands.", + Value: defaultHostRootMount, + Destination: &options.hostRoot, + EnvVars: []string{"HOST_ROOT_MOUNT"}, + }, } c.Flags = append(c.Flags, toolkit.Flags(&options.toolkitOptions)...) @@ -155,12 +164,12 @@ func Run(c *cli.Context, o *options) error { } defer shutdown(o.pidFile) - err = toolkit.Install(c, &o.toolkitOptions, o.toolkitRoot()) + err = toolkit.Install(c, &o.toolkitOptions, o.toolkitRoot(), o.hostRoot) if err != nil { return fmt.Errorf("unable to install toolkit: %v", err) } - err = runtime.Setup(c, &o.runtimeOptions, o.runtime) + err = runtime.Setup(c, &o.runtimeOptions, o.runtime, o.hostRoot) if err != nil { return fmt.Errorf("unable to setup runtime: %v", err) } @@ -171,7 +180,7 @@ func Run(c *cli.Context, o *options) error { return fmt.Errorf("unable to wait for signal: %v", err) } - err = runtime.Cleanup(c, &o.runtimeOptions, o.runtime) + err = runtime.Cleanup(c, &o.runtimeOptions, o.runtime, o.hostRoot) if err != nil { return fmt.Errorf("unable to cleanup runtime: %v", err) } diff --git a/tools/container/runtime/containerd/containerd.go b/tools/container/runtime/containerd/containerd.go index df5db6d62..7b1d3ec23 100644 --- a/tools/container/runtime/containerd/containerd.go +++ b/tools/container/runtime/containerd/containerd.go @@ -24,6 +24,7 @@ import ( cli "github.com/urfave/cli/v2" "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine/containerd" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/toml" "github.com/NVIDIA/nvidia-container-toolkit/tools/container" ) @@ -80,11 +81,13 
@@ func Flags(opts *Options) []cli.Flag { } // Setup updates a containerd configuration to include the nvidia-containerd-runtime and reloads it -func Setup(c *cli.Context, o *container.Options, co *Options) error { +func Setup(c *cli.Context, hostRoot string, o *container.Options, co *Options) error { log.Infof("Starting 'setup' for %v", c.App.Name) + containerdCommand := getContainerdConfigCommand(hostRoot) cfg, err := containerd.New( containerd.WithPath(o.Config), + containerd.WithConfigSource(toml.FromCommandLine(containerdCommand)), containerd.WithRuntimeType(co.runtimeType), containerd.WithUseLegacyConfig(co.useLegacyConfig), containerd.WithContainerAnnotations(co.containerAnnotationsFromCDIPrefixes()...), @@ -98,7 +101,7 @@ func Setup(c *cli.Context, o *container.Options, co *Options) error { return fmt.Errorf("unable to configure containerd: %v", err) } - err = RestartContainerd(o) + err = RestartContainerd(hostRoot, o) if err != nil { return fmt.Errorf("unable to restart containerd: %v", err) } @@ -109,11 +112,13 @@ func Setup(c *cli.Context, o *container.Options, co *Options) error { } // Cleanup reverts a containerd configuration to remove the nvidia-containerd-runtime and reloads it -func Cleanup(c *cli.Context, o *container.Options, co *Options) error { +func Cleanup(c *cli.Context, hostRoot string, o *container.Options, co *Options) error { log.Infof("Starting 'cleanup' for %v", c.App.Name) + containerdCommand := getContainerdConfigCommand(hostRoot) cfg, err := containerd.New( containerd.WithPath(o.Config), + containerd.WithConfigSource(toml.FromCommandLine(containerdCommand)), containerd.WithRuntimeType(co.runtimeType), containerd.WithUseLegacyConfig(co.useLegacyConfig), containerd.WithContainerAnnotations(co.containerAnnotationsFromCDIPrefixes()...), @@ -127,7 +132,7 @@ func Cleanup(c *cli.Context, o *container.Options, co *Options) error { return fmt.Errorf("unable to unconfigure containerd: %v", err) } - err = RestartContainerd(o) + err = 
RestartContainerd(hostRoot, o) if err != nil { return fmt.Errorf("unable to restart containerd: %v", err) } @@ -138,8 +143,8 @@ func Cleanup(c *cli.Context, o *container.Options, co *Options) error { } // RestartContainerd restarts containerd depending on the value of restartModeFlag -func RestartContainerd(o *container.Options) error { - return o.Restart("containerd", SignalContainerd) +func RestartContainerd(hostRoot string, o *container.Options) error { + return o.Restart("containerd", hostRoot, SignalContainerd) } // containerAnnotationsFromCDIPrefixes returns the container annotations to set for the given CDI prefixes. @@ -164,3 +169,13 @@ func (o *Options) runtimeConfigOverride() (map[string]interface{}, error) { return runtimeOptions, nil } + +// getContainerdConfigCommand returns a string slice which contains the CLI args to retrieve the current runtime configuration +func getContainerdConfigCommand(hostRoot string) []string { + var cliArgs []string + if hostRoot != "" { + cliArgs = append(cliArgs, "chroot", hostRoot) + } + cliArgs = append(cliArgs, "containerd", "config", "dump") + return cliArgs +} diff --git a/tools/container/runtime/crio/crio.go b/tools/container/runtime/crio/crio.go index 69482191f..547d813eb 100644 --- a/tools/container/runtime/crio/crio.go +++ b/tools/container/runtime/crio/crio.go @@ -27,6 +27,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine/crio" "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/ocihook" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/toml" "github.com/NVIDIA/nvidia-container-toolkit/tools/container" ) @@ -85,14 +86,14 @@ func Flags(opts *Options) []cli.Flag { } // Setup installs the prestart hook required to launch GPU-enabled containers -func Setup(c *cli.Context, o *container.Options, co *Options) error { +func Setup(c *cli.Context, hostRoot string, o *container.Options, co *Options) error { log.Infof("Starting 
'setup' for %v", c.App.Name) switch co.configMode { case "hook": return setupHook(o, co) case "config": - return setupConfig(o) + return setupConfig(hostRoot, o) default: return fmt.Errorf("invalid config-mode '%v'", co.configMode) } @@ -112,11 +113,13 @@ func setupHook(o *container.Options, co *Options) error { } // setupConfig updates the cri-o config for the NVIDIA container runtime -func setupConfig(o *container.Options) error { +func setupConfig(hostRoot string, o *container.Options) error { log.Infof("Updating config file") + crioCommand := getCRIOConfigCommand(hostRoot) cfg, err := crio.New( crio.WithPath(o.Config), + crio.WithConfigSource(toml.FromCommandLine(crioCommand)), ) if err != nil { return fmt.Errorf("unable to load config: %v", err) @@ -127,7 +130,7 @@ func setupConfig(o *container.Options) error { return fmt.Errorf("unable to configure cri-o: %v", err) } - err = RestartCrio(o) + err = RestartCrio(o, hostRoot) if err != nil { return fmt.Errorf("unable to restart crio: %v", err) } @@ -136,14 +139,14 @@ func setupConfig(o *container.Options) error { } // Cleanup removes the specified prestart hook -func Cleanup(c *cli.Context, o *container.Options, co *Options) error { +func Cleanup(c *cli.Context, hostRoot string, o *container.Options, co *Options) error { log.Infof("Starting 'cleanup' for %v", c.App.Name) switch co.configMode { case "hook": return cleanupHook(co) case "config": - return cleanupConfig(o) + return cleanupConfig(hostRoot, o) default: return fmt.Errorf("invalid config-mode '%v'", co.configMode) } @@ -163,11 +166,13 @@ func cleanupHook(co *Options) error { } // cleanupConfig removes the NVIDIA container runtime from the cri-o config -func cleanupConfig(o *container.Options) error { +func cleanupConfig(hostRoot string, o *container.Options) error { log.Infof("Reverting config file modifications") + crioCommand := getCRIOConfigCommand(hostRoot) cfg, err := crio.New( crio.WithPath(o.Config), + 
crio.WithConfigSource(toml.FromCommandLine(crioCommand)), ) if err != nil { return fmt.Errorf("unable to load config: %v", err) @@ -178,7 +183,7 @@ func cleanupConfig(o *container.Options) error { return fmt.Errorf("unable to unconfigure cri-o: %v", err) } - err = RestartCrio(o) + err = RestartCrio(o, hostRoot) if err != nil { return fmt.Errorf("unable to restart crio: %v", err) } @@ -187,6 +192,16 @@ func cleanupConfig(o *container.Options) error { } // RestartCrio restarts crio depending on the value of restartModeFlag -func RestartCrio(o *container.Options) error { - return o.Restart("crio", func(string) error { return fmt.Errorf("supporting crio via signal is unsupported") }) +func RestartCrio(o *container.Options, hostRoot string) error { + return o.Restart("crio", hostRoot, func(string) error { return fmt.Errorf("supporting crio via signal is unsupported") }) +} + +// getCRIOConfigCommand returns a string slice which contains the CLI args to retrieve the current runtime configuration +func getCRIOConfigCommand(hostRoot string) []string { + var cliArgs []string + if hostRoot != "" { + cliArgs = append(cliArgs, "chroot", hostRoot) + } + cliArgs = append(cliArgs, "crio", "status", "config") + return cliArgs } diff --git a/tools/container/runtime/docker/docker.go b/tools/container/runtime/docker/docker.go index f3524825e..5542c5d83 100644 --- a/tools/container/runtime/docker/docker.go +++ b/tools/container/runtime/docker/docker.go @@ -41,7 +41,7 @@ func Flags(opts *Options) []cli.Flag { } // Setup updates docker configuration to include the nvidia runtime and reloads it -func Setup(c *cli.Context, o *container.Options) error { +func Setup(c *cli.Context, hostRoot string, o *container.Options) error { log.Infof("Starting 'setup' for %v", c.App.Name) cfg, err := docker.New( @@ -56,7 +56,7 @@ func Setup(c *cli.Context, o *container.Options) error { return fmt.Errorf("unable to configure docker: %v", err) } - err = RestartDocker(o) + err = RestartDocker(hostRoot, o) 
if err != nil { return fmt.Errorf("unable to restart docker: %v", err) } @@ -67,7 +67,7 @@ func Setup(c *cli.Context, o *container.Options) error { } // Cleanup reverts docker configuration to remove the nvidia runtime and reloads it -func Cleanup(c *cli.Context, o *container.Options) error { +func Cleanup(c *cli.Context, hostRoot string, o *container.Options) error { log.Infof("Starting 'cleanup' for %v", c.App.Name) cfg, err := docker.New( @@ -82,7 +82,7 @@ func Cleanup(c *cli.Context, o *container.Options) error { return fmt.Errorf("unable to unconfigure docker: %v", err) } - err = RestartDocker(o) + err = RestartDocker(hostRoot, o) if err != nil { return fmt.Errorf("unable to signal docker: %v", err) } @@ -93,6 +93,6 @@ func Cleanup(c *cli.Context, o *container.Options) error { } // RestartDocker restarts docker depending on the value of restartModeFlag -func RestartDocker(o *container.Options) error { - return o.Restart("docker", SignalDocker) +func RestartDocker(hostRoot string, o *container.Options) error { + return o.Restart("docker", hostRoot, SignalDocker) } diff --git a/tools/container/runtime/runtime.go b/tools/container/runtime/runtime.go index 6fa5442a7..6a6fd87a6 100644 --- a/tools/container/runtime/runtime.go +++ b/tools/container/runtime/runtime.go @@ -30,8 +30,7 @@ import ( const ( defaultSetAsDefault = true // defaultRuntimeName specifies the NVIDIA runtime to be use as the default runtime if setting the default runtime is enabled - defaultRuntimeName = "nvidia" - defaultHostRootMount = "/host" + defaultRuntimeName = "nvidia" runtimeSpecificDefault = "RUNTIME_SPECIFIC_DEFAULT" ) @@ -66,13 +65,6 @@ func Flags(opts *Options) []cli.Flag { Destination: &opts.RestartMode, EnvVars: []string{"RUNTIME_RESTART_MODE"}, }, - &cli.StringFlag{ - Name: "host-root", - Usage: "Specify the path to the host root to be used when restarting the runtime using systemd", - Value: defaultHostRootMount, - Destination: &opts.HostRootMount, - EnvVars: 
[]string{"HOST_ROOT_MOUNT"}, - }, &cli.StringFlag{ Name: "runtime-name", Aliases: []string{"nvidia-runtime-name", "runtime-class"}, @@ -141,27 +133,27 @@ func ValidateOptions(opts *Options, runtime string, toolkitRoot string) error { return nil } -func Setup(c *cli.Context, opts *Options, runtime string) error { +func Setup(c *cli.Context, opts *Options, runtime, hostRoot string) error { switch runtime { case containerd.Name: - return containerd.Setup(c, &opts.Options, &opts.containerdOptions) + return containerd.Setup(c, hostRoot, &opts.Options, &opts.containerdOptions) case crio.Name: - return crio.Setup(c, &opts.Options, &opts.crioOptions) + return crio.Setup(c, hostRoot, &opts.Options, &opts.crioOptions) case docker.Name: - return docker.Setup(c, &opts.Options) + return docker.Setup(c, hostRoot, &opts.Options) default: return fmt.Errorf("undefined runtime %v", runtime) } } -func Cleanup(c *cli.Context, opts *Options, runtime string) error { +func Cleanup(c *cli.Context, opts *Options, runtime, hostRoot string) error { switch runtime { case containerd.Name: - return containerd.Cleanup(c, &opts.Options, &opts.containerdOptions) + return containerd.Cleanup(c, hostRoot, &opts.Options, &opts.containerdOptions) case crio.Name: - return crio.Cleanup(c, &opts.Options, &opts.crioOptions) + return crio.Cleanup(c, hostRoot, &opts.Options, &opts.crioOptions) case docker.Name: - return docker.Cleanup(c, &opts.Options) + return docker.Cleanup(c, hostRoot, &opts.Options) default: return fmt.Errorf("undefined runtime %v", runtime) } diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go index 484d7891b..f74b4d569 100644 --- a/tools/container/toolkit/toolkit.go +++ b/tools/container/toolkit/toolkit.go @@ -24,7 +24,6 @@ import ( "path/filepath" "strings" - toml "github.com/pelletier/go-toml" log "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" "tags.cncf.io/container-device-interface/pkg/cdi" @@ -32,6 +31,10 @@ import ( 
"github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine/containerd" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine/crio" + cfgtoml "github.com/NVIDIA/nvidia-container-toolkit/pkg/config/toml" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" transformroot "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform/root" ) @@ -40,6 +43,9 @@ const ( // DefaultNvidiaDriverRoot specifies the default NVIDIA driver run directory DefaultNvidiaDriverRoot = "/run/nvidia/driver" + // DefaultHostRootMount specifies the path to the host root to be used when executing shell commands + DefaultHostRootMount = "/host" + nvidiaContainerCliSource = "/usr/bin/nvidia-container-cli" nvidiaContainerRuntimeHookSource = "/usr/bin/nvidia-container-runtime-hook" @@ -79,6 +85,8 @@ type Options struct { acceptNVIDIAVisibleDevicesWhenUnprivileged bool acceptNVIDIAVisibleDevicesAsVolumeMounts bool + toolkitConfigSource string + ignoreErrors bool } @@ -190,6 +198,13 @@ func Flags(opts *Options) []cli.Flag { Destination: &opts.cdiKind, EnvVars: []string{"CDI_KIND"}, }, + &cli.StringFlag{ + Name: "toolkit-config-source", + Usage: "The file where the NVIDIA Container toolkit source configuration is specified", + Value: nvidiaContainerToolkitConfigSource, + Destination: &opts.toolkitConfigSource, + EnvVars: []string{"TOOLKIT_CONFIG_SOURCE"}, + }, &cli.BoolFlag{ Name: "ignore-errors", Usage: "ignore errors when installing the NVIDIA Container toolkit. This is used for testing purposes only.", @@ -279,7 +294,7 @@ func TryDelete(cli *cli.Context, toolkitRoot string) error { // Install installs the components of the NVIDIA container toolkit. // Any existing installation is removed. 
-func Install(cli *cli.Context, opts *Options, toolkitRoot string) error { +func Install(cli *cli.Context, opts *Options, toolkitRoot, hostRoot string) error { log.Infof("Installing NVIDIA container toolkit to '%v'", toolkitRoot) log.Infof("Removing existing NVIDIA container toolkit installation") @@ -342,7 +357,14 @@ func Install(cli *cli.Context, opts *Options, toolkitRoot string) error { log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA Container CDI Hook CLI: %v", err)) } - err = installToolkitConfig(cli, toolkitConfigPath, nvidiaContainerCliExecutable, nvidiaCTKPath, nvidiaContainerRuntimeHookPath, opts) + var runtimeBinPaths []string + runtimeBinPaths, err = getRuntimeBinaryPaths(hostRoot) + if err != nil { + log.Warningf("Error retrieving runtime binary paths: %v", err) + } + + err = installToolkitConfig(cli, toolkitConfigPath, nvidiaContainerCliExecutable, nvidiaCTKPath, nvidiaContainerRuntimeHookPath, + runtimeBinPaths, opts) if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err) } else if err != nil { @@ -416,10 +438,13 @@ func installLibrary(libName string, toolkitRoot string) error { // installToolkitConfig installs the config file for the NVIDIA container toolkit ensuring // that the settings are updated to match the desired install and nvidia driver directories. 
-func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContainerCliExecutablePath string, nvidiaCTKPath string, nvidaContainerRuntimeHookPath string, opts *Options) error { +func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContainerCliExecutablePath string, nvidiaCTKPath string, + nvidaContainerRuntimeHookPath string, runtimeBinaryPaths []string, opts *Options) error { log.Infof("Installing NVIDIA container toolkit config '%v'", toolkitConfigPath) - cfg, err := loadConfig(nvidiaContainerToolkitConfigSource) + cfg, err := config.New( + config.WithConfigFile(opts.toolkitConfigSource), + ) if err != nil { return fmt.Errorf("could not open source config file: %v", err) } @@ -436,6 +461,18 @@ func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContai // Use the driver run root as the root: driverLdconfigPath := config.NormalizeLDConfigPath("@" + filepath.Join(opts.DriverRoot, strings.TrimPrefix(ldconfigPath, "@/"))) + var ctkRuntimes []string + defaultCfg, err := cfg.Config() + if err == nil { + defaultCfgRuntimes := defaultCfg.NVIDIAContainerRuntimeConfig.Runtimes + if len(runtimeBinaryPaths) > 0 { + ctkRuntimes = append(ctkRuntimes, runtimeBinaryPaths...) + } + ctkRuntimes = append(ctkRuntimes, defaultCfgRuntimes...) 
+ } else { + log.Warningf("could not get default toolkit config: %v", err) + } + configValues := map[string]interface{}{ // Set the options in the root toml table "accept-nvidia-visible-devices-envvar-when-unprivileged": opts.acceptNVIDIAVisibleDevicesWhenUnprivileged, @@ -450,6 +487,11 @@ func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContai "nvidia-container-runtime-hook.path": nvidaContainerRuntimeHookPath, "nvidia-container-runtime-hook.skip-mode-detection": opts.ContainerRuntimeHookSkipModeDetection, } + + if len(ctkRuntimes) > 0 { + configValues["nvidia-container-runtime.runtimes"] = ctkRuntimes + } + for key, value := range configValues { cfg.Set(key, value) } @@ -503,16 +545,6 @@ func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContai return nil } -func loadConfig(path string) (*toml.Tree, error) { - _, err := os.Stat(path) - if err == nil { - return toml.LoadFile(path) - } else if os.IsNotExist(err) { - return toml.TreeFromMap(nil) - } - return nil, err -} - // installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper. 
func installContainerToolkitCLI(toolkitDir string) (string, error) { e := executable{
@@ -793,3 +825,101 @@ func generateCDISpec(opts *Options, nvidiaCDIHookPath string) error {
 
 	return nil
 }
+
+// getRuntimeBinaryPaths extracts the full paths of the low-level runtime binaries specified in the container runtime config.
+// The container runtime is read from the RUNTIME environment variable; if it is unset or is neither containerd nor crio,
+// no paths and a nil error are returned. Errors from loading the runtime config are returned to the caller rather than
+// logged here, so that each failure is reported once.
+func getRuntimeBinaryPaths(hostRoot string) ([]string, error) {
+	r, ok := os.LookupEnv("RUNTIME")
+	if !ok {
+		return nil, nil
+	}
+
+	runtimeConfigCommand := getRuntimeConfigCommand(hostRoot, r)
+	switch r {
+	case "containerd":
+		return getContainerdRuntimeBinaryPaths(runtimeConfigCommand)
+	case "crio":
+		return getCrioRuntimeBinaryPaths(runtimeConfigCommand)
+	}
+	return nil, nil
+}
+
+// getContainerdRuntimeBinaryPaths extracts the full paths of the low-level runtime binaries specified in the containerd runtime config.
+// cliArgs is the command line whose output is used as the containerd config source.
+func getContainerdRuntimeBinaryPaths(cliArgs []string) ([]string, error) {
+	cfg, err := containerd.New(
+		containerd.WithConfigSource(cfgtoml.FromCommandLine(cliArgs)),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("unable to load containerd config: %w", err)
+	}
+
+	// NOTE(review): runc is queried before the default runtime here, while the CRI-O variant queries the default first; confirm the intended precedence.
+	runtimeHandlers := []string{"runc"}
+	defaultRuntime := cfg.DefaultRuntime()
+	if defaultRuntime != "" {
+		runtimeHandlers = append(runtimeHandlers, defaultRuntime)
+	}
+
+	return getBinaryPathsForHandlers(cfg, runtimeHandlers), nil
+}
+
+// getCrioRuntimeBinaryPaths extracts the full paths of the low-level runtime binaries specified in the CRI-O runtime config.
+// cliArgs is the command line whose output is used as the CRI-O config source.
+func getCrioRuntimeBinaryPaths(cliArgs []string) ([]string, error) {
+	cfg, err := crio.New(
+		crio.WithConfigSource(cfgtoml.FromCommandLine(cliArgs)),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("unable to load crio config: %w", err)
+	}
+
+	var runtimeHandlers []string
+	defaultRuntime := cfg.DefaultRuntime()
+	if defaultRuntime != "" {
+		runtimeHandlers = append(runtimeHandlers, defaultRuntime)
+	}
+	runtimeHandlers = append(runtimeHandlers, "runc")
+
+	return getBinaryPathsForHandlers(cfg, runtimeHandlers), nil
+}
+
+// getBinaryPathsForHandlers returns the binary paths configured for the given runtime handlers, in handler order.
+// Handlers whose config cannot be retrieved are logged and skipped, and each distinct path is returned only once.
+func getBinaryPathsForHandlers(cfg engine.Interface, handlers []string) []string {
+	var binPaths []string
+	seen := make(map[string]bool)
+	for _, rh := range handlers {
+		runtimeCfg, err := cfg.GetRuntimeConfig(rh)
+		if err != nil {
+			log.Warningf("Unable to determine runtime binary path: %v", err)
+			continue
+		}
+		binPath := runtimeCfg.GetBinPath()
+		// Skip handlers without a binary path and paths already collected (e.g. when the default runtime is runc).
+		if binPath == "" || seen[binPath] {
+			continue
+		}
+		seen[binPath] = true
+		binPaths = append(binPaths, binPath)
+	}
+	return binPaths
+}
+
+// getRuntimeConfigCommand returns the default cli command to fetch the current runtime config.
+// If hostRoot is non-empty the command is prefixed with "chroot <hostRoot>"; an empty command is returned for other runtimes.
+func getRuntimeConfigCommand(hostRoot, runtime string) []string {
+	var cliArgs []string
+	if hostRoot != "" {
+		cliArgs = append(cliArgs, "chroot", hostRoot)
+	}
+	switch runtime {
+	case "containerd":
+		return append(cliArgs, "containerd", "config", "dump")
+	case "crio":
+		return append(cliArgs, "crio", "status", "config")
+	}
+	return []string{}
+}