Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CTX-6655: Fix node container losing access to gpu randomly. #259

Merged
merged 7 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion coretex/cli/modules/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import os
import logging
import requests
import platform

import click

Expand Down Expand Up @@ -388,11 +389,41 @@ def configureNode(advanced: bool) -> NodeConfiguration:
else:
nodeConfig.image = "coretexai/coretex-node"

if isGPUAvailable():
if isGPUAvailable() and not docker.isDockerDesktop():
dule1322 marked this conversation as resolved.
Show resolved Hide resolved
nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True)
else:
nodeConfig.allowGpu = False

if nodeConfig.allowGpu and platform.system().lower() == "linux":
shouldUpdateDaemon = not docker.isDaemonFileUpdated()
dule1322 marked this conversation as resolved.
Show resolved Hide resolved

if shouldUpdateDaemon:
userApproval = ui.clickPrompt(
dule1322 marked this conversation as resolved.
Show resolved Hide resolved
"NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU "
"(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). "
"\nDo you want Coretex CLI to apply a workaround for this bug "
"(NOTE: This requires docker daemon restart)? (Y/n)",
type = bool,
default = True
)

if userApproval:
docker.updateDaemonFile()
restartPrompt = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True)
dule1322 marked this conversation as resolved.
Show resolved Hide resolved

if restartPrompt:
docker.restartDocker()
else:
ui.warningEcho(
"Warning: The changes will not take effect until Docker is restarted. "
"(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)"
)
else:
ui.warningEcho(
"Warning: Not updating the daemon.json file may lead to GPU access issues in Docker "
"containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)"
)

if imageType == ImageType.official:
tag = "gpu" if nodeConfig.allowGpu else "cpu"
nodeConfig.image += f":latest-{tag}"
Expand Down
59 changes: 57 additions & 2 deletions coretex/utils/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import json
import platform
import tempfile

from .process import command, CommandException
from ..statistics import getTotalSwapMemory
Expand All @@ -22,7 +23,7 @@ def isDockerAvailable() -> None:

def networkExists(name: str) -> bool:
# This function inspects the specified Docker network using the
# 'docker network inspect' command. If the command exits with a return code
# "docker network inspect" command. If the command exits with a return code
# of 0, indicating success, the function returns True, meaning the network exists.
# If the command exits with a non-zero return code, indicating failure,
# the function returns False, meaning the network doesn't exist.
Expand Down Expand Up @@ -98,7 +99,7 @@ def start(

runCommand = [
"docker", "run", "-d",
"--restart", 'always',
"--restart", "always",
"-p", "21000:21000",
"--cap-add", "SYS_PTRACE",
"--network", name,
Expand Down Expand Up @@ -198,3 +199,57 @@ def getLogs(name: str, tail: Optional[int], follow: bool, timestamps: bool) -> N
runCommand.append("-f")

command(runCommand)


def isDockerDesktop() -> bool:
    """
    Report whether the active Docker context is Docker Desktop.

    Runs ``docker context show`` and compares the text it reports with
    ``desktop-linux``.

    Returns
    -------
    bool
        True when the reported context is exactly "desktop-linux";
        False otherwise, including when the probe itself fails.
    """

    try:
        # NOTE(review): assumes command() returns a 3-tuple whose second item
        # is the captured stdout even when ignoreStdout is set - confirm
        # against the process module.
        _, output, _ = command(["docker", "context", "show"], ignoreStdout = True, ignoreStderr = True)
        return output.strip() == "desktop-linux"
    except Exception:
        # Best-effort probe: any failure (docker missing, command error,
        # unexpected return value) is treated as "not Docker Desktop".
        # Exception is caught instead of using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        return False


def isDaemonFileUpdated(daemonFile: str = "/etc/docker/daemon.json") -> bool:
    """
    Check whether the Docker daemon configuration already contains the
    "native.cgroupdriver=cgroupfs" entry in its "exec-opts" list.

    Parameters
    ----------
    daemonFile : str
        path to the Docker daemon configuration file,
        defaults to "/etc/docker/daemon.json"

    Returns
    -------
    bool
        True only when the file exists, holds a JSON object and its
        "exec-opts" value is a list containing the cgroup entry.
        A missing file, invalid JSON, a non-object document or a
        non-list "exec-opts" value all yield False.
    """

    cGroupFix = "native.cgroupdriver=cgroupfs"

    # Open directly instead of checking for existence first, so the file
    # disappearing between the check and the open cannot raise.
    try:
        with open(daemonFile, "r") as file:
            config = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        return False

    # Valid JSON is not necessarily an object (it can be a list, string, ...)
    if not isinstance(config, dict):
        return False

    execOpts = config.get("exec-opts", [])

    # Guard against null/str values: "in" on a str would do a substring match
    # and on None would raise TypeError.
    return isinstance(execOpts, list) and cGroupFix in execOpts


def updateDaemonFile() -> None:
    """
    Add the "native.cgroupdriver=cgroupfs" entry to the "exec-opts" list of
    /etc/docker/daemon.json, creating the configuration if it does not exist.

    The new configuration is written to a temporary file which is then moved
    over the daemon file with "sudo mv", since the target location is
    protected. Existing keys of a valid configuration are preserved; a file
    that is missing, is not valid JSON or does not hold a JSON object is
    replaced by a configuration containing only "exec-opts".
    """

    daemonFile = "/etc/docker/daemon.json"
    cGroupFix = "native.cgroupdriver=cgroupfs"
    config: Dict[str, Any] = {}

    # Only read the file when it exists - a missing daemon.json simply means
    # we start from an empty configuration instead of failing on open().
    if Path(daemonFile).exists():
        with open(daemonFile, "r") as file:
            try:
                loaded = json.load(file)
            except json.JSONDecodeError:
                loaded = {}

        # Valid JSON is not necessarily an object (it can be a list, ...)
        if isinstance(loaded, dict):
            config = loaded

    execOpts = config.get("exec-opts", [])
    if not isinstance(execOpts, list):
        execOpts = []

    # Do not add the entry twice when this function is called repeatedly
    if cGroupFix not in execOpts:
        execOpts.append(cGroupFix)

    config["exec-opts"] = execOpts

    with tempfile.NamedTemporaryFile("w", delete = False) as tempFile:
        json.dump(config, tempFile, indent = 4)
        tempFilePath = tempFile.name

    try:
        # Use sudo to move the temporary file to the protected location
        # NOTE(review): mv keeps the temporary file's owner and mode, so the
        # resulting daemon.json presumably stays owned by the invoking user -
        # confirm whether ownership/permissions should be reset afterwards.
        command(["sudo", "mv", tempFilePath, daemonFile], ignoreStderr = True, ignoreStdout = True)
    finally:
        # After a successful move the temporary path no longer exists, so this
        # only removes the leftover file when the move did not happen.
        leftover = Path(tempFilePath)
        if leftover.exists():
            leftover.unlink()


def restartDocker() -> None:
    """
    Restart the "docker" service by running "systemctl restart docker"
    through sudo, without forwarding the command's stdout/stderr.
    """

    restartArgs = ["sudo", "systemctl", "restart", "docker"]
    command(restartArgs, ignoreStdout = True, ignoreStderr = True)
Loading