Skip to content

Commit

Permalink
Merge pull request #259 from bogdant36/CTX-6655
Browse files Browse the repository at this point in the history
CTX-6655: Fix node container losing access to gpu randomly.
  • Loading branch information
dule1322 authored Aug 27, 2024
2 parents a2ae22d + 8d0b69e commit 997e8b3
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 3 deletions.
35 changes: 34 additions & 1 deletion coretex/cli/modules/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import os
import logging
import requests
import platform

import click

Expand Down Expand Up @@ -376,6 +377,7 @@ def checkResourceLimitations() -> None:
def configureNode(advanced: bool) -> NodeConfiguration:
ui.highlightEcho("[Node Configuration]")
nodeConfig = NodeConfiguration({}) # create new empty node config
currentOS = platform.system().lower()

cpuLimit, ramLimit = docker.getResourceLimits()
swapLimit = docker.getDockerSwapLimit()
Expand All @@ -388,11 +390,42 @@ def configureNode(advanced: bool) -> NodeConfiguration:
else:
nodeConfig.image = "coretexai/coretex-node"

if isGPUAvailable():
# GPU Access is supported for:
# - Linux (Docker Engine)
# - Windows (Docker Desktop)

if isGPUAvailable() and not (docker.isDockerDesktop() and currentOS != "windows"):
nodeConfig.allowGpu = ui.clickPrompt("Do you want to allow the Node to access your GPU? (Y/n)", type = bool, default = True)
else:
nodeConfig.allowGpu = False

if nodeConfig.allowGpu and platform.system().lower() == "linux" and not docker.isDaemonFileUpdated():
shouldUpdateDockerConfig = ui.clickPrompt(
"NVIDIA has a bug where a docker container running Coretex Node can lose access to GPU "
"(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). "
"\nDo you want Coretex CLI to apply a workaround for this bug "
"(NOTE: This requires docker daemon restart)? (Y/n)",
type = bool,
default = True
)

if shouldUpdateDockerConfig:
docker.updateDaemonFile()
shouldRestartDocker = ui.clickPrompt("Do you want to restart Docker to apply the changes? (Y/n)", type = bool, default = True)

if shouldRestartDocker:
docker.restartDocker()
else:
ui.warningEcho(
"Warning: The changes will not take effect until Docker is restarted. "
"(https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)"
)
else:
ui.warningEcho(
"Warning: Not updating the daemon.json file may lead to GPU access issues in Docker "
"containers. (https://github.com/NVIDIA/nvidia-container-toolkit/issues/48)"
)

if imageType == ImageType.official:
tag = "gpu" if nodeConfig.allowGpu else "cpu"
nodeConfig.image += f":latest-{tag}"
Expand Down
73 changes: 71 additions & 2 deletions coretex/utils/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import json
import platform
import tempfile

from .process import command, CommandException
from ..statistics import getTotalSwapMemory
Expand All @@ -22,7 +23,7 @@ def isDockerAvailable() -> None:

def networkExists(name: str) -> bool:
# This function inspects the specified Docker network using the
# 'docker network inspect' command. If the command exits with a return code
# "docker network inspect" command. If the command exits with a return code
# of 0, indicating success, the function returns True, meaning the network exists.
# If the command exits with a non-zero return code, indicating failure,
# the function returns False, meaning the network doesn't exist.
Expand Down Expand Up @@ -98,7 +99,7 @@ def start(

runCommand = [
"docker", "run", "-d",
"--restart", 'always',
"--restart", "always",
"-p", "21000:21000",
"--cap-add", "SYS_PTRACE",
"--network", name,
Expand Down Expand Up @@ -203,3 +204,71 @@ def getLogs(name: str, tail: Optional[int], follow: bool, timestamps: bool) -> N
runCommand.append("-f")

command(runCommand)


def isDockerDesktop() -> bool:
    """
    Best-effort check of whether the local Docker installation is Docker Desktop.

    Runs "docker info --format {{json .}}" and looks for the substring
    "desktop" in the "Version" of the CLI plugins listed under
    "ClientInfo" -> "Plugins".

    Returns
    -------
    bool -> True if any CLI plugin version contains "desktop", False
    otherwise (including when "docker info" fails or its output
    cannot be parsed)
    """

    try:
        _, output, _ = command(["docker", "info", "--format", "{{json .}}"], ignoreStdout = True, ignoreStderr = True)
        jsonOutput = json.loads(output)
    except Exception:
        # Detection is best-effort: any failure to run or parse "docker info"
        # means "not Docker Desktop". "Exception" (instead of a bare except)
        # lets KeyboardInterrupt/SystemExit propagate.
        return False

    if not isinstance(jsonOutput, dict):
        return False

    clientInfo = jsonOutput.get("ClientInfo")
    if not isinstance(clientInfo, dict):
        return False

    pluginsInfo = clientInfo.get("Plugins")
    if isinstance(pluginsInfo, dict):
        # Keep supporting the single-object shape the previous implementation expected
        pluginsInfo = [pluginsInfo]

    # Docker CLI reports "Plugins" as an array with one object per CLI plugin
    if not isinstance(pluginsInfo, list):
        return False

    for pluginInfo in pluginsInfo:
        if not isinstance(pluginInfo, dict):
            continue

        versionInfo = pluginInfo.get("Version")
        if isinstance(versionInfo, str) and "desktop" in versionInfo:
            return True

    return False


def isDaemonFileUpdated(daemonFile: Path = Path("/etc/docker/daemon.json")) -> bool:
    """
    Checks whether the Docker daemon configuration already contains
    the "native.cgroupdriver=cgroupfs" exec option.

    Parameters
    ----------
    daemonFile : Path
        path to the Docker daemon configuration file,
        defaults to "/etc/docker/daemon.json"

    Returns
    -------
    bool -> True if the "exec-opts" list of the configuration contains
    the option, False if the file is missing, is not valid JSON,
    is not a JSON object, or has no such entry
    """

    cGroupFix = "native.cgroupdriver=cgroupfs"

    if not daemonFile.exists():
        return False

    try:
        with daemonFile.open("r") as file:
            config = json.load(file)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; an unreadable config counts as "not updated"
        return False

    # Valid JSON is not necessarily an object ("null", "[]", ...)
    if not isinstance(config, dict):
        return False

    execOpts = config.get("exec-opts")
    if not isinstance(execOpts, list):
        return False

    return cGroupFix in execOpts


def updateDaemonFile() -> None:
    """
    Adds the "native.cgroupdriver=cgroupfs" exec option to
    "/etc/docker/daemon.json", creating the configuration if the file
    is missing, is not valid JSON, or is not a JSON object.

    The new configuration is written to a temporary file first and then
    copied over the daemon file using "sudo", since the destination is
    a protected location. Docker has to be restarted separately for the
    change to take effect.
    """

    daemonFile = Path("/etc/docker/daemon.json")
    cGroupFix = "native.cgroupdriver=cgroupfs"
    config: Dict[str, Any] = {}

    # Only read the file if it exists, otherwise start from an empty config
    if daemonFile.exists():
        with daemonFile.open("r") as file:
            try:
                loadedConfig = json.load(file)
            except json.JSONDecodeError:
                loadedConfig = None

        # Valid JSON is not necessarily an object ("null", "[]", ...)
        if isinstance(loadedConfig, dict):
            config = loadedConfig

    execOpts = config.get("exec-opts")
    if not isinstance(execOpts, list):
        execOpts = []

    # Do not add the option twice if this function is called repeatedly
    if cGroupFix not in execOpts:
        execOpts.append(cGroupFix)

    config["exec-opts"] = execOpts

    # delete = False: the file must still exist after the "with" block
    # closes (and thereby flushes) it, so that it can be copied afterwards
    with tempfile.NamedTemporaryFile("w", delete = False) as tempFile:
        json.dump(config, tempFile, indent = 4)
        tempFilePath = Path(tempFile.name)

    try:
        # Temporary files are created with mode 0600; widen it so a newly
        # created daemon.json stays readable for non-root users
        tempFilePath.chmod(0o644)

        # Use sudo to copy the temporary file to the protected location.
        # Copying (instead of moving) leaves the destination owned by root
        # rather than by the user who created the temporary file.
        command(["sudo", "cp", str(tempFilePath), str(daemonFile)], ignoreStderr = True, ignoreStdout = True)
    finally:
        try:
            tempFilePath.unlink()
        except FileNotFoundError:
            pass


def restartDocker() -> None:
    """
    Restarts the Docker service by running
    "sudo systemctl restart docker".
    """

    restartCommand = ["sudo", "systemctl", "restart", "docker"]
    command(restartCommand, ignoreStdout = True, ignoreStderr = True)

0 comments on commit 997e8b3

Please sign in to comment.