Skip to content

Commit

Permalink
Release 1.2.0, adding MIG support
Browse files Browse the repository at this point in the history
- add MIG support
- provide random CPU core assignment
- incorporate upstream changes allowing listing of free and used GPUs
  • Loading branch information
saviola777 committed Apr 1, 2022
1 parent dfec279 commit 02c0ea5
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 64 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
userdocker-diffproml Version 1.2.0
==================================

This is a minor release.

New features and changes:
-------------------------

- fix problem where the ``--name`` parameter was ignored
- add support for NVIDIA MIG


userdocker-diffproml Version 1.1.0
==================================

Expand Down
4 changes: 2 additions & 2 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
MIT License

userdocker Copyright (c) 2017-2018 Jörn Hees
userdocker-diffproml Copyright (c) 2018 Johannes Rückert
userdocker Copyright (c) 2017-2022 Jörn Hees
userdocker-diffproml Copyright (c) 2018-2022 Johannes Rückert

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
8 changes: 4 additions & 4 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,9 @@ Sample Usage:
sudo userdocker pull debian
sudo userdocker load < image.tar.gz
# (nvidia-docker) extensions for nvidia GPU support
# (nvidia-docker) extensions for nvidia GPU support, see nvidia-smi -L for GPU IDs
alias nvidia-userdocker='userdocker --executor=nvidia-docker'
NV_GPU=1,3,7 nvidia-userdocker run -it --rm nvcr.io/nvidia/tensorflow
NV_GPU="GPU-37f6b436-5153-16af-0d26-88b96cd391bf,MIG-af8113ae-fec4-52d4-9cd7-299e6db5f9c6" nvidia-userdocker run -it --rm nvcr.io/nvidia/tensorflow
userdocker ps --gpu-used
userdocker ps --gpu-free
Expand All @@ -107,7 +107,7 @@ Features:
- enforce docker args
- restrict port publishing
- explicitly white-list available args to user
- restrict allowed GPU access / reservations via ``NV_GPU``
- restrict allowed GPU access / reservations via ``NV_GPU`` (using GPU IDs obtained from ``nvidia-smi -L``)

- System wide config + overrides for individual groups, gids, users, uids.
- Easy extensibility for further subcommands and args.
Expand All @@ -132,7 +132,7 @@ Afterwards, as ``userdocker-diffproml`` is written in python3 and not yet availa

.. code-block:: bash
sudo pip3 install -U https://github.com/saviola777/userdocker/archive/diffproml-1.0.0.tar.gz
sudo pip3 install -U https://github.com/saviola777/userdocker/archive/diffproml-1.2.0.tar.gz
The above is the preferable way of installation of the latest stable release.

Expand Down
2 changes: 1 addition & 1 deletion userdocker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
https://github.com/joernhees/userdocker
"""

__version__ = '1.1.0'
__version__ = '1.2.0'
4 changes: 3 additions & 1 deletion userdocker/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@
# flag, the user can append a "ro" to guard herself (even for
# VOLUME_MOUNTS_ALWAYS).
# Example:
# VOLUME_MOUNTS_DEFAULT = ['/netscratch:/netscratch', '/data:/input:ro']
# VOLUME_MOUNTS_DEFAULT = ['/scratch:/scratch', '/data:/input:ro', '/foo:']
VOLUME_MOUNTS_ALWAYS = []
VOLUME_MOUNTS_AVAILABLE = []
VOLUME_MOUNTS_DEFAULT = [
Expand Down Expand Up @@ -271,6 +271,8 @@
]
}

# How many CPU cores to assign to a container by default, 0 for all
NUM_CPUS_DEFAULT = 0

# nvidia docker specific settings
# The following settings allow to restrict the way users can use nvidia GPUs
Expand Down
2 changes: 1 addition & 1 deletion userdocker/helpers/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def init_cmd(args) -> list:
- user arguments,
- ARGS_DEFAULT if the user did not disable default arguments.
Later arguments to not overwrite earlier arguments.
Later arguments do not overwrite earlier arguments.
Args:
args (argparse.Namespace): Collected user arguments as well as executor
Expand Down
129 changes: 92 additions & 37 deletions userdocker/helpers/nvidia.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,90 @@
import re
from collections import defaultdict
from typing import Tuple
from xml.etree import ElementTree as ET

from ..config import uid, NVIDIA_SMI, NV_ALLOWED_GPUS, \
NV_EXCLUSIVE_CONTAINER_GPU_RESERVATION, \
NV_GPU_UNAVAILABLE_ABOVE_MEMORY_USED
from .logger import logger
from .execute import exec_cmd
from .container import container_get_running, container_find_userdocker_user_uid
from .container import container_get_running


def get_gpu_and_mig_uuids(nvidia_smi) -> dict:
    """Extract GPU and MIG UUIDs from ``nvidia-smi --list-gpus``.

    Args:
        nvidia_smi (str): Path to the ``nvidia-smi`` binary.

    Returns:
        ``dict`` mapping each GPU UUID to the (possibly empty) list of
        MIG UUIDs of the MIG devices hosted on that GPU, in listing order.
    """
    gpu_list_str = exec_cmd(
        [nvidia_smi, '--list-gpus'],
        return_status=False,
        loglvl=logging.DEBUG,
    )

    # GPU and MIG UUIDs share the same hex layout, only the prefix differs;
    # compile once instead of re-building the pattern for every line.
    hex_uuid = '[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}'
    gpu_re = re.compile('(GPU-%s)' % hex_uuid)
    mig_re = re.compile('(MIG-%s)' % hex_uuid)

    gpu_uuid = None
    mig_uuids = {}
    for line in gpu_list_str.split("\n"):
        gpu_match = gpu_re.search(line)
        if gpu_match is not None:
            gpu_uuid = gpu_match.group(1)
            mig_uuids[gpu_uuid] = []
            # a GPU line carries no MIG UUID, skip the MIG pattern
            continue

        mig_match = mig_re.search(line)
        if mig_match is not None:
            if gpu_uuid is None:
                # Malformed output: a MIG line before any GPU line would
                # previously raise KeyError on mig_uuids[None]; skip it.
                continue
            mig_uuids[gpu_uuid].append(mig_match.group(1))

    return mig_uuids


def extract_gis_with_memory(nvidia_smi, mig_uuids) -> dict:
    """Query per-GPU-instance framebuffer memory usage via ``nvidia-smi -q -x``.

    Args:
        nvidia_smi (str): Path to the ``nvidia-smi`` binary.
        mig_uuids (dict): Mapping of GPU UUIDs to lists of MIG UUIDs, as
            returned by ``get_gpu_and_mig_uuids``.

    Returns:
        ``dict`` mapping GPU instance UUIDs (the MIG UUIDs for a
        MIG-enabled GPU, otherwise the GPU UUID itself) to used
        framebuffer memory in MiB.
    """

    def _used_mib(elem) -> int:
        # Memory fields look like "123 MiB"; take the leading number
        # instead of slicing a fixed-width " MiB" suffix off the end.
        return int(elem.text.split()[0])

    gis = {}

    for gpu_uuid in mig_uuids:
        gpu_info_str = exec_cmd(
            [nvidia_smi,
             '--query',
             '--id=' + gpu_uuid,
             '--xml-format'],
            return_status=False,
            loglvl=logging.DEBUG,
        )

        gpu = ET.fromstring(gpu_info_str).find('gpu')

        if gpu.find(".//current_mig").text == "Enabled":
            # Pair MIG XML nodes with the MIG UUIDs positionally: both
            # nvidia-smi -L and -q -x report MIG devices in index order.
            # zip also guards against an index error if the counts differ.
            migs = gpu.find("mig_devices").findall("mig_device")
            for mig_uuid, mig in zip(mig_uuids[gpu_uuid], migs):
                gis[mig_uuid] = _used_mib(mig.find("fb_memory_usage/used"))
        else:
            gis[gpu.find("uuid").text] = \
                _used_mib(gpu.find("fb_memory_usage/used"))

    return gis


def container_find_userdocker_user_uid_gpus(container_env):
    """Extract userdocker metadata from a container's environment.

    Args:
        container_env (list): Environment entries of the form ``KEY=VALUE``.

    Returns:
        tuple: ``(user, uid, gpus)`` where ``user`` is the value of
        ``USERDOCKER_USER`` (``''`` if absent), ``uid`` is the value of
        ``USERDOCKER_UID`` (``None`` if absent) and ``gpus`` is the list
        of GPU UUIDs from ``USERDOCKER_NV_GPU`` (``[]`` if absent or empty).
    """
    pairs = [var.partition('=') for var in container_env]
    users = [v for k, _, v in pairs if k == 'USERDOCKER_USER']
    uids = [v for k, _, v in pairs if k == 'USERDOCKER_UID']
    gpus = [v for k, _, v in pairs if k == 'USERDOCKER_NV_GPU']
    if gpus:
        # Drop empty entries so an empty USERDOCKER_NV_GPU value yields
        # [] instead of [''] (''.split(',') returns ['']).
        gpus = [g for g in gpus[0].split(',') if g]
    return users[0] if users else '', uids[0] if uids else None, gpus


def nvidia_get_gpus_used_by_containers(docker: str) -> defaultdict:
Expand All @@ -35,35 +112,25 @@ def nvidia_get_gpus_used_by_containers(docker: str) -> defaultdict:
gpu_used_by_containers_str = exec_cmd(
[
docker, 'inspect', '--format',
'[{{json .Name}}, {{json .Id}}, {{json .Config.Env}}, '
'{{json $.HostConfig.Devices}}]'
'[{{json .Name}}, {{json .Id}}, {{json .Config.Env}}]'
] + running_containers,
return_status=False,
loglvl=logging.DEBUG,
)
logger.debug('gpu_used_by_containers_str: %s', gpu_used_by_containers_str)

gpu_dev_id_re = re.compile('^/dev/nvidia([0-9]+)$')
for line in gpu_used_by_containers_str.splitlines():
container_name, container, container_env, devs = json.loads(line)

# Skip of no devs found
if devs is None:
continue

for dev in devs:
d = dev.get('PathOnHost', '')
m = gpu_dev_id_re.match(d)
if m:
gpu_id = int(m.groups()[0])
container_user, container_uid = container_find_userdocker_user_uid(container_env)
gpu_used_by_containers[gpu_id].append(
(container, container_name, container_user, container_uid)
)
logger.debug(
'gpu %d used by container: %s, name: %s, user: %s, uid: %s',
gpu_id, container, container_name, container_user, container_uid
)
container_name, container, container_env = json.loads(line)
container_user, container_uid, gpus = \
container_find_userdocker_user_uid_gpus(container_env)
for gpu_uuid in gpus:
gpu_used_by_containers[gpu_uuid].append(
(container, container_name, container_user, container_uid)
)
logger.debug(
'gpu %d used by container: %s, name: %s, user: %s, uid: %s',
gpu_uuid, container, container_name, container_user, container_uid
)
return gpu_used_by_containers


Expand All @@ -82,20 +149,8 @@ def nvidia_get_available_gpus(docker: str, nvidia_smi: str=NVIDIA_SMI) -> Tuple[
if not NV_ALLOWED_GPUS:
return list(), list()

gpu_mem_used_str = exec_cmd(
[nvidia_smi,
'--query-gpu=index,memory.used,utilization.gpu',
'--format=csv'],
return_status=False,
loglvl=logging.DEBUG,
)
logger.debug('gpu usage:\n%s', gpu_mem_used_str)
gpu_mem_used = {}
for line in gpu_mem_used_str.splitlines()[1:]: # skip header
gpu, mem_used, gpu_utilization = line.split(', ')
gpu = int(gpu)
mem_used = int(mem_used.split(' MiB')[0])
gpu_mem_used[gpu] = mem_used
gpu_mem_used = extract_gis_with_memory(nvidia_smi, get_gpu_and_mig_uuids(nvidia_smi))
logger.debug('gpu usage:\n%s', gpu_mem_used)

gpus_used_by_containers = nvidia_get_gpus_used_by_containers(docker)
gpus_used_by_own_containers = [
Expand Down
Loading

0 comments on commit 02c0ea5

Please sign in to comment.