From f91e7b7ee482bd265d37e15a58f9f0fad279d840 Mon Sep 17 00:00:00 2001 From: Jorge Date: Tue, 27 Sep 2022 14:57:09 -0400 Subject: [PATCH] Implement custom env variable injection into plugin containers --- .github/workflows/ci.yml | 0 Dockerfile | 0 LICENSE | 0 README.md | 0 docker-compose.yml | 0 kubernetes/pman_dev.yaml | 0 kubernetes/prod/extra/README | 0 kubernetes/prod/extra/job_creator/README | 0 kubernetes/prod/extra/job_creator/binding.yml | 0 kubernetes/prod/extra/job_creator/role.yml | 0 kubernetes/prod/extra/job_creator/sa.yml | 0 kubernetes/prod/kustomization.yaml | 0 kubernetes/prod/resources/pman.yaml | 0 kubernetes/prod/secrets/.pman.env | 0 openshift/README.rst | 0 openshift/example-config.cfg | 0 openshift/example-secret.yml | 0 ...pman-openshift-template-without-swift.json | 0 openshift/pman-openshift-template.json | 0 pman/__init__.py | 0 pman/__main__.py | 0 pman/abstractmgr.py | 2 +- pman/app.py | 0 pman/config.py | 0 pman/cromwell/__init__.py | 0 pman/cromwell/client.py | 0 pman/cromwell/models.py | 0 pman/cromwell/slurm/__init__.py | 0 pman/cromwell/slurm/wdl.py | 0 pman/cromwellmgr.py | 0 pman/kubernetesmgr.py | 24 ++++++++++++------- pman/openshiftmgr.py | 0 pman/resources.py | 13 +++++++--- pman/swarmmgr.py | 4 +++- pman/wsgi.py | 0 requirements/base.txt | 0 requirements/local.txt | 0 requirements/production.txt | 0 setup.cfg | 0 setup.py | 0 tests/__init__.py | 0 tests/cromwell/__init__.py | 0 tests/cromwell/examples/__init__.py | 0 tests/cromwell/examples/metadata.py | 0 tests/cromwell/examples/query.py | 0 tests/cromwell/examples/wdl.py | 0 tests/cromwell/helpers.py | 0 tests/cromwell/test_client.py | 0 tests/cromwell/test_cromwellmgr.py | 0 tests/cromwell/test_wdl.py | 0 tests/test_cmd.py | 0 tests/test_openshiftmgr.py | 0 tests/test_resources.py | 0 53 files changed, 30 insertions(+), 13 deletions(-) mode change 100644 => 100755 .github/workflows/ci.yml mode change 100644 => 100755 Dockerfile mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 docker-compose.yml mode change 100644 => 100755 kubernetes/pman_dev.yaml mode change 100644 => 100755 kubernetes/prod/extra/README mode change 100644 => 100755 kubernetes/prod/extra/job_creator/README mode change 100644 => 100755 kubernetes/prod/extra/job_creator/binding.yml mode change 100644 => 100755 kubernetes/prod/extra/job_creator/role.yml mode change 100644 => 100755 kubernetes/prod/extra/job_creator/sa.yml mode change 100644 => 100755 kubernetes/prod/kustomization.yaml mode change 100644 => 100755 kubernetes/prod/resources/pman.yaml mode change 100644 => 100755 kubernetes/prod/secrets/.pman.env mode change 100644 => 100755 openshift/README.rst mode change 100644 => 100755 openshift/example-config.cfg mode change 100644 => 100755 openshift/example-secret.yml mode change 100644 => 100755 openshift/pman-openshift-template-without-swift.json mode change 100644 => 100755 openshift/pman-openshift-template.json mode change 100644 => 100755 pman/__init__.py mode change 100644 => 100755 pman/__main__.py mode change 100644 => 100755 pman/abstractmgr.py mode change 100644 => 100755 pman/app.py mode change 100644 => 100755 pman/config.py mode change 100644 => 100755 pman/cromwell/__init__.py mode change 100644 => 100755 pman/cromwell/client.py mode change 100644 => 100755 pman/cromwell/models.py mode change 100644 => 100755 pman/cromwell/slurm/__init__.py mode change 100644 => 100755 pman/cromwell/slurm/wdl.py mode change 100644 => 100755 pman/cromwellmgr.py mode change 100644 => 100755 pman/kubernetesmgr.py mode change 100644 => 100755 pman/openshiftmgr.py mode change 100644 => 100755 pman/resources.py mode change 100644 => 100755 pman/swarmmgr.py mode change 100644 => 100755 pman/wsgi.py mode change 100644 => 100755 requirements/base.txt mode change 100644 => 100755 requirements/local.txt mode change 100644 => 100755 requirements/production.txt mode change 100644 => 100755 setup.cfg mode change 100644 => 100755 setup.py mode change 100644 => 100755 tests/__init__.py mode change 100644 => 100755 tests/cromwell/__init__.py mode change 100644 => 100755 tests/cromwell/examples/__init__.py mode change 100644 => 100755 tests/cromwell/examples/metadata.py mode change 100644 => 100755 tests/cromwell/examples/query.py mode change 100644 => 100755 tests/cromwell/examples/wdl.py mode change 100644 => 100755 tests/cromwell/helpers.py mode change 100644 => 100755 tests/cromwell/test_client.py mode change 100644 => 100755 tests/cromwell/test_cromwellmgr.py mode change 100644 => 100755 tests/cromwell/test_wdl.py mode change 100644 => 100755 tests/test_cmd.py mode change 100644 => 100755 tests/test_openshiftmgr.py mode change 100644 => 100755 tests/test_resources.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml old mode 100644 new mode 100755 diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/docker-compose.yml b/docker-compose.yml old mode 100644 new mode 100755 diff --git a/kubernetes/pman_dev.yaml b/kubernetes/pman_dev.yaml old mode 100644 new mode 100755 diff --git a/kubernetes/prod/extra/README b/kubernetes/prod/extra/README old mode 100644 new mode 100755 diff --git a/kubernetes/prod/extra/job_creator/README b/kubernetes/prod/extra/job_creator/README old mode 100644 new mode 100755 diff --git a/kubernetes/prod/extra/job_creator/binding.yml b/kubernetes/prod/extra/job_creator/binding.yml old mode 100644 new mode 100755 diff --git a/kubernetes/prod/extra/job_creator/role.yml b/kubernetes/prod/extra/job_creator/role.yml old mode 100644 new mode 100755 diff --git a/kubernetes/prod/extra/job_creator/sa.yml b/kubernetes/prod/extra/job_creator/sa.yml old mode 100644 new mode 100755 diff --git a/kubernetes/prod/kustomization.yaml b/kubernetes/prod/kustomization.yaml old mode 100644 new mode 100755 diff --git a/kubernetes/prod/resources/pman.yaml b/kubernetes/prod/resources/pman.yaml old mode 100644 new mode 100755 diff --git a/kubernetes/prod/secrets/.pman.env b/kubernetes/prod/secrets/.pman.env old mode 100644 new mode 100755 diff --git a/openshift/README.rst b/openshift/README.rst old mode 100644 new mode 100755 diff --git a/openshift/example-config.cfg b/openshift/example-config.cfg old mode 100644 new mode 100755 diff --git a/openshift/example-secret.yml b/openshift/example-secret.yml old mode 100644 new mode 100755 diff --git a/openshift/pman-openshift-template-without-swift.json b/openshift/pman-openshift-template-without-swift.json old mode 100644 new mode 100755 diff --git a/openshift/pman-openshift-template.json b/openshift/pman-openshift-template.json old mode 100644 new mode 100755 diff --git a/pman/__init__.py b/pman/__init__.py old mode 100644 new mode 100755 diff --git a/pman/__main__.py b/pman/__main__.py old mode 100644 new mode 100755 diff --git a/pman/abstractmgr.py b/pman/abstractmgr.py old mode 100644 new mode 100755 index a919dc28..89c25fa9 --- a/pman/abstractmgr.py +++ b/pman/abstractmgr.py @@ -85,7 +85,7 @@ def __init__(self, config_dict: dict = None): @abstractmethod def schedule_job(self, image: Image, command: List[str], name: JobName, - resources_dict: Resources, mountdir: Optional[str] = None) -> J: + resources_dict: Resources, env: List[str], mountdir: Optional[str] = None) -> J: """ Schedule a new job and return the job object. """ diff --git a/pman/app.py b/pman/app.py old mode 100644 new mode 100755 diff --git a/pman/config.py b/pman/config.py old mode 100644 new mode 100755 diff --git a/pman/cromwell/__init__.py b/pman/cromwell/__init__.py old mode 100644 new mode 100755 diff --git a/pman/cromwell/client.py b/pman/cromwell/client.py old mode 100644 new mode 100755 diff --git a/pman/cromwell/models.py b/pman/cromwell/models.py old mode 100644 new mode 100755 diff --git a/pman/cromwell/slurm/__init__.py b/pman/cromwell/slurm/__init__.py old mode 100644 new mode 100755 diff --git a/pman/cromwell/slurm/wdl.py b/pman/cromwell/slurm/wdl.py old mode 100644 new mode 100755 diff --git a/pman/cromwellmgr.py b/pman/cromwellmgr.py old mode 100644 new mode 100755 diff --git a/pman/kubernetesmgr.py b/pman/kubernetesmgr.py old mode 100644 new mode 100755 index 6cf26fce..c085c255 --- a/pman/kubernetesmgr.py +++ b/pman/kubernetesmgr.py @@ -25,11 +25,13 @@ def __init__(self, config_dict=None): self.kube_client = k_client.CoreV1Api() self.kube_v1_batch_client = k_client.BatchV1Api() - def schedule_job(self, image, command, name, resources_dict, mountdir=None) -> V1Job: + def schedule_job(self, image, command, name, resources_dict, env, mountdir=None) -> \ + V1Job: """ Schedule a new job and return the job object. """ - job_instance = self.create_job(image, command, name, resources_dict, mountdir) + job_instance = self.create_job(image, command, name, resources_dict, env, + mountdir) job = self.submit_job(job_instance) return job @@ -58,7 +60,7 @@ def get_job_logs(self, job: V1Job, tail: int) -> AnyStr: # and return immediately. term_reason = self.__get_termination_reason(pod_item) if term_reason is not None: - if term_reason is not 'Completed': + if term_reason != 'Completed': logs += f'\n{term_reason}' return logs return logs @@ -117,7 +119,8 @@ def remove_job(self, job): self.kube_v1_batch_client.delete_namespaced_job(job.metadata.name, body=body, namespace=job_namespace) - def create_job(self, image, command, name, resources_dict, mountdir=None) -> V1Job: + def create_job(self, image, command, name, resources_dict, env_l, mountdir=None) -> \ + V1Job: """ Create and return a new job instance. """ @@ -133,14 +136,19 @@ def create_job(self, image, command, name, resources_dict, mountdir=None) -> V1J # > request, Kubernetes automatically assigns a memory request that matches the limit. limits = {'memory': memory_limit, 'cpu': cpu_limit} + env = [] + for s in env_l: + key, val = s.split('=', 1) + env.append(k_client.V1EnvVar(name=key, value=val)) + if gpu_limit > 0: # ref: https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/ limits['nvidia.com/gpu'] = gpu_limit - env = [k_client.V1EnvVar(name='NVIDIA_VISIBLE_DEVICES', value='all'), - k_client.V1EnvVar(name='NVIDIA_DRIVER_CAPABILITIES', - value='compute,utility'), - k_client.V1EnvVar(name='NVIDIA_REQUIRE_CUDA', value='cuda>=9.0')], + env.append(k_client.V1EnvVar(name='NVIDIA_VISIBLE_DEVICES', value='all')) + env.append(k_client.V1EnvVar(name='NVIDIA_DRIVER_CAPABILITIES', + value='compute,utility')) + env.append(k_client.V1EnvVar(name='NVIDIA_REQUIRE_CUDA', value='cuda>=9.0')) security_context = { 'allow_privilege_escalation': False, diff --git a/pman/openshiftmgr.py b/pman/openshiftmgr.py old mode 100644 new mode 100755 diff --git a/pman/resources.py b/pman/resources.py old mode 100644 new mode 100755 index 0d00e148..90b703fe --- a/pman/resources.py +++ b/pman/resources.py @@ -17,7 +17,8 @@ parser = reqparse.RequestParser(bundle_errors=True) parser.add_argument('jid', dest='jid', required=True) parser.add_argument('args', dest='args', type=list, location='json', required=True) -parser.add_argument('args_path_flags', dest='args_path_flags', type=frozenset, location='json', required=False, default=frozenset()) +parser.add_argument('args_path_flags', dest='args_path_flags', type=frozenset, + location='json', required=False, default=frozenset()) parser.add_argument('auid', dest='auid', required=True) parser.add_argument('number_of_workers', dest='number_of_workers', type=int, required=True) @@ -25,8 +26,10 @@ parser.add_argument('memory_limit', dest='memory_limit', type=int, required=True) parser.add_argument('gpu_limit', dest='gpu_limit', type=int, required=True) parser.add_argument('image', dest='image', required=True) -parser.add_argument('entrypoint', dest='entrypoint', type=list, location='json', required=True) +parser.add_argument('entrypoint', dest='entrypoint', type=list, location='json', + required=True) parser.add_argument('type', dest='type', choices=('ds', 'fs', 'ts'), required=True) +parser.add_argument('env', dest='env', type=list, location='json', default=[]) def get_compute_mgr(container_env): @@ -67,6 +70,10 @@ def post(self): if len(args.entrypoint) == 0: abort(400, message='"entrypoint" cannot be empty') + for s in args.env: + if len(s.split('=', 1)) != 2: + abort(400, message='"env" must be a list of "key=value" strings') + job_id = args.jid.lstrip('/') cmd = self.build_app_cmd(args.args, args.args_path_flags, args.entrypoint, args.type) @@ -87,7 +94,7 @@ def post(self): compute_mgr = get_compute_mgr(self.container_env) try: job = compute_mgr.schedule_job(args.image, cmd, job_id, resources_dict, - share_dir) + args.env, share_dir) except ManagerException as e: logger.error(f'Error from {self.container_env} while scheduling job ' f'{job_id}, detail: {str(e)}') diff --git a/pman/swarmmgr.py b/pman/swarmmgr.py old mode 100644 new mode 100755 index a1d6c094..99aeb613 --- a/pman/swarmmgr.py +++ b/pman/swarmmgr.py @@ -19,7 +19,8 @@ def __init__(self, config_dict=None): else: self.docker_client = docker.from_env(environment=self.config) - def schedule_job(self, image, command, name, resources_dict, mountdir=None) -> Service: + def schedule_job(self, image, command, name, resources_dict, env, mountdir=None) -> \ + Service: """ Schedule a new job and return the job (swarm service) object. """ @@ -30,6 +31,7 @@ def schedule_job(self, image, command, name, resources_dict, mountdir=None) -> S try: job = self.docker_client.services.create(image, command, name=name, + env=env, mounts=mounts, restart_policy=restart_policy, tty=True) diff --git a/pman/wsgi.py b/pman/wsgi.py old mode 100644 new mode 100755 diff --git a/requirements/base.txt b/requirements/base.txt old mode 100644 new mode 100755 diff --git a/requirements/local.txt b/requirements/local.txt old mode 100644 new mode 100755 diff --git a/requirements/production.txt b/requirements/production.txt old mode 100644 new mode 100755 diff --git a/setup.cfg b/setup.cfg old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 diff --git a/tests/__init__.py b/tests/__init__.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/__init__.py b/tests/cromwell/__init__.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/examples/__init__.py b/tests/cromwell/examples/__init__.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/examples/metadata.py b/tests/cromwell/examples/metadata.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/examples/query.py b/tests/cromwell/examples/query.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/examples/wdl.py b/tests/cromwell/examples/wdl.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/helpers.py b/tests/cromwell/helpers.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/test_client.py b/tests/cromwell/test_client.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/test_cromwellmgr.py b/tests/cromwell/test_cromwellmgr.py old mode 100644 new mode 100755 diff --git a/tests/cromwell/test_wdl.py b/tests/cromwell/test_wdl.py old mode 100644 new mode 100755 diff --git a/tests/test_cmd.py b/tests/test_cmd.py old mode 100644 new mode 100755 diff --git a/tests/test_openshiftmgr.py b/tests/test_openshiftmgr.py old mode 100644 new mode 100755 diff --git a/tests/test_resources.py b/tests/test_resources.py old mode 100644 new mode 100755