diff --git a/config/clusters/2i2c-aws-us/daskhub-common.values.yaml b/config/clusters/2i2c-aws-us/daskhub-common.values.yaml index f79cf466fc..ac74f8094f 100644 --- a/config/clusters/2i2c-aws-us/daskhub-common.values.yaml +++ b/config/clusters/2i2c-aws-us/daskhub-common.values.yaml @@ -17,7 +17,15 @@ basehub: - noresvport serverIP: fs-0b70db2b65209a77d.efs.us-west-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: scheduling: userScheduler: enabled: true + custom: + daskhubSetup: + enabled: true + singleuser: + cloudMetadata: + blockWithIptables: false diff --git a/config/clusters/2i2c/daskhub-common.values.yaml b/config/clusters/2i2c/daskhub-common.values.yaml index 0841fd8668..5fc0f5469e 100644 --- a/config/clusters/2i2c/daskhub-common.values.yaml +++ b/config/clusters/2i2c/daskhub-common.values.yaml @@ -9,3 +9,12 @@ basehub: serverIP: 10.234.45.250 # MUST HAVE TRAILING SLASH baseShareName: /homes/homes/ + dask-gateway: + enabled: true + jupyterhub: + custom: + daskhubSetup: + enabled: true + singleuser: + cloudMetadata: + blockWithIptables: false diff --git a/config/clusters/awi-ciroh/common.values.yaml b/config/clusters/awi-ciroh/common.values.yaml index 7f20efc9b3..dfcc0fceae 100644 --- a/config/clusters/awi-ciroh/common.values.yaml +++ b/config/clusters/awi-ciroh/common.values.yaml @@ -10,8 +10,24 @@ basehub: serverIP: 10.11.233.234 # Name of Google Filestore share baseShareName: /homes/ + dask-gateway: + enabled: true + gateway: + backend: + scheduler: + cores: + request: 0.8 + limit: 1 + memory: + request: 1G + limit: 2G jupyterhub: + singleuser: + cloudMetadata: + blockWithIptables: false custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -48,13 +64,3 @@ basehub: - arpita0911patel - sepehrkrz - benlee0423 -dask-gateway: - gateway: - backend: - scheduler: - cores: - request: 0.8 - limit: 1 - memory: - request: 1G - limit: 2G diff --git a/config/clusters/earthscope/common.values.yaml b/config/clusters/earthscope/common.values.yaml index e31d18a187..bdc8204ef8 100644 --- a/config/clusters/earthscope/common.values.yaml +++ b/config/clusters/earthscope/common.values.yaml @@ -13,8 +13,12 @@ basehub: - noresvport serverIP: fs-08e7747330d833d82.efs.us-east-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "google" @@ -136,6 +140,8 @@ basehub: # Chad Trebant, https://2i2c.freshdesk.com/a/tickets/1279 - google-oauth2|117718799995701713253 singleuser: + cloudMetadata: + blockWithIptables: false profileList: - display_name: "Shared Small: 1-4 CPU, 8-32 GB" description: "A shared machine, the recommended option until you experience a limitation." diff --git a/config/clusters/gridsst/common.values.yaml b/config/clusters/gridsst/common.values.yaml index 7a8871aa50..1460287560 100644 --- a/config/clusters/gridsst/common.values.yaml +++ b/config/clusters/gridsst/common.values.yaml @@ -13,8 +13,12 @@ basehub: - noresvport serverIP: fs-05f68d7e096d7cf16.efs.us-west-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -73,6 +77,8 @@ basehub: - nikki-t - dgumustel singleuser: + cloudMetadata: + blockWithIptables: false profileList: # The mem-guarantees are here so k8s doesn't schedule other pods # on these nodes. 
diff --git a/config/clusters/hhmi/cluster.yaml b/config/clusters/hhmi/cluster.yaml index 2b3bd01c01..ef7492b644 100644 --- a/config/clusters/hhmi/cluster.yaml +++ b/config/clusters/hhmi/cluster.yaml @@ -21,7 +21,7 @@ hubs: domain: staging.hhmi.2i2c.cloud helm_chart: daskhub helm_chart_values_files: - - common.values.yaml + - daskhub-common.values.yaml - staging.values.yaml - enc-staging.secret.values.yaml - name: prod @@ -29,7 +29,7 @@ hubs: domain: hhmi.2i2c.cloud helm_chart: daskhub helm_chart_values_files: - - common.values.yaml + - daskhub-common.values.yaml - prod.values.yaml - enc-prod.secret.values.yaml - name: spyglass diff --git a/config/clusters/hhmi/common.values.yaml b/config/clusters/hhmi/daskhub-common.values.yaml similarity index 98% rename from config/clusters/hhmi/common.values.yaml rename to config/clusters/hhmi/daskhub-common.values.yaml index 903cb230c3..6310a322ec 100644 --- a/config/clusters/hhmi/common.values.yaml +++ b/config/clusters/hhmi/daskhub-common.values.yaml @@ -8,8 +8,12 @@ basehub: - noatime serverIP: 10.55.112.74 baseShareName: /homes/ + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -31,6 +35,8 @@ basehub: url: "" custom_html: HHMI and Stratos singleuser: + cloudMetadata: + blockWithIptables: false extraContainers: - name: mysql image: datajoint/mysql:8.0 # following the spyglass tutorial at https://lorenfranklab.github.io/spyglass/latest/notebooks/00_Setup/#existing-database diff --git a/config/clusters/jupyter-meets-the-earth/cluster.yaml b/config/clusters/jupyter-meets-the-earth/cluster.yaml index 99cad4e452..cdbdc77dbf 100644 --- a/config/clusters/jupyter-meets-the-earth/cluster.yaml +++ b/config/clusters/jupyter-meets-the-earth/cluster.yaml @@ -16,7 +16,7 @@ hubs: domain: staging.jmte.2i2c.cloud helm_chart: daskhub helm_chart_values_files: - - common.values.yaml + - daskhub-common.values.yaml - staging.values.yaml - enc-staging.secret.values.yaml - name: prod @@ -24,6 +24,6 @@ hubs: domain: jmte.2i2c.cloud helm_chart: daskhub helm_chart_values_files: - - common.values.yaml + - daskhub-common.values.yaml - prod.values.yaml - enc-prod.secret.values.yaml diff --git a/config/clusters/jupyter-meets-the-earth/common.values.yaml b/config/clusters/jupyter-meets-the-earth/daskhub-common.values.yaml similarity index 93% rename from config/clusters/jupyter-meets-the-earth/common.values.yaml rename to config/clusters/jupyter-meets-the-earth/daskhub-common.values.yaml index 8724e36a49..3c676e9162 100644 --- a/config/clusters/jupyter-meets-the-earth/common.values.yaml +++ b/config/clusters/jupyter-meets-the-earth/daskhub-common.values.yaml @@ -14,9 +14,28 @@ basehub: serverIP: fs-01707b06.efs.us-west-2.amazonaws.com # This is different from rest of our hubs! baseShareName: / - + dask-gateway: + enabled: true + gateway: + backend: + scheduler: + # IMPORTANT: We have experienced that the scheduler can fail with + # a 1GB memory limit. This was observed as "stream closed" errors + # from the python client working against the + # Dask-Gateway created DaskCluster.
+ # + # CommClosedError: in : Stream is closed + # + cores: + request: 1 + limit: 64 + memory: + request: 2G + limit: 500G jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -39,6 +58,8 @@ basehub: url: https://jupytearth.org singleuser: + cloudMetadata: + blockWithIptables: false initContainers: # Need to explicitly set this up and copy what's in basehub/values.yaml # as we have an extra 'shared-public' directory here. @@ -276,21 +297,3 @@ basehub: scheduling: userScheduler: enabled: true - -dask-gateway: - gateway: - backend: - scheduler: - # IMPORTANT: We have experienced that the scheduler can fail with - # 1GB memory limit. This was observed "stream closed" - # from the python client working against the - # Dask-Gateway created DaskCluster. - # - # CommClosedError: in : Stream is closed - # - cores: - request: 1 - limit: 64 - memory: - request: 2G - limit: 500G diff --git a/config/clusters/leap/cluster.yaml b/config/clusters/leap/cluster.yaml index da5d81471e..6450bf6338 100644 --- a/config/clusters/leap/cluster.yaml +++ b/config/clusters/leap/cluster.yaml @@ -21,7 +21,7 @@ hubs: domain: staging.leap.2i2c.cloud helm_chart: daskhub helm_chart_values_files: - - common.values.yaml + - daskhub-common.values.yaml - staging.values.yaml - enc-staging.secret.values.yaml - name: prod @@ -29,7 +29,7 @@ hubs: domain: leap.2i2c.cloud helm_chart: daskhub helm_chart_values_files: - - common.values.yaml + - daskhub-common.values.yaml - prod.values.yaml - enc-prod.secret.values.yaml - name: public diff --git a/config/clusters/leap/common.values.yaml b/config/clusters/leap/daskhub-common.values.yaml similarity index 97% rename from config/clusters/leap/common.values.yaml rename to config/clusters/leap/daskhub-common.values.yaml index 4981345277..a826e51f0a 100644 --- a/config/clusters/leap/common.values.yaml +++ b/config/clusters/leap/daskhub-common.values.yaml @@ -10,8 +10,21 @@ basehub: serverIP: 10.104.103.242 # Name of Google Filestore share baseShareName: /homes/ + dask-gateway: + enabled: true + gateway: + backend: + scheduler: + cores: + request: 1 + limit: 2 + memory: + request: 4G + limit: 4G jupyterhub: custom: + daskhubSetup: + enabled: true # Extra mount point for admins to access to all users' home dirs # Ref https://github.com/2i2c-org/infrastructure/issues/2105 singleuserAdmin: @@ -86,6 +99,8 @@ basehub: - rabernat - jbusecke singleuser: + cloudMetadata: + blockWithIptables: false extraEnv: GH_SCOPED_CREDS_CLIENT_ID: "Iv1.0c7df3d4b3191b2f" GH_SCOPED_CREDS_APP_URL: https://github.com/apps/leap-hub-push-access @@ -348,14 +363,3 @@ basehub: mem_guarantee: 24G extra_resource_limits: nvidia.com/gpu: "1" - -dask-gateway: - gateway: - backend: - scheduler: - cores: - request: 1 - limit: 2 - memory: - request: 4G - limit: 4G diff --git a/config/clusters/linked-earth/common.values.yaml b/config/clusters/linked-earth/common.values.yaml index 7b75154849..a0d403f63a 100644 --- a/config/clusters/linked-earth/common.values.yaml +++ b/config/clusters/linked-earth/common.values.yaml @@ -10,8 +10,21 @@ basehub: serverIP: 10.155.150.2 # Name of Google Filestore share baseShareName: /homes/ + dask-gateway: + enabled: true + gateway: + backend: + scheduler: + cores: + request: 0.8 + limit: 1 + memory: + request: 1G + limit: 2G jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -43,6 +56,8 @@ basehub: admin_users: - khider singleuser: + 
cloudMetadata: + blockWithIptables: false image: # User image repo: https://quay.io/repository/linkedearth/pyleoclim name: quay.io/linkedearth/pyleoclim @@ -163,13 +178,3 @@ basehub: mem_limit: null node_selector: node.kubernetes.io/instance-type: n2-highmem-16 -dask-gateway: - gateway: - backend: - scheduler: - cores: - request: 0.8 - limit: 1 - memory: - request: 1G - limit: 2G diff --git a/config/clusters/meom-ige/common.values.yaml b/config/clusters/meom-ige/common.values.yaml index 05fd095572..30c9d9d237 100644 --- a/config/clusters/meom-ige/common.values.yaml +++ b/config/clusters/meom-ige/common.values.yaml @@ -13,8 +13,12 @@ basehub: - noresvport serverIP: nfs-server-01.us-central1-b.c.meom-ige-cnrs.internal baseShareName: /export/home-01/homes/ + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -34,6 +38,8 @@ basehub: name: SWOT Ocean Pangeo Team url: https://meom-group.github.io/ singleuser: + cloudMetadata: + blockWithIptables: false extraEnv: DATA_BUCKET: gs://meom-ige-data SCRATCH_BUCKET: "gs://meom-ige-scratch/$(JUPYTERHUB_USER)" diff --git a/config/clusters/nasa-cryo/common.values.yaml b/config/clusters/nasa-cryo/common.values.yaml index 69edac3fe7..b73688b0ac 100644 --- a/config/clusters/nasa-cryo/common.values.yaml +++ b/config/clusters/nasa-cryo/common.values.yaml @@ -13,8 +13,12 @@ basehub: - noresvport serverIP: fs-0872256335d483d5f.efs.us-west-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -66,6 +70,8 @@ basehub: - jomey singleuser: + cloudMetadata: + blockWithIptables: false defaultUrl: /lab storage: extraVolumes: diff --git a/config/clusters/nasa-ghg/common.values.yaml b/config/clusters/nasa-ghg/common.values.yaml index 28695bff80..f39bd81ae8 100644 --- a/config/clusters/nasa-ghg/common.values.yaml +++ b/config/clusters/nasa-ghg/common.values.yaml @@ -13,8 +13,12 @@ basehub: - noresvport serverIP: fs-09bd18b3ca23eefa2.efs.us-west-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -63,6 +67,8 @@ basehub: - Jeanne-le-Roux - amarouane-ABDELHAK singleuser: + cloudMetadata: + blockWithIptables: false defaultUrl: /lab profileList: - display_name: "Modified Pangeo Notebook" diff --git a/config/clusters/nasa-veda/common.values.yaml b/config/clusters/nasa-veda/common.values.yaml index e49e761d4b..770bb36b20 100644 --- a/config/clusters/nasa-veda/common.values.yaml +++ b/config/clusters/nasa-veda/common.values.yaml @@ -14,8 +14,12 @@ basehub: serverIP: fs-08b7410bc122c9d70.efs.us-west-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -66,6 +70,8 @@ basehub: - wildintellect - amarouane-ABDELHAK singleuser: + cloudMetadata: + blockWithIptables: false defaultUrl: /lab initContainers: - &volume_ownership_fix_initcontainer diff --git a/config/clusters/openscapes/common.values.yaml b/config/clusters/openscapes/common.values.yaml index 6895dfd8ae..6534643c79 100644 --- a/config/clusters/openscapes/common.values.yaml +++ b/config/clusters/openscapes/common.values.yaml @@ -13,8 +13,12 @@ basehub: - noresvport 
serverIP: fs-b25253b5.efs.us-west-2.amazonaws.com baseShareName: / + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -56,6 +60,8 @@ basehub: mountPath: /home/rstudio/shared-readwrite subPath: _shared singleuser: + cloudMetadata: + blockWithIptables: false defaultUrl: /lab extraEnv: GH_SCOPED_CREDS_CLIENT_ID: "Iv1.6981e043b45f042f" diff --git a/config/clusters/pangeo-hubs/common.values.yaml b/config/clusters/pangeo-hubs/common.values.yaml index 077f042bb8..25f1b508ef 100644 --- a/config/clusters/pangeo-hubs/common.values.yaml +++ b/config/clusters/pangeo-hubs/common.values.yaml @@ -10,8 +10,21 @@ basehub: serverIP: 10.229.44.234 # Name of Google Filestore share baseShareName: /homes/ + dask-gateway: + enabled: true + gateway: + backend: + scheduler: + cores: + request: 0.8 + limit: 1 + memory: + request: 1G + limit: 2G jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -49,6 +62,8 @@ basehub: - scottyhq - TomAugspurger singleuser: + cloudMetadata: + blockWithIptables: false extraEnv: GH_SCOPED_CREDS_CLIENT_ID: "Iv1.c90ee430400a347f" GH_SCOPED_CREDS_APP_URL: https://github.com/apps/pangeo-gcp-hub-push-access @@ -102,13 +117,3 @@ basehub: mem_guarantee: 52G node_selector: node.kubernetes.io/instance-type: n1-standard-16 -dask-gateway: - gateway: - backend: - scheduler: - cores: - request: 0.8 - limit: 1 - memory: - request: 1G - limit: 2G diff --git a/config/clusters/pchub/common.values.yaml b/config/clusters/pchub/common.values.yaml index 231c2f1a5e..e6337848c6 100644 --- a/config/clusters/pchub/common.values.yaml +++ b/config/clusters/pchub/common.values.yaml @@ -11,8 +11,12 @@ basehub: serverIP: 2i2cpchub.file.core.windows.net # Trailing slash is important! baseShareName: /2i2cpchub/homes/ + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -48,6 +52,8 @@ basehub: Authenticator: admin_users: [] singleuser: + cloudMetadata: + blockWithIptables: false initContainers: - name: volume-mount-ownership-fix image: busybox:1.36.1 diff --git a/config/clusters/projectpythia/testing.values.yaml b/config/clusters/projectpythia/testing.values.yaml new file mode 100644 index 0000000000..24d2d37ae4 --- /dev/null +++ b/config/clusters/projectpythia/testing.values.yaml @@ -0,0 +1,69 @@ +jupyterhub: + ingress: + hosts: [hub.binder.pythia.2i2c.cloud] + tls: + - hosts: [hub.binder.pythia.2i2c.cloud] + secretName: https-auto-tls + custom: + binderhubUI: + enabled: true + 2i2c: + add_staff_user_ids_to_admin_users: false + jupyterhubConfigurator: + enabled: false + singleuserAdmin: + extraVolumeMounts: [] + singleuser: + storage: + type: none + extraVolumeMounts: [] + initContainers: [] + profileList: [] + hub: + redirectToServer: false + services: + binder: {} + loadRoles: + binder: + services: + - binder + scopes: + - servers + - admin:users + user: + scopes: + - self + # Admin users will by default have access:services, so this is only + # observed to be required for non-admin users. 
+ - access:services!service=binder +binderhub-service: + enabled: true + ingress: + enabled: true + hosts: [binder.pythia.2i2c.cloud] + tls: + - hosts: [binder.pythia.2i2c.cloud] + secretName: binder-https-auto-tls + config: + BinderHub: + base_url: / + hub_url: https://hub.binder.pythia.2i2c.cloud + badge_base_url: https://binder.pythia.2i2c.cloud + auth_enabled: false + enable_api_only_mode: false + banner_message: "" + about_message: Binder for use with Project Pythia + extraEnv: + - name: JUPYTERHUB_API_TOKEN + valueFrom: + secretKeyRef: + name: hub + key: hub.services.binder.apiToken + - name: JUPYTERHUB_CLIENT_ID + value: "service-binder" + - name: JUPYTERHUB_API_URL + value: "https://hub.binder.pythia.2i2c.cloud/hub/api" + # Without this, the redirect URL to /hub/api/... gets + # appended to binderhub's URL instead of the hub's + - name: JUPYTERHUB_BASE_URL + value: "https://hub.binder.pythia.2i2c.cloud/" diff --git a/config/clusters/smithsonian/common.values.yaml b/config/clusters/smithsonian/common.values.yaml index b6ee632298..6859ddf9e2 100644 --- a/config/clusters/smithsonian/common.values.yaml +++ b/config/clusters/smithsonian/common.values.yaml @@ -13,9 +13,12 @@ basehub: - noresvport serverIP: fs-0ba20c6122f4a7236.efs.us-east-2.amazonaws.com baseShareName: / - + dask-gateway: + enabled: true jupyterhub: custom: + daskhubSetup: + enabled: true 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" @@ -78,6 +81,8 @@ basehub: enabled: true singleuser: + cloudMetadata: + blockWithIptables: false profileList: - display_name: "Pangeo Notebook" slug: pangeo diff --git a/deployer/commands/cilogon.py b/deployer/commands/cilogon.py index 5d1fc84bd2..0e20c5ff33 100644 --- a/deployer/commands/cilogon.py +++ b/deployer/commands/cilogon.py @@ -66,9 +66,8 @@ def build_client_details(cluster_name, hub_name, callback_url): } -def persist_client_credentials_in_config_file(client, hub_type, config_filename): - auth_config = {} - jupyterhub_config = { +def persist_client_credentials_in_config_file(client, config_filename): + auth_config = { "jupyterhub": { "hub": { "config": { @@ -81,11 +80,6 @@ def persist_client_credentials_in_config_file(client, hub_type, config_filename) } } - if hub_type == "daskhub": - auth_config["basehub"] = jupyterhub_config - else: - auth_config = jupyterhub_config - persist_config_in_encrypted_file(config_filename, auth_config) print_colour( f"Successfully persisted the encrypted CILogon client app credentials to file {config_filename}" @@ -97,9 +91,9 @@ def load_client_id_from_file(config_filename): with open(decrypted_path) as f: auth_config = yaml.load(f) - daskhub = auth_config.get("basehub", None) + daskhub_legacy_config = auth_config.get("basehub", None) try: - if daskhub: + if daskhub_legacy_config: return auth_config["basehub"]["jupyterhub"]["hub"]["config"][ "CILogonOAuthenticator" ]["client_id"] @@ -132,9 +126,7 @@ def print_not_ok_request_message(response): print_colour(f"{response.text}", "yellow") -def create_client( - admin_id, admin_secret, cluster_name, hub_name, hub_type, callback_url -): +def create_client(admin_id, admin_secret, cluster_name, hub_name, callback_url): """Creates a new client Args: @@ -183,7 +175,7 @@ def create_client( print_colour("Done! 
Successfully created a new CILogon client app.") # Persist and encrypt the client credentials - return persist_client_credentials_in_config_file(client, hub_type, config_filename) + return persist_client_credentials_in_config_file(client, config_filename) def update_client(admin_id, admin_secret, cluster_name, hub_name, callback_url): @@ -379,10 +371,6 @@ def create( hub_name: str = typer.Argument( ..., help="Name of the hub for which we'll create a CILogon client" ), - hub_type: str = typer.Argument( - "basehub", - help="Type of hub for which we'll create a CILogon client (ex: basehub, daskhub)", - ), hub_domain: str = typer.Argument( ..., help="The hub domain, as specified in `cluster.yaml` (ex: staging.2i2c.cloud)", @@ -391,9 +379,7 @@ def create( """Create a CILogon client for a hub.""" admin_id, admin_secret = get_2i2c_cilogon_admin_credentials() callback_url = f"https://{hub_domain}/hub/oauth_callback" - create_client( - admin_id, admin_secret, cluster_name, hub_name, hub_type, callback_url - ) + create_client(admin_id, admin_secret, cluster_name, hub_name, callback_url) @cilogon_client_app.command() diff --git a/deployer/commands/deployer.py b/deployer/commands/deployer.py index c655a66164..40a528c05a 100644 --- a/deployer/commands/deployer.py +++ b/deployer/commands/deployer.py @@ -252,7 +252,26 @@ def run_hub_health_check( f"--hub-type={hub.spec['helm_chart']}", ] - if (hub.spec["helm_chart"] == "daskhub") and check_dask_scaling: + for values_file in hub.spec["helm_chart_values_files"]: + if "secret" not in os.path.basename(values_file): + values_file = config_file_path.parent.joinpath(values_file) + config = yaml.load(values_file) + # Check if there's config that enables dask-gateway + if config.get("basehub", {}): + dask_gateway_enabled = ( + config.get("basehub", {}) + .get("dask-gateway", {}) + .get("enabled", False) + ) + else: + dask_gateway_enabled = config.get("dask-gateway", {}).get( + "enabled", False + ) + + if dask_gateway_enabled: + break + + if dask_gateway_enabled and check_dask_scaling: pytest_args.append("--check-dask-scaling") if gh_ci == "true": diff --git a/deployer/commands/generate/dedicated_cluster/aws.py b/deployer/commands/generate/dedicated_cluster/aws.py index b1657add9a..851785efeb 100644 --- a/deployer/commands/generate/dedicated_cluster/aws.py +++ b/deployer/commands/generate/dedicated_cluster/aws.py @@ -12,7 +12,6 @@ import jinja2 import typer -from typing_extensions import Annotated from deployer.utils.file_acquisition import REPO_ROOT_PATH from deployer.utils.rendering import print_colour @@ -106,12 +105,6 @@ def aws( cluster_region: str = typer.Option( ..., prompt="The region where to deploy the cluster" ), - hub_type: Annotated[ - str, - typer.Option( - prompt="Please type in the hub type: basehub/daskhub.\n-> If this cluster will host daskhubs, please type `daskhub`.\n-> If you don't know this info, or this is not the case, just hit ENTER" - ), - ] = "basehub", force: bool = typer.Option( False, "--force", @@ -124,13 +117,12 @@ def aws( """ # These are the variables needed by the templates used to generate the cluster config file # and support files - vars = { # Also store the provider, as it's useful for some jinja templates # to differentiate between them when rendering the configuration "provider": "aws", + "hub_type": "basehub", "cluster_name": cluster_name, - "hub_type": hub_type, "cluster_region": cluster_region, } diff --git a/deployer/commands/generate/dedicated_cluster/gcp.py b/deployer/commands/generate/dedicated_cluster/gcp.py index 
6c5ee6d608..c5622061a6 100644 --- a/deployer/commands/generate/dedicated_cluster/gcp.py +++ b/deployer/commands/generate/dedicated_cluster/gcp.py @@ -12,7 +12,6 @@ import jinja2 import typer -from typing_extensions import Annotated from deployer.utils.file_acquisition import REPO_ROOT_PATH from deployer.utils.rendering import print_colour @@ -64,12 +63,6 @@ def gcp( project_id: str = typer.Option( ..., prompt="Please insert the Project ID of the GCP project" ), - hub_type: Annotated[ - str, - typer.Option( - prompt="Please type in the hub type: basehub/daskhub.\n-> If this cluster will host daskhubs, please type `daskhub`.\n-> If you don't know this info, or this is not the case, just hit ENTER" - ), - ] = "basehub", force: bool = typer.Option( False, "--force", @@ -86,8 +79,8 @@ def gcp( # Also store the provider, as it's useful for some jinja templates # to differentiate between them when rendering the configuration "provider": "gcp", + "hub_type": "basehub", "cluster_name": cluster_name, - "hub_type": hub_type, "cluster_region": cluster_region, "project_id": project_id, } diff --git a/deployer/commands/generate/hub_asset/cluster_entry.py b/deployer/commands/generate/hub_asset/cluster_entry.py index 4a569df3e0..3f9d094bbe 100644 --- a/deployer/commands/generate/hub_asset/cluster_entry.py +++ b/deployer/commands/generate/hub_asset/cluster_entry.py @@ -12,16 +12,15 @@ def cluster_entry( ..., prompt="The name of the cluster where the hub will live" ), hub_name: str = typer.Option(..., prompt="The name of the hub"), - hub_type: str = typer.Option(..., prompt="The hub type: basehub/daskhub"), ): """ Outputs the relevant cluster.yaml hub entry that the engineer can then manually copy-paste into the relevant cluster.yaml file. """ vars = { + "hub_type": "basehub", "cluster_name": cluster_name, "hub_name": hub_name, - "hub_type": hub_type, } with open( diff --git a/deployer/commands/validate/config.py b/deployer/commands/validate/config.py index 31d4e200c8..b81514f2a1 100644 --- a/deployer/commands/validate/config.py +++ b/deployer/commands/validate/config.py @@ -51,14 +51,13 @@ def _prepare_helm_charts_dependencies_and_schemas(): """ Ensures that the helm charts we deploy, basehub and daskhub, have got their dependencies updated and .json schema files generated so that they can be - rendered during validation or deployment. """ basehub_dir = HELM_CHARTS_DIR.joinpath("basehub") _generate_values_schema_json(basehub_dir) subprocess.check_call(["helm", "dep", "up", basehub_dir]) daskhub_dir = HELM_CHARTS_DIR.joinpath("daskhub") - _generate_values_schema_json(daskhub_dir) + # Not generating schema for daskhub, as it is dead subprocess.check_call(["helm", "dep", "up", daskhub_dir]) support_dir = HELM_CHARTS_DIR.joinpath("support") @@ -122,11 +121,17 @@ def hub_config( ] for values_file in hub.spec["helm_chart_values_files"]: if "secret" not in os.path.basename(values_file): - cmd.append(f"--values={config_file_path.parent.joinpath(values_file)}") + values_file = config_file_path.parent.joinpath(values_file) + cmd.append(f"--values={values_file}") + config = yaml.load(values_file) + # Check if there's config that enables dask-gateway + dask_gateway_enabled = config.get("dask-gateway", {}).get( + "enabled", False + ) # Workaround the current requirement for dask-gateway 0.9.0 to have a # JupyterHub api-token specified, for updates if this workaround can be # removed, see https://github.com/dask/dask-gateway/issues/473. 
- if hub.spec["helm_chart"] == "daskhub": + if dask_gateway_enabled: cmd.append("--set=dask-gateway.gateway.auth.jupyterhub.apiToken=dummy") try: subprocess.check_output(cmd, text=True) @@ -204,6 +209,8 @@ def authenticator_config( config = yaml.load(values_file) # Check if there's config that specifies an authenticator class try: + # This special casing is needed for legacy daskhubs still + # using the daskhub chart if hub.spec["helm_chart"] != "basehub": hub_config = config["basehub"]["jupyterhub"]["hub"]["config"] else: @@ -262,6 +269,8 @@ def configurator_config( # Load the hub extra config from its specific values files config = yaml.load(values_file) try: + # This special casing is needed for legacy daskhubs still + # using the daskhub chart if hub.spec["helm_chart"] != "basehub": singleuser_config = config["basehub"]["jupyterhub"][ "singleuser" @@ -270,7 +279,6 @@ def configurator_config( else: singleuser_config = config["jupyterhub"]["singleuser"] custom_config = config["jupyterhub"]["custom"] - configurator_enabled = custom_config.get( "jupyterhubConfigurator", {} ).get("enabled") diff --git a/deployer/health_check_tests/test_hub_health.py b/deployer/health_check_tests/test_hub_health.py index f2c0061bc0..c1a1dd4208 100644 --- a/deployer/health_check_tests/test_hub_health.py +++ b/deployer/health_check_tests/test_hub_health.py @@ -64,15 +64,15 @@ async def check_hub_health(hub_url, test_notebook_path, service_api_token): async def test_hub_healthy(hub_url, api_token, notebook_dir, check_dask_scaling): try: print(f"Starting hub {hub_url} health validation...") - for root, directories, files in os.walk(notebook_dir, topdown=False): - for i, name in enumerate(files): + for root, _, files in os.walk(notebook_dir, topdown=False): + for _, name in enumerate(files): # We only want to run the "scale_dask_workers.ipynb" file if the # check_dask_scaling variable is true. We continue in the loop if # check_dask_scaling == False when we iterate over this file. 
+ print(f"Running {name} test notebook...") if (not check_dask_scaling) and (name == "scale_dask_workers.ipynb"): continue - print(f"Running {name} test notebook...") test_notebook_path = os.path.join(root, name) await check_hub_health(hub_url, test_notebook_path, api_token) diff --git a/deployer/infra_components/hub.py b/deployer/infra_components/hub.py index 20d5f9320d..667e2a6181 100644 --- a/deployer/infra_components/hub.py +++ b/deployer/infra_components/hub.py @@ -1,9 +1,11 @@ +import os import subprocess from ruamel.yaml import YAML from deployer.utils.file_acquisition import ( HELM_CHARTS_DIR, + find_absolute_path_to_cluster_file, get_decrypted_file, get_decrypted_files, ) @@ -48,7 +50,19 @@ def deploy(self, dask_gateway_version, debug, dry_run): self.spec["domain"] = domain_override_config["domain"] - if self.spec["helm_chart"] == "daskhub": + config_file_path = find_absolute_path_to_cluster_file(self.cluster.config_path) + for values_file in self.spec["helm_chart_values_files"]: + if "secret" not in os.path.basename(values_file): + values_file = config_file_path.parent.joinpath(values_file) + config = yaml.load(values_file) + # Check if there's config that enables dask-gateway + dask_gateway_enabled = config.get("dask-gateway", {}).get( + "enabled", False + ) + if dask_gateway_enabled: + break + + if dask_gateway_enabled: # Install CRDs for daskhub before deployment manifest_urls = [ f"https://raw.githubusercontent.com/dask/dask-gateway/{dask_gateway_version}/resources/helm/dask-gateway/crds/daskclusters.yaml", diff --git a/deployer/utils/file_acquisition.py b/deployer/utils/file_acquisition.py index e28a739dbf..f78ac8fb20 100644 --- a/deployer/utils/file_acquisition.py +++ b/deployer/utils/file_acquisition.py @@ -119,7 +119,7 @@ def persist_config_in_encrypted_file(encrypted_file, new_config): def remove_jupyterhub_hub_config_key_from_encrypted_file(encrypted_file, key): """ Remove config from the dict `config["jupyterhub"]["hub"]["config"][]` - in `encrypted_file` (the config is also searched for under daskhub prefix). + in `encrypted_file`. If after removing this config, the file only contains a config dict with empty leaves, delete the entire file, as it no longer holds any information. @@ -130,8 +130,8 @@ with open(decrypted_path) as f: config = yaml.load(f) - daskhub = config.get("basehub", None) - if daskhub: + daskhub_legacy_config = config.get("basehub", None) + if daskhub_legacy_config: config["basehub"]["jupyterhub"]["hub"]["config"].pop(key) else: config["jupyterhub"]["hub"]["config"].pop(key) diff --git a/docs/howto/features/dask.md b/docs/howto/features/dask.md index b714b5940d..1dad6a239a 100644 --- a/docs/howto/features/dask.md +++ b/docs/howto/features/dask.md @@ -1,5 +1,46 @@ +# Add support for daskhubs + +(howto:features:daskhub:existing-hub)= +## To an existing hub + +A daskhub setup can now also be enabled **after** a hub has been set up as a basehub. + +To enable dask-gateway support on a hub, the following configuration changes need to be made to the hub's values file: + +1. set `dask-gateway.enabled` to true: + + ```yaml + dask-gateway: + enabled: true + ``` + +1. set `jupyterhub.custom.daskhubSetup.enabled` to true: + + ```yaml + jupyterhub: + custom: + daskhubSetup: + enabled: true + ``` + +1. set `jupyterhub.singleuser.cloudMetadata.blockWithIptables` to false + + This is so we don't block access to the cloud provider's metadata server! + If we do, the coupling between the cloud provider's IAM permissions and + the credentials provided to pods by mounting a k8s ServiceAccount + with certain annotations breaks (AWS IRSA, GCP workload identity). + This in turn results in users being unable to access AWS/GCP object + storage buckets. + + ```yaml + jupyterhub: + singleuser: + cloudMetadata: + blockWithIptables: false + ``` +
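+Putting it all together, the hub's values file should end up containing something like the following (a combined sketch of the three snippets above; merge these keys with whatever else the file already sets): + + ```yaml + dask-gateway: + enabled: true + jupyterhub: + custom: + daskhubSetup: + enabled: true + singleuser: + cloudMetadata: + blockWithIptables: false + ``` +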
(howto:features:daskhub)= -# Add support for daskhubs in an existing cluster +## To an existing cluster ## GCP diff --git a/docs/hub-deployment-guide/runbooks/phase3/initial-hub-setup.md b/docs/hub-deployment-guide/runbooks/phase3/initial-hub-setup.md index 06f3bf79e1..a81f289973 100644 --- a/docs/hub-deployment-guide/runbooks/phase3/initial-hub-setup.md +++ b/docs/hub-deployment-guide/runbooks/phase3/initial-hub-setup.md @@ -43,22 +43,6 @@ When reviewing initial hub setup PRs, make sure the files above are all present. All of the following steps must be followed in order to consider phase 3.1 complete. Steps might contain references to other smaller, topic-specific runbooks that are gathered together and listed in the order they should be carried out in by an engineer. -1. **Determine the hub helm chart that is needed** - - Use the info provided in the new hub GitHub issue for the `Dask gateway` section. If Dask gateway will be needed, then go for a `daskhub` helm chart, otherwise choose a `basehub`. - - Store the helm type under $HELM_CHART_TYPE env var: - - ```bash - export HELM_CHART_TYPE=type - ``` - - ```{seealso} - For more information about our hub helm charts and how to choose, see [](hub-helm-charts). - - See [](/topic/infrastructure/config.md) for general information about hub helm chart configuration. - ``` 1. **Create the relevant `values.yaml` file/s under the appropriate cluster directory** @@ -161,7 +145,7 @@ All of the following steps must be followed in order to consider phase 3.1 compl You can use the `deployer generate hub-asset` subcommand to generate the relevant entry to insert into cluster.yaml file. ```bash - deployer generate hub-asset cluster-entry --cluster-name $CLUSTER_NAME --hub-name $HUB_NAME --hub-type $HELM_CHART_TYPE + deployer generate hub-asset cluster-entry --cluster-name $CLUSTER_NAME --hub-name $HUB_NAME ``` ```{warning} @@ -172,6 +156,12 @@ All of the following steps must be followed in order to consider phase 3.1 compl If you are deploying a binderhub ui style hub, then make sure that in the `cluster.yaml` file **the hub's domain** is entered instead of the binderhub's, for testing purposes. ``` +1. **Enable dask-gateway** + + Use the info provided in the new hub GitHub issue for the `Dask gateway` section. + If Dask gateway will be needed, then choose a `basehub` and follow the guide on + [how to enable dask-gateway on an existing hub](howto:features:daskhub). + 1. **Add the new cluster to CI/CD** ```{important} diff --git a/docs/topic/infrastructure/hub-helm-charts.md b/docs/topic/infrastructure/hub-helm-charts.md index 1d0965b575..011d5a87fd 100644 --- a/docs/topic/infrastructure/hub-helm-charts.md +++ b/docs/topic/infrastructure/hub-helm-charts.md @@ -1,6 +1,11 @@ (hub-helm-charts)= # Hub helm charts +```{warning} +The `daskhub` helm chart has been deprecated: `dask-gateway` is now a conditional dependency of the `basehub` chart instead, and `daskhub` shouldn't be used for new hub deployments. + +However, there are still existing hub configurations that use a chart called `daskhub`. This is just for backward compatibility, in order not to disrupt existing deployments by forcing hub reinstallations, and should not be replicated going forward. ```
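+ +For reference, hubs still deployed with the legacy `daskhub` chart carry the same configuration nested under a top-level `basehub:` key in their values files, along the lines of this sketch (mirroring the legacy values files earlier in this diff): + +```yaml +basehub: + dask-gateway: + enabled: true + jupyterhub: + custom: + daskhubSetup: + enabled: true +``` +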
The hubs are configured and deployed using [*locally defined helm charts*](https://helm.sh/docs/topics/chart_repository/#create-a-chart-repository). Because each hub type can be described by a helm chart, a hierarchy of hub types can be built and this makes development and usage easier. diff --git a/helm-charts/basehub/Chart.yaml b/helm-charts/basehub/Chart.yaml index ebdfa4a532..3a44a12f40 100644 --- a/helm-charts/basehub/Chart.yaml +++ b/helm-charts/basehub/Chart.yaml @@ -18,3 +18,10 @@ dependencies: version: 0.1.0-0.dev.git.244.h1e88014 repository: https://2i2c.org/binderhub-service/ condition: binderhub-service.enabled + # If bumping the version of dask-gateway, please also bump the default version set + # in the deployer's CLI + # https://github.com/2i2c-org/infrastructure/blob/HEAD/deployer/commands/deployer.py#L100 + - name: dask-gateway + version: "2024.1.0" + repository: "https://helm.dask.org/" + condition: dask-gateway.enabled diff --git a/helm-charts/basehub/values.schema.yaml b/helm-charts/basehub/values.schema.yaml index f638c04201..492da9a582 100644 --- a/helm-charts/basehub/values.schema.yaml +++ b/helm-charts/basehub/values.schema.yaml @@ -262,6 +262,9 @@ properties: global: type: object additionalProperties: true + dask-gateway: + type: object + additionalProperties: true binderhub-service: type: object additionalProperties: true @@ -524,3 +527,11 @@ properties: properties: enabled: type: boolean + daskhubSetup: + type: object + additionalProperties: false + required: + - enabled + properties: + enabled: + type: boolean diff --git a/helm-charts/basehub/values.yaml b/helm-charts/basehub/values.yaml index a86aabc427..d87367dfff 100644 --- a/helm-charts/basehub/values.yaml +++ b/helm-charts/basehub/values.yaml @@ -73,6 +73,240 @@ staticWebsite: id: 0 privateKey: "" +dask-gateway: + enabled: false # Enabling dask-gateway will install Dask Gateway as a dependency. + # Further Dask Gateway configuration goes here + # See https://github.com/dask/dask-gateway/blob/main/resources/helm/dask-gateway/values.yaml + gateway: + backend: + scheduler: + extraPodConfig: + serviceAccountName: user-sa + tolerations: + # Let's put schedulers on notebook nodes, since they aren't ephemeral + # dask can recover from dead workers, but not dead schedulers + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + - key: "hub.jupyter.org_dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + nodeSelector: + k8s.dask.org/node-purpose: scheduler + cores: + request: 0.01 + limit: 1 + memory: + request: 128M + limit: 1G + worker: + extraContainerConfig: + securityContext: + runAsGroup: 1000 + runAsUser: 1000 + extraPodConfig: + serviceAccountName: user-sa + securityContext: + fsGroup: 1000 + tolerations: + - key: "k8s.dask.org/dedicated" + operator: "Equal" + value: "worker" + effect: "NoSchedule" + - key: "k8s.dask.org_dedicated" + operator: "Equal" + value: "worker" + effect: "NoSchedule" + nodeSelector: + # Dask workers get their own pre-emptible pool + k8s.dask.org/node-purpose: worker + env: + - name: BASEHUB_K8S_DIST + valueFrom: + configMapKeyRef: + name: basehub-cluster-info + key: K8S_DIST + + extraConfig: + # This configuration represents options that can be presented to users + # that want to create a Dask cluster using dask-gateway client. + # + # This configuration is meant to enable the user to request dask worker + pods that fit well on 2i2c's clusters. Currently the only + instance types used are n2-highmem-16 or r5.4xlarge. + # + # - Documentation about exposing cluster options to users: + # https://gateway.dask.org/cluster-options.html and the + # - Reference for KubeClusterConfig, which is what can be configured: + # https://gateway.dask.org/api-server.html#kubeclusterconfig. + # + optionHandler: | + import os + import string + + from dask_gateway_server.options import Integer, Mapping, Options, Select, String + + # Escape a string to be dns-safe in the same way that KubeSpawner does it. + # Reference https://github.com/jupyterhub/kubespawner/blob/616f72c4aee26c3d2127c6af6086ec50d6cda383/kubespawner/spawner.py#L1828-L1835 + # Adapted from https://github.com/minrk/escapism to avoid installing the package + # in the dask-gateway api pod which would have been problematic. + def escape_string_label_safe(to_escape): + safe_chars = set(string.ascii_lowercase + string.digits) + escape_char = "-" + chars = [] + for c in to_escape: + if c in safe_chars: + chars.append(c) + else: + # escape one character + buf = [] + # UTF-8 uses 1 to 4 bytes per character, depending on the Unicode symbol + # so we need to transform each byte to its hex value + for byte in c.encode("utf8"): + buf.append(escape_char) + # %X is the hex value of the byte + buf.append('%X' % byte) + escaped_hex_char = "".join(buf) + chars.append(escaped_hex_char) + return u''.join(chars) + + # Decide on available instance types and their resource allocation + # choices to expose based on cloud provider. For each daskhub hub + # managed by 2i2c, there should be these instance types available. + # + cloud_provider = os.environ["BASEHUB_K8S_DIST"] # gke, eks, or aks + instance_types = { + "gke": ["n2-highmem-16"], + "eks": ["r5.4xlarge"], + "aks": ["Standard_E16_v4"], + } + + # NOTE: Data mentioned below comes from manual inspection of data + # collected and currently only available at + # https://github.com/2i2c-org/infrastructure/pull/3337. + # + resource_allocations = { + # n2-highmem-16 nodes in our clusters have 15.89 allocatable cores + # and 116.549Gi allocatable memory, and daemonsets are expected to + # not add more than 400m cores and 800Mi (0.781Gi) memory with some + # margin, so we get 15.49 cores and 115.768Gi available for worker + # pods to request. + # + # This is an initial conservative strategy, allowing a slight + # oversubscription of CPU but not any oversubscription of memory. + # + # To work around https://github.com/dask/dask-gateway/issues/765, we + # round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49] + # to [0.9, 1.9, 3.8, 7.7, 15.4].
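+ # As a worked example (recomputing the numbers used below rather than + # introducing new data): one sixteenth of an n2-highmem-16 node comes + # out to 15.49 / 16 = 0.968 cores (rounded down to 0.9) and + # 115.768 / 16 = 7.235G of memory, which is where the "1CPU, 7.2Gi" + # option's values come from; the larger options are 2x, 4x, 8x, and + # 16x multiples of that share.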
+ # + "n2-highmem-16": { + "1CPU, 7.2Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.235G", "worker_memory_limit": "7.235G"}, + "2CPU, 14.5Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "14.471G", "worker_memory_limit": "14.471G"}, + "4CPU, 28.9Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "28.942G", "worker_memory_limit": "28.942G"}, + "8CPU, 57.9Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "57.884G", "worker_memory_limit": "57.884G"}, + "16CPU, 115.8Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "115.768G", "worker_memory_limit": "115.768G"}, + }, + # r5.4xlarge nodes in our clusters have 15.89 allocatable cores and + # 121.504Gi allocatable memory, and daemonsets are expected to not + # add more than 400m cores and 800Mi (0.781Gi) memory with some + # margin, so we get 15.49 cores and 120.723Gi available for worker + # pods to request. + # + # This is an initial conservative strategy, allowing a slight + # oversubscription of CPU but not any oversubscription of memory. + # + # To work around https://github.com/dask/dask-gateway/issues/765, we + # round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49] + # to [0.9, 1.9, 3.8, 7.7, 15.4]. + # + "r5.4xlarge": { + "1CPU, 7.5Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.545G", "worker_memory_limit": "7.545G"}, + "2CPU, 15.1Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "15.090G", "worker_memory_limit": "15.090G"}, + "4CPU, 30.2Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "30.180G", "worker_memory_limit": "30.180G"}, + "8CPU, 60.4Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "60.361G", "worker_memory_limit": "60.361G"}, + "16CPU, 120.7Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "120.723G", "worker_memory_limit": "120.723G"}, + }, + "Standard_E16_v4": { + # Set up to be proportionate, so using all the RAM uses all the CPU too + ".25-1 CPU, 2GB RAM": {"worker_cores": 0.25, "worker_cores_limit": 1, "worker_memory": "2G", "worker_memory_limit": "2G"}, + }, + } + + # for now we support only one instance type per cluster; listing it + as an option is a way to help convey how things work a bit better + it = instance_types[cloud_provider][0] + ra = resource_allocations[it] + ra_keys = list(ra.keys()) + + def cluster_options(user): + def option_handler(options): + if ":" not in options.image: + raise ValueError("When specifying an image you must also provide a tag") + extra_labels = { + "hub.jupyter.org/username": escape_string_label_safe(user.name), + } + scheduler_extra_pod_annotations = { + "hub.jupyter.org/username": user.name, + "prometheus.io/scrape": "true", + "prometheus.io/port": "8787", + } + worker_extra_pod_annotations = { + "hub.jupyter.org/username": user.name, + } + picked_ra = ra[options.worker_resource_allocation] + + return { + # A default image is suggested via DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE env variable + "image": options.image, + "scheduler_extra_pod_labels": extra_labels, + "scheduler_extra_pod_annotations": scheduler_extra_pod_annotations, + "worker_extra_pod_labels": extra_labels, + "worker_extra_pod_annotations": worker_extra_pod_annotations, + "worker_cores": picked_ra["worker_cores"], + "worker_cores_limit": picked_ra["worker_cores_limit"], + "worker_memory": picked_ra["worker_memory"], + "worker_memory_limit": picked_ra["worker_memory_limit"], + "environment":
options.environment, + "idle_timeout": options.idle_timeout_minutes * 60, + } + return Options( + Select( + "instance_type", + [it], + default=it, + label="Instance type running worker containers", + ), + Select( + "worker_resource_allocation", + ra_keys, + default=ra_keys[0], + label="Resources per worker container", + ), + # The default image is pre-specified by the dask-gateway client + # via the env var DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE set on + # the jupyterhub user pods + String("image", label="Image"), + Mapping("environment", {}, label="Environment variables (YAML)"), + Integer("idle_timeout_minutes", 30, min=0, label="Idle cluster terminated after (minutes)"), + handler=option_handler, + ) + c.Backend.cluster_options = cluster_options + + # timeout after 30 minutes of inactivity by default, keep this in sync + # with the user exposed option idle_timeout_minutes's default value + # configured above + c.KubeClusterConfig.idle_timeout = 1800 + prefix: "/services/dask-gateway" # Users connect to the Gateway through the JupyterHub service. + auth: + type: jupyterhub # Use JupyterHub to authenticate with Dask Gateway + traefik: + nodeSelector: + k8s.dask.org/node-purpose: core + service: + type: ClusterIP # Access Dask Gateway through JupyterHub. To access the Gateway from outside JupyterHub, this must be changed to a `LoadBalancer`. + nfs: enabled: false dirsizeReporter: @@ -772,6 +1006,11 @@ jupyterhub: # more tightly scoped permissions based on our needs. # admin: true + dask-gateway: + # We provide an entry here for dask-gateway unconditionally, so + # our helm chart will correctly autogenerate a secret with appropriate + # keys. It's not used if dask-gateway is not enabled. + display: false image: name: quay.io/2i2c/pilot-hub tag: "0.0.1-0.dev.git.8663.h049aa2c2" @@ -1305,3 +1544,66 @@ jupyterhub: if get_config('custom.auth.anonymizeUsername', False): # https://jupyterhub.readthedocs.io/en/stable/reference/api/auth.html#jupyterhub.auth.Authenticator.post_auth_hook c.Authenticator.post_auth_hook = salt_username + + # Initially copied from https://github.com/dask/helm-chart/blob/main/daskhub/values.yaml + daskhub-01-add-dask-gateway-values: | + # 1. Sets `DASK_GATEWAY__PROXY_ADDRESS` in the singleuser environment. + # 2. Adds the URL for the Dask Gateway JupyterHub service. + import os + from z2jh import get_config + + if get_config('custom.daskhubSetup.enabled'): + # Default all users on hubs with dask-gateway to use JupyterLab + c.Spawner.default_url = '/lab' + + # Add an extra label that allows user pods to talk to the proxy pod + # in clusters with networkPolicy enabled so kernels can talk to the + # dask-gateway service via the proxy + c.KubeSpawner.extra_labels.update({ + "hub.jupyter.org/network-access-proxy-http": "true" + }) + # These are set by jupyterhub. + release_name = os.environ["HELM_RELEASE_NAME"] + release_namespace = os.environ["POD_NAMESPACE"] + if "PROXY_HTTP_SERVICE_HOST" in os.environ: + # https is enabled, we want to use the internal http service. + gateway_address = "http://{}:{}/services/dask-gateway/".format( + os.environ["PROXY_HTTP_SERVICE_HOST"], + os.environ["PROXY_HTTP_SERVICE_PORT"], + ) + print("Setting DASK_GATEWAY__ADDRESS {} from HTTP service".format(gateway_address)) + else: + gateway_address = "http://proxy-public/services/dask-gateway" + print("Setting DASK_GATEWAY__ADDRESS {}".format(gateway_address)) + # Internal address to connect to the Dask Gateway. 
+ c.KubeSpawner.environment.setdefault("DASK_GATEWAY__ADDRESS", gateway_address) + # Internal address for the Dask Gateway proxy. + c.KubeSpawner.environment.setdefault("DASK_GATEWAY__PROXY_ADDRESS", "gateway://traefik-{}-dask-gateway.{}:80".format(release_name, release_namespace)) + # Relative address for the dashboard link. + c.KubeSpawner.environment.setdefault("DASK_GATEWAY__PUBLIC_ADDRESS", "/services/dask-gateway/") + # Use JupyterHub to authenticate with Dask Gateway. + c.KubeSpawner.environment.setdefault("DASK_GATEWAY__AUTH__TYPE", "jupyterhub") + + # Add some settings for dask gateway via environment variables + # https://docs.dask.org/en/latest/configuration.html has more information + # Kubernetes env variable expansion via `{{}}` is used here. See + # https://kubernetes.io/docs/tasks/inject-data-application/define-interdependent-environment-variables/ + # for more information + c.KubeSpawner.environment.update({ + # Specify what image dask-gateway workers and schedulers should use + 'DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE': '{{JUPYTER_IMAGE_SPEC}}', + 'DASK_GATEWAY__CLUSTER__OPTIONS__ENVIRONMENT': '{{"SCRATCH_BUCKET": "$(SCRATCH_BUCKET)", "PANGEO_SCRATCH": "$(PANGEO_SCRATCH)"}}', + 'DASK_DISTRIBUTED__DASHBOARD__LINK': '{{JUPYTERHUB_SERVICE_PREFIX}}proxy/{{port}}/status' + }) + + # Adds Dask Gateway as a JupyterHub service to make the gateway available at + # {HUB_URL}/services/dask-gateway + service_url = "http://traefik-{}-dask-gateway.{}".format(release_name, release_namespace) + for service in c.JupyterHub.services: + if service["name"] == "dask-gateway": + if not service.get("url", None): + print("Adding dask-gateway service URL") + service.setdefault("url", service_url) + break + else: + print("dask-gateway service not found, this should not happen!") diff --git a/helm-charts/daskhub/Chart.yaml b/helm-charts/daskhub/Chart.yaml index fe6701f1d3..d45b813911 100644 --- a/helm-charts/daskhub/Chart.yaml +++ b/helm-charts/daskhub/Chart.yaml @@ -7,9 +7,3 @@ dependencies: - name: basehub version: "0.1.0" repository: file://../basehub - # If bumping the version of dask-gateway, please also bump the default version set - # in the deployer's CLI - # https://github.com/2i2c-org/infrastructure/blob/HEAD/deployer/deployer.py#L195 - - name: dask-gateway - version: "2024.1.0" - repository: "https://helm.dask.org/" diff --git a/helm-charts/daskhub/values.schema.yaml b/helm-charts/daskhub/values.schema.yaml deleted file mode 100644 index ccf3cd201d..0000000000 --- a/helm-charts/daskhub/values.schema.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# This schema (a jsonschema in YAML format) is used to generate -# values.schema.json which is, when available, used by the helm CLI for client -# side validation by Helm of the chart's values before template rendering. -# -# We look to document everything we have default values for in values.yaml, but -# we don't look to enforce the perfect validation logic within this file. -# -# ref: https://json-schema.org/learn/getting-started-step-by-step.html -# -$schema: http://json-schema.org/draft-07/schema# -type: object -additionalProperties: false -required: - - basehub - - dask-gateway - - global -properties: - # basehub is a dependent helm chart, we rely on its schema validation for - # values passed to it and are not imposing restrictions on them in this helm - # chart. 
- basehub: - type: object - additionalProperties: true - # dask-gateway is a dependent helm chart, we rely on its schema validation for - # values passed to it and are not imposing restrictions on them in this helm - # chart. - dask-gateway: - type: object - additionalProperties: true - global: - type: object - additionalProperties: true diff --git a/helm-charts/daskhub/values.yaml b/helm-charts/daskhub/values.yaml deleted file mode 100644 index 8dc2bbf0fc..0000000000 --- a/helm-charts/daskhub/values.yaml +++ /dev/null @@ -1,336 +0,0 @@ -basehub: - jupyterhub: - singleuser: - # Almost everyone using dask by default wants JupyterLab - defaultUrl: /lab - extraLabels: - hub.jupyter.org/network-access-proxy-http: "true" - cloudMetadata: - # Don't block access to the cloud provider's metadata server! - # - # If we do the coupling between the cloud providers IAM permissions and - # the credentials provided to pod's by mounting a k8s ServiceAccount - # with certain annotations on breaks (AWS IRSA, GCP workload identity). - # This in turn results in users unable to able to access AWS/GCP object - # storage buckets. - # - blockWithIptables: false - extraEnv: - # About DASK_ prefixed variables we set: - # - # 1. k8s native variable expansion is applied with $(MY_ENV) syntax. The - # order variables are defined matters though and we are under the - # mercy of how KubeSpawner renders our passed dictionaries. - # - # 2. Dask loads local YAML config. - # - # 3. Dask loads environment variables prefixed DASK_. - # - DASK_ is stripped - # - Capitalization is ignored - # - Double underscore means a nested configuration - # - `ast.literal_eval` is used to parse values - # - # 4. dask-gateway and dask-distributed looks at its config and expands - # expressions in {} again, sometimes only with the environment - # variables as context but sometimes also with additional variables. - # - # References: - # - K8s expansion: https://kubernetes.io/docs/tasks/inject-data-application/define-interdependent-environment-variables/ - # - KubeSpawner issue: https://github.com/jupyterhub/kubespawner/issues/491 - # - Dask config: https://docs.dask.org/en/latest/configuration.html - # - Exploration issue: https://github.com/2i2c-org/infrastructure/issues/442 - # - # DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE makes the default worker image - # match the singleuser image. - DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE: "{{JUPYTER_IMAGE_SPEC}}" - # DASK_GATEWAY__CLUSTER__OPTIONS__ENVIRONMENT makes some environment - # variables be copied over to the worker nodes from the user nodes. - DASK_GATEWAY__CLUSTER__OPTIONS__ENVIRONMENT: '{{"SCRATCH_BUCKET": "$(SCRATCH_BUCKET)", "PANGEO_SCRATCH": "$(PANGEO_SCRATCH)"}}' - # DASK_DISTRIBUTED__DASHBOARD__LINK makes the suggested link to the - # dashboard account for the /user// prefix in the path. - # JUPYTERHUB_SERVICE_PREFIX has leading and trailing slashes as appropriate - DASK_DISTRIBUTED__DASHBOARD__LINK: "{{JUPYTERHUB_SERVICE_PREFIX}}proxy/{{port}}/status" - - hub: - services: - dask-gateway: - # Don't display a dask-gateway entry under 'services', - # as dask-gateway has no UI - display: false - extraConfig: - # Initially copied from https://github.com/dask/helm-chart/blob/main/daskhub/values.yaml - daskhub-01-add-dask-gateway-values: | - # 1. Sets `DASK_GATEWAY__PROXY_ADDRESS` in the singleuser environment. - # 2. Adds the URL for the Dask Gateway JupyterHub service. - import os - # These are set by jupyterhub. 
- release_name = os.environ["HELM_RELEASE_NAME"] - release_namespace = os.environ["POD_NAMESPACE"] - if "PROXY_HTTP_SERVICE_HOST" in os.environ: - # https is enabled, we want to use the internal http service. - gateway_address = "http://{}:{}/services/dask-gateway/".format( - os.environ["PROXY_HTTP_SERVICE_HOST"], - os.environ["PROXY_HTTP_SERVICE_PORT"], - ) - print("Setting DASK_GATEWAY__ADDRESS {} from HTTP service".format(gateway_address)) - else: - gateway_address = "http://proxy-public/services/dask-gateway" - print("Setting DASK_GATEWAY__ADDRESS {}".format(gateway_address)) - # Internal address to connect to the Dask Gateway. - c.KubeSpawner.environment.setdefault("DASK_GATEWAY__ADDRESS", gateway_address) - # Internal address for the Dask Gateway proxy. - c.KubeSpawner.environment.setdefault("DASK_GATEWAY__PROXY_ADDRESS", "gateway://traefik-{}-dask-gateway.{}:80".format(release_name, release_namespace)) - # Relative address for the dashboard link. - c.KubeSpawner.environment.setdefault("DASK_GATEWAY__PUBLIC_ADDRESS", "/services/dask-gateway/") - # Use JupyterHub to authenticate with Dask Gateway. - c.KubeSpawner.environment.setdefault("DASK_GATEWAY__AUTH__TYPE", "jupyterhub") - # Adds Dask Gateway as a JupyterHub service to make the gateway available at - # {HUB_URL}/services/dask-gateway - service_url = "http://traefik-{}-dask-gateway.{}".format(release_name, release_namespace) - for service in c.JupyterHub.services: - if service["name"] == "dask-gateway": - if not service.get("url", None): - print("Adding dask-gateway service URL") - service.setdefault("url", service_url) - break - else: - print("dask-gateway service not found, this should not happen!") - -dask-gateway: - enabled: true # Enabling dask-gateway will install Dask Gateway as a dependency. - # Further Dask Gateway configuration goes here - # See https://github.com/dask/dask-gateway/blob/main/resources/helm/dask-gateway/values.yaml - gateway: - backend: - scheduler: - extraPodConfig: - serviceAccountName: user-sa - tolerations: - # Let's put schedulers on notebook nodes, since they aren't ephemeral - # dask can recover from dead workers, but not dead schedulers - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - - key: "hub.jupyter.org_dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - nodeSelector: - k8s.dask.org/node-purpose: scheduler - cores: - request: 0.01 - limit: 1 - memory: - request: 128M - limit: 1G - worker: - extraContainerConfig: - securityContext: - runAsGroup: 1000 - runAsUser: 1000 - extraPodConfig: - serviceAccountName: user-sa - securityContext: - fsGroup: 1000 - tolerations: - - key: "k8s.dask.org/dedicated" - operator: "Equal" - value: "worker" - effect: "NoSchedule" - - key: "k8s.dask.org_dedicated" - operator: "Equal" - value: "worker" - effect: "NoSchedule" - nodeSelector: - # Dask workers get their own pre-emptible pool - k8s.dask.org/node-purpose: worker - env: - - name: BASEHUB_K8S_DIST - valueFrom: - configMapKeyRef: - name: basehub-cluster-info - key: K8S_DIST - - extraConfig: - # This configuration represents options that can be presented to users - # that want to create a Dask cluster using dask-gateway client. - # - # This configuration is meant to enable the user to request dask worker - # pods that fits well on 2i2c's clusters. Currently the only kind of - # instance types used are n2-highmem-16 or r5.4xlarge. 
-
-dask-gateway:
-  enabled: true # Enabling dask-gateway will install Dask Gateway as a dependency.
-  # Further Dask Gateway configuration goes here
-  # See https://github.com/dask/dask-gateway/blob/main/resources/helm/dask-gateway/values.yaml
-  gateway:
-    backend:
-      scheduler:
-        extraPodConfig:
-          serviceAccountName: user-sa
-          tolerations:
-            # Let's put schedulers on notebook nodes, since they aren't ephemeral:
-            # dask can recover from dead workers, but not dead schedulers.
-            - key: "hub.jupyter.org/dedicated"
-              operator: "Equal"
-              value: "user"
-              effect: "NoSchedule"
-            - key: "hub.jupyter.org_dedicated"
-              operator: "Equal"
-              value: "user"
-              effect: "NoSchedule"
-          nodeSelector:
-            k8s.dask.org/node-purpose: scheduler
-        cores:
-          request: 0.01
-          limit: 1
-        memory:
-          request: 128M
-          limit: 1G
-      worker:
-        extraContainerConfig:
-          securityContext:
-            runAsGroup: 1000
-            runAsUser: 1000
-        extraPodConfig:
-          serviceAccountName: user-sa
-          securityContext:
-            fsGroup: 1000
-          tolerations:
-            - key: "k8s.dask.org/dedicated"
-              operator: "Equal"
-              value: "worker"
-              effect: "NoSchedule"
-            - key: "k8s.dask.org_dedicated"
-              operator: "Equal"
-              value: "worker"
-              effect: "NoSchedule"
-          nodeSelector:
-            # Dask workers get their own pre-emptible pool
-            k8s.dask.org/node-purpose: worker
-          env:
-            - name: BASEHUB_K8S_DIST
-              valueFrom:
-                configMapKeyRef:
-                  name: basehub-cluster-info
-                  key: K8S_DIST
-
-    extraConfig:
-      # This configuration represents options that can be presented to users
-      # that want to create a Dask cluster using the dask-gateway client.
-      #
-      # This configuration is meant to enable the user to request dask worker
-      # pods that fit well on 2i2c's clusters. Currently the only instance
-      # types used are n2-highmem-16 or r5.4xlarge.
-      #
-      # - Documentation about exposing cluster options to users:
-      #   https://gateway.dask.org/cluster-options.html
-      # - Reference for KubeClusterConfig, which is what can be configured:
-      #   https://gateway.dask.org/api-server.html#kubeclusterconfig
-      #
-      optionHandler: |
-        import os
-        import string
-
-        from dask_gateway_server.options import Integer, Mapping, Options, Select, String
-
-        # Escape a string to be dns-safe in the same way that KubeSpawner does it.
-        # Reference: https://github.com/jupyterhub/kubespawner/blob/616f72c4aee26c3d2127c6af6086ec50d6cda383/kubespawner/spawner.py#L1828-L1835
-        # Adapted from https://github.com/minrk/escapism to avoid installing the
-        # package in the dask-gateway api pod, which would have been problematic.
-        def escape_string_label_safe(to_escape):
-            safe_chars = set(string.ascii_lowercase + string.digits)
-            escape_char = "-"
-            chars = []
-            for c in to_escape:
-                if c in safe_chars:
-                    chars.append(c)
-                else:
-                    # escape one character
-                    buf = []
-                    # UTF-8 uses 1 to 4 bytes per character, depending on the
-                    # Unicode symbol, so we need to transform each byte to its
-                    # hex value
-                    for byte in c.encode("utf8"):
-                        buf.append(escape_char)
-                        # %X is the hex value of the byte
-                        buf.append('%X' % byte)
-                    escaped_hex_char = "".join(buf)
-                    chars.append(escaped_hex_char)
-            return u''.join(chars)
-
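A quick illustration of what the escaping function above produces, which helps when reading the pod labels it feeds into; the username here is made up:

    # Sketch: using escape_string_label_safe as defined above. Each unsafe
    # character becomes "-" followed by the uppercase hex of its UTF-8 bytes.
    print(escape_string_label_safe("user@example.com"))
    # -> user-40example-2Ecom   ("@" is 0x40, "." is 0x2E)
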
-        # Decide on available instance types and their resource allocation
-        # choices to expose based on cloud provider. For each daskhub hub
-        # managed by 2i2c, these instance types should be available.
-        #
-        cloud_provider = os.environ["BASEHUB_K8S_DIST"]  # gke, eks, or aks
-        instance_types = {
-            "gke": ["n2-highmem-16"],
-            "eks": ["r5.4xlarge"],
-            "aks": ["Standard_E16_v4"],
-        }
-
-        # NOTE: Data mentioned below comes from manual inspection of data
-        #       collected and currently only available at
-        #       https://github.com/2i2c-org/infrastructure/pull/3337.
-        #
-        resource_allocations = {
-            # n2-highmem-16 nodes in our clusters have 15.89 allocatable cores
-            # and 116.549Gi allocatable memory, and daemonsets are expected to
-            # not add more than 400m cores and 800Mi (0.781Gi) memory with some
-            # margin, so we get 15.49 cores and 115.768Gi available for worker
-            # pods to request.
-            #
-            # This is an initial conservative strategy, allowing a slight
-            # oversubscription of CPU but not any oversubscription of memory.
-            #
-            # To work around https://github.com/dask/dask-gateway/issues/765, we
-            # round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49]
-            # to [0.9, 1.9, 3.8, 7.7, 15.4].
-            #
-            "n2-highmem-16": {
-                "1CPU, 7.2Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.235G", "worker_memory_limit": "7.235G"},
-                "2CPU, 14.5Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "14.471G", "worker_memory_limit": "14.471G"},
-                "4CPU, 28.9Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "28.942G", "worker_memory_limit": "28.942G"},
-                "8CPU, 57.9Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "57.884G", "worker_memory_limit": "57.884G"},
-                "16CPU, 115.8Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "115.768G", "worker_memory_limit": "115.768G"},
-            },
-            # r5.4xlarge nodes in our clusters have 15.89 allocatable cores and
-            # 121.504Gi allocatable memory, and daemonsets are expected to not
-            # add more than 400m cores and 800Mi (0.781Gi) memory with some
-            # margin, so we get 15.49 cores and 120.723Gi available for worker
-            # pods to request.
-            #
-            # This is an initial conservative strategy, allowing a slight
-            # oversubscription of CPU but not any oversubscription of memory.
-            #
-            # To work around https://github.com/dask/dask-gateway/issues/765, we
-            # round worker_cores down from [0.968, 1.936, 3.872, 7.745, 15.49]
-            # to [0.9, 1.9, 3.8, 7.7, 15.4].
-            #
-            "r5.4xlarge": {
-                "1CPU, 7.5Gi": {"worker_cores": 0.9, "worker_cores_limit": 1, "worker_memory": "7.545G", "worker_memory_limit": "7.545G"},
-                "2CPU, 15.1Gi": {"worker_cores": 1.9, "worker_cores_limit": 2, "worker_memory": "15.090G", "worker_memory_limit": "15.090G"},
-                "4CPU, 30.2Gi": {"worker_cores": 3.8, "worker_cores_limit": 4, "worker_memory": "30.180G", "worker_memory_limit": "30.180G"},
-                "8CPU, 60.4Gi": {"worker_cores": 7.7, "worker_cores_limit": 8, "worker_memory": "60.361G", "worker_memory_limit": "60.361G"},
-                "16CPU, 120.7Gi": {"worker_cores": 15.4, "worker_cores_limit": 16, "worker_memory": "120.723G", "worker_memory_limit": "120.723G"},
-            },
-            "Standard_E16_v4": {
-                # Set up to be proportionate, so using all the RAM uses all the CPU too
-                ".25-1 CPU, 2GB RAM": {"worker_cores": 0.25, "worker_cores_limit": 1, "worker_memory": "2G", "worker_memory_limit": "2G"},
-            },
-        }
-
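The numbers in the comments above can be re-derived mechanically. A small editor's sketch of the arithmetic for n2-highmem-16, under the stated assumption of roughly 0.4 cores and 0.781Gi of daemonset overhead per node:

    # Sketch: reproduce the n2-highmem-16 worker ladder described above.
    allocatable_cores, allocatable_gi = 15.89, 116.549
    daemonset_cores, daemonset_gi = 0.4, 0.781  # expected overhead, with margin

    avail_cores = allocatable_cores - daemonset_cores  # 15.49
    avail_gi = allocatable_gi - daemonset_gi           # 115.768

    # 16, 8, 4, 2 and 1 workers per node:
    for divisor in (16, 8, 4, 2, 1):
        print(f"{avail_cores / divisor:.3f} cores, {avail_gi / divisor:.3f}Gi")
    # cores come out as approximately 0.968, 1.936, 3.872, 7.745, 15.49 --
    # the chart then rounds them down to 0.9, 1.9, 3.8, 7.7, 15.4 to work
    # around dask-gateway issue #765
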
"/services/dask-gateway" # Users connect to the Gateway through the JupyterHub service. - auth: - type: jupyterhub # Use JupyterHub to authenticate with Dask Gateway - traefik: - nodeSelector: - k8s.dask.org/node-purpose: core - service: - type: ClusterIP # Access Dask Gateway through JupyterHub. To access the Gateway from outside JupyterHub, this must be changed to a `LoadBalancer`. - -# A placeholder as global values that can be referenced from the same location -# of any chart should be possible to provide, but aren't necessarily provided or -# used. -global: {}