From eedd8f5ef09387afe9d0821d3520383ccb727fcd Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 24 Jan 2025 18:06:29 +0000 Subject: [PATCH 01/46] Import template app from waldur example --- apps/jupyterhub/configmap.yaml | 178 ++++++++++++++++++++++++++++ apps/jupyterhub/example-secret.yaml | 19 +++ apps/jupyterhub/extra-rbac.yaml | 25 ++++ apps/jupyterhub/helmchart.yaml | 13 ++ apps/jupyterhub/helmrelease.yaml | 28 +++++ apps/jupyterhub/helmrepository.yaml | 9 ++ apps/jupyterhub/kustomization.yaml | 8 ++ apps/jupyterhub/namespace.yaml | 5 + apps/spegel/helmrelease.yaml | 15 +++ apps/spegel/helmrepository.yaml | 9 ++ apps/spegel/kustomization.yaml | 4 + apps/spegel/namespace.yaml | 4 + apps/waldur/configmap.yaml | 48 ++++++++ apps/waldur/example-secret.yaml | 13 ++ apps/waldur/helmchart.yaml | 13 ++ apps/waldur/helmrelease.yaml | 40 +++++++ apps/waldur/helmrepository.yaml | 9 ++ apps/waldur/kustomization.yaml | 7 ++ apps/waldur/namespace.yaml | 5 + kustomization.yaml | 4 + 20 files changed, 456 insertions(+) create mode 100644 apps/jupyterhub/configmap.yaml create mode 100644 apps/jupyterhub/example-secret.yaml create mode 100644 apps/jupyterhub/extra-rbac.yaml create mode 100644 apps/jupyterhub/helmchart.yaml create mode 100644 apps/jupyterhub/helmrelease.yaml create mode 100644 apps/jupyterhub/helmrepository.yaml create mode 100644 apps/jupyterhub/kustomization.yaml create mode 100644 apps/jupyterhub/namespace.yaml create mode 100644 apps/spegel/helmrelease.yaml create mode 100644 apps/spegel/helmrepository.yaml create mode 100644 apps/spegel/kustomization.yaml create mode 100644 apps/spegel/namespace.yaml create mode 100644 apps/waldur/configmap.yaml create mode 100644 apps/waldur/example-secret.yaml create mode 100644 apps/waldur/helmchart.yaml create mode 100644 apps/waldur/helmrelease.yaml create mode 100644 apps/waldur/helmrepository.yaml create mode 100644 apps/waldur/kustomization.yaml create mode 100644 apps/waldur/namespace.yaml create mode 100644 kustomization.yaml diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml new file mode 100644 index 0000000..0be1121 --- /dev/null +++ b/apps/jupyterhub/configmap.yaml @@ -0,0 +1,178 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: jupyterhub-config + namespace: jupyterhub +data: + values.yaml: | + + # Add JupyterHub customisations here + # See https://artifacthub.io/packages/helm/jupyterhub/jupyterhub + + # We don't need a load balancer for the proxy + # since we want to use ingress instead. + proxy: + service: + type: ClusterIP + + # Make JupyterHub accessible via ingress + ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + # IP must match NGINX ingress controller's + # load balancer IP. + # See `kubectl get svc -n ingress-nginx` + - &host 128-232-226-29.sslip.io + pathSuffix: "" + tls: + - hosts: + - *host + secretName: jupyterhub-ingress-cert + + hub: + allowNamedServers: true + namedServerLimitPerUser: 5 + activeServerLimit: 3 + # Server startup fails with default + # restrictive network policy. 
+ networkPolicy: + enabled: false + + # Configure Keycloak auth + config: + JupyterHub: + authenticator_class: generic-oauth + GenericOAuthenticator: + client_id: scott-jupyterhub-test + # client_secret: + # Must match ingress host + oauth_callback_url: https://128-232-226-29.sslip.io/hub/oauth_callback + authorize_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/auth + token_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/token + userdata_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/userinfo + scope: + - openid + - groups + username_claim: preferred_username + claim_groups_key: groups + userdata_params: + state: state + + # Limit access to specific keycloak groups + allowed_groups: + - /admins + - /platform-users + + # Allow hub admin access to keycloak users/groups + # admin_groups: + # - /admins + admin_users: + - scottd_stack + + # Label for the 'Sign in with ___' button + login_service: Keycloak + + # We install the kubernetes client here so that we can use it to + # detect weather the cluster has any GPU nodes, allowing us to show/hide + # GPU nodebook profiles automatically. + args: + - bash + - -c + - "pip install kubernetes && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py --upgrade-db" + extraConfig: + # Add user profiles dynamically based on cluster hardware + customspawner.py: | + from kubespawner import KubeSpawner + from kubernetes import client, config + + def build_profiles(spawner: KubeSpawner): + profiles = [ + { + "display_name": "Minimal environment", + "description": "To avoid too much bells and whistles: Python.", + "default": True, + }, + { + "display_name": "Datascience environment", + "description": "If you want the additional bells and whistles: Python, R, and Julia.", + "kubespawner_override": { + "image": "quay.io/jupyter/datascience-notebook:2024-08-05" + }, + }, + { + "display_name": "Pytorch environment (CPU)", + "description": "The official Jupyter Pytorch.", + "kubespawner_override": { + "image": "quay.io/jupyter/pytorch-notebook:pytorch-2.4.0", + }, + }, + ] + + config.load_incluster_config() + api = client.CoreV1Api() + nodes = api.list_node().items + + has_nvidia_gpu = lambda node: node.metadata.labels.get("nvidia.com/gpu.present", "") == "true" + has_intel_gpu = lambda node: node.metadata.labels.get("gpu.intel.com/device-id.0380-0bd5.present", "") == "true" + + if any(map(has_nvidia_gpu, nodes)): + profiles.append({ + "display_name": "Pytorch environment (Nvidia GPU)", + "description": "The official Jupyter Pytorch + CUDA image. Requires a GPU compatible notebook server.", + "kubespawner_override": { + "image": "quay.io/jupyter/pytorch-notebook:cuda12-pytorch-2.4.0", + "extra_resource_limits": { + "nvidia.com/gpu": "1", + }, + }, + }) + if any(map(has_intel_gpu, nodes)): + profiles.append({ + "display_name": "Pytorch environment (Intel GPU)", + "description": "A Jupyter + Intel Pytorch image. 
Requires a GPU compatible notebook server.", + "kubespawner_override": { + "image": "ghcr.io/stackhpc/jupyterhub-pytorch-intel-gpu:v0.0.1", + "extra_resource_limits": { + "gpu.intel.com/i915": "1", + }, + "supplemental_gids":[ + "110", # Ubuntu render group GID, requred for permission to use Intel GPU device + ], + }, + }) + + return profiles + + c.KubeSpawner.profile_list = build_profiles + + # Pre-pullers are not useful when profile list + # is built dynamically + prePuller: + hook: + enabled: false + continuous: + enabled: false + + singleuser: + defaultUrl: /lab + # Defines the default image + image: + name: quay.io/jupyter/minimal-notebook + tag: "2024-08-05" + # Build these dynamically in extraConfig above instead + # profileList: + # - display_name: "Minimal environment" + # description: "To avoid too much bells and whistles: Python." + # default: true + # - display_name: "Datascience environment" + # description: "If you want the additional bells and whistles: Python, R, and Julia." + # kubespawner_override: + # image: quay.io/jupyter/datascience-notebook:2024-08-05 + # - display_name: "Spark environment" + # description: "The Jupyter Stacks spark image!" + # kubespawner_override: + # image: quay.io/jupyter/all-spark-notebook:2024-08-05 diff --git a/apps/jupyterhub/example-secret.yaml b/apps/jupyterhub/example-secret.yaml new file mode 100644 index 0000000..6112729 --- /dev/null +++ b/apps/jupyterhub/example-secret.yaml @@ -0,0 +1,19 @@ +--- +############ +# IMPORTANT: Make sure you run kubeseal against any secret before commiting it to git! +# Example command: +# kubeseal \ +# --kubeconfig clusters/jupyterhub/kubeconfig \ +# --format yaml \ +# --controller-name sealed-secrets \ +# --controller-namespace sealed-secrets-system \ +# --secret-file components/jupyterhub/secret.yaml \ +# --sealed-secret-file components/jupyterhub/secret.yaml +############ +apiVersion: v1 +kind: Secret +metadata: + name: jupyterhub-keycloak-config + namespace: jupyterhub +stringData: + keycloakClientSecret: diff --git a/apps/jupyterhub/extra-rbac.yaml b/apps/jupyterhub/extra-rbac.yaml new file mode 100644 index 0000000..4ee2575 --- /dev/null +++ b/apps/jupyterhub/extra-rbac.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: jupyterhub-node-list +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: jupyterhub-node-list +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: jupyterhub-node-list +subjects: +- kind: ServiceAccount + name: hub + namespace: jupyterhub diff --git a/apps/jupyterhub/helmchart.yaml b/apps/jupyterhub/helmchart.yaml new file mode 100644 index 0000000..8d96a03 --- /dev/null +++ b/apps/jupyterhub/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: jupyterhub + namespace: jupyterhub +spec: + chart: jupyterhub + version: "3.3.8" + sourceRef: + kind: HelmRepository + name: jupyterhub + interval: 10m0s diff --git a/apps/jupyterhub/helmrelease.yaml b/apps/jupyterhub/helmrelease.yaml new file mode 100644 index 0000000..f252872 --- /dev/null +++ b/apps/jupyterhub/helmrelease.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: jupyterhub + namespace: jupyterhub +spec: + chartRef: + kind: HelmChart + name: jupyterhub + releaseName: jupyterhub + valuesFrom: + - kind: ConfigMap + name: 
jupyterhub-config + - kind: Secret + name: jupyterhub-keycloak-config + valuesKey: keycloakClientSecret + targetPath: hub.config.GenericOAuthenticator.client_secret + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/jupyterhub/helmrepository.yaml b/apps/jupyterhub/helmrepository.yaml new file mode 100644 index 0000000..2c87a51 --- /dev/null +++ b/apps/jupyterhub/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: jupyterhub + namespace: jupyterhub +spec: + url: https://jupyterhub.github.io/helm-chart + interval: 1h diff --git a/apps/jupyterhub/kustomization.yaml b/apps/jupyterhub/kustomization.yaml new file mode 100644 index 0000000..243a8c0 --- /dev/null +++ b/apps/jupyterhub/kustomization.yaml @@ -0,0 +1,8 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml + - secret.yaml + - extra-rbac.yaml diff --git a/apps/jupyterhub/namespace.yaml b/apps/jupyterhub/namespace.yaml new file mode 100644 index 0000000..241052e --- /dev/null +++ b/apps/jupyterhub/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: jupyterhub diff --git a/apps/spegel/helmrelease.yaml b/apps/spegel/helmrelease.yaml new file mode 100644 index 0000000..e859a31 --- /dev/null +++ b/apps/spegel/helmrelease.yaml @@ -0,0 +1,15 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: spegel + namespace: spegel +spec: + interval: 1m + chart: + spec: + chart: spegel + version: "v0.0.23" + interval: 5m + sourceRef: + kind: HelmRepository + name: spegel diff --git a/apps/spegel/helmrepository.yaml b/apps/spegel/helmrepository.yaml new file mode 100644 index 0000000..62b88ac --- /dev/null +++ b/apps/spegel/helmrepository.yaml @@ -0,0 +1,9 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: spegel + namespace: spegel +spec: + type: "oci" + interval: 5m0s + url: oci://ghcr.io/spegel-org/helm-charts diff --git a/apps/spegel/kustomization.yaml b/apps/spegel/kustomization.yaml new file mode 100644 index 0000000..157da08 --- /dev/null +++ b/apps/spegel/kustomization.yaml @@ -0,0 +1,4 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmrelease.yaml diff --git a/apps/spegel/namespace.yaml b/apps/spegel/namespace.yaml new file mode 100644 index 0000000..2b70b05 --- /dev/null +++ b/apps/spegel/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: spegel diff --git a/apps/waldur/configmap.yaml b/apps/waldur/configmap.yaml new file mode 100644 index 0000000..dde1398 --- /dev/null +++ b/apps/waldur/configmap.yaml @@ -0,0 +1,48 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: waldur-config + namespace: waldur +data: + values.yaml: | + + postgresql: + enabled: true + rabbitmq: + enabled: true + + ### Example config for exposing the Waldur UI via sslio.io + + # NOTE: If using sslip.io, the host IP must match the FIP + # which was assigned to the NGINX ingress controller's + # load balancer. An alternative host name may be used if + # an appropriate DNS entry for the load balancer's IP + # is added to your upstream DNS servers. It is not possible + # to create / manage such a DNS entry via Kubernetes. 
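+
+    # A hedged example (not part of the chart defaults): the load balancer IP
+    # assigned to the NGINX ingress controller can typically be discovered with
+    #   kubectl get svc -n ingress-nginx
+    # (assuming the ingress controller runs in the ingress-nginx namespace, as
+    # in the JupyterHub example above) and then substituted into the sslip.io
+    # hostnames below.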
+ + # apiHostname: 128-232-226-47.sslip.io + # apiScheme: https + # homeportHostname: 128-232-226-47.sslip.io + # homeportScheme: https + # ingress: + # tls: + # enabled: true + # source: letsEncrypt + + ### Example config for Keycloak integration + + # waldur: + # authMethods: + # - LOCAL_SIGNIN + # - SOCIAL_SIGNUP + # socialAuthMethods: + # - label: Keycloak + # provider: keycloak + # # clientId: + # # clientSecret: + # # discoveryUrl: + # managementUrl: "" + # protectedFields: + # - full_name + # - email diff --git a/apps/waldur/example-secret.yaml b/apps/waldur/example-secret.yaml new file mode 100644 index 0000000..82a69a8 --- /dev/null +++ b/apps/waldur/example-secret.yaml @@ -0,0 +1,13 @@ +--- +############ +# IMPORTANT: Make sure you run kubeseal against any secret before commiting it to git! +############ +apiVersion: v1 +kind: Secret +metadata: + name: waldur-keycloak-config + namespace: waldur +stringData: + keycloakClientId: + keycloakClientSecret: + keycloakDiscoveryUrl: /.well-known/openid-configuration diff --git a/apps/waldur/helmchart.yaml b/apps/waldur/helmchart.yaml new file mode 100644 index 0000000..f57a126 --- /dev/null +++ b/apps/waldur/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: waldur + namespace: waldur +spec: + chart: waldur + version: 6.7.3 + sourceRef: + kind: HelmRepository + name: waldur-charts + interval: 1h diff --git a/apps/waldur/helmrelease.yaml b/apps/waldur/helmrelease.yaml new file mode 100644 index 0000000..ec2f3bf --- /dev/null +++ b/apps/waldur/helmrelease.yaml @@ -0,0 +1,40 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: waldur + namespace: waldur +spec: + chartRef: + kind: HelmChart + name: waldur + releaseName: waldur + valuesFrom: + - kind: ConfigMap + name: waldur-config + # - kind: Secret + # name: waldur-keycloak-config + # valuesKey: keycloakClientId + # targetPath: waldur.socialAuthMethods[0].clientId + # - kind: Secret + # name: waldur-keycloak-config + # valuesKey: keycloakClientSecret + # targetPath: waldur.socialAuthMethods[0].clientSecret + # - kind: Secret + # name: waldur-keycloak-config + # valuesKey: keycloakDiscoveryUrl + # targetPath: waldur.socialAuthMethods[0].discoveryUrl + + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m + # Waldur DB migrations during install/upgrade + # can take a while so increase install timeout + timeout: 60m diff --git a/apps/waldur/helmrepository.yaml b/apps/waldur/helmrepository.yaml new file mode 100644 index 0000000..413a05d --- /dev/null +++ b/apps/waldur/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: waldur-charts + namespace: waldur +spec: + url: https://waldur.github.io/waldur-helm + interval: 1h diff --git a/apps/waldur/kustomization.yaml b/apps/waldur/kustomization.yaml new file mode 100644 index 0000000..a62890a --- /dev/null +++ b/apps/waldur/kustomization.yaml @@ -0,0 +1,7 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml + # - secret.yaml diff --git a/apps/waldur/namespace.yaml b/apps/waldur/namespace.yaml new file mode 100644 index 0000000..3457677 --- /dev/null +++ b/apps/waldur/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: waldur diff --git a/kustomization.yaml b/kustomization.yaml new file mode 
100644 index 0000000..3ac1587 --- /dev/null +++ b/kustomization.yaml @@ -0,0 +1,4 @@ +--- +resources: + # - apps/spegel + # - apps/jupyterhub From 7604cea4afa4881ce315be6d8f644b44b918a04d Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 24 Jan 2025 18:28:11 +0000 Subject: [PATCH 02/46] Add initial README file --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index 4293566..01f8e0e 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,19 @@ # fluxcd-demo-apps A repository of example apps deployed and managed using Flux CD + +## Creating Sealed Secrets + +We assume the use of sealed secrets. + +TODO: add more instructions! + +## How to install + +The host cluster must have the [Flux CD](https://fluxcd.io/) controllers installed. + +Configuring Flux to manage the apps defined in the repository is a one-time operation: + +```sh +flux create source git myapps --url= --branch=main +flux create kustomization myapps --source=GitRepository/myapps --prune=true +``` From 3c45ec7063c2255e2828b4fd5c2437bcf284a57f Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 24 Jan 2025 18:34:06 +0000 Subject: [PATCH 03/46] Turn on apps --- kustomization.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kustomization.yaml b/kustomization.yaml index 3ac1587..0dcb967 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -1,4 +1,4 @@ --- resources: - # - apps/spegel - # - apps/jupyterhub + - apps/spegel + - apps/jupyterhub From 1246d54facb6d78b668e73050cb4c2207d16d42c Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 24 Jan 2025 18:36:50 +0000 Subject: [PATCH 04/46] Remove secret --- apps/jupyterhub/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/jupyterhub/kustomization.yaml b/apps/jupyterhub/kustomization.yaml index 243a8c0..620cff3 100644 --- a/apps/jupyterhub/kustomization.yaml +++ b/apps/jupyterhub/kustomization.yaml @@ -4,5 +4,5 @@ resources: - helmchart.yaml - helmrelease.yaml - configmap.yaml - - secret.yaml + # - secret.yaml - extra-rbac.yaml From 9dbe2ff1d87c7d810b054611d3f1dd75ddb1ebbf Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 24 Jan 2025 18:42:15 +0000 Subject: [PATCH 05/46] Attempt to fix juypterhub deploy --- apps/jupyterhub/configmap.yaml | 70 +++++++++++++++++----------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index 0be1121..8b809f7 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -18,7 +18,7 @@ data: # Make JupyterHub accessible via ingress ingress: - enabled: true + enabled: false ingressClassName: nginx annotations: cert-manager.io/cluster-issuer: letsencrypt-prod @@ -42,39 +42,39 @@ data: networkPolicy: enabled: false - # Configure Keycloak auth - config: - JupyterHub: - authenticator_class: generic-oauth - GenericOAuthenticator: - client_id: scott-jupyterhub-test - # client_secret: - # Must match ingress host - oauth_callback_url: https://128-232-226-29.sslip.io/hub/oauth_callback - authorize_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/auth - token_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/token - userdata_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/userinfo - scope: - - openid - - groups - username_claim: preferred_username - claim_groups_key: groups - 
userdata_params: - state: state - - # Limit access to specific keycloak groups - allowed_groups: - - /admins - - /platform-users - - # Allow hub admin access to keycloak users/groups - # admin_groups: - # - /admins - admin_users: - - scottd_stack - - # Label for the 'Sign in with ___' button - login_service: Keycloak + # # Configure Keycloak auth + # config: + # JupyterHub: + # authenticator_class: generic-oauth + # GenericOAuthenticator: + # client_id: scott-jupyterhub-test + # # client_secret: + # # Must match ingress host + # oauth_callback_url: https://128-232-226-29.sslip.io/hub/oauth_callback + # authorize_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/auth + # token_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/token + # userdata_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/userinfo + # scope: + # - openid + # - groups + # username_claim: preferred_username + # claim_groups_key: groups + # userdata_params: + # state: state + + # # Limit access to specific keycloak groups + # allowed_groups: + # - /admins + # - /platform-users + + # # Allow hub admin access to keycloak users/groups + # # admin_groups: + # # - /admins + # admin_users: + # - scottd_stack + + # # Label for the 'Sign in with ___' button + # login_service: Keycloak # We install the kubernetes client here so that we can use it to # detect weather the cluster has any GPU nodes, allowing us to show/hide @@ -162,7 +162,7 @@ data: # Defines the default image image: name: quay.io/jupyter/minimal-notebook - tag: "2024-08-05" + tag: "2025-01-20" # Build these dynamically in extraConfig above instead # profileList: # - display_name: "Minimal environment" From c4e7645cd5a0c1cfe8fccdaad88df9992ff5ff90 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 24 Jan 2025 18:52:42 +0000 Subject: [PATCH 06/46] Actually remove secret --- apps/jupyterhub/helmrelease.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/jupyterhub/helmrelease.yaml b/apps/jupyterhub/helmrelease.yaml index f252872..01c297b 100644 --- a/apps/jupyterhub/helmrelease.yaml +++ b/apps/jupyterhub/helmrelease.yaml @@ -12,10 +12,10 @@ spec: valuesFrom: - kind: ConfigMap name: jupyterhub-config - - kind: Secret - name: jupyterhub-keycloak-config - valuesKey: keycloakClientSecret - targetPath: hub.config.GenericOAuthenticator.client_secret + # - kind: Secret + # name: jupyterhub-keycloak-config + # valuesKey: keycloakClientSecret + # targetPath: hub.config.GenericOAuthenticator.client_secret install: createNamespace: true remediation: From 417c62452a5bb8001be2296cf3838b8cbe0081d6 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 16:35:43 +0000 Subject: [PATCH 07/46] Initial slurm-operator deploy --- apps/slinky-slurm-operator/defaults.yaml | 2 ++ apps/slinky-slurm-operator/helmrelease.yaml | 23 +++++++++++++++++++ apps/slinky-slurm-operator/kustomization.yaml | 15 ++++++++++++ .../kustomizeconfig.yaml | 7 ++++++ apps/slinky-slurm-operator/ocirepository.yaml | 10 ++++++++ 5 files changed, 57 insertions(+) create mode 100644 apps/slinky-slurm-operator/defaults.yaml create mode 100644 apps/slinky-slurm-operator/helmrelease.yaml create mode 100644 apps/slinky-slurm-operator/kustomization.yaml create mode 100644 apps/slinky-slurm-operator/kustomizeconfig.yaml create mode 100644 apps/slinky-slurm-operator/ocirepository.yaml diff --git a/apps/slinky-slurm-operator/defaults.yaml 
b/apps/slinky-slurm-operator/defaults.yaml new file mode 100644 index 0000000..ee236d3 --- /dev/null +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -0,0 +1,2 @@ +--- +todo: false \ No newline at end of file diff --git a/apps/slinky-slurm-operator/helmrelease.yaml b/apps/slinky-slurm-operator/helmrelease.yaml new file mode 100644 index 0000000..84cebf5 --- /dev/null +++ b/apps/slinky-slurm-operator/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: slinky-slurm-operator +spec: + chartRef: + kind: OCIRepository + name: slinky-slurm-operator + valuesFrom: + - kind: ConfigMap + name: slinky-slurm-operator-defaults + valuesKey: values.yaml + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/slinky-slurm-operator/kustomization.yaml b/apps/slinky-slurm-operator/kustomization.yaml new file mode 100644 index 0000000..f5081b7 --- /dev/null +++ b/apps/slinky-slurm-operator/kustomization.yaml @@ -0,0 +1,15 @@ +--- +namespace: slinky-slurm-operator + +configurations: + - kustomizeconfig.yaml + +configMapGenerator: + - name: slinky-slurm-operator-defaults + files: + - values.yaml=defaults.yaml + +resources: + - namespace.yaml + - ocirepository.yaml + - helmrelease.yaml diff --git a/apps/slinky-slurm-operator/kustomizeconfig.yaml b/apps/slinky-slurm-operator/kustomizeconfig.yaml new file mode 100644 index 0000000..26387e1 --- /dev/null +++ b/apps/slinky-slurm-operator/kustomizeconfig.yaml @@ -0,0 +1,7 @@ +# Make sure configmap references in HelmReleases are updated +nameReference: + - kind: ConfigMap + version: v1 + fieldSpecs: + - path: spec/valuesFrom/name + kind: HelmRelease diff --git a/apps/slinky-slurm-operator/ocirepository.yaml b/apps/slinky-slurm-operator/ocirepository.yaml new file mode 100644 index 0000000..a54c2c0 --- /dev/null +++ b/apps/slinky-slurm-operator/ocirepository.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: slinky-slurm-operator +spec: + interval: 10m + url: oci://ghcr.io/slinkyproject/charts/slurm-operator + ref: + semver: "0.1.0" \ No newline at end of file From b2562b8f89a37d863a6c873e08bb1eff5f5e82e0 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 16:53:07 +0000 Subject: [PATCH 08/46] Get initial helm deploy of slurm operator up --- apps/cert-manager/configmap.yaml | 9 ++++++++ apps/cert-manager/helmchart.yaml | 13 ++++++++++++ apps/cert-manager/helmrelease.yaml | 25 +++++++++++++++++++++++ apps/cert-manager/helmrepository.yaml | 9 ++++++++ apps/cert-manager/kustomization.yaml | 6 ++++++ apps/cert-manager/namespace.yaml | 5 +++++ apps/sealed-secrets/helmchart.yaml | 13 ++++++++++++ apps/sealed-secrets/helmrelease.yaml | 21 +++++++++++++++++++ apps/sealed-secrets/helmrepository.yaml | 9 ++++++++ apps/sealed-secrets/kustomization.yaml | 5 +++++ apps/sealed-secrets/namespace.yaml | 5 +++++ apps/slinky-slurm-operator/defaults.yaml | 4 +++- apps/slinky-slurm-operator/namespace.yaml | 5 +++++ apps/slinky/kustomization.yaml | 4 ++++ 14 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 apps/cert-manager/configmap.yaml create mode 100644 apps/cert-manager/helmchart.yaml create mode 100644 apps/cert-manager/helmrelease.yaml create mode 100644 apps/cert-manager/helmrepository.yaml create mode 100644 apps/cert-manager/kustomization.yaml create mode 100644 apps/cert-manager/namespace.yaml create mode 
100644 apps/sealed-secrets/helmchart.yaml create mode 100644 apps/sealed-secrets/helmrelease.yaml create mode 100644 apps/sealed-secrets/helmrepository.yaml create mode 100644 apps/sealed-secrets/kustomization.yaml create mode 100644 apps/sealed-secrets/namespace.yaml create mode 100644 apps/slinky-slurm-operator/namespace.yaml create mode 100644 apps/slinky/kustomization.yaml diff --git a/apps/cert-manager/configmap.yaml b/apps/cert-manager/configmap.yaml new file mode 100644 index 0000000..7acc97b --- /dev/null +++ b/apps/cert-manager/configmap.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: cert-manager-config + namespace: cert-manager +data: + values.yaml: | + installCRDs: true diff --git a/apps/cert-manager/helmchart.yaml b/apps/cert-manager/helmchart.yaml new file mode 100644 index 0000000..f08fabb --- /dev/null +++ b/apps/cert-manager/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: cert-manager + namespace: cert-manager +spec: + chart: cert-manager + version: v1.16.1 + sourceRef: + kind: HelmRepository + name: jetstack + interval: 1h diff --git a/apps/cert-manager/helmrelease.yaml b/apps/cert-manager/helmrelease.yaml new file mode 100644 index 0000000..b3e258e --- /dev/null +++ b/apps/cert-manager/helmrelease.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: cert-manager + namespace: cert-manager +spec: + chartRef: + kind: HelmChart + name: cert-manager + releaseName: cert-manager + valuesFrom: + - kind: ConfigMap + name: cert-manager-config + valuesKey: values.yaml + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/cert-manager/helmrepository.yaml b/apps/cert-manager/helmrepository.yaml new file mode 100644 index 0000000..a0fd9a2 --- /dev/null +++ b/apps/cert-manager/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: jetstack + namespace: cert-manager +spec: + url: https://charts.jetstack.io + interval: 1h diff --git a/apps/cert-manager/kustomization.yaml b/apps/cert-manager/kustomization.yaml new file mode 100644 index 0000000..ebd317e --- /dev/null +++ b/apps/cert-manager/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - namespace.yaml + - configmap.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml diff --git a/apps/cert-manager/namespace.yaml b/apps/cert-manager/namespace.yaml new file mode 100644 index 0000000..6bc19f4 --- /dev/null +++ b/apps/cert-manager/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: cert-manager diff --git a/apps/sealed-secrets/helmchart.yaml b/apps/sealed-secrets/helmchart.yaml new file mode 100644 index 0000000..b3e3599 --- /dev/null +++ b/apps/sealed-secrets/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: sealed-secrets + namespace: sealed-secrets-system +spec: + chart: sealed-secrets + version: 2.16.1 + sourceRef: + kind: HelmRepository + name: sealed-secrets + interval: 1h diff --git a/apps/sealed-secrets/helmrelease.yaml b/apps/sealed-secrets/helmrelease.yaml new file mode 100644 index 0000000..b5283e7 --- /dev/null +++ b/apps/sealed-secrets/helmrelease.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: sealed-secrets + namespace: 
sealed-secrets-system +spec: + chartRef: + kind: HelmChart + name: sealed-secrets + releaseName: sealed-secrets + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/sealed-secrets/helmrepository.yaml b/apps/sealed-secrets/helmrepository.yaml new file mode 100644 index 0000000..8434309 --- /dev/null +++ b/apps/sealed-secrets/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: sealed-secrets + namespace: sealed-secrets-system +spec: + url: https://bitnami-labs.github.io/sealed-secrets + interval: 1h diff --git a/apps/sealed-secrets/kustomization.yaml b/apps/sealed-secrets/kustomization.yaml new file mode 100644 index 0000000..c4286f8 --- /dev/null +++ b/apps/sealed-secrets/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml diff --git a/apps/sealed-secrets/namespace.yaml b/apps/sealed-secrets/namespace.yaml new file mode 100644 index 0000000..46be057 --- /dev/null +++ b/apps/sealed-secrets/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: sealed-secrets-system diff --git a/apps/slinky-slurm-operator/defaults.yaml b/apps/slinky-slurm-operator/defaults.yaml index ee236d3..2c50cc0 100644 --- a/apps/slinky-slurm-operator/defaults.yaml +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -1,2 +1,4 @@ --- -todo: false \ No newline at end of file +image: + repository: ghcr.io/slinkyproject/slurm-operator + tag: "0.1.0" \ No newline at end of file diff --git a/apps/slinky-slurm-operator/namespace.yaml b/apps/slinky-slurm-operator/namespace.yaml new file mode 100644 index 0000000..edf7d98 --- /dev/null +++ b/apps/slinky-slurm-operator/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: slinky-slurm-operator diff --git a/apps/slinky/kustomization.yaml b/apps/slinky/kustomization.yaml new file mode 100644 index 0000000..a60baa0 --- /dev/null +++ b/apps/slinky/kustomization.yaml @@ -0,0 +1,4 @@ +--- +resources: + - ../cert-manager/ + - ../slinky-slurm-operator/ \ No newline at end of file From d8a576c14e8c7ab8f012499c4a910005034e1fe9 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 17:29:53 +0000 Subject: [PATCH 09/46] Initial attempt to deploy slurm control plane --- apps/slinky-slurm-controlplane/defaults.yaml | 2 ++ .../helmrelease.yaml | 23 ++++++++++++++ .../kustomization.yaml | 15 ++++++++++ .../kustomizeconfig.yaml | 7 +++++ apps/slinky-slurm-controlplane/namespace.yaml | 5 ++++ .../ocirepository.yaml | 10 +++++++ apps/slinky-slurm-operator/defaults.yaml | 2 +- apps/slinky-slurm-operator/ocirepository.yaml | 2 +- apps/slinky/README.md | 30 +++++++++++++++++++ apps/slinky/kustomization.yaml | 3 +- 10 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 apps/slinky-slurm-controlplane/defaults.yaml create mode 100644 apps/slinky-slurm-controlplane/helmrelease.yaml create mode 100644 apps/slinky-slurm-controlplane/kustomization.yaml create mode 100644 apps/slinky-slurm-controlplane/kustomizeconfig.yaml create mode 100644 apps/slinky-slurm-controlplane/namespace.yaml create mode 100644 apps/slinky-slurm-controlplane/ocirepository.yaml create mode 100644 apps/slinky/README.md diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml new file mode 100644 index 0000000..46714a2 --- /dev/null +++ 
b/apps/slinky-slurm-controlplane/defaults.yaml @@ -0,0 +1,2 @@ +--- +todo: false diff --git a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml new file mode 100644 index 0000000..8692fa8 --- /dev/null +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: slinky-slurm-controlplane +spec: + chartRef: + kind: OCIRepository + name: slinky-slurm + valuesFrom: + - kind: ConfigMap + name: slinky-slurm-defaults + valuesKey: values.yaml + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/slinky-slurm-controlplane/kustomization.yaml b/apps/slinky-slurm-controlplane/kustomization.yaml new file mode 100644 index 0000000..16e73ef --- /dev/null +++ b/apps/slinky-slurm-controlplane/kustomization.yaml @@ -0,0 +1,15 @@ +--- +namespace: slinky-slurm-controlplane + +configurations: + - kustomizeconfig.yaml + +configMapGenerator: + - name: slinky-slurm-defaults + files: + - values.yaml=defaults.yaml + +resources: + - namespace.yaml + - ocirepository.yaml + - helmrelease.yaml diff --git a/apps/slinky-slurm-controlplane/kustomizeconfig.yaml b/apps/slinky-slurm-controlplane/kustomizeconfig.yaml new file mode 100644 index 0000000..26387e1 --- /dev/null +++ b/apps/slinky-slurm-controlplane/kustomizeconfig.yaml @@ -0,0 +1,7 @@ +# Make sure configmap references in HelmReleases are updated +nameReference: + - kind: ConfigMap + version: v1 + fieldSpecs: + - path: spec/valuesFrom/name + kind: HelmRelease diff --git a/apps/slinky-slurm-controlplane/namespace.yaml b/apps/slinky-slurm-controlplane/namespace.yaml new file mode 100644 index 0000000..931d428 --- /dev/null +++ b/apps/slinky-slurm-controlplane/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: slinky-slurm-controlplane diff --git a/apps/slinky-slurm-controlplane/ocirepository.yaml b/apps/slinky-slurm-controlplane/ocirepository.yaml new file mode 100644 index 0000000..8326499 --- /dev/null +++ b/apps/slinky-slurm-controlplane/ocirepository.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: slinky-slurm +spec: + interval: 10m + url: oci://ghcr.io/slinkyproject/charts/slurm + ref: + semver: "0.1.0" diff --git a/apps/slinky-slurm-operator/defaults.yaml b/apps/slinky-slurm-operator/defaults.yaml index 2c50cc0..230691d 100644 --- a/apps/slinky-slurm-operator/defaults.yaml +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -1,4 +1,4 @@ --- image: repository: ghcr.io/slinkyproject/slurm-operator - tag: "0.1.0" \ No newline at end of file + tag: "0.1.0" diff --git a/apps/slinky-slurm-operator/ocirepository.yaml b/apps/slinky-slurm-operator/ocirepository.yaml index a54c2c0..d8ae9ab 100644 --- a/apps/slinky-slurm-operator/ocirepository.yaml +++ b/apps/slinky-slurm-operator/ocirepository.yaml @@ -7,4 +7,4 @@ spec: interval: 10m url: oci://ghcr.io/slinkyproject/charts/slurm-operator ref: - semver: "0.1.0" \ No newline at end of file + semver: "0.1.0" diff --git a/apps/slinky/README.md b/apps/slinky/README.md new file mode 100644 index 0000000..4d21fc3 --- /dev/null +++ b/apps/slinky/README.md @@ -0,0 +1,30 @@ +# Slinky setup + +Based on: +https://github.com/SlinkyProject/slurm-operator/blob/main/docs/user/quickstart.md + +# Testing Slinky Slurm + +To test Slurm functionality, connect to the controller to use Slurm 
client +commands: + +```sh +kubectl --namespace=slurm exec -it statefulsets/slurm-controller -- bash --login +``` + +On the controller pod (e.g. host `slurm@slurm-controller-0`), run the following +commands to quickly test Slurm is functioning: + +```sh +sinfo +srun hostname +sbatch --wrap="sleep 60" +squeue +``` + +See [Slurm Commands][slurm-commands] for more details on how to interact with +Slurm. + + + +[slurm-commands]: https://slurm.schedmd.com/quickstart.html#commands \ No newline at end of file diff --git a/apps/slinky/kustomization.yaml b/apps/slinky/kustomization.yaml index a60baa0..b562569 100644 --- a/apps/slinky/kustomization.yaml +++ b/apps/slinky/kustomization.yaml @@ -1,4 +1,5 @@ --- resources: - ../cert-manager/ - - ../slinky-slurm-operator/ \ No newline at end of file + - ../slinky-slurm-operator/ + - ../slinky-slurm-controlplane/ From 729c6eb27661c449f618aa3bd680a8eab7a2baea Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 18:04:56 +0000 Subject: [PATCH 10/46] Fix storage class for slurm pvc --- apps/slinky-slurm-controlplane/defaults.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index 46714a2..d8725af 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -1,2 +1,4 @@ --- -todo: false +controller: + persistence: + storageClass: csi-cinder From 7cc4a62ce255cab40c1b13638f28320d5d3b1ef3 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 18:32:58 +0000 Subject: [PATCH 11/46] Add nodesets --- apps/slinky-slurm-operator/defaults.yaml | 212 +++++++++++++++++++++++ apps/slinky/README.md | 2 +- 2 files changed, 213 insertions(+), 1 deletion(-) diff --git a/apps/slinky-slurm-operator/defaults.yaml b/apps/slinky-slurm-operator/defaults.yaml index 230691d..9c019b1 100644 --- a/apps/slinky-slurm-operator/defaults.yaml +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -2,3 +2,215 @@ image: repository: ghcr.io/slinkyproject/slurm-operator tag: "0.1.0" + +# +# Slurm compute (slurmd) configurations. +compute: + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Default image for the nodeset pod (slurmd) + # Each nodeset may override this setting. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmd + # + # -- (string) + # Set the image tag to use. + # @default -- The Release appVersion. + tag: 24.05-ubuntu-24.04 + # + # -- (list) + # Slurm NodeSets by object list. + nodesets: + # + # -- (string) + # Name of NodeSet. Must be unique. + - name: debug + # + # -- (bool) + # Enables the NodeSet in Slurm. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). + replicas: 1 + # + # -- (int) + # The minimum number of seconds for which a newly created NodeSet Pod should be ready + # without any of its container crashing, for it to be considered available. + minReadySeconds: 0 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: "" + # + # -- (string) + # Set the image tag to use. + tag: "" + # + # -- (string) + # Set the priority class to use. 
+ # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + cpu: 1 + memory: 1Gi + # + # -- (map) + # Selector which must match a node's labels for the pod to be scheduled on that node. + nodeSelector: + kubernetes.io/os: linux + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: "kubernetes.io/os" + # operator: In + # values: + # - linux + # podAntiAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # - topologyKey: "kubernetes.io/hostname" + # labelSelector: + # matchExpressions: + # - key: "app.kubernetes.io/name" + # operator: In + # values: + # - slurmctld + # - slurmdbd + # - slurmrestd + # + # -- (object) + # Set the update strategy configuration. + updateStrategy: + # + # -- (string) + # Set the update strategy type. + # Can be either: "RollingUpdate"; "OnDelete". + type: RollingUpdate + # + # -- (object) + # Define the rolling update policy. + # Only used when "updateStrategy.type=RollingUpdate". + rollingUpdate: + # + # -- (string) + # The maximum number of pods that can be unavailable during the update. + # Value can be an absolute number (ex: 5) or a percentage of desired + # pods (ex: 10%). Absolute number is calculated from percentage by + # rounding up. This can not be 0. Defaults to 1. + maxUnavailable: 20% + # + # -- (int) + # Partition indicates the number of NodeSet pods that should be + # not be updated to the latest version. + partition: 0 + # + # -- (bool) + # Pause will halt rollingUpdate while this value is true. + paused: false + # + # -- (object) + # The policy used for PVCs created from the NodeSet VolumeClaimTemplates. + persistentVolumeClaimRetentionPolicy: + # + # -- (string) + # WhenDeleted specifies what happens to PVCs created from NodeSet + # VolumeClaimTemplates when the NodeSet is deleted. The default policy + # of `Retain` causes PVCs to not be affected by NodeSet deletion. The + # `Delete` policy causes those PVCs to be deleted. + whenDeleted: Retain + # + # --(list) + # List of claims that pods are allowed to reference. + # The NodeSet controller is responsible for mapping network identities to + # claims in a way that maintains the identity of a pod. + volumeClaimTemplates: [] + # - metadata: + # name: data + # spec: + # storageClassName: standard + # mountPath: /mnt/data + # accessModes: + # - ReadWriteOnce + # resources: + # requests: + # storage: 1Gi + # + # -- (object) + # Partition describes the partition created specifically for this NodeSet to be added. + partition: + # + # -- (bool) + # Enables this NodeSet's partition line to be added in Slurm. + enabled: true + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + MaxTime=INFINITE + # + # -- (string) + # Set Slurm node GRES. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1 + nodeGres: "" + # + # -- (list) + # Set Slurm node Features as a list(string). 
+ # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Features + nodeFeatures: [] + # + # -- (string) + # Set Slurm node weight for Slurm scheduling. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Weight + nodeWeight: 1 + # + # -- (list) + # Slurm Partitions by object list. + partitions: + # + # -- (string) + # Name of Partition. Must be unique. + - name: all + # + # -- (bool) + # Enables the partition in Slurm. + enabled: true + # + # -- (list) + # NodeSets to put into this Partition by name/key. + # NOTE: 'ALL' is a Slurm meta value to mean all nodes in the system. + nodesets: + - ALL + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + Default=YES + MaxTime=INFINITE diff --git a/apps/slinky/README.md b/apps/slinky/README.md index 4d21fc3..0e12119 100644 --- a/apps/slinky/README.md +++ b/apps/slinky/README.md @@ -9,7 +9,7 @@ To test Slurm functionality, connect to the controller to use Slurm client commands: ```sh -kubectl --namespace=slurm exec -it statefulsets/slurm-controller -- bash --login +kubectl -n slinky-slurm-controlplane exec -it statefulsets/slinky-slurm-controlplane-controller -- bash --login ``` On the controller pod (e.g. host `slurm@slurm-controller-0`), run the following From cc7f536688aa458c84c9bc356df799d25659722f Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 18:49:56 +0000 Subject: [PATCH 12/46] Update slurm deploy timeout to 10mins --- apps/slinky-slurm-controlplane/helmrelease.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml index 8692fa8..5002f07 100644 --- a/apps/slinky-slurm-controlplane/helmrelease.yaml +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -20,4 +20,5 @@ spec: retries: -1 driftDetection: mode: enabled - interval: 5m + interval: 10m + timeout: 10m From b4d08854492b1bbe6387d6bfb7b41a48ed3ea122 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 19:07:30 +0000 Subject: [PATCH 13/46] Fix up where nodesets are defined --- apps/slinky-slurm-controlplane/defaults.yaml | 212 +++++++++++++++++++ apps/slinky-slurm-operator/defaults.yaml | 212 ------------------- 2 files changed, 212 insertions(+), 212 deletions(-) diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index d8725af..3e50551 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -2,3 +2,215 @@ controller: persistence: storageClass: csi-cinder + +# +# Slurm compute (slurmd) configurations. +compute: + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Default image for the nodeset pod (slurmd) + # Each nodeset may override this setting. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmd + # + # -- (string) + # Set the image tag to use. + # @default -- The Release appVersion. + tag: 24.05-ubuntu-24.04 + # + # -- (list) + # Slurm NodeSets by object list. + nodesets: + # + # -- (string) + # Name of NodeSet. Must be unique. + - name: debug + # + # -- (bool) + # Enables the NodeSet in Slurm. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). 
+ replicas: 1 + # + # -- (int) + # The minimum number of seconds for which a newly created NodeSet Pod should be ready + # without any of its container crashing, for it to be considered available. + minReadySeconds: 0 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: "" + # + # -- (string) + # Set the image tag to use. + tag: "" + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + cpu: 1 + memory: 1Gi + # + # -- (map) + # Selector which must match a node's labels for the pod to be scheduled on that node. + nodeSelector: + kubernetes.io/os: linux + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: "kubernetes.io/os" + # operator: In + # values: + # - linux + # podAntiAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # - topologyKey: "kubernetes.io/hostname" + # labelSelector: + # matchExpressions: + # - key: "app.kubernetes.io/name" + # operator: In + # values: + # - slurmctld + # - slurmdbd + # - slurmrestd + # + # -- (object) + # Set the update strategy configuration. + updateStrategy: + # + # -- (string) + # Set the update strategy type. + # Can be either: "RollingUpdate"; "OnDelete". + type: RollingUpdate + # + # -- (object) + # Define the rolling update policy. + # Only used when "updateStrategy.type=RollingUpdate". + rollingUpdate: + # + # -- (string) + # The maximum number of pods that can be unavailable during the update. + # Value can be an absolute number (ex: 5) or a percentage of desired + # pods (ex: 10%). Absolute number is calculated from percentage by + # rounding up. This can not be 0. Defaults to 1. + maxUnavailable: 20% + # + # -- (int) + # Partition indicates the number of NodeSet pods that should be + # not be updated to the latest version. + partition: 0 + # + # -- (bool) + # Pause will halt rollingUpdate while this value is true. + paused: false + # + # -- (object) + # The policy used for PVCs created from the NodeSet VolumeClaimTemplates. + persistentVolumeClaimRetentionPolicy: + # + # -- (string) + # WhenDeleted specifies what happens to PVCs created from NodeSet + # VolumeClaimTemplates when the NodeSet is deleted. The default policy + # of `Retain` causes PVCs to not be affected by NodeSet deletion. The + # `Delete` policy causes those PVCs to be deleted. + whenDeleted: Retain + # + # --(list) + # List of claims that pods are allowed to reference. + # The NodeSet controller is responsible for mapping network identities to + # claims in a way that maintains the identity of a pod. + volumeClaimTemplates: [] + # - metadata: + # name: data + # spec: + # storageClassName: standard + # mountPath: /mnt/data + # accessModes: + # - ReadWriteOnce + # resources: + # requests: + # storage: 1Gi + # + # -- (object) + # Partition describes the partition created specifically for this NodeSet to be added. 
+ partition: + # + # -- (bool) + # Enables this NodeSet's partition line to be added in Slurm. + enabled: true + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + MaxTime=INFINITE + # + # -- (string) + # Set Slurm node GRES. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1 + nodeGres: "" + # + # -- (list) + # Set Slurm node Features as a list(string). + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Features + nodeFeatures: [] + # + # -- (string) + # Set Slurm node weight for Slurm scheduling. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Weight + nodeWeight: 1 + # + # -- (list) + # Slurm Partitions by object list. + partitions: + # + # -- (string) + # Name of Partition. Must be unique. + - name: all + # + # -- (bool) + # Enables the partition in Slurm. + enabled: true + # + # -- (list) + # NodeSets to put into this Partition by name/key. + # NOTE: 'ALL' is a Slurm meta value to mean all nodes in the system. + nodesets: + - ALL + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + Default=YES + MaxTime=INFINITE diff --git a/apps/slinky-slurm-operator/defaults.yaml b/apps/slinky-slurm-operator/defaults.yaml index 9c019b1..230691d 100644 --- a/apps/slinky-slurm-operator/defaults.yaml +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -2,215 +2,3 @@ image: repository: ghcr.io/slinkyproject/slurm-operator tag: "0.1.0" - -# -# Slurm compute (slurmd) configurations. -compute: - # - # -- (string) - # Set the image pull policy. - imagePullPolicy: IfNotPresent - # - # Default image for the nodeset pod (slurmd) - # Each nodeset may override this setting. - image: - # - # -- (string) - # Set the image repository to use. - repository: ghcr.io/slinkyproject/slurmd - # - # -- (string) - # Set the image tag to use. - # @default -- The Release appVersion. - tag: 24.05-ubuntu-24.04 - # - # -- (list) - # Slurm NodeSets by object list. - nodesets: - # - # -- (string) - # Name of NodeSet. Must be unique. - - name: debug - # - # -- (bool) - # Enables the NodeSet in Slurm. - enabled: true - # - # -- (integer) - # Set the number of replicas to deploy. - # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). - replicas: 1 - # - # -- (int) - # The minimum number of seconds for which a newly created NodeSet Pod should be ready - # without any of its container crashing, for it to be considered available. - minReadySeconds: 0 - # - # -- (string) - # Set the image pull policy. - imagePullPolicy: IfNotPresent - # - # Set the image to use. - image: - # - # -- (string) - # Set the image repository to use. - repository: "" - # - # -- (string) - # Set the image tag to use. - tag: "" - # - # -- (string) - # Set the priority class to use. - # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass - priorityClassName: "" - # - # -- (object) - # Set container resource requests and limits for Kubernetes Pod scheduling. - # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container - resources: - limits: - cpu: 1 - memory: 1Gi - # - # -- (map) - # Selector which must match a node's labels for the pod to be scheduled on that node. 
- nodeSelector: - kubernetes.io/os: linux - # - # -- (object) - # Set affinity for Kubernetes Pod scheduling. - affinity: {} - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: "kubernetes.io/os" - # operator: In - # values: - # - linux - # podAntiAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # - topologyKey: "kubernetes.io/hostname" - # labelSelector: - # matchExpressions: - # - key: "app.kubernetes.io/name" - # operator: In - # values: - # - slurmctld - # - slurmdbd - # - slurmrestd - # - # -- (object) - # Set the update strategy configuration. - updateStrategy: - # - # -- (string) - # Set the update strategy type. - # Can be either: "RollingUpdate"; "OnDelete". - type: RollingUpdate - # - # -- (object) - # Define the rolling update policy. - # Only used when "updateStrategy.type=RollingUpdate". - rollingUpdate: - # - # -- (string) - # The maximum number of pods that can be unavailable during the update. - # Value can be an absolute number (ex: 5) or a percentage of desired - # pods (ex: 10%). Absolute number is calculated from percentage by - # rounding up. This can not be 0. Defaults to 1. - maxUnavailable: 20% - # - # -- (int) - # Partition indicates the number of NodeSet pods that should be - # not be updated to the latest version. - partition: 0 - # - # -- (bool) - # Pause will halt rollingUpdate while this value is true. - paused: false - # - # -- (object) - # The policy used for PVCs created from the NodeSet VolumeClaimTemplates. - persistentVolumeClaimRetentionPolicy: - # - # -- (string) - # WhenDeleted specifies what happens to PVCs created from NodeSet - # VolumeClaimTemplates when the NodeSet is deleted. The default policy - # of `Retain` causes PVCs to not be affected by NodeSet deletion. The - # `Delete` policy causes those PVCs to be deleted. - whenDeleted: Retain - # - # --(list) - # List of claims that pods are allowed to reference. - # The NodeSet controller is responsible for mapping network identities to - # claims in a way that maintains the identity of a pod. - volumeClaimTemplates: [] - # - metadata: - # name: data - # spec: - # storageClassName: standard - # mountPath: /mnt/data - # accessModes: - # - ReadWriteOnce - # resources: - # requests: - # storage: 1Gi - # - # -- (object) - # Partition describes the partition created specifically for this NodeSet to be added. - partition: - # - # -- (bool) - # Enables this NodeSet's partition line to be added in Slurm. - enabled: true - # - # -- (string) - # Extra Slurm partition configuration appended onto the partition line. - # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI - config: >- - State=UP - MaxTime=INFINITE - # - # -- (string) - # Set Slurm node GRES. - # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1 - nodeGres: "" - # - # -- (list) - # Set Slurm node Features as a list(string). - # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Features - nodeFeatures: [] - # - # -- (string) - # Set Slurm node weight for Slurm scheduling. - # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Weight - nodeWeight: 1 - # - # -- (list) - # Slurm Partitions by object list. - partitions: - # - # -- (string) - # Name of Partition. Must be unique. - - name: all - # - # -- (bool) - # Enables the partition in Slurm. - enabled: true - # - # -- (list) - # NodeSets to put into this Partition by name/key. - # NOTE: 'ALL' is a Slurm meta value to mean all nodes in the system. 
- nodesets: - - ALL - # - # -- (string) - # Extra Slurm partition configuration appended onto the partition line. - # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI - config: >- - State=UP - Default=YES - MaxTime=INFINITE From fd0f6c257077aecaea292eb83c7f7a96b2ccb6ad Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 19:31:14 +0000 Subject: [PATCH 14/46] Apply the example values --- apps/slinky-slurm-controlplane/defaults.yaml | 387 +++++++++++++++++++ apps/slinky-slurm-operator/defaults.yaml | 157 ++++++++ 2 files changed, 544 insertions(+) diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index 3e50551..717caac 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -1,7 +1,245 @@ --- +# +# Debug configuration. +# @ignored +debug: + # + # -- (bool) + # Enables debug configuration. + enabled: false + # + # -- (bool) + # Allow a locally running operator to communicate with slurm cluster via port-forward. + # NOTE: use when running the operator in a local debugger. + localOperator: true + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Set the secrets for image pull. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Slurm JWT authentication. +jwt: + # + # JWT hs256 configurations. + hs256: + # + # -- (string) + # The existing secret to use otherwise one will be generated. + existingSecret: "" + +# +# Slurm configurations. +slurm: + # + # Slurm authentication configurations. + auth: + # + # -- (string) + # The existing secret to use otherwise one will be generated. + existingSecret: "" + # + # -- (string) + # Extra slurmdbd configuration lines to append to `slurmdbd.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurmdbd.conf.html + extraSlurmdbdConf: |- + CommitDelay=1 + # + # -- (string) + # Extra slurm configuration lines to append to `slurm.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurm.conf.html + extraSlurmConf: |- + SchedulerParameters=batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 + DefMemPerCPU=1 + # + # -- (map[string]string) + # Optional raw Slurm configuration files, as a map. + # The map key represents the config file by name; the map value represents config file contents as a string. 
+ # Ref: https://slurm.schedmd.com/man_index.html#configuration_files + configFiles: {} + # acct_gather.conf: | + # # Ref: https://slurm.schedmd.com/acct_gather.conf.html + # burst_buffer.conf: | + # # Ref: https://slurm.schedmd.com/burst_buffer.conf.html + # gres.conf: | + # # Ref: https://slurm.schedmd.com/gres.conf.html + # helpers.conf: | + # # Ref: https://slurm.schedmd.com/helpers.conf.html + # job_container.conf: | + # # Ref: https://slurm.schedmd.com/job_container.conf.html + # mpi.conf: | + # # Ref: https://slurm.schedmd.com/mpi.conf.html + # oci.conf: | + # # Ref: https://slurm.schedmd.com/oci.conf.html + # plugstack.conf: | + # # Ref: https://slurm.schedmd.com/plugstack.conf.html + # topology.conf: | + # # Ref: https://slurm.schedmd.com/topology.conf.html + # + # -- (map[string]string) + # The Prolog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + prologScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + # + # -- (map[string]string) + # The Epilog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + epilogScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + +# +# Slurm authcred (sackd) configurations. +authcred: + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/sackd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# Slurm controller (slurmctld) configurations. controller: + # + # -- (bool) + # Enables the controller node. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmctld + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. 
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Define a persistent volume for the slurm controller to store its save-state. + # Used to recover from system failures or from pod upgrades. persistence: + # + # -- (string) + # Name of an existing `PersistentVolumeClaim` to use instead of creating one from definition. + # NOTE: When not empty, the other persistence fields will be ignored. + existingClaim: "" + # + # -- (object) + # Create a `PersistentVolumeClaim` with these annotations. + annotations: {} + # + # -- (object) + # Create a `PersistentVolumeClaim` with these labels. + labels: {} + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage class. storageClass: csi-cinder + # + # -- (list) + # Create a `PersistentVolumeClaim` with these access modes. + accessModes: + - ReadWriteOnce + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage size. + size: 4Gi + # + # -- (object) + # Selector to match an existing `PersistentVolume`. + selector: {} + # matchLabels: + # app: foo # # Slurm compute (slurmd) configurations. @@ -214,3 +452,152 @@ compute: State=UP Default=YES MaxTime=INFINITE + +# +# Slurm accounting (slurmdbd) configurations. +accounting: + # + # -- (bool) + # Enables accounting services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmdbd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Configuration for an external accounting instance (slurmdbd). + external: + # + # -- (bool) + # Use an external acounting instance (slurmdbd) instead of deploying one. + enabled: false + # + # -- (string) + # The external acounting instance (slurmdbd) host. + host: "" + # + # -- (integer) + # The external acounting instance (slurmdbd) port. + port: 6819 + +# +# `bitnami/mariadb` subchart configurations. 
+# Ref: https://github.com/bitnami/charts/blob/main/bitnami/mariadb/values.yaml +mariadb: + enabled: true + auth: + username: slurm + database: slurm_acct_db + existingSecret: "slurm-mariadb-passwords" + initdbScripts: + # NOTE: https://slurm.schedmd.com/accounting.html#slurm-accounting-configuration-before-build + slurm-accounting.sql: |- + SET GLOBAL innodb_buffer_pool_size=(4 * 1024 * 1024 * 1024); + SET GLOBAL innodb_log_file_size=(64 * 1024 * 1024); + SET GLOBAL innodb_lock_wait_timeout=900; + SET GLOBAL max_allowed_packet=(16 * 1024 * 1024); + primary: + persistence: + enabled: false + existingClaim: "" + storageClass: csi-cinder + labels: {} + annotations: {} + accessModes: + - ReadWriteOnce + size: 8Gi + selector: {} + priorityClassName: "" + metrics: + enabled: false + serviceMonitor: + enabled: false + affinity: {} + resources: {} + +# +# Slurm REST API (slurmrestd) configurations. +restapi: + # + # -- (bool) + # Enables restapi services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmrestd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# `slurm-exporter` subchart configurations. +# Ref: https://github.com/SlinkyProject/slurm-exporter/-/blob/main/helm/slurm-exporter/values.yaml +slurm-exporter: + exporter: + enabled: true + secretName: "slurm-token-exporter" diff --git a/apps/slinky-slurm-operator/defaults.yaml b/apps/slinky-slurm-operator/defaults.yaml index 230691d..844a765 100644 --- a/apps/slinky-slurm-operator/defaults.yaml +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -1,4 +1,161 @@ --- + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Sets the image pull secrets. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# Image configurations. image: + # + # -- (string) + # Sets the image repository to use. repository: ghcr.io/slinkyproject/slurm-operator + # + # -- (string) + # Sets the image tag to use. + # @default -- The Release appVersion. tag: "0.1.0" + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Operator configurations. 
+operator: + # + # -- (bool) + # Enables the operator. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (integer) + # Set the max concurrent workers for the Cluster controller. + clusterWorkers: 1 + # + # -- (integer) + # Set the max concurrent workers for the NodeSet controller. + nodesetWorkers: 1 + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Webhook configurations. +webhook: + # + # -- (bool) + # Enables the webhook. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Cert-Manager certificate configurations. +certManager: + # + # -- (bool) + # Enables cert-manager for certificate management. + enabled: true + # + # -- (string) + # The secret to be (created and) mounted. + secretName: slurm-operator-webhook-ca + # + # -- (string) + # Duration of certificate life. + duration: 43800h0m0s # 5 year + # + # -- (string) + # Certificate renewal time. Should be before the expiration. 
+ renewBefore: 8760h0m0s # 1 year From 90a9df2c7dd1c189554fd539e307994ed13123ee Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 21:51:21 +0000 Subject: [PATCH 15/46] Reset to the default namespaces and release names --- apps/slinky-slurm-controlplane/helmrelease.yaml | 4 ++-- apps/slinky-slurm-controlplane/kustomization.yaml | 2 +- apps/slinky-slurm-controlplane/namespace.yaml | 2 +- apps/slinky-slurm-operator/helmrelease.yaml | 4 ++-- apps/slinky-slurm-operator/kustomization.yaml | 2 +- apps/slinky-slurm-operator/namespace.yaml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml index 5002f07..2f28ff2 100644 --- a/apps/slinky-slurm-controlplane/helmrelease.yaml +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -2,7 +2,7 @@ apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: - name: slinky-slurm-controlplane + name: slurm spec: chartRef: kind: OCIRepository @@ -19,6 +19,6 @@ spec: remediation: retries: -1 driftDetection: - mode: enabled + mode: disabled interval: 10m timeout: 10m diff --git a/apps/slinky-slurm-controlplane/kustomization.yaml b/apps/slinky-slurm-controlplane/kustomization.yaml index 16e73ef..8c1cd0c 100644 --- a/apps/slinky-slurm-controlplane/kustomization.yaml +++ b/apps/slinky-slurm-controlplane/kustomization.yaml @@ -1,5 +1,5 @@ --- -namespace: slinky-slurm-controlplane +namespace: slurm configurations: - kustomizeconfig.yaml diff --git a/apps/slinky-slurm-controlplane/namespace.yaml b/apps/slinky-slurm-controlplane/namespace.yaml index 931d428..62b754e 100644 --- a/apps/slinky-slurm-controlplane/namespace.yaml +++ b/apps/slinky-slurm-controlplane/namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: slinky-slurm-controlplane + name: slurm diff --git a/apps/slinky-slurm-operator/helmrelease.yaml b/apps/slinky-slurm-operator/helmrelease.yaml index 84cebf5..b85e5b5 100644 --- a/apps/slinky-slurm-operator/helmrelease.yaml +++ b/apps/slinky-slurm-operator/helmrelease.yaml @@ -2,7 +2,7 @@ apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: - name: slinky-slurm-operator + name: slurm-operator spec: chartRef: kind: OCIRepository @@ -19,5 +19,5 @@ spec: remediation: retries: -1 driftDetection: - mode: enabled + mode: disabled interval: 5m diff --git a/apps/slinky-slurm-operator/kustomization.yaml b/apps/slinky-slurm-operator/kustomization.yaml index f5081b7..1b8cb0c 100644 --- a/apps/slinky-slurm-operator/kustomization.yaml +++ b/apps/slinky-slurm-operator/kustomization.yaml @@ -1,5 +1,5 @@ --- -namespace: slinky-slurm-operator +namespace: slinky configurations: - kustomizeconfig.yaml diff --git a/apps/slinky-slurm-operator/namespace.yaml b/apps/slinky-slurm-operator/namespace.yaml index edf7d98..a7b1ef8 100644 --- a/apps/slinky-slurm-operator/namespace.yaml +++ b/apps/slinky-slurm-operator/namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: slinky-slurm-operator + name: slinky From 52250ad3a4addd3711de54f7fe807cfb50e04ed3 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 21:54:09 +0000 Subject: [PATCH 16/46] Increase the slurm timeout a bit more --- apps/slinky-slurm-controlplane/helmrelease.yaml | 4 ++-- apps/slinky/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml index 2f28ff2..f8224d1 100644 --- 
a/apps/slinky-slurm-controlplane/helmrelease.yaml +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -20,5 +20,5 @@ spec: retries: -1 driftDetection: mode: disabled - interval: 10m - timeout: 10m + interval: 40m + timeout: 30m diff --git a/apps/slinky/README.md b/apps/slinky/README.md index 0e12119..af36fba 100644 --- a/apps/slinky/README.md +++ b/apps/slinky/README.md @@ -9,7 +9,7 @@ To test Slurm functionality, connect to the controller to use Slurm client commands: ```sh -kubectl -n slinky-slurm-controlplane exec -it statefulsets/slinky-slurm-controlplane-controller -- bash --login +kubectl -n slurm exec -it statefulsets/slurm-controlplane -- bash --login ``` On the controller pod (e.g. host `slurm@slurm-controller-0`), run the following From a7c55a89e6535bfd83783582649df73dcbb8ae1c Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 27 Jan 2025 22:16:30 +0000 Subject: [PATCH 17/46] Fix up JuypterHub --- apps/jupyterhub/helmchart.yaml | 2 +- apps/slinky/README.md | 2 +- apps/slinky/kustomization.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/jupyterhub/helmchart.yaml b/apps/jupyterhub/helmchart.yaml index 8d96a03..737e8c7 100644 --- a/apps/jupyterhub/helmchart.yaml +++ b/apps/jupyterhub/helmchart.yaml @@ -6,7 +6,7 @@ metadata: namespace: jupyterhub spec: chart: jupyterhub - version: "3.3.8" + version: "4.1.0" sourceRef: kind: HelmRepository name: jupyterhub diff --git a/apps/slinky/README.md b/apps/slinky/README.md index af36fba..58ea7d0 100644 --- a/apps/slinky/README.md +++ b/apps/slinky/README.md @@ -9,7 +9,7 @@ To test Slurm functionality, connect to the controller to use Slurm client commands: ```sh -kubectl -n slurm exec -it statefulsets/slurm-controlplane -- bash --login +kubectl -n slurm exec -it statefulsets/slurm-controller -- bash --login ``` On the controller pod (e.g. host `slurm@slurm-controller-0`), run the following diff --git a/apps/slinky/kustomization.yaml b/apps/slinky/kustomization.yaml index b562569..eb053fb 100644 --- a/apps/slinky/kustomization.yaml +++ b/apps/slinky/kustomization.yaml @@ -1,5 +1,5 @@ --- resources: - - ../cert-manager/ + # - ../cert-manager/ - ../slinky-slurm-operator/ - ../slinky-slurm-controlplane/ From 97f579ea83b11098601a736c5747eaf2e1b2a71e Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Wed, 29 Jan 2025 16:26:43 +0000 Subject: [PATCH 18/46] Simplify values in juypterhub --- apps/jupyterhub/configmap.yaml | 140 ++++++++------------------------- 1 file changed, 33 insertions(+), 107 deletions(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index 8b809f7..aa5eff9 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -12,6 +12,9 @@ data: # We don't need a load balancer for the proxy # since we want to use ingress instead. + # + # To access manually try: + # kubectl port-forward -n jupyterhub svc/proxy-public 8080:80 proxy: service: type: ClusterIP @@ -19,28 +22,28 @@ data: # Make JupyterHub accessible via ingress ingress: enabled: false - ingressClassName: nginx - annotations: - cert-manager.io/cluster-issuer: letsencrypt-prod - hosts: - # IP must match NGINX ingress controller's - # load balancer IP. - # See `kubectl get svc -n ingress-nginx` - - &host 128-232-226-29.sslip.io - pathSuffix: "" - tls: - - hosts: - - *host - secretName: jupyterhub-ingress-cert + # ingressClassName: nginx + # annotations: + # cert-manager.io/cluster-issuer: letsencrypt-prod + # hosts: + # # IP must match NGINX ingress controller's + # # load balancer IP. 
+ # # See `kubectl get svc -n ingress-nginx` + # - &host 128-232-226-29.sslip.io + # pathSuffix: "" + # tls: + # - hosts: + # - *host + # secretName: jupyterhub-ingress-cert hub: allowNamedServers: true - namedServerLimitPerUser: 5 - activeServerLimit: 3 + namedServerLimitPerUser: 3 + activeServerLimit: 2 # Server startup fails with default # restrictive network policy. networkPolicy: - enabled: false + enabled: false # # Configure Keycloak auth # config: @@ -76,81 +79,7 @@ data: # # Label for the 'Sign in with ___' button # login_service: Keycloak - # We install the kubernetes client here so that we can use it to - # detect weather the cluster has any GPU nodes, allowing us to show/hide - # GPU nodebook profiles automatically. - args: - - bash - - -c - - "pip install kubernetes && jupyterhub --config /usr/local/etc/jupyterhub/jupyterhub_config.py --upgrade-db" - extraConfig: - # Add user profiles dynamically based on cluster hardware - customspawner.py: | - from kubespawner import KubeSpawner - from kubernetes import client, config - - def build_profiles(spawner: KubeSpawner): - profiles = [ - { - "display_name": "Minimal environment", - "description": "To avoid too much bells and whistles: Python.", - "default": True, - }, - { - "display_name": "Datascience environment", - "description": "If you want the additional bells and whistles: Python, R, and Julia.", - "kubespawner_override": { - "image": "quay.io/jupyter/datascience-notebook:2024-08-05" - }, - }, - { - "display_name": "Pytorch environment (CPU)", - "description": "The official Jupyter Pytorch.", - "kubespawner_override": { - "image": "quay.io/jupyter/pytorch-notebook:pytorch-2.4.0", - }, - }, - ] - - config.load_incluster_config() - api = client.CoreV1Api() - nodes = api.list_node().items - - has_nvidia_gpu = lambda node: node.metadata.labels.get("nvidia.com/gpu.present", "") == "true" - has_intel_gpu = lambda node: node.metadata.labels.get("gpu.intel.com/device-id.0380-0bd5.present", "") == "true" - - if any(map(has_nvidia_gpu, nodes)): - profiles.append({ - "display_name": "Pytorch environment (Nvidia GPU)", - "description": "The official Jupyter Pytorch + CUDA image. Requires a GPU compatible notebook server.", - "kubespawner_override": { - "image": "quay.io/jupyter/pytorch-notebook:cuda12-pytorch-2.4.0", - "extra_resource_limits": { - "nvidia.com/gpu": "1", - }, - }, - }) - if any(map(has_intel_gpu, nodes)): - profiles.append({ - "display_name": "Pytorch environment (Intel GPU)", - "description": "A Jupyter + Intel Pytorch image. Requires a GPU compatible notebook server.", - "kubespawner_override": { - "image": "ghcr.io/stackhpc/jupyterhub-pytorch-intel-gpu:v0.0.1", - "extra_resource_limits": { - "gpu.intel.com/i915": "1", - }, - "supplemental_gids":[ - "110", # Ubuntu render group GID, requred for permission to use Intel GPU device - ], - }, - }) - - return profiles - - c.KubeSpawner.profile_list = build_profiles - - # Pre-pullers are not useful when profile list - # is built dynamically + # turn this off for now prePuller: hook: enabled: false @@ -158,21 +87,18 @@ data: enabled: false singleuser: - defaultUrl: /lab - # Defines the default image image: name: quay.io/jupyter/minimal-notebook - tag: "2025-01-20" - # Build these dynamically in extraConfig above instead - # profileList: - # - display_name: "Minimal environment" - # description: "To avoid too much bells and whistles: Python." 
- # default: true - # - display_name: "Datascience environment" - # description: "If you want the additional bells and whistles: Python, R, and Julia." - # kubespawner_override: - # image: quay.io/jupyter/datascience-notebook:2024-08-05 - # - display_name: "Spark environment" - # description: "The Jupyter Stacks spark image!" - # kubespawner_override: - # image: quay.io/jupyter/all-spark-notebook:2024-08-05 + tag: "2025-01-28" + profileList: + - display_name: "Minimal environment" + description: "To avoid too much bells and whistles: Python." + default: true + - display_name: "Datascience environment" + description: "If you want the additional bells and whistles: Python, R, and Julia." + kubespawner_override: + image: quay.io/jupyter/datascience-notebook:2025-01-28 + - display_name: "Pytorch environment" + description: "Pytorch Jupyter Stacks image!" + kubespawner_override: + image: quay.io/jupyter/pytorch-notebook:2025-01-28 From 7925dc9e08811141caf29d4a0c7e4f2a5d211084 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 30 Jan 2025 19:00:16 +0000 Subject: [PATCH 19/46] Fix up storage class with new name --- apps/jupyterhub/configmap.yaml | 26 ++++++++++---------- apps/slinky-slurm-controlplane/defaults.yaml | 4 +-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index aa5eff9..53a4bad 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -22,19 +22,19 @@ data: # Make JupyterHub accessible via ingress ingress: enabled: false - # ingressClassName: nginx - # annotations: - # cert-manager.io/cluster-issuer: letsencrypt-prod - # hosts: - # # IP must match NGINX ingress controller's - # # load balancer IP. - # # See `kubectl get svc -n ingress-nginx` - # - &host 128-232-226-29.sslip.io - # pathSuffix: "" - # tls: - # - hosts: - # - *host - # secretName: jupyterhub-ingress-cert + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + # IP must match NGINX ingress controller's + # load balancer IP. + # See `kubectl get svc -n ingress-nginx` + - &host jh.dawntest.128-232-224-75.nip.io + pathSuffix: "" + tls: + - hosts: + - *host + secretName: jupyterhub-ingress-cert hub: allowNamedServers: true diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index 717caac..78bbed5 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -224,7 +224,7 @@ controller: # # -- (string) # Create a `PersistentVolumeClaim` with this storage class. - storageClass: csi-cinder + storageClass: csi-manila # # -- (list) # Create a `PersistentVolumeClaim` with these access modes. @@ -531,7 +531,7 @@ mariadb: persistence: enabled: false existingClaim: "" - storageClass: csi-cinder + storageClass: csi-manila labels: {} annotations: {} accessModes: From 0cb5684d0de2a6664c56049f7c4f88dbbc737343 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 30 Jan 2025 19:10:05 +0000 Subject: [PATCH 20/46] Expose XPUs --- apps/jupyterhub/configmap.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index 53a4bad..70c4e76 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -98,7 +98,11 @@ data: description: "If you want the additional bells and whistles: Python, R, and Julia." 
kubespawner_override: image: quay.io/jupyter/datascience-notebook:2025-01-28 - - display_name: "Pytorch environment" + - display_name: "Pytorch environment with 2 Intel XPUs" description: "Pytorch Jupyter Stacks image!" kubespawner_override: image: quay.io/jupyter/pytorch-notebook:2025-01-28 + extra_resource_limits: + "gpu.intel.com/i915": "2" + supplemental_gids: + - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device From 60805e0d99baad29ab6016eeeb6284b75e45a04c Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 31 Jan 2025 17:06:34 +0000 Subject: [PATCH 21/46] Add customer juypter image build --- .github/workflows/build-images.yml | 45 +++++++++++++++ apps/jupyterhub/configmap.yaml | 13 ++++- images/jupyterhub-intel-gpu/Dockerfile | 78 ++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/build-images.yml create mode 100644 images/jupyterhub-intel-gpu/Dockerfile diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml new file mode 100644 index 0000000..8537721 --- /dev/null +++ b/.github/workflows/build-images.yml @@ -0,0 +1,45 @@ +name: Publish Container Images +on: + push: + paths: + - images/** +jobs: + build_push_images: + name: Build and push images + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - image: jupyterhub-intel-gpu + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate metadata for image + id: image-meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/johngarbutt/${{ matrix.image }} + # Produce the branch name or tag and the SHA as tags + tags: | + type=ref,event=branch + type=ref,event=tag + type=sha,prefix= + + - name: Build and push image + uses: azimuth-cloud/github-actions/docker-multiarch-build-push@master + with: + cache-key: ${{ matrix.image }} + context: ./images/${{ matrix.image }} + platforms: linux/amd64 + push: true + tags: ${{ steps.image-meta.outputs.tags }} + labels: ${{ steps.image-meta.outputs.labels }} + diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index 70c4e76..9e226b6 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -98,11 +98,20 @@ data: description: "If you want the additional bells and whistles: Python, R, and Julia." kubespawner_override: image: quay.io/jupyter/datascience-notebook:2025-01-28 - - display_name: "Pytorch environment with 2 Intel XPUs" + - display_name: "Pytorch environment with Intel XPUs (v0.0.1)" description: "Pytorch Jupyter Stacks image!" 
kubespawner_override: image: quay.io/jupyter/pytorch-notebook:2025-01-28 + #image: ghcr.io/stackhpc/jupyterhub-pytorch-intel-gpu:v0.0.1 extra_resource_limits: - "gpu.intel.com/i915": "2" + "gpu.intel.com/i915": "1" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device + # https://pytorch.org/docs/stable/notes/get_start_xpu.html + # lifecycle_hooks: + # postStart: + # exec: + # command: + # - "sh" + # - "-c" + # - "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu" diff --git a/images/jupyterhub-intel-gpu/Dockerfile b/images/jupyterhub-intel-gpu/Dockerfile new file mode 100644 index 0000000..4d3020a --- /dev/null +++ b/images/jupyterhub-intel-gpu/Dockerfile @@ -0,0 +1,78 @@ +FROM quay.io/jupyter/pytorch-notebook:2025-01-28 + +##### +# Add Intel GPU components +##### + +USER root + +ENV LANG=C.UTF-8 + +ARG DEBIAN_FRONTEND=noninteractive + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + apt-utils \ + build-essential \ + ca-certificates \ + clinfo \ + curl \ + git \ + gnupg2 \ + gpg-agent \ + rsync \ + sudo \ + unzip \ + wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ + gpg --dearmor --yes --output /usr/share/keyrings/intel-graphics.gpg +RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | \ + tee /etc/apt/sources.list.d/intel-gpu-jammy.list &&\ + apt update + +ARG ICD_VER=24.22.29735.27-914~22.04 +ARG LEVEL_ZERO_GPU_VER=1.3.29735.27-914~22.04 +ARG LEVEL_ZERO_VER=1.17.6-914~22.04 +ARG LEVEL_ZERO_DEV_VER=1.17.6-914~22.04 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + intel-opencl-icd=${ICD_VER} \ + intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \ + libze1=${LEVEL_ZERO_VER} \ + libze-dev=${LEVEL_ZERO_DEV_VER} && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ + | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ + | tee /etc/apt/sources.list.d/oneAPI.list + +ARG DPCPP_VER=2024.2.1-1079 +ARG MKL_VER=2024.2.1-103 +ARG CCL_VER=2021.13.1-31 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + intel-oneapi-runtime-dpcpp-cpp=${DPCPP_VER} \ + intel-oneapi-runtime-mkl=${MKL_VER} \ + intel-oneapi-runtime-ccl=${CCL_VER} && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN groupadd -g 110 render + +##### +# User-level python components +##### + +USER ${NB_USER} + +pip3 install torch torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/test/xpu From be2d370d9fbf275b61f24cc5143526a014405642 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 31 Jan 2025 17:08:41 +0000 Subject: [PATCH 22/46] Fix typo in dockerfile --- images/jupyterhub-intel-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/jupyterhub-intel-gpu/Dockerfile b/images/jupyterhub-intel-gpu/Dockerfile index 4d3020a..8e8eb09 100644 --- a/images/jupyterhub-intel-gpu/Dockerfile +++ b/images/jupyterhub-intel-gpu/Dockerfile @@ -74,5 +74,5 @@ RUN groupadd -g 110 render USER ${NB_USER} -pip3 install torch torchvision torchaudio \ +RUN pip3 install torch 
torchvision torchaudio \ --index-url https://download.pytorch.org/whl/test/xpu From 6421ba1b217591348d7b013f639df1d39b19feaa Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 31 Jan 2025 17:50:30 +0000 Subject: [PATCH 23/46] Add the missing -U upgrade flag --- apps/jupyterhub/configmap.yaml | 5 +++-- images/jupyterhub-intel-gpu/Dockerfile | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index 9e226b6..c3c50e3 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -98,11 +98,12 @@ data: description: "If you want the additional bells and whistles: Python, R, and Julia." kubespawner_override: image: quay.io/jupyter/datascience-notebook:2025-01-28 - - display_name: "Pytorch environment with Intel XPUs (v0.0.1)" + - display_name: "Pytorch environment with Intel XPUs (v0.0.2)" description: "Pytorch Jupyter Stacks image!" kubespawner_override: - image: quay.io/jupyter/pytorch-notebook:2025-01-28 + #image: quay.io/jupyter/pytorch-notebook:2025-01-28 #image: ghcr.io/stackhpc/jupyterhub-pytorch-intel-gpu:v0.0.1 + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:be2d370 extra_resource_limits: "gpu.intel.com/i915": "1" supplemental_gids: diff --git a/images/jupyterhub-intel-gpu/Dockerfile b/images/jupyterhub-intel-gpu/Dockerfile index 8e8eb09..df18db0 100644 --- a/images/jupyterhub-intel-gpu/Dockerfile +++ b/images/jupyterhub-intel-gpu/Dockerfile @@ -74,5 +74,5 @@ RUN groupadd -g 110 render USER ${NB_USER} -RUN pip3 install torch torchvision torchaudio \ +RUN pip3 install -U torch torchvision torchaudio \ --index-url https://download.pytorch.org/whl/test/xpu From 1a74c1cee61d04066a5230769bab53636d1edfa3 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 31 Jan 2025 17:51:51 +0000 Subject: [PATCH 24/46] Fix up the workflow permissions --- .github/workflows/build-images.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml index 8537721..06ab587 100644 --- a/.github/workflows/build-images.yml +++ b/.github/workflows/build-images.yml @@ -6,6 +6,11 @@ on: jobs: build_push_images: name: Build and push images + permissions: + contents: read + id-token: write # needed for signing the images with GitHub OIDC Token + packages: write # required for pushing container images + security-events: write # required for pushing SARIF files runs-on: ubuntu-22.04 strategy: matrix: From df5f475a56684d6dbb1400add30c1806fec593d2 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 31 Jan 2025 18:04:35 +0000 Subject: [PATCH 25/46] Add examples into notebook image --- images/jupyterhub-intel-gpu/Dockerfile | 2 + images/jupyterhub-intel-gpu/examples.ipynb | 84 ++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 images/jupyterhub-intel-gpu/examples.ipynb diff --git a/images/jupyterhub-intel-gpu/Dockerfile b/images/jupyterhub-intel-gpu/Dockerfile index df18db0..f7a6482 100644 --- a/images/jupyterhub-intel-gpu/Dockerfile +++ b/images/jupyterhub-intel-gpu/Dockerfile @@ -76,3 +76,5 @@ USER ${NB_USER} RUN pip3 install -U torch torchvision torchaudio \ --index-url https://download.pytorch.org/whl/test/xpu + +COPY examples.ipynb . 
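As a quick sanity check outside JupyterHub, a throwaway pod along the following lines can confirm that the Intel GPU device plugin, the render group GID, and the XPU-enabled PyTorch install in this image all line up. This is a minimal sketch: the image tag is assumed to be one pushed by the build-images workflow above, and `gpu.intel.com/i915` is assumed to be the resource name advertised by the cluster's Intel device plugin.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: xpu-smoke-test
spec:
  restartPolicy: Never
  securityContext:
    # Ubuntu render group GID, required for permission to use the Intel GPU device
    supplementalGroups: [110]
  containers:
    - name: xpu-smoke-test
      # Assumed tag; substitute any tag published by the workflow above
      image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:be2d370
      command: ["python", "-c", "import torch; print(torch.xpu.is_available())"]
      resources:
        limits:
          gpu.intel.com/i915: "1"
```

If the stack is healthy the pod log should print `True`; the examples notebook below exercises the same checks interactively.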
diff --git a/images/jupyterhub-intel-gpu/examples.ipynb b/images/jupyterhub-intel-gpu/examples.ipynb new file mode 100644 index 0000000..a8e5f0e --- /dev/null +++ b/images/jupyterhub-intel-gpu/examples.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "7f446447-6d6b-41d9-9cc6-90e86703d0fc", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# See: https://pytorch.org/docs/stable/notes/get_start_xpu.html\n", + "import torch\n", + "torch.xpu.is_available() # torch.xpu is the API for Intel GPU support" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80cb2dab-ac88-4486-93ab-8777ceb70f0b", + "metadata": {}, + "outputs": [], + "source": [ + "!clinfo -l" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a2d7573-f0df-4ed1-906d-099e5146660c", + "metadata": {}, + "outputs": [], + "source": [ + "torch.xpu.device_count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc672f30-01a2-403b-b90d-3669e8409c6c", + "metadata": {}, + "outputs": [], + "source": [ + "# Inference with FP32\n", + "\n", + "import torch\n", + "import torchvision.models as models\n", + "\n", + "model = models.resnet50(weights=\"ResNet50_Weights.DEFAULT\")\n", + "model.eval()\n", + "data = torch.rand(1, 3, 224, 224)\n", + "\n", + "model = model.to(\"xpu\")\n", + "data = data.to(\"xpu\")\n", + "\n", + "with torch.no_grad():\n", + " model(data)\n", + "\n", + "print(\"Execution finished\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From effd9b51fd4d98e91fbad1f1ee8b89c28ed8b0fb Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 31 Jan 2025 18:24:35 +0000 Subject: [PATCH 26/46] Initial xpu profiles --- apps/jupyterhub/configmap.yaml | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index c3c50e3..e414ecf 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -39,7 +39,7 @@ data: hub: allowNamedServers: true namedServerLimitPerUser: 3 - activeServerLimit: 2 + activeServerLimit: 5 # Server startup fails with default # restrictive network policy. networkPolicy: @@ -98,21 +98,29 @@ data: description: "If you want the additional bells and whistles: Python, R, and Julia." kubespawner_override: image: quay.io/jupyter/datascience-notebook:2025-01-28 - - display_name: "Pytorch environment with Intel XPUs (v0.0.2)" + - display_name: "Pytorch environment with 1 x Intel XPUs" description: "Pytorch Jupyter Stacks image!" 
kubespawner_override: #image: quay.io/jupyter/pytorch-notebook:2025-01-28 #image: ghcr.io/stackhpc/jupyterhub-pytorch-intel-gpu:v0.0.1 - image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:be2d370 + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 extra_resource_limits: "gpu.intel.com/i915": "1" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device - # https://pytorch.org/docs/stable/notes/get_start_xpu.html - # lifecycle_hooks: - # postStart: - # exec: - # command: - # - "sh" - # - "-c" - # - "pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu" + - display_name: "Pytorch environment with 2 x Intel XPUs" + description: "Pytorch Jupyter Stacks image!" + kubespawner_override: + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 + extra_resource_limits: + "gpu.intel.com/i915": "2" + supplemental_gids: + - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device + - display_name: "Pytorch environment with 4 x Intel XPUs" + description: "Pytorch Jupyter Stacks image!" + kubespawner_override: + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 + extra_resource_limits: + "gpu.intel.com/i915": "4" + supplemental_gids: + - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device From 0ca30f584d89045bdb9a7ab046a8008d0c2901e7 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 3 Feb 2025 18:15:35 +0000 Subject: [PATCH 27/46] Adding inital rdma test --- apps/jupyterhub/configmap.yaml | 3 +++ apps/rdmatest/hostdevice-network-pod1.yaml | 27 ++++++++++++++++++++ apps/rdmatest/hostdevice-network-pod2.yaml | 27 ++++++++++++++++++++ apps/rdmatest/hostdevice-network.yaml | 22 ++++++++++++++++ apps/rdmatest/kustomization.yaml | 7 +++++ apps/rdmatest/namespace.yaml | 5 ++++ apps/slinky-slurm-controlplane/defaults.yaml | 7 ++--- 7 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 apps/rdmatest/hostdevice-network-pod1.yaml create mode 100644 apps/rdmatest/hostdevice-network-pod2.yaml create mode 100644 apps/rdmatest/hostdevice-network.yaml create mode 100644 apps/rdmatest/kustomization.yaml create mode 100644 apps/rdmatest/namespace.yaml diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index e414ecf..7c09c18 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -106,6 +106,7 @@ data: image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 extra_resource_limits: "gpu.intel.com/i915": "1" + "nvidia.com/hostdev": "1" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device - display_name: "Pytorch environment with 2 x Intel XPUs" @@ -114,6 +115,7 @@ data: image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 extra_resource_limits: "gpu.intel.com/i915": "2" + "nvidia.com/hostdev": "2" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device - display_name: "Pytorch environment with 4 x Intel XPUs" @@ -122,5 +124,6 @@ data: image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 extra_resource_limits: "gpu.intel.com/i915": "4" + "nvidia.com/hostdev": "4" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device diff --git a/apps/rdmatest/hostdevice-network-pod1.yaml b/apps/rdmatest/hostdevice-network-pod1.yaml new file mode 100644 index 0000000..d1661ee --- /dev/null +++ b/apps/rdmatest/hostdevice-network-pod1.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 
+kind: Pod +metadata: + name: test-hostdev-pod-1 + annotations: + k8s.v1.cni.cncf.io/networks: example-hostdevice-network +spec: + nodeSelector: + # Note: Replace hostname or remove selector altogether + kubernetes.io/hostname: dawntest-dawn-ztt9k-jzczg + containers: + - name: test-hostdev-pod + #image: ghcr.io/stackhpc/kube-perftest-mpi-benchmarks:19e96a8 + image: mellanox/rping-test + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c", "--" ] + args: [ "while true; do sleep 300; done;" ] + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + requests: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" + limits: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" diff --git a/apps/rdmatest/hostdevice-network-pod2.yaml b/apps/rdmatest/hostdevice-network-pod2.yaml new file mode 100644 index 0000000..5de6b35 --- /dev/null +++ b/apps/rdmatest/hostdevice-network-pod2.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-hostdev-pod-2 + annotations: + k8s.v1.cni.cncf.io/networks: example-hostdevice-network +spec: + nodeSelector: + # Note: Replace hostname or remove selector altogether + kubernetes.io/hostname: dawntest-dawn-ztt9k-4zrm4 + containers: + - name: test-hostdev-pod + #image: ghcr.io/stackhpc/kube-perftest-mpi-benchmarks:19e96a8 + image: mellanox/rping-test + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c", "--" ] + args: [ "while true; do sleep 300; done;" ] + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + requests: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" + limits: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" diff --git a/apps/rdmatest/hostdevice-network.yaml b/apps/rdmatest/hostdevice-network.yaml new file mode 100644 index 0000000..85215fb --- /dev/null +++ b/apps/rdmatest/hostdevice-network.yaml @@ -0,0 +1,22 @@ +apiVersion: mellanox.com/v1alpha1 +kind: HostDeviceNetwork +metadata: + name: example-hostdevice-network +spec: + networkNamespace: "default" + resourceName: "hostdev" + ipam: | + { + "type": "whereabouts", + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "range": "192.168.42.0/24", + "exclude": [ + "192.168.42.0/32", + "192.168.42.255/32", + ], + "log_file" : "/var/log/whereabouts.log", + "log_level" : "info" + } \ No newline at end of file diff --git a/apps/rdmatest/kustomization.yaml b/apps/rdmatest/kustomization.yaml new file mode 100644 index 0000000..028a80d --- /dev/null +++ b/apps/rdmatest/kustomization.yaml @@ -0,0 +1,7 @@ +namespace: rdmatest + +resources: + - namespace.yaml + - hostdevice-network.yaml + - hostdevice-network-pod1.yaml + - hostdevice-network-pod2.yaml diff --git a/apps/rdmatest/namespace.yaml b/apps/rdmatest/namespace.yaml new file mode 100644 index 0000000..a5f6b03 --- /dev/null +++ b/apps/rdmatest/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: rdmatest diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index 78bbed5..3666009 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -268,7 +268,7 @@ compute: # # -- (string) # Name of NodeSet. Must be unique. - - name: debug + - name: dawn # # -- (bool) # Enables the NodeSet in Slurm. 
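One detail worth noting in apps/rdmatest/hostdevice-network.yaml above: the whereabouts `ipam` block is embedded JSON, and strict JSON parsers reject the trailing comma after the last `exclude` entry, so the network attachment may fail to configure. A corrected sketch of that block, with the same addresses and only the comma removed, would be:

```yaml
  ipam: |
    {
      "type": "whereabouts",
      "datastore": "kubernetes",
      "kubernetes": {
        "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig"
      },
      "range": "192.168.42.0/24",
      "exclude": [
        "192.168.42.0/32",
        "192.168.42.255/32"
      ],
      "log_file": "/var/log/whereabouts.log",
      "log_level": "info"
    }
```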
@@ -309,8 +309,9 @@ compute: # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container resources: limits: - cpu: 1 - memory: 1Gi + cpu: 16 + memory: 100Gi + "gpu.intel.com/i915": "4" # # -- (map) # Selector which must match a node's labels for the pod to be scheduled on that node. From f25178276ef93d11c4bc36887dc1db4d6b8bb319 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 6 Feb 2025 11:41:44 +0000 Subject: [PATCH 28/46] Attempt to reduce privilage of juypterhub pods --- apps/jupyterhub/configmap.yaml | 8 +++++++- apps/jupyterhub/namespace.yaml | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index 7c09c18..e834f2f 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -106,9 +106,15 @@ data: image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 extra_resource_limits: "gpu.intel.com/i915": "1" - "nvidia.com/hostdev": "1" + # "nvidia.com/hostdev": "1" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device + privilaged: false + container_security_context: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL - display_name: "Pytorch environment with 2 x Intel XPUs" description: "Pytorch Jupyter Stacks image!" kubespawner_override: diff --git a/apps/jupyterhub/namespace.yaml b/apps/jupyterhub/namespace.yaml index 241052e..6b63493 100644 --- a/apps/jupyterhub/namespace.yaml +++ b/apps/jupyterhub/namespace.yaml @@ -3,3 +3,12 @@ apiVersion: v1 kind: Namespace metadata: name: jupyterhub + labels: + # Set the pod security standard for the namespace + # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # pod-security.kubernetes.io/enforce: privileged + # pod-security.kubernetes.io/enforce: baseline + # pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/warn: restricted + pod-security.kubernetes.io/audit: restricted From 09a4ef6d04ebdaea111465d1c4a93a9112d21413 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 6 Feb 2025 12:08:57 +0000 Subject: [PATCH 29/46] Add kube-perftest --- apps/kube-perftest/configmap.yaml | 8 ++++++++ apps/kube-perftest/helmchart.yaml | 12 ++++++++++++ apps/kube-perftest/helmrelease.yaml | 23 +++++++++++++++++++++++ apps/kube-perftest/helmrepository.yaml | 8 ++++++++ apps/kube-perftest/kustomization.yaml | 9 +++++++++ apps/kube-perftest/namespace.yaml | 13 +++++++++++++ 6 files changed, 73 insertions(+) create mode 100644 apps/kube-perftest/configmap.yaml create mode 100644 apps/kube-perftest/helmchart.yaml create mode 100644 apps/kube-perftest/helmrelease.yaml create mode 100644 apps/kube-perftest/helmrepository.yaml create mode 100644 apps/kube-perftest/kustomization.yaml create mode 100644 apps/kube-perftest/namespace.yaml diff --git a/apps/kube-perftest/configmap.yaml b/apps/kube-perftest/configmap.yaml new file mode 100644 index 0000000..4976bf4 --- /dev/null +++ b/apps/kube-perftest/configmap.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: kube-perftest-config +data: + values.yaml: | + # TODO... 
\ No newline at end of file diff --git a/apps/kube-perftest/helmchart.yaml b/apps/kube-perftest/helmchart.yaml new file mode 100644 index 0000000..1b96205 --- /dev/null +++ b/apps/kube-perftest/helmchart.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: kube-perftest-operator +spec: + chart: kube-perftest-operator + version: ">=0.1.0-dev.0.main.0,<0.1.0-dev.0.main.99999999999" + sourceRef: + kind: HelmRepository + name: kube-perftest + interval: 10m0s diff --git a/apps/kube-perftest/helmrelease.yaml b/apps/kube-perftest/helmrelease.yaml new file mode 100644 index 0000000..9dc3cf3 --- /dev/null +++ b/apps/kube-perftest/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: kube-perftest-operator +spec: + chartRef: + kind: HelmChart + name: kube-perftest-operator + releaseName: kube-perftest-operator + valuesFrom: + - kind: ConfigMap + name: kube-perftest-config + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/kube-perftest/helmrepository.yaml b/apps/kube-perftest/helmrepository.yaml new file mode 100644 index 0000000..84b4c56 --- /dev/null +++ b/apps/kube-perftest/helmrepository.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: kube-perftest +spec: + url: https://stackhpc.github.io/kube-perftest + interval: 1h diff --git a/apps/kube-perftest/kustomization.yaml b/apps/kube-perftest/kustomization.yaml new file mode 100644 index 0000000..7358c2a --- /dev/null +++ b/apps/kube-perftest/kustomization.yaml @@ -0,0 +1,9 @@ +--- +namespace: kube-perftest + +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml diff --git a/apps/kube-perftest/namespace.yaml b/apps/kube-perftest/namespace.yaml new file mode 100644 index 0000000..b2c8c18 --- /dev/null +++ b/apps/kube-perftest/namespace.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: kube-perftest + labels: + # Set the pod security standard for the namespace + # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # pod-security.kubernetes.io/enforce: privileged + # pod-security.kubernetes.io/enforce: baseline + # pod-security.kubernetes.io/enforce: restricted + # pod-security.kubernetes.io/warn: restricted + # pod-security.kubernetes.io/audit: restricted From 5494585f7a56d6411e31d96ab93a5e7114809081 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 6 Feb 2025 14:46:23 +0000 Subject: [PATCH 30/46] fixed slurm control plane getting deleted by helm --- apps/slinky-slurm-controlplane/helmrelease.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml index f8224d1..0bbb804 100644 --- a/apps/slinky-slurm-controlplane/helmrelease.yaml +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -13,11 +13,6 @@ spec: valuesKey: values.yaml install: createNamespace: true - remediation: - retries: -1 - upgrade: - remediation: - retries: -1 driftDetection: mode: disabled interval: 40m From 50e29939e309c661041cf4bd97f17b6ed14ef48f Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 7 Feb 2025 15:22:55 +0000 Subject: [PATCH 31/46] added operator as dependency of control plane --- apps/slinky-slurm-controlplane/helmrelease.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml index 0bbb804..9367025 100644 --- a/apps/slinky-slurm-controlplane/helmrelease.yaml +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -11,6 +11,9 @@ spec: - kind: ConfigMap name: slinky-slurm-defaults valuesKey: values.yaml + dependsOn: + - name: slurm-operator + namespace: slinky install: createNamespace: true driftDetection: From e32748e8ab7a05ff18f91899b9d62d9685b559d6 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 6 Feb 2025 09:44:08 +0000 Subject: [PATCH 32/46] now slinky compute nodes now preempted by jupyter labs --- apps/slinky-slurm-controlplane/defaults.yaml | 38 ++++++++----------- .../kustomization.yaml | 1 + .../priorityclass.yaml | 6 +++ 3 files changed, 23 insertions(+), 22 deletions(-) create mode 100644 apps/slinky-slurm-controlplane/priorityclass.yaml diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index 3666009..ea6ca41 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -277,7 +277,7 @@ compute: # -- (integer) # Set the number of replicas to deploy. # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). - replicas: 1 + replicas: 3 # TODO: set to max nodes in cluster # # -- (int) # The minimum number of seconds for which a newly created NodeSet Pod should be ready @@ -302,7 +302,7 @@ compute: # -- (string) # Set the priority class to use. # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass - priorityClassName: "" + priorityClassName: "slinky-low-priority" # # -- (object) # Set container resource requests and limits for Kubernetes Pod scheduling. @@ -320,26 +320,20 @@ compute: # # -- (object) # Set affinity for Kubernetes Pod scheduling. - affinity: {} - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: "kubernetes.io/os" - # operator: In - # values: - # - linux - # podAntiAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # - topologyKey: "kubernetes.io/hostname" - # labelSelector: - # matchExpressions: - # - key: "app.kubernetes.io/name" - # operator: In - # values: - # - slurmctld - # - slurmdbd - # - slurmrestd + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/component + operator: In + values: + - singleuser-server + topologyKey: "kubernetes.io/hostname" + namespaces: + - slurm + - jupyterhub + # # -- (object) # Set the update strategy configuration. 
diff --git a/apps/slinky-slurm-controlplane/kustomization.yaml b/apps/slinky-slurm-controlplane/kustomization.yaml index 8c1cd0c..c73face 100644 --- a/apps/slinky-slurm-controlplane/kustomization.yaml +++ b/apps/slinky-slurm-controlplane/kustomization.yaml @@ -13,3 +13,4 @@ resources: - namespace.yaml - ocirepository.yaml - helmrelease.yaml + - priorityclass.yaml diff --git a/apps/slinky-slurm-controlplane/priorityclass.yaml b/apps/slinky-slurm-controlplane/priorityclass.yaml new file mode 100644 index 0000000..ee1ae7c --- /dev/null +++ b/apps/slinky-slurm-controlplane/priorityclass.yaml @@ -0,0 +1,6 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: slinky-low-priority +value: -1 +globalDefault: false From 618dd10070d78adbefb8c7946a1b38905a37da5a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 7 Feb 2025 09:17:25 +0000 Subject: [PATCH 33/46] Added keda autoscaling --- apps/keda/helmrelease.yaml | 15 ++++++++++++++ apps/keda/helmrepository.yaml | 9 +++++++++ apps/keda/kustomization.yaml | 6 ++++++ apps/kube-prometheus-stack/defaults.yaml | 9 +++++++++ .../kustomization.yaml | 1 + .../scaledobject.yaml | 20 +++++++++++++++++++ 6 files changed, 60 insertions(+) create mode 100644 apps/keda/helmrelease.yaml create mode 100644 apps/keda/helmrepository.yaml create mode 100644 apps/keda/kustomization.yaml create mode 100644 apps/kube-prometheus-stack/defaults.yaml create mode 100644 apps/slinky-slurm-controlplane/scaledobject.yaml diff --git a/apps/keda/helmrelease.yaml b/apps/keda/helmrelease.yaml new file mode 100644 index 0000000..85f12d5 --- /dev/null +++ b/apps/keda/helmrelease.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: keda +spec: + chart: + spec: + chart: keda + sourceRef: + kind: HelmRepository + name: keda + interval: 5m + install: + createNamespace: true diff --git a/apps/keda/helmrepository.yaml b/apps/keda/helmrepository.yaml new file mode 100644 index 0000000..a7dec6d --- /dev/null +++ b/apps/keda/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: keda + namespace: keda +spec: + url: https://kedacore.github.io/charts + interval: 5m diff --git a/apps/keda/kustomization.yaml b/apps/keda/kustomization.yaml new file mode 100644 index 0000000..81aeb15 --- /dev/null +++ b/apps/keda/kustomization.yaml @@ -0,0 +1,6 @@ +--- +namespace: keda + +resources: + - helmrelease.yaml + - helmrepository.yaml diff --git a/apps/kube-prometheus-stack/defaults.yaml b/apps/kube-prometheus-stack/defaults.yaml new file mode 100644 index 0000000..5997202 --- /dev/null +++ b/apps/kube-prometheus-stack/defaults.yaml @@ -0,0 +1,9 @@ +prometheus: + prometheusSpec: + additionalScrapeConfigs: + - job_name: "slurm_exporter" + scrape_interval: 10s + scrape_timeout: 30s + static_configs: + - targets: + - slurm-exporter.slurm.svc.cluster.local:8080 diff --git a/apps/slinky-slurm-controlplane/kustomization.yaml b/apps/slinky-slurm-controlplane/kustomization.yaml index c73face..eb1698f 100644 --- a/apps/slinky-slurm-controlplane/kustomization.yaml +++ b/apps/slinky-slurm-controlplane/kustomization.yaml @@ -14,3 +14,4 @@ resources: - ocirepository.yaml - helmrelease.yaml - priorityclass.yaml + - scaledobject.yaml diff --git a/apps/slinky-slurm-controlplane/scaledobject.yaml b/apps/slinky-slurm-controlplane/scaledobject.yaml new file mode 100644 index 0000000..db185c4 --- /dev/null +++ b/apps/slinky-slurm-controlplane/scaledobject.yaml @@ -0,0 +1,20 @@ 
+apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: dawn-radar +spec: + scaleTargetRef: + apiVersion: slinky.slurm.net/v1alpha1 + kind: NodeSet + name: slurm-compute-dawn + idleReplicaCount: 0 + minReplicaCount: 1 + maxReplicaCount: 3 + cooldownPeriod: 600 #TODO: set to partition max job time + triggers: + - type: prometheus + metricType: Value + metadata: + serverAddress: http://prometheus-kube-prometheus-prometheus.prometheus:9090 + query: slurm_partition_pending_jobs{partition="dawn"} + threshold: '1' From 9ffac641551343fddb838f271a948ec0345e78ce Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 11 Feb 2025 06:45:35 +0000 Subject: [PATCH 34/46] Be sure to install keda with slinky And keda needed its namespace creating --- apps/keda/kustomization.yaml | 1 + apps/keda/namespace.yaml | 5 +++++ apps/kube-perftest/helmchart.yaml | 2 +- apps/kube-perftest/mpitests.yaml | 15 +++++++++++++++ apps/kube-perftest/namespace.yaml | 16 ++++++++-------- apps/slinky/kustomization.yaml | 2 ++ 6 files changed, 32 insertions(+), 9 deletions(-) create mode 100644 apps/keda/namespace.yaml create mode 100644 apps/kube-perftest/mpitests.yaml diff --git a/apps/keda/kustomization.yaml b/apps/keda/kustomization.yaml index 81aeb15..e58e893 100644 --- a/apps/keda/kustomization.yaml +++ b/apps/keda/kustomization.yaml @@ -2,5 +2,6 @@ namespace: keda resources: + - namespace.yaml - helmrelease.yaml - helmrepository.yaml diff --git a/apps/keda/namespace.yaml b/apps/keda/namespace.yaml new file mode 100644 index 0000000..02e3ca7 --- /dev/null +++ b/apps/keda/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: keda diff --git a/apps/kube-perftest/helmchart.yaml b/apps/kube-perftest/helmchart.yaml index 1b96205..1465aab 100644 --- a/apps/kube-perftest/helmchart.yaml +++ b/apps/kube-perftest/helmchart.yaml @@ -5,7 +5,7 @@ metadata: name: kube-perftest-operator spec: chart: kube-perftest-operator - version: ">=0.1.0-dev.0.main.0,<0.1.0-dev.0.main.99999999999" + version: "0.1.0-dev.0.refresh-images.193" sourceRef: kind: HelmRepository name: kube-perftest diff --git a/apps/kube-perftest/mpitests.yaml b/apps/kube-perftest/mpitests.yaml new file mode 100644 index 0000000..82f6e4e --- /dev/null +++ b/apps/kube-perftest/mpitests.yaml @@ -0,0 +1,15 @@ +apiVersion: perftest.stackhpc.com/v1alpha1 +kind: BenchmarkSet +metadata: + name: mpi-pingpong-cni +spec: + template: + apiVersion: perftest.stackhpc.com/v1alpha1 + kind: MPIPingPong + spec: + imagePullPolicy: Always + hostNetwork: false + # mtu: 9000 + transport: TCP + maxlog: 25 + repetitions: 5 \ No newline at end of file diff --git a/apps/kube-perftest/namespace.yaml b/apps/kube-perftest/namespace.yaml index b2c8c18..0db6b8d 100644 --- a/apps/kube-perftest/namespace.yaml +++ b/apps/kube-perftest/namespace.yaml @@ -3,11 +3,11 @@ apiVersion: v1 kind: Namespace metadata: name: kube-perftest - labels: - # Set the pod security standard for the namespace - # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ - # pod-security.kubernetes.io/enforce: privileged - # pod-security.kubernetes.io/enforce: baseline - # pod-security.kubernetes.io/enforce: restricted - # pod-security.kubernetes.io/warn: restricted - # pod-security.kubernetes.io/audit: restricted + # labels: + # # Set the pod security standard for the namespace + # # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # # pod-security.kubernetes.io/enforce: privileged + # # pod-security.kubernetes.io/enforce: baseline + # # 
pod-security.kubernetes.io/enforce: restricted + # # pod-security.kubernetes.io/warn: restricted + # # pod-security.kubernetes.io/audit: restricted diff --git a/apps/slinky/kustomization.yaml b/apps/slinky/kustomization.yaml index eb053fb..9d49efe 100644 --- a/apps/slinky/kustomization.yaml +++ b/apps/slinky/kustomization.yaml @@ -1,5 +1,7 @@ --- resources: # - ../cert-manager/ + - ../keda/ + # todo - need above to install before below - ../slinky-slurm-operator/ - ../slinky-slurm-controlplane/ From aab92f4fbecbd33f79885a4cde4242ec75209c10 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 11 Feb 2025 06:48:10 +0000 Subject: [PATCH 35/46] Restore JuypterHub while lockdown is wip --- apps/jupyterhub/configmap.yaml | 12 ++++++------ apps/jupyterhub/kustomization.yaml | 3 ++- apps/jupyterhub/namespace.yaml | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index e834f2f..ec3dabf 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -109,12 +109,12 @@ data: # "nvidia.com/hostdev": "1" supplemental_gids: - "110" # Ubuntu render group GID, requred for permission to use Intel GPU device - privilaged: false - container_security_context: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL + # privilaged: false + # container_security_context: + # allowPrivilegeEscalation: false + # capabilities: + # drop: + # - ALL - display_name: "Pytorch environment with 2 x Intel XPUs" description: "Pytorch Jupyter Stacks image!" kubespawner_override: diff --git a/apps/jupyterhub/kustomization.yaml b/apps/jupyterhub/kustomization.yaml index 620cff3..7fd6ffe 100644 --- a/apps/jupyterhub/kustomization.yaml +++ b/apps/jupyterhub/kustomization.yaml @@ -5,4 +5,5 @@ resources: - helmrelease.yaml - configmap.yaml # - secret.yaml - - extra-rbac.yaml + # TODO - restore the auto profile detection + # - extra-rbac.yaml diff --git a/apps/jupyterhub/namespace.yaml b/apps/jupyterhub/namespace.yaml index 6b63493..c6dd34d 100644 --- a/apps/jupyterhub/namespace.yaml +++ b/apps/jupyterhub/namespace.yaml @@ -9,6 +9,6 @@ metadata: # pod-security.kubernetes.io/enforce: privileged # pod-security.kubernetes.io/enforce: baseline # pod-security.kubernetes.io/enforce: restricted - pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/enforce: privileged pod-security.kubernetes.io/warn: restricted pod-security.kubernetes.io/audit: restricted From f0863a32b444a5b1a9f8cf7c90678cdcbe7ae497 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 11 Feb 2025 06:53:06 +0000 Subject: [PATCH 36/46] Use kubeperf test 0.1.0 release --- apps/kube-perftest/helmchart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/kube-perftest/helmchart.yaml b/apps/kube-perftest/helmchart.yaml index 1465aab..db2784c 100644 --- a/apps/kube-perftest/helmchart.yaml +++ b/apps/kube-perftest/helmchart.yaml @@ -5,7 +5,7 @@ metadata: name: kube-perftest-operator spec: chart: kube-perftest-operator - version: "0.1.0-dev.0.refresh-images.193" + version: "0.1.0" sourceRef: kind: HelmRepository name: kube-perftest From 3f68024d65613eca7fe03f09898b50f9b74736cb Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 11 Feb 2025 07:32:02 +0000 Subject: [PATCH 37/46] Remote prometheus bits, we have service monitor already --- apps/kube-prometheus-stack/defaults.yaml | 9 --------- apps/slinky-slurm-controlplane/defaults.yaml | 4 ++-- apps/slinky-slurm-controlplane/scaledobject.yaml | 4 ++-- 3 files changed, 4 
insertions(+), 13 deletions(-) delete mode 100644 apps/kube-prometheus-stack/defaults.yaml diff --git a/apps/kube-prometheus-stack/defaults.yaml b/apps/kube-prometheus-stack/defaults.yaml deleted file mode 100644 index 5997202..0000000 --- a/apps/kube-prometheus-stack/defaults.yaml +++ /dev/null @@ -1,9 +0,0 @@ -prometheus: - prometheusSpec: - additionalScrapeConfigs: - - job_name: "slurm_exporter" - scrape_interval: 10s - scrape_timeout: 30s - static_configs: - - targets: - - slurm-exporter.slurm.svc.cluster.local:8080 diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index ea6ca41..2fd09fd 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -535,9 +535,9 @@ mariadb: selector: {} priorityClassName: "" metrics: - enabled: false + enabled: true serviceMonitor: - enabled: false + enabled: true affinity: {} resources: {} diff --git a/apps/slinky-slurm-controlplane/scaledobject.yaml b/apps/slinky-slurm-controlplane/scaledobject.yaml index db185c4..fe3884d 100644 --- a/apps/slinky-slurm-controlplane/scaledobject.yaml +++ b/apps/slinky-slurm-controlplane/scaledobject.yaml @@ -8,8 +8,8 @@ spec: kind: NodeSet name: slurm-compute-dawn idleReplicaCount: 0 - minReplicaCount: 1 - maxReplicaCount: 3 + minReplicaCount: 0 + maxReplicaCount: 2 cooldownPeriod: 600 #TODO: set to partition max job time triggers: - type: prometheus From de8a237054c36b37dac4836d9b1ad19fbb565d57 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Tue, 11 Feb 2025 10:22:23 +0000 Subject: [PATCH 38/46] Add extra namespace to slinky --- apps/slinky-slurm-controlplane/defaults.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index 2fd09fd..aee2b66 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -330,9 +330,11 @@ compute: values: - singleuser-server topologyKey: "kubernetes.io/hostname" + # TODO - make this more easily configurable namespaces: - slurm - jupyterhub + - testproject2 # # -- (object) From c59f1690c6fdefd1547c48aabf97ebf165aafc3b Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 13 Feb 2025 16:39:27 +0000 Subject: [PATCH 39/46] Update build-images.yml --- .github/workflows/build-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml index 06ab587..ec4cd05 100644 --- a/.github/workflows/build-images.yml +++ b/.github/workflows/build-images.yml @@ -31,7 +31,7 @@ jobs: id: image-meta uses: docker/metadata-action@v5 with: - images: ghcr.io/johngarbutt/${{ matrix.image }} + images: ghcr.io/${{ github.repository_owner }}/${{ matrix.image }} # Produce the branch name or tag and the SHA as tags tags: | type=ref,event=branch From f2eecd3a687f0f6c83fe74ad992c9b848f5f2ff8 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Thu, 13 Feb 2025 16:42:02 +0000 Subject: [PATCH 40/46] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 01f8e0e..12117de 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # fluxcd-demo-apps A repository of example apps deployed and managed using Flux CD +> [!CAUTION] +> This is very much a work in progress!! + ## Creating Sealed Secrets We assume the use of sealed secrets. 
From c5e5813e68bf161573726bccce96e0972703b132 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 14 Feb 2025 09:22:06 +0000 Subject: [PATCH 41/46] disabled privileged initcontainers --- apps/jupyterhub/configmap.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml index ec3dabf..aac0713 100644 --- a/apps/jupyterhub/configmap.yaml +++ b/apps/jupyterhub/configmap.yaml @@ -90,6 +90,8 @@ data: image: name: quay.io/jupyter/minimal-notebook tag: "2025-01-28" + cloudMetadata: + blockWithIptables: false profileList: - display_name: "Minimal environment" description: "To avoid too much bells and whistles: Python." From 6670396debe74c174821d8c8b0de9bcd39715748 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Fri, 14 Feb 2025 10:49:55 +0000 Subject: [PATCH 42/46] Move to baseline in juypterhub --- apps/jupyterhub/namespace.yaml | 2 +- apps/slinky-slurm-controlplane/defaults.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/jupyterhub/namespace.yaml b/apps/jupyterhub/namespace.yaml index c6dd34d..6b63493 100644 --- a/apps/jupyterhub/namespace.yaml +++ b/apps/jupyterhub/namespace.yaml @@ -9,6 +9,6 @@ metadata: # pod-security.kubernetes.io/enforce: privileged # pod-security.kubernetes.io/enforce: baseline # pod-security.kubernetes.io/enforce: restricted - pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/enforce: baseline pod-security.kubernetes.io/warn: restricted pod-security.kubernetes.io/audit: restricted diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml index aee2b66..5fd93b5 100644 --- a/apps/slinky-slurm-controlplane/defaults.yaml +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -277,7 +277,7 @@ compute: # -- (integer) # Set the number of replicas to deploy. # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). - replicas: 3 # TODO: set to max nodes in cluster + replicas: 2 # TODO: set to max nodes in cluster # # -- (int) # The minimum number of seconds for which a newly created NodeSet Pod should be ready From 4a59e82575064fadb5027ac1df7ed865582d4149 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Sun, 16 Feb 2025 17:51:58 +0000 Subject: [PATCH 43/46] Add in opencost --- apps/opencost/configmap.yaml | 55 +++++++++++++++++++++++++++++++ apps/opencost/helmchart.yaml | 12 +++++++ apps/opencost/helmrelease.yaml | 23 +++++++++++++ apps/opencost/helmrepository.yaml | 8 +++++ apps/opencost/kustomization.yaml | 8 +++++ apps/opencost/namespace.yaml | 14 ++++++++ 6 files changed, 120 insertions(+) create mode 100644 apps/opencost/configmap.yaml create mode 100644 apps/opencost/helmchart.yaml create mode 100644 apps/opencost/helmrelease.yaml create mode 100644 apps/opencost/helmrepository.yaml create mode 100644 apps/opencost/kustomization.yaml create mode 100644 apps/opencost/namespace.yaml diff --git a/apps/opencost/configmap.yaml b/apps/opencost/configmap.yaml new file mode 100644 index 0000000..b9f9a8e --- /dev/null +++ b/apps/opencost/configmap.yaml @@ -0,0 +1,55 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: opencost-config +data: + values.yaml: | + opencost: + customPricing: + enabled: true + provider: custom + costModel: + description: Modified pricing configuration. 
+ CPU: 0.031611 + spotCPU: 0.006655 + RAM: 0.004237 + spotRAM: 0.000892 + GPU: 0.95 + storage: 0.00005479452 + zoneNetworkEgress: 0.01 + regionNetworkEgress: 0.01 + internetNetworkEgress: 0.143 + spotLabel: "" + spotLabelValue: "" + awsServiceKeyName: "" + awsServiceKeySecret: "" + awsSpotDataRegion: "" + awsSpotDataBucket: "" + awsSpotDataPrefix: "" + athenaBucketName: "" + athenaRegion: "" + athenaDatabase: "" + athenaTable: "" + projectID: "${ACCOUNT_ID}" + exporter: + defaultClusterId: dawntest + extraEnv: + EMIT_KSM_V1_METRICS: "false" + EMIT_KSM_V1_METRICS_ONLY: "true" + LOG_LEVEL: debug # warn + prometheus: + internal: + enabled: true + serviceName: kube-prometheus-stack-prometheus + namespaceName: monitoring-system + port: 9090 + ui: + enabled: true + metrics: + serviceMonitor: + enabled: true + namespace: monitoring-system + carbonCost: + # TODO! + enabled: false diff --git a/apps/opencost/helmchart.yaml b/apps/opencost/helmchart.yaml new file mode 100644 index 0000000..4c96b6e --- /dev/null +++ b/apps/opencost/helmchart.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: opencost +spec: + chart: opencost + version: "1.43.2" + sourceRef: + kind: HelmRepository + name: opencost + interval: 10m0s diff --git a/apps/opencost/helmrelease.yaml b/apps/opencost/helmrelease.yaml new file mode 100644 index 0000000..2084910 --- /dev/null +++ b/apps/opencost/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: opencost +spec: + chartRef: + kind: HelmChart + name: opencost + releaseName: opencost + valuesFrom: + - kind: ConfigMap + name: opencost-config + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/opencost/helmrepository.yaml b/apps/opencost/helmrepository.yaml new file mode 100644 index 0000000..a438570 --- /dev/null +++ b/apps/opencost/helmrepository.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: opencost +spec: + url: https://opencost.github.io/opencost-helm-chart + interval: 1h diff --git a/apps/opencost/kustomization.yaml b/apps/opencost/kustomization.yaml new file mode 100644 index 0000000..57fabf7 --- /dev/null +++ b/apps/opencost/kustomization.yaml @@ -0,0 +1,8 @@ +namespace: opencost + +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml diff --git a/apps/opencost/namespace.yaml b/apps/opencost/namespace.yaml new file mode 100644 index 0000000..07af73d --- /dev/null +++ b/apps/opencost/namespace.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: opencost + labels: + # Set the pod security standard for the namespace + # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # pod-security.kubernetes.io/enforce: privileged + # pod-security.kubernetes.io/enforce: baseline + # pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/warn: restricted + pod-security.kubernetes.io/audit: restricted From ebbc1ac4a7caf53d05758b655977c5e8c5889546 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Mon, 10 Mar 2025 10:03:33 +0000 Subject: [PATCH 44/46] Add tetragon example --- apps/tetragon/configmap.yaml | 15 ++ apps/tetragon/file_monitoring.yaml | 176 ++++++++++++++++++++++ apps/tetragon/helmchart.yaml | 12 ++ apps/tetragon/kustomization.yaml 
| 12 ++ apps/tetragon/network_egress_cluster.yaml | 19 +++ 5 files changed, 234 insertions(+) create mode 100644 apps/tetragon/configmap.yaml create mode 100644 apps/tetragon/file_monitoring.yaml create mode 100644 apps/tetragon/helmchart.yaml create mode 100644 apps/tetragon/kustomization.yaml create mode 100644 apps/tetragon/network_egress_cluster.yaml diff --git a/apps/tetragon/configmap.yaml b/apps/tetragon/configmap.yaml new file mode 100644 index 0000000..220251d --- /dev/null +++ b/apps/tetragon/configmap.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tetragon +data: + values.yaml: | + tetragon: + prometheus: + serviceMonitor: + enabled: true + tetragonOperator: + prometheus: + serviceMonitor: + enabled: true \ No newline at end of file diff --git a/apps/tetragon/file_monitoring.yaml b/apps/tetragon/file_monitoring.yaml new file mode 100644 index 0000000..1374936 --- /dev/null +++ b/apps/tetragon/file_monitoring.yaml @@ -0,0 +1,176 @@ +apiVersion: cilium.io/v1alpha1 +kind: TracingPolicy +metadata: + name: "file-monitoring-filtered" +spec: + kprobes: + - call: "security_file_permission" + syscall: false + return: true + args: + - index: 0 + type: "file" # (struct file *) used for getting the path + - index: 1 + type: "int" # 0x04 is MAY_READ, 0x02 is MAY_WRITE + returnArg: + index: 0 + type: "int" + returnArgAction: "Post" + selectors: + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/boot" # Reads to sensitive directories + - "/root/.ssh" # Reads to sensitive files we want to know about + - "/etc/shadow" + - "/etc/profile" + - "/etc/sudoers" + - "/etc/pam.conf" # Reads global shell configs bash/csh supported + - "/etc/bashrc" + - "/etc/csh.cshrc" + - "/etc/csh.login" # Add additional sensitive files here + - index: 1 + operator: "Equal" + values: + - "4" # MAY_READ + - matchArgs: + - index: 0 + operator: "Postfix" + values: + - ".bashrc" # Reads to shell config files bash, csh supported + - ".bash_profile" # add any other shell support here. + - ".bash_login" + - ".bash_logout" + - ".cshrc" + - ".cshdirs" + - ".profile" # Reads to common environments files + - ".login" + - ".logout" + - ".history" # Add additional sensitive files here + - index: 1 + operator: "Equal" + values: + - "4" # MAY_READ + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/etc" # Writes to sensitive directories + - "/boot" + - "/lib" + - "/lib64" + - "/bin" + - "/usr/lib" + - "/usr/local/lib" + - "/usr/local/sbin" + - "/usr/local/bin" + - "/usr/bin" + - "/usr/sbin" + - "/var/log" # Writes to logs + - "/dev/log" + - "/root/.ssh" # Writes to sensitive files add here. + - index: 1 + operator: "Equal" + values: + - "2" # MAY_WRITE + - call: "security_mmap_file" + syscall: false + return: true + args: + - index: 0 + type: "file" # (struct file *) used for getting the path + - index: 1 + type: "uint32" # the prot flags PROT_READ(0x01), PROT_WRITE(0x02), PROT_EXEC(0x04) + - index: 2 + type: "uint32" # the mmap flags (i.e. MAP_SHARED, ...) 
+ returnArg: + index: 0 + type: "int" + returnArgAction: "Post" + selectors: + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/boot" # Reads to sensitive directories + - "/root/.ssh" # Reads to sensitive files we want to know about + - "/etc/shadow" + - "/etc/sudoers" + - "/etc/pam.conf" # Reads global shell configs bash/csh supported + - "/etc/profile" + - "/etc/bashrc" + - "/etc/csh.cshrc" + - "/etc/csh.login" + - ".bashrc" # Reads to shell config files bash, csh supported + - ".bash_profile" # add any other shell support here. + - ".bash_login" + - ".bash_logout" + - ".cshrc" + - ".cshdirs" + - ".profile" # Reads to common environments files + - ".login" + - ".logout" + - ".history" # Add additional sensitive mmap files here + - index: 1 + operator: "Equal" + values: + - "1" # MAY_READ + - index: 2 + operator: "Mask" + values: + - "1" # MAP_SHARED + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/etc" # Writes to sensitive directories + - "/boot" + - "/lib" + - "/lib64" + - "/bin" + - "/usr/lib" + - "/usr/local/lib" + - "/usr/local/sbin" + - "/usr/local/bin" + - "/usr/bin" + - "/usr/sbin" + - "/var/log" # Writes to logs + - "/dev/log" + - "/root/.ssh" # Writes to sensitive files add here. + - index: 1 + operator: "Mask" + values: + - "2" # PROT_WRITE + - index: 2 + operator: "Mask" + values: + - "1" # MAP_SHARED + - call: "security_path_truncate" + syscall: false + return: true + args: + - index: 0 + type: "path" # (struct path *) used for getting the path + returnArg: + index: 0 + type: "int" + returnArgAction: "Post" + selectors: + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/etc" # Truncate to sensitive directories + - "/boot" + - "/lib" + - "/lib64" + - "/usr/lib" + - "/usr/local/lib" + - "/usr/local/sbin" + - "/usr/local/bin" + - "/usr/bin" + - "/usr/sbin" + - "/var/log" # Truncate to logs + - "/dev/log" + - "/root/.ssh" # Truncate to sensitive files add here. 
\ No newline at end of file
diff --git a/apps/tetragon/helmchart.yaml b/apps/tetragon/helmchart.yaml
new file mode 100644
index 0000000..550608f
--- /dev/null
+++ b/apps/tetragon/helmchart.yaml
@@ -0,0 +1,12 @@
+---
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmChart
+metadata:
+  name: tetragon
+spec:
+  chart: tetragon
+  version: "v1.3.0"
+  sourceRef:
+    kind: HelmRepository
+    name: cilium
+  interval: 10m0s
diff --git a/apps/tetragon/kustomization.yaml b/apps/tetragon/kustomization.yaml
new file mode 100644
index 0000000..ec759cb
--- /dev/null
+++ b/apps/tetragon/kustomization.yaml
@@ -0,0 +1,12 @@
+---
+namespace: tetragon
+
+resources:
+  - namespace.yaml
+  - helmrepository.yaml
+  - helmchart.yaml
+  - helmrelease.yaml
+  - configmap.yaml
+  # TODO - these need to depend on helm chart install
+  - file_monitoring.yaml
+  - network_egress_cluster.yaml
diff --git a/apps/tetragon/network_egress_cluster.yaml b/apps/tetragon/network_egress_cluster.yaml
new file mode 100644
index 0000000..acd95d0
--- /dev/null
+++ b/apps/tetragon/network_egress_cluster.yaml
@@ -0,0 +1,19 @@
+apiVersion: cilium.io/v1alpha1
+kind: TracingPolicy
+metadata:
+  name: "monitor-network-activity-outside-cluster-cidr-range"
+spec:
+  kprobes:
+  - call: "tcp_connect"
+    syscall: false
+    args:
+    - index: 0
+      type: "sock"
+    selectors:
+    - matchArgs:
+      - index: 0
+        operator: "NotDAddr"
+        values:
+        - 127.0.0.1
+        - 172.16.0.0/13 # pods
+        - 172.24.0.0/13 # services
\ No newline at end of file
From 7aacc4febdc46a5a048e05349355ab7a4fd67823 Mon Sep 17 00:00:00 2001
From: John Garbutt
Date: Mon, 10 Mar 2025 10:03:33 +0000
Subject: [PATCH 46/46] Add tetragon example