diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml new file mode 100644 index 0000000..ec4cd05 --- /dev/null +++ b/.github/workflows/build-images.yml @@ -0,0 +1,50 @@ +name: Publish Container Images +on: + push: + paths: + - images/** +jobs: + build_push_images: + name: Build and push images + permissions: + contents: read + id-token: write # needed for signing the images with GitHub OIDC Token + packages: write # required for pushing container images + security-events: write # required for pushing SARIF files + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - image: jupyterhub-intel-gpu + steps: + - name: Check out the repository + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Calculate metadata for image + id: image-meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository_owner }}/${{ matrix.image }} + # Produce the branch name or tag and the SHA as tags + tags: | + type=ref,event=branch + type=ref,event=tag + type=sha,prefix= + + - name: Build and push image + uses: azimuth-cloud/github-actions/docker-multiarch-build-push@master + with: + cache-key: ${{ matrix.image }} + context: ./images/${{ matrix.image }} + platforms: linux/amd64 + push: true + tags: ${{ steps.image-meta.outputs.tags }} + labels: ${{ steps.image-meta.outputs.labels }} + diff --git a/README.md b/README.md index 4293566..12117de 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ # fluxcd-demo-apps A repository of example apps deployed and managed using Flux CD + +> [!CAUTION] +> This is very much a work in progress!! + +## Creating Sealed Secrets + +We assume the use of sealed secrets. + +TODO: add more instructions! + +## How to install + +The host cluster must have the [Flux CD](https://fluxcd.io/) controllers installed. 
+ +Configuring Flux to manage the apps defined in the repository is a one-time operation: + +```sh +flux create source git myapps --url=<repository-url> --branch=main +flux create kustomization myapps --source=GitRepository/myapps --prune=true +``` diff --git a/apps/cert-manager/configmap.yaml b/apps/cert-manager/configmap.yaml new file mode 100644 index 0000000..7acc97b --- /dev/null +++ b/apps/cert-manager/configmap.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: cert-manager-config + namespace: cert-manager +data: + values.yaml: | + installCRDs: true diff --git a/apps/cert-manager/helmchart.yaml b/apps/cert-manager/helmchart.yaml new file mode 100644 index 0000000..f08fabb --- /dev/null +++ b/apps/cert-manager/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: cert-manager + namespace: cert-manager +spec: + chart: cert-manager + version: v1.16.1 + sourceRef: + kind: HelmRepository + name: jetstack + interval: 1h diff --git a/apps/cert-manager/helmrelease.yaml b/apps/cert-manager/helmrelease.yaml new file mode 100644 index 0000000..b3e258e --- /dev/null +++ b/apps/cert-manager/helmrelease.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: cert-manager + namespace: cert-manager +spec: + chartRef: + kind: HelmChart + name: cert-manager + releaseName: cert-manager + valuesFrom: + - kind: ConfigMap + name: cert-manager-config + valuesKey: values.yaml + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/cert-manager/helmrepository.yaml b/apps/cert-manager/helmrepository.yaml new file mode 100644 index 0000000..a0fd9a2 --- /dev/null +++ b/apps/cert-manager/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: jetstack + namespace: cert-manager +spec: + url: 
https://charts.jetstack.io + interval: 1h diff --git a/apps/cert-manager/kustomization.yaml b/apps/cert-manager/kustomization.yaml new file mode 100644 index 0000000..ebd317e --- /dev/null +++ b/apps/cert-manager/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - namespace.yaml + - configmap.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml diff --git a/apps/cert-manager/namespace.yaml b/apps/cert-manager/namespace.yaml new file mode 100644 index 0000000..6bc19f4 --- /dev/null +++ b/apps/cert-manager/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: cert-manager diff --git a/apps/jupyterhub/configmap.yaml b/apps/jupyterhub/configmap.yaml new file mode 100644 index 0000000..aac0713 --- /dev/null +++ b/apps/jupyterhub/configmap.yaml @@ -0,0 +1,137 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: jupyterhub-config + namespace: jupyterhub +data: + values.yaml: | + + # Add JupyterHub customisations here + # See https://artifacthub.io/packages/helm/jupyterhub/jupyterhub + + # We don't need a load balancer for the proxy + # since we want to use ingress instead. + # + # To access manually try: + # kubectl port-forward -n jupyterhub svc/proxy-public 8080:80 + proxy: + service: + type: ClusterIP + + # Make JupyterHub accessible via ingress + ingress: + enabled: false + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + # IP must match NGINX ingress controller's + # load balancer IP. + # See `kubectl get svc -n ingress-nginx` + - &host jh.dawntest.128-232-224-75.nip.io + pathSuffix: "" + tls: + - hosts: + - *host + secretName: jupyterhub-ingress-cert + + hub: + allowNamedServers: true + namedServerLimitPerUser: 3 + activeServerLimit: 5 + # Server startup fails with default + # restrictive network policy. 
+ networkPolicy: + enabled: false + + # # Configure Keycloak auth + # config: + # JupyterHub: + # authenticator_class: generic-oauth + # GenericOAuthenticator: + # client_id: scott-jupyterhub-test + # # client_secret: + # # Must match ingress host + # oauth_callback_url: https://128-232-226-29.sslip.io/hub/oauth_callback + # authorize_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/auth + # token_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/token + # userdata_url: https://identity.apps.hpc.cam.ac.uk/realms/az-rcp-cloud-portal-demo/protocol/openid-connect/userinfo + # scope: + # - openid + # - groups + # username_claim: preferred_username + # claim_groups_key: groups + # userdata_params: + # state: state + + # # Limit access to specific keycloak groups + # allowed_groups: + # - /admins + # - /platform-users + + # # Allow hub admin access to keycloak users/groups + # # admin_groups: + # # - /admins + # admin_users: + # - scottd_stack + + # # Label for the 'Sign in with ___' button + # login_service: Keycloak + + # turn this off for now + prePuller: + hook: + enabled: false + continuous: + enabled: false + + singleuser: + image: + name: quay.io/jupyter/minimal-notebook + tag: "2025-01-28" + cloudMetadata: + blockWithIptables: false + profileList: + - display_name: "Minimal environment" + description: "To avoid too much bells and whistles: Python." + default: true + - display_name: "Datascience environment" + description: "If you want the additional bells and whistles: Python, R, and Julia." + kubespawner_override: + image: quay.io/jupyter/datascience-notebook:2025-01-28 + - display_name: "Pytorch environment with 1 x Intel XPUs" + description: "Pytorch Jupyter Stacks image!" 
+ kubespawner_override: + #image: quay.io/jupyter/pytorch-notebook:2025-01-28 + #image: ghcr.io/stackhpc/jupyterhub-pytorch-intel-gpu:v0.0.1 + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 + extra_resource_limits: + "gpu.intel.com/i915": "1" + # "nvidia.com/hostdev": "1" + supplemental_gids: + - "110" # Ubuntu render group GID, required for permission to use Intel GPU device + # privileged: false + # container_security_context: + # allowPrivilegeEscalation: false + # capabilities: + # drop: + # - ALL + - display_name: "Pytorch environment with 2 x Intel XPUs" + description: "Pytorch Jupyter Stacks image!" + kubespawner_override: + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 + extra_resource_limits: + "gpu.intel.com/i915": "2" + "nvidia.com/hostdev": "2" + supplemental_gids: + - "110" # Ubuntu render group GID, required for permission to use Intel GPU device + - display_name: "Pytorch environment with 4 x Intel XPUs" + description: "Pytorch Jupyter Stacks image!" + kubespawner_override: + image: ghcr.io/johngarbutt/jupyterhub-intel-gpu:6421ba1 + extra_resource_limits: + "gpu.intel.com/i915": "4" + "nvidia.com/hostdev": "4" + supplemental_gids: + - "110" # Ubuntu render group GID, required for permission to use Intel GPU device diff --git a/apps/jupyterhub/example-secret.yaml b/apps/jupyterhub/example-secret.yaml new file mode 100644 index 0000000..6112729 --- /dev/null +++ b/apps/jupyterhub/example-secret.yaml @@ -0,0 +1,19 @@ +--- +############ +# IMPORTANT: Make sure you run kubeseal against any secret before committing it to git! 
+# Example command: +# kubeseal \ +# --kubeconfig clusters/jupyterhub/kubeconfig \ +# --format yaml \ +# --controller-name sealed-secrets \ +# --controller-namespace sealed-secrets-system \ +# --secret-file components/jupyterhub/secret.yaml \ +# --sealed-secret-file components/jupyterhub/secret.yaml +############ +apiVersion: v1 +kind: Secret +metadata: + name: jupyterhub-keycloak-config + namespace: jupyterhub +stringData: + keycloakClientSecret: diff --git a/apps/jupyterhub/extra-rbac.yaml b/apps/jupyterhub/extra-rbac.yaml new file mode 100644 index 0000000..4ee2575 --- /dev/null +++ b/apps/jupyterhub/extra-rbac.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: jupyterhub-node-list +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: jupyterhub-node-list +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: jupyterhub-node-list +subjects: +- kind: ServiceAccount + name: hub + namespace: jupyterhub diff --git a/apps/jupyterhub/helmchart.yaml b/apps/jupyterhub/helmchart.yaml new file mode 100644 index 0000000..737e8c7 --- /dev/null +++ b/apps/jupyterhub/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: jupyterhub + namespace: jupyterhub +spec: + chart: jupyterhub + version: "4.1.0" + sourceRef: + kind: HelmRepository + name: jupyterhub + interval: 10m0s diff --git a/apps/jupyterhub/helmrelease.yaml b/apps/jupyterhub/helmrelease.yaml new file mode 100644 index 0000000..01c297b --- /dev/null +++ b/apps/jupyterhub/helmrelease.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: jupyterhub + namespace: jupyterhub +spec: + chartRef: + kind: HelmChart + name: jupyterhub + releaseName: jupyterhub + valuesFrom: + - kind: ConfigMap + name: jupyterhub-config + # - kind: Secret 
+ # name: jupyterhub-keycloak-config + # valuesKey: keycloakClientSecret + # targetPath: hub.config.GenericOAuthenticator.client_secret + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/jupyterhub/helmrepository.yaml b/apps/jupyterhub/helmrepository.yaml new file mode 100644 index 0000000..2c87a51 --- /dev/null +++ b/apps/jupyterhub/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: jupyterhub + namespace: jupyterhub +spec: + url: https://jupyterhub.github.io/helm-chart + interval: 1h diff --git a/apps/jupyterhub/kustomization.yaml b/apps/jupyterhub/kustomization.yaml new file mode 100644 index 0000000..7fd6ffe --- /dev/null +++ b/apps/jupyterhub/kustomization.yaml @@ -0,0 +1,9 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml + # - secret.yaml + # TODO - restore the auto profile detection + # - extra-rbac.yaml diff --git a/apps/jupyterhub/namespace.yaml b/apps/jupyterhub/namespace.yaml new file mode 100644 index 0000000..6b63493 --- /dev/null +++ b/apps/jupyterhub/namespace.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: jupyterhub + labels: + # Set the pod security standard for the namespace + # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # pod-security.kubernetes.io/enforce: privileged + # pod-security.kubernetes.io/enforce: baseline + # pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/warn: restricted + pod-security.kubernetes.io/audit: restricted diff --git a/apps/keda/helmrelease.yaml b/apps/keda/helmrelease.yaml new file mode 100644 index 0000000..85f12d5 --- /dev/null +++ b/apps/keda/helmrelease.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease 
+metadata: + name: keda +spec: + chart: + spec: + chart: keda + sourceRef: + kind: HelmRepository + name: keda + interval: 5m + install: + createNamespace: true diff --git a/apps/keda/helmrepository.yaml b/apps/keda/helmrepository.yaml new file mode 100644 index 0000000..a7dec6d --- /dev/null +++ b/apps/keda/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: keda + namespace: keda +spec: + url: https://kedacore.github.io/charts + interval: 5m diff --git a/apps/keda/kustomization.yaml b/apps/keda/kustomization.yaml new file mode 100644 index 0000000..e58e893 --- /dev/null +++ b/apps/keda/kustomization.yaml @@ -0,0 +1,7 @@ +--- +namespace: keda + +resources: + - namespace.yaml + - helmrelease.yaml + - helmrepository.yaml diff --git a/apps/keda/namespace.yaml b/apps/keda/namespace.yaml new file mode 100644 index 0000000..02e3ca7 --- /dev/null +++ b/apps/keda/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: keda diff --git a/apps/kube-perftest/configmap.yaml b/apps/kube-perftest/configmap.yaml new file mode 100644 index 0000000..4976bf4 --- /dev/null +++ b/apps/kube-perftest/configmap.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: kube-perftest-config +data: + values.yaml: | + # TODO... 
\ No newline at end of file diff --git a/apps/kube-perftest/helmchart.yaml b/apps/kube-perftest/helmchart.yaml new file mode 100644 index 0000000..db2784c --- /dev/null +++ b/apps/kube-perftest/helmchart.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: kube-perftest-operator +spec: + chart: kube-perftest-operator + version: "0.1.0" + sourceRef: + kind: HelmRepository + name: kube-perftest + interval: 10m0s diff --git a/apps/kube-perftest/helmrelease.yaml b/apps/kube-perftest/helmrelease.yaml new file mode 100644 index 0000000..9dc3cf3 --- /dev/null +++ b/apps/kube-perftest/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: kube-perftest-operator +spec: + chartRef: + kind: HelmChart + name: kube-perftest-operator + releaseName: kube-perftest-operator + valuesFrom: + - kind: ConfigMap + name: kube-perftest-config + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/kube-perftest/helmrepository.yaml b/apps/kube-perftest/helmrepository.yaml new file mode 100644 index 0000000..84b4c56 --- /dev/null +++ b/apps/kube-perftest/helmrepository.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: kube-perftest +spec: + url: https://stackhpc.github.io/kube-perftest + interval: 1h diff --git a/apps/kube-perftest/kustomization.yaml b/apps/kube-perftest/kustomization.yaml new file mode 100644 index 0000000..7358c2a --- /dev/null +++ b/apps/kube-perftest/kustomization.yaml @@ -0,0 +1,9 @@ +--- +namespace: kube-perftest + +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml diff --git a/apps/kube-perftest/mpitests.yaml b/apps/kube-perftest/mpitests.yaml new file mode 100644 index 0000000..82f6e4e --- /dev/null +++ 
b/apps/kube-perftest/mpitests.yaml @@ -0,0 +1,15 @@ +apiVersion: perftest.stackhpc.com/v1alpha1 +kind: BenchmarkSet +metadata: + name: mpi-pingpong-cni +spec: + template: + apiVersion: perftest.stackhpc.com/v1alpha1 + kind: MPIPingPong + spec: + imagePullPolicy: Always + hostNetwork: false + # mtu: 9000 + transport: TCP + maxlog: 25 + repetitions: 5 \ No newline at end of file diff --git a/apps/kube-perftest/namespace.yaml b/apps/kube-perftest/namespace.yaml new file mode 100644 index 0000000..0db6b8d --- /dev/null +++ b/apps/kube-perftest/namespace.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: kube-perftest + # labels: + # # Set the pod security standard for the namespace + # # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # # pod-security.kubernetes.io/enforce: privileged + # # pod-security.kubernetes.io/enforce: baseline + # # pod-security.kubernetes.io/enforce: restricted + # # pod-security.kubernetes.io/warn: restricted + # # pod-security.kubernetes.io/audit: restricted diff --git a/apps/opencost/configmap.yaml b/apps/opencost/configmap.yaml new file mode 100644 index 0000000..b9f9a8e --- /dev/null +++ b/apps/opencost/configmap.yaml @@ -0,0 +1,55 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: opencost-config +data: + values.yaml: | + opencost: + customPricing: + enabled: true + provider: custom + costModel: + description: Modified pricing configuration. 
+ CPU: 0.031611 + spotCPU: 0.006655 + RAM: 0.004237 + spotRAM: 0.000892 + GPU: 0.95 + storage: 0.00005479452 + zoneNetworkEgress: 0.01 + regionNetworkEgress: 0.01 + internetNetworkEgress: 0.143 + spotLabel: "" + spotLabelValue: "" + awsServiceKeyName: "" + awsServiceKeySecret: "" + awsSpotDataRegion: "" + awsSpotDataBucket: "" + awsSpotDataPrefix: "" + athenaBucketName: "" + athenaRegion: "" + athenaDatabase: "" + athenaTable: "" + projectID: "${ACCOUNT_ID}" + exporter: + defaultClusterId: dawntest + extraEnv: + EMIT_KSM_V1_METRICS: "false" + EMIT_KSM_V1_METRICS_ONLY: "true" + LOG_LEVEL: debug # warn + prometheus: + internal: + enabled: true + serviceName: kube-prometheus-stack-prometheus + namespaceName: monitoring-system + port: 9090 + ui: + enabled: true + metrics: + serviceMonitor: + enabled: true + namespace: monitoring-system + carbonCost: + # TODO! + enabled: false diff --git a/apps/opencost/helmchart.yaml b/apps/opencost/helmchart.yaml new file mode 100644 index 0000000..4c96b6e --- /dev/null +++ b/apps/opencost/helmchart.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: opencost +spec: + chart: opencost + version: "1.43.2" + sourceRef: + kind: HelmRepository + name: opencost + interval: 10m0s diff --git a/apps/opencost/helmrelease.yaml b/apps/opencost/helmrelease.yaml new file mode 100644 index 0000000..2084910 --- /dev/null +++ b/apps/opencost/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: opencost +spec: + chartRef: + kind: HelmChart + name: opencost + releaseName: opencost + valuesFrom: + - kind: ConfigMap + name: opencost-config + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/opencost/helmrepository.yaml b/apps/opencost/helmrepository.yaml new file mode 100644 index 0000000..a438570 --- /dev/null +++ 
b/apps/opencost/helmrepository.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: opencost +spec: + url: https://opencost.github.io/opencost-helm-chart + interval: 1h diff --git a/apps/opencost/kustomization.yaml b/apps/opencost/kustomization.yaml new file mode 100644 index 0000000..57fabf7 --- /dev/null +++ b/apps/opencost/kustomization.yaml @@ -0,0 +1,8 @@ +namespace: opencost + +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml diff --git a/apps/opencost/namespace.yaml b/apps/opencost/namespace.yaml new file mode 100644 index 0000000..07af73d --- /dev/null +++ b/apps/opencost/namespace.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: opencost + labels: + # Set the pod security standard for the namespace + # https://kubernetes.io/docs/tutorials/security/ns-level-pss/ + # pod-security.kubernetes.io/enforce: privileged + # pod-security.kubernetes.io/enforce: baseline + # pod-security.kubernetes.io/enforce: restricted + pod-security.kubernetes.io/enforce: baseline + pod-security.kubernetes.io/warn: restricted + pod-security.kubernetes.io/audit: restricted diff --git a/apps/rdmatest/hostdevice-network-pod1.yaml b/apps/rdmatest/hostdevice-network-pod1.yaml new file mode 100644 index 0000000..d1661ee --- /dev/null +++ b/apps/rdmatest/hostdevice-network-pod1.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-hostdev-pod-1 + annotations: + k8s.v1.cni.cncf.io/networks: example-hostdevice-network +spec: + nodeSelector: + # Note: Replace hostname or remove selector altogether + kubernetes.io/hostname: dawntest-dawn-ztt9k-jzczg + containers: + - name: test-hostdev-pod + #image: ghcr.io/stackhpc/kube-perftest-mpi-benchmarks:19e96a8 + image: mellanox/rping-test + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c", "--" ] + args: [ "while true; do sleep 300; done;" ] + securityContext: + 
capabilities: + add: [ "IPC_LOCK" ] + resources: + requests: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" + limits: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" diff --git a/apps/rdmatest/hostdevice-network-pod2.yaml b/apps/rdmatest/hostdevice-network-pod2.yaml new file mode 100644 index 0000000..5de6b35 --- /dev/null +++ b/apps/rdmatest/hostdevice-network-pod2.yaml @@ -0,0 +1,27 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-hostdev-pod-2 + annotations: + k8s.v1.cni.cncf.io/networks: example-hostdevice-network +spec: + nodeSelector: + # Note: Replace hostname or remove selector altogether + kubernetes.io/hostname: dawntest-dawn-ztt9k-4zrm4 + containers: + - name: test-hostdev-pod + #image: ghcr.io/stackhpc/kube-perftest-mpi-benchmarks:19e96a8 + image: mellanox/rping-test + imagePullPolicy: IfNotPresent + command: [ "/bin/bash", "-c", "--" ] + args: [ "while true; do sleep 300; done;" ] + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + requests: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" + limits: + nvidia.com/hostdev: "2" + gpu.intel.com/i915: "2" diff --git a/apps/rdmatest/hostdevice-network.yaml b/apps/rdmatest/hostdevice-network.yaml new file mode 100644 index 0000000..85215fb --- /dev/null +++ b/apps/rdmatest/hostdevice-network.yaml @@ -0,0 +1,22 @@ +apiVersion: mellanox.com/v1alpha1 +kind: HostDeviceNetwork +metadata: + name: example-hostdevice-network +spec: + networkNamespace: "default" + resourceName: "hostdev" + ipam: | + { + "type": "whereabouts", + "datastore": "kubernetes", + "kubernetes": { + "kubeconfig": "/etc/cni/net.d/whereabouts.d/whereabouts.kubeconfig" + }, + "range": "192.168.42.0/24", + "exclude": [ + "192.168.42.0/32", + "192.168.42.255/32" + ], + "log_file" : "/var/log/whereabouts.log", + "log_level" : "info" + } \ No newline at end of file diff --git a/apps/rdmatest/kustomization.yaml b/apps/rdmatest/kustomization.yaml new file mode 100644 index 0000000..028a80d --- /dev/null +++ 
b/apps/rdmatest/kustomization.yaml @@ -0,0 +1,7 @@ +namespace: rdmatest + +resources: + - namespace.yaml + - hostdevice-network.yaml + - hostdevice-network-pod1.yaml + - hostdevice-network-pod2.yaml diff --git a/apps/rdmatest/namespace.yaml b/apps/rdmatest/namespace.yaml new file mode 100644 index 0000000..a5f6b03 --- /dev/null +++ b/apps/rdmatest/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: rdmatest diff --git a/apps/sealed-secrets/helmchart.yaml b/apps/sealed-secrets/helmchart.yaml new file mode 100644 index 0000000..b3e3599 --- /dev/null +++ b/apps/sealed-secrets/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: sealed-secrets + namespace: sealed-secrets-system +spec: + chart: sealed-secrets + version: 2.16.1 + sourceRef: + kind: HelmRepository + name: sealed-secrets + interval: 1h diff --git a/apps/sealed-secrets/helmrelease.yaml b/apps/sealed-secrets/helmrelease.yaml new file mode 100644 index 0000000..b5283e7 --- /dev/null +++ b/apps/sealed-secrets/helmrelease.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: sealed-secrets + namespace: sealed-secrets-system +spec: + chartRef: + kind: HelmChart + name: sealed-secrets + releaseName: sealed-secrets + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: enabled + interval: 5m diff --git a/apps/sealed-secrets/helmrepository.yaml b/apps/sealed-secrets/helmrepository.yaml new file mode 100644 index 0000000..8434309 --- /dev/null +++ b/apps/sealed-secrets/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: sealed-secrets + namespace: sealed-secrets-system +spec: + url: https://bitnami-labs.github.io/sealed-secrets + interval: 1h diff --git a/apps/sealed-secrets/kustomization.yaml 
b/apps/sealed-secrets/kustomization.yaml new file mode 100644 index 0000000..c4286f8 --- /dev/null +++ b/apps/sealed-secrets/kustomization.yaml @@ -0,0 +1,5 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml diff --git a/apps/sealed-secrets/namespace.yaml b/apps/sealed-secrets/namespace.yaml new file mode 100644 index 0000000..46be057 --- /dev/null +++ b/apps/sealed-secrets/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: sealed-secrets-system diff --git a/apps/slinky-slurm-controlplane/defaults.yaml b/apps/slinky-slurm-controlplane/defaults.yaml new file mode 100644 index 0000000..5fd93b5 --- /dev/null +++ b/apps/slinky-slurm-controlplane/defaults.yaml @@ -0,0 +1,600 @@ +--- +# +# Debug configuration. +# @ignored +debug: + # + # -- (bool) + # Enables debug configuration. + enabled: false + # + # -- (bool) + # Allow a locally running operator to communicate with slurm cluster via port-forward. + # NOTE: use when running the operator in a local debugger. + localOperator: true + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Set the secrets for image pull. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Slurm JWT authentication. +jwt: + # + # JWT hs256 configurations. + hs256: + # + # -- (string) + # The existing secret to use otherwise one will be generated. + existingSecret: "" + +# +# Slurm configurations. 
+slurm: + # + # Slurm authentication configurations. + auth: + # + # -- (string) + # The existing secret to use otherwise one will be generated. + existingSecret: "" + # + # -- (string) + # Extra slurmdbd configuration lines to append to `slurmdbd.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurmdbd.conf.html + extraSlurmdbdConf: |- + CommitDelay=1 + # + # -- (string) + # Extra slurm configuration lines to append to `slurm.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurm.conf.html + extraSlurmConf: |- + SchedulerParameters=batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 + DefMemPerCPU=1 + # + # -- (map[string]string) + # Optional raw Slurm configuration files, as a map. + # The map key represents the config file by name; the map value represents config file contents as a string. + # Ref: https://slurm.schedmd.com/man_index.html#configuration_files + configFiles: {} + # acct_gather.conf: | + # # Ref: https://slurm.schedmd.com/acct_gather.conf.html + # burst_buffer.conf: | + # # Ref: https://slurm.schedmd.com/burst_buffer.conf.html + # gres.conf: | + # # Ref: https://slurm.schedmd.com/gres.conf.html + # helpers.conf: | + # # Ref: https://slurm.schedmd.com/helpers.conf.html + # job_container.conf: | + # # Ref: https://slurm.schedmd.com/job_container.conf.html + # mpi.conf: | + # # Ref: https://slurm.schedmd.com/mpi.conf.html + # oci.conf: | + # # Ref: https://slurm.schedmd.com/oci.conf.html + # plugstack.conf: | + # # Ref: https://slurm.schedmd.com/plugstack.conf.html + # topology.conf: | + # # Ref: https://slurm.schedmd.com/topology.conf.html + # + # -- (map[string]string) + # The Prolog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. 
+ # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + prologScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + # + # -- (map[string]string) + # The Epilog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + epilogScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + +# +# Slurm authcred (sackd) configurations. +authcred: + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/sackd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# Slurm controller (slurmctld) configurations. +controller: + # + # -- (bool) + # Enables the controller node. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmctld + # + # -- (string) + # Set the image tag to use. 
+ tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Define a persistent volume for the slurm controller to store its save-state. + # Used to recover from system failures or from pod upgrades. + persistence: + # + # -- (string) + # Name of an existing `PersistentVolumeClaim` to use instead of creating one from definition. + # NOTE: When not empty, the other persistence fields will be ignored. + existingClaim: "" + # + # -- (object) + # Create a `PersistentVolumeClaim` with these annotations. + annotations: {} + # + # -- (object) + # Create a `PersistentVolumeClaim` with these labels. + labels: {} + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage class. + storageClass: csi-manila + # + # -- (list) + # Create a `PersistentVolumeClaim` with these access modes. + accessModes: + - ReadWriteOnce + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage size. + size: 4Gi + # + # -- (object) + # Selector to match an existing `PersistentVolume`. + selector: {} + # matchLabels: + # app: foo + +# +# Slurm compute (slurmd) configurations. +compute: + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Default image for the nodeset pod (slurmd) + # Each nodeset may override this setting. 
+ image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmd + # + # -- (string) + # Set the image tag to use. + # @default -- The Release appVersion. + tag: 24.05-ubuntu-24.04 + # + # -- (list) + # Slurm NodeSets by object list. + nodesets: + # + # -- (string) + # Name of NodeSet. Must be unique. + - name: dawn + # + # -- (bool) + # Enables the NodeSet in Slurm. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). + replicas: 2 # TODO: set to max nodes in cluster + # + # -- (int) + # The minimum number of seconds for which a newly created NodeSet Pod should be ready + # without any of its containers crashing, for it to be considered available. + minReadySeconds: 0 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: "" + # + # -- (string) + # Set the image tag to use. + tag: "" + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "slinky-low-priority" + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + cpu: 16 + memory: 100Gi + "gpu.intel.com/i915": "4" + # + # -- (map) + # Selector which must match a node's labels for the pod to be scheduled on that node. + nodeSelector: + kubernetes.io/os: linux + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling.
+ affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app.kubernetes.io/component + operator: In + values: + - singleuser-server + topologyKey: "kubernetes.io/hostname" + # TODO - make this more easily configurable + namespaces: + - slurm + - jupyterhub + - testproject2 + + # + # -- (object) + # Set the update strategy configuration. + updateStrategy: + # + # -- (string) + # Set the update strategy type. + # Can be either: "RollingUpdate"; "OnDelete". + type: RollingUpdate + # + # -- (object) + # Define the rolling update policy. + # Only used when "updateStrategy.type=RollingUpdate". + rollingUpdate: + # + # -- (string) + # The maximum number of pods that can be unavailable during the update. + # Value can be an absolute number (ex: 5) or a percentage of desired + # pods (ex: 10%). Absolute number is calculated from percentage by + # rounding up. This can not be 0. Defaults to 1. + maxUnavailable: 20% + # + # -- (int) + # Partition indicates the number of NodeSet pods that should + # not be updated to the latest version. + partition: 0 + # + # -- (bool) + # Pause will halt rollingUpdate while this value is true. + paused: false + # + # -- (object) + # The policy used for PVCs created from the NodeSet VolumeClaimTemplates. + persistentVolumeClaimRetentionPolicy: + # + # -- (string) + # WhenDeleted specifies what happens to PVCs created from NodeSet + # VolumeClaimTemplates when the NodeSet is deleted. The default policy + # of `Retain` causes PVCs to not be affected by NodeSet deletion. The + # `Delete` policy causes those PVCs to be deleted. + whenDeleted: Retain + # + # -- (list) + # List of claims that pods are allowed to reference. + # The NodeSet controller is responsible for mapping network identities to + # claims in a way that maintains the identity of a pod.
+ volumeClaimTemplates: [] + # - metadata: + # name: data + # spec: + # storageClassName: standard + # mountPath: /mnt/data + # accessModes: + # - ReadWriteOnce + # resources: + # requests: + # storage: 1Gi + # + # -- (object) + # Partition describes the partition created specifically for this NodeSet to be added. + partition: + # + # -- (bool) + # Enables this NodeSet's partition line to be added in Slurm. + enabled: true + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + MaxTime=INFINITE + # + # -- (string) + # Set Slurm node GRES. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1 + nodeGres: "" + # + # -- (list) + # Set Slurm node Features as a list(string). + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Features + nodeFeatures: [] + # + # -- (string) + # Set Slurm node weight for Slurm scheduling. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Weight + nodeWeight: 1 + # + # -- (list) + # Slurm Partitions by object list. + partitions: + # + # -- (string) + # Name of Partition. Must be unique. + - name: all + # + # -- (bool) + # Enables the partition in Slurm. + enabled: true + # + # -- (list) + # NodeSets to put into this Partition by name/key. + # NOTE: 'ALL' is a Slurm meta value to mean all nodes in the system. + nodesets: + - ALL + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + Default=YES + MaxTime=INFINITE + +# +# Slurm accounting (slurmdbd) configurations. +accounting: + # + # -- (bool) + # Enables accounting services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. 
+ image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmdbd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Configuration for an external accounting instance (slurmdbd). + external: + # + # -- (bool) + # Use an external accounting instance (slurmdbd) instead of deploying one. + enabled: false + # + # -- (string) + # The external accounting instance (slurmdbd) host. + host: "" + # + # -- (integer) + # The external accounting instance (slurmdbd) port. + port: 6819 + +# +# `bitnami/mariadb` subchart configurations.
+# Ref: https://github.com/bitnami/charts/blob/main/bitnami/mariadb/values.yaml +mariadb: + enabled: true + auth: + username: slurm + database: slurm_acct_db + existingSecret: "slurm-mariadb-passwords" + initdbScripts: + # NOTE: https://slurm.schedmd.com/accounting.html#slurm-accounting-configuration-before-build + slurm-accounting.sql: |- + SET GLOBAL innodb_buffer_pool_size=(4 * 1024 * 1024 * 1024); + SET GLOBAL innodb_log_file_size=(64 * 1024 * 1024); + SET GLOBAL innodb_lock_wait_timeout=900; + SET GLOBAL max_allowed_packet=(16 * 1024 * 1024); + primary: + persistence: + enabled: false + existingClaim: "" + storageClass: csi-manila + labels: {} + annotations: {} + accessModes: + - ReadWriteOnce + size: 8Gi + selector: {} + priorityClassName: "" + metrics: + enabled: true + serviceMonitor: + enabled: true + affinity: {} + resources: {} + +# +# Slurm REST API (slurmrestd) configurations. +restapi: + # + # -- (bool) + # Enables restapi services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmrestd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. 
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# `slurm-exporter` subchart configurations. +# Ref: https://github.com/SlinkyProject/slurm-exporter/-/blob/main/helm/slurm-exporter/values.yaml +slurm-exporter: + exporter: + enabled: true + secretName: "slurm-token-exporter" diff --git a/apps/slinky-slurm-controlplane/helmrelease.yaml b/apps/slinky-slurm-controlplane/helmrelease.yaml new file mode 100644 index 0000000..9367025 --- /dev/null +++ b/apps/slinky-slurm-controlplane/helmrelease.yaml @@ -0,0 +1,22 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: slurm +spec: + chartRef: + kind: OCIRepository + name: slinky-slurm + valuesFrom: + - kind: ConfigMap + name: slinky-slurm-defaults + valuesKey: values.yaml + dependsOn: + - name: slurm-operator + namespace: slinky + install: + createNamespace: true + driftDetection: + mode: disabled + interval: 40m + timeout: 30m diff --git a/apps/slinky-slurm-controlplane/kustomization.yaml b/apps/slinky-slurm-controlplane/kustomization.yaml new file mode 100644 index 0000000..eb1698f --- /dev/null +++ b/apps/slinky-slurm-controlplane/kustomization.yaml @@ -0,0 +1,17 @@ +--- +namespace: slurm + +configurations: + - kustomizeconfig.yaml + +configMapGenerator: + - name: slinky-slurm-defaults + files: + - values.yaml=defaults.yaml + +resources: + - namespace.yaml + - ocirepository.yaml + - helmrelease.yaml + - priorityclass.yaml + - scaledobject.yaml diff --git a/apps/slinky-slurm-controlplane/kustomizeconfig.yaml b/apps/slinky-slurm-controlplane/kustomizeconfig.yaml new file mode 100644 index 0000000..26387e1 --- /dev/null +++ b/apps/slinky-slurm-controlplane/kustomizeconfig.yaml @@ -0,0 +1,7 @@ +# Make sure configmap references in HelmReleases are updated +nameReference: + - kind: ConfigMap + version: v1 + 
fieldSpecs: + - path: spec/valuesFrom/name + kind: HelmRelease diff --git a/apps/slinky-slurm-controlplane/namespace.yaml b/apps/slinky-slurm-controlplane/namespace.yaml new file mode 100644 index 0000000..62b754e --- /dev/null +++ b/apps/slinky-slurm-controlplane/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: slurm diff --git a/apps/slinky-slurm-controlplane/ocirepository.yaml b/apps/slinky-slurm-controlplane/ocirepository.yaml new file mode 100644 index 0000000..8326499 --- /dev/null +++ b/apps/slinky-slurm-controlplane/ocirepository.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: slinky-slurm +spec: + interval: 10m + url: oci://ghcr.io/slinkyproject/charts/slurm + ref: + semver: "0.1.0" diff --git a/apps/slinky-slurm-controlplane/priorityclass.yaml b/apps/slinky-slurm-controlplane/priorityclass.yaml new file mode 100644 index 0000000..ee1ae7c --- /dev/null +++ b/apps/slinky-slurm-controlplane/priorityclass.yaml @@ -0,0 +1,6 @@ +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: slinky-low-priority +value: -1 +globalDefault: false diff --git a/apps/slinky-slurm-controlplane/scaledobject.yaml b/apps/slinky-slurm-controlplane/scaledobject.yaml new file mode 100644 index 0000000..fe3884d --- /dev/null +++ b/apps/slinky-slurm-controlplane/scaledobject.yaml @@ -0,0 +1,20 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: dawn-radar +spec: + scaleTargetRef: + apiVersion: slinky.slurm.net/v1alpha1 + kind: NodeSet + name: slurm-compute-dawn + idleReplicaCount: 0 + minReplicaCount: 0 + maxReplicaCount: 2 + cooldownPeriod: 600 #TODO: set to partition max job time + triggers: + - type: prometheus + metricType: Value + metadata: + serverAddress: http://prometheus-kube-prometheus-prometheus.prometheus:9090 + query: slurm_partition_pending_jobs{partition="dawn"} + threshold: '1' diff --git 
a/apps/slinky-slurm-operator/defaults.yaml b/apps/slinky-slurm-operator/defaults.yaml new file mode 100644 index 0000000..844a765 --- /dev/null +++ b/apps/slinky-slurm-operator/defaults.yaml @@ -0,0 +1,161 @@ +--- + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Sets the image pull secrets. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# Image configurations. +image: + # + # -- (string) + # Sets the image repository to use. + repository: ghcr.io/slinkyproject/slurm-operator + # + # -- (string) + # Sets the image tag to use. + # @default -- The Release appVersion. + tag: "0.1.0" + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Operator configurations. +operator: + # + # -- (bool) + # Enables the operator. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. 
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (integer) + # Set the max concurrent workers for the Cluster controller. + clusterWorkers: 1 + # + # -- (integer) + # Set the max concurrent workers for the NodeSet controller. + nodesetWorkers: 1 + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Webhook configurations. +webhook: + # + # -- (bool) + # Enables the webhook. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Cert-Manager certificate configurations. +certManager: + # + # -- (bool) + # Enables cert-manager for certificate management. + enabled: true + # + # -- (string) + # The secret to be (created and) mounted. + secretName: slurm-operator-webhook-ca + # + # -- (string) + # Duration of certificate life. + duration: 43800h0m0s # 5 year + # + # -- (string) + # Certificate renewal time. 
Should be before the expiration. + renewBefore: 8760h0m0s # 1 year diff --git a/apps/slinky-slurm-operator/helmrelease.yaml b/apps/slinky-slurm-operator/helmrelease.yaml new file mode 100644 index 0000000..b85e5b5 --- /dev/null +++ b/apps/slinky-slurm-operator/helmrelease.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: slurm-operator +spec: + chartRef: + kind: OCIRepository + name: slinky-slurm-operator + valuesFrom: + - kind: ConfigMap + name: slinky-slurm-operator-defaults + valuesKey: values.yaml + install: + createNamespace: true + remediation: + retries: -1 + upgrade: + remediation: + retries: -1 + driftDetection: + mode: disabled + interval: 5m diff --git a/apps/slinky-slurm-operator/kustomization.yaml b/apps/slinky-slurm-operator/kustomization.yaml new file mode 100644 index 0000000..1b8cb0c --- /dev/null +++ b/apps/slinky-slurm-operator/kustomization.yaml @@ -0,0 +1,15 @@ +--- +namespace: slinky + +configurations: + - kustomizeconfig.yaml + +configMapGenerator: + - name: slinky-slurm-operator-defaults + files: + - values.yaml=defaults.yaml + +resources: + - namespace.yaml + - ocirepository.yaml + - helmrelease.yaml diff --git a/apps/slinky-slurm-operator/kustomizeconfig.yaml b/apps/slinky-slurm-operator/kustomizeconfig.yaml new file mode 100644 index 0000000..26387e1 --- /dev/null +++ b/apps/slinky-slurm-operator/kustomizeconfig.yaml @@ -0,0 +1,7 @@ +# Make sure configmap references in HelmReleases are updated +nameReference: + - kind: ConfigMap + version: v1 + fieldSpecs: + - path: spec/valuesFrom/name + kind: HelmRelease diff --git a/apps/slinky-slurm-operator/namespace.yaml b/apps/slinky-slurm-operator/namespace.yaml new file mode 100644 index 0000000..a7b1ef8 --- /dev/null +++ b/apps/slinky-slurm-operator/namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: slinky diff --git a/apps/slinky-slurm-operator/ocirepository.yaml 
b/apps/slinky-slurm-operator/ocirepository.yaml new file mode 100644 index 0000000..d8ae9ab --- /dev/null +++ b/apps/slinky-slurm-operator/ocirepository.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1beta2 +kind: OCIRepository +metadata: + name: slinky-slurm-operator +spec: + interval: 10m + url: oci://ghcr.io/slinkyproject/charts/slurm-operator + ref: + semver: "0.1.0" diff --git a/apps/slinky/README.md b/apps/slinky/README.md new file mode 100644 index 0000000..58ea7d0 --- /dev/null +++ b/apps/slinky/README.md @@ -0,0 +1,30 @@ +# Slinky setup + +Based on: +https://github.com/SlinkyProject/slurm-operator/blob/main/docs/user/quickstart.md + +# Testing Slinky Slurm + +To test Slurm functionality, connect to the controller to use Slurm client +commands: + +```sh +kubectl -n slurm exec -it statefulsets/slurm-controller -- bash --login +``` + +On the controller pod (e.g. host `slurm@slurm-controller-0`), run the following +commands to quickly test Slurm is functioning: + +```sh +sinfo +srun hostname +sbatch --wrap="sleep 60" +squeue +``` + +See [Slurm Commands][slurm-commands] for more details on how to interact with +Slurm. 
+ + + +[slurm-commands]: https://slurm.schedmd.com/quickstart.html#commands \ No newline at end of file diff --git a/apps/slinky/kustomization.yaml b/apps/slinky/kustomization.yaml new file mode 100644 index 0000000..9d49efe --- /dev/null +++ b/apps/slinky/kustomization.yaml @@ -0,0 +1,7 @@ +--- +resources: + # - ../cert-manager/ + - ../keda/ + # todo - need above to install before below + - ../slinky-slurm-operator/ + - ../slinky-slurm-controlplane/ diff --git a/apps/spegel/helmrelease.yaml b/apps/spegel/helmrelease.yaml new file mode 100644 index 0000000..e859a31 --- /dev/null +++ b/apps/spegel/helmrelease.yaml @@ -0,0 +1,15 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: spegel + namespace: spegel +spec: + interval: 1m + chart: + spec: + chart: spegel + version: "v0.0.23" + interval: 5m + sourceRef: + kind: HelmRepository + name: spegel diff --git a/apps/spegel/helmrepository.yaml b/apps/spegel/helmrepository.yaml new file mode 100644 index 0000000..62b88ac --- /dev/null +++ b/apps/spegel/helmrepository.yaml @@ -0,0 +1,9 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: spegel + namespace: spegel +spec: + type: "oci" + interval: 5m0s + url: oci://ghcr.io/spegel-org/helm-charts diff --git a/apps/spegel/kustomization.yaml b/apps/spegel/kustomization.yaml new file mode 100644 index 0000000..157da08 --- /dev/null +++ b/apps/spegel/kustomization.yaml @@ -0,0 +1,4 @@ +resources: + - namespace.yaml + - helmrepository.yaml + - helmrelease.yaml diff --git a/apps/spegel/namespace.yaml b/apps/spegel/namespace.yaml new file mode 100644 index 0000000..2b70b05 --- /dev/null +++ b/apps/spegel/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: spegel diff --git a/apps/tetragon/configmap.yaml b/apps/tetragon/configmap.yaml new file mode 100644 index 0000000..220251d --- /dev/null +++ b/apps/tetragon/configmap.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + 
name: tetragon +data: + values.yaml: | + tetragon: + prometheus: + serviceMonitor: + enabled: true + tetragonOperator: + prometheus: + serviceMonitor: + enabled: true \ No newline at end of file diff --git a/apps/tetragon/file_monitoring.yaml b/apps/tetragon/file_monitoring.yaml new file mode 100644 index 0000000..1374936 --- /dev/null +++ b/apps/tetragon/file_monitoring.yaml @@ -0,0 +1,176 @@ +apiVersion: cilium.io/v1alpha1 +kind: TracingPolicy +metadata: + name: "file-monitoring-filtered" +spec: + kprobes: + - call: "security_file_permission" + syscall: false + return: true + args: + - index: 0 + type: "file" # (struct file *) used for getting the path + - index: 1 + type: "int" # 0x04 is MAY_READ, 0x02 is MAY_WRITE + returnArg: + index: 0 + type: "int" + returnArgAction: "Post" + selectors: + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/boot" # Reads to sensitive directories + - "/root/.ssh" # Reads to sensitive files we want to know about + - "/etc/shadow" + - "/etc/profile" + - "/etc/sudoers" + - "/etc/pam.conf" # Reads global shell configs bash/csh supported + - "/etc/bashrc" + - "/etc/csh.cshrc" + - "/etc/csh.login" # Add additional sensitive files here + - index: 1 + operator: "Equal" + values: + - "4" # MAY_READ + - matchArgs: + - index: 0 + operator: "Postfix" + values: + - ".bashrc" # Reads to shell config files bash, csh supported + - ".bash_profile" # add any other shell support here. 
+ - ".bash_login" + - ".bash_logout" + - ".cshrc" + - ".cshdirs" + - ".profile" # Reads to common environments files + - ".login" + - ".logout" + - ".history" # Add additional sensitive files here + - index: 1 + operator: "Equal" + values: + - "4" # MAY_READ + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/etc" # Writes to sensitive directories + - "/boot" + - "/lib" + - "/lib64" + - "/bin" + - "/usr/lib" + - "/usr/local/lib" + - "/usr/local/sbin" + - "/usr/local/bin" + - "/usr/bin" + - "/usr/sbin" + - "/var/log" # Writes to logs + - "/dev/log" + - "/root/.ssh" # Writes to sensitive files add here. + - index: 1 + operator: "Equal" + values: + - "2" # MAY_WRITE + - call: "security_mmap_file" + syscall: false + return: true + args: + - index: 0 + type: "file" # (struct file *) used for getting the path + - index: 1 + type: "uint32" # the prot flags PROT_READ(0x01), PROT_WRITE(0x02), PROT_EXEC(0x04) + - index: 2 + type: "uint32" # the mmap flags (i.e. MAP_SHARED, ...) + returnArg: + index: 0 + type: "int" + returnArgAction: "Post" + selectors: + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/boot" # Reads to sensitive directories + - "/root/.ssh" # Reads to sensitive files we want to know about + - "/etc/shadow" + - "/etc/sudoers" + - "/etc/pam.conf" # Reads global shell configs bash/csh supported + - "/etc/profile" + - "/etc/bashrc" + - "/etc/csh.cshrc" + - "/etc/csh.login" + - ".bashrc" # Reads to shell config files bash, csh supported + - ".bash_profile" # add any other shell support here. 
+ - ".bash_login" + - ".bash_logout" + - ".cshrc" + - ".cshdirs" + - ".profile" # Reads to common environments files + - ".login" + - ".logout" + - ".history" # Add additional sensitive mmap files here + - index: 1 + operator: "Equal" + values: + - "1" # PROT_READ + - index: 2 + operator: "Mask" + values: + - "1" # MAP_SHARED + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/etc" # Writes to sensitive directories + - "/boot" + - "/lib" + - "/lib64" + - "/bin" + - "/usr/lib" + - "/usr/local/lib" + - "/usr/local/sbin" + - "/usr/local/bin" + - "/usr/bin" + - "/usr/sbin" + - "/var/log" # Writes to logs + - "/dev/log" + - "/root/.ssh" # Writes to sensitive files add here. + - index: 1 + operator: "Mask" + values: + - "2" # PROT_WRITE + - index: 2 + operator: "Mask" + values: + - "1" # MAP_SHARED + - call: "security_path_truncate" + syscall: false + return: true + args: + - index: 0 + type: "path" # (struct path *) used for getting the path + returnArg: + index: 0 + type: "int" + returnArgAction: "Post" + selectors: + - matchArgs: + - index: 0 + operator: "Prefix" + values: + - "/etc" # Truncate to sensitive directories + - "/boot" + - "/lib" + - "/lib64" + - "/usr/lib" + - "/usr/local/lib" + - "/usr/local/sbin" + - "/usr/local/bin" + - "/usr/bin" + - "/usr/sbin" + - "/var/log" # Truncate to logs + - "/dev/log" + - "/root/.ssh" # Truncate to sensitive files add here.
\ No newline at end of file diff --git a/apps/tetragon/helmchart.yaml b/apps/tetragon/helmchart.yaml new file mode 100644 index 0000000..550608f --- /dev/null +++ b/apps/tetragon/helmchart.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: tetragon +spec: + chart: tetragon + version: "v1.3.0" + sourceRef: + kind: HelmRepository + name: cilium + interval: 10m0s diff --git a/apps/tetragon/kustomization.yaml b/apps/tetragon/kustomization.yaml new file mode 100644 index 0000000..ec759cb --- /dev/null +++ b/apps/tetragon/kustomization.yaml @@ -0,0 +1,12 @@ +--- +namespace: tetragon + +resources: + - namespace.yaml + - helmrepository.yaml + - helmchart.yaml + - helmrelease.yaml + - configmap.yaml + # TODO - these need to depend on helm chart install + - file_monitoring.yaml + - network_egress_cluster.yaml diff --git a/apps/tetragon/network_egress_cluster.yaml b/apps/tetragon/network_egress_cluster.yaml new file mode 100644 index 0000000..acd95d0 --- /dev/null +++ b/apps/tetragon/network_egress_cluster.yaml @@ -0,0 +1,19 @@ +apiVersion: cilium.io/v1alpha1 +kind: TracingPolicy +metadata: + name: "monitor-network-activity-outside-cluster-cidr-range" +spec: + kprobes: + - call: "tcp_connect" + syscall: false + args: + - index: 0 + type: "sock" + selectors: + - matchArgs: + - index: 0 + operator: "NotDAddr" + values: + - 127.0.0.1 + - 172.16.0.0/13 # pods + - 172.24.0.0/13 # services \ No newline at end of file diff --git a/apps/waldur/configmap.yaml b/apps/waldur/configmap.yaml new file mode 100644 index 0000000..dde1398 --- /dev/null +++ b/apps/waldur/configmap.yaml @@ -0,0 +1,48 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: waldur-config + namespace: waldur +data: + values.yaml: | + + postgresql: + enabled: true + rabbitmq: + enabled: true + + ### Example config for exposing the Waldur UI via sslip.io + + # NOTE: If using sslip.io, the host IP must match the FIP + # which was assigned to the NGINX ingress
controller's + # load balancer. An alternative host name may be used if + # an appropriate DNS entry for the load balancer's IP + # is added to your upstream DNS servers. It is not possible + # to create / manage such a DNS entry via Kubernetes. + + # apiHostname: 128-232-226-47.sslip.io + # apiScheme: https + # homeportHostname: 128-232-226-47.sslip.io + # homeportScheme: https + # ingress: + # tls: + # enabled: true + # source: letsEncrypt + + ### Example config for Keycloak integration + + # waldur: + # authMethods: + # - LOCAL_SIGNIN + # - SOCIAL_SIGNUP + # socialAuthMethods: + # - label: Keycloak + # provider: keycloak + # # clientId: + # # clientSecret: + # # discoveryUrl: + # managementUrl: "" + # protectedFields: + # - full_name + # - email diff --git a/apps/waldur/example-secret.yaml b/apps/waldur/example-secret.yaml new file mode 100644 index 0000000..82a69a8 --- /dev/null +++ b/apps/waldur/example-secret.yaml @@ -0,0 +1,13 @@ +--- +############ +# IMPORTANT: Make sure you run kubeseal against any secret before committing it to git!
+############ +apiVersion: v1 +kind: Secret +metadata: + name: waldur-keycloak-config + namespace: waldur +stringData: + keycloakClientId: + keycloakClientSecret: + keycloakDiscoveryUrl: /.well-known/openid-configuration diff --git a/apps/waldur/helmchart.yaml b/apps/waldur/helmchart.yaml new file mode 100644 index 0000000..f57a126 --- /dev/null +++ b/apps/waldur/helmchart.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmChart +metadata: + name: waldur + namespace: waldur +spec: + chart: waldur + version: 6.7.3 + sourceRef: + kind: HelmRepository + name: waldur-charts + interval: 1h diff --git a/apps/waldur/helmrelease.yaml b/apps/waldur/helmrelease.yaml new file mode 100644 index 0000000..ec2f3bf --- /dev/null +++ b/apps/waldur/helmrelease.yaml @@ -0,0 +1,40 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: waldur + namespace: waldur +spec: + chartRef: + kind: HelmChart + name: waldur + releaseName: waldur + valuesFrom: + - kind: ConfigMap + name: waldur-config + # - kind: Secret + # name: waldur-keycloak-config + # valuesKey: keycloakClientId + # targetPath: waldur.socialAuthMethods[0].clientId + # - kind: Secret + # name: waldur-keycloak-config + # valuesKey: keycloakClientSecret + # targetPath: waldur.socialAuthMethods[0].clientSecret + # - kind: Secret + # name: waldur-keycloak-config + # valuesKey: keycloakDiscoveryUrl + # targetPath: waldur.socialAuthMethods[0].discoveryUrl + + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + driftDetection: + mode: enabled + interval: 5m + # Waldur DB migrations during install/upgrade + # can take a while so increase install timeout + timeout: 60m diff --git a/apps/waldur/helmrepository.yaml b/apps/waldur/helmrepository.yaml new file mode 100644 index 0000000..413a05d --- /dev/null +++ b/apps/waldur/helmrepository.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository 
+metadata:
+  name: waldur-charts
+  namespace: waldur
+spec:
+  url: https://waldur.github.io/waldur-helm
+  interval: 1h
diff --git a/apps/waldur/kustomization.yaml b/apps/waldur/kustomization.yaml
new file mode 100644
index 0000000..a62890a
--- /dev/null
+++ b/apps/waldur/kustomization.yaml
@@ -0,0 +1,10 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - namespace.yaml
+  - helmrepository.yaml
+  - helmchart.yaml
+  - helmrelease.yaml
+  - configmap.yaml
+  # - secret.yaml
diff --git a/apps/waldur/namespace.yaml b/apps/waldur/namespace.yaml
new file mode 100644
index 0000000..3457677
--- /dev/null
+++ b/apps/waldur/namespace.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: waldur
diff --git a/images/jupyterhub-intel-gpu/Dockerfile b/images/jupyterhub-intel-gpu/Dockerfile
new file mode 100644
index 0000000..f7a6482
--- /dev/null
+++ b/images/jupyterhub-intel-gpu/Dockerfile
@@ -0,0 +1,92 @@
+FROM quay.io/jupyter/pytorch-notebook:2025-01-28
+
+#####
+# Add Intel GPU components
+#####
+
+USER root
+
+ENV LANG=C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# With pipefail, a RUN step fails if any command in a pipeline fails
+# (e.g. a key download piped into gpg), not just the last one.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Common tooling; wget and gnupg are needed for the repository keys below.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    apt-utils \
+    build-essential \
+    ca-certificates \
+    clinfo \
+    curl \
+    git \
+    gnupg2 \
+    gpg-agent \
+    rsync \
+    sudo \
+    unzip \
+    wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Intel GPU driver repository: signing key, then the apt source entry.
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
+    gpg --dearmor --yes --output /usr/share/keyrings/intel-graphics.gpg
+# The package index is refreshed by the install step below, so it is not
+# refreshed here (that would only leave stale lists in this layer).
+RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy unified" | \
+    tee /etc/apt/sources.list.d/intel-gpu-jammy.list
+
+ARG ICD_VER=24.22.29735.27-914~22.04
+ARG LEVEL_ZERO_GPU_VER=1.3.29735.27-914~22.04
+ARG LEVEL_ZERO_VER=1.17.6-914~22.04
+ARG LEVEL_ZERO_DEV_VER=1.17.6-914~22.04
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    intel-opencl-icd=${ICD_VER} \
+    intel-level-zero-gpu=${LEVEL_ZERO_GPU_VER} \
+    libze1=${LEVEL_ZERO_VER} \
+    libze-dev=${LEVEL_ZERO_DEV_VER} && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Intel oneAPI repository: signing key and apt source entry.
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
+    | tee /etc/apt/sources.list.d/oneAPI.list
+
+ARG DPCPP_VER=2024.2.1-1079
+ARG MKL_VER=2024.2.1-103
+ARG CCL_VER=2021.13.1-31
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    intel-oneapi-runtime-dpcpp-cpp=${DPCPP_VER} \
+    intel-oneapi-runtime-mkl=${MKL_VER} \
+    intel-oneapi-runtime-ccl=${CCL_VER} && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# NOTE(review): GID 110 presumably has to match the group owning the GPU
+# render device nodes on the hosts - confirm.
+RUN groupadd -g 110 render
+
+#####
+# User-level python components
+#####
+
+USER ${NB_USER}
+
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip3 install --no-cache-dir -U torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/test/xpu
+
+# COPY creates root-owned files regardless of USER; --chown lets the notebook
+# user modify and save the example. NB_UID/NB_GID are expected to be set by
+# the base image - confirm.
+COPY --chown=${NB_UID}:${NB_GID} examples.ipynb .
diff --git a/images/jupyterhub-intel-gpu/examples.ipynb b/images/jupyterhub-intel-gpu/examples.ipynb
new file mode 100644
index 0000000..a8e5f0e
--- /dev/null
+++ b/images/jupyterhub-intel-gpu/examples.ipynb
@@ -0,0 +1,84 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f446447-6d6b-41d9-9cc6-90e86703d0fc",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# See: https://pytorch.org/docs/stable/notes/get_start_xpu.html\n",
+    "import torch\n",
+    "torch.xpu.is_available() # torch.xpu is the API for Intel GPU support"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "80cb2dab-ac88-4486-93ab-8777ceb70f0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!clinfo -l"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a2d7573-f0df-4ed1-906d-099e5146660c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.xpu.device_count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc672f30-01a2-403b-b90d-3669e8409c6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inference with FP32\n",
+    "\n",
+    "import torch\n",
+    "import torchvision.models as models\n",
+    "\n",
+    "model = models.resnet50(weights=\"ResNet50_Weights.DEFAULT\")\n",
+    "model.eval()\n",
+    "data = torch.rand(1, 3, 224, 224)\n",
+    "\n",
+    "model = model.to(\"xpu\")\n",
+    "data = data.to(\"xpu\")\n",
+    "\n",
+    "with torch.no_grad():\n",
+    " model(data)\n",
+    "\n",
+    "print(\"Execution finished\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/kustomization.yaml b/kustomization.yaml
new file mode 100644
index 0000000..0dcb967
--- /dev/null
+++ b/kustomization.yaml
@@ -0,0 +1,8 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - apps/spegel
+  - apps/jupyterhub
+  # NOTE(review): apps/cert-manager and apps/waldur are added by this change
+  # but are not listed here - confirm that leaving them out is intentional.