diff --git a/charts/Chart.yaml b/charts/Chart.yaml
new file mode 100644
index 000000000..31184ad36
--- /dev/null
+++ b/charts/Chart.yaml
@@ -0,0 +1,34 @@
+apiVersion: v2
+name: codabench-chart
+description: A Helm chart for deploying Codabench on Kubernetes
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "1.16.0"
+
+dependencies:
+  - name: rabbitmq
+    version: "14.7.0"
+    repository: "oci://registry.cern.ch/kubeflow/charts"
+    condition: rabbitmq.enabled
+  - name: redis
+    version: "19.5.4"
+    repository: "oci://registry.cern.ch/kubeflow/charts"
+    condition: redis.enabled
diff --git a/charts/dockerfiles/Dockerfile b/charts/dockerfiles/Dockerfile
new file mode 100644
index 000000000..b19060f94
--- /dev/null
+++ b/charts/dockerfiles/Dockerfile
@@ -0,0 +1,43 @@
+# Stage 1: Node.js builder
+FROM node:10 AS builder
+
+# Set the working directory for the build
+WORKDIR /app
+
+# Install packages (package.json is a plain local file, so COPY rather than ADD)
+COPY package.json .
+RUN npm install
+
+# Copy all files and build
+COPY . .
+RUN export PATH=./node_modules/.bin:$PATH && npm run build-stylus && npm run build-riot && npm run concat-riot
+
+# Stage 2: Python/Django (identical to main Dockerfile)
+FROM python:3.9.20
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y gcc build-essential && rm -rf /var/lib/apt/lists/*
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1
+ENV PATH=$PATH:/root/.local/bin
+
+# Install Poetry
+RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.8.3
+RUN poetry config virtualenvs.create false
+RUN poetry config virtualenvs.in-project false
+
+# Set work directory before copying files
+WORKDIR /app
+
+# Copy only dependency descriptors first (for caching)
+COPY pyproject.toml poetry.lock ./
+RUN poetry install
+
+# Copy the rest of the application code
+COPY . /app
+
+# Copy built files from builder stage
+COPY --from=builder /app /app
+
+RUN ./manage.py collectstatic --noinput
diff --git a/charts/dockerfiles/Dockerfile.compute_worker b/charts/dockerfiles/Dockerfile.compute_worker
new file mode 100644
index 000000000..83568ea65
--- /dev/null
+++ b/charts/dockerfiles/Dockerfile.compute_worker
@@ -0,0 +1,37 @@
+FROM --platform=linux/amd64 python:3.9
+
+# Use bash with pipefail so a failed download in a `curl ... | sh` pipeline fails the build
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# This makes output not buffer and return immediately, nice for seeing results in stdout
+ENV PYTHONUNBUFFERED=1
+
+# Install Docker
+RUN apt-get update && curl -fsSL https://get.docker.com | sh
+
+# Install kubectl
+RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && \
+    install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \
+    rm kubectl
+
+RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.8.3
+# Poetry location so future commands (below) work
+ENV PATH=$PATH:/root/.local/bin
+# Want poetry to use system python of docker container
+RUN poetry config virtualenvs.create false
+RUN poetry config virtualenvs.in-project false
+# No WORKDIR has been set yet, so this directory is created relative to the base image's working directory
+RUN mkdir codabench
+WORKDIR /app/
+
+# Copy only the dependency descriptors first so the install layers below stay cached
+# until pyproject.toml or poetry.lock change
+COPY ./compute_worker/pyproject.toml ./compute_worker/poetry.lock ./
+RUN poetry install
+RUN pip install --no-cache-dir redis kubernetes
+
+# Copy the worker sources last; COPY (not ADD) because these are plain local files
+COPY ./compute_worker/ ./
+
+# Exec form: celery is started directly instead of through `/bin/sh -c`
+CMD ["celery", "-A", "compute_worker", "worker", "-l", "info", "-Q", "compute-worker", "-n", "compute-worker@%n", "--concurrency=1"]
diff --git a/charts/dockerfiles/Dockerfile.flower b/charts/dockerfiles/Dockerfile.flower
new file mode 100644
index 000000000..30d482a8e
--- /dev/null
+++ b/charts/dockerfiles/Dockerfile.flower
@@ -0,0 +1,31 @@
+FROM python:3.9
+
+# PYTHONUNBUFFERED: Force stdin, stdout and stderr to be totally unbuffered. (equivalent to `python -u`)
+# PYTHONHASHSEED: Enable hash randomization (equivalent to `python -R`)
+# PYTHONDONTWRITEBYTECODE: Do not write byte files to disk, since we maintain it as readonly. (equivalent to `python -B`)
+ENV PYTHONUNBUFFERED=1 PYTHONHASHSEED=random PYTHONDONTWRITEBYTECODE=1
+
+# Get latest root certificates (apt lists are removed in the same layer to keep it small)
+RUN apt-get update && apt-get install -y ca-certificates && update-ca-certificates && rm -rf /var/lib/apt/lists/*
+
+# Install Poetry, used below to install the required packages
+RUN curl -sSL https://install.python-poetry.org | python3 - --version 1.8.3
+# Poetry location so future commands (below) work
+ENV PATH=$PATH:/root/.local/bin
+# Want poetry to use system python of docker container
+RUN poetry config virtualenvs.create false
+RUN poetry config virtualenvs.in-project false
+
+RUN poetry init --no-interaction
+
+RUN poetry add redis=3.0.1
+RUN poetry add flower=0.9.3
+RUN poetry add celery="<5.0.0"
+
+# Default port
+EXPOSE 5555
+
+# Run as a non-root user by default, run as user with least privileges.
+USER nobody
+
+ENTRYPOINT ["flower"]
diff --git a/charts/templates/app-state-pvc.yaml b/charts/templates/app-state-pvc.yaml
new file mode 100644
index 000000000..8e8fa52b8
--- /dev/null
+++ b/charts/templates/app-state-pvc.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Values.appState.pvcName }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: {{ .Values.appState.storage }}
+  storageClassName: {{ .Values.appState.storageClass }}
diff --git a/charts/templates/compute-worker-rbac.yaml b/charts/templates/compute-worker-rbac.yaml
new file mode 100644
index 000000000..519db211e
--- /dev/null
+++ b/charts/templates/compute-worker-rbac.yaml
@@ -0,0 +1,35 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: compute-worker-sa
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: compute-worker-role
+  namespace: {{ .Release.Namespace }}
+rules:
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["create", "get", "list", "watch", "delete"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "list", "watch", "delete"]
+  - apiGroups: [""]
+    resources: ["pods/exec", "pods/log"]
+    verbs: ["create", "get", "list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: compute-worker-bind
+  namespace: {{ .Release.Namespace }}
+subjects:
+  - kind: ServiceAccount
+    name: compute-worker-sa
+    namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: Role
+  name: compute-worker-role
+  apiGroup: rbac.authorization.k8s.io
diff --git a/charts/templates/compute_worker-deployment.yaml b/charts/templates/compute_worker-deployment.yaml
new file mode 100644
index 000000000..6b073ca04
--- /dev/null
+++ b/charts/templates/compute_worker-deployment.yaml
@@ -0,0 +1,78 @@
+{{- /* Renders one Deployment per entry in compute_worker.brokers; the "default" broker keeps the bare "compute-worker" name. */ -}}
+{{- range .Values.compute_worker.brokers }}
+{{- $isDefault := eq .name "default" }}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: compute-worker{{ if not $isDefault }}-{{ .name }}{{ end }}
+  labels:
+    app: compute-worker
+    broker: {{ .name | quote }}
+spec:
+  replicas: 1
+  # The per-broker label keeps the selectors of the generated Deployments from overlapping
+  selector:
+    matchLabels:
+      app: compute-worker
+      broker: {{ .name | quote }}
+  template:
+    metadata:
+      labels:
+        app: compute-worker
+        broker: {{ .name | quote }}
+    spec:
+      serviceAccountName: compute-worker-sa
+      containers:
+        - name: compute-worker
+          image: "{{ $.Values.compute_worker.image.repository }}:{{ $.Values.compute_worker.image.tag }}"
+          imagePullPolicy: {{ $.Values.compute_worker.image.pullPolicy | default "IfNotPresent" }}
+          command:
+            - bash
+            - -c
+            - >
+              watchmedo auto-restart -p '*.py' --recursive -- celery -A compute_worker worker -l info -Q compute-worker -n compute-worker{{ if not $isDefault }}-{{ .name }}{{ end }}@%n
+          workingDir: /app
+          env:
+            - name: USE_GPU
+              value: '{{ .gpu.enabled }}'
+            - name: RESOURCE_LIMITS
+              value: '{{ toJson .gpu.resourceLimits }}'
+            - name: NODE_SELECTOR
+              value: '{{ toJson .gpu.nodeSelector }}'
+            - name: NUMBER_OF_POD_CREATION_RETRIES
+              value: '{{ $.Values.compute_worker.podCreationRetries.numberOfRetries }}'
+            - name: SLEEP_TIME_BETWEEN_RETRIES
+              value: '{{ $.Values.compute_worker.podCreationRetries.sleepTimeBetweenRetries }}'
+            - name: USERID
+              value: '{{ $.Values.compute_worker.submissionPods.securityContext.runAsUser }}'
+            - name: GROUPID
+              value: '{{ $.Values.compute_worker.submissionPods.securityContext.runAsGroup }}'
+            - name: FSGROUP
+              value: '{{ $.Values.compute_worker.submissionPods.securityContext.fsGroup }}'
+            - name: COMPUTE_WORKER_LABELS
+              value: '{{ toJson ($.Values.compute_worker.submissionPods.metadata).labels }}'
+            - name: BROKER_URL
+              value: "{{ if .url }}{{ .url }}{{ else }}pyamqp://{{ $.Values.env.RABBITMQ_DEFAULT_USER }}:{{ $.Values.env.RABBITMQ_DEFAULT_PASS }}@{{ $.Values.env.RABBITMQ_HOST }}:{{ $.Values.env.RABBITMQ_PORT }}//{{ end }}"
+            - name: CODALAB_IGNORE_CLEANUP_STEP
+              value: "1"
+            {{- range $key, $value := $.Values.env }}
+            - name: {{ $key }}
+              value: "{{ $value }}"
+            {{- end }}
+          resources:
+            {{- toYaml $.Values.compute_worker.resources | nindent 12 }}
+          volumeMounts:
+            - name: docker-socket
+              mountPath: /var/run/docker.sock
+            - name: codabench-storage
+              mountPath: /codabench
+      volumes:
+        - name: docker-socket
+          hostPath:
+            path: /var/run/docker.sock
+            type: Socket
+        - name: codabench-storage
+          persistentVolumeClaim:
+            claimName: {{ $.Values.compute_worker.volumes.pvcName }}
+{{- end }}
diff --git a/charts/templates/django-deployment.yaml b/charts/templates/django-deployment.yaml
new file mode 100644
index 000000000..ee44ca7b8
--- /dev/null
+++ b/charts/templates/django-deployment.yaml
@@ -0,0 +1,40 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: django
+spec:
+  replicas: {{ .Values.django.replicas }}
+  selector:
+    matchLabels:
+      app: django
+  template:
+    metadata:
+      labels:
+        app: django
+    spec:
+      containers:
+        - name: django
+          image: "{{ .Values.django.image.repository }}:{{ .Values.django.image.tag }}"
+          imagePullPolicy: {{ .Values.django.image.pullPolicy }}
+          command:
+            - bash
+            - -c
+            - >
+              python manage.py collectstatic --noinput &&
+              cd {{ .Values.django.workingDir }} &&
+              watchmedo auto-restart -p '*.py' --recursive --
+              gunicorn asgi:application -w {{ .Values.django.gunicorn.workers }} -k uvicorn.workers.UvicornWorker -b :{{ .Values.django.port }} --capture-output --log-level {{ .Values.django.gunicorn.logLevel }}
+          env:
+            {{- range $key, $value := .Values.env }}
+            - name: {{ $key }}
+              value: "{{ $value }}"
+            {{- end }}
+            - name: DATABASE_URL
+              value: "postgres://{{ .Values.db.username }}:{{ .Values.db.password }}@{{ .Values.db.host }}:{{ .Values.db.port }}/{{ .Values.db.name }}"
+          volumeMounts:
+            - name: app-state
+              mountPath: /app/app-state
+      volumes:
+        - name: app-state
+          persistentVolumeClaim:
+            claimName: {{ .Values.appState.pvcName }}
diff --git a/charts/templates/django-service.yaml b/charts/templates/django-service.yaml
new file mode 100644
index 000000000..08a94857d
--- /dev/null
+++ b/charts/templates/django-service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: django
+spec:
+  selector:
+    app: django
+  ports:
+    - protocol: TCP
+      port: 8000
+      targetPort: {{ .Values.django.port }}
+
diff --git a/charts/templates/flower-deployment.yaml b/charts/templates/flower-deployment.yaml
new file mode 100644
index 000000000..401ba3173
--- /dev/null
+++ b/charts/templates/flower-deployment.yaml
@@ -0,0 +1,27 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: flower
+  labels:
+    app: flower
+spec:
+  replicas: {{ .Values.flower.replicas }}
+  selector:
+    matchLabels:
+      app: flower
+  template:
+    metadata:
+      labels:
+        app: flower
+    spec:
+      containers:
+        - name: flower
+          image: "{{ .Values.flower.image.repository }}:{{ .Values.flower.image.tag }}"
+          imagePullPolicy: {{ .Values.flower.image.pullPolicy }}
+          ports:
+            - containerPort: {{ .Values.flower.service.port }}
+          env:
+            - name: CELERY_BROKER_URL
+              value: "pyamqp://{{ .Values.env.RABBITMQ_DEFAULT_USER }}:{{ .Values.env.RABBITMQ_DEFAULT_PASS }}@{{ .Values.env.RABBITMQ_HOST }}:{{ .Values.env.RABBITMQ_PORT }}//"
+            - name: FLOWER_PORT
+              value: "{{ .Values.flower.service.port }}"
diff --git a/charts/templates/flower-service.yaml b/charts/templates/flower-service.yaml
new file mode 100644
index 000000000..5ec854ec4
--- /dev/null
+++ b/charts/templates/flower-service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: flower
+  labels:
+    app: flower
+spec:
+  type: ClusterIP
+  ports:
+    - port: {{ .Values.flower.service.port }}
+      targetPort: {{ .Values.flower.service.port }}
+  selector:
+    app: flower
diff --git a/charts/templates/ingress.yaml b/charts/templates/ingress.yaml
new file mode 100644
index 000000000..ae722a430
--- /dev/null
+++ b/charts/templates/ingress.yaml
@@ -0,0 +1,7 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: sample-http-ingress
+spec:
+  {{- toYaml .Values.ingress.spec | nindent 2 }}
+
diff --git a/charts/templates/istio.yaml b/charts/templates/istio.yaml
new file mode 100644
index 000000000..19343026c
--- /dev/null
+++ b/charts/templates/istio.yaml
@@ -0,0 +1,8 @@
+{{- if .Values.istio.enableVirtualService }}
+apiVersion: networking.istio.io/v1alpha3
+kind: VirtualService
+metadata:
+  name: codabench-vs
+spec:
+  {{- toYaml .Values.istio.spec | nindent 2 }}
+{{- end }}
diff --git a/charts/templates/shared-pvc.yaml b/charts/templates/shared-pvc.yaml
new file mode 100644
index 000000000..ab12b14a7
--- /dev/null
+++ b/charts/templates/shared-pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ .Values.sharedJob.pvcName }}
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: {{ .Values.sharedJob.storage }}
+  storageClassName: {{ .Values.sharedJob.storageClass }}
+
diff --git a/charts/templates/site-worker-deployment.yaml b/charts/templates/site-worker-deployment.yaml
new file mode 100644
index 000000000..9c5359efc
--- /dev/null
+++ b/charts/templates/site-worker-deployment.yaml
@@ -0,0 +1,52 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: site-worker
+  labels:
+    app: site-worker
+spec:
+  replicas: {{ .Values.siteWorker.replicas }}
+  selector:
+    matchLabels:
+      app: site-worker
+  template:
+    metadata:
+      labels:
+        app: site-worker
+        {{- with (.Values.siteWorker.metadata).labels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      {{- with .Values.siteWorker.securityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: site-worker
+          image: "{{ .Values.siteWorker.image.repository }}:{{ .Values.siteWorker.image.tag }}"
+          imagePullPolicy: {{ .Values.siteWorker.image.pullPolicy }}
+          workingDir: {{ .Values.siteWorker.workingDir }}
+          env:
+            - name: PYTHONPATH
+              value: {{ .Values.siteWorker.workingDir }}
+            - name: DATABASE_URL
+              value: "postgres://{{ .Values.db.username }}:{{ .Values.db.password }}@{{ .Values.db.host }}:{{ .Values.db.port }}/{{ .Values.db.name }}"
+            {{- range $key, $value := .Values.env }}
+            - name: {{ $key }}
+              value: "{{ $value }}"
+            {{- end }}
+          command:
+            - bash
+            - -c
+            - >
+              watchmedo auto-restart -p '*.py' --recursive --
+              celery -A celery_config worker -B -Q site-worker -l info
+              -n site-worker@%n --concurrency={{ .Values.siteWorker.concurrency }}
+          volumeMounts:
+            - name: app-state
+              mountPath: /app/app-state
+      volumes:
+        - name: app-state
+          persistentVolumeClaim:
+            claimName: {{ .Values.siteWorker.volumes.pvcName }}
+
diff --git a/charts/values.yaml b/charts/values.yaml
new file mode 100644
index 000000000..47a60d1a2
--- /dev/null
+++ b/charts/values.yaml
@@ -0,0 +1,205 @@
+istio:
+  enableVirtualService: false
+  spec:
+    gateways:
+      - / # e.g. kubeflow/kubeflow-gateway
+    hosts:
+      - '*' # Adjust if you want a specific hostname
+    http:
+      - match:
+          - uri:
+              exact: /
+        redirect:
+          uri: //
+        route:
+          - destination:
+              host: django
+              port:
+                number: 8000
+      - match:
+          - uri:
+              prefix: /
+        route:
+          - destination:
+              host: django
+              port:
+                number: 8000
+
+ingress:
+  spec:
+    ingressClassName: # e.g. nginx or traefik
+    rules:
+      - http:
+          paths:
+            - path: /
+              pathType: Prefix
+              backend:
+                service:
+                  name: django
+                  port:
+                    number: 8000
+
+django:
+  image:
+    repository:
+    pullPolicy: Always
+    tag:
+  replicas: 1
+  port: 8000
+  workingDir: /app/src
+  gunicorn:
+    workers: 2
+    logLevel: info
+
+flower:
+  image:
+    repository:
+    tag:
+    pullPolicy: Always
+  replicas: 1
+  service:
+    port: 5555
+
+compute_worker:
+  image:
+    repository:
+    tag:
+    pullPolicy: Always
+  podCreationRetries:
+    numberOfRetries: 30
+    sleepTimeBetweenRetries: 10
+  submissionPods:
+    # Labels serialized to JSON and passed to the worker as COMPUTE_WORKER_LABELS
+    metadata:
+      labels: {}
+    securityContext:
+      runAsUser: 1000
+      runAsGroup: 1000
+      fsGroup: 1000
+  resources:
+    requests:
+      memory: 256Mi
+    limits:
+      memory: 512Mi
+  volumes:
+    pvcName:
+  brokers:
+    - name: "default"
+      gpu:
+        enabled: false
+
+redis:
+  image:
+    registry:
+    repository: kubeflow/bitnamilegacy/redis
+    tag: 7.2.5-debian-12-r0
+    pullPolicy: Always
+  enabled: true
+  auth:
+    enabled: false
+  architecture: standalone
+  master:
+    persistence:
+      enabled: false
+      storageClass:
+      size: 1Gi
+    service:
+      ports:
+        redis: 6379
+
+rabbitmq:
+  image:
+    registry:
+    repository: kubeflow/bitnamilegacy/rabbitmq
+    tag: 3.13.7-debian-12-r2
+    pullPolicy: Always
+  enabled: true
+  auth:
+    username: rabbit-username
+    password: rabbit-password-you-should-change
+  service:
+    ports:
+      amqp: 5672
+  persistence:
+    enabled: true
+    storageClass:
+    size: 1Gi
+  extraEnvVars:
+    - name: RABBITMQ_SERVER_ADDITIONAL_ERL_ARGS
+      value: "-rabbit consumer_timeout 100000000"
+
+siteWorker:
+  image:
+    repository:
+    tag:
+    pullPolicy: Always
+  replicas: 1
+  workingDir: /app/src
+  concurrency: 2
+  # Extra labels added to the site-worker pod template
+  metadata:
+    labels: {}
+  securityContext:
+    runAsUser: 0
+    runAsGroup: 0
+    fsGroup: 0
+  volumes:
+    pvcName:
+
+global:
+  defaultStorageClass:
+
+sharedJob:
+  storageClass:
+  pvcName:
+  storage: 1Gi
+
+appState:
+  storageClass:
+  pvcName:
+  storage: 1Gi
+
+db:
+  host:
+  name:
+  username:
+  password:
+  port:
+
+env:
+  SECRET_KEY:
+
+  DJANGO_SETTINGS_MODULE: settings.develop
+  ALLOWED_HOSTS:
+  SUBMISSIONS_API_URL: http://django:8000/challenges/api
+  MAX_EXECUTION_TIME_LIMIT: "600"
+
+  DOMAIN_NAME:
+
+  TLS_EMAIL:
+
+  RABBITMQ_HOST:
+  RABBITMQ_DEFAULT_USER:
+  RABBITMQ_DEFAULT_PASS:
+  RABBITMQ_MANAGEMENT_PORT: "15672"
+  RABBITMQ_PORT: "5672"
+
+  FLOWER_PUBLIC_PORT: "5555"
+  FLOWER_BASIC_AUTH:
+
+  SELENIUM_HOSTNAME:
+
+  RERUN_SUBMISSION_LIMIT: "30"
+
+  ENABLE_SIGN_UP: "False"
+  ENABLE_SIGN_IN: "False"
+
+  STORAGE_TYPE: s3
+  AWS_ACCESS_KEY_ID:
+  AWS_SECRET_ACCESS_KEY:
+  AWS_STORAGE_BUCKET_NAME:
+  AWS_STORAGE_PRIVATE_BUCKET_NAME:
+  AWS_S3_ENDPOINT_URL:
+  AWS_QUERYSTRING_AUTH: "False"
+
+  REDIS_URL: redis://:6379