Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
471 changes: 471 additions & 0 deletions e2e-tests/llm-katan/deploy/docs/README.md

Large diffs are not rendered by default.

144 changes: 144 additions & 0 deletions e2e-tests/llm-katan/deploy/kubernetes/base/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-katan
spec:
  # The `app: llm-katan` label is set explicitly so that:
  #   1. the Deployment is valid on its own (apps/v1 rejects an empty selector), and
  #   2. the Service's `selector: app: llm-katan` actually matches the pods.
  # The common kustomize component additionally injects app.kubernetes.io/*
  # labels into both the selector and the template (includeSelectors: true).
  selector:
    matchLabels:
      app: llm-katan
  replicas: 1
  template:
    metadata:
      labels:
        app: llm-katan
    spec:
      # Run as the non-root user created in the Dockerfile (uid/gid 1000).
      securityContext:
        fsGroup: 1000
        runAsUser: 1000
        runAsNonRoot: true

      initContainers:
        # Pre-download the model into the PVC for faster startup.
        # Uses the lightweight python:3.11-slim image and skips the download
        # when the model directory already exists in the cache volume.
        - name: model-downloader
          image: python:3.11-slim
          imagePullPolicy: IfNotPresent
          securityContext:
            runAsUser: 0  # root is needed to `pip install` inside the image
            runAsNonRoot: false
            allowPrivilegeEscalation: false
          command: ["/bin/bash", "-c"]
          args:
            - |
              set -e

              MODEL_ID="${YLLM_MODEL:-Qwen/Qwen3-0.6B}"
              MODEL_DIR=$(basename "$MODEL_ID")

              mkdir -p /cache/models
              cd /cache/models

              # Model already cached in the PVC from a previous run.
              if [ -d "$MODEL_DIR" ]; then
                echo "Model $MODEL_ID already cached. Skipping download."
                exit 0
              fi

              # Model not found, proceed with download.
              echo "Downloading model $MODEL_ID..."
              # Quote the extras spec so the shell never glob-expands "[cli]".
              pip install --no-cache-dir "huggingface_hub[cli]"
              hf download "$MODEL_ID" --local-dir "$MODEL_DIR"

              # Files were created by root; make sure the runtime uid (1000)
              # in the main container can read them through the shared volume.
              chmod -R a+rX "$MODEL_DIR"
          env:
            - name: YLLM_MODEL
              value: "Qwen/Qwen3-0.6B"
            - name: HF_HUB_CACHE
              value: "/tmp/hf_cache"
          volumeMounts:
            - name: models-volume
              mountPath: /cache/models
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"

      containers:
        - name: llm-katan
          image: llm-katan:latest
          imagePullPolicy: IfNotPresent

          # Command is taken from the image entrypoint; behaviour is driven by
          # the YLLM_* environment variables below.
          # Default: llm-katan --model Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000

          ports:
            - name: http
              containerPort: 8000
              protocol: TCP

          env:
            # These can be overridden via ConfigMap in overlays.
            # NOTE: the path component must equal basename of the init
            # container's YLLM_MODEL, since that is where it downloads to.
            - name: YLLM_MODEL
              value: "/cache/models/Qwen3-0.6B"  # local path to downloaded model
            - name: YLLM_PORT
              value: "8000"
            - name: YLLM_HOST
              value: "0.0.0.0"
            - name: YLLM_BACKEND
              value: "transformers"
            - name: PYTHONUNBUFFERED
              value: "1"
            - name: PYTHONDONTWRITEBYTECODE
              value: "1"

          volumeMounts:
            - name: models-volume
              mountPath: /cache/models

          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 15
            periodSeconds: 20
            timeoutSeconds: 5
            failureThreshold: 3

          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3

          # Gates liveness/readiness until model load completes.
          startupProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 60  # 15 minutes max startup time (slow model loads)

          resources:
            requests:
              memory: "3Gi"
              cpu: "1"
            limits:
              memory: "6Gi"
              cpu: "2"

          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: false  # HuggingFace needs to write to cache
            runAsNonRoot: true
            capabilities:
              drop:
                - ALL

      volumes:
        - name: models-volume
          persistentVolumeClaim:
            claimName: llm-katan-models
21 changes: 21 additions & 0 deletions e2e-tests/llm-katan/deploy/kubernetes/base/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

metadata:
  name: llm-katan-base

# All resources below are created in (and re-namespaced to) this namespace.
namespace: llm-katan-system

resources:
  - namespace.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml

# Images (can be overridden in overlays)
images:
  - name: llm-katan
    newName: llm-katan
    newTag: latest

4 changes: 4 additions & 0 deletions e2e-tests/llm-katan/deploy/kubernetes/base/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Dedicated namespace for the llm-katan test-server resources; matches the
# `namespace:` field in the base kustomization.
apiVersion: v1
kind: Namespace
metadata:
  name: llm-katan-system
10 changes: 10 additions & 0 deletions e2e-tests/llm-katan/deploy/kubernetes/base/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Persistent cache for downloaded HuggingFace models, shared between the
# model-downloader init container and the llm-katan server container.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llm-katan-models
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi  # Increased for model cache (~600MB model + overhead)
14 changes: 14 additions & 0 deletions e2e-tests/llm-katan/deploy/kubernetes/base/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: v1
kind: Service
metadata:
  name: llm-katan
spec:
  type: ClusterIP
  # NOTE(review): this selector only matches pods whose template carries an
  # `app: llm-katan` label — confirm the Deployment's pod template sets it.
  # The common component injects app.kubernetes.io/* labels (not `app`), so
  # this key must come from the pod template itself.
  selector:
    app: llm-katan
  ports:
    - name: http
      port: 8000
      targetPort: http  # resolves to the container port named "http" (8000)
      protocol: TCP

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

# Common labels applied to all resources that use this component.
# includeSelectors: true also injects these pairs into Deployment
# spec.selector.matchLabels, pod template labels, and Service spec.selector,
# so every overlay gets a consistent label/selector set.
labels:
  - includeSelectors: true
    pairs:
      app.kubernetes.io/name: llm-katan
      app.kubernetes.io/part-of: semantic-router-workspaces
      app.kubernetes.io/managed-by: kustomize
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

metadata:
  name: llm-katan-claude

resources:
  - ../../base

components:
  - ../../components/common

# Every resource name from the base gets this suffix (llm-katan-claude, ...).
nameSuffix: -claude

patches:
  # Serve the model under a Claude alias and tag the pods with it.
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: add
        path: /spec/template/spec/containers/0/env/-
        value:
          name: YLLM_SERVED_MODEL_NAME
          value: "claude-3-haiku-20240307"
      - op: add
        path: /spec/template/metadata/labels/model-alias
        value: "claude-3-haiku"
  - target:
      kind: Service
      name: llm-katan
    patch: |-
      - op: add
        path: /metadata/labels/model-alias
        value: "claude-3-haiku"
  # Update PVC reference in deployment to match suffixed PVC name.
  # NOTE(review): kustomize's name-reference fixup usually rewrites
  # persistentVolumeClaim.claimName for a suffixed PVC automatically; this
  # explicit patch is believed redundant but harmless — verify before removing.
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: replace
        path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName
        value: llm-katan-models-claude
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - ../../base

components:
  - ../../components/common

# Every resource name from the base gets this suffix (llm-katan-gpt35, ...).
nameSuffix: -gpt35

patches:
  # Serve the model under a GPT-3.5 alias and tag the pods with it.
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: add
        path: /spec/template/spec/containers/0/env/-
        value:
          name: YLLM_SERVED_MODEL_NAME
          value: "gpt-3.5-turbo"
      - op: add
        path: /spec/template/metadata/labels/model-alias
        value: "gpt-3.5-turbo"

  - target:
      kind: Service
      name: llm-katan
    patch: |-
      - op: add
        path: /metadata/labels/model-alias
        value: "gpt-3.5-turbo"

  # Update PVC reference in deployment to match suffixed PVC name.
  # NOTE(review): kustomize's name-reference fixup usually rewrites
  # persistentVolumeClaim.claimName for a suffixed PVC automatically; this
  # explicit patch is believed redundant but harmless — verify before removing.
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: replace
        path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName
        value: llm-katan-models-gpt35
5 changes: 2 additions & 3 deletions e2e-tests/llm-katan/llm_katan/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,14 @@ def __post_init__(self):

# Apply environment variable overrides
self.model_name = os.getenv("YLLM_MODEL", self.model_name)
self.served_model_name = os.getenv("YLLM_SERVED_MODEL_NAME", self.served_model_name)
self.port = int(os.getenv("YLLM_PORT", str(self.port)))
self.backend = os.getenv("YLLM_BACKEND", self.backend)
self.host = os.getenv("YLLM_HOST", self.host)

# Validate backend
if self.backend not in ["transformers", "vllm"]:
raise ValueError(
f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'"
)
raise ValueError(f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'")

@property
def device_auto(self) -> str:
Expand Down
Loading