Skip to content

Commit 62e5d54

Browse files
author
Noa Limoy
committed
feat(llm-katan): Add Kubernetes deployment support
- Add comprehensive Kustomize manifests (base + overlays for gpt35/claude) - Implement initContainer for efficient model caching using PVC - Fix config.py to read YLLM_SERVED_MODEL_NAME from environment variables - Add deployment documentation with examples for Kind cluster / Minikube This enables running multiple llm-katan instances in Kubernetes, each serving different model aliases while sharing the same underlying model. The overlays (gpt35, claude) demonstrate multi-instance deployments where each instance exposes a different served model name (e.g., gpt-3.5-turbo, claude-3-haiku-20240307) via the API. The served model name now works via environment variables, enabling Kubernetes deployments to expose different model names via the API. Signed-off-by: Noa Limoy <nlimoy@nlimoy-thinkpadp1gen7.raanaii.csb>
1 parent c3ce62e commit 62e5d54

File tree

10 files changed

+759
-3
lines changed

10 files changed

+759
-3
lines changed

e2e-tests/llm-katan/deploy/docs/README.md

Lines changed: 471 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-katan
spec:
  # NOTE(review): selector/labels are intentionally empty; the kustomize
  # "common" component (labels with includeSelectors: true) injects matching
  # pairs into both the selector and the pod template. This base is NOT
  # deployable standalone — apps/v1 requires a non-empty selector.
  selector:
    matchLabels: {}
  replicas: 1
  template:
    metadata:
      labels: {}
    spec:
      # Run as a non-root user for security (matching Dockerfile)
      securityContext:
        fsGroup: 1000
        runAsUser: 1000
        runAsNonRoot: true

      initContainers:
        # Pre-download the model into the shared PVC for faster startup.
        # Uses lightweight python:3.11-slim image and checks if the model
        # exists before downloading.
        - name: model-downloader
          image: python:3.11-slim
          imagePullPolicy: IfNotPresent
          securityContext:
            runAsUser: 0  # Run as root to install packages
            runAsNonRoot: false
            allowPrivilegeEscalation: false
          command: ["/bin/bash", "-c"]
          args:
            - |
              set -e

              MODEL_ID="${YLLM_MODEL:-Qwen/Qwen3-0.6B}"
              MODEL_DIR=$(basename "$MODEL_ID")

              mkdir -p /cache/models
              cd /cache/models

              # Check if model already exists in PVC
              if [ -d "$MODEL_DIR" ]; then
                echo "Model $MODEL_ID already cached. Skipping download."
                exit 0
              fi

              # Model not found, proceed with download.
              echo "Downloading model $MODEL_ID..."
              # FIX: quote the extras spec — an unquoted [cli] is a shell
              # glob pattern and can expand unexpectedly if a matching file
              # exists in the working directory.
              pip install --no-cache-dir "huggingface_hub[cli]"
              # NOTE(review): the `hf` entry point requires a recent
              # huggingface_hub release — confirm pinned version if builds
              # must be reproducible.
              hf download "$MODEL_ID" --local-dir "$MODEL_DIR"
          env:
            - name: YLLM_MODEL
              value: "Qwen/Qwen3-0.6B"
            - name: HF_HUB_CACHE
              value: "/tmp/hf_cache"
          volumeMounts:
            - name: models-volume
              mountPath: /cache/models
          resources:
            requests:
              memory: "512Mi"
              cpu: "250m"
            limits:
              memory: "1Gi"
              cpu: "500m"

      containers:
        - name: llm-katan
          image: llm-katan:latest
          imagePullPolicy: IfNotPresent

          # Command is set via environment variables.
          # Default: llm-katan --model Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000

          ports:
            - name: http
              containerPort: 8000
              protocol: TCP

          env:
            # These can be overridden via ConfigMap/patches in overlays
            - name: YLLM_MODEL
              value: "/cache/models/Qwen3-0.6B"  # Local path written by the initContainer
            - name: YLLM_PORT
              value: "8000"
            - name: YLLM_HOST
              value: "0.0.0.0"
            - name: YLLM_BACKEND
              value: "transformers"
            - name: PYTHONUNBUFFERED
              value: "1"
            - name: PYTHONDONTWRITEBYTECODE
              value: "1"

          volumeMounts:
            - name: models-volume
              mountPath: /cache/models

          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 15
            periodSeconds: 20
            timeoutSeconds: 5
            failureThreshold: 3

          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3

          startupProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 60  # 15 minutes max startup time (for slow model loads)

          resources:
            requests:
              memory: "3Gi"
              cpu: "1"
            limits:
              memory: "6Gi"
              cpu: "2"

          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: false  # HuggingFace needs to write to cache
            runAsNonRoot: true
            capabilities:
              drop:
                - ALL

      volumes:
        - name: models-volume
          persistentVolumeClaim:
            claimName: llm-katan-models
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

metadata:
  name: llm-katan-base

# All base resources land in a dedicated namespace.
namespace: llm-katan-system

resources:
  - namespace.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml

# Image pin for the server container; overlays may override newName/newTag.
images:
  - name: llm-katan
    newName: llm-katan
    newTag: latest
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Dedicated namespace for all llm-katan resources (set via the base
# Kustomization's `namespace` field).
apiVersion: v1
kind: Namespace
metadata:
  name: llm-katan-system
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Shared cache volume: the initContainer downloads the model here and the
# server container mounts it read-write at /cache/models.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llm-katan-models
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi  # room for the ~600MB model plus cache overhead
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
apiVersion: v1
kind: Service
metadata:
  name: llm-katan
spec:
  type: ClusterIP
  # FIX: selector left empty on purpose. The kustomize "common" labels
  # component (includeSelectors: true) injects identical pairs into this
  # selector AND the Deployment's pod template, so they are guaranteed to
  # match. The previous hard-coded `app: llm-katan` entry matched no pod —
  # the Deployment's pod template never sets an `app` label — so the
  # Service would have had zero endpoints.
  selector: {}
  ports:
    - name: http
      port: 8000
      targetPort: http
      protocol: TCP
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

# Common labels applied to every resource that pulls in this component.
# includeSelectors: true also injects the pairs into Deployment selectors,
# pod templates, and Service selectors so they stay in sync.
labels:
  - includeSelectors: true
    pairs:
      app.kubernetes.io/name: llm-katan
      app.kubernetes.io/part-of: semantic-router-workspaces
      app.kubernetes.io/managed-by: kustomize
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

metadata:
  name: llm-katan-claude

resources:
  - ../../base

components:
  - ../../components/common

# Every resource name gets this suffix (llm-katan-claude, llm-katan-models-claude, ...).
nameSuffix: -claude

patches:
  # All Deployment tweaks in a single JSON-patch list: the served-model env
  # var, a model-alias pod label, and the PVC reference (nameSuffix renames
  # the PVC, so the claimName must be updated to match).
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: add
        path: /spec/template/spec/containers/0/env/-
        value:
          name: YLLM_SERVED_MODEL_NAME
          value: "claude-3-haiku-20240307"
      - op: add
        path: /spec/template/metadata/labels/model-alias
        value: "claude-3-haiku"
      - op: replace
        path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName
        value: llm-katan-models-claude
  # Mirror the alias label on the Service for discoverability.
  - target:
      kind: Service
      name: llm-katan
    patch: |-
      - op: add
        path: /metadata/labels/model-alias
        value: "claude-3-haiku"
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - ../../base

components:
  - ../../components/common

# Every resource name gets this suffix (llm-katan-gpt35, llm-katan-models-gpt35, ...).
nameSuffix: -gpt35

patches:
  # Expose the underlying model under the OpenAI alias and tag the pod.
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: add
        path: /spec/template/spec/containers/0/env/-
        value:
          name: YLLM_SERVED_MODEL_NAME
          value: "gpt-3.5-turbo"
      - op: add
        path: /spec/template/metadata/labels/model-alias
        value: "gpt-3.5-turbo"

  # Mirror the alias label on the Service for discoverability.
  - target:
      kind: Service
      name: llm-katan
    patch: |-
      - op: add
        path: /metadata/labels/model-alias
        value: "gpt-3.5-turbo"

  # nameSuffix renames the PVC; point the Deployment's volume at the new name.
  - target:
      kind: Deployment
      name: llm-katan
    patch: |-
      - op: replace
        path: /spec/template/spec/volumes/0/persistentVolumeClaim/claimName
        value: llm-katan-models-gpt35

e2e-tests/llm-katan/llm_katan/config.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,14 @@ def __post_init__(self):
3131

3232
# Apply environment variable overrides
3333
self.model_name = os.getenv("YLLM_MODEL", self.model_name)
34+
self.served_model_name = os.getenv("YLLM_SERVED_MODEL_NAME", self.served_model_name)
3435
self.port = int(os.getenv("YLLM_PORT", str(self.port)))
3536
self.backend = os.getenv("YLLM_BACKEND", self.backend)
3637
self.host = os.getenv("YLLM_HOST", self.host)
3738

3839
# Validate backend
3940
if self.backend not in ["transformers", "vllm"]:
40-
raise ValueError(
41-
f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'"
42-
)
41+
raise ValueError(f"Invalid backend: {self.backend}. Must be 'transformers' or 'vllm'")
4342

4443
@property
4544
def device_auto(self) -> str:

0 commit comments

Comments
 (0)