Skip to content

Commit e9d0fa9

Browse files
committed
feat: add support for Kubernetes startup probes
Add full support for Kubernetes startup probes with proper initialDelaySeconds handling.

Startup probes now:
- Respect the configured initialDelaySeconds before the first execution
- Run in the background without blocking container startup
- Block readiness and liveness probes until the startup probe succeeds
- Support both HTTP and Exec probe types
- Include proper cleanup and error handling

Changes:
- Update ContainerCommand struct to include startupProbes field
- Extend translateKubernetesProbes to handle startup probe translation
- Implement runStartupProbe and waitForStartupProbes functions
- Update probe metadata storage/loading for startup probe counts
- Add startup probe example demonstrating 300s initial delay
- Fix probe script generation to properly sequence probe execution

Startup probes follow Kubernetes semantics: they must succeed before other probes begin execution, which prevents premature readiness/liveness checks during application initialization.

Signed-off-by: Diego Ciangottini <diego.ciangottini@pg.infn.it>
1 parent e3682d5 commit e9d0fa9

File tree

6 files changed

+223
-24
lines changed

6 files changed

+223
-24
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
apiVersion: v1
2+
kind: Pod
3+
metadata:
4+
name: startup-probe-test-pod
5+
namespace: vk
6+
spec:
7+
containers:
8+
- name: app-container
9+
image: nginx:alpine
10+
ports:
11+
- containerPort: 80
12+
startupProbe:
13+
httpGet:
14+
path: /
15+
port: 80
16+
scheme: HTTP
17+
initialDelaySeconds: 300
18+
periodSeconds: 10
19+
timeoutSeconds: 5
20+
successThreshold: 1
21+
failureThreshold: 3
22+
readinessProbe:
23+
httpGet:
24+
path: /
25+
port: 80
26+
scheme: HTTP
27+
initialDelaySeconds: 5
28+
periodSeconds: 10
29+
timeoutSeconds: 5
30+
successThreshold: 1
31+
failureThreshold: 3
32+
livenessProbe:
33+
httpGet:
34+
path: /
35+
port: 80
36+
scheme: HTTP
37+
initialDelaySeconds: 15
38+
periodSeconds: 20
39+
timeoutSeconds: 5
40+
successThreshold: 1
41+
failureThreshold: 3
42+
restartPolicy: Always

pkg/slurm/Create.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,15 @@ func (h *SidecarHandler) SubmitHandler(w http.ResponseWriter, r *http.Request) {
148148
)
149149

150150
// Process probes if enabled
151-
var readinessProbes, livenessProbes []ProbeCommand
151+
var readinessProbes, livenessProbes, startupProbes []ProbeCommand
152152
if h.Config.EnableProbes && !isInit {
153-
readinessProbes, livenessProbes = translateKubernetesProbes(spanCtx, container)
154-
if len(readinessProbes) > 0 || len(livenessProbes) > 0 {
153+
readinessProbes, livenessProbes, startupProbes = translateKubernetesProbes(spanCtx, container)
154+
if len(readinessProbes) > 0 || len(livenessProbes) > 0 || len(startupProbes) > 0 {
155155
log.G(h.Ctx).Info("-- Container " + container.Name + " has probes configured")
156156
span.SetAttributes(
157157
attribute.Int("job.container"+strconv.Itoa(i)+".readiness_probes", len(readinessProbes)),
158158
attribute.Int("job.container"+strconv.Itoa(i)+".liveness_probes", len(livenessProbes)),
159+
attribute.Int("job.container"+strconv.Itoa(i)+".startup_probes", len(startupProbes)),
159160
)
160161
}
161162
}
@@ -168,6 +169,7 @@ func (h *SidecarHandler) SubmitHandler(w http.ResponseWriter, r *http.Request) {
168169
isInitContainer: isInit,
169170
readinessProbes: readinessProbes,
170171
livenessProbes: livenessProbes,
172+
startupProbes: startupProbes,
171173
containerImage: image,
172174
})
173175
}

pkg/slurm/Status.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) {
212212
}
213213
for _, ct := range pod.Spec.Containers {
214214
// Check probe status for container readiness
215-
readinessCount, _, err := loadProbeMetadata(path, ct.Name)
215+
readinessCount, _, _, err := loadProbeMetadata(path, ct.Name)
216216
isReady := true
217217
if err != nil {
218218
log.G(h.Ctx).Debug("Failed to load probe metadata for container ", ct.Name, ": ", err)
@@ -287,7 +287,7 @@ func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) {
287287
}
288288
for _, ct := range pod.Spec.Containers {
289289
// Check probe status for container readiness
290-
readinessCount, _, err := loadProbeMetadata(path, ct.Name)
290+
readinessCount, _, _, err := loadProbeMetadata(path, ct.Name)
291291
isReady := true
292292
if err != nil {
293293
log.G(h.Ctx).Debug("Failed to load probe metadata for container ", ct.Name, ": ", err)

pkg/slurm/prepare.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -836,15 +836,15 @@ highestExitCode=0
836836
// Generate probe cleanup script first if any probes exist
837837
var hasProbes bool
838838
for _, containerCommand := range commands {
839-
if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 {
839+
if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 || len(containerCommand.startupProbes) > 0 {
840840
hasProbes = true
841841
break
842842
}
843843
}
844844
if hasProbes && config.EnableProbes {
845845
for _, containerCommand := range commands {
846-
if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 {
847-
cleanupScript := generateProbeCleanupScript(containerCommand.containerName, containerCommand.readinessProbes, containerCommand.livenessProbes)
846+
if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 || len(containerCommand.startupProbes) > 0 {
847+
cleanupScript := generateProbeCleanupScript(containerCommand.containerName, containerCommand.readinessProbes, containerCommand.livenessProbes, containerCommand.startupProbes)
848848
stringToBeWritten.WriteString(cleanupScript)
849849
break // Only need one cleanup script
850850
}
@@ -898,7 +898,7 @@ highestExitCode=0
898898
}
899899

900900
// Generate probe scripts if enabled and not an init container
901-
if config.EnableProbes && !containerCommand.isInitContainer && (len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0) {
901+
if config.EnableProbes && !containerCommand.isInitContainer && (len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 || len(containerCommand.startupProbes) > 0) {
902902
// Extract the image name from the singularity command
903903
var imageName string
904904
for i, arg := range containerCommand.runtimeCommand {
@@ -922,12 +922,12 @@ highestExitCode=0
922922

923923
if imageName != "" {
924924
// Store probe metadata for status checking
925-
err := storeProbeMetadata(path, containerCommand.containerName, len(containerCommand.readinessProbes), len(containerCommand.livenessProbes))
925+
err := storeProbeMetadata(path, containerCommand.containerName, len(containerCommand.readinessProbes), len(containerCommand.livenessProbes), len(containerCommand.startupProbes))
926926
if err != nil {
927927
log.G(Ctx).Error("Failed to store probe metadata: ", err)
928928
}
929929

930-
probeScript := generateProbeScript(Ctx, config, containerCommand.containerName, imageName, containerCommand.readinessProbes, containerCommand.livenessProbes)
930+
probeScript := generateProbeScript(Ctx, config, containerCommand.containerName, imageName, containerCommand.readinessProbes, containerCommand.livenessProbes, containerCommand.startupProbes)
931931
stringToBeWritten.WriteString("\n")
932932
stringToBeWritten.WriteString(probeScript)
933933
}

pkg/slurm/probes.go

Lines changed: 167 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,19 @@ import (
1515
)
1616

1717
// translateKubernetesProbes converts Kubernetes probe specifications to internal ProbeCommand format
18-
func translateKubernetesProbes(ctx context.Context, container v1.Container) ([]ProbeCommand, []ProbeCommand) {
19-
var readinessProbes, livenessProbes []ProbeCommand
18+
func translateKubernetesProbes(ctx context.Context, container v1.Container) ([]ProbeCommand, []ProbeCommand, []ProbeCommand) {
19+
var readinessProbes, livenessProbes, startupProbes []ProbeCommand
2020
span := trace.SpanFromContext(ctx)
2121

22+
// Handle startup probe
23+
if container.StartupProbe != nil {
24+
probe := translateSingleProbe(ctx, container.StartupProbe)
25+
if probe != nil {
26+
startupProbes = append(startupProbes, *probe)
27+
span.AddEvent("Translated startup probe for container " + container.Name)
28+
}
29+
}
30+
2231
// Handle readiness probe
2332
if container.ReadinessProbe != nil {
2433
probe := translateSingleProbe(ctx, container.ReadinessProbe)
@@ -37,7 +46,7 @@ func translateKubernetesProbes(ctx context.Context, container v1.Container) ([]P
3746
}
3847
}
3948

40-
return readinessProbes, livenessProbes
49+
return readinessProbes, livenessProbes, startupProbes
4150
}
4251

4352
// translateSingleProbe converts a single Kubernetes probe to internal format
@@ -103,11 +112,11 @@ func translateSingleProbe(ctx context.Context, k8sProbe *v1.Probe) *ProbeCommand
103112
}
104113

105114
// generateProbeScript generates the shell script commands for executing probes
106-
func generateProbeScript(ctx context.Context, config SlurmConfig, containerName string, imageName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand) string {
115+
func generateProbeScript(ctx context.Context, config SlurmConfig, containerName string, imageName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand, startupProbes []ProbeCommand) string {
107116
span := trace.SpanFromContext(ctx)
108117
span.AddEvent("Generating probe script for container " + containerName)
109118

110-
if len(readinessProbes) == 0 && len(livenessProbes) == 0 {
119+
if len(readinessProbes) == 0 && len(livenessProbes) == 0 && len(startupProbes) == 0 {
111120
return ""
112121
}
113122

@@ -231,8 +240,134 @@ runProbe() {
231240
return 0
232241
}
233242
243+
runStartupProbe() {
244+
local probe_type="$1"
245+
local container_name="$2"
246+
local initial_delay="$3"
247+
local period="$4"
248+
local timeout="$5"
249+
local success_threshold="$6"
250+
local failure_threshold="$7"
251+
local probe_name="$8"
252+
local probe_index="$9"
253+
shift 9
254+
local probe_args=("$@")
255+
256+
local probe_status_file="${workingPath}/${probe_name}-probe-${container_name}-${probe_index}.status"
257+
258+
printf "%%s\n" "$(date -Is --utc) Starting ${probe_name} probe for container ${container_name}..."
259+
260+
# Initialize probe status as running
261+
echo "RUNNING" > "$probe_status_file"
262+
263+
# Initial delay - startup probe waits before starting
264+
if [ "$initial_delay" -gt 0 ]; then
265+
printf "%%s\n" "$(date -Is --utc) Waiting ${initial_delay}s before starting ${probe_name} probe..."
266+
sleep "$initial_delay"
267+
fi
268+
269+
local consecutive_successes=0
270+
local consecutive_failures=0
271+
272+
while true; do
273+
if [ "$probe_type" = "http" ]; then
274+
executeHTTPProbe "${probe_args[@]}" "$container_name"
275+
elif [ "$probe_type" = "exec" ]; then
276+
executeExecProbe "$timeout" "$container_name" "${probe_args[@]}"
277+
fi
278+
279+
local exit_code=$?
280+
281+
if [ $exit_code -eq 0 ]; then
282+
consecutive_successes=$((consecutive_successes + 1))
283+
consecutive_failures=0
284+
printf "%%s\n" "$(date -Is --utc) ${probe_name} probe succeeded for ${container_name} (${consecutive_successes}/${success_threshold})"
285+
286+
if [ $consecutive_successes -ge $success_threshold ]; then
287+
printf "%%s\n" "$(date -Is --utc) ${probe_name} probe successful for ${container_name} - other probes can now start"
288+
echo "SUCCESS" > "$probe_status_file"
289+
return 0
290+
fi
291+
else
292+
consecutive_failures=$((consecutive_failures + 1))
293+
consecutive_successes=0
294+
printf "%%s\n" "$(date -Is --utc) ${probe_name} probe failed for ${container_name} (${consecutive_failures}/${failure_threshold})"
295+
296+
if [ $consecutive_failures -ge $failure_threshold ]; then
297+
printf "%%s\n" "$(date -Is --utc) ${probe_name} probe failed for ${container_name} after ${failure_threshold} attempts - container should be restarted" >&2
298+
echo "FAILED_THRESHOLD" > "$probe_status_file"
299+
return 1
300+
fi
301+
fi
302+
303+
sleep "$period"
304+
done
305+
}
306+
307+
waitForStartupProbes() {
308+
local container_name="$1"
309+
local startup_probe_count="$2"
310+
311+
if [ "$startup_probe_count" -eq 0 ]; then
312+
return 0
313+
fi
314+
315+
printf "%%s\n" "$(date -Is --utc) Waiting for startup probes to succeed before starting other probes for ${container_name}..."
316+
317+
while true; do
318+
local all_startup_probes_successful=true
319+
320+
for i in $(seq 0 $((startup_probe_count - 1))); do
321+
local probe_status_file="${workingPath}/startup-probe-${container_name}-${i}.status"
322+
if [ ! -f "$probe_status_file" ]; then
323+
all_startup_probes_successful=false
324+
break
325+
fi
326+
327+
local status=$(cat "$probe_status_file")
328+
if [ "$status" != "SUCCESS" ]; then
329+
if [ "$status" = "FAILED_THRESHOLD" ]; then
330+
printf "%%s\n" "$(date -Is --utc) Startup probe failed for ${container_name} - other probes will not start" >&2
331+
return 1
332+
fi
333+
all_startup_probes_successful=false
334+
break
335+
fi
336+
done
337+
338+
if [ "$all_startup_probes_successful" = true ]; then
339+
printf "%%s\n" "$(date -Is --utc) All startup probes successful for ${container_name} - other probes can now start"
340+
return 0
341+
fi
342+
343+
sleep 1
344+
done
345+
}
346+
234347
`)
235348

349+
// Generate startup probe calls - these run in background but block other probes
350+
for i, probe := range startupProbes {
351+
probeArgs := buildProbeArgs(probe)
352+
containerVarName := strings.ReplaceAll(containerName, "-", "_")
353+
scriptBuilder.WriteString(fmt.Sprintf(`
354+
# Startup probe %d for %s
355+
runStartupProbe "%s" "%s" %d %d %d %d %d "startup" %d %s &
356+
STARTUP_PROBE_%s_%d_PID=$!
357+
`, i, containerName, probe.Type, containerName, probe.InitialDelaySeconds, probe.PeriodSeconds,
358+
probe.TimeoutSeconds, probe.SuccessThreshold, probe.FailureThreshold, i, probeArgs, containerVarName, i))
359+
}
360+
361+
// Wait for startup probes before starting other probes
362+
if len(startupProbes) > 0 {
363+
scriptBuilder.WriteString(fmt.Sprintf(`
364+
# Wait for startup probes to complete before starting readiness/liveness probes
365+
(
366+
waitForStartupProbes "%s" %d
367+
if [ $? -eq 0 ]; then
368+
`, containerName, len(startupProbes)))
369+
}
370+
236371
// Generate readiness probe calls
237372
for i, probe := range readinessProbes {
238373
probeArgs := buildProbeArgs(probe)
@@ -257,10 +392,19 @@ LIVENESS_PROBE_%s_%d_PID=$!
257392
probe.TimeoutSeconds, probe.SuccessThreshold, probe.FailureThreshold, i, probeArgs, containerVarName, i))
258393
}
259394

395+
// Close the startup probe conditional block
396+
if len(startupProbes) > 0 {
397+
scriptBuilder.WriteString(`
398+
fi
399+
) &
400+
`)
401+
}
402+
260403
span.SetAttributes(
261404
attribute.String("probes.container.name", containerName),
262405
attribute.Int("probes.readiness.count", len(readinessProbes)),
263406
attribute.Int("probes.liveness.count", len(livenessProbes)),
407+
attribute.Int("probes.startup.count", len(startupProbes)),
264408
)
265409

266410
return scriptBuilder.String()
@@ -287,8 +431,8 @@ func buildProbeArgs(probe ProbeCommand) string {
287431
}
288432

289433
// generateProbeCleanupScript generates cleanup commands for probe processes
290-
func generateProbeCleanupScript(containerName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand) string {
291-
if len(readinessProbes) == 0 && len(livenessProbes) == 0 {
434+
func generateProbeCleanupScript(containerName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand, startupProbes []ProbeCommand) string {
435+
if len(readinessProbes) == 0 && len(livenessProbes) == 0 && len(startupProbes) == 0 {
292436
return ""
293437
}
294438

@@ -317,6 +461,14 @@ cleanup_probes() {
317461
`, containerVarName, i, containerVarName, i))
318462
}
319463

464+
// Kill startup probes
465+
for i := range startupProbes {
466+
scriptBuilder.WriteString(fmt.Sprintf(` if [ ! -z "$STARTUP_PROBE_%s_%d_PID" ]; then
467+
kill $STARTUP_PROBE_%s_%d_PID 2>/dev/null || true
468+
fi
469+
`, containerVarName, i, containerVarName, i))
470+
}
471+
320472
scriptBuilder.WriteString(`}
321473
322474
# Set up trap to cleanup probes on exit
@@ -436,22 +588,22 @@ func checkContainerLiveness(ctx context.Context, config SlurmConfig, workingPath
436588
}
437589

438590
// storeProbeMetadata saves probe count information for later status checking
439-
func storeProbeMetadata(workingPath, containerName string, readinessProbeCount, livenessProbeCount int) error {
591+
func storeProbeMetadata(workingPath, containerName string, readinessProbeCount, livenessProbeCount, startupProbeCount int) error {
440592
metadataFile := fmt.Sprintf("%s/probe-metadata-%s.txt", workingPath, containerName)
441-
content := fmt.Sprintf("readiness:%d\nliveness:%d", readinessProbeCount, livenessProbeCount)
593+
content := fmt.Sprintf("readiness:%d\nliveness:%d\nstartup:%d", readinessProbeCount, livenessProbeCount, startupProbeCount)
442594
return os.WriteFile(metadataFile, []byte(content), 0644)
443595
}
444596

445597
// loadProbeMetadata loads probe count information for status checking
446-
func loadProbeMetadata(workingPath, containerName string) (readinessCount, livenessCount int, err error) {
598+
func loadProbeMetadata(workingPath, containerName string) (readinessCount, livenessCount, startupCount int, err error) {
447599
metadataFile := fmt.Sprintf("%s/probe-metadata-%s.txt", workingPath, containerName)
448600
content, err := os.ReadFile(metadataFile)
449601
if err != nil {
450602
if os.IsNotExist(err) {
451603
// No probe metadata file means no probes configured
452-
return 0, 0, nil
604+
return 0, 0, 0, nil
453605
}
454-
return 0, 0, err
606+
return 0, 0, 0, err
455607
}
456608

457609
lines := strings.Split(string(content), "\n")
@@ -471,8 +623,10 @@ func loadProbeMetadata(workingPath, containerName string) (readinessCount, liven
471623
readinessCount = count
472624
case "liveness":
473625
livenessCount = count
626+
case "startup":
627+
startupCount = count
474628
}
475629
}
476630

477-
return readinessCount, livenessCount, nil
631+
return readinessCount, livenessCount, startupCount, nil
478632
}

0 commit comments

Comments
 (0)