From 5e70cfd4825be43c900f914c9c201d60c9c5c4c0 Mon Sep 17 00:00:00 2001
From: Diego Ciangottini
Date: Thu, 2 Oct 2025 19:47:35 +0200
Subject: [PATCH 1/2] feat: add support for Kubernetes startup probes

Add full support for Kubernetes startup probes with proper
initialDelaySeconds handling. Startup probes now:

- Respect initialDelaySeconds configuration before first execution
- Run in background without blocking container startup
- Block readiness and liveness probes until successful
- Support both HTTP and Exec probe types
- Include proper cleanup and error handling

Changes:

- Add vendor/ to .gitignore to exclude vendored dependencies
- Update ContainerCommand struct to include startupProbes field
- Extend translateKubernetesProbes to handle startup probe translation
- Implement runStartupProbe and waitForStartupProbes functions
- Update probe metadata storage/loading for startup probe counts
- Add startup probe example demonstrating 300s initial delay
- Fix probe script generation to properly sequence probe execution

Startup probes follow Kubernetes semantics where they must succeed before
other probes begin execution, preventing premature readiness/liveness
checks during application initialization.

Signed-off-by: Diego Ciangottini
---
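Notes (ignored by git-am): the sketch below shows how the new three-value
return of translateKubernetesProbes is meant to be consumed. It is
illustrative only — a hypothetical in-package test using the stock
k8s.io/api types — and is not code shipped in this patch.

// probes_startup_test.go (hypothetical, for illustration)
package slurm

import (
	"context"
	"testing"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func TestTranslateStartupProbe(t *testing.T) {
	httpProbe := func(delay int32) *v1.Probe {
		return &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				HTTPGet: &v1.HTTPGetAction{Path: "/", Port: intstr.FromInt(80)},
			},
			InitialDelaySeconds: delay,
			PeriodSeconds:       10,
			TimeoutSeconds:      5,
			SuccessThreshold:    1,
			FailureThreshold:    3,
		}
	}
	container := v1.Container{
		Name:           "app-container",
		StartupProbe:   httpProbe(300),
		ReadinessProbe: httpProbe(5),
	}

	// Startup probes are returned separately so the script generator can
	// gate readiness/liveness execution behind them.
	readiness, liveness, startup := translateKubernetesProbes(context.Background(), container)
	if len(startup) != 1 || len(readiness) != 1 || len(liveness) != 0 {
		t.Fatalf("unexpected probe counts: readiness=%d liveness=%d startup=%d",
			len(readiness), len(liveness), len(startup))
	}
}
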
configured") span.SetAttributes( attribute.Int("job.container"+strconv.Itoa(i)+".readiness_probes", len(readinessProbes)), attribute.Int("job.container"+strconv.Itoa(i)+".liveness_probes", len(livenessProbes)), + attribute.Int("job.container"+strconv.Itoa(i)+".startup_probes", len(startupProbes)), ) } } @@ -168,6 +169,7 @@ func (h *SidecarHandler) SubmitHandler(w http.ResponseWriter, r *http.Request) { isInitContainer: isInit, readinessProbes: readinessProbes, livenessProbes: livenessProbes, + startupProbes: startupProbes, containerImage: image, }) } diff --git a/pkg/slurm/Status.go b/pkg/slurm/Status.go index bea0187..8876468 100644 --- a/pkg/slurm/Status.go +++ b/pkg/slurm/Status.go @@ -212,7 +212,7 @@ func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) { } for _, ct := range pod.Spec.Containers { // Check probe status for container readiness - readinessCount, _, err := loadProbeMetadata(path, ct.Name) + readinessCount, _, _, err := loadProbeMetadata(path, ct.Name) isReady := true if err != nil { log.G(h.Ctx).Debug("Failed to load probe metadata for container ", ct.Name, ": ", err) @@ -287,7 +287,7 @@ func (h *SidecarHandler) StatusHandler(w http.ResponseWriter, r *http.Request) { } for _, ct := range pod.Spec.Containers { // Check probe status for container readiness - readinessCount, _, err := loadProbeMetadata(path, ct.Name) + readinessCount, _, _, err := loadProbeMetadata(path, ct.Name) isReady := true if err != nil { log.G(h.Ctx).Debug("Failed to load probe metadata for container ", ct.Name, ": ", err) diff --git a/pkg/slurm/prepare.go b/pkg/slurm/prepare.go index a8f67ea..da0b34b 100644 --- a/pkg/slurm/prepare.go +++ b/pkg/slurm/prepare.go @@ -836,15 +836,15 @@ highestExitCode=0 // Generate probe cleanup script first if any probes exist var hasProbes bool for _, containerCommand := range commands { - if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 { + if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 || len(containerCommand.startupProbes) > 0 { hasProbes = true break } } if hasProbes && config.EnableProbes { for _, containerCommand := range commands { - if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 { - cleanupScript := generateProbeCleanupScript(containerCommand.containerName, containerCommand.readinessProbes, containerCommand.livenessProbes) + if len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 || len(containerCommand.startupProbes) > 0 { + cleanupScript := generateProbeCleanupScript(containerCommand.containerName, containerCommand.readinessProbes, containerCommand.livenessProbes, containerCommand.startupProbes) stringToBeWritten.WriteString(cleanupScript) break // Only need one cleanup script } @@ -898,7 +898,7 @@ highestExitCode=0 } // Generate probe scripts if enabled and not an init container - if config.EnableProbes && !containerCommand.isInitContainer && (len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0) { + if config.EnableProbes && !containerCommand.isInitContainer && (len(containerCommand.readinessProbes) > 0 || len(containerCommand.livenessProbes) > 0 || len(containerCommand.startupProbes) > 0) { // Extract the image name from the singularity command var imageName string for i, arg := range containerCommand.runtimeCommand { @@ -922,12 +922,12 @@ highestExitCode=0 if imageName != "" { // Store probe metadata for status checking - err := 
diff --git a/pkg/slurm/probes.go b/pkg/slurm/probes.go
index c9d2b84..48ffc01 100644
--- a/pkg/slurm/probes.go
+++ b/pkg/slurm/probes.go
@@ -15,10 +15,19 @@ import (
 )
 
 // translateKubernetesProbes converts Kubernetes probe specifications to internal ProbeCommand format
-func translateKubernetesProbes(ctx context.Context, container v1.Container) ([]ProbeCommand, []ProbeCommand) {
-	var readinessProbes, livenessProbes []ProbeCommand
+func translateKubernetesProbes(ctx context.Context, container v1.Container) ([]ProbeCommand, []ProbeCommand, []ProbeCommand) {
+	var readinessProbes, livenessProbes, startupProbes []ProbeCommand
 	span := trace.SpanFromContext(ctx)
 
+	// Handle startup probe
+	if container.StartupProbe != nil {
+		probe := translateSingleProbe(ctx, container.StartupProbe)
+		if probe != nil {
+			startupProbes = append(startupProbes, *probe)
+			span.AddEvent("Translated startup probe for container " + container.Name)
+		}
+	}
+
 	// Handle readiness probe
 	if container.ReadinessProbe != nil {
 		probe := translateSingleProbe(ctx, container.ReadinessProbe)
@@ -37,7 +46,7 @@ func translateKubernetesProbes(ctx context.Context, container v1.Container) ([]P
 		}
 	}
 
-	return readinessProbes, livenessProbes
+	return readinessProbes, livenessProbes, startupProbes
 }
 
 // translateSingleProbe converts a single Kubernetes probe to internal format
@@ -103,11 +112,11 @@ func translateSingleProbe(ctx context.Context, k8sProbe *v1.Probe) *ProbeCommand
 }
 
 // generateProbeScript generates the shell script commands for executing probes
-func generateProbeScript(ctx context.Context, config SlurmConfig, containerName string, imageName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand) string {
+func generateProbeScript(ctx context.Context, config SlurmConfig, containerName string, imageName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand, startupProbes []ProbeCommand) string {
 	span := trace.SpanFromContext(ctx)
 	span.AddEvent("Generating probe script for container " + containerName)
 
-	if len(readinessProbes) == 0 && len(livenessProbes) == 0 {
+	if len(readinessProbes) == 0 && len(livenessProbes) == 0 && len(startupProbes) == 0 {
 		return ""
 	}
 
@@ -231,8 +240,134 @@ runProbe() {
     return 0
 }
 
+runStartupProbe() {
+    local probe_type="$1"
+    local container_name="$2"
+    local initial_delay="$3"
+    local period="$4"
+    local timeout="$5"
+    local success_threshold="$6"
+    local failure_threshold="$7"
+    local probe_name="$8"
+    local probe_index="$9"
+    shift 9
+    local probe_args=("$@")
+
+    local probe_status_file="${workingPath}/${probe_name}-probe-${container_name}-${probe_index}.status"
+
+    printf "%%s\n" "$(date -Is --utc) Starting ${probe_name} probe for container ${container_name}..."
+
+    # Initialize probe status as running
+    echo "RUNNING" > "$probe_status_file"
+
+    # Initial delay - startup probe waits before starting
+    if [ "$initial_delay" -gt 0 ]; then
+        printf "%%s\n" "$(date -Is --utc) Waiting ${initial_delay}s before starting ${probe_name} probe..."
+        sleep "$initial_delay"
+    fi
+
+    local consecutive_successes=0
+    local consecutive_failures=0
+
+    while true; do
+        if [ "$probe_type" = "http" ]; then
+            executeHTTPProbe "${probe_args[@]}" "$container_name"
+        elif [ "$probe_type" = "exec" ]; then
+            executeExecProbe "$timeout" "$container_name" "${probe_args[@]}"
+        fi
+
+        local exit_code=$?
+
+        if [ $exit_code -eq 0 ]; then
+            consecutive_successes=$((consecutive_successes + 1))
+            consecutive_failures=0
+            printf "%%s\n" "$(date -Is --utc) ${probe_name} probe succeeded for ${container_name} (${consecutive_successes}/${success_threshold})"
+
+            if [ $consecutive_successes -ge $success_threshold ]; then
+                printf "%%s\n" "$(date -Is --utc) ${probe_name} probe successful for ${container_name} - other probes can now start"
+                echo "SUCCESS" > "$probe_status_file"
+                return 0
+            fi
+        else
+            consecutive_failures=$((consecutive_failures + 1))
+            consecutive_successes=0
+            printf "%%s\n" "$(date -Is --utc) ${probe_name} probe failed for ${container_name} (${consecutive_failures}/${failure_threshold})"
+
+            if [ $consecutive_failures -ge $failure_threshold ]; then
+                printf "%%s\n" "$(date -Is --utc) ${probe_name} probe failed for ${container_name} after ${failure_threshold} attempts - container should be restarted" >&2
+                echo "FAILED_THRESHOLD" > "$probe_status_file"
+                return 1
+            fi
+        fi
+
+        sleep "$period"
+    done
+}
+
+waitForStartupProbes() {
+    local container_name="$1"
+    local startup_probe_count="$2"
+
+    if [ "$startup_probe_count" -eq 0 ]; then
+        return 0
+    fi
+
+    printf "%%s\n" "$(date -Is --utc) Waiting for startup probes to succeed before starting other probes for ${container_name}..."
+
+    while true; do
+        local all_startup_probes_successful=true
+
+        for i in $(seq 0 $((startup_probe_count - 1))); do
+            local probe_status_file="${workingPath}/startup-probe-${container_name}-${i}.status"
+            if [ ! -f "$probe_status_file" ]; then
+                all_startup_probes_successful=false
+                break
+            fi
+
+            local status=$(cat "$probe_status_file")
+            if [ "$status" != "SUCCESS" ]; then
+                if [ "$status" = "FAILED_THRESHOLD" ]; then
+                    printf "%%s\n" "$(date -Is --utc) Startup probe failed for ${container_name} - other probes will not start" >&2
+                    return 1
+                fi
+                all_startup_probes_successful=false
+                break
+            fi
+        done
+
+        if [ "$all_startup_probes_successful" = true ]; then
+            printf "%%s\n" "$(date -Is --utc) All startup probes successful for ${container_name} - other probes can now start"
+            return 0
+        fi
+
+        sleep 1
+    done
+}
+
 `)
 
+	// Generate startup probe calls - these run in background but block other probes
+	for i, probe := range startupProbes {
+		probeArgs := buildProbeArgs(probe)
+		containerVarName := strings.ReplaceAll(containerName, "-", "_")
+		scriptBuilder.WriteString(fmt.Sprintf(`
+# Startup probe %d for %s
+runStartupProbe "%s" "%s" %d %d %d %d %d "startup" %d %s &
+STARTUP_PROBE_%s_%d_PID=$!
+`, i, containerName, probe.Type, containerName, probe.InitialDelaySeconds, probe.PeriodSeconds,
+			probe.TimeoutSeconds, probe.SuccessThreshold, probe.FailureThreshold, i, probeArgs, containerVarName, i))
+	}
+
+	// Wait for startup probes before starting other probes
+	if len(startupProbes) > 0 {
+		scriptBuilder.WriteString(fmt.Sprintf(`
+# Wait for startup probes to complete before starting readiness/liveness probes
+(
+    waitForStartupProbes "%s" %d
+    if [ $? -eq 0 ]; then
+`, containerName, len(startupProbes)))
+	}
+
 	// Generate readiness probe calls
 	for i, probe := range readinessProbes {
 		probeArgs := buildProbeArgs(probe)
@@ -257,10 +392,19 @@ LIVENESS_PROBE_%s_%d_PID=$!
 			probe.TimeoutSeconds, probe.SuccessThreshold, probe.FailureThreshold, i, probeArgs, containerVarName, i))
 	}
 
+	// Close the startup probe conditional block
+	if len(startupProbes) > 0 {
+		scriptBuilder.WriteString(`
+    fi
+) &
+`)
+	}
+
 	span.SetAttributes(
 		attribute.String("probes.container.name", containerName),
 		attribute.Int("probes.readiness.count", len(readinessProbes)),
 		attribute.Int("probes.liveness.count", len(livenessProbes)),
+		attribute.Int("probes.startup.count", len(startupProbes)),
 	)
 
 	return scriptBuilder.String()
 }
-f "$probe_status_file" ]; then + all_startup_probes_successful=false + break + fi + + local status=$(cat "$probe_status_file") + if [ "$status" != "SUCCESS" ]; then + if [ "$status" = "FAILED_THRESHOLD" ]; then + printf "%%s\n" "$(date -Is --utc) Startup probe failed for ${container_name} - other probes will not start" >&2 + return 1 + fi + all_startup_probes_successful=false + break + fi + done + + if [ "$all_startup_probes_successful" = true ]; then + printf "%%s\n" "$(date -Is --utc) All startup probes successful for ${container_name} - other probes can now start" + return 0 + fi + + sleep 1 + done +} + `) + // Generate startup probe calls - these run in background but block other probes + for i, probe := range startupProbes { + probeArgs := buildProbeArgs(probe) + containerVarName := strings.ReplaceAll(containerName, "-", "_") + scriptBuilder.WriteString(fmt.Sprintf(` +# Startup probe %d for %s +runStartupProbe "%s" "%s" %d %d %d %d %d "startup" %d %s & +STARTUP_PROBE_%s_%d_PID=$! +`, i, containerName, probe.Type, containerName, probe.InitialDelaySeconds, probe.PeriodSeconds, + probe.TimeoutSeconds, probe.SuccessThreshold, probe.FailureThreshold, i, probeArgs, containerVarName, i)) + } + + // Wait for startup probes before starting other probes + if len(startupProbes) > 0 { + scriptBuilder.WriteString(fmt.Sprintf(` +# Wait for startup probes to complete before starting readiness/liveness probes +( + waitForStartupProbes "%s" %d + if [ $? -eq 0 ]; then +`, containerName, len(startupProbes))) + } + // Generate readiness probe calls for i, probe := range readinessProbes { probeArgs := buildProbeArgs(probe) @@ -257,10 +392,19 @@ LIVENESS_PROBE_%s_%d_PID=$! probe.TimeoutSeconds, probe.SuccessThreshold, probe.FailureThreshold, i, probeArgs, containerVarName, i)) } + // Close the startup probe conditional block + if len(startupProbes) > 0 { + scriptBuilder.WriteString(` + fi +) & +`) + } + span.SetAttributes( attribute.String("probes.container.name", containerName), attribute.Int("probes.readiness.count", len(readinessProbes)), attribute.Int("probes.liveness.count", len(livenessProbes)), + attribute.Int("probes.startup.count", len(startupProbes)), ) return scriptBuilder.String() @@ -287,8 +431,8 @@ func buildProbeArgs(probe ProbeCommand) string { } // generateProbeCleanupScript generates cleanup commands for probe processes -func generateProbeCleanupScript(containerName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand) string { - if len(readinessProbes) == 0 && len(livenessProbes) == 0 { +func generateProbeCleanupScript(containerName string, readinessProbes []ProbeCommand, livenessProbes []ProbeCommand, startupProbes []ProbeCommand) string { + if len(readinessProbes) == 0 && len(livenessProbes) == 0 && len(startupProbes) == 0 { return "" } @@ -300,7 +444,7 @@ cleanup_probes() { `) containerVarName := strings.ReplaceAll(containerName, "-", "_") - + // Kill readiness probes for i := range readinessProbes { scriptBuilder.WriteString(fmt.Sprintf(` if [ ! -z "$READINESS_PROBE_%s_%d_PID" ]; then @@ -317,6 +461,14 @@ cleanup_probes() { `, containerVarName, i, containerVarName, i)) } + // Kill startup probes + for i := range startupProbes { + scriptBuilder.WriteString(fmt.Sprintf(` if [ ! 
-z "$STARTUP_PROBE_%s_%d_PID" ]; then + kill $STARTUP_PROBE_%s_%d_PID 2>/dev/null || true + fi +`, containerVarName, i, containerVarName, i)) + } + scriptBuilder.WriteString(`} # Set up trap to cleanup probes on exit @@ -436,22 +588,22 @@ func checkContainerLiveness(ctx context.Context, config SlurmConfig, workingPath } // storeProbeMetadata saves probe count information for later status checking -func storeProbeMetadata(workingPath, containerName string, readinessProbeCount, livenessProbeCount int) error { +func storeProbeMetadata(workingPath, containerName string, readinessProbeCount, livenessProbeCount, startupProbeCount int) error { metadataFile := fmt.Sprintf("%s/probe-metadata-%s.txt", workingPath, containerName) - content := fmt.Sprintf("readiness:%d\nliveness:%d", readinessProbeCount, livenessProbeCount) + content := fmt.Sprintf("readiness:%d\nliveness:%d\nstartup:%d", readinessProbeCount, livenessProbeCount, startupProbeCount) return os.WriteFile(metadataFile, []byte(content), 0644) } // loadProbeMetadata loads probe count information for status checking -func loadProbeMetadata(workingPath, containerName string) (readinessCount, livenessCount int, err error) { +func loadProbeMetadata(workingPath, containerName string) (readinessCount, livenessCount, startupCount int, err error) { metadataFile := fmt.Sprintf("%s/probe-metadata-%s.txt", workingPath, containerName) content, err := os.ReadFile(metadataFile) if err != nil { if os.IsNotExist(err) { // No probe metadata file means no probes configured - return 0, 0, nil + return 0, 0, 0, nil } - return 0, 0, err + return 0, 0, 0, err } lines := strings.Split(string(content), "\n") @@ -471,8 +623,10 @@ func loadProbeMetadata(workingPath, containerName string) (readinessCount, liven readinessCount = count case "liveness": livenessCount = count + case "startup": + startupCount = count } } - return readinessCount, livenessCount, nil + return readinessCount, livenessCount, startupCount, nil } diff --git a/pkg/slurm/types.go b/pkg/slurm/types.go index 9ea3b2e..f27e8cc 100644 --- a/pkg/slurm/types.go +++ b/pkg/slurm/types.go @@ -74,4 +74,5 @@ type ContainerCommand struct { containerImage string readinessProbes []ProbeCommand livenessProbes []ProbeCommand + startupProbes []ProbeCommand } From fac72a0d7f2ca91416f1758163f539c0314f3164 Mon Sep 17 00:00:00 2001 From: Diego Ciangottini Date: Thu, 2 Oct 2025 19:54:01 +0200 Subject: [PATCH 2/2] fix: set default ContainerRuntime to singularity when not configured Fix issue where ContainerRuntime defaults to empty string when not specified in config file, causing plugin startup failure. Now properly defaults to 'singularity' for backward compatibility with configs from versions prior to 0.5.2. This allows users to upgrade without modifying their existing config files. Fixes #91 Signed-off-by: Diego Ciangottini --- pkg/slurm/func.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/slurm/func.go b/pkg/slurm/func.go index d8aa439..0aff21f 100644 --- a/pkg/slurm/func.go +++ b/pkg/slurm/func.go @@ -98,6 +98,12 @@ func NewSlurmConfig() (SlurmConfig, error) { SlurmConfigInst.Tsockspath = path } + + // Set default ContainerRuntime if not configured + if SlurmConfigInst.ContainerRuntime == "" { + SlurmConfigInst.ContainerRuntime = "singularity" + } + // Check if a supported container runtime is configured (supported: singularity, enroot) if SlurmConfigInst.ContainerRuntime != "singularity" && SlurmConfigInst.ContainerRuntime != "enroot" { err := fmt.Errorf("container runtime %q is not supported. 
 pkg/slurm/func.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pkg/slurm/func.go b/pkg/slurm/func.go
index d8aa439..0aff21f 100644
--- a/pkg/slurm/func.go
+++ b/pkg/slurm/func.go
@@ -98,6 +98,12 @@ func NewSlurmConfig() (SlurmConfig, error) {
 		SlurmConfigInst.Tsockspath = path
 	}
 
+
+	// Set default ContainerRuntime if not configured
+	if SlurmConfigInst.ContainerRuntime == "" {
+		SlurmConfigInst.ContainerRuntime = "singularity"
+	}
+
 	// Check if a supported container runtime is configured (supported: singularity, enroot)
 	if SlurmConfigInst.ContainerRuntime != "singularity" && SlurmConfigInst.ContainerRuntime != "enroot" {
 		err := fmt.Errorf("container runtime %q is not supported. Please configure a supported one (singularity, enroot)", SlurmConfigInst.ContainerRuntime)
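
Series note (illustrative, not part of either patch): the startup-probe
gating added in PATCH 1/2 can be smoke-checked by asserting that the
generated script embeds the new shell helpers. This hypothetical test
assumes ProbeCommand exposes at least the fields that diff references
(Type, InitialDelaySeconds, PeriodSeconds, TimeoutSeconds,
SuccessThreshold, FailureThreshold); the struct may well have more.

// probes_script_test.go (hypothetical, for illustration)
package slurm

import (
	"context"
	"strings"
	"testing"
)

func TestGenerateProbeScriptGatesOnStartup(t *testing.T) {
	startup := []ProbeCommand{{
		Type:                "http",
		InitialDelaySeconds: 300,
		PeriodSeconds:       10,
		TimeoutSeconds:      5,
		SuccessThreshold:    1,
		FailureThreshold:    3,
	}}

	script := generateProbeScript(context.Background(), SlurmConfig{}, "app-container", "nginx:alpine", nil, nil, startup)

	// With a startup probe present, the script should background
	// runStartupProbe and gate later probes behind waitForStartupProbes.
	for _, want := range []string{"runStartupProbe", "waitForStartupProbes"} {
		if !strings.Contains(script, want) {
			t.Fatalf("generated probe script is missing %q", want)
		}
	}
}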