adding efficient polling to waitForStepsToFinish

pritidesai · pritidesai · commit c8420e5994bd · 2025-07-22T14:09:30.000-07:00
The current waitForStepsToFinish implementation is a classic busy-wait.
It checks for file existence without any sleep, resulting in high
CPU usage. A unit test that captures a CPU profile is added; the profile
shows that almost all of the time is spent in system calls, with a high
total sample count. This led to excessive CPU usage by the sidecar even
when it was just waiting.

The function now sleeps 100ms between checks, drastically reducing the
polling frequency. The sidecar now uses minimal CPU while waiting.

Signed-off-by: Priti Desai <pdesai@us.ibm.com>
diff --git a/config/config-defaults.yaml b/config/config-defaults.yaml
@@ -149,3 +149,8 @@ data:
     #     limits:
     #       memory: "256Mi"
     #       cpu: "500m"
+
+    # sidecar-log-polling-interval specifies the polling interval for the Tekton sidecar log results container.
+    # This controls how frequently the sidecar checks for step completion files.
+    # Example values: "100ms", "500ms", "1s"
+    sidecar-log-polling-interval: "100ms"
diff --git a/internal/sidecarlogresults/sidecarlogresults.go b/internal/sidecarlogresults/sidecarlogresults.go
@@ -26,6 +26,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"time"
 
 	"github.com/tektoncd/pipeline/pkg/apis/config"
 	"github.com/tektoncd/pipeline/pkg/apis/pipeline"
@@ -74,7 +75,7 @@ func encode(w io.Writer, v any) error {
 	return json.NewEncoder(w).Encode(v)
 }
 
-func waitForStepsToFinish(runDir string) error {
+func waitForStepsToFinish(runDir string, sleepInterval time.Duration) error {
 	steps := make(map[string]bool)
 	files, err := os.ReadDir(runDir)
 	if err != nil {
@@ -103,6 +104,9 @@ func waitForStepsToFinish(runDir string) error {
 				return err
 			}
 		}
+		if sleepInterval > 0 {
+			time.Sleep(sleepInterval)
+		}
 	}
 	return nil
 }
@@ -143,7 +147,15 @@ func readResults(resultsDir, resultFile, stepName string, resultType SidecarLogR
 // in their results path and prints them in a structured way to its
 // stdout so that the reconciler can parse those logs.
 func LookForResults(w io.Writer, runDir string, resultsDir string, resultNames []string, stepResultsDir string, stepResults map[string][]string) error {
-	if err := waitForStepsToFinish(runDir); err != nil {
+	intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
+	if intervalStr == "" {
+		intervalStr = "100ms"
+	}
+	interval, err := time.ParseDuration(intervalStr)
+	if err != nil {
+		interval = 100 * time.Millisecond
+	}
+	if err := waitForStepsToFinish(runDir, interval); err != nil {
 		return fmt.Errorf("error while waiting for the steps to finish  %w", err)
 	}
 	results := make(chan SidecarLogResult)
@@ -205,7 +217,15 @@ func LookForResults(w io.Writer, runDir string, resultsDir string, resultNames [
 // If the provenance file exists, the function extracts artifact information, formats it into a
 // JSON string, and encodes it for output alongside relevant metadata (step name, artifact type).
 func LookForArtifacts(w io.Writer, names []string, runDir string) error {
-	if err := waitForStepsToFinish(runDir); err != nil {
+	intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
+	if intervalStr == "" {
+		intervalStr = "100ms"
+	}
+	interval, err := time.ParseDuration(intervalStr)
+	if err != nil {
+		interval = 100 * time.Millisecond
+	}
+	if err := waitForStepsToFinish(runDir, interval); err != nil {
 		return err
 	}
 
diff --git a/internal/sidecarlogresults/sidecarlogresults_test.go b/internal/sidecarlogresults/sidecarlogresults_test.go
@@ -23,9 +23,11 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"runtime/pprof"
 	"sort"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/google/go-cmp/cmp"
 	v1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1"
@@ -608,6 +610,66 @@ func TestExtractStepAndResultFromSidecarResultName_Error(t *testing.T) {
 	}
 }
 
+// TestWaitForStepsToFinish_Profile ensures that waitForStepsToFinish waits for all step output files to appear before returning,
+// and records a CPU profile of that wait.
+// The test creates a file called cpu.prof in the current working directory and starts Go's CPU profiler, writing to that file.
+// A temporary directory is created to simulate the Tekton step run directory.
+// The test creates a large number of subdirectories (step0, step1, ...), each representing a step in a TaskRun.
+// A goroutine is started that, one by one, writes an "out" file in each step directory, with a small delay between writes.
+// The test calls waitForStepsToFinish and waits for it to complete; the profile is saved for later analysis.
+// This provides a reproducible way to profile and optimize waitForStepsToFinish and to compare the impact of code changes.
+func TestWaitForStepsToFinish_Profile(t *testing.T) {
+	f, err := os.Create("cpu.prof")
+	if err != nil {
+		t.Fatalf("could not create CPU profile: %v", err)
+	}
+	defer func(f *os.File) {
+		err := f.Close()
+		if err != nil {
+			return
+		}
+	}(f)
+	err = pprof.StartCPUProfile(f)
+	if err != nil {
+		return
+	}
+	defer pprof.StopCPUProfile()
+
+	// Setup: create a temp runDir with many fake step files
+	runDir := t.TempDir()
+	stepCount := 100
+	for i := range stepCount {
+		dir := filepath.Join(runDir, fmt.Sprintf("step%d", i))
+		err := os.MkdirAll(dir, 0755)
+		if err != nil {
+			return
+		}
+	}
+
+	// Simulate steps finishing one by one with a delay
+	go func() {
+		for i := range stepCount {
+			file := filepath.Join(runDir, fmt.Sprintf("step%d", i), "out")
+			err := os.WriteFile(file, []byte("done"), 0644)
+			if err != nil {
+				return
+			}
+			time.Sleep(10 * time.Millisecond)
+		}
+	}()
+
+	intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
+	if intervalStr == "" {
+		intervalStr = "100ms"
+	}
+	interval, err := time.ParseDuration(intervalStr)
+	if err != nil {
+		interval = 100 * time.Millisecond
+	}
+	if err := waitForStepsToFinish(runDir, interval); err != nil {
+		t.Fatalf("waitForStepsToFinish failed: %v", err)
+	}
+}
+
 func TestLookForArtifacts(t *testing.T) {
 	base := basicArtifacts()
 	modified := base.DeepCopy()
diff --git a/pkg/apis/config/default.go b/pkg/apis/config/default.go
@@ -67,6 +67,8 @@ const (
 	defaultContainerResourceRequirementsKey = "default-container-resource-requirements"
 	defaultImagePullBackOffTimeout          = "default-imagepullbackoff-timeout"
 	defaultMaximumResolutionTimeout         = "default-maximum-resolution-timeout"
+	// sidecarLogPollingIntervalKey is the config-defaults key for the sidecar log results polling interval.
+	sidecarLogPollingIntervalKey = "sidecar-log-polling-interval"
 )
 
 // DefaultConfig holds all the default configurations for the config.
@@ -88,6 +90,7 @@ type Defaults struct {
 	DefaultContainerResourceRequirements map[string]corev1.ResourceRequirements
 	DefaultImagePullBackOffTimeout       time.Duration
 	DefaultMaximumResolutionTimeout      time.Duration
+	SidecarLogPollingInterval            time.Duration
 }
 
 // GetDefaultsConfigName returns the name of the configmap containing all
@@ -220,6 +223,14 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
 		tc.DefaultMaximumResolutionTimeout = timeout
 	}
 
+	if defaultSidecarPollingInterval, ok := cfgMap[sidecarLogPollingIntervalKey]; ok {
+		interval, err := time.ParseDuration(defaultSidecarPollingInterval)
+		if err != nil {
+			return nil, fmt.Errorf("failed parsing default config %q", defaultSidecarPollingInterval)
+		}
+		tc.SidecarLogPollingInterval = interval
+	}
+
 	return &tc, nil
 }
 
diff --git a/pkg/pod/pod.go b/pkg/pod/pod.go
@@ -25,6 +25,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/tektoncd/pipeline/internal/artifactref"
 	"github.com/tektoncd/pipeline/pkg/apis/config"
@@ -203,10 +204,11 @@ func (b *Builder) Build(ctx context.Context, taskRun *v1.TaskRun, taskSpec v1.Ta
 	}
 
 	windows := usesWindows(taskRun)
+	pollingInterval := config.FromContextOrDefaults(ctx).Defaults.SidecarLogPollingInterval
 	if sidecarLogsResultsEnabled {
 		if taskSpec.Results != nil || artifactsPathReferenced(steps) {
 			// create a results sidecar
-			resultsSidecar, err := createResultsSidecar(taskSpec, b.Images.SidecarLogResultsImage, securityContextConfig, windows)
+			resultsSidecar, err := createResultsSidecar(taskSpec, b.Images.SidecarLogResultsImage, securityContextConfig, windows, pollingInterval)
 			if err != nil {
 				return nil, err
 			}
@@ -618,7 +620,7 @@ func entrypointInitContainer(image string, steps []v1.Step, securityContext Secu
 // whether it will run on a windows node, and whether the sidecar should include a security context
 // that will allow it to run in namespaces with "restricted" pod security admission.
 // It will also provide arguments to the binary that allow it to surface the step results.
-func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext SecurityContextConfig, windows bool) (v1.Sidecar, error) {
+func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext SecurityContextConfig, windows bool, pollingInterval time.Duration) (v1.Sidecar, error) {
 	names := make([]string, 0, len(taskSpec.Results))
 	for _, r := range taskSpec.Results {
 		names = append(names, r.Name)
@@ -660,6 +662,12 @@ func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext Se
 		Name:    pipeline.ReservedResultsSidecarName,
 		Image:   image,
 		Command: command,
+		Env: []corev1.EnvVar{
+			{
+				Name:  "SIDECAR_LOG_POLLING_INTERVAL",
+				Value: pollingInterval.String(),
+			},
+		},
 	}
 
 	if securityContext.SetSecurityContext {
diff --git a/pkg/pod/pod_test.go b/pkg/pod/pod_test.go
@@ -2006,6 +2006,7 @@ _EOF_
 						{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
 						{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
 					}, implicitVolumeMounts...),
+					Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
 				}},
 				Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
 					Name:         "tekton-creds-init-home-0",
@@ -2087,6 +2088,7 @@ _EOF_
 						{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
 						{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
 					}, implicitVolumeMounts...),
+					Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
 				}},
 				Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
 					Name:         "tekton-creds-init-home-0",
@@ -2163,6 +2165,7 @@ _EOF_
 						{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
 					}, implicitVolumeMounts...),
 					SecurityContext: SecurityContextConfig{SetSecurityContext: true, SetReadOnlyRootFilesystem: true}.GetSecurityContext(false),
+					Env:             []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
 				}},
 				Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
 					Name:         "tekton-creds-init-home-0",
@@ -2241,6 +2244,7 @@ _EOF_
 						{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
 						{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
 					}, implicitVolumeMounts...),
+					Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
 				}},
 				Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
 					Name:         "tekton-creds-init-home-0",
@@ -2325,6 +2329,7 @@ _EOF_
 						{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
 						{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
 					}, implicitVolumeMounts...),
+					Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
 				}},
 				Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
 					Name:         "tekton-creds-init-home-0",
@@ -2404,6 +2409,7 @@ _EOF_
 						{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
 					}, implicitVolumeMounts...),
 					SecurityContext: SecurityContextConfig{SetSecurityContext: true, SetReadOnlyRootFilesystem: true}.GetSecurityContext(false),
+					Env:             []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
 				}},
 				Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
 					Name:         "tekton-creds-init-home-0",