Skip to content

Commit c8420e5

Browse files
committed
adding efficient polling to waitForStepsToFinish
The current waitForStepsToFinish implementation is a classic busy-wait. It checks for file existence without any sleep, resulting in a high CPU usage. Adding a profile with a unit test to show that almost all time is spent in system calls with a high total sample count. This led to execssive CPU usage by the sidecar even when just waiting. The function now sleeps 100ms between checks, drastically reducing the frequency. The sidecar now uses minimal CPU while waiting. Signed-off-by: Priti Desai <pdesai@us.ibm.com>
1 parent d504890 commit c8420e5

File tree

6 files changed

+117
-5
lines changed

6 files changed

+117
-5
lines changed

config/config-defaults.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,3 +149,8 @@ data:
149149
# limits:
150150
# memory: "256Mi"
151151
# cpu: "500m"
152+
153+
# sidecar-log-polling-interval specifies the polling interval for the Tekton sidecar log results container.
154+
# This controls how frequently the sidecar checks for step completion files.
155+
# Example values: "100ms", "500ms", "1s"
156+
sidecar-log-polling-interval: "100ms"

internal/sidecarlogresults/sidecarlogresults.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"os"
2727
"path/filepath"
2828
"strings"
29+
"time"
2930

3031
"github.com/tektoncd/pipeline/pkg/apis/config"
3132
"github.com/tektoncd/pipeline/pkg/apis/pipeline"
@@ -74,7 +75,7 @@ func encode(w io.Writer, v any) error {
7475
return json.NewEncoder(w).Encode(v)
7576
}
7677

77-
func waitForStepsToFinish(runDir string) error {
78+
func waitForStepsToFinish(runDir string, sleepInterval time.Duration) error {
7879
steps := make(map[string]bool)
7980
files, err := os.ReadDir(runDir)
8081
if err != nil {
@@ -103,6 +104,9 @@ func waitForStepsToFinish(runDir string) error {
103104
return err
104105
}
105106
}
107+
if sleepInterval > 0 {
108+
time.Sleep(sleepInterval)
109+
}
106110
}
107111
return nil
108112
}
@@ -143,7 +147,15 @@ func readResults(resultsDir, resultFile, stepName string, resultType SidecarLogR
143147
// in their results path and prints them in a structured way to its
144148
// stdout so that the reconciler can parse those logs.
145149
func LookForResults(w io.Writer, runDir string, resultsDir string, resultNames []string, stepResultsDir string, stepResults map[string][]string) error {
146-
if err := waitForStepsToFinish(runDir); err != nil {
150+
intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
151+
if intervalStr == "" {
152+
intervalStr = "100ms"
153+
}
154+
interval, err := time.ParseDuration(intervalStr)
155+
if err != nil {
156+
interval = 100 * time.Millisecond
157+
}
158+
if err := waitForStepsToFinish(runDir, interval); err != nil {
147159
return fmt.Errorf("error while waiting for the steps to finish %w", err)
148160
}
149161
results := make(chan SidecarLogResult)
@@ -205,7 +217,15 @@ func LookForResults(w io.Writer, runDir string, resultsDir string, resultNames [
205217
// If the provenance file exists, the function extracts artifact information, formats it into a
206218
// JSON string, and encodes it for output alongside relevant metadata (step name, artifact type).
207219
func LookForArtifacts(w io.Writer, names []string, runDir string) error {
208-
if err := waitForStepsToFinish(runDir); err != nil {
220+
intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
221+
if intervalStr == "" {
222+
intervalStr = "100ms"
223+
}
224+
interval, err := time.ParseDuration(intervalStr)
225+
if err != nil {
226+
interval = 100 * time.Millisecond
227+
}
228+
if err := waitForStepsToFinish(runDir, interval); err != nil {
209229
return err
210230
}
211231

internal/sidecarlogresults/sidecarlogresults_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ import (
2323
"fmt"
2424
"os"
2525
"path/filepath"
26+
"runtime/pprof"
2627
"sort"
2728
"strings"
2829
"testing"
30+
"time"
2931

3032
"github.com/google/go-cmp/cmp"
3133
v1 "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1"
@@ -608,6 +610,66 @@ func TestExtractStepAndResultFromSidecarResultName_Error(t *testing.T) {
608610
}
609611
}
610612

613+
// TestWaitForStepsToFinish_Profile ensures that waitForStepsToFinish correctly waits for all step output files to appear before returning
614+
// The test creates a file called cpu.prof and starts Go's CPU profiler
615+
// A temporary directory is created to simulate the Tekton step run directory.
616+
// The test creates a large number of subdirectories e.g. step0, step1, ..., each representing a step in a TaskRun
617+
// A goroutine is started that, one by one, writes an out file in each step directory, with a small delay between each
618+
// The test calls the function and waits for it to complete and the profile is saved for later analysis
619+
// This is helpful to compare the impact of code changes, provides a reproducible way to profile and optimize the function waitForStepsToFinish
620+
func TestWaitForStepsToFinish_Profile(t *testing.T) {
621+
f, err := os.Create("cpu.prof")
622+
if err != nil {
623+
t.Fatalf("could not create CPU profile: %v", err)
624+
}
625+
defer func(f *os.File) {
626+
err := f.Close()
627+
if err != nil {
628+
return
629+
}
630+
}(f)
631+
err = pprof.StartCPUProfile(f)
632+
if err != nil {
633+
return
634+
}
635+
defer pprof.StopCPUProfile()
636+
637+
// Setup: create a temp runDir with many fake step files
638+
runDir := t.TempDir()
639+
stepCount := 100
640+
for i := range stepCount {
641+
dir := filepath.Join(runDir, fmt.Sprintf("step%d", i))
642+
err := os.MkdirAll(dir, 0755)
643+
if err != nil {
644+
return
645+
}
646+
}
647+
648+
// Simulate steps finishing one by one with a delay
649+
go func() {
650+
for i := range stepCount {
651+
file := filepath.Join(runDir, fmt.Sprintf("step%d", i), "out")
652+
err := os.WriteFile(file, []byte("done"), 0644)
653+
if err != nil {
654+
return
655+
}
656+
time.Sleep(10 * time.Millisecond)
657+
}
658+
}()
659+
660+
intervalStr := os.Getenv("SIDECAR_LOG_POLLING_INTERVAL")
661+
if intervalStr == "" {
662+
intervalStr = "100ms"
663+
}
664+
interval, err := time.ParseDuration(intervalStr)
665+
if err != nil {
666+
interval = 100 * time.Millisecond
667+
}
668+
if err := waitForStepsToFinish(runDir, interval); err != nil {
669+
t.Fatalf("waitForStepsToFinish failed: %v", err)
670+
}
671+
}
672+
611673
func TestLookForArtifacts(t *testing.T) {
612674
base := basicArtifacts()
613675
modified := base.DeepCopy()

pkg/apis/config/default.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ const (
6767
defaultContainerResourceRequirementsKey = "default-container-resource-requirements"
6868
defaultImagePullBackOffTimeout = "default-imagepullbackoff-timeout"
6969
defaultMaximumResolutionTimeout = "default-maximum-resolution-timeout"
70+
// Add the new key for sidecar log polling interval
71+
sidecarLogPollingIntervalKey = "sidecar-log-polling-interval"
7072
)
7173

7274
// DefaultConfig holds all the default configurations for the config.
@@ -88,6 +90,7 @@ type Defaults struct {
8890
DefaultContainerResourceRequirements map[string]corev1.ResourceRequirements
8991
DefaultImagePullBackOffTimeout time.Duration
9092
DefaultMaximumResolutionTimeout time.Duration
93+
SidecarLogPollingInterval time.Duration
9194
}
9295

9396
// GetDefaultsConfigName returns the name of the configmap containing all
@@ -220,6 +223,14 @@ func NewDefaultsFromMap(cfgMap map[string]string) (*Defaults, error) {
220223
tc.DefaultMaximumResolutionTimeout = timeout
221224
}
222225

226+
if defaultSidecarPollingInterval, ok := cfgMap[sidecarLogPollingIntervalKey]; ok {
227+
interval, err := time.ParseDuration(defaultSidecarPollingInterval)
228+
if err != nil {
229+
return nil, fmt.Errorf("failed parsing default config %q", defaultSidecarPollingInterval)
230+
}
231+
tc.SidecarLogPollingInterval = interval
232+
}
233+
223234
return &tc, nil
224235
}
225236

pkg/pod/pod.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"path/filepath"
2626
"strconv"
2727
"strings"
28+
"time"
2829

2930
"github.com/tektoncd/pipeline/internal/artifactref"
3031
"github.com/tektoncd/pipeline/pkg/apis/config"
@@ -203,10 +204,11 @@ func (b *Builder) Build(ctx context.Context, taskRun *v1.TaskRun, taskSpec v1.Ta
203204
}
204205

205206
windows := usesWindows(taskRun)
207+
pollingInterval := config.FromContextOrDefaults(ctx).Defaults.SidecarLogPollingInterval
206208
if sidecarLogsResultsEnabled {
207209
if taskSpec.Results != nil || artifactsPathReferenced(steps) {
208210
// create a results sidecar
209-
resultsSidecar, err := createResultsSidecar(taskSpec, b.Images.SidecarLogResultsImage, securityContextConfig, windows)
211+
resultsSidecar, err := createResultsSidecar(taskSpec, b.Images.SidecarLogResultsImage, securityContextConfig, windows, pollingInterval)
210212
if err != nil {
211213
return nil, err
212214
}
@@ -618,7 +620,7 @@ func entrypointInitContainer(image string, steps []v1.Step, securityContext Secu
618620
// whether it will run on a windows node, and whether the sidecar should include a security context
619621
// that will allow it to run in namespaces with "restricted" pod security admission.
620622
// It will also provide arguments to the binary that allow it to surface the step results.
621-
func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext SecurityContextConfig, windows bool) (v1.Sidecar, error) {
623+
func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext SecurityContextConfig, windows bool, pollingInterval time.Duration) (v1.Sidecar, error) {
622624
names := make([]string, 0, len(taskSpec.Results))
623625
for _, r := range taskSpec.Results {
624626
names = append(names, r.Name)
@@ -660,6 +662,12 @@ func createResultsSidecar(taskSpec v1.TaskSpec, image string, securityContext Se
660662
Name: pipeline.ReservedResultsSidecarName,
661663
Image: image,
662664
Command: command,
665+
Env: []corev1.EnvVar{
666+
{
667+
Name: "SIDECAR_LOG_POLLING_INTERVAL",
668+
Value: pollingInterval.String(),
669+
},
670+
},
663671
}
664672

665673
if securityContext.SetSecurityContext {

pkg/pod/pod_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2006,6 +2006,7 @@ _EOF_
20062006
{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
20072007
{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
20082008
}, implicitVolumeMounts...),
2009+
Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
20092010
}},
20102011
Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
20112012
Name: "tekton-creds-init-home-0",
@@ -2087,6 +2088,7 @@ _EOF_
20872088
{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
20882089
{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
20892090
}, implicitVolumeMounts...),
2091+
Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
20902092
}},
20912093
Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
20922094
Name: "tekton-creds-init-home-0",
@@ -2163,6 +2165,7 @@ _EOF_
21632165
{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
21642166
}, implicitVolumeMounts...),
21652167
SecurityContext: SecurityContextConfig{SetSecurityContext: true, SetReadOnlyRootFilesystem: true}.GetSecurityContext(false),
2168+
Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
21662169
}},
21672170
Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
21682171
Name: "tekton-creds-init-home-0",
@@ -2241,6 +2244,7 @@ _EOF_
22412244
{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
22422245
{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
22432246
}, implicitVolumeMounts...),
2247+
Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
22442248
}},
22452249
Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
22462250
Name: "tekton-creds-init-home-0",
@@ -2325,6 +2329,7 @@ _EOF_
23252329
{Name: "tekton-internal-bin", ReadOnly: true, MountPath: "/tekton/bin"},
23262330
{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
23272331
}, implicitVolumeMounts...),
2332+
Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
23282333
}},
23292334
Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
23302335
Name: "tekton-creds-init-home-0",
@@ -2404,6 +2409,7 @@ _EOF_
24042409
{Name: "tekton-internal-run-0", ReadOnly: true, MountPath: "/tekton/run/0"},
24052410
}, implicitVolumeMounts...),
24062411
SecurityContext: SecurityContextConfig{SetSecurityContext: true, SetReadOnlyRootFilesystem: true}.GetSecurityContext(false),
2412+
Env: []corev1.EnvVar{{Name: "SIDECAR_LOG_POLLING_INTERVAL", Value: "0s"}},
24072413
}},
24082414
Volumes: append(implicitVolumes, binVolume, runVolume(0), downwardVolume, corev1.Volume{
24092415
Name: "tekton-creds-init-home-0",

0 commit comments

Comments
 (0)