Skip to content

Commit 2e93482

Browse files
authored
Fix recycling failures due to PSI check (#10558)
Temporary fix for a large number of recycling failures that happen due to this PSI check (we get lots of warnings like "Discarding CPU PSI stats: full-stall duration 19.742ms exceeds execution duration 5.920766ms", and both the full-stall duration and execution duration are very small values). More context in the implementation comments.
1 parent e332c78 commit 2e93482

File tree

1 file changed

+14
-9
lines changed
  • enterprise/server/remote_execution/runner

1 file changed

+14
-9
lines changed

enterprise/server/remote_execution/runner/runner.go

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -304,28 +304,33 @@ func (r *taskRunner) Run(ctx context.Context, ioStats *repb.IOStats) (res *inter
304304
start := time.Now()
305305
defer func() {
306306
// Discard nonsensical PSI full-stall durations which are greater
307-
// than the execution duration.
307+
// than the execution duration by a significant amount.
308308
// See https://bugzilla.kernel.org/show_bug.cgi?id=219194
309-
// TL;DR: very rarely, the total stall duration is reported as a number
310-
// which is much larger than the actual execution duration, and is
311-
// sometimes exactly equal to UINT32_MAX nanoseconds, which is
312-
// suspicious and suggests there is a bug in the way this number is
313-
// reported.
309+
// TL;DR: very rarely, a kernel bug causes UINT32_MAX to be reported
310+
// instead of the actual value.
314311
// Also, skip recycling in this case, because the nonsensical result
315312
// will persist across tasks.
316313
runDuration := time.Since(start)
317314
stats := res.UsageStats
318-
if cpuStallDuration := time.Duration(stats.GetCpuPressure().GetFull().GetTotal()) * time.Microsecond; cpuStallDuration > runDuration {
315+
const psiCheckDurationThreshold = 1 * time.Second
316+
// TODO(bduffany): remove this durationThreshold. This is needed because
317+
// we technically track stats for slightly longer than the execution
318+
// stage, because we reset the stats baseline relative to the last
319+
// measurement, not the current value at the start of the execution. We
320+
// should fix TrackExecution to take an initial baseline measurement
321+
// instead, but need to fix some container implementations to support
322+
// it. Podman in particular runs into a deadlock if we try to do this.
323+
if cpuStallDuration := time.Duration(stats.GetCpuPressure().GetFull().GetTotal()) * time.Microsecond; cpuStallDuration > runDuration && cpuStallDuration > psiCheckDurationThreshold {
319324
log.CtxWarningf(ctx, "Discarding CPU PSI stats: full-stall duration %s exceeds execution duration %s", cpuStallDuration, runDuration)
320325
stats.CpuPressure = nil
321326
res.DoNotRecycle = true
322327
}
323-
if memStallDuration := time.Duration(stats.GetMemoryPressure().GetFull().GetTotal()) * time.Microsecond; memStallDuration > runDuration {
328+
if memStallDuration := time.Duration(stats.GetMemoryPressure().GetFull().GetTotal()) * time.Microsecond; memStallDuration > runDuration && memStallDuration > psiCheckDurationThreshold {
324329
log.CtxWarningf(ctx, "Discarding memory PSI stats: full-stall duration %s exceeds execution duration %s", memStallDuration, runDuration)
325330
stats.MemoryPressure = nil
326331
res.DoNotRecycle = true
327332
}
328-
if ioStallDuration := time.Duration(stats.GetIoPressure().GetFull().GetTotal()) * time.Microsecond; ioStallDuration > runDuration {
333+
if ioStallDuration := time.Duration(stats.GetIoPressure().GetFull().GetTotal()) * time.Microsecond; ioStallDuration > runDuration && ioStallDuration > psiCheckDurationThreshold {
329334
log.CtxWarningf(ctx, "Discarding IO PSI stats: full-stall duration %s exceeds execution duration %s", ioStallDuration, runDuration)
330335
stats.IoPressure = nil
331336
res.DoNotRecycle = true

0 commit comments

Comments
 (0)