@@ -304,28 +304,33 @@ func (r *taskRunner) Run(ctx context.Context, ioStats *repb.IOStats) (res *inter
304304 start := time .Now ()
305305 defer func () {
306306 // Discard nonsensical PSI full-stall durations which are greater
307- // than the execution duration.
307+ // than the execution duration by a significant amount .
308308 // See https://bugzilla.kernel.org/show_bug.cgi?id=219194
309- // TL;DR: very rarely, the total stall duration is reported as a number
310- // which is much larger than the actual execution duration, and is
311- // sometimes exactly equal to UINT32_MAX nanoseconds, which is
312- // suspicious and suggests there is a bug in the way this number is
313- // reported.
309+ // TL;DR: very rarely, a kernel bug causes UINT32_MAX to be reported
310+ // instead of the actual value.
314311 // Also, skip recycling in this case, because the nonsensical result
315312 // will persist across tasks.
316313 runDuration := time .Since (start )
317314 stats := res .UsageStats
318- if cpuStallDuration := time .Duration (stats .GetCpuPressure ().GetFull ().GetTotal ()) * time .Microsecond ; cpuStallDuration > runDuration {
315+ const psiCheckDurationThreshold = 1 * time .Second
316+ // TODO(bduffany): remove this durationThreshold. This is needed because
317+ // we technically track stats for slightly longer than the execution
318+ // stage, because we reset the stats baseline relative to the last
319+ // measurement, not the current value at the start of the execution. We
320+ // should fix TrackExecution to take an initial baseline measurement
321+ // instead, but need to fix some container implementations to support
322+ // it. Podman in particular runs into a deadlock if we try to do this.
323+ if cpuStallDuration := time .Duration (stats .GetCpuPressure ().GetFull ().GetTotal ()) * time .Microsecond ; cpuStallDuration > runDuration && cpuStallDuration > psiCheckDurationThreshold {
319324 log .CtxWarningf (ctx , "Discarding CPU PSI stats: full-stall duration %s exceeds execution duration %s" , cpuStallDuration , runDuration )
320325 stats .CpuPressure = nil
321326 res .DoNotRecycle = true
322327 }
323- if memStallDuration := time .Duration (stats .GetMemoryPressure ().GetFull ().GetTotal ()) * time .Microsecond ; memStallDuration > runDuration {
328+ if memStallDuration := time .Duration (stats .GetMemoryPressure ().GetFull ().GetTotal ()) * time .Microsecond ; memStallDuration > runDuration && memStallDuration > psiCheckDurationThreshold {
324329 log .CtxWarningf (ctx , "Discarding memory PSI stats: full-stall duration %s exceeds execution duration %s" , memStallDuration , runDuration )
325330 stats .MemoryPressure = nil
326331 res .DoNotRecycle = true
327332 }
328- if ioStallDuration := time .Duration (stats .GetIoPressure ().GetFull ().GetTotal ()) * time .Microsecond ; ioStallDuration > runDuration {
333+ if ioStallDuration := time .Duration (stats .GetIoPressure ().GetFull ().GetTotal ()) * time .Microsecond ; ioStallDuration > runDuration && ioStallDuration > psiCheckDurationThreshold {
329334 log .CtxWarningf (ctx , "Discarding IO PSI stats: full-stall duration %s exceeds execution duration %s" , ioStallDuration , runDuration )
330335 stats .IoPressure = nil
331336 res .DoNotRecycle = true
0 commit comments