Skip to content

Commit 2ebc900

Browse files
committed
Ignore spurious warnings when killing containers
OCI runtimes log certain messages when asked to kill a container that has already stopped, and we really do not care about those messages when stopping a container. Due to our architecture, there are inherent races around stopping containers, so we cannot guarantee that *we* are the ones to kill it - but that doesn't matter, because Podman only cares that the container has stopped, not who delivered the fatal signal. Unfortunately, the OCI runtimes don't understand this, and log various warning messages when the `kill` command is invoked on a container that was already dead. These cause our tests to fail, as we now check for clean STDERR when running Podman. To work around this, capture the OCI runtime's STDERR in a buffer, only when stopping containers, then go through it and discard any of the warnings we identified as spurious. Signed-off-by: Matthew Heon <matthew.heon@pm.me>
1 parent f1ecdca commit 2ebc900

File tree

1 file changed

+76
-30
lines changed

1 file changed

+76
-30
lines changed

libpod/oci_conmon_common.go

Lines changed: 76 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -356,10 +356,20 @@ func generateResourceFile(res *spec.LinuxResources) (string, []string, error) {
356356
// If all is set, send to all PIDs in the container.
357357
// All is only supported if the container created cgroups.
358358
func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) error {
359+
if _, err := r.killContainer(ctr, signal, all, false); err != nil {
360+
return err
361+
}
362+
363+
return nil
364+
}
365+
366+
// If captureStderr is requested, OCI runtime STDERR will be captured as a
367+
// *bytes.Buffer and returned; otherwise, it is sent to os.Stderr and nil is returned.
368+
func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captureStderr bool) (*bytes.Buffer, error) {
359369
logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID())
360370
runtimeDir, err := util.GetRuntimeDir()
361371
if err != nil {
362-
return err
372+
return nil, err
363373
}
364374
env := []string{fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)}
365375
var args []string
@@ -369,19 +379,27 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool)
369379
} else {
370380
args = append(args, "kill", ctr.ID(), fmt.Sprintf("%d", signal))
371381
}
372-
if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, os.Stderr, env, r.path, args...); err != nil {
382+
var (
383+
stderr io.Writer = os.Stderr
384+
stderrBuffer *bytes.Buffer
385+
)
386+
if captureStderr {
387+
stderrBuffer = new(bytes.Buffer)
388+
stderr = stderrBuffer
389+
}
390+
if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, stderr, env, r.path, args...); err != nil {
373391
// Update container state - there's a chance we failed because
374392
// the container exited in the meantime.
375393
if err2 := r.UpdateContainerStatus(ctr); err2 != nil {
376394
logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2)
377395
}
378396
if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
379-
return fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State)
397+
return stderrBuffer, fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State)
380398
}
381-
return fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err)
399+
return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err)
382400
}
383401

384-
return nil
402+
return stderrBuffer, nil
385403
}
386404

387405
// StopContainer stops a container, first using its given stop signal (or
@@ -400,23 +418,65 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool)
400418
return nil
401419
}
402420

403-
if timeout > 0 {
404-
stopSignal := ctr.config.StopSignal
405-
if stopSignal == 0 {
406-
stopSignal = uint(syscall.SIGTERM)
421+
killCtr := func(signal uint) (bool, error) {
422+
stderr, err := r.killContainer(ctr, signal, all, true)
423+
424+
// Before handling the error from killContainer, convert STDERR to a []string
425+
// (one string per line of output) and print it, ignoring known OCI runtime
426+
// errors that we don't care about
427+
stderrLines := strings.Split(stderr.String(), "\n")
428+
for _, line := range stderrLines {
429+
if line == "" {
430+
continue
431+
}
432+
if strings.Contains(line, "container not running") || strings.Contains(line, "open pidfd: No such process") {
433+
logrus.Debugf("Failure to kill container (already stopped?): logged %s", line)
434+
continue
435+
}
436+
fmt.Fprintf(os.Stderr, "%s\n", line)
407437
}
408-
if err := r.KillContainer(ctr, stopSignal, all); err != nil {
438+
439+
if err != nil {
440+
// There's an inherent race with the cleanup process (see
441+
// #16142, #17142). If the container has already been marked as
442+
// stopped or exited by the cleanup process, we can return
443+
// immediately.
444+
if errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
445+
return true, nil
446+
}
447+
448+
// If the PID is 0, then the container is already stopped.
449+
if ctr.state.PID == 0 {
450+
return true, nil
451+
}
452+
409453
// Is the container gone?
410454
// If so, it probably died between the first check and
411455
// our sending the signal.
412456
// The container is stopped, so exit cleanly
413457
err := unix.Kill(ctr.state.PID, 0)
414458
if err == unix.ESRCH {
415-
return nil
459+
return true, nil
416460
}
417461

462+
return false, err
463+
}
464+
return false, nil
465+
}
466+
467+
if timeout > 0 {
468+
stopSignal := ctr.config.StopSignal
469+
if stopSignal == 0 {
470+
stopSignal = uint(syscall.SIGTERM)
471+
}
472+
473+
stopped, err := killCtr(stopSignal)
474+
if err != nil {
418475
return err
419476
}
477+
if stopped {
478+
return nil
479+
}
420480

421481
if err := waitContainerStop(ctr, time.Duration(timeout)*time.Second); err != nil {
422482
logrus.Debugf("Timed out stopping container %s with %s, resorting to SIGKILL: %v", ctr.ID(), unix.SignalName(syscall.Signal(stopSignal)), err)
@@ -427,27 +487,13 @@ func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool)
427487
}
428488
}
429489

430-
// If the timeout was set to 0 or if stopping the container with the
431-
// specified signal did not work, use the big hammer with SIGKILL.
432-
if err := r.KillContainer(ctr, uint(unix.SIGKILL), all); err != nil {
433-
// There's an inherent race with the cleanup process (see
434-
// #16142, #17142). If the container has already been marked as
435-
// stopped or exited by the cleanup process, we can return
436-
// immediately.
437-
if errors.Is(err, define.ErrCtrStateInvalid) && ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) {
438-
return nil
439-
}
440-
441-
// If the PID is 0, then the container is already stopped.
442-
if ctr.state.PID == 0 {
443-
return nil
444-
}
445-
// Again, check if the container is gone. If it is, exit cleanly.
446-
if aliveErr := unix.Kill(ctr.state.PID, 0); errors.Is(aliveErr, unix.ESRCH) {
447-
return nil
448-
}
490+
stopped, err := killCtr(uint(unix.SIGKILL))
491+
if err != nil {
449492
return fmt.Errorf("sending SIGKILL to container %s: %w", ctr.ID(), err)
450493
}
494+
if stopped {
495+
return nil
496+
}
451497

452498
// Give runtime a few seconds to make it happen
453499
if err := waitContainerStop(ctr, killContainerTimeout); err != nil {

0 commit comments

Comments
 (0)