From c34dd2b9e47f815c119e532a4ae6089d4bd62549 Mon Sep 17 00:00:00 2001 From: Sergio Correia Date: Wed, 17 Dec 2025 22:29:19 -0300 Subject: [PATCH] sss: kill entire process group during decryption cleanup When using SSS with multiple pins (e.g., tang) and threshold < n, after successful decryption with t pins, the remaining child processes and their grandchildren (like curl) were not being killed. The fix uses process groups for decryption only: - Add use_pgrp parameter to call() to control process group creation - For decryption: use_pgrp=true - child calls setpgid(0, 0) to become a process group leader, parent also calls setpgid() to eliminate race - For encryption: use_pgrp=false - no process groups needed since we wait for all children anyway - Cleanup uses kill(-pid, SIGTERM) to kill the entire process group - Falls back to direct kill if process group doesn't exist Signed-off-by: Sergio Correia --- src/pins/sss/clevis-decrypt-sss.c | 10 +- src/pins/sss/clevis-encrypt-sss.c | 2 +- src/pins/sss/meson.build | 3 + src/pins/sss/sss.c | 32 +++- src/pins/sss/sss.h | 4 +- src/pins/sss/tests/meson.build | 8 + src/pins/sss/tests/sss-child-process-cleanup | 177 +++++++++++++++++++ 7 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 src/pins/sss/tests/meson.build create mode 100755 src/pins/sss/tests/sss-child-process-cleanup diff --git a/src/pins/sss/clevis-decrypt-sss.c b/src/pins/sss/clevis-decrypt-sss.c index 15183764..c0ac6ba8 100644 --- a/src/pins/sss/clevis-decrypt-sss.c +++ b/src/pins/sss/clevis-decrypt-sss.c @@ -191,7 +191,7 @@ main(int argc, char *argv[]) chldrn.next = pin; pin->file = call(args, json_string_value(val), - json_string_length(val), &pin->pid); + json_string_length(val), &pin->pid, true); if (!pin->file) goto egress; @@ -314,7 +314,13 @@ main(int argc, char *argv[]) fclose(pin->file); if (pin->pid > 0) { - kill(pin->pid, SIGTERM); + /* + * Kill entire process group (negative PID). This ensures + * grandchildren (e.g., curl spawned by clevis-decrypt-tang) + * are terminated. Fall back to direct kill if group doesn't exist. + */ + if (kill(-pin->pid, SIGTERM) < 0) + (void) kill(pin->pid, SIGTERM); waitpid(pin->pid, NULL, 0); } diff --git a/src/pins/sss/clevis-encrypt-sss.c b/src/pins/sss/clevis-encrypt-sss.c index 1b2cc314..34cde20e 100644 --- a/src/pins/sss/clevis-encrypt-sss.c +++ b/src/pins/sss/clevis-encrypt-sss.c @@ -109,7 +109,7 @@ encrypt_frag(json_t *sss, const char *pin, const json_t *cfg, int assume_yes) if (!pnt) return NULL; - pipe = call(args, pnt, pntl, &pid); + pipe = call(args, pnt, pntl, &pid, false); OPENSSL_cleanse(pnt, pntl); free(pnt); if (!pipe) diff --git a/src/pins/sss/meson.build b/src/pins/sss/meson.build index 2a5295ac..2c0a7486 100644 --- a/src/pins/sss/meson.build +++ b/src/pins/sss/meson.build @@ -21,6 +21,7 @@ if jansson.found() and libcrypto.found() env.prepend('PATH', join_paths(meson.source_root(), 'src'), join_paths(meson.source_root(), 'src', 'pins', 'tang'), + join_paths(meson.build_root(), 'src', 'pins', 'tang', 'tests'), meson.current_build_dir(), '/usr/libexec', libexecdir, @@ -33,6 +34,8 @@ if jansson.found() and libcrypto.found() test('pin-sss', find_program(join_paths(src, 'pin-sss')), env: env) test('pin-null', find_program(join_paths(src, 'pin-null')), env: env) + + subdir('tests') else warning('Will not install sss pin due to missing dependencies!') endif diff --git a/src/pins/sss/sss.c b/src/pins/sss/sss.c index 7486d6c5..57f1ec41 100644 --- a/src/pins/sss/sss.c +++ b/src/pins/sss/sss.c @@ -340,7 +340,8 @@ enum { }; FILE * -call(char *const argv[], const void *buf, size_t len, pid_t *pid) +call(char *const argv[], const void *buf, size_t len, pid_t *pid, + bool use_pgrp) { int dump[2] = { -1, -1 }; int load[2] = { -1, -1 }; @@ -360,6 +361,15 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid) goto error; if (*pid == 0) { + /* + * Create a new process group so we can kill all descendants. + * Only do this when use_pgrp is set (i.e., for decryption where + * we may need to kill leftover children). For encryption, this + * can cause issues with terminal job control in child scripts. + */ + if (use_pgrp && setpgid(0, 0) < 0) + exit(EXIT_FAILURE); + if (dup2(dump[PIPE_RD], STDIN_FILENO) < 0 || dup2(load[PIPE_WR], STDOUT_FILENO) < 0) exit(EXIT_FAILURE); @@ -368,6 +378,13 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid) exit(EXIT_FAILURE); } + if (use_pgrp) { + /* Also set from parent to eliminate race with child's setpgid(). + * This may fail if the child has already called setpgid(0,0) or + * exec'd, which is fine - the important thing is one succeeds. */ + (void) setpgid(*pid, *pid); + } + for (const uint8_t *tmp = buf; len > 0; tmp += wr, len -= wr) { wr = write(dump[PIPE_WR], tmp, len); if (wr < 0) @@ -390,7 +407,18 @@ call(char *const argv[], const void *buf, size_t len, pid_t *pid) close(load[PIPE_WR]); if (*pid > 0) { - kill(*pid, SIGTERM); + if (use_pgrp) { + /* + * Kill entire process group (negative PID). This ensures + * grandchildren (e.g., curl spawned by clevis-decrypt-tang) + * are terminated. Fall back to direct kill if group doesn't + * exist. + */ + if (kill(-*pid, SIGTERM) < 0) + (void) kill(*pid, SIGTERM); + } else { + (void) kill(*pid, SIGTERM); + } waitpid(*pid, NULL, 0); *pid = 0; } diff --git a/src/pins/sss/sss.h b/src/pins/sss/sss.h index 81faf791..8b9713cb 100644 --- a/src/pins/sss/sss.h +++ b/src/pins/sss/sss.h @@ -19,6 +19,7 @@ #pragma once #include +#include #include #include @@ -32,4 +33,5 @@ json_t * sss_recover(const json_t *p, size_t npnts, const uint8_t *pnts[]); FILE * -call(char *const argv[], const void *buf, size_t len, pid_t *pid); +call(char *const argv[], const void *buf, size_t len, pid_t *pid, + bool use_pgrp); diff --git a/src/pins/sss/tests/meson.build b/src/pins/sss/tests/meson.build new file mode 100644 index 00000000..4c5eacd0 --- /dev/null +++ b/src/pins/sss/tests/meson.build @@ -0,0 +1,8 @@ +lsof = find_program('lsof', required: false) +pgrep = find_program('pgrep', required: false) + +if lsof.found() and pgrep.found() + test('sss-child-process-cleanup', find_program('sss-child-process-cleanup'), env: env, timeout: 30) +else + warning('Will not run sss-child-process-cleanup test due to missing dependencies (lsof, pgrep)!') +endif diff --git a/src/pins/sss/tests/sss-child-process-cleanup b/src/pins/sss/tests/sss-child-process-cleanup new file mode 100755 index 00000000..a336f746 --- /dev/null +++ b/src/pins/sss/tests/sss-child-process-cleanup @@ -0,0 +1,177 @@ +#!/bin/bash -e +# vim: set tabstop=8 shiftwidth=4 softtabstop=4 expandtab smarttab colorcolumn=80: +# +# Copyright (c) 2025 Red Hat, Inc. +# Author: Sergio Correia +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Test to verify that clevis-decrypt-sss properly kills all child processes +# and their descendants (grandchildren) after successful decryption. +# +# This test addresses issue #460: +# https://github.com/latchset/clevis/issues/460 +# +# The issue: When using SSS with multiple pins (e.g., tang, fido2) and t < n, +# after successful decryption with t pins, the remaining child processes +# (and their grandchildren like curl or fido2-assert) were not being killed. +# +# The fix uses process groups: each child becomes a process group leader +# via setpgid(0,0), and cleanup uses kill(-pid, SIGTERM) to kill the +# entire process group. +# +# Test strategy: +# 1. Start a real tang server that responds quickly +# 2. Start a "hanging" server that accepts connections but never responds +# 3. Encrypt with SSS t=1, two tang pins (real + hanging) +# 4. Decrypt - the real tang succeeds, the hanging one's child should be killed +# 5. Verify no orphaned curl processes remain after decryption + +. tang-common-test-functions + +TMP="$(mktemp -d)" + +# Force exit - kills background jobs first, then exits +force_exit() { + local code="$1" + trap - EXIT # Disable trap to prevent recursion + + # Kill all child processes to prevent bash from waiting + # Use both pkill and direct kill to be thorough + pkill -9 -P $$ 2>/dev/null || true + jobs -p 2>/dev/null | xargs -r kill -9 2>/dev/null || true + + if [ "${code}" -eq 0 ]; then + # Success - normal exit after killing children + exit 0 + else + # Failure - kill entire process group for immediate exit + kill -9 -$$ 2>/dev/null || kill -9 $$ 2>/dev/null || exit "${code}" + fi +} + +# No cleanup trap - we'll force kill on exit + +# Start a real tang server +tang_run "${TMP}" sig exc +port_real=$(tang_get_port "${TMP}") +url_real="http://localhost:${port_real}" + +# Get the advertisement from the real tang server +adv="${TMP}/adv.jws" +tang_get_adv "${port_real}" "${adv}" + +# Start a "hanging" server - accepts TCP connections but never sends HTTP response. +# This simulates a tang server that is unreachable or very slow. +# We use socat to listen and spawn a process that just sleeps forever. +# Use disown to prevent bash from waiting for this background job on exit. +"${SOCAT}" TCP4-LISTEN:0,fork EXEC:"sleep 3600" & +hang_pid=$! +disown "${hang_pid}" +echo "${hang_pid}" > "${TMP}/hang.pid" + +# Wait for the hanging server to start listening +sleep 0.5 +port_hang=$(lsof -nP -a -p "${hang_pid}" -iTCP -sTCP:LISTEN -Fn 2>/dev/null \ + | grep '^n.*:' | head -1 | sed 's/.*://') + +if [ -z "${port_hang}" ]; then + echo "Failed to start hanging server" >&2 + force_exit 1 +fi + +url_hang="http://localhost:${port_hang}" + +echo "Real tang server on port ${port_real}" +echo "Hanging server on port ${port_hang}" + +# Encrypt with SSS: t=1, two tang pins +# Both pins use the same advertisement (from the real server) so encryption works. +# During decryption, one will contact the real server (succeeds) and one will +# contact the hanging server (curl hangs waiting for response). +cfg=$(cat </dev/null | wc -l || echo 0) + +echo "Starting decryption... ($(date +%T))" + +# Decrypt with a timeout. The real tang should respond quickly, so decryption +# should complete in a few seconds. The hanging server's child process +# (and its curl grandchild) should be killed after decryption succeeds. +# +# Without the fix: curl connecting to the hanging server would remain running +# after clevis-decrypt-sss exits (orphaned to init). +# +# With the fix: the entire process group is killed, including curl. + +# Decrypt the data. Without the fix, decryption succeeds but leaves orphaned +# grandchild processes (curl) which we detect below. +dec="" +if ! dec="$(echo -n "${enc}" | clevis decrypt)"; then + echo "FAIL: Decryption failed" >&2 + force_exit 1 +fi + +if [ "${dec}" != "test-cleanup-data" ]; then + echo "FAIL: Decrypted data mismatch: got '${dec}'" >&2 + force_exit 1 +fi + +echo "Decryption successful: '${dec}' ($(date +%T))" + +# Give a moment for process cleanup +sleep 0.5 + +# Check for orphaned curl processes connecting to the hanging server. +# Use lsof to find any process with a connection to port_hang. +# -n: no hostname resolution (faster) +# -P: no port name resolution (faster) +# -t: terse output (just PIDs) +orphaned_connections=$(lsof -nP -i "TCP:${port_hang}" -t 2>/dev/null | grep -v "^${hang_pid}$" || true) + +if [ -n "${orphaned_connections}" ]; then + echo "FAIL: Found orphaned processes connecting to hanging server:" >&2 + echo "${orphaned_connections}" >&2 + force_exit 1 +fi + +# Also check that we don't have more curl processes than before +# (accounting for possible legitimate curl usage) +curl_after=$(pgrep -x curl 2>/dev/null | wc -l || echo 0) + +# We expect curl_after <= curl_before (the curl that was connecting to hang +# server should have been killed) +if [ "${curl_after}" -gt "${curl_before}" ]; then + echo "WARNING: More curl processes after decryption (${curl_after}) than before (${curl_before})" >&2 + echo "This might indicate orphaned processes, but checking connections is more reliable." >&2 +fi + +echo "SUCCESS: No orphaned processes detected after SSS decryption cleanup" +force_exit 0