Skip to content

Commit 052e02d

Browse files
committed
wip: debugging multinode tests
1 parent da9ce71 commit 052e02d

File tree

1 file changed

+18
-16
lines changed

1 file changed

+18
-16
lines changed

scripts/multinode-runner.sh

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,6 @@
11
#!/bin/bash
22
set -euo pipefail
33

4-
# Debug: Show bash version and options in CI
5-
if [[ -n "${CI:-}" ]]; then
6-
echo "[DEBUG] Bash version: $BASH_VERSION" >&2
7-
echo "[DEBUG] Bash options: $-" >&2
8-
fi
9-
104
# Troupe Multi-Node Test Runner - Refactored Version
115
# Orchestrates multi-node tests with proper cleanup and output synchronization
126
#
@@ -152,7 +146,6 @@ cleanup() {
152146
# CRITICAL: Capture the original exit code immediately
153147
# This must be the FIRST command in cleanup to preserve test results
154148
local exit_code=$?
155-
echo "[DEBUG] Cleanup called with exit code: $exit_code" >&2
156149
log "Cleaning up test processes (exit code: $exit_code)..."
157150

158151
local cleaned_count=0
@@ -317,7 +310,6 @@ cleanup() {
317310
# CRITICAL: Exit with the original exit code from the test, not from cleanup
318311
# Without this, the script would exit with the status of the last cleanup command
319312
# This is why tests were "failing" in CI even though they succeeded
320-
echo "[DEBUG] Exiting cleanup with code: $exit_code" >&2
321313
exit $exit_code
322314
}
323315

@@ -632,10 +624,11 @@ run_node() {
632624
local node_index="$2"
633625
local test_dir="$3"
634626
local output_dir="$4"
635-
627+
local in_parallel="${5:-false}" # New parameter to indicate parallel mode
628+
636629
local node_config
637630
node_config=$(jq -r ".nodes[$node_index]" "$config_file")
638-
631+
639632
local node_id script port start_delay expected_exit_code extra_argv
640633
node_id=$(echo "$node_config" | jq -r '.id')
641634
script=$(echo "$node_config" | jq -r '.script')
@@ -762,14 +755,23 @@ run_node() {
762755
log "Node $node_id timed out as expected after ${scaled_timeout}s"
763756
else
764757
display_node_error "$node_id" "timeout" "$scaled_timeout" "$expected_exit_code" "$output_file" "$error_file" "$timeout_val"
765-
error "Node $node_id timed out unexpectedly"
758+
if [[ "$in_parallel" == "true" ]]; then
759+
return 1 # Return error code instead of exiting in parallel mode
760+
else
761+
error "Node $node_id timed out unexpectedly"
762+
fi
766763
fi
767764
elif [[ "$actual_exit_code" != "$expected_exit_code" ]]; then
768765
display_node_error "$node_id" "exit_code" "$actual_exit_code" "$expected_exit_code" "$output_file" "$error_file"
769-
error "Node $node_id exited with code $actual_exit_code, expected $expected_exit_code"
766+
if [[ "$in_parallel" == "true" ]]; then
767+
return 1 # Return error code instead of exiting in parallel mode
768+
else
769+
error "Node $node_id exited with code $actual_exit_code, expected $expected_exit_code"
770+
fi
770771
fi
771772

772773
log "Node $node_id completed successfully (exit code: $actual_exit_code)"
774+
return 0 # Explicitly return success
773775
}
774776

775777
merge_outputs() {
@@ -864,26 +866,26 @@ run_test() {
864866
# Start all nodes simultaneously
865867
local node_pids=()
866868
for ((i=0; i<node_count; i++)); do
867-
run_node "$config_file" "$i" "$test_dir" "$output_dir" &
869+
run_node "$config_file" "$i" "$test_dir" "$output_dir" "true" &
868870
node_pids+=($!)
869871
done
870-
872+
871873
# Wait for all nodes
872874
local failed=false
873875
for pid in "${node_pids[@]}"; do
874876
if ! wait "$pid"; then
875877
failed=true
876878
fi
877879
done
878-
880+
879881
if [[ "$failed" == "true" ]]; then
880882
error "One or more nodes failed"
881883
fi
882884
;;
883885
"sequential")
884886
# Start nodes one after another
885887
for ((i=0; i<node_count; i++)); do
886-
run_node "$config_file" "$i" "$test_dir" "$output_dir"
888+
run_node "$config_file" "$i" "$test_dir" "$output_dir" "false"
887889
done
888890
;;
889891
*)

0 commit comments

Comments
 (0)