From de8a820263452c28f97761f3c9ba24ae6266fb99 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Fri, 18 Jul 2025 11:00:08 -0400
Subject: [PATCH 1/3] debug hang

---
 .github/workflows/phoenix/submit-bench.sh |  2 +-
 .github/workflows/phoenix/submit.sh       |  2 +-
 .github/workflows/phoenix/test.sh         |  2 +-
 toolchain/mfc/args.py                     |  1 +
 toolchain/mfc/sched.py                    | 78 +++++++++++++++++++++--
 5 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index 91160dd73..9c5e8b714 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -21,7 +21,7 @@ sbatch_common_opts="\
 #SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
 #SBATCH --account=gts-sbryngelson3 # account
 #SBATCH -N1 # nodes
-#SBATCH -t 02:00:00 # walltime
+#SBATCH -t 03:00:00 # walltime
 #SBATCH -q embers # QOS
 #SBATCH -o $job_slug.out # stdout+stderr
 #SBATCH --mem-per-cpu=2G # default mem (overridden below)
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index 182d7a253..11d13df46 100644
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -21,7 +21,7 @@ sbatch_common_opts="\
 #SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
 #SBATCH --account=gts-sbryngelson3 # account
 #SBATCH -N1 # nodes
-#SBATCH -t 03:00:00 # walltime
+#SBATCH -t 04:00:00 # walltime
 #SBATCH -q embers # QOS
 #SBATCH -o $job_slug.out # stdout+stderr
 #SBATCH --mem-per-cpu=2G # default mem (overridden below)
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 60b9920f5..86267b2eb 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -22,7 +22,7 @@ if [ "$job_device" = "gpu" ]; then
     n_test_threads=`expr $gpu_count \* 2`
 fi
 
-./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
+./mfc.sh test --max-attempts 3 -a --schedul-debug -j $n_test_threads $device_opts -- -c phoenix
 
 sleep 10
 rm -rf "$currentdir" || true
diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py
index d3de6769c..f1f2b8964 100644
--- a/toolchain/mfc/args.py
+++ b/toolchain/mfc/args.py
@@ -91,6 +91,7 @@ def add_common_arguments(p, mask = None):
     test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases.")
     test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
     test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
+    test.add_argument( "--sched-debug", action="store_true", default=False, help="Enable detailed scheduler debug logging.")
 
     test_meg = test.add_mutually_exclusive_group()
     test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
diff --git a/toolchain/mfc/sched.py b/toolchain/mfc/sched.py
index 45ec5affb..58aaf1ea5 100644
--- a/toolchain/mfc/sched.py
+++ b/toolchain/mfc/sched.py
@@ -3,6 +3,7 @@
 import traceback
 
 from .printer import cons
+from .state import ARG
 
 class WorkerThread(threading.Thread):
     def __init__(self, *args, **kwargs):
@@ -44,65 +45,120 @@ def sched(tasks: typing.List[Task], nThreads: int, devices: typing.Set[int] = No
     sched.LOAD = { id: 0.0 for id in devices or [] }
 
+    # Debug logging setup
+    gpu_mode = devices is not None and len(devices) > 0
+    debug_enabled = ARG("sched_debug", False) # Check for --sched-debug flag
+
+    def debug_log(msg):
+        if debug_enabled:
+            cons.print(msg)
+
+    if debug_enabled:
+        debug_log(f"[SCHED DEBUG] Starting scheduler: {len(tasks)} tasks, {nThreads} threads, GPU mode: {gpu_mode}")
+        if gpu_mode:
+            debug_log(f"[SCHED DEBUG] GPU devices: {devices}")
+
     def join_first_dead_thread(progress, complete_tracker) -> None:
         nonlocal threads, nAvailable
 
+        debug_log(f"[SCHED DEBUG] Checking {len(threads)} active threads for completion")
+
         for threadID, threadHolder in enumerate(threads):
             # Check if thread is not alive OR if it's been running for too long
             thread_not_alive = not threadHolder.thread.is_alive()
+
+            debug_log(f"[SCHED DEBUG] Thread {threadID}: alive={threadHolder.thread.is_alive()}, devices={threadHolder.devices}")
 
             if thread_not_alive:
+                debug_log(f"[SCHED DEBUG] Thread {threadID} detected as dead, attempting to join...")
+
                 # Properly join the thread with timeout to prevent infinite hangs
+                join_start_time = time.time()
+                timeout_duration = 120.0 if gpu_mode else 30.0 # Longer timeout for GPU
+
+                debug_log(f"[SCHED DEBUG] Joining thread {threadID} with {timeout_duration}s timeout...")
+
                 try:
-                    threadHolder.thread.join(timeout=30.0) # 30 second timeout
+                    threadHolder.thread.join(timeout=timeout_duration)
+                    join_end_time = time.time()
+                    join_duration = join_end_time - join_start_time
+
+                    debug_log(f"[SCHED DEBUG] Thread {threadID} join completed in {join_duration:.2f}s")
 
                     # Double-check that thread actually finished joining
                     if threadHolder.thread.is_alive():
                         # Thread didn't finish within timeout - this is a serious issue
-                        raise RuntimeError(f"Thread {threadID} failed to join within 30 seconds timeout. "
-                                           f"Thread may be hung or in an inconsistent state.")
+                        debug_log(f"[SCHED DEBUG] ERROR: Thread {threadID} still alive after {timeout_duration}s timeout!")
+                        debug_log(f"[SCHED DEBUG] Thread {threadID} devices: {threadHolder.devices}")
+                        debug_log(f"[SCHED DEBUG] Thread {threadID} exception: {threadHolder.thread.exc}")
+                        raise RuntimeError(f"Thread {threadID} failed to join within {timeout_duration} seconds timeout. "
+                                           f"Thread may be hung or in an inconsistent state. "
+                                           f"GPU devices: {threadHolder.devices}")
                 except Exception as join_exc:
                     # Handle join-specific exceptions with more context
+                    debug_log(f"[SCHED DEBUG] Exception during thread {threadID} join: {join_exc}")
                     raise RuntimeError(f"Failed to join thread {threadID}: {join_exc}. "
-                                       f"This may indicate a system threading issue or hung test case.") from join_exc
+                                       f"This may indicate a system threading issue or hung test case. "
+                                       f"GPU devices: {threadHolder.devices}") from join_exc
+
+                debug_log(f"[SCHED DEBUG] Thread {threadID} successfully joined")
 
                 # Check for and propagate any exceptions that occurred in the worker thread
                 # But only if the worker function didn't complete successfully
                 # (This allows test failures to be handled gracefully by handle_case)
                 if threadHolder.thread.exc is not None:
+                    debug_log(f"[SCHED DEBUG] Thread {threadID} had exception: {threadHolder.thread.exc}")
+                    debug_log(f"[SCHED DEBUG] Thread {threadID} completed successfully: {threadHolder.thread.completed_successfully}")
+
                     if threadHolder.thread.completed_successfully:
                         # Test framework handled the exception gracefully (e.g., test failure)
                         # Don't re-raise - this is expected behavior
+                        debug_log(f"[SCHED DEBUG] Thread {threadID} exception was handled gracefully by test framework")
                         pass
                     # Unhandled exception - this indicates a real problem
                     elif hasattr(threadHolder.thread, 'exc_info') and threadHolder.thread.exc_info:
                         error_msg = f"Worker thread {threadID} failed with unhandled exception:\n{threadHolder.thread.exc_info}"
+                        debug_log(f"[SCHED DEBUG] Thread {threadID} had unhandled exception!")
                         raise RuntimeError(error_msg) from threadHolder.thread.exc
                     else:
+                        debug_log(f"[SCHED DEBUG] Thread {threadID} had unhandled exception without details")
                         raise threadHolder.thread.exc
 
+                # Update scheduler state
                 nAvailable += threadHolder.ppn
                 for device in threadHolder.devices or set():
+                    old_load = sched.LOAD[device]
                     sched.LOAD[device] -= threadHolder.load / threadHolder.ppn
+                    debug_log(f"[SCHED DEBUG] Device {device} load: {old_load:.3f} -> {sched.LOAD[device]:.3f}")
 
                 progress.advance(complete_tracker)
 
+                debug_log(f"[SCHED DEBUG] Thread {threadID} cleanup complete, removing from active threads")
                 del threads[threadID]
                 break
+
+        debug_log(f"[SCHED DEBUG] join_first_dead_thread completed, {len(threads)} threads remaining")
 
     with rich.progress.Progress(console=cons.raw, transient=True) as progress:
         queue_tracker = progress.add_task("Queued ", total=len(tasks))
         complete_tracker = progress.add_task("Completed", total=len(tasks))
 
+        debug_log(f"[SCHED DEBUG] Starting task queue processing...")
+
         # Queue Tests
-        for task in tasks:
+        for task_idx, task in enumerate(tasks):
+            debug_log(f"[SCHED DEBUG] Processing task {task_idx+1}/{len(tasks)}: ppn={task.ppn}, load={task.load}")
+
            # Wait until there are threads available
             while nAvailable < task.ppn:
+                debug_log(f"[SCHED DEBUG] Waiting for resources: need {task.ppn}, have {nAvailable}")
+
                 # This is important if "-j 1" is used (the default) since there
                 # are test cases that require test.ppn=2
                 if task.ppn > nThreads and nAvailable > 0:
+                    debug_log(f"[SCHED DEBUG] Task requires more threads ({task.ppn}) than available ({nThreads}), but some are free ({nAvailable})")
                     break
 
                 # Keep track of threads that are done
@@ -118,24 +174,34 @@ def join_first_dead_thread(progress, complete_tracker) -> None:
             # Use the least loaded devices
             if devices is not None:
                 use_devices = set()
-                for _ in range(task.ppn):
+                debug_log(f"[SCHED DEBUG] Assigning GPU devices for task {task_idx+1}")
+                for device_idx in range(task.ppn):
                     device = min(sched.LOAD.items(), key=lambda x: x[1])[0]
                     sched.LOAD[device] += task.load / task.ppn
                     use_devices.add(device)
+                    debug_log(f"[SCHED DEBUG] Assigned device {device} (load now: {sched.LOAD[device]:.3f})")
 
             nAvailable -= task.ppn
 
+            debug_log(f"[SCHED DEBUG] Starting thread for task {task_idx+1}, devices: {use_devices}")
             thread = WorkerThread(target=task.func, args=tuple(task.args) + (use_devices,))
             thread.start()
 
             threads.append(WorkerThreadHolder(thread, task.ppn, task.load, use_devices))
 
+            debug_log(f"[SCHED DEBUG] Thread started for task {task_idx+1}, {len(threads)} active threads")
+
+        debug_log(f"[SCHED DEBUG] All tasks queued, waiting for completion...")
 
         # Wait for the last tests to complete (MOVED INSIDE CONTEXT)
         while len(threads) != 0:
+            debug_log(f"[SCHED DEBUG] Waiting for {len(threads)} threads to complete...")
+
             # Keep track of threads that are done
             join_first_dead_thread(progress, complete_tracker)
 
             # Do not overwhelm this core with this loop
             time.sleep(0.05)
 
+    debug_log(f"[SCHED DEBUG] Scheduler completed successfully!")
+
     sched.LOAD = {}

From 84e5fe991d6b87bf6f7f5f61e93d1b27c0fd66cd Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Fri, 18 Jul 2025 11:26:35 -0400
Subject: [PATCH 2/3] debug

---
 .github/workflows/phoenix/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 86267b2eb..9db6e3040 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -22,7 +22,7 @@ if [ "$job_device" = "gpu" ]; then
     n_test_threads=`expr $gpu_count \* 2`
 fi
 
-./mfc.sh test --max-attempts 3 -a --schedul-debug -j $n_test_threads $device_opts -- -c phoenix
+./mfc.sh test --max-attempts 3 -a --schedule-debug -j $n_test_threads $device_opts -- -c phoenix
 
 sleep 10
 rm -rf "$currentdir" || true

From 05df25ca0f9d8e41933a78abd65116106c72663f Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson
Date: Fri, 18 Jul 2025 11:57:52 -0400
Subject: [PATCH 3/3] fix

---
 .github/workflows/frontier/test.sh | 4 ++--
 .github/workflows/phoenix/test.sh  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/frontier/test.sh b/.github/workflows/frontier/test.sh
index 57481fa94..63fff389a 100644
--- a/.github/workflows/frontier/test.sh
+++ b/.github/workflows/frontier/test.sh
@@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n'
 ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`
 
 if [ "$job_device" = "gpu" ]; then
-    ./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier
+    ./mfc.sh test --max-attempts 3 -j $ngpus --sched-debug -- -c frontier
 else
-    ./mfc.sh test --max-attempts 3 -j 32 -- -c frontier
+    ./mfc.sh test --max-attempts 3 -j 32 --sched-debug -- -c frontier
 fi
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 9db6e3040..b15e1a0c9 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -22,7 +22,7 @@ if [ "$job_device" = "gpu" ]; then
     n_test_threads=`expr $gpu_count \* 2`
 fi
 
-./mfc.sh test --max-attempts 3 -a --schedule-debug -j $n_test_threads $device_opts -- -c phoenix
+./mfc.sh test --max-attempts 3 -a --sched-debug -j $n_test_threads $device_opts -- -c phoenix
 
 sleep 10
 rm -rf "$currentdir" || true