fix: add retry and top-level exception handling to prevent unexpected crashes

yana1205 · yana1205 · commit 341f5b89e96f · 2025-08-16T05:48:02.000+09:00
Signed-off-by: Takumi Yanagawa <yana@jp.ibm.com>
diff --git a/docker/agent-harness/entrypoint.sh b/docker/agent-harness/entrypoint.sh
@@ -5,13 +5,17 @@ cd /etc/agent-benchmark
 port="443"
 root_path="/bench-server"
 benchmark_timeout="300"
+benchmark_exec_max_attempts="3"
+benchmark_exec_retry_interval="5"
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --host) host="$2"; shift 2 ;;
     --port) port="$2"; shift 2 ;;
     --root_path) root_path="$2"; shift 2 ;;
     --benchmark_timeout) benchmark_timeout="$2"; shift 2 ;;
+    --benchmark_exec_max_attempts) benchmark_exec_max_attempts="$2"; shift 2 ;;
+    --benchmark_exec_retry_interval) benchmark_exec_retry_interval="$2"; shift 2 ;;
     *) echo "Unknown option: $1"; exit 1 ;;
   esac
 done
@@ -27,4 +31,6 @@ python itbench_utilities/agent_harness/main.py \
   --root_path $root_path \
   --ssl \
   --benchmark_timeout $benchmark_timeout \
+  --benchmark_exec_max_attempts $benchmark_exec_max_attempts \
+  --benchmark_exec_retry_interval $benchmark_exec_retry_interval \
   --single_run
diff --git a/itbench_utilities/agent_harness/agent.py b/itbench_utilities/agent_harness/agent.py
@@ -47,6 +47,11 @@ class AgentHarnessConfig(BaseModel):
     path_to_data_pushed_to_scenario: Optional[str] = None
 
 
+class AgentHarnessOpts(BaseModel):
+    """Optional tuning knobs for AgentHarness (currently benchmark retry settings)."""
+
+    # Maximum number of attempts for one benchmark execution; handed to
+    # run_with_retry as `retries`.
+    benchmark_exec_max_attempts: int = 3
+    # Pause between attempts; handed to run_with_retry as `delay`, which
+    # awaits asyncio.sleep(delay), so the unit is seconds.
+    benchmark_exec_retry_interval: int = 5
+
+
 class AgentHarness:
 
     def __init__(
@@ -62,6 +67,7 @@ def __init__(
         single_run=False,
         interval=5,
         benchmark_timeout=300,
+        opts: Optional[AgentHarnessOpts] = AgentHarnessOpts(),
     ) -> None:
         self.agent_manifest = agent_manifest
         self.agent_directory = agent_directory
@@ -81,6 +87,7 @@ def __init__(
         )
         self.stop_event = asyncio.Event()
         self.task_history = []
+        self.opts = opts
 
     async def run(self):
 
@@ -107,17 +114,12 @@ async def run(self):
                     benchmark_id = benchmark_entry.benchmark_id
                     logger.info(f"Take the benchmark '{benchmark_entry.benchmark_id}'")
                     self.add_history(benchmark_id)
-                    self.rest_client.put(
-                        f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}",
-                        Status(phase=AgentPhaseEnum.Executing).model_dump_json(),
-                    )
-                    is_completed = await self.run_benchmark(benchmark_id, benchmark_entry.agent_access_info.id)
-                    if is_completed:
-                        phase = AgentPhaseEnum.Finished
-                    else:
-                        phase = AgentPhaseEnum.TimeedOut
-                    self.rest_client.put(
-                        f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}", Status(phase=phase).model_dump_json()
+                    await run_with_retry(
+                        self.run_benchmark_with_status_update,
+                        retries=self.opts.benchmark_exec_max_attempts,
+                        delay=self.opts.benchmark_exec_retry_interval,
+                        benchmark_id=benchmark_id,
+                        benchmark_entry=benchmark_entry,
                     )
                 if self.single_run:
                     logger.info("Task completed. Exiting due to run-once mode.")
@@ -127,6 +129,18 @@ async def run(self):
             await asyncio.sleep(self.interval)
             elapsed_time += self.interval
 
+    async def run_benchmark_with_status_update(self, benchmark_id, benchmark_entry: AgentBenchmarkEntry):
+        """Run one benchmark and report its phase before and after.
+
+        PUTs an ``Executing`` status to the benchmark-entry endpoint, awaits
+        ``run_benchmark``, then PUTs ``Finished`` when it returned a truthy
+        value and ``TimeedOut`` otherwise. Exceptions from the REST client or
+        from ``run_benchmark`` are not caught here.
+        """
+        entry_url = f"{self.agent_manifest.manifest_endpoint}/benchmark-entries/{benchmark_id}"
+        executing = Status(phase=AgentPhaseEnum.Executing)
+        self.rest_client.put(entry_url, executing.model_dump_json())
+
+        agent_id = benchmark_entry.agent_access_info.id
+        completed = await self.run_benchmark(benchmark_id, agent_id)
+
+        # `TimeedOut` is the enum member's actual spelling; keep it as-is.
+        final_phase = AgentPhaseEnum.Finished if completed else AgentPhaseEnum.TimeedOut
+        self.rest_client.put(entry_url, Status(phase=final_phase).model_dump_json())
+
     async def run_benchmark(self, benchmark_id, agent_id):
 
         timeout = self.benchmark_timeout
@@ -231,6 +245,17 @@ def add_history(self, benchmark_id: str, bundle: Optional[Bundle] = None, agent_
         self.task_history.append(item)
 
 
+async def run_with_retry(func, retries=3, delay=5, *args, **kwargs):
+    """Await ``func(*args, **kwargs)``, retrying when it raises.
+
+    Args:
+        func: Callable returning an awaitable; invoked once per attempt.
+        retries: Maximum number of attempts (total calls, not extra re-tries).
+        delay: Seconds to sleep between a failed attempt and the next one.
+        *args: Positional arguments forwarded to ``func``. Because ``retries``
+            and ``delay`` precede ``*args``, these can only be supplied when
+            both of those are also passed positionally.
+        **kwargs: Keyword arguments forwarded to ``func``.
+
+    Returns:
+        Whatever the first successful attempt returns.
+
+    Raises:
+        RuntimeError: If no attempt succeeds (including ``retries < 1``, in
+            which case ``func`` is never called). The exception raised by the
+            final attempt is attached as ``__cause__``.
+    """
+    # functools.partial objects and callable instances have no __name__;
+    # fall back to repr() so the error handler itself cannot raise and mask
+    # the real failure.
+    func_name = getattr(func, "__name__", repr(func))
+    last_error = None
+    for attempt in range(1, retries + 1):
+        try:
+            return await func(*args, **kwargs)
+        except Exception as e:
+            last_error = e
+            logger.error(f"Attempt {attempt}/{retries} failed for {func_name}: {e}")
+            # No point sleeping after the final attempt.
+            if attempt < retries:
+                await asyncio.sleep(delay)
+    # Chain the last failure so the traceback shows why the retries ran out.
+    raise RuntimeError(f"{func_name} failed after {retries} attempts") from last_error
+
+
 def run(args):
     with open(args.input) as f:
         agent_manifest = AgentManifest.model_validate_json(f.read())
@@ -241,6 +266,10 @@ def run(args):
             data = yaml.safe_load(f.read())
             config = AgentHarnessConfig.model_validate(data)
 
+    opts = AgentHarnessOpts(
+        benchmark_exec_retry_interval=args.benchmark_exec_retry_interval,
+        benchmark_exec_max_attempts=args.benchmark_exec_max_attempts,
+    )
     agent_harness = AgentHarness(
         agent_manifest,
         args.agent_directory,
@@ -252,5 +281,6 @@ def run(args):
         benchmark_timeout=args.benchmark_timeout,
         config=config,
         single_run=args.single_run,
+        opts=opts,
     )
     asyncio.run(agent_harness.run())
diff --git a/itbench_utilities/agent_harness/main.py b/itbench_utilities/agent_harness/main.py
@@ -53,6 +53,8 @@ def main():
         action="store_true",
         help="Process one benchmark job and exit",
     )
+    parser.add_argument("--benchmark_exec_max_attempts", type=int, default=3, help=f"Maximum number of attempts to run the benchmark with status updates (default: 3).")
+    parser.add_argument("--benchmark_exec_retry_interval", type=int, default=5, help=f"Seconds to wait between retry attempts for benchmark execution (default: 5).")
 
     args = parser.parse_args()
 

Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,8 @@ def main():`
`53`	`53`	`action="store_true",`
`54`	`54`	`help="Process one benchmark job and exit",`
`55`	`55`	`)`
	`56`	`+ parser.add_argument("--benchmark_exec_max_attempts", type=int, default=3, help=f"Maximum number of attempts to run the benchmark with status updates (default: 3).")`
	`57`	`+ parser.add_argument("--benchmark_exec_retry_interval", type=int, default=5, help=f"Seconds to wait between retry attempts for benchmark execution (default: 5).")`
`56`	`58`
`57`	`59`	`args = parser.parse_args()`
`58`	`60`