From ebdcb0a7017fcdc06a52c2ef0892b2af7c70578f Mon Sep 17 00:00:00 2001 From: phi-jkim Date: Thu, 14 Aug 2025 11:25:06 -0700 Subject: [PATCH 1/3] Add runner train and optimizer runner --- adalflow/adalflow/components/agent/agent.py | 4 +- adalflow/adalflow/components/agent/react.py | 9 +- adalflow/adalflow/components/agent/runner.py | 316 ++++++++- .../components/agent/runner_trainer.py | 219 ++++++ adalflow/adalflow/core/functional.py | 11 +- adalflow/adalflow/core/generator.py | 5 +- adalflow/adalflow/core/types.py | 4 + adalflow/adalflow/datasets/hotpot_qa.py | 1 - adalflow/adalflow/optim/parameter.py | 2 + adalflow/tests/test_runner.py | 323 +++++++++ adalflow/tests/test_runner_trainer.py | 650 ++++++++++++++++++ .../hotpot_qa/adal_exp/train_agent_rag.py | 4 + docs/datasets_benchmarks.md | 226 ++++++ ...adalflow_classification_optimization.ipynb | 128 ++-- .../tutorials/adalflow_rag_optimization.ipynb | 112 ++- poetry.lock | 87 ++- pyproject.toml | 3 + .../runner_trainable_tutorial.py | 273 ++++++++ use_cases/question_answering/gsm8k/train.py | 27 +- 19 files changed, 2268 insertions(+), 136 deletions(-) create mode 100644 adalflow/adalflow/components/agent/runner_trainer.py create mode 100644 adalflow/tests/test_runner_trainer.py create mode 100644 docs/datasets_benchmarks.md create mode 100644 tutorials/v1_agent_runner/runner_trainable_tutorial.py diff --git a/adalflow/adalflow/components/agent/agent.py b/adalflow/adalflow/components/agent/agent.py index 5c6f3001e..9cfb9ae61 100644 --- a/adalflow/adalflow/components/agent/agent.py +++ b/adalflow/adalflow/components/agent/agent.py @@ -132,7 +132,7 @@ def create_default_planner( cache_path: Optional[str] = None, use_cache: Optional[bool] = False, # default agent parameters - max_steps: Optional[int] = 10, + max_steps: Optional[int] = 3, is_thinking_model: Optional[bool] = False, answer_data_type: Optional[Type[T]] = str, **kwargs, @@ -259,7 +259,7 @@ def __init__( use_cache: Optional[bool] = True, # default agent parameters answer_data_type: Optional[Type[T]] = str, # the data type of the final answer - max_steps: Optional[int] = 10, + max_steps: Optional[int] = DEFAULT_MAX_STEPS, is_thinking_model: Optional[bool] = False, # support thinking model in agent # for fully customize the agent tool_manager: Optional[ diff --git a/adalflow/adalflow/components/agent/react.py b/adalflow/adalflow/components/agent/react.py index c5e3d5977..d8295981b 100644 --- a/adalflow/adalflow/components/agent/react.py +++ b/adalflow/adalflow/components/agent/react.py @@ -559,7 +559,7 @@ def _run_one_step( ) log.debug( - f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" + f"Running step {step} with prompt: {self.planner.get_prompt(**prompt_kwargs)}" ) try: @@ -682,7 +682,8 @@ def _get_answer( step_history=step_history, id=last_step.data_id, react_agent_task_desc=self.planner.prompt_kwargs[ - "react_agent_task_desc" + # "react_agent_task_desc" + "task_desc" ], ) @@ -720,7 +721,7 @@ def _is_step_output_last_step(self, step_output: StepOutput) -> bool: def bicall( self, input: str, # open up to the external - promt_kwargs: Optional[Dict] = {}, + prompt_kwargs: Optional[Dict] = {}, model_kwargs: Optional[Dict] = {}, id: Optional[str] = None, ) -> Union["Parameter", ReActOutput]: @@ -729,7 +730,7 @@ def bicall( # set up the prompts prompt_kwargs = { - **promt_kwargs, + **prompt_kwargs, "input_str": input, } diff --git a/adalflow/adalflow/components/agent/runner.py b/adalflow/adalflow/components/agent/runner.py index 017835c2b..41aee019e 
100644 --- a/adalflow/adalflow/components/agent/runner.py +++ b/adalflow/adalflow/components/agent/runner.py @@ -20,10 +20,12 @@ from typing_extensions import TypeAlias -from adalflow.optim.parameter import Parameter +from adalflow.optim.parameter import Parameter, ParameterType, OutputParameter from adalflow.utils import printc from adalflow.core.component import Component from adalflow.components.agent.agent import Agent +from adalflow.optim.function import BackwardContext + from adalflow.core.types import ( GeneratorOutput, @@ -53,6 +55,7 @@ response_span, step_span, ) +from adalflow.optim.grad_component import GradComponent __all__ = ["Runner"] @@ -68,9 +71,26 @@ ] # Replace with your actual Adalflow dataclass type if available +class CombineStepHistoryAndRunnerResult(GradComponent): + def __init__(self): + super().__init__(desc="Extract the final answer from the step history.") + + def call( + self, + step_history: List[StepOutput], + runner_result: RunnerResult, + task_desc: str, # skip connection + id: Optional[str] = None, + ) -> str: + if not runner_result: + return "" + # answer = step_history[-1].observation + answer = runner_result.answer + return answer + # The runner will create tool call request, add a unique call id. # TODO: move this to repo adalflow/agent -class Runner(Component): +class Runner(GradComponent): """Executes Agent instances with multi-step iterative planning and tool execution. The Runner orchestrates the execution of an Agent through multiple reasoning and action @@ -106,6 +126,7 @@ def __init__( max_steps: Optional[int] = None, # this will overwrite the agent's max_steps permission_manager: Optional[PermissionManager] = None, conversation_memory: Optional[ConversationMemory] = None, + training: Optional[bool] = False, **kwargs, ) -> None: """Initialize runner with an agent and configuration. 
@@ -118,7 +139,9 @@ def __init__( permission_manager: Optional permission manager for tool approval conversation_memory: Optional conversation memory """ - super().__init__(**kwargs) + Component.__init__(self, **kwargs) + GradComponent.__init__(self, desc="Generate a response using LLM model.", **kwargs) + # creates a backward engine if it is not passed in the kwargs self.agent = agent self.tool_manager = agent.tool_manager self.permission_manager = permission_manager @@ -137,6 +160,7 @@ def __init__( else: # overwrite the agent's max_steps self.agent.max_steps = max_steps + self.answer_data_type = agent.answer_data_type or str self.step_history: List[StepOutput] = [] @@ -153,6 +177,11 @@ def __init__( self._cancel_callbacks = [] self._current_task = None # Track the current running task self._current_streaming_result = None # Track the current streaming result + self.training = training + + # combine step history + self.combine_step_history_and_runner_result = CombineStepHistoryAndRunnerResult() + def _init_permission_manager(self): """Initialize the permission manager and register tools that require approval.""" @@ -181,8 +210,6 @@ def set_permission_manager( if permission_manager is not None: permission_manager.set_tool_manager(self.tool_manager) - - def is_cancelled(self) -> bool: """Check if execution has been cancelled.""" return self._cancelled @@ -204,15 +231,19 @@ async def cancel(self) -> None: self._cancelled = True # Try to emit a test event if we have a streaming result - if hasattr(self, '_current_streaming_result') and self._current_streaming_result: + if ( + hasattr(self, "_current_streaming_result") + and self._current_streaming_result + ): try: cancel_received_event = RunItemStreamEvent( name="runner.cancel_received", item=FinalOutputItem( data={ - "status": "cancel_received", - "message": "Cancel request received", - }) + "status": "cancel_received", + "message": "Cancel request received", + } + ), ) self._current_streaming_result.put_nowait(cancel_received_event) log.info("Emitted cancel_received event") @@ -232,10 +263,7 @@ async def _wait_for_cancellation(self): if self._current_task: try: # Wait up to 1 second for task to cancel gracefully - await asyncio.wait_for( - self._current_task, - timeout=1.0 - ) + await asyncio.wait_for(self._current_task, timeout=1.0) except (asyncio.TimeoutError, asyncio.CancelledError): # Task didn't cancel in time or was cancelled - that's ok pass @@ -253,16 +281,69 @@ def _get_final_answer(self, function: Function) -> Any: return self._process_data(function._answer) return None - - def _create_runner_result(self, answer: Any, step_history, error: Optional[str] = None, ) -> RunnerResult: + def _create_runner_result( + self, + answer: Any, + step_history, + error: Optional[str] = None, + id: Optional[str] = None, + ) -> RunnerResult: """Create a RunnerResult object with the final answer and error.""" return RunnerResult( answer=answer, step_history=step_history.copy(), error=error, + id=id, # ctx=self.ctx, ) - def _create_execution_complete_stream_event(self, streaming_result: RunnerStreamingResult, final_output_item: FinalOutputItem): + + def _create_trainable_runner_result( + self, + runner_result: RunnerResult, + name: str, + previous_output: Parameter, + prompt_kwargs: Dict[str, Any], + id: Optional[str] = None, + ) -> OutputParameter: + """Helper function to create a trainable OutputParameter with proper grad_fn setup. 
+ + Args: + runner_result: The RunnerResult data + name: Name for the parameter + previous_output: The predecessor parameter to link to + prompt_kwargs: Original prompt arguments for gradient computation + id: Optional identifier + + Returns: + OutputParameter with grad_fn properly configured + """ + runner_result_parameter = OutputParameter( + name=name, + data=runner_result, + param_type=ParameterType.OUTPUT, + requires_opt=True, + ) + + runner_result_parameter.set_predecessors([previous_output]) + + runner_result_parameter.set_grad_fn( + BackwardContext( + backward_fn=self.backward, + backward_engine=self.backward_engine, + response=runner_result_parameter, + prompt_kwargs=prompt_kwargs, + template=self.agent.planner.template, + prompt_str=self.agent.planner.get_prompt(**prompt_kwargs), + disable_backward_engine=self._disable_backward_engine, + id=id, + ) + ) + + return runner_result_parameter + + def _create_execution_complete_stream_event( + self, streaming_result: RunnerStreamingResult, final_output_item: FinalOutputItem + ): """Complete the streaming execution by adding a sentinel.""" final_output_event = RunItemStreamEvent( name="agent.execution_complete", @@ -289,8 +370,14 @@ def _add_assistant_response_to_memory(self, final_output_item: FinalOutputItem): ) ) - def create_response_span(self, runner_result, step_count: int, streaming_result: RunnerStreamingResult, runner_span_instance, workflow_status: str = "stream_completed"): - + def create_response_span( + self, + runner_result, + step_count: int, + streaming_result: RunnerStreamingResult, + runner_span_instance, + workflow_status: str = "stream_completed", + ): runner_span_instance.span_data.update_attributes( { "steps_executed": step_count + 1, @@ -313,15 +400,13 @@ def create_response_span(self, runner_result, step_count: int, streaming_result: ): pass - - - async def _process_stream_final_step( self, answer: Any, step_count: int, streaming_result, - runner_span_instance + runner_span_instance, + id: Optional[str] = None, ) -> FinalOutputItem: """Process the final step and trace it.""" # processed_data = self._get_final_answer(function) @@ -332,6 +417,7 @@ async def _process_stream_final_step( runner_result = self._create_runner_result( answer=answer, step_history=self.step_history, + id=id, ) # Update runner span with final results @@ -401,7 +487,9 @@ def _process_data( except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON in data: {e}") if not isinstance(data, dict): - raise ValueError(f"Expected dict after JSON parsing, got {type(data)}") + raise ValueError( + f"Expected dict after JSON parsing, got {type(data)}" + ) log.info( f"initial answer after being evaluated using json: {data}, type: {type(data)}" ) @@ -458,7 +546,7 @@ def call( model_kwargs: Optional[Dict[str, Any]] = None, use_cache: Optional[bool] = None, id: Optional[str] = None, # global run id - ) -> RunnerResult: + ) -> Union[RunnerResult, OutputParameter]: """Execute the planner synchronously for multiple steps with function calling support. 
At the last step the action should be set to "finish" instead which terminates the sequence @@ -559,6 +647,7 @@ def call( last_output = RunnerResult( answer=processed_data, step_history=self.step_history.copy(), + id=id, # ctx=self.ctx, ) @@ -690,8 +779,182 @@ def call( answer=current_error or f"No output generated after {step_count} steps (max_steps: {self.max_steps})", step_history=self.step_history.copy(), error=current_error, + id=id, ) + # trainable version of Runner call + def forward( + self, + prompt_kwargs: Dict[str, Any], + model_kwargs: Optional[Dict[str, Any]] = None, + use_cache: Optional[bool] = None, + id: Optional[str] = None, + ) -> Parameter: + """Trainable version of call that chains generator outputs and returns a trainable Parameter. + + Each generator output becomes a trainable predecessor for the next one, and the final + result is wrapped in a trainable Parameter. + """ + # Initialize tracking variables + self.agent.planner.training = True + + previous_output = None + step_count = 0 + self.step_history = [] + + # Set up the initial prompt + prompt_kwargs = prompt_kwargs.copy() if prompt_kwargs else {} + prompt_kwargs["step_history"] = self.step_history + + if self.use_conversation_memory: + self.conversation_memory.reset_pending_query() + prompt_kwargs["chat_history_str"] = self.conversation_memory() + query_metadata = {"context_str": prompt_kwargs.get("context_str", None)} + self.conversation_memory.add_user_query( + UserQuery( + query_str=prompt_kwargs.get("input_str", None), + metadata=query_metadata, + ) + ) + + prompt_kwargs["max_steps"] = self.max_steps + model_kwargs = model_kwargs.copy() if model_kwargs else {} + + # Main training loop + while step_count < self.max_steps: + try: + # Get the next generator output + current_output = self.agent.planner.forward( + prompt_kwargs=prompt_kwargs, + model_kwargs=model_kwargs, + # use_cache=use_cache, # TODO no use_cache method for forward + id=id, + # predecessor=previous_output # Chain to previous output + ) + printc( + f"agent planner prompt call: {self.agent.planner.get_prompt(**prompt_kwargs)}" + ) + + + # If this is the first step, we need to create a trainable parameter + trainable_output = current_output + trainable_output.name = f"step_{step_count}" + trainable_output.requires_opt = True + if previous_output is not None: + # Chain to previous trainable output + # TODO make cleaner + if trainable_output.predecessors is None: + trainable_output.predecessors = set() + trainable_output.predecessors.add(previous_output) + + previous_output = trainable_output + + generator_output = current_output.data + + printc(f"planner output: {generator_output}") + + # Process the function call if needed + if hasattr(generator_output, 'data') and isinstance(generator_output.data, Function): + function = generator_output.data + function.id = str(uuid.uuid4()) + + # Handle final step + if self._check_last_step(function): + printc(f"Function answer: {function._answer}") + processed_data = self._process_data(function._answer) + printc(f"Final processed_data: {processed_data}") + runner_result = RunnerResult( + answer=processed_data, + step_history=self.step_history.copy(), + id=id, + ) + return self._create_trainable_runner_result( + runner_result=runner_result, + name="Runner Result", + previous_output=previous_output, + prompt_kwargs=prompt_kwargs, + id=id, + ) + + # return self.combine_step_history_and_runner_result( + # step_history=self.step_history, + # runner_result=runner_result, + # id=id, + # 
task_desc=self.agent.planner.prompt_kwargs["task_desc"], + # ) + + # Execute the function and update context + function_results = self._tool_execute_sync(function) + step_output = StepOutput( + step=step_count, + action=function, + function=function, + observation=function_results.output.observation if hasattr(function_results.output, 'observation') else function_results.output, + ) + + self.step_history.append(step_output) + prompt_kwargs["step_history"] = self.step_history + + # Update for next iteration + step_count += 1 + + except Exception as e: + error_msg = f"Error in step {step_count}: {str(e)}" + log.error(error_msg) + runner_result = RunnerResult( + answer=error_msg, + step_history=self.step_history.copy(), + error=error_msg, + id=id, + ) + return self._create_trainable_runner_result( + runner_result=runner_result, + name="Runner Error", + previous_output=previous_output, + prompt_kwargs=prompt_kwargs, + id=id, + ) + # return self.combine_step_history_and_runner_result( + # step_history=self.step_history, + # runner_result=runner_result, + # id=id, + # task_desc=self.agent.planner.prompt_kwargs["task_desc"], + # ) + + # If we exit the loop without a final result + runner_result = RunnerResult( + answer=f"Max steps ({self.max_steps}) reached without completion", + step_history=self.step_history.copy(), + error="Max steps reached", + id=id, + ) + + runner_result_parameter = self._create_trainable_runner_result( + runner_result=runner_result, + name="Runner Incomplete", + previous_output=previous_output, + prompt_kwargs=prompt_kwargs, + id=id, + ) + + return runner_result_parameter + + # return self.combine_step_history_and_runner_result( + # step_history=step_history, + # runner_result=runner_result, + # id=id, + # task_desc=self.agent.planner.prompt_kwargs["task_desc"], + # ) + + + def __call__(self, *args, **kwargs) -> Union[Parameter, RunnerResult]: + if self.training: + log.debug("Training mode") + return self.forward(*args, **kwargs) + else: + log.debug("Inference mode") + return self.call(*args, **kwargs) + def _tool_execute_sync( self, func: Function, @@ -843,6 +1106,7 @@ async def acall( last_output = RunnerResult( answer=processed_data, step_history=self.step_history.copy(), + id=id, # ctx=self.ctx, ) @@ -980,6 +1244,7 @@ async def acall( answer=current_error or f"No output generated after {step_count} steps (max_steps: {self.max_steps})", step_history=self.step_history.copy(), error=current_error, + id=id, ) def astream( @@ -1212,6 +1477,7 @@ async def impl_astream( step_count=step_count, streaming_result=streaming_result, runner_span_instance=runner_span_instance, + id=id, ) stop_the_loop = True workflow_status = "stream_completed" @@ -1361,6 +1627,7 @@ async def impl_astream( answer=f"No output generated after {step_count} steps (max_steps: {self.max_steps})", error=current_error, step_history=self.step_history.copy(), + id=id, ) final_output_item = FinalOutputItem(data=runner_result) @@ -1383,6 +1650,7 @@ async def impl_astream( answer=final_output_item.data.answer if final_output_item.data else None, step_history=self.step_history.copy(), error=current_error, + id=id, ) self._create_execution_complete_stream_event( diff --git a/adalflow/adalflow/components/agent/runner_trainer.py b/adalflow/adalflow/components/agent/runner_trainer.py new file mode 100644 index 000000000..43c606734 --- /dev/null +++ b/adalflow/adalflow/components/agent/runner_trainer.py @@ -0,0 +1,219 @@ +from re import A +from typing import Any, Callable, Dict, Optional, Tuple, List +import adalflow 
as adal
+from adalflow.datasets.types import GSM8KData as Example
+from adalflow.datasets.gsm8k import GSM8K
+from adalflow.eval.answer_match_acc import AnswerMatchAcc
+from adalflow.core.types import RunnerResult
+from adalflow.components.agent.react import ReActAgent
+from adalflow.core.func_tool import FunctionTool
+from adalflow.utils import printc
+
+
+def load_datasets():
+    train_data = GSM8K(split="train", size=100)
+    val_data = GSM8K(split="val", size=50)
+    test_data = GSM8K(split="test", size=100)
+    return train_data, val_data, test_data
+
+
+def default_eval_fn(y: str, y_gt: str) -> float:
+    """Default evaluation function that performs an exact string match."""
+    return 1.0 if str(y).strip() == str(y_gt).strip() else 0.0
+
+
+def default_loss_fn_factory(eval_fn: Callable = None):
+    """Factory function to create the default loss function."""
+    if eval_fn is None:
+        eval_fn = default_eval_fn
+    return adal.EvalFnToTextLoss(
+        eval_fn=eval_fn,
+        eval_fn_desc="Default evaluation: 1 if str(y) == str(y_gt) else 0",
+    )
+
+
+class RunnerTrainer(adal.AdalComponent):
+    """Generic trainer that accepts a Runner task and allows custom eval_fn and loss_fn."""
+
+    def __init__(
+        self,
+        runner: adal.Component,
+        eval_fn: Optional[Callable] = None,
+        loss_fn: Optional[Callable] = None,
+        backward_engine_model_config: Optional[Dict] = None,
+        teacher_model_config: Optional[Dict] = None,
+        text_optimizer_model_config: Optional[Dict] = None,
+        original_react_agent: bool = False,
+    ):
+        # Use the provided eval_fn or fall back to the default
+        if eval_fn is None:
+            eval_fn = default_eval_fn
+
+        # Use the provided loss_fn or fall back to the default
+        if loss_fn is None:
+            loss_fn = default_loss_fn_factory(eval_fn)
+
+        super().__init__(task=runner, eval_fn=eval_fn, loss_fn=loss_fn)
+
+        self.backward_engine_model_config = backward_engine_model_config
+        self.teacher_model_config = teacher_model_config
+        self.text_optimizer_model_config = text_optimizer_model_config
+        self.original_react_agent = original_react_agent
+
+    def prepare_task(self, sample: Example) -> Tuple[Callable, Dict[str, Any]]:
+        # Runner.call expects a prompt_kwargs dict, not direct keyword args.
+        # Bind __call__ so the Runner dispatches to forward() in training mode
+        # and to call() at inference.
+        if self.original_react_agent:
+            return self.task.__call__, {"input": sample.question, "id": sample.id}
+        return self.task.__call__, {"prompt_kwargs": {"input_str": sample.question}, "id": sample.id}
+
+    def prepare_eval(
+        self, sample: Example, y_pred: RunnerResult
+    ) -> Tuple[Callable, Dict[str, Any]]:
+        y_label = ""
+
+        if not self.original_react_agent and y_pred is not None and y_pred.answer is not None:
+            y_label = y_pred.answer
+        elif self.original_react_agent:
+            y_label = y_pred
+        return self.eval_fn, {"y": y_label, "y_gt": sample.answer}
+
+    def prepare_loss(
+        self, sample: Example, pred: adal.Parameter
+    ) -> Tuple[Callable, Dict[str, Any]]:
+        print("pred", pred)
+        y_gt = adal.Parameter(
+            name="y_gt",
+            data=sample.answer,
+            eval_input=sample.answer,
+            requires_opt=False,
+        )
+
+        # The plain ReActAgent returns a string; the Runner returns a RunnerResult.
+        if self.original_react_agent:
+            pred.eval_input = pred.data
+        else:
+            pred.eval_input = pred.data.answer
+
+        return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id}
+
+
+def train_original_react():
+    from adalflow.utils import setup_env
+
+    setup_env()
+    train_data, val_data, test_data = load_datasets()
+
+    # Create Anthropic
client configuration instead of GPT-3
+    anthropic_config = {
+        "model_client": adal.AnthropicAPIClient(),
+        "model_kwargs": {
+            "model": "claude-3-5-sonnet-20241022",
+            # "max_tokens": 2000,
+            "temperature": 0.0,
+        }
+    }
+
+    # A plain generator reused as an LLM-backed calculator tool
+    llm_tool = adal.Generator(**anthropic_config)
+
+    def llm_as_tool(input: str, id: Optional[str] = None) -> str:
+        """Used as a calculator tool."""
+        printc(f"llm_as_tool: {input}", color="yellow")
+
+        return llm_tool(prompt_kwargs={"input_str": input}, id=id).data
+
+    react_agent = ReActAgent(
+        tools=[FunctionTool(llm_as_tool)],
+        max_steps=6,
+        add_llm_as_fallback=True,
+        **anthropic_config
+    )
+
+    # Create specific eval_fn and loss_fn like in GSM8K train.py
+    eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item
+    loss_fn = adal.EvalFnToTextLoss(
+        eval_fn=eval_fn,
+        eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0",
+    )
+
+    adal_component = RunnerTrainer(
+        runner=react_agent,
+        eval_fn=eval_fn,
+        loss_fn=loss_fn,
+        backward_engine_model_config=anthropic_config,
+        text_optimizer_model_config=anthropic_config,
+        original_react_agent=True,
+    )
+    trainer = adal.Trainer(
+        adaltask=adal_component,
+        strategy="random",
+        max_steps=10,
+        text_optimizers_config_kwargs={"max_past_history": 5},
+    )
+    trainer.fit(
+        train_dataset=train_data,
+        val_dataset=val_data,
+        test_dataset=test_data,
+        debug=False,
+    )
+
+def train():
+    from adalflow.utils import setup_env
+
+    setup_env()
+    train_data, val_data, test_data = load_datasets()
+
+    # Create Anthropic client configuration instead of GPT-3
+    anthropic_config = {
+        "model_client": adal.AnthropicAPIClient(),
+        "model_kwargs": {
+            "model": "claude-3-5-sonnet-20241022",
+            # "max_tokens": 2000,
+            "temperature": 0.0,
+        }
+    }
+
+    # Create the Agent first, then the Runner
+    agent = adal.Agent(
+        name="gsm8k_agent",
+        add_llm_as_fallback=True,
+        max_steps=6,
+        **anthropic_config
+    )
+
+    runner = adal.Runner(agent=agent, training=True)
+
+    # Create specific eval_fn and loss_fn like in GSM8K train.py
+    eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item
+    loss_fn = adal.EvalFnToTextLoss(
+        eval_fn=eval_fn,
+        eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0",
+    )
+
+    adal_component = RunnerTrainer(
+        runner=runner,
+        eval_fn=eval_fn,
+        loss_fn=loss_fn,
+        backward_engine_model_config=anthropic_config,
+        text_optimizer_model_config=anthropic_config,
+    )
+    trainer = adal.Trainer(
+        adaltask=adal_component,
+        strategy="random",
+        max_steps=10,
+        # resume_from_ckpt="./Users/jinhakim/.adalflow/ckpt/RunnerTrainer/random_max_steps_5_2bd11_run_1.json",
+        text_optimizers_config_kwargs={"max_past_history": 5},
+    )
+    trainer.fit(
+        train_dataset=train_data,
+        val_dataset=val_data,
+        test_dataset=test_data,
+        debug=False,
+        # resume_from_ckpt="./Users/jinhakim/.adalflow/ckpt/RunnerTrainer/random_max_steps_2_cc5fb_run_1.json",
+    )
+
+    # TODO: feedback, visualization
+
+
+if __name__ == "__main__":
+    # train()
+    train_original_react()
\ No newline at end of file
diff --git a/adalflow/adalflow/core/functional.py b/adalflow/adalflow/core/functional.py
index 154925c84..8a60a782f 100644
--- a/adalflow/adalflow/core/functional.py
+++ b/adalflow/adalflow/core/functional.py
@@ -83,8 +83,17 @@ def _asdict_inner(obj, dict_factory, exclude):
             )
             for k, v in obj.items()
         )
+    elif isinstance(obj, set):
+        # Handle sets specifically - preserve them as sets
+        return type(obj)(_asdict_inner(v, dict_factory, exclude) for v in obj)
     else:
-        return obj
+        # Check if the object is JSON serializable
+        try:
+            json.dumps(obj)
+            return obj
+        except (TypeError, ValueError):
+            # If not JSON serializable, convert to string
representation + return str(obj) # return deepcopy(obj) diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index ef1715786..4def444cd 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -162,6 +162,7 @@ def __init__( CachedEngine.__init__(self, cache_path=self.cache_path) Component.__init__(self) + # creates a backward engine if none is called GradComponent.__init__(self, desc="Generate a response using LLM model.") CallbackManager.__init__(self) @@ -739,7 +740,7 @@ def data_to_prompt_map_fn(data: Parameter) -> str: # **** end of the special to the generator **** # if not self.backward_engine: - # # self.set_backward_engine() + # self.set_backward_engine() # log.debug(f"Backward engine: {self.backward_engine}") # attach a funtion to compute gradient for predecessors @@ -778,7 +779,7 @@ def backward( log.debug( f"backward pass setup: {backward_pass_setup}, name: {self.name}", color="red", - ) + ) children_params = response.predecessors is_intermediate_node = True diff --git a/adalflow/adalflow/core/types.py b/adalflow/adalflow/core/types.py index 97a7b8ba3..779a36e9e 100644 --- a/adalflow/adalflow/core/types.py +++ b/adalflow/adalflow/core/types.py @@ -1311,6 +1311,10 @@ class RunnerResult: metadata={"desc": "The context of the execution"}, default=None, ) + id: Optional[str] = field( + metadata={"desc": "The id of the execution"}, + default=None, + ) @dataclass diff --git a/adalflow/adalflow/datasets/hotpot_qa.py b/adalflow/adalflow/datasets/hotpot_qa.py index afec6cff1..c809ee72e 100644 --- a/adalflow/adalflow/datasets/hotpot_qa.py +++ b/adalflow/adalflow/datasets/hotpot_qa.py @@ -11,7 +11,6 @@ from adalflow.core.base_data_class import DataClass from adalflow.datasets.types import HotPotQAData - class HotPotQA(Dataset): def __init__( self, diff --git a/adalflow/adalflow/optim/parameter.py b/adalflow/adalflow/optim/parameter.py index 7fb68ddd4..1275a3185 100644 --- a/adalflow/adalflow/optim/parameter.py +++ b/adalflow/adalflow/optim/parameter.py @@ -865,6 +865,8 @@ def draw_interactive_html_graph( # Add nodes to the graph node_ids = set() + if nodes is None: + nodes = [] for node in nodes: self.generate_node_html(node, output_dir=filepath) diff --git a/adalflow/tests/test_runner.py b/adalflow/tests/test_runner.py index 39f24051a..58d457aaf 100644 --- a/adalflow/tests/test_runner.py +++ b/adalflow/tests/test_runner.py @@ -816,6 +816,329 @@ def test_conversation_memory_clear_streaming(self): asyncio.run(self._test_conversation_memory_clear_streaming()) +class TestRunnerForward(unittest.TestCase): + """Tests for the Runner.forward method (trainable mode).""" + + def setUp(self): + """Set up test fixtures.""" + self.runner = Runner( + agent=DummyAgent( + planner=None, answer_data_type=None, tool_manager=MockToolManager() + ) + ) + + def test_forward_single_step_final_answer(self): + """Test forward method with single step that produces final answer.""" + from adalflow.optim.parameter import Parameter + + fn = DummyFunction(name="answer_output", _is_answer_final=True, _answer="forward_answer") + + class TrainablePlanner: + def __init__(self): + self.training = False + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + # Return a Parameter wrapping GeneratorOutput + param = Parameter( + name="planner_output", + data=GeneratorOutput(data=fn), + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=TrainablePlanner(), + 
answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent) + + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + # Should return OutputParameter with RunnerResult + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + self.assertIsInstance(result.data, RunnerResult) + self.assertEqual(result.data.answer, "forward_answer") + self.assertTrue(result.requires_opt) + + def test_forward_multi_step_execution(self): + """Test forward method with multiple steps before final answer.""" + from adalflow.optim.parameter import Parameter + + fn1 = DummyFunction(name="search", _is_answer_final=False) + fn2 = DummyFunction(name="answer_output", _is_answer_final=True, _answer="multi_step_answer") + + class MultiStepTrainablePlanner: + def __init__(self): + self.training = False + self.call_count = 0 + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + self.call_count += 1 + if self.call_count == 1: + data = GeneratorOutput(data=fn1) + else: + data = GeneratorOutput(data=fn2) + + param = Parameter( + name=f"planner_output_{self.call_count}", + data=data, + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=MultiStepTrainablePlanner(), + answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent) + + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + # Should return OutputParameter with RunnerResult + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + self.assertIsInstance(result.data, RunnerResult) + self.assertEqual(result.data.answer, "multi_step_answer") + self.assertEqual(len(runner.step_history), 1) # Only non-final step in history + + def test_forward_parameter_chaining(self): + """Test that forward method properly chains parameters.""" + from adalflow.optim.parameter import Parameter + + fn = DummyFunction(name="answer_output", _is_answer_final=True, _answer="chained_answer") + + class ChainingPlanner: + def __init__(self): + self.training = False + self.call_count = 0 + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + self.call_count += 1 + param = Parameter( + name=f"step_{self.call_count}", + data=GeneratorOutput(data=fn), + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=ChainingPlanner(), + answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent) + + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + # Should return OutputParameter + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + + # Should have gradient function set up + self.assertIsNotNone(result.grad_fn) + self.assertTrue(result.requires_opt) + + def test_forward_error_handling(self): + """Test forward method handles errors gracefully.""" + from adalflow.optim.parameter import Parameter + + class ErrorPlanner: + def __init__(self): + self.training = False + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + raise ValueError("Planning error") + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=ErrorPlanner(), + answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent) + + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + # Should return 
OutputParameter with error in RunnerResult + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + self.assertIsInstance(result.data, RunnerResult) + self.assertIn("Error in step 0", result.data.answer) + self.assertIsNotNone(result.data.error) + + def test_forward_max_steps_reached(self): + """Test forward method when max steps are reached without final answer.""" + from adalflow.optim.parameter import Parameter + + fn = DummyFunction(name="continue", _is_answer_final=False) + + class ContinuousPlanner: + def __init__(self): + self.training = False + self.call_count = 0 + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + self.call_count += 1 + param = Parameter( + name=f"step_{self.call_count}", + data=GeneratorOutput(data=fn), + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=ContinuousPlanner(), + answer_data_type=None, + tool_manager=MockToolManager(), + max_steps=2 + ) + runner = Runner(agent=agent) + + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + # Should return OutputParameter with incomplete message + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + self.assertIsInstance(result.data, RunnerResult) + self.assertIn("Max steps (2) reached", result.data.answer) + self.assertEqual(len(runner.step_history), 2) + + def test_forward_training_mode_flag(self): + """Test that forward method sets training mode on planner.""" + from adalflow.optim.parameter import Parameter + + fn = DummyFunction(name="answer_output", _is_answer_final=True, _answer="training_test") + + class TrainingAwarePlanner: + def __init__(self): + self.training = False + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + # Should be called in training mode + assert self.training == True, "Planner should be in training mode" + param = Parameter( + name="planner_output", + data=GeneratorOutput(data=fn), + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=TrainingAwarePlanner(), + answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent) + + # This should set planner.training = True and not raise assertion error + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + + def test_forward_with_conversation_memory(self): + """Test forward method with conversation memory.""" + from adalflow.optim.parameter import Parameter + from adalflow.components.memory.memory import ConversationMemory + + fn = DummyFunction(name="answer_output", _is_answer_final=True, _answer="memory_test") + captured_prompt_kwargs = [] + + class MemoryPlanner: + def __init__(self): + self.training = False + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + captured_prompt_kwargs.append(prompt_kwargs.copy()) + param = Parameter( + name="planner_output", + data=GeneratorOutput(data=fn), + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + memory = ConversationMemory() + agent = DummyAgent( + planner=MemoryPlanner(), + answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent, conversation_memory=memory) + + result = runner.forward(prompt_kwargs={"input_str": "Hello"}) + + # Should have 
chat_history_str in prompt_kwargs + self.assertEqual(len(captured_prompt_kwargs), 1) + self.assertIn("chat_history_str", captured_prompt_kwargs[0]) + self.assertEqual(captured_prompt_kwargs[0]["chat_history_str"], "") # Empty on first call + + def test_forward_predecessor_chaining(self): + """Test that forward method properly chains predecessors across steps.""" + from adalflow.optim.parameter import Parameter + + fn1 = DummyFunction(name="search", _is_answer_final=False) + fn2 = DummyFunction(name="answer_output", _is_answer_final=True, _answer="predecessor_test") + + class PredecessorPlanner: + def __init__(self): + self.training = False + self.call_count = 0 + + def forward(self, *, prompt_kwargs, model_kwargs=None, id=None): + self.call_count += 1 + if self.call_count == 1: + data = GeneratorOutput(data=fn1) + else: + data = GeneratorOutput(data=fn2) + + param = Parameter( + name=f"step_{self.call_count}", + data=data, + requires_opt=True, + ) + return param + + def get_prompt(self, **kwargs): + return "test prompt" + + agent = DummyAgent( + planner=PredecessorPlanner(), + answer_data_type=None, + tool_manager=MockToolManager() + ) + runner = Runner(agent=agent) + + result = runner.forward(prompt_kwargs={"input_str": "test"}) + + # The final result should have predecessors set up + from adalflow.optim.parameter import OutputParameter + self.assertIsInstance(result, OutputParameter) + self.assertTrue(len(result.predecessors) > 0) # Should have predecessor + + class TestRunnerBugFixes(unittest.TestCase): """Tests for specific bugs that were found and fixed in the runner implementation.""" diff --git a/adalflow/tests/test_runner_trainer.py b/adalflow/tests/test_runner_trainer.py new file mode 100644 index 000000000..75f3381a0 --- /dev/null +++ b/adalflow/tests/test_runner_trainer.py @@ -0,0 +1,650 @@ +import unittest +import unittest.mock +from unittest.mock import Mock, MagicMock, patch +from types import SimpleNamespace +from typing import Dict, Any, Tuple, Callable + +import adalflow as adal +from adalflow.components.agent.runner_trainer import ( + RunnerTrainer, + load_datasets, + default_eval_fn, + default_loss_fn_factory, +) +from adalflow.datasets.types import GSM8KData as Example +from adalflow.core.types import RunnerResult +from adalflow.optim.parameter import Parameter +from adalflow.components.agent.react import ReActAgent + + +class TestRunnerTrainer(unittest.TestCase): + """Tests for the RunnerTrainer class.""" + + def setUp(self): + """Set up test fixtures.""" + # Create mock runner + self.mock_runner = Mock() + self.mock_runner.__class__ = adal.Component + + # Create test sample + self.test_sample = Example( + id="test_1", + question="What is 2 + 2?", + answer="4" + ) + + def test_init_with_defaults(self): + """Test RunnerTrainer initialization with default parameters.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Check that defaults are set + self.assertEqual(trainer.task, self.mock_runner) + self.assertIsNotNone(trainer.eval_fn) + self.assertIsNotNone(trainer.loss_fn) + self.assertIsNone(trainer.backward_engine_model_config) + self.assertIsNone(trainer.teacher_model_config) + self.assertIsNone(trainer.text_optimizer_model_config) + self.assertFalse(trainer.original_react_agent) + + def test_init_with_custom_parameters(self): + """Test RunnerTrainer initialization with custom parameters.""" + custom_eval_fn = lambda y, y_gt: 0.8 + custom_loss_fn = Mock() + backward_config = {"model": "test"} + teacher_config = {"teacher": "test"} + text_optimizer_config = 
{"optimizer": "test"} + + trainer = RunnerTrainer( + runner=self.mock_runner, + eval_fn=custom_eval_fn, + loss_fn=custom_loss_fn, + backward_engine_model_config=backward_config, + teacher_model_config=teacher_config, + text_optimizer_model_config=text_optimizer_config, + original_react_agent=True + ) + + self.assertEqual(trainer.eval_fn, custom_eval_fn) + self.assertEqual(trainer.loss_fn, custom_loss_fn) + self.assertEqual(trainer.backward_engine_model_config, backward_config) + self.assertEqual(trainer.teacher_model_config, teacher_config) + self.assertEqual(trainer.text_optimizer_model_config, text_optimizer_config) + self.assertTrue(trainer.original_react_agent) + + def test_prepare_task_standard_runner(self): + """Test prepare_task method for standard Runner.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + task_fn, kwargs = trainer.prepare_task(self.test_sample) + + self.assertEqual(task_fn, self.mock_runner.__call__) + self.assertIn("prompt_kwargs", kwargs) + self.assertEqual(kwargs["prompt_kwargs"]["input_str"], "What is 2 + 2?") + self.assertEqual(kwargs["id"], "test_1") + + def test_prepare_task_original_react_agent(self): + """Test prepare_task method for original ReActAgent.""" + trainer = RunnerTrainer(runner=self.mock_runner, original_react_agent=True) + + task_fn, kwargs = trainer.prepare_task(self.test_sample) + + self.assertEqual(task_fn, self.mock_runner.__call__) + self.assertIn("input", kwargs) + self.assertEqual(kwargs["input"], "What is 2 + 2?") + self.assertEqual(kwargs["id"], "test_1") + + def test_prepare_eval_standard_runner(self): + """Test prepare_eval method for standard Runner.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Create mock RunnerResult + mock_result = RunnerResult(answer="4", step_history=[]) + + eval_fn, kwargs = trainer.prepare_eval(self.test_sample, mock_result) + + self.assertEqual(eval_fn, trainer.eval_fn) + self.assertEqual(kwargs["y"], "4") + self.assertEqual(kwargs["y_gt"], "4") + + def test_prepare_eval_original_react_agent(self): + """Test prepare_eval method for original ReActAgent.""" + trainer = RunnerTrainer(runner=self.mock_runner, original_react_agent=True) + + prediction = "4" + + eval_fn, kwargs = trainer.prepare_eval(self.test_sample, prediction) + + self.assertEqual(eval_fn, trainer.eval_fn) + self.assertEqual(kwargs["y"], "4") + self.assertEqual(kwargs["y_gt"], "4") + + def test_prepare_eval_none_result(self): + """Test prepare_eval method with None result.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + eval_fn, kwargs = trainer.prepare_eval(self.test_sample, None) + + self.assertEqual(eval_fn, trainer.eval_fn) + self.assertEqual(kwargs["y"], "") + self.assertEqual(kwargs["y_gt"], "4") + + def test_prepare_loss_standard_runner(self): + """Test prepare_loss method for standard Runner.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Create mock prediction parameter with RunnerResult + mock_result = RunnerResult(answer="4", step_history=[]) + mock_pred = Parameter( + name="pred", + data=mock_result, + requires_opt=True + ) + + loss_fn, kwargs = trainer.prepare_loss(self.test_sample, mock_pred) + + self.assertEqual(loss_fn, trainer.loss_fn) + self.assertIn("kwargs", kwargs) + self.assertIn("y", kwargs["kwargs"]) + self.assertIn("y_gt", kwargs["kwargs"]) + self.assertEqual(kwargs["id"], "test_1") + + # Check that pred.eval_input is set correctly + self.assertEqual(mock_pred.eval_input, "4") + + # Check y_gt parameter + y_gt = kwargs["kwargs"]["y_gt"] + self.assertIsInstance(y_gt, 
Parameter) + self.assertEqual(y_gt.data, "4") + self.assertFalse(y_gt.requires_opt) + + def test_prepare_loss_original_react_agent(self): + """Test prepare_loss method for original ReActAgent.""" + trainer = RunnerTrainer(runner=self.mock_runner, original_react_agent=True) + + # Create mock prediction parameter with string data + mock_pred = Parameter( + name="pred", + data="4", + requires_opt=True + ) + + loss_fn, kwargs = trainer.prepare_loss(self.test_sample, mock_pred) + + self.assertEqual(loss_fn, trainer.loss_fn) + + # Check that pred.eval_input is set directly to data + self.assertEqual(mock_pred.eval_input, "4") + + @patch('builtins.print') # Mock print to avoid output during tests + def test_prepare_loss_prints_pred(self, mock_print): + """Test that prepare_loss prints the prediction parameter.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + mock_result = RunnerResult(answer="4", step_history=[]) + mock_pred = Parameter( + name="pred", + data=mock_result, + requires_opt=True + ) + + trainer.prepare_loss(self.test_sample, mock_pred) + + # Verify print was called with the prediction + mock_print.assert_called_once_with("pred", mock_pred) + + def test_inheritance_from_adal_component(self): + """Test that RunnerTrainer properly inherits from AdalComponent.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + self.assertIsInstance(trainer, adal.AdalComponent) + self.assertTrue(hasattr(trainer, 'task')) + self.assertTrue(hasattr(trainer, 'eval_fn')) + self.assertTrue(hasattr(trainer, 'loss_fn')) + + +class TestRunnerTrainerHelperFunctions(unittest.TestCase): + """Tests for helper functions in runner_trainer module.""" + + def test_default_eval_fn_exact_match(self): + """Test default_eval_fn with exact matches.""" + result = default_eval_fn("hello", "hello") + self.assertEqual(result, 1.0) + + result = default_eval_fn(" hello ", "hello") + self.assertEqual(result, 1.0) # Should strip whitespace + + result = default_eval_fn(42, "42") + self.assertEqual(result, 1.0) # Should convert to string + + def test_default_eval_fn_no_match(self): + """Test default_eval_fn with non-matches.""" + result = default_eval_fn("hello", "world") + self.assertEqual(result, 0.0) + + result = default_eval_fn("42", "43") + self.assertEqual(result, 0.0) + + def test_default_loss_fn_factory_with_default_eval(self): + """Test default_loss_fn_factory with default eval function.""" + loss_fn = default_loss_fn_factory() + + self.assertIsInstance(loss_fn, adal.EvalFnToTextLoss) + # Check that it has the expected description + self.assertIn("Default evaluation", loss_fn.eval_fn_desc) + + def test_default_loss_fn_factory_with_custom_eval(self): + """Test default_loss_fn_factory with custom eval function.""" + custom_eval = lambda y, y_gt: 0.5 + loss_fn = default_loss_fn_factory(custom_eval) + + self.assertIsInstance(loss_fn, adal.EvalFnToTextLoss) + # The eval_fn should be wrapped in the loss function + self.assertIn("Default evaluation", loss_fn.eval_fn_desc) + + @patch('adalflow.components.agent.runner_trainer.GSM8K') + def test_load_datasets(self, mock_gsm8k_class): + """Test load_datasets function.""" + # Mock the GSM8K constructor + mock_train = Mock() + mock_val = Mock() + mock_test = Mock() + + def mock_gsm8k_side_effect(split, size): + if split == "train": + return mock_train + elif split == "val": + return mock_val + elif split == "test": + return mock_test + + mock_gsm8k_class.side_effect = mock_gsm8k_side_effect + + train_data, val_data, test_data = load_datasets() + + # Check that GSM8K was 
called with correct parameters + expected_calls = [ + unittest.mock.call(split="train", size=100), + unittest.mock.call(split="val", size=50), + unittest.mock.call(split="test", size=100) + ] + mock_gsm8k_class.assert_has_calls(expected_calls) + + # Check return values + self.assertEqual(train_data, mock_train) + self.assertEqual(val_data, mock_val) + self.assertEqual(test_data, mock_test) + + +class TestRunnerTrainerIntegration(unittest.TestCase): + """Integration tests for RunnerTrainer with mocked dependencies.""" + + def setUp(self): + """Set up integration test fixtures.""" + self.mock_runner = Mock(spec=adal.Component) + self.test_sample = Example( + id="integration_test", + question="What is the capital of France?", + answer="Paris" + ) + + def test_full_workflow_standard_runner(self): + """Test complete workflow with standard Runner.""" + # Create trainer + trainer = RunnerTrainer(runner=self.mock_runner) + + # Test prepare_task + task_fn, task_kwargs = trainer.prepare_task(self.test_sample) + self.assertEqual(task_fn, self.mock_runner.__call__) + + # Simulate runner execution + mock_result = RunnerResult(answer="Paris", step_history=[]) + + # Test prepare_eval + eval_fn, eval_kwargs = trainer.prepare_eval(self.test_sample, mock_result) + + # Execute evaluation + eval_score = eval_fn(**eval_kwargs) + self.assertEqual(eval_score, 1.0) # Should match exactly + + # Test prepare_loss with Parameter + mock_pred = Parameter( + name="prediction", + data=mock_result, + requires_opt=True + ) + + loss_fn, loss_kwargs = trainer.prepare_loss(self.test_sample, mock_pred) + + # Verify loss preparation + self.assertIn("kwargs", loss_kwargs) + self.assertEqual(loss_kwargs["id"], "integration_test") + + def test_full_workflow_original_react_agent(self): + """Test complete workflow with original ReActAgent.""" + # Create trainer for original ReActAgent + trainer = RunnerTrainer(runner=self.mock_runner, original_react_agent=True) + + # Test prepare_task + task_fn, task_kwargs = trainer.prepare_task(self.test_sample) + self.assertIn("input", task_kwargs) + self.assertEqual(task_kwargs["input"], "What is the capital of France?") + + # Test prepare_eval with string prediction + prediction = "Paris" + eval_fn, eval_kwargs = trainer.prepare_eval(self.test_sample, prediction) + + # Execute evaluation + eval_score = eval_fn(**eval_kwargs) + self.assertEqual(eval_score, 1.0) + + # Test prepare_loss + mock_pred = Parameter( + name="prediction", + data="Paris", + requires_opt=True + ) + + loss_fn, loss_kwargs = trainer.prepare_loss(self.test_sample, mock_pred) + + # Verify eval_input is set directly to data + self.assertEqual(mock_pred.eval_input, "Paris") + + def test_error_handling_in_eval(self): + """Test error handling in evaluation methods.""" + # Create trainer with faulty eval function + def faulty_eval(y, y_gt): + raise ValueError("Evaluation error") + + trainer = RunnerTrainer(runner=self.mock_runner, eval_fn=faulty_eval) + + mock_result = RunnerResult(answer="Test", step_history=[]) + eval_fn, eval_kwargs = trainer.prepare_eval(self.test_sample, mock_result) + + # Should propagate the error + with self.assertRaises(ValueError): + eval_fn(**eval_kwargs) + + def test_custom_configurations(self): + """Test that custom configurations are preserved.""" + backward_config = {"model_client": "test_backward"} + teacher_config = {"model_client": "test_teacher"} + text_optimizer_config = {"model_client": "test_optimizer"} + + trainer = RunnerTrainer( + runner=self.mock_runner, + 
backward_engine_model_config=backward_config, + teacher_model_config=teacher_config, + text_optimizer_model_config=text_optimizer_config + ) + + self.assertEqual(trainer.backward_engine_model_config, backward_config) + self.assertEqual(trainer.teacher_model_config, teacher_config) + self.assertEqual(trainer.text_optimizer_model_config, text_optimizer_config) + + +class TestRunnerTrainerEdgeCases(unittest.TestCase): + """Tests for edge cases and error conditions.""" + + def setUp(self): + """Set up edge case test fixtures.""" + self.mock_runner = Mock(spec=adal.Component) + + def test_empty_answer(self): + """Test handling of empty answers.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + empty_sample = Example( + id="empty_test", + question="Test question?", + answer="" + ) + + # Test with empty RunnerResult answer + mock_result = RunnerResult(answer="", step_history=[]) + eval_fn, eval_kwargs = trainer.prepare_eval(empty_sample, mock_result) + + # Should match empty string + result = eval_fn(**eval_kwargs) + self.assertEqual(result, 1.0) + + def test_none_runner_result_answer(self): + """Test handling of RunnerResult with None answer.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + test_sample = Example( + id="none_test", + question="Test question?", + answer="expected" + ) + + # Test with RunnerResult having None answer + mock_result = RunnerResult(answer=None, step_history=[]) + eval_fn, eval_kwargs = trainer.prepare_eval(test_sample, mock_result) + + # Should use empty string for None answer + self.assertEqual(eval_kwargs["y"], "") + + def test_special_characters_in_answers(self): + """Test handling of special characters in answers.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + special_sample = Example( + id="special_test", + question="What is the result?", + answer="$100.50" + ) + + mock_result = RunnerResult(answer="$100.50", step_history=[]) + eval_fn, eval_kwargs = trainer.prepare_eval(special_sample, mock_result) + + result = eval_fn(**eval_kwargs) + self.assertEqual(result, 1.0) + + def test_whitespace_normalization(self): + """Test that whitespace is properly normalized in evaluation.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + test_sample = Example( + id="whitespace_test", + question="Test?", + answer="answer" + ) + + # Test with extra whitespace + mock_result = RunnerResult(answer=" answer ", step_history=[]) + eval_fn, eval_kwargs = trainer.prepare_eval(test_sample, mock_result) + + result = eval_fn(**eval_kwargs) + self.assertEqual(result, 1.0) # Should match after stripping + + +class TestRunnerTrainerNewRunnerFeatures(unittest.TestCase): + """Tests for RunnerTrainer compatibility with new Runner features.""" + + def setUp(self): + """Set up test fixtures for new Runner features.""" + self.mock_runner = Mock(spec=adal.Component) + self.test_sample = Example( + id="new_feature_test", + question="Test with new runner?", + answer="New runner works" + ) + + def test_runner_trainer_with_training_flag(self): + """Test RunnerTrainer works with Runner that has training parameter.""" + # Create a mock runner that simulates training behavior + mock_runner = Mock(spec=adal.Component) + mock_runner.training = False + + trainer = RunnerTrainer(runner=mock_runner) + + # Test that trainer initialization works with training-capable runners + self.assertEqual(trainer.task, mock_runner) + self.assertIsNotNone(trainer.eval_fn) + self.assertIsNotNone(trainer.loss_fn) + + def test_runner_result_with_new_fields(self): + """Test handling of 
RunnerResult with potentially new fields.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Create RunnerResult with standard fields plus potential new ones + mock_result = RunnerResult( + answer="test answer", + step_history=[], + error=None, + id="test_id_123" + ) + + eval_fn, eval_kwargs = trainer.prepare_eval(self.test_sample, mock_result) + + # Should handle RunnerResult regardless of additional fields + self.assertEqual(eval_fn, trainer.eval_fn) + self.assertEqual(eval_kwargs["y"], "test answer") + self.assertEqual(eval_kwargs["y_gt"], "New runner works") + + def test_parameter_handling_with_new_runner_output(self): + """Test Parameter creation with new Runner output types.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Test with OutputParameter (which is now potentially returned by new Runner) + from adalflow.optim.parameter import OutputParameter + + # Create a mock RunnerResult + mock_result = RunnerResult(answer="output parameter test", step_history=[]) + + # Create an OutputParameter wrapping the result (as new Runner might return) + mock_output_param = OutputParameter( + name="runner_output", + data=mock_result, + requires_opt=True + ) + + # Test prepare_loss can handle OutputParameter input + loss_fn, loss_kwargs = trainer.prepare_loss(self.test_sample, mock_output_param) + + self.assertEqual(loss_fn, trainer.loss_fn) + self.assertIn("kwargs", loss_kwargs) + self.assertEqual(loss_kwargs["id"], "new_feature_test") + + # Check eval_input is set correctly for OutputParameter + self.assertEqual(mock_output_param.eval_input, "output parameter test") + + def test_runner_trainer_initialization_with_new_runner_kwargs(self): + """Test RunnerTrainer handles any new Runner constructor parameters gracefully.""" + # Test with various configurations that might be used with new Runner + configs = [ + {"training": True}, + {"conversation_memory": None}, + {"permission_manager": None}, + {"ctx": {"test": "context"}}, + ] + + for config in configs: + with self.subTest(config=config): + # Mock runner should accept any additional parameters + mock_runner = Mock(spec=adal.Component) + for key, value in config.items(): + setattr(mock_runner, key, value) + + trainer = RunnerTrainer(runner=mock_runner) + + # Should initialize successfully regardless of runner's additional features + self.assertIsNotNone(trainer.task) + self.assertIsNotNone(trainer.eval_fn) + self.assertIsNotNone(trainer.loss_fn) + + def test_step_history_handling_with_enhanced_runner(self): + """Test that step history handling works with potentially enhanced Runner.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Create mock step history with potentially new StepOutput format + mock_step = Mock() + mock_step.step = 1 + mock_step.function = Mock() + mock_step.function.name = "test_function" + mock_step.observation = "test observation" + + mock_result = RunnerResult( + answer="final answer", + step_history=[mock_step], + error=None + ) + + eval_fn, eval_kwargs = trainer.prepare_eval(self.test_sample, mock_result) + + # Should extract answer correctly regardless of step history format + self.assertEqual(eval_kwargs["y"], "final answer") + + def test_error_handling_with_new_runner_exceptions(self): + """Test error handling with potentially new Runner exception types.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Test with RunnerResult containing error information + mock_result = RunnerResult( + answer=None, + step_history=[], + error="New runner error type" + ) + + eval_fn, eval_kwargs = 
trainer.prepare_eval(self.test_sample, mock_result) + + # Should handle error cases gracefully + self.assertEqual(eval_kwargs["y"], "") # Empty string for None answer + self.assertEqual(eval_kwargs["y_gt"], "New runner works") + + def test_context_preservation_with_new_runner(self): + """Test that context and metadata are preserved with new Runner features.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Test prepare_task with additional context that new Runner might use + sample_with_metadata = Example( + id="context_test", + question="Test with context?", + answer="Context preserved" + ) + + task_fn, kwargs = trainer.prepare_task(sample_with_metadata) + + # Should preserve all necessary information for new Runner + self.assertEqual(task_fn, self.mock_runner.__call__) + self.assertIn("prompt_kwargs", kwargs) + self.assertEqual(kwargs["prompt_kwargs"]["input_str"], "Test with context?") + self.assertEqual(kwargs["id"], "context_test") + + def test_backward_compatibility_with_legacy_runner_features(self): + """Test that RunnerTrainer maintains backward compatibility.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Test with "old style" runner result + old_style_result = RunnerResult(answer="legacy answer", step_history=[]) + + eval_fn, eval_kwargs = trainer.prepare_eval(self.test_sample, old_style_result) + + # Should work exactly as before + eval_result = eval_fn(**eval_kwargs) + # This should fail since answers don't match, proving evaluation works + self.assertEqual(eval_result, 0.0) # "legacy answer" != "New runner works" + + def test_concurrent_runner_execution_support(self): + """Test that RunnerTrainer supports potential concurrent execution features.""" + trainer = RunnerTrainer(runner=self.mock_runner) + + # Test multiple samples in sequence (simulating batch processing) + samples = [ + Example(id=f"concurrent_{i}", question=f"Question {i}?", answer=f"Answer {i}") + for i in range(3) + ] + + results = [] + for sample in samples: + mock_result = RunnerResult(answer=f"Answer {sample.id.split('_')[1]}", step_history=[]) + eval_fn, eval_kwargs = trainer.prepare_eval(sample, mock_result) + result = eval_fn(**eval_kwargs) + results.append(result) + + # All should evaluate correctly + self.assertEqual(results, [1.0, 1.0, 1.0]) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py index 370ec97e5..9419a1517 100644 --- a/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py @@ -3,6 +3,10 @@ import adalflow as adal from adalflow.eval.answer_match_acc import AnswerMatchAcc from adalflow.datasets.types import HotPotQAData +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) from benchmarks.hotpot_qa.config import load_datasets from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import AgenticRAG diff --git a/docs/datasets_benchmarks.md b/docs/datasets_benchmarks.md new file mode 100644 index 000000000..f4dfa771c --- /dev/null +++ b/docs/datasets_benchmarks.md @@ -0,0 +1,226 @@ +# State-of-the-Art AI Agent Benchmark Datasets (2024-2025) + +## **SWE Benchmark Datasets** + +### **SWE-bench: Core Dataset** + +**Generic description**: An evaluation framework consisting of 2,294 software engineering problems drawn from real GitHub issues and corresponding pull requests across 12 popular Python repositories. 
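+
+To make the grading flow concrete before the step-by-step description, here is a minimal sketch of a SWE-bench-style check. The helper names, file layout, and explicit list of target tests are illustrative assumptions; the official harness runs this logic inside per-instance Docker images.
+
+```python
+# Hedged sketch: apply a candidate patch, then re-run the tests the
+# issue is supposed to fix. Paths and test IDs are hypothetical.
+import subprocess
+from typing import List
+
+
+def evaluate_patch(repo_dir: str, patch_file: str, fail_to_pass: List[str]) -> bool:
+    """Return True iff the patch applies cleanly and the target tests pass."""
+    applied = subprocess.run(
+        ["git", "-C", repo_dir, "apply", patch_file], capture_output=True
+    )
+    if applied.returncode != 0:
+        return False  # the model's patch does not even apply
+    tests = subprocess.run(
+        ["python", "-m", "pytest", *fail_to_pass],
+        cwd=repo_dir,
+        capture_output=True,
+    )
+    return tests.returncode == 0  # resolved iff the failing tests now pass
+
+
+# Hypothetical usage for one benchmark instance:
+# evaluate_patch("repos/astropy", "preds/astropy-123.patch",
+#                ["astropy/units/tests/test_units.py::test_compose"])
+```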
+
+SWE-bench evaluates how well language models can solve real-world software engineering tasks. The evaluation process involves:
+1. **Input**: Code repository and issue description
+2. **Task**: Generate a patch that resolves the described problem
+3. **Evaluation**: Test the patch against existing test suites and issue requirements
+4. **Environment**: Fully containerized evaluation using Docker for reproducible results
+
+### **Latest SWE-bench Variants (2024-2025)**
+
+#### **SWE-bench Verified (August 2024)**
+A human-validated subset of 500 high-quality test cases from the original benchmark. Each task has been carefully reviewed and validated by human experts to ensure:
+- Clear problem statements without ambiguity
+- Correct expected solutions
+- Proper test case coverage
+- Elimination of data leakage issues
+
+**Evaluation methodology**: Uses the same Docker-based evaluation harness as the original SWE-bench, but with stricter quality controls and human verification of ground-truth solutions.
+
+#### **SWE-bench Multimodal (January 2025)**
+Extends the original benchmark to include issues with visual elements and UI components. Features:
+- Screenshots and visual mockups as part of issue descriptions
+- UI/UX-related bug reports and feature requests
+- Frontend component implementation tasks
+- Visual regression testing capabilities
+
+#### **SWE-bench Live (2025)**
+A continuously updated benchmark to prevent data contamination. Key features:
+- Monthly updates with new GitHub issues
+- Automated dataset curation pipeline
+- Real-time issue collection from active repositories
+- Covers 164 repositories across diverse domains
+
+### **SWE-Lancer: Economic-Driven Benchmark (2024)**
+**Generic description**: Real freelance software engineering tasks from Upwork, providing economic context for evaluation.
+
+**Evaluation methodology**:
+- Tasks sourced from actual Upwork freelance projects
+- Economic value ranges from $50 bug fixes to $32,000 feature implementations
+- End-to-end testing using Playwright-powered browser automation
+- Includes both technical implementation and project management assessment
+- Success measured by client acceptance criteria rather than just unit tests
+
+### **Expansion Opportunities for AdalFlow**
+
+The benchmark could be expanded to include not only issues but also pull requests labeled as features, testing a coding agent's ability to implement full features rather than just bug fixes. We could start with AdalFlow itself and see whether an agent can implement core features given the design documents in Notion. Open-source libraries like PyTorch already provide detailed documentation for each feature.
+
+For these repositories, we can create a similar dataset from the base commit before each feature was merged, provide the documentation and design details, and evaluate against the tests included in the feature PR.
+
+
+## **MLE Benchmark (Kaggle Competitions)**
+
+### **MLE-bench: OpenAI's Machine Learning Engineering Benchmark (2024)**
+
+**Generic description**: A benchmark for measuring how well AI agents perform at machine learning engineering, constructed from a collection of 75 ML engineering competitions sourced from Kaggle.
+
+**Evaluation methodology** (a grading sketch follows the list):
+1. **Environment Setup**: Each competition runs in an isolated Docker container with standardized dependencies
+2. **Data Access**: Agents receive the training data and competition description and must produce a final submission
+3. **Evaluation Pipeline**:
+   - Agents have access to local evaluation tools for validation
+   - Final submissions scored against held-out test sets
+   - Performance measured using Kaggle's medal system (Bronze/Silver/Gold)
+4. **Resource Management**: Computational limits and timeouts enforced per competition
+5. **Multi-domain Testing**: Covers NLP, computer vision, signal processing, and tabular data tasks
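+
+A minimal sketch of the grading step above, assuming a hypothetical competition layout with a private answer file and illustrative medal cutoffs (the real harness uses per-competition graders and leaderboard-derived thresholds):
+
+```python
+# Hedged sketch: score a submission CSV against held-out answers and map
+# the score to a medal. File names and thresholds are hypothetical.
+import csv
+
+
+def accuracy(submission_path: str, answers_path: str) -> float:
+    """Fraction of held-out ids whose predicted label matches the truth."""
+    with open(answers_path) as f:
+        truth = {row["id"]: row["label"] for row in csv.DictReader(f)}
+    with open(submission_path) as f:
+        preds = {row["id"]: row["label"] for row in csv.DictReader(f)}
+    return sum(preds.get(k) == v for k, v in truth.items()) / len(truth)
+
+
+def medal(score: float, bronze: float, silver: float, gold: float) -> str:
+    """Translate a score into Kaggle-style medals via competition cutoffs."""
+    for name, cutoff in (("gold", gold), ("silver", silver), ("bronze", bronze)):
+        if score >= cutoff:
+            return name
+    return "none"
+
+
+# Hypothetical usage for one competition:
+# score = accuracy("workspace/submission.csv", "private/answers.csv")
+# print(medal(score, bronze=0.80, silver=0.88, gold=0.95))
+```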
+
+**Key Infrastructure**:
+- Preparation scripts split publicly available training sets into new train/test divisions
+- Each competition includes a problem description, dataset, local evaluation tools, and automated grading
+- Standardized agent interfaces across all 75 competitions
+- Reproducible evaluation with consistent environment specifications
+
+### **Latest Kaggle AI Agent Competitions (2024-2025)**
+
+#### **ARC Prize 2024 & 2025**
+**Generic description**: Evaluation of AI systems on novel reasoning tasks requiring efficient learning of new skills.
+
+**Evaluation methodology**:
+- Tests on an abstract reasoning corpus requiring pattern recognition and rule inference
+- Computational budget constraints ($50 for 120 evaluation tasks)
+- Measures both performance accuracy and efficiency per task
+- Human baseline comparison for intelligence measurement
+
+#### **Kaggle Game Arena (2024)**
+**Generic description**: Platform for head-to-head AI model competition in strategic games.
+
+**Evaluation framework**:
+- Real-time strategic game competitions (Go, poker, video games)
+- Elo rating system for skill measurement
+- Head-to-head matchmaking and tournament structures
+- Dynamic leaderboards with continuous evaluation
+
+### **Expansion Opportunities for AdalFlow**
+
+Pull Kaggle competitions, split each public dataset into training, validation, and test splits, and score model submissions using the medal system. A base Docker environment with the common dependencies can be built once per agent and reused across competitions. Cloud compute or pre-provisioned Docker environments can likewise be integrated to execute code and submit against live competitions.
+
+The agent should be able to run in a Docker container either in the cloud or locally; the cloud is preferable for parallelizing training and testing.
+
+## **Paper Implementation Datasets**
+
+### **Paper2Code: Automating Code Generation from Scientific Papers**
+
+**Link**: https://github.com/going-doer/Paper2Code
+
+**Generic description**: A benchmark that evaluates agents' ability to implement papers, which can be an important aspect of AI development.
+
+#### **Paper2CodeBench (2024)**
+**Dataset Construction Methodology** (a filtering sketch follows this subsection):
+1. **Paper Collection**: Uses the OpenReview API to gather accepted papers from ICLR, ICML, and NeurIPS 2024
+2. **Filtering Criteria**:
+   - Code availability requirement
+   - Token limit <70,000 to ensure LLM processing feasibility
+   - Quality screening using GPT-4o model-based evaluation
+3. **Final Selection**: Top 30 papers from each venue (90 total papers)
+
+**Evaluation Framework**:
+- **Input**: Research paper PDF and any supplementary materials
+- **Task**: Generate a complete, functional code repository implementing the paper's methodology
+- **Assessment**: Multi-stage evaluation including:
+  - Code functionality and correctness
+  - Implementation completeness
+  - Adherence to paper specifications
+  - Reproducibility of reported results
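+
+A sketch of the construction filter above, under the assumption of a simple in-memory paper record and a rough token estimate (the actual pipeline queries the OpenReview API and counts LLM tokens with a real tokenizer):
+
+```python
+# Hedged sketch: keep only papers with released code whose repository
+# fits the 70k-token budget. Data shapes here are hypothetical.
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class PaperRepo:
+    title: str
+    venue: str
+    repo_text: str  # concatenated source files of the released code
+    has_code: bool
+
+
+def rough_token_count(text: str) -> int:
+    # Stand-in for a real tokenizer; assumes ~4 characters per token.
+    return len(text) // 4
+
+
+def filter_candidates(papers: List[PaperRepo], max_tokens: int = 70_000) -> List[PaperRepo]:
+    """Apply the code-availability and token-budget filters."""
+    return [
+        p for p in papers
+        if p.has_code and rough_token_count(p.repo_text) < max_tokens
+    ]
+
+
+# Survivors would then be quality-screened with a model (GPT-4o in the
+# paper) and the top 30 per venue kept, yielding 90 papers in total.
+```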
+
+#### **PaperBench Code-Dev**
+**Generic description**: 20 papers from ICML 2024 with human-annotated evaluation rubrics.
+
+**Evaluation methodology**:
+- Paper-specific rubrics created by human experts
+- LLM-based evaluation using structured assessment criteria
+- Focus on implementation correctness rather than just code generation
+- Rubrics cover algorithm accuracy, code quality, and experimental reproducibility
+
+### **MLR-Bench: Machine Learning Research Benchmark (2024)**
+
+**Generic description**: A comprehensive benchmark for evaluating AI agents on open-ended machine learning research tasks.
+
+**Evaluation Infrastructure**:
+1. **MLR-Judge**: Automated evaluation framework using LLM-based reviewers with structured rubrics
+2. **MLR-Agent**: Modular agent scaffold providing a standardized interface for task completion
+3. **Multi-stage Assessment**: Research proposal, methodology, implementation, and result analysis
+
+**Task Methodology**:
+- **Task Categories**: 9 core ML topics covering diverse research areas
+- **Open-ended Format**: Tasks require creative problem-solving rather than just implementation
+- **Research Pipeline**: Full research cycle from problem formulation to experimental validation
+- **Peer Review Simulation**: LLM-based reviewers assess work quality using academic standards
+
+### **Expansion Opportunities for AdalFlow**
+
+AdalFlow could adopt the Paper2Code setting for paper implementation: collect accepted papers from recent machine learning venues (ICLR, ICML, and NeurIPS 2024) via the OpenReview API, keep only papers with released code whose repositories total fewer than 70,000 tokens so they stay within the processing limits of modern LLMs, and retain the top 30 per venue after GPT-4o model-based quality screening, for 90 papers in total.
+
+PaperBench Code-Dev's 20 ICML 2024 papers could be reused as well; their human-annotated, paper-specific rubrics drive an LLM-based judgment of implementation correctness.
+
+## **Additional State-of-the-Art Benchmarks (2024-2025)**
+
+### **Agent-Specific Benchmarks**
+
+#### **AgentBoard (NeurIPS 2024)**
+**Generic description**: Analytical evaluation board for multi-turn LLM agents focusing on complex interaction scenarios.
+
+**Evaluation methodology**:
+- Multi-turn conversation evaluation with state persistence
+- Partially observable environment maintenance across interactions
+- Long-horizon task completion assessment
+- Comprehensive evaluation across diverse agent capabilities
+
+#### **WebAgent for Real-World Web Tasks (ICLR 2024)**
+**Generic description**: LLM-driven agent evaluation on real website interaction tasks.
+
+**Evaluation framework**:
+- Real website interaction using browser automation
+- Natural language instruction parsing and execution
+- Task decomposition into canonical sub-instructions
+- HTML document processing and summarization capabilities
+- End-to-end task completion measurement
+
+#### **MetaGPT Multi-Agent Framework (ICLR 2024)**
+**Generic description**: Meta-programming framework for LLM-based multi-agent collaborations.
+
+
+ +**Evaluation approach**: +- Multi-agent coordination and communication assessment +- Workflow efficiency measurement in collaborative tasks +- Human workflow integration and optimization +- Complex project completion evaluation + +## **How AdalFlow Datasets Could Be Constructed** + +Based on the analysis of state-of-the-art benchmarks, AdalFlow could construct datasets in several key areas: + +### **1. SWE-bench Style Implementation** +**Evaluation methodology**: +- Use AdalFlow's own GitHub issues and feature requests as source material +- Create base commits before features were merged to establish clean starting points +- Provide design documents from Notion as contextual requirements +- Evaluate against existing test suites and integration tests +- Include Docker containerization for reproducible evaluation environments +- Test both bug fixes and feature implementations + +### **2. MLE-bench Integration** +**Evaluation framework**: +- Curate machine learning competitions specifically relevant to AdalFlow's optimization capabilities +- Create standardized Docker environments for reproducible evaluation across different tasks +- Focus evaluation on prompt optimization, few-shot learning, and auto-differentiation tasks +- Implement cloud compute integration for scalable evaluation infrastructure +- Develop metrics for measuring optimization efficiency alongside task performance + +### **3. Paper Implementation Focus** +**Evaluation approach**: +- Target papers related to auto-optimization, prompt engineering, and LLM workflow optimization +- Use AdalFlow's research on "Auto-Differentiating Any LLM Workflow" as baseline implementation +- Create specialized rubrics for evaluating optimization algorithm implementations +- Focus on recent LLM workflow optimization papers from top-tier venues +- Implement automated testing for optimization convergence and performance improvement diff --git a/notebooks/tutorials/adalflow_classification_optimization.ipynb b/notebooks/tutorials/adalflow_classification_optimization.ipynb index e83980d14..b95ef7957 100644 --- a/notebooks/tutorials/adalflow_classification_optimization.ipynb +++ b/notebooks/tutorials/adalflow_classification_optimization.ipynb @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -91,7 +91,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "Please enter your OpenAI API key: ··········\n", "API keys have been set.\n" ] } @@ -122,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "ZZIEtZYHNVjo" }, @@ -203,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "yAvzn7DZeUX-" }, @@ -251,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "3Q3H9XC4Ncfi" }, @@ -337,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -347,10 +346,50 @@ }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "TrecData(id='e73a82a7-6a3d-4947-90f5-03739e169db0', question='When reading classified ads , what does EENTY : other stand for ?', class_name='ABBR', class_index=0)\n" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "290486826f394c0086392b96b373d810", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + 
"README.md: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f4bd8e8cbced4ae184858046b4d670e1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "trec.py: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "RuntimeError", + "evalue": "Dataset scripts are no longer supported, but found trec.py", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# load dataset to get one example\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m train_dataset, val_dataset, test_dataset = \u001b[43mload_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m example = train_dataset[\u001b[32m0\u001b[39m]\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(example)\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 68\u001b[39m, in \u001b[36mload_datasets\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 66\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mload_datasets\u001b[39m():\n\u001b[32m 67\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load the dataset\"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m68\u001b[39m train_data = \u001b[43mTrecDataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 69\u001b[39m val_data = TrecDataset(split=\u001b[33m\"\u001b[39m\u001b[33mval\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 70\u001b[39m test_data = TrecDataset(split=\u001b[33m\"\u001b[39m\u001b[33mtest\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/trec.py:171\u001b[39m, in \u001b[36mTrecDataset.__init__\u001b[39m\u001b[34m(self, root, split)\u001b[39m\n\u001b[32m 169\u001b[39m data_path = prepare_dataset_path(\u001b[38;5;28mself\u001b[39m.root, \u001b[38;5;28mself\u001b[39m.task_name)\n\u001b[32m 170\u001b[39m \u001b[38;5;66;03m# download and save\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m171\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_check_or_download_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[38;5;66;03m# load from csv\u001b[39;00m\n\u001b[32m 173\u001b[39m \u001b[38;5;28mself\u001b[39m.data = []\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/trec.py:198\u001b[39m, in \u001b[36mTrecDataset._check_or_download_dataset\u001b[39m\u001b[34m(self, data_path, split)\u001b[39m\n\u001b[32m 195\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01muuid\u001b[39;00m\n\u001b[32m 197\u001b[39m \u001b[38;5;66;03m# prepare all the data\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m198\u001b[39m train_dataset, val_dataset, test_dataset = \u001b[43mprepare_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 199\u001b[39m 
\u001b[38;5;28mprint\u001b[39m(\n\u001b[32m 200\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtrain: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(train_dataset)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, val: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(val_dataset)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, test: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(test_dataset)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 201\u001b[39m )\n\u001b[32m 202\u001b[39m \u001b[38;5;66;03m# save to csv\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/trec.py:47\u001b[39m, in \u001b[36mprepare_datasets\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 44\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mprepare_datasets\u001b[39m():\n\u001b[32m 45\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdatasets\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[32m---> \u001b[39m\u001b[32m47\u001b[39m dataset = \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrec\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 48\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtrain: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(dataset[\u001b[33m'\u001b[39m\u001b[33mtrain\u001b[39m\u001b[33m'\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, test: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(dataset[\u001b[33m'\u001b[39m\u001b[33mtest\u001b[39m\u001b[33m'\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;66;03m# 5452, 500\u001b[39;00m\n\u001b[32m 49\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtrain example: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset[\u001b[33m'\u001b[39m\u001b[33mtrain\u001b[39m\u001b[33m'\u001b[39m][\u001b[32m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1392\u001b[39m, in \u001b[36mload_dataset\u001b[39m\u001b[34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1387\u001b[39m verification_mode = VerificationMode(\n\u001b[32m 1388\u001b[39m (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode.BASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode.ALL_CHECKS\n\u001b[32m 1389\u001b[39m )\n\u001b[32m 1391\u001b[39m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1392\u001b[39m builder_instance = \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1393\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1394\u001b[39m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m=\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1395\u001b[39m 
\u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1396\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1397\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1398\u001b[39m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1399\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1400\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1401\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1402\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1403\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[32m 1408\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1132\u001b[39m, in \u001b[36mload_dataset_builder\u001b[39m\u001b[34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1130\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m features \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1131\u001b[39m features = _fix_for_backward_compatible_features(features)\n\u001b[32m-> \u001b[39m\u001b[32m1132\u001b[39m dataset_module = \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1133\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1134\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1135\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1136\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1137\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1138\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1140\u001b[39m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1141\u001b[39m \u001b[38;5;66;03m# Get dataset builder class\u001b[39;00m\n\u001b[32m 1142\u001b[39m builder_kwargs = dataset_module.builder_kwargs\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1031\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 1026\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[32m 1027\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m 1028\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1029\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 1030\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1031\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1032\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1033\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:989\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 981\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 982\u001b[39m api.hf_hub_download(\n\u001b[32m 983\u001b[39m repo_id=path,\n\u001b[32m 984\u001b[39m filename=filename,\n\u001b[32m (...)\u001b[39m\u001b[32m 987\u001b[39m proxies=download_config.proxies,\n\u001b[32m 988\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m989\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDataset scripts are no longer supported, but found \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 990\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m EntryNotFoundError:\n\u001b[32m 991\u001b[39m \u001b[38;5;66;03m# 
Use the infos from the parquet export except in some cases:\u001b[39;00m\n\u001b[32m 992\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data_dir \u001b[38;5;129;01mor\u001b[39;00m data_files \u001b[38;5;129;01mor\u001b[39;00m (revision \u001b[38;5;129;01mand\u001b[39;00m revision != \u001b[33m\"\u001b[39m\u001b[33mmain\u001b[39m\u001b[33m\"\u001b[39m):\n", + "\u001b[31mRuntimeError\u001b[39m: Dataset scripts are no longer supported, but found trec.py" ] } ], @@ -364,7 +403,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -377,44 +416,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "Parameter(name=Generator_output, requires_opt=True, param_type=generator_output (The output of the generator.), role_desc=Output from (llm) Generator, data=```\n", - "rationale: The question is asking for the meaning of the abbreviation \"EENTY\" in classified ads, which falls under the ABBR class.\n", - "class_name: ABBR\n", - "```, predecessors={Parameter(name=Output_for, requires_opt=False, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Output format requirements, data=Your output should be formatted as a standard YAML instance with the following schema:\n", - "```\n", - "rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\n", - "class_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\n", - "```\n", - "-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", - "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", - "-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", - "-Quote the string values properly., predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Few_shot_e, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=Few shot examples to help the model, data=None, predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Input_to_t, requires_opt=False, param_type=none (), role_desc=input to the LLM, data=question: 'When reading classified ads , what does EENTY : other stand for ?', predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Task_descr, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Task description, data=You are a classifier. Given a question, you need to classify it into one of the following classes:\n", - " Format: class_index. class_name, class_description\n", - " 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\n", - " 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\n", - " 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\n", - " 3. HUM, Human: Questions about people or organizations\n", - " 4. LOC, Location: Questions about places, cities, countries\n", - " 5. 
NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\n", - " - Do not try to answer the question:\n", - " , predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={})}, gradients=[], raw_response=None, input_args={'prompt_kwargs': {'system_prompt': Parameter(name=Task_descr, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Task description, data=You are a classifier. Given a question, you need to classify it into one of the following classes:\n", - " Format: class_index. class_name, class_description\n", - " 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\n", - " 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\n", - " 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\n", - " 3. HUM, Human: Questions about people or organizations\n", - " 4. LOC, Location: Questions about places, cities, countries\n", - " 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\n", - " - Do not try to answer the question:\n", - " , predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'output_format_str': Parameter(name=Output_for, requires_opt=False, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Output format requirements, data=Your output should be formatted as a standard YAML instance with the following schema:\n", - "```\n", - "rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\n", - "class_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\n", - "```\n", - "-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\n", - "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", - "-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", - "-Quote the string values properly., predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'few_shot_demos': Parameter(name=Few_shot_e, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=Few shot examples to help the model, data=None, predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'input_str': Parameter(name=Input_to_t, requires_opt=False, param_type=none (), role_desc=input to the LLM, data=question: 'When reading classified ads , what does EENTY : other stand for ?', predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={})}, 'model_kwargs': {'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, traces={})\n" + "Prompt kwargs: {'classes': [{'label': 'ABBR', 'desc': 'Abbreviation: Questions about abbreviations and their meanings'}, {'label': 'DESC', 'desc': 'Description: Questions seeking descriptions of people, things, or concepts'}, {'label': 'ENTY', 'desc': 'Entity: Questions about entities (e.g., animals, colors, inventions)'}, {'label': 'HUM', 'desc': 'Human: Questions about people or organizations'}, {'label': 'LOC', 'desc': 'Location: Questions about places, cities, countries'}, {'label': 'NUM', 'desc': 'Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)'}]}\n", + "Prompt kwargs after conversion: {'classes': [{'label': 'ABBR', 'desc': 'Abbreviation: Questions about abbreviations and their meanings'}, {'label': 'DESC', 'desc': 'Description: Questions seeking descriptions of people, things, or concepts'}, {'label': 'ENTY', 'desc': 'Entity: Questions about entities (e.g., animals, colors, inventions)'}, {'label': 'HUM', 'desc': 'Human: Questions about people or organizations'}, {'label': 'LOC', 'desc': 'Location: Questions about places, cities, countries'}, {'label': 'NUM', 'desc': 'Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)'}]}\n", + "Prompt kwargs: {'schema': 'rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)'}\n", + "Prompt kwargs after conversion: {'schema': 'rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)'}\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'example' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 1\u001b[39m task = TRECClassifierStructuredOutput(\n\u001b[32m 2\u001b[39m model_client=gpt_3_model[\u001b[33m\"\u001b[39m\u001b[33mmodel_client\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 3\u001b[39m model_kwargs=gpt_3_model[\u001b[33m\"\u001b[39m\u001b[33mmodel_kwargs\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 4\u001b[39m )\n\u001b[32m 5\u001b[39m task.train()\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m output = 
task(question=\u001b[43mexample\u001b[49m.question, \u001b[38;5;28mid\u001b[39m=example.id)\n\u001b[32m 8\u001b[39m \u001b[38;5;28mprint\u001b[39m(output)\n", + "\u001b[31mNameError\u001b[39m: name 'example' is not defined" ] } ], @@ -904,11 +920,21 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "adalflow-project-Y74mvs4e-py3.12", + "language": "python", "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" } }, "nbformat": 4, diff --git a/notebooks/tutorials/adalflow_rag_optimization.ipynb b/notebooks/tutorials/adalflow_rag_optimization.ipynb index c05d9ad47..4d8497ead 100644 --- a/notebooks/tutorials/adalflow_rag_optimization.ipynb +++ b/notebooks/tutorials/adalflow_rag_optimization.ipynb @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 1, "metadata": { "id": "tAp3eDjOCma1" }, @@ -72,9 +72,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing installation: httpx 0.28.1\n", + "Uninstalling httpx-0.28.1:\n", + " Successfully uninstalled httpx-0.28.1\n", + "Found existing installation: anyio 4.9.0\n", + "Uninstalling anyio-4.9.0:\n", + " Successfully uninstalled anyio-4.9.0\n", + "\u001b[33mDEPRECATION: Loading egg at /Users/jinhakim/miniforge3/lib/python3.12/site-packages/matlabengine-24.2-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330\u001b[0m\u001b[33m\n", + "\u001b[0mCollecting anyio<4.0,>=3.1.0\n", + " Downloading anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Requirement already satisfied: idna>=2.8 in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from anyio<4.0,>=3.1.0) (3.10)\n", + "Requirement already satisfied: sniffio>=1.1 in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from anyio<4.0,>=3.1.0) (1.3.1)\n", + "Downloading anyio-3.7.1-py3-none-any.whl (80 kB)\n", + "Installing collected packages: anyio\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "mcp 1.9.4 requires httpx>=0.27, which is not installed.\n", + "openai 1.88.0 requires httpx<1,>=0.23.0, which is not installed.\n", + "litellm 1.75.3 requires httpx>=0.23.0, which is not installed.\n", + "fastmcp 2.9.0 requires httpx>=0.28.1, which is not installed.\n", + "mcp 1.9.4 requires anyio>=4.5, but you have anyio 3.7.1 which is incompatible.\n", + "sse-starlette 2.3.6 requires anyio>=4.7.0, but you have anyio 3.7.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed anyio-3.7.1\n", + "\u001b[33mDEPRECATION: Loading egg at /Users/jinhakim/miniforge3/lib/python3.12/site-packages/matlabengine-24.2-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. 
Discussion can be found at https://github.com/pypa/pip/issues/12330\u001b[0m\u001b[33m\n", + "\u001b[0mCollecting httpx==0.24.1\n", + " Downloading httpx-0.24.1-py3-none-any.whl.metadata (7.4 kB)\n", + "Requirement already satisfied: certifi in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpx==0.24.1) (2025.6.15)\n", + "Collecting httpcore<0.18.0,>=0.15.0 (from httpx==0.24.1)\n", + " Downloading httpcore-0.17.3-py3-none-any.whl.metadata (18 kB)\n", + "Requirement already satisfied: idna in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpx==0.24.1) (3.10)\n", + "Requirement already satisfied: sniffio in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpx==0.24.1) (1.3.1)\n", + "Collecting h11<0.15,>=0.13 (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1)\n", + " Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n", + "Requirement already satisfied: anyio<5.0,>=3.0 in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (3.7.1)\n", + "Downloading httpx-0.24.1-py3-none-any.whl (75 kB)\n", + "Downloading httpcore-0.17.3-py3-none-any.whl (74 kB)\n", + "Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", + "Installing collected packages: h11, httpcore, httpx\n", + " Attempting uninstall: h11\n", + " Found existing installation: h11 0.16.0\n", + " Uninstalling h11-0.16.0:\n", + " Successfully uninstalled h11-0.16.0\n", + " Attempting uninstall: httpcore\n", + " Found existing installation: httpcore 1.0.9\n", + " Uninstalling httpcore-1.0.9:\n", + " Successfully uninstalled httpcore-1.0.9\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "mcp 1.9.4 requires anyio>=4.5, but you have anyio 3.7.1 which is incompatible.\n", + "mcp 1.9.4 requires httpx>=0.27, but you have httpx 0.24.1 which is incompatible.\n", + "python-telegram-bot 21.10 requires httpx~=0.27, but you have httpx 0.24.1 which is incompatible.\n", + "fastmcp 2.9.0 requires httpx>=0.28.1, but you have httpx 0.24.1 which is incompatible.\n", + "langgraph-sdk 0.1.70 requires httpx>=0.25.2, but you have httpx 0.24.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed h11-0.14.0 httpcore-0.17.3 httpx-0.24.1\n" + ] + } + ], "source": [ "!pip uninstall httpx anyio -y\n", "!pip install \"anyio>=3.1.0,<4.0\"\n", @@ -98,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -111,7 +169,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "Please enter your OpenAI API key: ··········\n", "API keys have been set.\n" ] } @@ -133,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": { "id": "aE3I05BqOmd7" }, @@ -156,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "cqUUoua9fUxQ" }, @@ -181,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -194,18 +251,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "HotPotQAData(id='5a8b57f25542995d1e6f1371', question='Were Scott Derrickson and Ed Wood of the same nationality?', answer='yes', gold_titles=\"{'Scott Derrickson', 'Ed Wood'}\") \n" + "split_csv_path: 
/Users/jinhakim/.adalflow/cache_datasets/hotpot_qa_dev_titles/train.json\n" ] }, { - "data": { - "text/plain": [ - "HotPotQAData(id='5a8b57f25542995d1e6f1371', question='Were Scott Derrickson and Ed Wood of the same nationality?', answer='yes', gold_titles=\"{'Scott Derrickson', 'Ed Wood'}\")" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "ename": "RuntimeError", + "evalue": "Dataset scripts are no longer supported, but found hotpot_qa.py", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 22\u001b[39m\n\u001b[32m 15\u001b[39m answer: \u001b[38;5;28mstr\u001b[39m = field(\n\u001b[32m 16\u001b[39m metadata={\u001b[33m\"\u001b[39m\u001b[33mdesc\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mThe answer you produced\u001b[39m\u001b[33m\"\u001b[39m},\n\u001b[32m 17\u001b[39m )\n\u001b[32m 19\u001b[39m __output_fields__ = [\u001b[33m\"\u001b[39m\u001b[33mreasoning\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33manswer\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m---> \u001b[39m\u001b[32m22\u001b[39m dataset = \u001b[43mHotPotQA\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m20\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 23\u001b[39m \u001b[38;5;28mprint\u001b[39m(dataset[\u001b[32m0\u001b[39m], \u001b[38;5;28mtype\u001b[39m(dataset[\u001b[32m0\u001b[39m]))\n\u001b[32m 25\u001b[39m HotPotQAData(\n\u001b[32m 26\u001b[39m \u001b[38;5;28mid\u001b[39m=\u001b[33m\"\u001b[39m\u001b[33m5a8b57f25542995d1e6f1371\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 27\u001b[39m question=\u001b[33m\"\u001b[39m\u001b[33mWere Scott Derrickson and Ed Wood of the same nationality?\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 28\u001b[39m answer=\u001b[33m\"\u001b[39m\u001b[33myes\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 29\u001b[39m gold_titles=\u001b[33m\"\u001b[39m\u001b[33m{\u001b[39m\u001b[33m'\u001b[39m\u001b[33mScott Derrickson\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33mEd Wood\u001b[39m\u001b[33m'\u001b[39m\u001b[33m}\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 30\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/hotpot_qa.py:50\u001b[39m, in \u001b[36mHotPotQA.__init__\u001b[39m\u001b[34m(self, only_hard_examples, root, split, keep_details, size, **kwargs)\u001b[39m\n\u001b[32m 48\u001b[39m split_csv_path = os.path.join(data_path, \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.json\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 49\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33msplit_csv_path: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit_csv_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m50\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_check_or_download_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 51\u001b[39m \u001b[43m 
\u001b[49m\u001b[43msplit_csv_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43monly_hard_examples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeep_details\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 54\u001b[39m \u001b[38;5;66;03m# load from csv\u001b[39;00m\n\u001b[32m 55\u001b[39m \u001b[38;5;28mself\u001b[39m.data = []\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/hotpot_qa.py:111\u001b[39m, in \u001b[36mHotPotQA._check_or_download_dataset\u001b[39m\u001b[34m(self, data_path, split, only_hard_examples, keep_details)\u001b[39m\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m only_hard_examples, (\n\u001b[32m 101\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mCare must be taken when adding support for easy examples.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 102\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mDev must be all hard to match official dev, but training can be flexible.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 103\u001b[39m )\n\u001b[32m 105\u001b[39m \u001b[38;5;66;03m# hf_official_train = load_dataset(\u001b[39;00m\n\u001b[32m 106\u001b[39m \u001b[38;5;66;03m# \"hotpot_qa\", \"fullwiki\", split=\"train\", trust_remote_code=True\u001b[39;00m\n\u001b[32m 107\u001b[39m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[32m 108\u001b[39m \u001b[38;5;66;03m# hf_official_dev = load_dataset(\u001b[39;00m\n\u001b[32m 109\u001b[39m \u001b[38;5;66;03m# \"hotpot_qa\", \"fullwiki\", split=\"validation\", trust_remote_code=True\u001b[39;00m\n\u001b[32m 110\u001b[39m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m111\u001b[39m hf_official_train = \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 112\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mhotpot_qa\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfullwiki\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 113\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 114\u001b[39m hf_official_dev = load_dataset(\n\u001b[32m 115\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mhotpot_qa\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfullwiki\u001b[39m\u001b[33m\"\u001b[39m, split=\u001b[33m\"\u001b[39m\u001b[33mvalidation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 116\u001b[39m )\n\u001b[32m 117\u001b[39m data_path_dir = os.path.dirname(data_path)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1392\u001b[39m, in \u001b[36mload_dataset\u001b[39m\u001b[34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1387\u001b[39m verification_mode = VerificationMode(\n\u001b[32m 1388\u001b[39m (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode.BASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode.ALL_CHECKS\n\u001b[32m 1389\u001b[39m 
)\n\u001b[32m 1391\u001b[39m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1392\u001b[39m builder_instance = \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1393\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1394\u001b[39m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m=\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1395\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1396\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1397\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1398\u001b[39m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1399\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1400\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1401\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1402\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1403\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[32m 1408\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1132\u001b[39m, in \u001b[36mload_dataset_builder\u001b[39m\u001b[34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1130\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m features \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1131\u001b[39m features = _fix_for_backward_compatible_features(features)\n\u001b[32m-> \u001b[39m\u001b[32m1132\u001b[39m dataset_module = \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1133\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1134\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1135\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1136\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1137\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1138\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1140\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1141\u001b[39m \u001b[38;5;66;03m# Get dataset builder class\u001b[39;00m\n\u001b[32m 1142\u001b[39m builder_kwargs = dataset_module.builder_kwargs\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1031\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 1026\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[32m 1027\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m 1028\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1029\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 1030\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1031\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1032\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1033\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:989\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 981\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 982\u001b[39m api.hf_hub_download(\n\u001b[32m 983\u001b[39m repo_id=path,\n\u001b[32m 984\u001b[39m 
filename=filename,\n\u001b[32m (...)\u001b[39m\u001b[32m 987\u001b[39m proxies=download_config.proxies,\n\u001b[32m 988\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m989\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDataset scripts are no longer supported, but found \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 990\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m EntryNotFoundError:\n\u001b[32m 991\u001b[39m \u001b[38;5;66;03m# Use the infos from the parquet export except in some cases:\u001b[39;00m\n\u001b[32m 992\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data_dir \u001b[38;5;129;01mor\u001b[39;00m data_files \u001b[38;5;129;01mor\u001b[39;00m (revision \u001b[38;5;129;01mand\u001b[39;00m revision != \u001b[33m\"\u001b[39m\u001b[33mmain\u001b[39m\u001b[33m\"\u001b[39m):\n", + "\u001b[31mRuntimeError\u001b[39m: Dataset scripts are no longer supported, but found hotpot_qa.py" + ] } ], "source": [ @@ -497,11 +561,21 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "adalflow-project-Y74mvs4e-py3.12", + "language": "python", "name": "python3" }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" } }, "nbformat": 4, diff --git a/poetry.lock b/poetry.lock index 308f6d5e3..db6aa3d2b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. 
[[package]] name = "absl-py" @@ -114,7 +114,7 @@ version = "2.6.1" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"}, {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"}, @@ -126,7 +126,7 @@ version = "3.12.15" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "aiohttp-3.12.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b6fc902bff74d9b1879ad55f5404153e2b33a82e72a95c89cec5eb6cc9e92fbc"}, {file = "aiohttp-3.12.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:098e92835b8119b54c693f2f88a1dec690e20798ca5f5fe5f0520245253ee0af"}, @@ -250,7 +250,7 @@ version = "1.4.0" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"}, {file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"}, @@ -1324,7 +1324,7 @@ version = "4.0.0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.9.0" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "datasets-4.0.0-py3-none-any.whl", hash = "sha256:7ef95e62025fd122882dbce6cb904c8cd3fbc829de6669a5eb939c77d50e203d"}, {file = "datasets-4.0.0.tar.gz", hash = "sha256:9657e7140a9050db13443ba21cb5de185af8af944479b00e7ff1e00a61c8dbf1"}, @@ -1426,7 +1426,7 @@ version = "0.3.8" description = "serialize all of Python" optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "dill-0.3.8-py3-none-any.whl", hash = "sha256:c36ca9ffb54365bdd2f8eb3eff7d2a21237f8452b57ace88b1ac615b7e815bd7"}, {file = "dill-0.3.8.tar.gz", hash = "sha256:3ebe3c479ad625c4553aca177444d89b486b1d84982eeacded644afc0cf797ca"}, @@ -1735,7 +1735,7 @@ version = "3.18.0" description = "A platform independent file lock." 
optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de"}, {file = "filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2"}, @@ -1875,7 +1875,7 @@ version = "1.7.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cc4df77d638aa2ed703b878dd093725b72a824c3c546c076e8fdf276f78ee84a"}, {file = "frozenlist-1.7.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:716a9973a2cc963160394f701964fe25012600f3d311f60c790400b00e568b61"}, @@ -1989,7 +1989,7 @@ version = "2025.3.0" description = "File-system specification" optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "fsspec-2025.3.0-py3-none-any.whl", hash = "sha256:efb87af3efa9103f94ca91a7f8cb7a4df91af9f74fc106c9c7ea0efd7277c1b3"}, {file = "fsspec-2025.3.0.tar.gz", hash = "sha256:a935fd1ea872591f2b5148907d103488fc523295e6c64b835cfad8c3eca44972"}, @@ -2291,7 +2291,7 @@ version = "0.21" description = "Simple Python interface for Graphviz" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "graphviz-0.21-py3-none-any.whl", hash = "sha256:54f33de9f4f911d7e84e4191749cac8cc5653f815b06738c54db9a15ab8b1e42"}, {file = "graphviz-0.21.tar.gz", hash = "sha256:20743e7183be82aaaa8ad6c93f8893c923bd6658a04c32ee115edb3c8a835f78"}, @@ -2528,7 +2528,7 @@ version = "1.1.5" description = "Fast transfer of large files with the Hugging Face Hub." 
optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\"" files = [ {file = "hf_xet-1.1.5-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:f52c2fa3635b8c37c7764d8796dfa72706cc4eded19d638331161e82b0792e23"}, @@ -2643,7 +2643,7 @@ version = "0.34.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "huggingface_hub-0.34.3-py3-none-any.whl", hash = "sha256:5444550099e2d86e68b2898b09e85878fbd788fc2957b506c6a79ce060e39492"}, {file = "huggingface_hub-0.34.3.tar.gz", hash = "sha256:d58130fd5aa7408480681475491c0abd7e835442082fbc3ef4d45b6c39f83853"}, @@ -3104,6 +3104,25 @@ files = [ {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, ] +[[package]] +name = "jsonpickle" +version = "4.1.1" +description = "jsonpickle encodes/decodes any Python object to/from JSON" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "jsonpickle-4.1.1-py3-none-any.whl", hash = "sha256:bb141da6057898aa2438ff268362b126826c812a1721e31cf08a6e142910dc91"}, + {file = "jsonpickle-4.1.1.tar.gz", hash = "sha256:f86e18f13e2b96c1c1eede0b7b90095bbb61d99fedc14813c44dc2f361dbbae1"}, +] + +[package.extras] +cov = ["pytest-cov"] +dev = ["black", "pyupgrade"] +docs = ["furo", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +packaging = ["build", "setuptools (>=61.2)", "setuptools_scm[toml] (>=6.0)", "twine"] +testing = ["PyYAML", "atheris (>=2.3.0,<2.4.0) ; python_version < \"3.12\"", "bson", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "pytest (>=6.0,!=8.1.*)", "pytest-benchmark", "pytest-benchmark[histogram]", "pytest-checkdocs (>=1.2.3)", "pytest-enabler (>=1.0.1)", "pytest-ruff (>=0.2.1)", "scikit-learn", "scipy (>=1.9.3) ; python_version > \"3.10\"", "scipy ; python_version <= \"3.10\"", "simplejson", "sqlalchemy", "ujson"] + [[package]] name = "jsonpointer" version = "3.0.0" @@ -4214,7 +4233,7 @@ version = "6.6.3" description = "multidict implementation" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "multidict-6.6.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a2be5b7b35271f7fff1397204ba6708365e3d773579fe2a30625e16c4b4ce817"}, {file = "multidict-6.6.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:12f4581d2930840295c461764b9a65732ec01250b46c6b2c510d7ee68872b140"}, @@ -4334,7 +4353,7 @@ version = "0.70.16" description = "better multiprocessing and multithreading in Python" optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "multiprocess-0.70.16-pp310-pypy310_pp73-macosx_10_13_x86_64.whl", hash = "sha256:476887be10e2f59ff183c006af746cb6f1fd0eadcfd4ef49e605cbe2659920ee"}, {file = "multiprocess-0.70.16-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:d951bed82c8f73929ac82c61f01a7b5ce8f3e5ef40f5b52553b4f547ce2b08ec"}, @@ -4465,7 +4484,7 @@ version = "3.5" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.11" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "networkx-3.5-py3-none-any.whl", hash = 
"sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec"}, {file = "networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037"}, @@ -5116,7 +5135,7 @@ version = "2.3.1" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "pandas-2.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:22c2e866f7209ebc3a8f08d75766566aae02bcc91d196935a1d9e59c7b990ac9"}, {file = "pandas-2.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3583d348546201aff730c8c47e49bc159833f971c2899d6097bce68b9112a4f1"}, @@ -5479,7 +5498,7 @@ version = "0.3.2" description = "Accelerated property cache" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:22d9962a358aedbb7a2e36187ff273adeaab9743373a272976d2e348d08c7770"}, {file = "propcache-0.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0d0fda578d1dc3f77b6b5a5dce3b9ad69a8250a891760a548df850a5e8da87f3"}, @@ -5678,7 +5697,7 @@ version = "20.0.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "pyarrow-20.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:c7dd06fd7d7b410ca5dc839cc9d485d2bc4ae5240851bcd45d85105cc90a47d7"}, {file = "pyarrow-20.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d5382de8dc34c943249b01c19110783d0d64b207167c728461add1ecc2db88e4"}, @@ -6079,7 +6098,7 @@ version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -6136,12 +6155,29 @@ version = "2025.2" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, ] +[[package]] +name = "pyvis" +version = "0.3.2" +description = "A Python network graph visualization library" +optional = false +python-versions = ">3.6" +groups = ["main"] +files = [ + {file = "pyvis-0.3.2-py3-none-any.whl", hash = "sha256:5720c4ca8161dc5d9ab352015723abb7a8bb8fb443edeb07f7a322db34a97555"}, +] + +[package.dependencies] +ipython = ">=5.3.0" +jinja2 = ">=2.9.6" +jsonpickle = ">=1.4.1" +networkx = ">=1.11" + [[package]] name = "pywin32" version = "311" @@ -7049,7 +7085,7 @@ version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -7880,7 +7916,6 @@ files = [ 
{file = "typing_extensions-4.14.1-py3-none-any.whl", hash = "sha256:d1e1e3b58374dc93031d6eda2420a48ea44a36c2b4766a4fdeb3710755731d76"}, {file = "typing_extensions-4.14.1.tar.gz", hash = "sha256:38b39f4aeeab64884ce9f74c94263ef78f3c22467c8724005483154c26648d36"}, ] -markers = {main = "python_version == \"3.11\""} [[package]] name = "typing-inspect" @@ -7919,7 +7954,7 @@ version = "2025.2" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8"}, {file = "tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"}, @@ -8220,7 +8255,7 @@ version = "3.5.0" description = "Python binding for xxHash" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "xxhash-3.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ece616532c499ee9afbb83078b1b952beffef121d989841f7f4b3dc5ac0fd212"}, {file = "xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3171f693dbc2cef6477054a665dc255d996646b4023fe56cb4db80e26f4cc520"}, @@ -8353,7 +8388,7 @@ version = "1.20.1" description = "Yet another URL library" optional = false python-versions = ">=3.9" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6032e6da6abd41e4acda34d75a816012717000fa6839f37124a47fcefc49bec4"}, {file = "yarl-1.20.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2c7b34d804b8cf9b214f05015c4fee2ebe7ed05cf581e7192c06555c71f4446a"}, @@ -8489,4 +8524,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.11, <4.0" -content-hash = "6fcfca26063930dfdb7d9c8aade6b90293abeb0ceb205861bc52a23f0b52188e" +content-hash = "b7d22133e9fafd9adb0b4c5b65bf36e392c8e975e4fcdee90eeb74288e8971b7" diff --git a/pyproject.toml b/pyproject.toml index f90e32269..fe9579a85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,9 @@ adalflow = { path = "adalflow", develop = true } black = { extras = ["jupyter"], version = "^24.10.0" } aioitertools = "^0.12.0" asyncstdlib = "^3.13.1" +datasets = "^4.0.0" +pyvis = "^0.3.2" +graphviz = "^0.21" [tool.poetry.group.dev.dependencies] diff --git a/tutorials/v1_agent_runner/runner_trainable_tutorial.py b/tutorials/v1_agent_runner/runner_trainable_tutorial.py new file mode 100644 index 000000000..dfac9fde0 --- /dev/null +++ b/tutorials/v1_agent_runner/runner_trainable_tutorial.py @@ -0,0 +1,273 @@ +""" +Tutorial: Using Runner.forward() Method to Get Trainable Results + +This tutorial demonstrates how to use the Runner's forward() method to get +trainable outputs that can be used for optimization. The key difference between +runner.call() and runner.forward() is that forward() returns trainable Parameter +objects that maintain gradient information for optimization. 
+""" + +import adalflow as adal +from adalflow.core.types import RunnerResult +from adalflow.optim.parameter import OutputParameter +from adalflow.utils import setup_env + +setup_env() + + +def create_simple_agent(): + """Create a simple agent for demonstration.""" + # Configure model client + model_config = { + "model_client": adal.AnthropicAPIClient(), + "model_kwargs": { + "model": "claude-3-5-sonnet-20241022", + "temperature": 0.0, + } + } + + # Create agent with simple tools + agent = adal.Agent( + name="demo_agent", + add_llm_as_fallback=True, + max_steps=3, + **model_config + ) + + return agent + + +def example_1_basic_forward_usage(): + """Example 1: Basic usage of runner.forward() method.""" + print("=== Example 1: Basic Forward Usage ===") + + agent = create_simple_agent() + runner = adal.Runner(agent=agent, training=True) # Important: set training=True + + # Use forward() instead of call() to get trainable results + prompt_kwargs = { + "input_str": "What is 2+2? Use the finish action to provide your final answer." + } + + # This returns an OutputParameter with gradient information + trainable_result = runner.forward(prompt_kwargs=prompt_kwargs) + + print(f"Result type: {type(trainable_result)}") + print(f"Is trainable (requires_opt): {trainable_result.requires_opt}") + print(f"Has gradient function: {trainable_result.grad_fn is not None}") + + # Access the actual RunnerResult from the parameter + runner_result: RunnerResult = trainable_result.data + print(f"Final answer: {runner_result.answer}") + print(f"Number of steps: {len(runner_result.step_history)}") + + return trainable_result + + +def example_2_extracting_final_output(): + """Example 2: How to extract the final output from trainable result.""" + print("\n=== Example 2: Extracting Final Output ===") + + agent = create_simple_agent() + runner = adal.Runner(agent=agent, training=True) + + prompt_kwargs = { + "input_str": "Calculate 15 * 3 and explain your reasoning. Use finish to provide your final answer." + } + + # Get trainable result + trainable_result = runner.forward(prompt_kwargs=prompt_kwargs) + + # Method 1: Access through .data attribute + final_answer = trainable_result.data.answer + print(f"Method 1 - Final answer via .data: {final_answer}") + + # Method 2: Access step history for detailed breakdown + step_history = trainable_result.data.step_history + print(f"Method 2 - Step history length: {len(step_history)}") + for i, step in enumerate(step_history): + print(f" Step {i}: {step.function.name if step.function else 'None'} -> {step.observation}") + + # Method 3: Check if there were any errors + if trainable_result.data.error: + print(f"Method 3 - Error occurred: {trainable_result.data.error}") + else: + print("Method 3 - No errors occurred") + + return trainable_result + + +def example_3_comparison_call_vs_forward(): + """Example 3: Comparing runner.call() vs runner.forward().""" + print("\n=== Example 3: Call vs Forward Comparison ===") + + agent = create_simple_agent() + runner = adal.Runner(agent=agent) + + prompt_kwargs = { + "input_str": "What is the capital of France? Use finish to provide your final answer." 
+    }
+
+    # Using call() - returns RunnerResult directly (not trainable)
+    runner.training = False  # Inference mode
+    call_result = runner(prompt_kwargs=prompt_kwargs)
+
+    print("Call Result:")
+    print(f"  Type: {type(call_result)}")
+    print(f"  Answer: {call_result.answer}")
+    print("  Is trainable: No (plain RunnerResult object)")
+
+    # Using forward() - returns trainable OutputParameter
+    runner.training = True  # Training mode
+    forward_result = runner.forward(prompt_kwargs=prompt_kwargs)
+
+    print("\nForward Result:")
+    print(f"  Type: {type(forward_result)}")
+    print(f"  Answer: {forward_result.data.answer}")
+    print(f"  Is trainable: {forward_result.requires_opt}")
+    print(f"  Has predecessors: {len(forward_result.predecessors) > 0}")
+
+    return call_result, forward_result
+
+
+def example_4_using_with_trainer():
+    """Example 4: How to use trainable results with the AdalFlow trainer."""
+    print("\n=== Example 4: Using with Trainer ===")
+
+    from adalflow.datasets.gsm8k import GSM8K
+    from adalflow.eval.answer_match_acc import AnswerMatchAcc
+
+    # Load a small dataset for demonstration
+    train_data = GSM8K(split="train", size=5)
+
+    agent = create_simple_agent()
+    runner = adal.Runner(agent=agent, training=True)
+
+    # Create a simple trainer component
+    class RunnerTrainerDemo(adal.AdalComponent):
+        def __init__(self, runner):
+            eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item
+            loss_fn = adal.EvalFnToTextLoss(
+                eval_fn=eval_fn,
+                eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0",
+            )
+            super().__init__(task=runner, eval_fn=eval_fn, loss_fn=loss_fn)
+
+        def prepare_task(self, sample):
+            # This calls runner.forward() when in training mode
+            return self.task.forward, {"prompt_kwargs": {"input_str": sample.question}, "id": sample.id}
+
+        def prepare_eval(self, sample, y_pred):
+            # y_pred will be an OutputParameter when using forward()
+            if isinstance(y_pred, OutputParameter):
+                y_label = y_pred.data.answer if y_pred.data.answer else ""
+            else:
+                y_label = y_pred.answer if y_pred.answer else ""
+            return self.eval_fn, {"y": y_label, "y_gt": sample.answer}
+
+        def prepare_loss(self, sample, pred):
+            # pred is the OutputParameter from forward()
+            y_gt = adal.Parameter(
+                name="y_gt",
+                data=sample.answer,
+                eval_input=sample.answer,
+                requires_opt=False,
+            )
+
+            # Set eval_input for the prediction
+            if hasattr(pred.data, 'answer') and pred.data.answer is not None:
+                pred.eval_input = pred.data.answer
+            else:
+                pred.eval_input = str(pred.data) if pred.data is not None else ""
+
+            return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id}
+
+    # Create trainer component
+    trainer_component = RunnerTrainerDemo(runner)
+
+    # Test with a single sample
+    sample = train_data[0]
+    print(f"Sample question: {sample.question}")
+    print(f"Ground truth: {sample.answer}")
+
+    # This will call runner.forward() and return a trainable result
+    task_fn, task_kwargs = trainer_component.prepare_task(sample)
+    trainable_result = task_fn(**task_kwargs)
+
+    print(f"Trainable result type: {type(trainable_result)}")
+    print(f"Final answer: {trainable_result.data.answer}")
+    print(f"Is optimizable: {trainable_result.requires_opt}")
+
+    return trainable_result
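+
+
+# A minimal sketch of how RunnerTrainerDemo above could plug into the full
+# AdalFlow trainer loop. The wiring below is an assumption for illustration
+# (it mirrors use_cases/question_answering/gsm8k/train.py in this repo) and is
+# left commented out so the tutorial stays runnable without a training run:
+#
+#   trainer = adal.Trainer(adaltask=trainer_component, strategy="random", max_steps=1)
+#   trainer.fit(train_dataset=train_data, val_dataset=train_data, debug=False)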
+
+
+def example_5_gradient_tracking():
+    """Example 5: Understanding gradient tracking in trainable results."""
+    print("\n=== Example 5: Gradient Tracking ===")
+
+    agent = create_simple_agent()
+    runner = adal.Runner(agent=agent, training=True)
+
+    prompt_kwargs = {
+        "input_str": "Solve: If John has 5 apples and gives away 2, how many does he have left?"
+    }
+
+    # Get trainable result
+    trainable_result = runner.forward(prompt_kwargs=prompt_kwargs)
+
+    print("Gradient Information:")
+    print(f"  Has grad_fn: {trainable_result.grad_fn is not None}")
+    print(f"  Requires optimization: {trainable_result.requires_opt}")
+    print(f"  Parameter name: {trainable_result.name}")
+    print(f"  Number of predecessors: {len(trainable_result.predecessors)}")
+
+    # The grad_fn holds the backward context used during optimization
+    if trainable_result.grad_fn:
+        backward_ctx = trainable_result.grad_fn
+        print(backward_ctx)
+        print(f"  Backward function available: {backward_ctx.backward_fn is not None}")
+        print(f"  Template available: {backward_ctx.kwargs['template'] is not None}")
+        print(f"  Prompt string length: {len(backward_ctx.kwargs['prompt_str']) if backward_ctx.kwargs['prompt_str'] else 0}")
+
+    return trainable_result
+
+
+def main():
+    """Run all examples."""
+    print("Tutorial: Runner.forward() for Trainable Results")
+    print("=" * 50)
+
+    try:
+        # Run all examples in order
+        example_1_basic_forward_usage()
+        example_2_extracting_final_output()
+        example_3_comparison_call_vs_forward()
+        example_4_using_with_trainer()
+        example_5_gradient_tracking()
+
+        print("\n" + "=" * 50)
+        print("Tutorial completed successfully!")
+        print("\nKey takeaways:")
+        print("1. Use runner.forward() instead of runner.call() for trainable results")
+        print("2. Set training=True when creating the Runner for optimization")
+        print("3. forward() returns an OutputParameter with gradient information")
+        print("4. Access the final answer via result.data.answer")
+        print("5. Trainable results can be used with AdalFlow's optimization system")
+
+    except Exception as e:
+        print(f"Error occurred during tutorial: {e}")
+        print("Make sure you have:")
+        print("1. Set up your API key (ANTHROPIC_API_KEY)")
+        print("2. 
Installed AdalFlow with all dependencies") + return False + + return True + + +if __name__ == "__main__": + # Set up environment (you may need to configure your API key) + from adalflow.utils import setup_env + setup_env() + + main() \ No newline at end of file diff --git a/use_cases/question_answering/gsm8k/train.py b/use_cases/question_answering/gsm8k/train.py index 63fc2ef40..cf3b764d8 100644 --- a/use_cases/question_answering/gsm8k/train.py +++ b/use_cases/question_answering/gsm8k/train.py @@ -21,8 +21,9 @@ def __init__( backward_engine_model_config: Optional[Dict] = None, teacher_model_config: Optional[Dict] = None, text_optimizer_model_config: Optional[Dict] = None, + config: Optional[Dict] = None, ): - task = GSM8KTask(**gpt_3_model) + task = GSM8KTask(**config) eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item loss_fn = adal.EvalFnToTextLoss( eval_fn=eval_fn, @@ -66,16 +67,29 @@ def prepare_loss( def train(): train_data, val_data, test_data = load_datasets() - task = GSM8KTask(**gpt_3_model) + + # Create Anthropic client configuration instead of GPT-3 + anthropic_config = { + "model_client": adal.AnthropicAPIClient(), + "model_kwargs": { + "model": "claude-3-5-sonnet-20241022", + # "max_tokens": 2000, + "temperature": 0.0, + } + } + + task = GSM8KTask(**anthropic_config) adal_component = StandardTrain( task=task, - backward_engine_model_config=gpt_o3_mini_model, - text_optimizer_model_config=gpt_o3_mini_model, + backward_engine_model_config=anthropic_config, + text_optimizer_model_config=anthropic_config, + config=anthropic_config, ) trainer = adal.Trainer( adaltask=adal_component, strategy="random", - max_steps=10, + # max_steps=10, + max_steps=1, text_optimizers_config_kwargs={"max_past_history": 5}, ) trainer.fit( @@ -83,7 +97,8 @@ def train(): val_dataset=val_data, test_dataset=test_data, debug=False, - resume_from_ckpt="/Users/liyin/.adalflow/ckpt/StandardTrain/random_max_steps_10_57bb8_run_1.json", + # resume_from_ckpt=None, + # resume_from_ckpt="/Users/jinhakim/.adalflow/ckpt/StandardTrain/random_max_steps_1_e35e9_run_1.json", ) From af8858fb73ca6ccd2431b911d7b2382f36113750 Mon Sep 17 00:00:00 2001 From: phi-jkim Date: Thu, 14 Aug 2025 11:30:46 -0700 Subject: [PATCH 2/3] revert notebooks --- adalflow/adalflow/components/agent/runner.py | 2 +- .../components/agent/runner_trainer.py | 4 +- ...adalflow_classification_optimization.ipynb | 122 +++++++----------- .../tutorials/adalflow_rag_optimization.ipynb | 106 +++------------ 4 files changed, 69 insertions(+), 165 deletions(-) diff --git a/adalflow/adalflow/components/agent/runner.py b/adalflow/adalflow/components/agent/runner.py index 45a9b1c92..b6d1e3067 100644 --- a/adalflow/adalflow/components/agent/runner.py +++ b/adalflow/adalflow/components/agent/runner.py @@ -335,7 +335,7 @@ def _create_trainable_runner_result( backward_engine=self.backward_engine, response=runner_result_parameter, prompt_kwargs=prompt_kwargs, - template=self.agent.planner.template, + template=getattr(self.agent.planner, 'template', None), prompt_str=self.agent.planner.get_prompt(**prompt_kwargs), disable_backward_engine=self._disable_backward_engine, id=id, diff --git a/adalflow/adalflow/components/agent/runner_trainer.py b/adalflow/adalflow/components/agent/runner_trainer.py index 43c606734..12f41be43 100644 --- a/adalflow/adalflow/components/agent/runner_trainer.py +++ b/adalflow/adalflow/components/agent/runner_trainer.py @@ -215,5 +215,5 @@ def train(): if __name__ == "__main__": - # train() - train_original_react() \ No newline at 
end of file + train() + # train_original_react() \ No newline at end of file diff --git a/notebooks/tutorials/adalflow_classification_optimization.ipynb b/notebooks/tutorials/adalflow_classification_optimization.ipynb index b95ef7957..733630545 100644 --- a/notebooks/tutorials/adalflow_classification_optimization.ipynb +++ b/notebooks/tutorials/adalflow_classification_optimization.ipynb @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -91,6 +91,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "Please enter your OpenAI API key: ··········\n", "API keys have been set.\n" ] } @@ -121,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "ZZIEtZYHNVjo" }, @@ -202,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "yAvzn7DZeUX-" }, @@ -250,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "3Q3H9XC4Ncfi" }, @@ -336,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -346,50 +347,10 @@ }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "290486826f394c0086392b96b373d810", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f4bd8e8cbced4ae184858046b4d670e1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "trec.py: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "RuntimeError", - "evalue": "Dataset scripts are no longer supported, but found trec.py", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# load dataset to get one example\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m train_dataset, val_dataset, test_dataset = \u001b[43mload_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m example = train_dataset[\u001b[32m0\u001b[39m]\n\u001b[32m 5\u001b[39m \u001b[38;5;28mprint\u001b[39m(example)\n", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 68\u001b[39m, in \u001b[36mload_datasets\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 66\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mload_datasets\u001b[39m():\n\u001b[32m 67\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load the dataset\"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m68\u001b[39m train_data = \u001b[43mTrecDataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 69\u001b[39m val_data = TrecDataset(split=\u001b[33m\"\u001b[39m\u001b[33mval\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 70\u001b[39m test_data = 
TrecDataset(split=\u001b[33m\"\u001b[39m\u001b[33mtest\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/trec.py:171\u001b[39m, in \u001b[36mTrecDataset.__init__\u001b[39m\u001b[34m(self, root, split)\u001b[39m\n\u001b[32m 169\u001b[39m data_path = prepare_dataset_path(\u001b[38;5;28mself\u001b[39m.root, \u001b[38;5;28mself\u001b[39m.task_name)\n\u001b[32m 170\u001b[39m \u001b[38;5;66;03m# download and save\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m171\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_check_or_download_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[38;5;66;03m# load from csv\u001b[39;00m\n\u001b[32m 173\u001b[39m \u001b[38;5;28mself\u001b[39m.data = []\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/trec.py:198\u001b[39m, in \u001b[36mTrecDataset._check_or_download_dataset\u001b[39m\u001b[34m(self, data_path, split)\u001b[39m\n\u001b[32m 195\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01muuid\u001b[39;00m\n\u001b[32m 197\u001b[39m \u001b[38;5;66;03m# prepare all the data\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m198\u001b[39m train_dataset, val_dataset, test_dataset = \u001b[43mprepare_datasets\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 199\u001b[39m \u001b[38;5;28mprint\u001b[39m(\n\u001b[32m 200\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtrain: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(train_dataset)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, val: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(val_dataset)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, test: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(test_dataset)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 201\u001b[39m )\n\u001b[32m 202\u001b[39m \u001b[38;5;66;03m# save to csv\u001b[39;00m\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/trec.py:47\u001b[39m, in \u001b[36mprepare_datasets\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 44\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mprepare_datasets\u001b[39m():\n\u001b[32m 45\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdatasets\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[32m---> \u001b[39m\u001b[32m47\u001b[39m dataset = \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrec\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 48\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtrain: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(dataset[\u001b[33m'\u001b[39m\u001b[33mtrain\u001b[39m\u001b[33m'\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, test: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(dataset[\u001b[33m'\u001b[39m\u001b[33mtest\u001b[39m\u001b[33m'\u001b[39m])\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m) \u001b[38;5;66;03m# 5452, 500\u001b[39;00m\n\u001b[32m 49\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtrain 
example: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset[\u001b[33m'\u001b[39m\u001b[33mtrain\u001b[39m\u001b[33m'\u001b[39m][\u001b[32m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1392\u001b[39m, in \u001b[36mload_dataset\u001b[39m\u001b[34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1387\u001b[39m verification_mode = VerificationMode(\n\u001b[32m 1388\u001b[39m (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode.BASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode.ALL_CHECKS\n\u001b[32m 1389\u001b[39m )\n\u001b[32m 1391\u001b[39m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1392\u001b[39m builder_instance = \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1393\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1394\u001b[39m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m=\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1395\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1396\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1397\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1398\u001b[39m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1399\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1400\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1401\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1402\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1403\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[32m 1408\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1132\u001b[39m, in \u001b[36mload_dataset_builder\u001b[39m\u001b[34m(path, name, data_dir, data_files, cache_dir, features, download_config, 
download_mode, revision, token, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1130\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m features \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1131\u001b[39m features = _fix_for_backward_compatible_features(features)\n\u001b[32m-> \u001b[39m\u001b[32m1132\u001b[39m dataset_module = \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1133\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1134\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1135\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1136\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1137\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1138\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1140\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1141\u001b[39m \u001b[38;5;66;03m# Get dataset builder class\u001b[39;00m\n\u001b[32m 1142\u001b[39m builder_kwargs = dataset_module.builder_kwargs\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1031\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 1026\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[32m 1027\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m 1028\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. 
\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1029\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 1030\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1031\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1032\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1033\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:989\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 981\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 982\u001b[39m api.hf_hub_download(\n\u001b[32m 983\u001b[39m repo_id=path,\n\u001b[32m 984\u001b[39m filename=filename,\n\u001b[32m (...)\u001b[39m\u001b[32m 987\u001b[39m proxies=download_config.proxies,\n\u001b[32m 988\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m989\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDataset scripts are no longer supported, but found \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 990\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m EntryNotFoundError:\n\u001b[32m 991\u001b[39m \u001b[38;5;66;03m# Use the infos from the parquet export except in some cases:\u001b[39;00m\n\u001b[32m 992\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data_dir \u001b[38;5;129;01mor\u001b[39;00m data_files \u001b[38;5;129;01mor\u001b[39;00m (revision \u001b[38;5;129;01mand\u001b[39;00m revision != \u001b[33m\"\u001b[39m\u001b[33mmain\u001b[39m\u001b[33m\"\u001b[39m):\n", - "\u001b[31mRuntimeError\u001b[39m: Dataset scripts are no longer supported, but found trec.py" + "name": "stdout", + "output_type": "stream", + "text": [ + "TrecData(id='e73a82a7-6a3d-4947-90f5-03739e169db0', question='When reading classified ads , what does EENTY : other stand for ?', class_name='ABBR', class_index=0)\n" ] } ], @@ -403,7 +364,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -416,21 +377,44 @@ "name": "stdout", "output_type": "stream", "text": [ - "Prompt kwargs: {'classes': [{'label': 'ABBR', 'desc': 'Abbreviation: Questions about abbreviations and their meanings'}, {'label': 'DESC', 'desc': 
'Description: Questions seeking descriptions of people, things, or concepts'}, {'label': 'ENTY', 'desc': 'Entity: Questions about entities (e.g., animals, colors, inventions)'}, {'label': 'HUM', 'desc': 'Human: Questions about people or organizations'}, {'label': 'LOC', 'desc': 'Location: Questions about places, cities, countries'}, {'label': 'NUM', 'desc': 'Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)'}]}\n", - "Prompt kwargs after conversion: {'classes': [{'label': 'ABBR', 'desc': 'Abbreviation: Questions about abbreviations and their meanings'}, {'label': 'DESC', 'desc': 'Description: Questions seeking descriptions of people, things, or concepts'}, {'label': 'ENTY', 'desc': 'Entity: Questions about entities (e.g., animals, colors, inventions)'}, {'label': 'HUM', 'desc': 'Human: Questions about people or organizations'}, {'label': 'LOC', 'desc': 'Location: Questions about places, cities, countries'}, {'label': 'NUM', 'desc': 'Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)'}]}\n", - "Prompt kwargs: {'schema': 'rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)'}\n", - "Prompt kwargs after conversion: {'schema': 'rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)'}\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'example' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m 1\u001b[39m task = TRECClassifierStructuredOutput(\n\u001b[32m 2\u001b[39m model_client=gpt_3_model[\u001b[33m\"\u001b[39m\u001b[33mmodel_client\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 3\u001b[39m model_kwargs=gpt_3_model[\u001b[33m\"\u001b[39m\u001b[33mmodel_kwargs\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 4\u001b[39m )\n\u001b[32m 5\u001b[39m task.train()\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m output = task(question=\u001b[43mexample\u001b[49m.question, \u001b[38;5;28mid\u001b[39m=example.id)\n\u001b[32m 8\u001b[39m \u001b[38;5;28mprint\u001b[39m(output)\n", - "\u001b[31mNameError\u001b[39m: name 'example' is not defined" + "Parameter(name=Generator_output, requires_opt=True, param_type=generator_output (The output of the generator.), role_desc=Output from (llm) Generator, data=```\n", + "rationale: The question is asking for the meaning of the abbreviation \"EENTY\" in classified ads, which falls under the ABBR class.\n", + "class_name: ABBR\n", + "```, predecessors={Parameter(name=Output_for, requires_opt=False, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Output format requirements, data=Your output should be formatted as a standard YAML instance with the following schema:\n", + "```\n", + "rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\n", + "class_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\n", + "```\n", + "-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\n", + "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", + "-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + "-Quote the string values properly., predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Few_shot_e, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=Few shot examples to help the model, data=None, predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Input_to_t, requires_opt=False, param_type=none (), role_desc=input to the LLM, data=question: 'When reading classified ads , what does EENTY : other stand for ?', predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Task_descr, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Task description, data=You are a classifier. Given a question, you need to classify it into one of the following classes:\n", + " Format: class_index. class_name, class_description\n", + " 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\n", + " 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\n", + " 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\n", + " 3. HUM, Human: Questions about people or organizations\n", + " 4. LOC, Location: Questions about places, cities, countries\n", + " 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\n", + " - Do not try to answer the question:\n", + " , predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={})}, gradients=[], raw_response=None, input_args={'prompt_kwargs': {'system_prompt': Parameter(name=Task_descr, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Task description, data=You are a classifier. Given a question, you need to classify it into one of the following classes:\n", + " Format: class_index. class_name, class_description\n", + " 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\n", + " 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\n", + " 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\n", + " 3. HUM, Human: Questions about people or organizations\n", + " 4. LOC, Location: Questions about places, cities, countries\n", + " 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\n", + " - Do not try to answer the question:\n", + " , predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'output_format_str': Parameter(name=Output_for, requires_opt=False, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Output format requirements, data=Your output should be formatted as a standard YAML instance with the following schema:\n", + "```\n", + "rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\n", + "class_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\n", + "```\n", + "-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\n", + "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", + "-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + "-Quote the string values properly., predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'few_shot_demos': Parameter(name=Few_shot_e, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=Few shot examples to help the model, data=None, predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'input_str': Parameter(name=Input_to_t, requires_opt=False, param_type=none (), role_desc=input to the LLM, data=question: 'When reading classified ads , what does EENTY : other stand for ?', predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={})}, 'model_kwargs': {'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, traces={})\n" ] } ], @@ -925,15 +909,7 @@ "name": "python3" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", "version": "3.12.7" } }, diff --git a/notebooks/tutorials/adalflow_rag_optimization.ipynb b/notebooks/tutorials/adalflow_rag_optimization.ipynb index 4d8497ead..8cca7fd7b 100644 --- a/notebooks/tutorials/adalflow_rag_optimization.ipynb +++ b/notebooks/tutorials/adalflow_rag_optimization.ipynb @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 17, "metadata": { "id": "tAp3eDjOCma1" }, @@ -72,67 +72,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found existing installation: httpx 0.28.1\n", - "Uninstalling httpx-0.28.1:\n", - " Successfully uninstalled httpx-0.28.1\n", - "Found existing installation: anyio 4.9.0\n", - "Uninstalling anyio-4.9.0:\n", - " Successfully uninstalled anyio-4.9.0\n", - "\u001b[33mDEPRECATION: Loading egg at /Users/jinhakim/miniforge3/lib/python3.12/site-packages/matlabengine-24.2-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330\u001b[0m\u001b[33m\n", - "\u001b[0mCollecting anyio<4.0,>=3.1.0\n", - " Downloading anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)\n", - "Requirement already satisfied: idna>=2.8 in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from anyio<4.0,>=3.1.0) (3.10)\n", - "Requirement already satisfied: sniffio>=1.1 in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from anyio<4.0,>=3.1.0) (1.3.1)\n", - "Downloading anyio-3.7.1-py3-none-any.whl (80 kB)\n", - "Installing collected packages: anyio\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "mcp 1.9.4 requires httpx>=0.27, which is not installed.\n", - "openai 1.88.0 requires httpx<1,>=0.23.0, which is not installed.\n", - "litellm 1.75.3 requires httpx>=0.23.0, which is not installed.\n", - "fastmcp 2.9.0 requires httpx>=0.28.1, which is not installed.\n", - "mcp 1.9.4 requires anyio>=4.5, but you have anyio 3.7.1 which is incompatible.\n", - "sse-starlette 2.3.6 requires anyio>=4.7.0, but you have anyio 3.7.1 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed anyio-3.7.1\n", - "\u001b[33mDEPRECATION: Loading egg at /Users/jinhakim/miniforge3/lib/python3.12/site-packages/matlabengine-24.2-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330\u001b[0m\u001b[33m\n", - "\u001b[0mCollecting httpx==0.24.1\n", - " Downloading httpx-0.24.1-py3-none-any.whl.metadata (7.4 kB)\n", - "Requirement already satisfied: certifi in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpx==0.24.1) (2025.6.15)\n", - "Collecting httpcore<0.18.0,>=0.15.0 (from httpx==0.24.1)\n", - " Downloading httpcore-0.17.3-py3-none-any.whl.metadata (18 kB)\n", - "Requirement already satisfied: idna in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpx==0.24.1) (3.10)\n", - "Requirement already satisfied: sniffio in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpx==0.24.1) (1.3.1)\n", - "Collecting h11<0.15,>=0.13 (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1)\n", - " Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)\n", - "Requirement already satisfied: anyio<5.0,>=3.0 in /Users/jinhakim/miniforge3/lib/python3.12/site-packages (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (3.7.1)\n", - "Downloading httpx-0.24.1-py3-none-any.whl (75 kB)\n", - "Downloading httpcore-0.17.3-py3-none-any.whl (74 kB)\n", - "Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", - "Installing collected packages: h11, httpcore, httpx\n", - " Attempting uninstall: h11\n", - " Found existing installation: h11 0.16.0\n", - " Uninstalling h11-0.16.0:\n", - " Successfully uninstalled h11-0.16.0\n", - " Attempting uninstall: httpcore\n", - " Found existing installation: httpcore 1.0.9\n", - " Uninstalling httpcore-1.0.9:\n", - " Successfully uninstalled httpcore-1.0.9\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", - "mcp 1.9.4 requires anyio>=4.5, but you have anyio 3.7.1 which is incompatible.\n", - "mcp 1.9.4 requires httpx>=0.27, but you have httpx 0.24.1 which is incompatible.\n", - "python-telegram-bot 21.10 requires httpx~=0.27, but you have httpx 0.24.1 which is incompatible.\n", - "fastmcp 2.9.0 requires httpx>=0.28.1, but you have httpx 0.24.1 which is incompatible.\n", - "langgraph-sdk 0.1.70 requires httpx>=0.25.2, but you have httpx 0.24.1 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed h11-0.14.0 httpcore-0.17.3 httpx-0.24.1\n" - ] - } - ], + "outputs": [], "source": [ "!pip uninstall httpx anyio -y\n", "!pip install \"anyio>=3.1.0,<4.0\"\n", @@ -156,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -169,6 +111,7 @@ "name": "stdout", "output_type": "stream", "text": [ + "Please enter your OpenAI API key: ··········\n", "API keys have been set.\n" ] } @@ -190,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 20, "metadata": { "id": "aE3I05BqOmd7" }, @@ -213,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "cqUUoua9fUxQ" }, @@ -238,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -251,25 +194,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "split_csv_path: /Users/jinhakim/.adalflow/cache_datasets/hotpot_qa_dev_titles/train.json\n" + "HotPotQAData(id='5a8b57f25542995d1e6f1371', question='Were Scott Derrickson and Ed Wood of the same nationality?', answer='yes', gold_titles=\"{'Scott Derrickson', 'Ed Wood'}\") \n" ] }, { - "ename": "RuntimeError", - "evalue": "Dataset scripts are no longer supported, but found hotpot_qa.py", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mRuntimeError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 22\u001b[39m\n\u001b[32m 15\u001b[39m answer: \u001b[38;5;28mstr\u001b[39m = field(\n\u001b[32m 16\u001b[39m metadata={\u001b[33m\"\u001b[39m\u001b[33mdesc\u001b[39m\u001b[33m\"\u001b[39m: \u001b[33m\"\u001b[39m\u001b[33mThe answer you produced\u001b[39m\u001b[33m\"\u001b[39m},\n\u001b[32m 17\u001b[39m )\n\u001b[32m 19\u001b[39m __output_fields__ = [\u001b[33m\"\u001b[39m\u001b[33mreasoning\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33manswer\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m---> \u001b[39m\u001b[32m22\u001b[39m dataset = \u001b[43mHotPotQA\u001b[49m\u001b[43m(\u001b[49m\u001b[43msplit\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m20\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 23\u001b[39m \u001b[38;5;28mprint\u001b[39m(dataset[\u001b[32m0\u001b[39m], \u001b[38;5;28mtype\u001b[39m(dataset[\u001b[32m0\u001b[39m]))\n\u001b[32m 25\u001b[39m HotPotQAData(\n\u001b[32m 26\u001b[39m \u001b[38;5;28mid\u001b[39m=\u001b[33m\"\u001b[39m\u001b[33m5a8b57f25542995d1e6f1371\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 27\u001b[39m question=\u001b[33m\"\u001b[39m\u001b[33mWere 
Scott Derrickson and Ed Wood of the same nationality?\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 28\u001b[39m answer=\u001b[33m\"\u001b[39m\u001b[33myes\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 29\u001b[39m gold_titles=\u001b[33m\"\u001b[39m\u001b[33m{\u001b[39m\u001b[33m'\u001b[39m\u001b[33mScott Derrickson\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33mEd Wood\u001b[39m\u001b[33m'\u001b[39m\u001b[33m}\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 30\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/hotpot_qa.py:50\u001b[39m, in \u001b[36mHotPotQA.__init__\u001b[39m\u001b[34m(self, only_hard_examples, root, split, keep_details, size, **kwargs)\u001b[39m\n\u001b[32m 48\u001b[39m split_csv_path = os.path.join(data_path, \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.json\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 49\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33msplit_csv_path: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit_csv_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m50\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_check_or_download_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 51\u001b[39m \u001b[43m \u001b[49m\u001b[43msplit_csv_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43monly_hard_examples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeep_details\u001b[49m\n\u001b[32m 52\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 54\u001b[39m \u001b[38;5;66;03m# load from csv\u001b[39;00m\n\u001b[32m 55\u001b[39m \u001b[38;5;28mself\u001b[39m.data = []\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/code/AdalFlow/adalflow/adalflow/datasets/hotpot_qa.py:111\u001b[39m, in \u001b[36mHotPotQA._check_or_download_dataset\u001b[39m\u001b[34m(self, data_path, split, only_hard_examples, keep_details)\u001b[39m\n\u001b[32m 100\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m only_hard_examples, (\n\u001b[32m 101\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mCare must be taken when adding support for easy examples.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 102\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mDev must be all hard to match official dev, but training can be flexible.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 103\u001b[39m )\n\u001b[32m 105\u001b[39m \u001b[38;5;66;03m# hf_official_train = load_dataset(\u001b[39;00m\n\u001b[32m 106\u001b[39m \u001b[38;5;66;03m# \"hotpot_qa\", \"fullwiki\", split=\"train\", trust_remote_code=True\u001b[39;00m\n\u001b[32m 107\u001b[39m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[32m 108\u001b[39m \u001b[38;5;66;03m# hf_official_dev = load_dataset(\u001b[39;00m\n\u001b[32m 109\u001b[39m \u001b[38;5;66;03m# \"hotpot_qa\", \"fullwiki\", split=\"validation\", trust_remote_code=True\u001b[39;00m\n\u001b[32m 110\u001b[39m \u001b[38;5;66;03m# )\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m111\u001b[39m hf_official_train = \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 112\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mhotpot_qa\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfullwiki\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtrain\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 113\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 114\u001b[39m hf_official_dev = load_dataset(\n\u001b[32m 115\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mhotpot_qa\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfullwiki\u001b[39m\u001b[33m\"\u001b[39m, split=\u001b[33m\"\u001b[39m\u001b[33mvalidation\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 116\u001b[39m )\n\u001b[32m 117\u001b[39m data_path_dir = os.path.dirname(data_path)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1392\u001b[39m, in \u001b[36mload_dataset\u001b[39m\u001b[34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1387\u001b[39m verification_mode = VerificationMode(\n\u001b[32m 1388\u001b[39m (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode.BASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode.ALL_CHECKS\n\u001b[32m 1389\u001b[39m )\n\u001b[32m 1391\u001b[39m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1392\u001b[39m builder_instance = \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1393\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1394\u001b[39m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m=\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1395\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1396\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1397\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1398\u001b[39m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1399\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1400\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1401\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1402\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1403\u001b[39m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1404\u001b[39m \u001b[43m 
\u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1405\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1407\u001b[39m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[32m 1408\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1132\u001b[39m, in \u001b[36mload_dataset_builder\u001b[39m\u001b[34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, storage_options, **config_kwargs)\u001b[39m\n\u001b[32m 1130\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m features \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1131\u001b[39m features = _fix_for_backward_compatible_features(features)\n\u001b[32m-> \u001b[39m\u001b[32m1132\u001b[39m dataset_module = \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1133\u001b[39m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1134\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1135\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1136\u001b[39m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1137\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1138\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1139\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1140\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1141\u001b[39m \u001b[38;5;66;03m# Get dataset builder class\u001b[39;00m\n\u001b[32m 1142\u001b[39m builder_kwargs = dataset_module.builder_kwargs\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:1031\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 1026\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[32m 1027\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m 1028\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. 
\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 1029\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find \u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1).\u001b[34m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 1030\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1031\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1032\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1033\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCouldn\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt find any data file at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Library/Caches/pypoetry/virtualenvs/adalflow-project-Y74mvs4e-py3.12/lib/python3.12/site-packages/datasets/load.py:989\u001b[39m, in \u001b[36mdataset_module_factory\u001b[39m\u001b[34m(path, revision, download_config, download_mode, data_dir, data_files, cache_dir, **download_kwargs)\u001b[39m\n\u001b[32m 981\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 982\u001b[39m api.hf_hub_download(\n\u001b[32m 983\u001b[39m repo_id=path,\n\u001b[32m 984\u001b[39m filename=filename,\n\u001b[32m (...)\u001b[39m\u001b[32m 987\u001b[39m proxies=download_config.proxies,\n\u001b[32m 988\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m989\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDataset scripts are no longer supported, but found \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 990\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m EntryNotFoundError:\n\u001b[32m 991\u001b[39m \u001b[38;5;66;03m# Use the infos from the parquet export except in some cases:\u001b[39;00m\n\u001b[32m 992\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data_dir \u001b[38;5;129;01mor\u001b[39;00m data_files \u001b[38;5;129;01mor\u001b[39;00m (revision \u001b[38;5;129;01mand\u001b[39;00m revision != \u001b[33m\"\u001b[39m\u001b[33mmain\u001b[39m\u001b[33m\"\u001b[39m):\n", - "\u001b[31mRuntimeError\u001b[39m: Dataset scripts are no longer supported, but found hotpot_qa.py" - ] + "data": { + "text/plain": [ + "HotPotQAData(id='5a8b57f25542995d1e6f1371', question='Were Scott Derrickson and Ed Wood of the same nationality?', answer='yes', gold_titles=\"{'Scott Derrickson', 'Ed Wood'}\")" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -566,15 +502,7 @@ "name": "python3" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", "version": "3.12.7" } }, From 
93c89de1a55126b2d95d4c85c18289535ebc78ad Mon Sep 17 00:00:00 2001 From: phi-jkim Date: Thu, 14 Aug 2025 11:53:16 -0700 Subject: [PATCH 3/3] Remove design document and modify tutorial --- docs/datasets_benchmarks.md | 226 ------------------ .../runner_trainable_tutorial.py | 89 +------ 2 files changed, 7 insertions(+), 308 deletions(-) delete mode 100644 docs/datasets_benchmarks.md diff --git a/docs/datasets_benchmarks.md b/docs/datasets_benchmarks.md deleted file mode 100644 index f4dfa771c..000000000 --- a/docs/datasets_benchmarks.md +++ /dev/null @@ -1,226 +0,0 @@ -# State-of-the-Art AI Agent Benchmark Datasets (2024-2025) - -## **SWE Benchmark Datasets** - -### **SWE-bench: Core Dataset** - -**Generic description**: An evaluation framework consisting of 2,294 software engineering problems drawn from real GitHub issues and corresponding pull requests across 12 popular Python repositories. - -SWE-bench evaluates how well language models can solve real-world software engineering tasks. The evaluation process involves: -1. **Input**: Code repository and issue description -2. **Task**: Generate a patch that resolves the described problem -3. **Evaluation**: Test patch against existing test suites and issue requirements -4. **Environment**: Fully containerized evaluation using Docker for reproducible results - -### **Latest SWE-bench Variants (2024-2025)** - -#### **SWE-bench Verified (August 2024)** -A human-validated subset of 500 high-quality test cases from the original benchmark. Each task has been carefully reviewed and validated by human experts to ensure: -- Clear problem statements without ambiguity -- Correct expected solutions -- Proper test case coverage -- Elimination of data leakage issues - -**Evaluation methodology**: Uses the same Docker-based evaluation harness as the original SWE-bench, but with stricter quality controls and human verification of ground truth solutions. - -#### **SWE-bench Multimodal (January 2025)** -Extends the original benchmark to include issues with visual elements and UI components. Features: -- Screenshots and visual mockups as part of issue descriptions -- UI/UX related bug reports and feature requests -- Frontend component implementation tasks -- Visual regression testing capabilities - -#### **SWE-bench Live (2025)** -A continuously updated benchmark to prevent data contamination. Key features: -- Monthly updates with new GitHub issues -- Automated dataset curation pipeline -- Real-time issue collection from active repositories -- Covers 164 repositories across diverse domains - -### **SWE-Lancer: Economic-Driven Benchmark (2024)** -**Generic description**: Real freelance software engineering tasks from Upwork, providing economic context to evaluation. - -**Evaluation methodology**: -- Tasks sourced from actual Upwork freelance projects -- Economic value ranges from $50 bug fixes to $32,000 feature implementations -- End-to-end testing using Playwright-powered browser automation -- Includes both technical implementation and project management assessment -- Success measured by client acceptance criteria rather than just unit tests - -### **Expansion Opportunities for AdalFlow** - -We could expand the benchmark to include not only issues but also pull requests labeled as features, to test a coding agent's ability to implement entire features rather than just resolve reported issues. We could start with AdalFlow itself and see whether an agent can implement core features given the design documents in Notion. Open-source libraries like PyTorch already provide specific documentation and design details for each feature.
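-
-A minimal sketch of what one entry in such a feature-implementation dataset could carry (the `FeatureTask` name and its fields below are hypothetical illustrations, not an existing AdalFlow or SWE-bench type):
-
-```python
-from dataclasses import dataclass, field
-from typing import List
-
-
-@dataclass
-class FeatureTask:
-    """One task derived from a merged feature PR (hypothetical schema)."""
-
-    repo: str               # e.g. "SylphAI-Inc/AdalFlow"
-    base_commit: str        # commit immediately before the feature PR was merged
-    design_doc: str         # documentation/design notes handed to the agent
-    test_paths: List[str] = field(default_factory=list)  # tests added by the PR, used for grading
-
-
-# Evaluation sketch: check out `base_commit`, let the agent generate a patch
-# from `design_doc`, apply it, and run only the tests listed in `test_paths`.
-```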
- -For these repositories, we can create a similar dataset with the base commit before the feature was merged, provide the documentation and details, and evaluate against the tests that are part of the feature PR. - - -## **MLE Benchmark (Kaggle Competitions)** - -### **MLE-bench: OpenAI's Machine Learning Engineering Benchmark (2024)** - -**Generic description**: A benchmark for measuring how well AI agents perform at machine learning engineering, constructed using a collection of 75 ML engineering competitions sourced from Kaggle. - -**Evaluation methodology**: -1. **Environment Setup**: Each competition runs in an isolated Docker container with standardized dependencies -2. **Data Access**: Agents receive training data and competition description, must produce final submission -3. **Evaluation Pipeline**: - - Agents have access to local evaluation tools for validation - - Final submissions scored against held-out test sets - - Performance measured using Kaggle's medal system (Bronze/Silver/Gold) -4. **Resource Management**: Computational limits and timeouts enforced per competition -5. **Multi-domain Testing**: Covers NLP, computer vision, signal processing, and tabular data tasks - -**Key Infrastructure**: -- Preparation scripts split publicly available training sets into new train/test divisions -- Each competition includes problem description, dataset, local evaluation tools, and automated grading -- Standardized agent interfaces across all 75 competitions -- Reproducible evaluation with consistent environment specifications - -### **Latest Kaggle AI Agent Competitions (2024-2025)** - -#### **ARC Prize 2024 & 2025** -**Generic description**: Evaluation of AI systems on novel reasoning tasks requiring efficient learning of new skills. - -**Evaluation methodology**: -- Tests on abstract reasoning corpus requiring pattern recognition and rule inference -- Computational budget constraints ($50 for 120 evaluation tasks) -- Measures both performance accuracy and efficiency per task -- Human baseline comparison for intelligence measurement - -#### **Kaggle Game Arena (2024)** -**Generic description**: Platform for head-to-head AI model competition in strategic games. - -**Evaluation framework**: -- Real-time strategic game competitions (Go, poker, video games) -- ELO rating system for skill measurement -- Head-to-head matchmaking and tournament structures -- Dynamic leaderboards with continuous evaluation - -### **Expansion Opportunities for AdalFlow** - -We can pull Kaggle competitions, split each public dataset into training, validation, and test splits, and score model submissions with the medal system, as the sketch below illustrates. From this evaluation setup, a base Docker environment with the common dependencies is created for each agent and reused across all of the competitions. Similarly, we can integrate cloud compute or provided Docker environments to execute code and submit entries against model competitions. - -The agent should be able to run its computation in a Docker container either in the cloud or locally; the cloud would be preferable for parallelizing training and testing.
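-
-A minimal sketch of the dataset-splitting step, in the spirit of MLE-bench's preparation scripts (the CSV path, function name, and the 80/10/10 ratio are illustrative assumptions, not MLE-bench's actual code):
-
-```python
-import pandas as pd
-
-
-def split_competition_data(csv_path: str, seed: int = 42):
-    """Shuffle a competition's public training set and carve out held-out splits."""
-    df = pd.read_csv(csv_path).sample(frac=1.0, random_state=seed)  # shuffle rows
-    n = len(df)
-    train = df.iloc[: int(0.8 * n)]             # given to the agent
-    val = df.iloc[int(0.8 * n): int(0.9 * n)]   # local validation for the agent
-    test = df.iloc[int(0.9 * n):]               # held out; used to assign medals
-    return train, val, test
-
-
-train, val, test = split_competition_data("competitions/titanic/train.csv")
-```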
- -## **Paper Implementation Datasets** - -### **Paper2Code: Automating Code Generation from Scientific Papers** - -**Link**: https://github.com/going-doer/Paper2Code - -**Generic description**: A benchmark that evaluates agents' ability to implement papers, which can be an important aspect of AI development. - -#### **Paper2CodeBench (2024)** -**Dataset Construction Methodology**: -1. **Paper Collection**: Uses OpenReview API to gather accepted papers from ICLR, ICML, and NeurIPS 2024 -2. **Filtering Criteria**: - - Code availability requirement - - Token limit <70,000 to ensure LLM processing feasibility - - Quality screening using GPT-4o model-based evaluation -3. **Final Selection**: Top 30 papers from each venue (90 total papers) - -**Evaluation Framework**: -- **Input**: Research paper PDF and any supplementary materials -- **Task**: Generate complete, functional code repository implementing the paper's methodology -- **Assessment**: Multi-stage evaluation including: - - Code functionality and correctness - - Implementation completeness - - Adherence to paper specifications - - Reproducibility of reported results - -#### **PaperBench Code-Dev** -**Generic description**: 20 papers from ICML 2024 with human-annotated evaluation rubrics. - -**Evaluation methodology**: -- Paper-specific rubrics created by human experts -- LLM-based evaluation using structured assessment criteria -- Focus on implementation correctness rather than just code generation -- Rubrics cover algorithm accuracy, code quality, and experimental reproducibility - -### **MLR-Bench: Machine Learning Research Benchmark (2024)** - -**Generic description**: A comprehensive benchmark for evaluating AI agents on open-ended machine learning research tasks. - -**Evaluation Infrastructure**: -1. **MLR-Judge**: Automated evaluation framework using LLM-based reviewers with structured rubrics -2. **MLR-Agent**: Modular agent scaffold providing standardized interface for task completion -3. **Multi-stage Assessment**: Research proposal, methodology, implementation, and result analysis - -**Task Methodology**: -- **Task Categories**: 9 core ML topics covering diverse research areas -- **Open-ended Format**: Tasks require creative problem-solving rather than just implementation -- **Research Pipeline**: Full research cycle from problem formulation to experimental validation -- **Peer Review Simulation**: LLM-based reviewers assess work quality using academic standards - -### **Expansion Opportunities for AdalFlow** - -**(Implementing papers)** - -**Paper2Code: Automating Code Generation from Scientific Papers in Machine Learning** - -The Paper2Code authors evaluated agents' ability to implement papers, which can be an important capability to benchmark. As the paper describes its dataset construction: - -We construct a new benchmark dataset, which we refer to as Paper2CodeBench. Specifically, we collect the accepted papers from recent machine learning venues (such as ICLR, ICML, and NeurIPS 2024) with the OpenReview API, and filter them based on the availability of code with its total number of tokens less than 70,000, to ensure the full repository remains within reasonable processing limits of modern LLMs for generation and evaluation. Also, to maintain the benchmark quality, we perform model-based evaluation with GPT-4o on all the collected repositories and select the top 30 from each venue, resulting in a total of 90 papers. - -In addition to Paper2CodeBench, we also use the recently released PaperBench Code-Dev, which consists of 20 papers from ICML 2024 with paper-specific rubrics annotated by humans. In particular, those rubrics are used to judge the correct implementation based on LLM-based evaluation. - -## **Additional State-of-the-Art Benchmarks (2024-2025)** - -### **Agent-Specific Benchmarks** - -#### **AgentBoard (NeurIPS 2024)** -**Generic description**: Analytical evaluation board for multi-turn LLM agents focusing on complex interaction scenarios.
- -**Evaluation methodology**: -- Multi-turn conversation evaluation with state persistence -- Partially-observable environment maintenance across interactions -- Long-horizon task completion assessment -- Comprehensive evaluation across diverse agent capabilities - -#### **WebAgent for Real-World Web Tasks (ICLR 2024)** -**Generic description**: LLM-driven agent evaluation on real website interaction tasks. - -**Evaluation framework**: -- Real website interaction using browser automation -- Natural language instruction parsing and execution -- Task decomposition into canonical sub-instructions -- HTML document processing and summarization capabilities -- End-to-end task completion measurement - -#### **MetaGPT Multi-Agent Framework (ICLR 2024)** -**Generic description**: Meta-programming framework for LLM-based multi-agent collaborations. - -**Evaluation approach**: -- Multi-agent coordination and communication assessment -- Workflow efficiency measurement in collaborative tasks -- Human workflow integration and optimization -- Complex project completion evaluation - -## **How AdalFlow Datasets Could Be Constructed** - -Based on the analysis of state-of-the-art benchmarks, AdalFlow could construct datasets in several key areas: - -### **1. SWE-bench Style Implementation** -**Evaluation methodology**: -- Use AdalFlow's own GitHub issues and feature requests as source material -- Create base commits before features were merged to establish clean starting points -- Provide design documents from Notion as contextual requirements -- Evaluate against existing test suites and integration tests -- Include Docker containerization for reproducible evaluation environments -- Test both bug fixes and feature implementations - -### **2. MLE-bench Integration** -**Evaluation framework**: -- Curate machine learning competitions specifically relevant to AdalFlow's optimization capabilities -- Create standardized Docker environments for reproducible evaluation across different tasks -- Focus evaluation on prompt optimization, few-shot learning, and auto-differentiation tasks -- Implement cloud compute integration for scalable evaluation infrastructure -- Develop metrics for measuring optimization efficiency alongside task performance - -### **3. 
Paper Implementation Focus** -**Evaluation approach**: -- Target papers related to auto-optimization, prompt engineering, and LLM workflow optimization -- Use AdalFlow's research on "Auto-Differentiating Any LLM Workflow" as baseline implementation -- Create specialized rubrics for evaluating optimization algorithm implementations -- Focus on recent LLM workflow optimization papers from top-tier venues -- Implement automated testing for optimization convergence and performance improvement diff --git a/tutorials/v1_agent_runner/runner_trainable_tutorial.py b/tutorials/v1_agent_runner/runner_trainable_tutorial.py index dfac9fde0..d6660df68 100644 --- a/tutorials/v1_agent_runner/runner_trainable_tutorial.py +++ b/tutorials/v1_agent_runner/runner_trainable_tutorial.py @@ -129,82 +129,9 @@ def example_3_comparison_call_vs_forward(): return call_result, forward_result - -def example_4_using_with_trainer(): - """Example 4: How to use trainable results with AdalFlow trainer.""" - print("\n=== Example 4: Using with Trainer ===") - - from adalflow.datasets.gsm8k import GSM8K - from adalflow.eval.answer_match_acc import AnswerMatchAcc - - # Load a small dataset for demonstration - train_data = GSM8K(split="train", size=5) - - agent = create_simple_agent() - runner = adal.Runner(agent=agent, training=True) - - # Create a simple trainer component - class RunnerTrainerDemo(adal.AdalComponent): - def __init__(self, runner): - eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item - loss_fn = adal.EvalFnToTextLoss( - eval_fn=eval_fn, - eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0", - ) - super().__init__(task=runner, eval_fn=eval_fn, loss_fn=loss_fn) - - def prepare_task(self, sample): - # This calls runner.forward() when in training mode - return self.task.forward, {"prompt_kwargs": {"input_str": sample.question}, "id": sample.id} - - def prepare_eval(self, sample, y_pred): - # y_pred will be an OutputParameter when using forward() - if isinstance(y_pred, OutputParameter): - y_label = y_pred.data.answer if y_pred.data.answer else "" - else: - y_label = y_pred.answer if y_pred.answer else "" - return self.eval_fn, {"y": y_label, "y_gt": sample.answer} - - def prepare_loss(self, sample, pred): - # pred is the OutputParameter from forward() - y_gt = adal.Parameter( - name="y_gt", - data=sample.answer, - eval_input=sample.answer, - requires_opt=False, - ) - - # Set eval_input for the prediction - if hasattr(pred.data, 'answer') and pred.data.answer is not None: - pred.eval_input = pred.data.answer - else: - pred.eval_input = str(pred.data) if pred.data is not None else "" - - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id} - - # Create trainer component - trainer_component = RunnerTrainerDemo(runner) - - # Test with a single sample - sample = train_data[0] - print(f"Sample question: {sample.question}") - print(f"Ground truth: {sample.answer}") - - # This will call runner.forward() and return trainable result - task_fn, task_kwargs = trainer_component.prepare_task(sample) - trainable_result = task_fn(**task_kwargs) - - print("Sample question: ", sample.question) - print(f"Trainable result type: {type(trainable_result)}") - print(f"Final answer: {trainable_result.data.answer}") - print(f"Is optimizable: {trainable_result.requires_opt}") - - return trainable_result - - -def example_5_gradient_tracking(): +def example_4_gradient_tracking(): """Example 5: Understanding gradient tracking in trainable results.""" - print("\n=== Example 5: Gradient Tracking ===") + 
print("\n=== Example 4: Gradient Tracking ===") agent = create_simple_agent() runner = adal.Runner(agent=agent, training=True) @@ -240,11 +167,10 @@ def main(): try: # Run examples - # example_1_basic_forward_usage() - # example_2_extracting_final_output() - # example_3_comparison_call_vs_forward() - example_4_using_with_trainer() - # example_5_gradient_tracking() + example_1_basic_forward_usage() + example_2_extracting_final_output() + example_3_comparison_call_vs_forward() + example_4_gradient_tracking() print("\n" + "=" * 50) print("Tutorial completed successfully!") @@ -252,8 +178,7 @@ def main(): print("1. Use runner.forward() instead of runner.call() for trainable results") print("2. Set training=True when creating Runner for optimization") print("3. Forward() returns OutputParameter with gradient information") - print("4. Access final answer via result.data.answer") - print("5. Trainable results can be used with AdalFlow's optimization system") + print("4. Trainable results can be used with AdalFlow's optimization system") except Exception as e: print(f"Error occurred during tutorial: {e}")