Commit 2aac521

Disable parallel tool calls final answer (#1539)
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
1 parent 464612a commit 2aac521

2 files changed: 92 additions & 25 deletions

src/smolagents/agents.py

Lines changed: 5 additions & 0 deletions
@@ -1301,6 +1301,11 @@ def _step_stream(
                 yield output
                 if isinstance(output, ToolOutput):
                     if output.is_final_answer:
+                        if len(chat_message.tool_calls) > 1:
+                            raise AgentExecutionError(
+                                "If you want to return an answer, please do not perform any other tool calls than the final answer tool call!",
+                                self.logger,
+                            )
                         if got_final_answer:
                             raise AgentToolExecutionError(
                                 "You returned multiple final answers. Please return only one single final answer!",
tests/test_agents.py

Lines changed: 87 additions & 25 deletions
@@ -439,9 +439,7 @@ def fake_image_generation_tool(prompt: str) -> PIL.Image.Image:
 
             return PIL.Image.open(shared_datadir / "000000039769.png")
 
-        agent = ToolCallingAgent(
-            tools=[fake_image_generation_tool], model=FakeToolCallModelImage(), verbosity_level=10
-        )
+        agent = ToolCallingAgent(tools=[fake_image_generation_tool], model=FakeToolCallModelImage())
         output = agent.run("Make me an image.")
         assert isinstance(output, AgentImage)
         assert isinstance(agent.state["image.png"], PIL.Image.Image)
@@ -567,7 +565,6 @@ def test_function_persistence_across_steps(self):
             model=FakeCodeModelFunctionDef(),
             max_steps=2,
             additional_authorized_imports=["numpy"],
-            verbosity_level=100,
         )
         res = agent.run("ok")
         assert res[0] == 0.5
@@ -652,7 +649,7 @@ def weather_api(location: str, celsius: str = "") -> str:
             device_map="auto",
             do_sample=False,
         )
-        agent = ToolCallingAgent(model=model, tools=[weather_api], max_steps=1, verbosity_level=10)
+        agent = ToolCallingAgent(model=model, tools=[weather_api], max_steps=1)
         task = "What is the weather in Paris? "
         agent.run(task)
         assert agent.memory.steps[0].task == task
@@ -679,7 +676,6 @@ def check_always_fails(final_answer, agent_memory):
             model=FakeCodeModel(),
             tools=[],
             final_answer_checks=[lambda x, y: x == 7.2904],
-            verbosity_level=1000,
         )
         output = agent.run("Dummy task.")
         assert output == 7.2904  # Check that output is correct
@@ -1526,29 +1522,29 @@ def weather_api(location: str, date: str) -> str:
         assert agent.memory.steps[1].observations == "The weather in Paris on date:today is sunny."
 
     @patch("openai.OpenAI")
-    def test_toolcalling_agent_stream_outputs_multiple_tool_calls(self, mock_openai_client, test_tool):
-        """Test that ToolCallingAgent with stream_outputs=True returns the first final_answer when multiple are called."""
+    def test_toolcalling_agent_stream_logs_multiple_tool_calls_observations(self, mock_openai_client, test_tool):
+        """Test that ToolCallingAgent with stream_outputs=True logs the observations of all tool calls when multiple are called."""
         mock_client = mock_openai_client.return_value
         from smolagents import OpenAIServerModel
 
-        # Mock streaming response with multiple final_answer calls
+        # Mock streaming response with multiple tool calls
         mock_deltas = [
             ChoiceDelta(role=MessageRole.ASSISTANT),
             ChoiceDelta(
                 tool_calls=[
                     ChoiceDeltaToolCall(
                         index=0,
                         id="call_1",
-                        function=ChoiceDeltaToolCallFunction(name="final_answer"),
+                        function=ChoiceDeltaToolCallFunction(name="test_tool"),
                         type="function",
                     )
                 ]
             ),
             ChoiceDelta(
-                tool_calls=[ChoiceDeltaToolCall(index=0, function=ChoiceDeltaToolCallFunction(arguments='{"an'))]
+                tool_calls=[ChoiceDeltaToolCall(index=0, function=ChoiceDeltaToolCallFunction(arguments='{"in'))]
             ),
             ChoiceDelta(
-                tool_calls=[ChoiceDeltaToolCall(index=0, function=ChoiceDeltaToolCallFunction(arguments='swer"'))]
+                tool_calls=[ChoiceDeltaToolCall(index=0, function=ChoiceDeltaToolCallFunction(arguments='put"'))]
             ),
             ChoiceDelta(
                 tool_calls=[ChoiceDeltaToolCall(index=0, function=ChoiceDeltaToolCallFunction(arguments=': "out'))]
@@ -1605,13 +1601,85 @@ def __init__(self, delta):
         model = OpenAIServerModel(model_id="fakemodel")
 
         agent = ToolCallingAgent(model=model, tools=[test_tool], max_steps=1, stream_outputs=True)
-        result = agent.run("Make 2 calls to final answer: return both 'output1' and 'output2'")
-        assert len(agent.memory.steps[-1].model_output_message.tool_calls) == 2
-        assert agent.memory.steps[-1].model_output_message.tool_calls[0].function.name == "final_answer"
-        assert agent.memory.steps[-1].model_output_message.tool_calls[1].function.name == "test_tool"
+        agent.run("Dummy task")
+        assert agent.memory.steps[1].model_output_message.tool_calls[0].function.name == "test_tool"
+        assert agent.memory.steps[1].model_output_message.tool_calls[1].function.name == "test_tool"
+        assert agent.memory.steps[1].observations == "Processed: output1\nProcessed: output2"
+
+    @patch("openai.OpenAI")
+    def test_toolcalling_agent_final_answer_cannot_be_called_with_parallel_tool_calls(
+        self, mock_openai_client, test_tool
+    ):
"""Test that ToolCallingAgent with stream_outputs=True returns the all tool calls when multiple are called."""
+        mock_client = mock_openai_client.return_value
+
+        from smolagents import OpenAIServerModel
 
-        # The agent should return the final answer call
-        assert result == "output1"
+        class ExtendedChatMessage(ChatMessage):
+            def __init__(self, *args, usage, **kwargs):
+                super().__init__(*args, **kwargs)
+
+            def model_dump(self, include=None):
+                return super().model_dump_json()
+
+        class MockChoice:
+            def __init__(self, chat_message):
+                self.message = chat_message
+
+        class MockChatCompletion:
+            def __init__(self, chat_message):
+                self.choices = [MockChoice(chat_message)]
+                self.usage = MockTokenUsage(prompt_tokens=10, completion_tokens=20)
+
+        class MockTokenUsage:
+            def __init__(self, prompt_tokens, completion_tokens):
+                self.prompt_tokens = prompt_tokens
+                self.completion_tokens = completion_tokens
+
+        from dataclasses import asdict
+
+        class ExtendedChatCompletionOutputMessage(ChatCompletionOutputMessage):
+            def __init__(self, *args, usage, **kwargs):
+                super().__init__(*args, **kwargs)
+                self.usage = usage
+
+            def model_dump(self, include=None):
+                print("TOOL CALLS", self.tool_calls)
+                return {
+                    "role": self.role,
+                    "content": self.content,
+                    "tool_calls": [asdict(tc) for tc in self.tool_calls],
+                }
+
+        mock_client.chat.completions.create.return_value = MockChatCompletion(
+            ExtendedChatCompletionOutputMessage(
+                role=MessageRole.ASSISTANT,
+                content=None,
+                tool_calls=[
+                    ChatMessageToolCall(
+                        id="call_0",
+                        type="function",
+                        function=ChatMessageToolCallFunction(name="test_tool", arguments={"input": "out1"}),
+                    ),
+                    ChatMessageToolCall(
+                        id="1",
+                        type="function",
+                        function=ChatMessageToolCallFunction(name="final_answer", arguments={"answer": "out1"}),
+                    ),
+                ],
+                usage=MockTokenUsage(prompt_tokens=10, completion_tokens=20),
+            )
+        )
+
+        model = OpenAIServerModel(model_id="fakemodel")
+
+        agent = ToolCallingAgent(model=model, tools=[test_tool], max_steps=1)
+        agent.run("Dummy task")
+        assert agent.memory.steps[1].error is not None
+        assert (
+            "do not perform any other tool calls than the final answer tool call!"
+            in agent.memory.steps[1].error.message
+        )
 
     @patch("huggingface_hub.InferenceClient")
     def test_toolcalling_agent_api_misformatted_output(self, mock_inference_client):
@@ -1690,18 +1758,12 @@ def forward(self, answer1: str, answer2: str) -> str:
                         name="final_answer", arguments={"answer1": "1", "answer2": "2"}
                     ),
                 ),
-                ChatMessageToolCall(
-                    id="call_1",
-                    type="function",
-                    function=ChatMessageToolCallFunction(name="test_tool", arguments={"input": "3"}),
-                ),
             ],
         )
         agent = ToolCallingAgent(tools=[test_tool, CustomFinalAnswerToolWithCustomInputs()], model=model)
         answer = agent.run("Fake task.")
         assert answer == "1 and 2"
         assert agent.memory.steps[-1].model_output_message.tool_calls[0].function.name == "final_answer"
-        assert agent.memory.steps[-1].model_output_message.tool_calls[1].function.name == "test_tool"
 
     @pytest.mark.parametrize(
         "test_case",
@@ -1932,7 +1994,7 @@ def test_errors_show_offending_line_and_error(self):
         assert "ValueError" in str(agent.memory.steps)
 
     def test_error_saves_previous_print_outputs(self):
-        agent = CodeAgent(tools=[PythonInterpreterTool()], model=FakeCodeModelError(), verbosity_level=10)
+        agent = CodeAgent(tools=[PythonInterpreterTool()], model=FakeCodeModelError())
         agent.run("What is 2 multiplied by 3.6452?")
         assert "Flag!" in str(agent.memory.steps[1].observations)
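The renamed streaming test also pins down a companion behavior: parallel calls to ordinary tools still all execute, and their observations are logged together, one line per call. A minimal plain-Python sketch of that expectation (assuming, as the asserted output suggests, that the test_tool fixture returns "Processed: <input>"; the helpers below are illustrative, not smolagents API):

# Illustrative sketch of the observation logging asserted by
# test_toolcalling_agent_stream_logs_multiple_tool_calls_observations;
# not smolagents internals.
def fake_test_tool(input: str) -> str:
    # Assumed fixture behavior, inferred from the asserted observations.
    return f"Processed: {input}"

def observations_for(tool_calls: list[tuple[str, str]]) -> str:
    # Every parallel tool call runs and its result is logged on its own line.
    return "\n".join(fake_test_tool(arg) for _, arg in tool_calls)

assert observations_for([("test_tool", "output1"), ("test_tool", "output2")]) == (
    "Processed: output1\nProcessed: output2"
)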