stanfordnlp · TomeHirata · Nov 17, 2025 · Nov 12, 2025 · Nov 12, 2025 · Nov 13, 2025
diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
@@ -231,9 +231,12 @@ def _prepare_results_output(
     ):
         return [
             (
-                merge_dicts(example, prediction) | {metric_name: score}
+                merge_dicts(
+                    example.toDict(),
+                    prediction.toDict() if hasattr(prediction, "toDict") else prediction
+                ) | {metric_name: score}
                 if prediction_is_dictlike(prediction)
-                else dict(example) | {"prediction": prediction, metric_name: score}
+                else example.toDict() | {"prediction": prediction, metric_name: score}
             )
             for example, prediction, score in results
         ]

diff --git a/dspy/primitives/example.py b/dspy/primitives/example.py
@@ -193,6 +193,9 @@ def toDict(self):  # noqa: N802
         def convert_to_serializable(value):
             if hasattr(value, "toDict"):
                 return value.toDict()
+            elif hasattr(value, "model_dump"):
+                # Handle Pydantic models (e.g., dspy.History)
+                return value.model_dump()
             elif isinstance(value, list):
                 return [convert_to_serializable(item) for item in value]
             elif isinstance(value, dict):

diff --git a/tests/evaluate/test_evaluate_with_history.py b/tests/evaluate/test_evaluate_with_history.py
@@ -0,0 +1,141 @@
+"""Test Evaluate with dspy.History objects."""
+import json
+import tempfile
+
+import dspy
+from dspy.evaluate import Evaluate
+from dspy.evaluate.metrics import answer_exact_match
+from dspy.predict import Predict
+from dspy.utils.dummies import DummyLM
+
+
+def test_evaluate_save_as_json_with_history():
+    """Test that save_as_json works with Examples containing dspy.History objects.
+
+    The results file is written inside a TemporaryDirectory, so it is removed
+    automatically even when the evaluation or an assertion fails.
+    """
+    import os
+
+    # DummyLM maps each devset question to its expected answer, so exact match scores 100.
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+                "What is 2+2?": {"answer": "4"},
+            }
+        )
+    )
+
+    # Create history objects
+    history1 = dspy.History(
+        messages=[
+            {"question": "Previous Q1", "answer": "Previous A1"},
+        ]
+    )
+    history2 = dspy.History(
+        messages=[
+            {"question": "Previous Q2", "answer": "Previous A2"},
+            {"question": "Previous Q3", "answer": "Previous A3"},
+        ]
+    )
+
+    # Create examples with history
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
+        dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        temp_json = os.path.join(tmp_dir, "results.json")
+
+        # Create evaluator with save_as_json
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_json=temp_json,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Verify JSON file was created and is valid
+        with open(temp_json) as f:
+            data = json.load(f)
+
+    assert len(data) == 2
+
+    # Verify history was properly serialized in first record
+    assert "history" in data[0]
+    assert isinstance(data[0]["history"], dict)
+    assert "messages" in data[0]["history"]
+    assert len(data[0]["history"]["messages"]) == 1
+    assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}
+
+    # Verify history was properly serialized in second record
+    assert "history" in data[1]
+    assert isinstance(data[1]["history"], dict)
+    assert "messages" in data[1]["history"]
+    assert len(data[1]["history"]["messages"]) == 2
+    assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
+    assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}
+
+
+def test_evaluate_save_as_csv_with_history():
+    """Test that save_as_csv works with Examples containing dspy.History objects.
+
+    The results file is written inside a TemporaryDirectory, so it is removed
+    automatically even when the evaluation or an assertion fails.
+    """
+    import csv
+    import os
+
+    # DummyLM maps the devset question to its expected answer, so exact match scores 100.
+    dspy.settings.configure(
+        lm=DummyLM(
+            {
+                "What is 1+1?": {"answer": "2"},
+            }
+        )
+    )
+
+    # Create history object
+    history = dspy.History(
+        messages=[
+            {"question": "Previous Q", "answer": "Previous A"},
+        ]
+    )
+
+    # Create example with history
+    devset = [
+        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
+    ]
+
+    program = Predict("question -> answer")
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        temp_csv = os.path.join(tmp_dir, "results.csv")
+
+        # Create evaluator with save_as_csv
+        evaluator = Evaluate(
+            devset=devset,
+            metric=answer_exact_match,
+            display_progress=False,
+            save_as_csv=temp_csv,
+        )
+
+        result = evaluator(program)
+        assert result.score == 100.0
+
+        # Verify CSV file was created; the csv module expects files to be
+        # opened with newline="" so embedded line breaks are parsed correctly.
+        with open(temp_csv, newline="") as f:
+            rows = list(csv.DictReader(f))
+
+    assert len(rows) == 1
+    assert "history" in rows[0]
+    # CSV will have string representation of the dict
+    assert "messages" in rows[0]["history"]
diff --git a/tests/primitives/test_example.py b/tests/primitives/test_example.py
@@ -123,3 +123,34 @@ def test_example_copy_without():
 def test_example_to_dict():
     example = Example(a=1, b=2)
     assert example.toDict() == {"a": 1, "b": 2}
+
+
+def test_example_to_dict_with_history():
+    """Check that Example.toDict() converts a dspy.History field into plain, JSON-ready data."""
+    import json
+
+    # Copies go into History so the expected dicts below cannot alias its data.
+    france = {"question": "What is the capital of France?", "answer": "Paris"}
+    germany = {"question": "What is the capital of Germany?", "answer": "Berlin"}
+    example = Example(
+        question="Test question",
+        history=dspy.History(messages=[dict(france), dict(germany)]),
+        answer="Test answer",
+    )
+
+    as_dict = example.toDict()
+
+    # The top-level result is a dict that still carries the history field.
+    assert isinstance(as_dict, dict)
+    assert "history" in as_dict
+
+    # The History object itself has been replaced by a plain dict.
+    serialized_history = as_dict["history"]
+    assert isinstance(serialized_history, dict)
+    assert "messages" in serialized_history
+    assert serialized_history["messages"] == [france, germany]
+
+    # A JSON round trip succeeds and leaves the messages unchanged.
+    json_text = json.dumps(as_dict)
+    round_tripped = json.loads(json_text)
+    assert round_tripped["history"]["messages"] == serialized_history["messages"]