Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions dspy/evaluate/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,12 @@ def _prepare_results_output(
):
return [
(
merge_dicts(example, prediction) | {metric_name: score}
merge_dicts(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot do we still need this change after adding changes to Example?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we need both changes. While Example.toDict() now serializes History correctly, dict(example) and example.items() still return raw History objects. The original code used merge_dicts(example, prediction) which calls .items() internally, so it would still fail JSON serialization without this change.

Without the evaluate.py change: merge_dicts(example, prediction) → fails with "Object of type History is not JSON serializable"

With both changes: merge_dicts(example.toDict(), prediction) → succeeds because History is already serialized to dict

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot I see, can we change merge_dicts to call toDict() if needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in ebb502c. Moved the toDict() call into merge_dicts(), which now automatically serializes objects with toDict() method. This keeps the serialization logic centralized and makes _prepare_results_output() cleaner - it now passes Example objects directly to merge_dicts().

example.toDict(),
prediction.toDict() if hasattr(prediction, "toDict") else prediction
) | {metric_name: score}
if prediction_is_dictlike(prediction)
else dict(example) | {"prediction": prediction, metric_name: score}
else example.toDict() | {"prediction": prediction, metric_name: score}
)
for example, prediction, score in results
]
Expand Down
3 changes: 3 additions & 0 deletions dspy/primitives/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ def toDict(self): # noqa: N802
def convert_to_serializable(value):
if hasattr(value, "toDict"):
return value.toDict()
elif hasattr(value, "model_dump"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot can we use isinstance?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 2a82f8d. Changed from hasattr(value, "model_dump") to isinstance(value, BaseModel) for more explicit type checking. Also added safe import handling with BaseModel = None fallback if pydantic is not available.

# Handle Pydantic models (e.g., dspy.History)
return value.model_dump()
elif isinstance(value, list):
return [convert_to_serializable(item) for item in value]
elif isinstance(value, dict):
Expand Down
141 changes: 141 additions & 0 deletions tests/evaluate/test_evaluate_with_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Test Evaluate with dspy.History objects."""
import json
import tempfile

import dspy
from dspy.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match
from dspy.predict import Predict
from dspy.utils.dummies import DummyLM


def test_evaluate_save_as_json_with_history():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot Can we add tests to existing test_evaluate.py?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in c67ba33. Moved both History tests into test_evaluate.py and removed the separate test_evaluate_with_history.py file. All 34 tests pass.

"""Test that save_as_json works with Examples containing dspy.History objects."""
# Setup
dspy.settings.configure(
lm=DummyLM(
{
"What is 1+1?": {"answer": "2"},
"What is 2+2?": {"answer": "4"},
}
)
)

# Create history objects
history1 = dspy.History(
messages=[
{"question": "Previous Q1", "answer": "Previous A1"},
]
)
history2 = dspy.History(
messages=[
{"question": "Previous Q2", "answer": "Previous A2"},
{"question": "Previous Q3", "answer": "Previous A3"},
]
)

# Create examples with history
devset = [
dspy.Example(question="What is 1+1?", answer="2", history=history1).with_inputs("question"),
dspy.Example(question="What is 2+2?", answer="4", history=history2).with_inputs("question"),
]

program = Predict("question -> answer")

# Create evaluator with save_as_json
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
temp_json = f.name

try:
evaluator = Evaluate(
devset=devset,
metric=answer_exact_match,
display_progress=False,
save_as_json=temp_json,
)

result = evaluator(program)
assert result.score == 100.0

# Verify JSON file was created and is valid
with open(temp_json) as f:
data = json.load(f)

assert len(data) == 2

# Verify history was properly serialized in first record
assert "history" in data[0]
assert isinstance(data[0]["history"], dict)
assert "messages" in data[0]["history"]
assert len(data[0]["history"]["messages"]) == 1
assert data[0]["history"]["messages"][0] == {"question": "Previous Q1", "answer": "Previous A1"}

# Verify history was properly serialized in second record
assert "history" in data[1]
assert isinstance(data[1]["history"], dict)
assert "messages" in data[1]["history"]
assert len(data[1]["history"]["messages"]) == 2
assert data[1]["history"]["messages"][0] == {"question": "Previous Q2", "answer": "Previous A2"}
assert data[1]["history"]["messages"][1] == {"question": "Previous Q3", "answer": "Previous A3"}

finally:
import os
if os.path.exists(temp_json):
os.unlink(temp_json)


def test_evaluate_save_as_csv_with_history():
    """Test that save_as_csv works with Examples containing dspy.History objects.

    Runs ``Evaluate`` over a single example whose ``history`` field is a
    ``dspy.History`` and checks that the CSV written to ``save_as_csv`` has
    exactly one row with a ``history`` column that mentions ``messages``.
    """
    # Local imports: nothing else in this test needs them.
    import csv
    import os

    # Setup
    dspy.settings.configure(
        lm=DummyLM(
            {
                "What is 1+1?": {"answer": "2"},
            }
        )
    )

    # Create history object
    history = dspy.History(
        messages=[
            {"question": "Previous Q", "answer": "Previous A"},
        ]
    )

    # Create example with history
    devset = [
        dspy.Example(question="What is 1+1?", answer="2", history=history).with_inputs("question"),
    ]

    program = Predict("question -> answer")

    # The directory and everything in it is removed when the block exits,
    # even if an assertion fails, so no manual try/finally cleanup is needed.
    # The output file does not exist beforehand, so reading it back also
    # proves that the evaluator created it.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_csv = os.path.join(tmp_dir, "results.csv")

        evaluator = Evaluate(
            devset=devset,
            metric=answer_exact_match,
            display_progress=False,
            save_as_csv=temp_csv,
        )

        result = evaluator(program)
        assert result.score == 100.0

        # The csv module expects files opened with newline="" so that it can
        # handle newlines embedded in quoted fields itself.
        with open(temp_csv, newline="") as f:
            rows = list(csv.DictReader(f))

        assert len(rows) == 1
        assert "history" in rows[0]
        # DictReader yields every cell as a string, so only check that the
        # serialized history text mentions its "messages" key.
        assert "messages" in rows[0]["history"]
31 changes: 31 additions & 0 deletions tests/primitives/test_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,34 @@ def test_example_copy_without():
def test_example_to_dict():
    """Check that toDict() returns the fields an Example was built with."""
    fields = {"a": 1, "b": 2}
    assert Example(**fields).toDict() == fields


def test_example_to_dict_with_history():
    """Check that Example.toDict() turns a dspy.History field into plain, JSON-safe data."""
    import json

    # Kept separate from the input literal below so the comparison is against
    # an independent copy of the expected content.
    expected_messages = [
        {"question": "What is the capital of France?", "answer": "Paris"},
        {"question": "What is the capital of Germany?", "answer": "Berlin"},
    ]

    history = dspy.History(
        messages=[
            {"question": "What is the capital of France?", "answer": "Paris"},
            {"question": "What is the capital of Germany?", "answer": "Berlin"},
        ]
    )
    result = Example(question="Test question", history=history, answer="Test answer").toDict()

    # The top-level result is a plain dict that still carries the history field.
    assert isinstance(result, dict)
    assert "history" in result

    # The history must come back as a dict, not as a History object.
    serialized_history = result["history"]
    assert isinstance(serialized_history, dict)
    assert "messages" in serialized_history
    assert serialized_history["messages"] == expected_messages

    # A JSON round trip must succeed and preserve the messages.
    restored = json.loads(json.dumps(result))
    assert restored["history"]["messages"] == serialized_history["messages"]