
Commit 983aad3
feat(langgraph): Response model attribute on invocation spans (#5212)
Add the last response model name to LangGraph invocation spans. Contributes to #5170
1 parent 40e5083 · commit 983aad3
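In effect, the invocation span gains a gen_ai.response.model data attribute taken from the last message's response_metadata. A dependency-free sketch of that behavior (the plain dicts below stand in for parsed LangGraph messages and the span's data bag; they are not the SDK's real objects):

messages = [
    {"content": "hi", "response_metadata": {"model_name": "gpt-4-0613"}},
    {"content": "done", "response_metadata": {"model_name": "gpt-4.1-2025-04-14"}},
]

span_data = {}  # stand-in for the invoke_agent span's data
last_message = messages[-1]
model_name = (last_message.get("response_metadata") or {}).get("model_name")
if model_name is not None:
    span_data["gen_ai.response.model"] = model_name

print(span_data)  # {'gen_ai.response.model': 'gpt-4.1-2025-04-14'}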

File tree: 2 files changed (+328 −0 lines)


sentry_sdk/integrations/langgraph.py

Lines changed: 18 additions & 0 deletions
@@ -349,6 +349,23 @@ def _set_usage_data(span, messages):
     )
 
 
+def _set_response_model_name(span, messages):
+    # type: (sentry_sdk.tracing.Span, Any) -> None
+    if len(messages) == 0:
+        return
+
+    last_message = messages[-1]
+    response_metadata = last_message.get("response_metadata")
+    if response_metadata is None:
+        return
+
+    model_name = response_metadata.get("model_name")
+    if model_name is None:
+        return
+
+    set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, model_name)
+
+
 def _set_response_attributes(span, input_messages, result, integration):
     # type: (Any, Optional[List[Any]], Any, LanggraphIntegration) -> None
     parsed_response_messages = _parse_langgraph_messages(result)
@@ -358,6 +375,7 @@ def _set_response_attributes(span, input_messages, result, integration):
         return
 
     _set_usage_data(span, new_messages)
+    _set_response_model_name(span, new_messages)
 
     if not (should_send_default_pii() and integration.include_prompts):
         return
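The helper is deliberately defensive about message shape. A small sketch of its guard clauses in plain Python (last_model_name and the dict messages are illustrative stand-ins, not SDK code):

def last_model_name(messages):
    # Mirrors _set_response_model_name's guards: bail out on an empty
    # message list, a missing response_metadata, or a missing model_name.
    if len(messages) == 0:
        return None
    response_metadata = messages[-1].get("response_metadata")
    if response_metadata is None:
        return None
    return response_metadata.get("model_name")

assert last_model_name([]) is None
assert last_model_name([{"content": "hi"}]) is None
assert last_model_name([{"response_metadata": {}}]) is None
assert last_model_name(
    [{"response_metadata": {"model_name": "gpt-4.1-2025-04-14"}}]
) == "gpt-4.1-2025-04-14"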

tests/integrations/langgraph/test_langgraph.py

Lines changed: 310 additions & 0 deletions
@@ -831,6 +831,316 @@ async def run_test():
     assert invoke_agent_span["data"]["gen_ai.usage.total_tokens"] == 50
 
 
+def test_pregel_invoke_span_includes_response_model(sentry_init, capture_events):
+    """
+    Test that invoke_agent spans include the response model.
+    When an agent makes multiple LLM calls, it should report the last model used.
+    """
+    sentry_init(
+        integrations=[LanggraphIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    test_state = {
+        "messages": [
+            MockMessage("Hello, can you help me?", name="user"),
+            MockMessage("Of course! How can I assist you?", name="assistant"),
+        ]
+    }
+
+    pregel = MockPregelInstance("test_graph")
+
+    expected_assistant_response = "I'll help you with that task!"
+    expected_tool_calls = [
+        {
+            "id": "call_test_123",
+            "type": "function",
+            "function": {"name": "search_tool", "arguments": '{"query": "help"}'},
+        }
+    ]
+
+    def original_invoke(self, *args, **kwargs):
+        input_messages = args[0].get("messages", [])
+        new_messages = input_messages + [
+            MockMessage(
+                content=expected_assistant_response,
+                name="assistant",
+                tool_calls=expected_tool_calls,
+                response_metadata={
+                    "token_usage": {
+                        "total_tokens": 30,
+                        "prompt_tokens": 10,
+                        "completion_tokens": 20,
+                    },
+                    "model_name": "gpt-4.1-2025-04-14",
+                },
+            )
+        ]
+        return {"messages": new_messages}
+
+    with start_transaction():
+        wrapped_invoke = _wrap_pregel_invoke(original_invoke)
+        result = wrapped_invoke(pregel, test_state)
+
+    assert result is not None
+
+    tx = events[0]
+    assert tx["type"] == "transaction"
+
+    invoke_spans = [
+        span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT
+    ]
+    assert len(invoke_spans) == 1
+
+    invoke_agent_span = invoke_spans[0]
+
+    # Verify invoke_agent span has response model
+    assert invoke_agent_span["description"] == "invoke_agent test_graph"
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+def test_pregel_ainvoke_span_includes_response_model(sentry_init, capture_events):
+    """
+    Test that invoke_agent spans include the response model.
+    When an agent makes multiple LLM calls, it should report the last model used.
+    """
+    sentry_init(
+        integrations=[LanggraphIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    test_state = {
+        "messages": [
+            MockMessage("Hello, can you help me?", name="user"),
+            MockMessage("Of course! How can I assist you?", name="assistant"),
+        ]
+    }
+
+    pregel = MockPregelInstance("test_graph")
+
+    expected_assistant_response = "I'll help you with that task!"
+    expected_tool_calls = [
+        {
+            "id": "call_test_123",
+            "type": "function",
+            "function": {"name": "search_tool", "arguments": '{"query": "help"}'},
+        }
+    ]
+
+    async def original_ainvoke(self, *args, **kwargs):
+        input_messages = args[0].get("messages", [])
+        new_messages = input_messages + [
+            MockMessage(
+                content=expected_assistant_response,
+                name="assistant",
+                tool_calls=expected_tool_calls,
+                response_metadata={
+                    "token_usage": {
+                        "total_tokens": 30,
+                        "prompt_tokens": 10,
+                        "completion_tokens": 20,
+                    },
+                    "model_name": "gpt-4.1-2025-04-14",
+                },
+            )
+        ]
+        return {"messages": new_messages}
+
+    async def run_test():
+        with start_transaction():
+            wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke)
+            result = await wrapped_ainvoke(pregel, test_state)
+            return result
+
+    result = asyncio.run(run_test())
+    assert result is not None
+
+    tx = events[0]
+    assert tx["type"] == "transaction"
+
+    invoke_spans = [
+        span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT
+    ]
+    assert len(invoke_spans) == 1
+
+    invoke_agent_span = invoke_spans[0]
+
+    # Verify invoke_agent span has response model
+    assert invoke_agent_span["description"] == "invoke_agent test_graph"
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+def test_pregel_invoke_span_uses_last_response_model(sentry_init, capture_events):
+    """
+    Test that when an agent makes multiple LLM calls (e.g., with tools),
+    the invoke_agent span reports the last response model used.
+    """
+    sentry_init(
+        integrations=[LanggraphIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    test_state = {
+        "messages": [
+            MockMessage("Hello, can you help me?", name="user"),
+            MockMessage("Of course! How can I assist you?", name="assistant"),
+        ]
+    }
+
+    pregel = MockPregelInstance("test_graph")
+
+    expected_assistant_response = "I'll help you with that task!"
+    expected_tool_calls = [
+        {
+            "id": "call_test_123",
+            "type": "function",
+            "function": {"name": "search_tool", "arguments": '{"query": "help"}'},
+        }
+    ]
+
+    def original_invoke(self, *args, **kwargs):
+        input_messages = args[0].get("messages", [])
+        new_messages = input_messages + [
+            MockMessage(
+                content=expected_assistant_response,
+                name="assistant",
+                tool_calls=expected_tool_calls,
+                response_metadata={
+                    "token_usage": {
+                        "total_tokens": 15,
+                        "prompt_tokens": 10,
+                        "completion_tokens": 5,
+                    },
+                    "model_name": "gpt-4-0613",
+                },
+            ),
+            MockMessage(
+                content=expected_assistant_response,
+                name="assistant",
+                tool_calls=expected_tool_calls,
+                response_metadata={
+                    "token_usage": {
+                        "total_tokens": 35,
+                        "prompt_tokens": 20,
+                        "completion_tokens": 15,
+                    },
+                    "model_name": "gpt-4.1-2025-04-14",
+                },
+            ),
+        ]
+        return {"messages": new_messages}
+
+    with start_transaction():
+        wrapped_invoke = _wrap_pregel_invoke(original_invoke)
+        result = wrapped_invoke(pregel, test_state)
+
+    assert result is not None
+
+    tx = events[0]
+    assert tx["type"] == "transaction"
+
+    invoke_spans = [
+        span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT
+    ]
+    assert len(invoke_spans) == 1
+
+    invoke_agent_span = invoke_spans[0]
+
+    # Verify invoke_agent span uses the LAST response model
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
+def test_pregel_ainvoke_span_uses_last_response_model(sentry_init, capture_events):
+    """
+    Test that when an agent makes multiple LLM calls (e.g., with tools),
+    the invoke_agent span reports the last response model used.
+    """
+    sentry_init(
+        integrations=[LanggraphIntegration()],
+        traces_sample_rate=1.0,
+    )
+    events = capture_events()
+
+    test_state = {
+        "messages": [
+            MockMessage("Hello, can you help me?", name="user"),
+            MockMessage("Of course! How can I assist you?", name="assistant"),
+        ]
+    }
+
+    pregel = MockPregelInstance("test_graph")
+
+    expected_assistant_response = "I'll help you with that task!"
+    expected_tool_calls = [
+        {
+            "id": "call_test_123",
+            "type": "function",
+            "function": {"name": "search_tool", "arguments": '{"query": "help"}'},
+        }
+    ]
+
+    async def original_ainvoke(self, *args, **kwargs):
+        input_messages = args[0].get("messages", [])
+        new_messages = input_messages + [
+            MockMessage(
+                content=expected_assistant_response,
+                name="assistant",
+                tool_calls=expected_tool_calls,
+                response_metadata={
+                    "token_usage": {
+                        "total_tokens": 15,
+                        "prompt_tokens": 10,
+                        "completion_tokens": 5,
+                    },
+                    "model_name": "gpt-4-0613",
+                },
+            ),
+            MockMessage(
+                content=expected_assistant_response,
+                name="assistant",
+                tool_calls=expected_tool_calls,
+                response_metadata={
+                    "token_usage": {
+                        "total_tokens": 35,
+                        "prompt_tokens": 20,
+                        "completion_tokens": 15,
+                    },
+                    "model_name": "gpt-4.1-2025-04-14",
+                },
+            ),
+        ]
+        return {"messages": new_messages}
+
+    async def run_test():
+        with start_transaction():
+            wrapped_ainvoke = _wrap_pregel_ainvoke(original_ainvoke)
+            result = await wrapped_ainvoke(pregel, test_state)
+            return result
+
+    result = asyncio.run(run_test())
+    assert result is not None
+
+    tx = events[0]
+    assert tx["type"] == "transaction"
+
+    invoke_spans = [
+        span for span in tx["spans"] if span["op"] == OP.GEN_AI_INVOKE_AGENT
+    ]
+    assert len(invoke_spans) == 1
+
+    invoke_agent_span = invoke_spans[0]
+
+    # Verify invoke_agent span uses the LAST response model
+    assert "gen_ai.response.model" in invoke_agent_span["data"]
+    assert invoke_agent_span["data"]["gen_ai.response.model"] == "gpt-4.1-2025-04-14"
+
+
 def test_complex_message_parsing():
     """Test message parsing with complex message structures."""
     messages = [
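The four tests cover the sync and async wrappers and both the single-call and multi-call cases. To run only them locally, a pytest keyword filter should work (assuming a standard checkout of the repository):

pytest tests/integrations/langgraph/test_langgraph.py -k response_model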
