From 04c6edeb0663c9293229ff62a469859c25789fc8 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Wed, 1 Apr 2026 13:11:12 +0200 Subject: [PATCH 1/9] refactor(openai): Split token counting into separate Completions and Responses API functions Replace the shared `_calculate_token_usage()` and `_get_usage()` with two API-specific functions: `_calculate_completions_token_usage()` and `_calculate_responses_token_usage()`. This makes it clear which token fields belong to which API and enables clean removal of Chat Completions support when it is deprecated. - Completions function extracts `prompt_tokens`, `completion_tokens`, `total_tokens` and supports `streaming_message_token_usage` for stream_options include_usage - Responses function extracts `input_tokens`, `output_tokens`, `total_tokens` plus `cached_tokens` and `reasoning_tokens` details - Add API section comments in `_set_common_output_data` - Update all call sites to use the appropriate API-specific function - Convert Completions call sites to use keyword arguments - Update and rename unit tests; add Responses API token usage tests - Add sync and async streaming tests for usage-in-stream Co-Authored-By: Claude Opus 4.6 (1M context) --- sentry_sdk/integrations/openai.py | 187 +++++++++++---- tests/integrations/openai/test_openai.py | 290 +++++++++++++++++++++-- 2 files changed, 404 insertions(+), 73 deletions(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 6707f8194b..5ea2a62af6 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -50,8 +50,12 @@ from sentry_sdk.tracing import Span from sentry_sdk._types import TextPart - from openai.types.responses import ResponseInputParam, SequenceNotStr - from openai.types.responses import ResponseStreamEvent + from openai.types.responses import ( + ResponseInputParam, + SequenceNotStr, + ResponseStreamEvent, + ) + from openai.types import CompletionUsage from openai import Omit try: @@ -144,44 +148,37 @@ def _capture_exception(exc: "Any", manual_span_cleanup: bool = True) -> None: sentry_sdk.capture_event(event, hint=hint) -def _get_usage(usage: "Any", names: "List[str]") -> int: - for name in names: - if hasattr(usage, name) and isinstance(getattr(usage, name), int): - return getattr(usage, name) - return 0 - - -def _calculate_token_usage( +def _calculate_completions_token_usage( messages: "Optional[Iterable[ChatCompletionMessageParam]]", response: "Any", span: "Span", streaming_message_responses: "Optional[List[str]]", + streaming_message_token_usage: "Optional[CompletionUsage]", count_tokens: "Callable[..., Any]", ) -> None: + """Extract and record token usage from a Chat Completions API response.""" input_tokens: "Optional[int]" = 0 - input_tokens_cached: "Optional[int]" = 0 output_tokens: "Optional[int]" = 0 - output_tokens_reasoning: "Optional[int]" = 0 total_tokens: "Optional[int]" = 0 + usage = None - if hasattr(response, "usage"): - input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"]) - if hasattr(response.usage, "input_tokens_details"): - input_tokens_cached = _get_usage( - response.usage.input_tokens_details, ["cached_tokens"] - ) - - output_tokens = _get_usage( - response.usage, ["output_tokens", "completion_tokens"] - ) - if hasattr(response.usage, "output_tokens_details"): - output_tokens_reasoning = _get_usage( - response.usage.output_tokens_details, ["reasoning_tokens"] - ) + if streaming_message_token_usage: + usage = streaming_message_token_usage - total_tokens = _get_usage(response.usage, ["total_tokens"]) - - # Manually count tokens + if hasattr(response, "usage"): + usage = response.usage + + if usage is not None: + if hasattr(usage, "prompt_tokens") and isinstance(usage.prompt_tokens, int): + input_tokens = usage.prompt_tokens + if hasattr(usage, "completion_tokens") and isinstance( + usage.completion_tokens, int + ): + output_tokens = usage.completion_tokens + if hasattr(usage, "total_tokens") and isinstance(usage.total_tokens, int): + total_tokens = usage.total_tokens + + # Manually count input tokens if input_tokens == 0: for message in messages or []: if isinstance(message, str): @@ -191,11 +188,11 @@ def _calculate_token_usage( message_content = message.get("content") if message_content is None: continue - # Deliberate use of Completions function for both Completions and Responses input format. text_items = _get_text_items(message_content) input_tokens += sum(count_tokens(text) for text in text_items) continue + # Manually count output tokens if output_tokens == 0: if streaming_message_responses is not None: for message in streaming_message_responses: @@ -205,6 +202,71 @@ def _calculate_token_usage( if hasattr(choice, "message") and hasattr(choice.message, "content"): output_tokens += count_tokens(choice.message.content) + # Do not set token data if it is 0 + input_tokens = input_tokens or None + output_tokens = output_tokens or None + total_tokens = total_tokens or None + + record_token_usage( + span, + input_tokens=input_tokens, + output_tokens=output_tokens, + total_tokens=total_tokens, + ) + + +def _calculate_responses_token_usage( + input: "Any", + response: "Any", + span: "Span", + streaming_message_responses: "Optional[List[str]]", + count_tokens: "Callable[..., Any]", +) -> None: + """Extract and record token usage from a Responses API response.""" + input_tokens: "Optional[int]" = 0 + input_tokens_cached: "Optional[int]" = 0 + output_tokens: "Optional[int]" = 0 + output_tokens_reasoning: "Optional[int]" = 0 + total_tokens: "Optional[int]" = 0 + + if hasattr(response, "usage"): + usage = response.usage + if hasattr(usage, "input_tokens") and isinstance(usage.input_tokens, int): + input_tokens = usage.input_tokens + if hasattr(usage, "input_tokens_details"): + cached = getattr(usage.input_tokens_details, "cached_tokens", None) + if isinstance(cached, int): + input_tokens_cached = cached + if hasattr(usage, "output_tokens") and isinstance(usage.output_tokens, int): + output_tokens = usage.output_tokens + if hasattr(usage, "output_tokens_details"): + reasoning = getattr(usage.output_tokens_details, "reasoning_tokens", None) + if isinstance(reasoning, int): + output_tokens_reasoning = reasoning + if hasattr(usage, "total_tokens") and isinstance(usage.total_tokens, int): + total_tokens = usage.total_tokens + + # Manually count input tokens + if input_tokens == 0: + for message in input or []: + if isinstance(message, str): + input_tokens += count_tokens(message) + continue + elif isinstance(message, dict): + message_content = message.get("content") + if message_content is None: + continue + # Deliberate use of Completions function for both Completions and Responses input format. + text_items = _get_text_items(message_content) + input_tokens += sum(count_tokens(text) for text in text_items) + continue + + # Manually count output tokens + if output_tokens == 0: + if streaming_message_responses is not None: + for message in streaming_message_responses: + output_tokens += count_tokens(message) + # Do not set token data if it is 0 input_tokens = input_tokens or None input_tokens_cached = input_tokens_cached or None @@ -486,6 +548,7 @@ def _set_common_output_data( if hasattr(response, "model"): set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_MODEL, response.model) + # Chat Completions API if hasattr(response, "choices"): if should_send_default_pii() and integration.include_prompts: response_text = [ @@ -496,11 +559,19 @@ def _set_common_output_data( if len(response_text) > 0: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, response_text) - _calculate_token_usage(input, response, span, None, integration.count_tokens) + _calculate_completions_token_usage( + messages=input, + response=response, + span=span, + streaming_message_responses=None, + streaming_message_token_usage=None, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) + # Responses API elif hasattr(response, "output"): if should_send_default_pii() and integration.include_prompts: output_messages: "dict[str, list[Any]]" = { @@ -532,12 +603,22 @@ def _set_common_output_data( span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_messages["response"] ) - _calculate_token_usage(input, response, span, None, integration.count_tokens) + _calculate_responses_token_usage( + input, response, span, None, integration.count_tokens + ) if finish_span: span.__exit__(None, None, None) + # Embeddings API (fallback for responses with neither choices nor output) else: - _calculate_token_usage(input, response, span, None, integration.count_tokens) + _calculate_completions_token_usage( + messages=input, + response=response, + span=span, + streaming_message_responses=None, + streaming_message_token_usage=None, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) @@ -655,6 +736,7 @@ def _wrap_synchronous_completions_chunk_iterator( """ ttft = None data_buf: "list[list[str]]" = [] # one for each choice + streaming_message_token_usage = None for x in old_iterator: span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model) @@ -671,6 +753,8 @@ def _wrap_synchronous_completions_chunk_iterator( data_buf.append([]) data_buf[choice_index].append(content or "") choice_index += 1 + if hasattr(x, "usage"): + streaming_message_token_usage = x.usage yield x @@ -683,12 +767,13 @@ def _wrap_synchronous_completions_chunk_iterator( all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) - _calculate_token_usage( - messages, - response, - span, - all_responses, - integration.count_tokens, + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=all_responses, + streaming_message_token_usage=streaming_message_token_usage, + count_tokens=integration.count_tokens, ) if finish_span: @@ -711,6 +796,7 @@ async def _wrap_asynchronous_completions_chunk_iterator( """ ttft = None data_buf: "list[list[str]]" = [] # one for each choice + streaming_message_token_usage = None async for x in old_iterator: span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model) @@ -727,6 +813,8 @@ async def _wrap_asynchronous_completions_chunk_iterator( data_buf.append([]) data_buf[choice_index].append(content or "") choice_index += 1 + if hasattr(x, "usage"): + streaming_message_token_usage = x.usage yield x @@ -739,12 +827,13 @@ async def _wrap_asynchronous_completions_chunk_iterator( all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) - _calculate_token_usage( - messages, - response, - span, - all_responses, - integration.count_tokens, + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=all_responses, + streaming_message_token_usage=streaming_message_token_usage, + count_tokens=integration.count_tokens, ) if finish_span: @@ -781,7 +870,7 @@ def _wrap_synchronous_responses_event_iterator( if isinstance(x, ResponseCompletedEvent): span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.response.model) - _calculate_token_usage( + _calculate_responses_token_usage( input, x.response, span, @@ -802,7 +891,7 @@ def _wrap_synchronous_responses_event_iterator( if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) if count_tokens_manually: - _calculate_token_usage( + _calculate_responses_token_usage( input, response, span, @@ -844,7 +933,7 @@ async def _wrap_asynchronous_responses_event_iterator( if isinstance(x, ResponseCompletedEvent): span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.response.model) - _calculate_token_usage( + _calculate_responses_token_usage( input, x.response, span, @@ -865,7 +954,7 @@ async def _wrap_asynchronous_responses_event_iterator( if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) if count_tokens_manually: - _calculate_token_usage( + _calculate_responses_token_usage( input, response, span, diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 0fd049e742..4f1456a9f0 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -44,7 +44,8 @@ from sentry_sdk.consts import SPANDATA, OP from sentry_sdk.integrations.openai import ( OpenAIIntegration, - _calculate_token_usage, + _calculate_completions_token_usage, + _calculate_responses_token_usage, ) from sentry_sdk.utils import safe_serialize @@ -610,6 +611,166 @@ def test_streaming_chat_completion_no_prompts( pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly +def test_streaming_chat_completion_with_usage_in_stream( + sentry_init, + capture_events, + get_model_response, + server_side_event_chunks, +): + """When stream_options=include_usage is set, token usage comes from the final chunk's usage field.""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = OpenAI(api_key="z") + returned_stream = get_model_response( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="hel"), + finish_reason=None, + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="lo"), + finish_reason="stop", + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=10, + total_tokens=30, + ), + ), + ], + include_event_type=False, + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 + + +@pytest.mark.asyncio +async def test_streaming_chat_completion_async_with_usage_in_stream( + sentry_init, + capture_events, + get_model_response, + async_iterator, + server_side_event_chunks, +): + """When stream_options=include_usage is set, token usage comes from the final chunk's usage field (async).""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = AsyncOpenAI(api_key="z") + returned_stream = get_model_response( + async_iterator( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="hel"), + finish_reason=None, + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content="lo"), + finish_reason="stop", + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=10, + total_tokens=30, + ), + ), + ], + include_event_type=False, + ) + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + async for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert span["data"]["gen_ai.usage.output_tokens"] == 10 + assert span["data"]["gen_ai.usage.total_tokens"] == 30 + + # noinspection PyTypeChecker @pytest.mark.parametrize( "messages", @@ -1780,7 +1941,8 @@ async def test_span_origin_embeddings_async(sentry_init, capture_events): assert event["spans"][0]["origin"] == "auto.ai.openai" -def test_calculate_token_usage_a(): +def test_completions_token_usage_from_response(): + """Token counts are extracted from response.usage using Completions API field names.""" span = mock.MagicMock() def count_tokens(msg): @@ -1797,20 +1959,24 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, input_tokens=20, - input_tokens_cached=None, output_tokens=10, - output_tokens_reasoning=None, total_tokens=30, ) -def test_calculate_token_usage_b(): +def test_completions_token_usage_manual_input_counting(): + """When prompt_tokens is missing, input tokens are counted manually from messages.""" span = mock.MagicMock() def count_tokens(msg): @@ -1830,20 +1996,24 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, input_tokens=11, - input_tokens_cached=None, output_tokens=10, - output_tokens_reasoning=None, total_tokens=10, ) -def test_calculate_token_usage_c(): +def test_completions_token_usage_manual_output_counting_streaming(): + """When completion_tokens is missing, output tokens are counted from streaming responses.""" span = mock.MagicMock() def count_tokens(msg): @@ -1863,20 +2033,24 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, input_tokens=20, - input_tokens_cached=None, output_tokens=11, - output_tokens_reasoning=None, total_tokens=20, ) -def test_calculate_token_usage_d(): +def test_completions_token_usage_manual_output_counting_choices(): + """When completion_tokens is missing, output tokens are counted from response.choices.""" span = mock.MagicMock() def count_tokens(msg): @@ -1897,20 +2071,24 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_token_usage=None, + count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( span, input_tokens=20, - input_tokens_cached=None, output_tokens=None, - output_tokens_reasoning=None, total_tokens=20, ) -def test_calculate_token_usage_e(): +def test_completions_token_usage_no_usage_data(): + """When response has no usage data and no streaming responses, all tokens are None.""" span = mock.MagicMock() def count_tokens(msg): @@ -1923,8 +2101,72 @@ def count_tokens(msg): with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" ) as mock_record_token_usage: - _calculate_token_usage( - messages, response, span, streaming_message_responses, count_tokens + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=streaming_message_responses, + streaming_message_token_usage=None, + count_tokens=count_tokens, + ) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=None, + output_tokens=None, + total_tokens=None, + ) + + +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_token_usage_from_response(): + """Token counts including cached and reasoning tokens are extracted from Responses API.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = mock.MagicMock() + response.usage.input_tokens = 20 + response.usage.input_tokens_details = mock.MagicMock() + response.usage.input_tokens_details.cached_tokens = 5 + response.usage.output_tokens = 10 + response.usage.output_tokens_details = mock.MagicMock() + response.usage.output_tokens_details.reasoning_tokens = 8 + response.usage.total_tokens = 30 + input = [] + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_responses_token_usage(input, response, span, None, count_tokens) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=20, + input_tokens_cached=5, + output_tokens=10, + output_tokens_reasoning=8, + total_tokens=30, + ) + + +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_token_usage_no_usage_data(): + """When Responses API response has no usage data, all tokens are None.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + input = [] + streaming_message_responses = None + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_responses_token_usage( + input, response, span, streaming_message_responses, count_tokens ) mock_record_token_usage.assert_called_once_with( span, From d1ae0b296ec3cf0243f77581d856054c9e4960e2 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Wed, 1 Apr 2026 13:27:07 +0200 Subject: [PATCH 2/9] refactor(openai): Use keyword arguments for _calculate_responses_token_usage call sites Co-Authored-By: Claude Opus 4.6 (1M context) --- sentry_sdk/integrations/openai.py | 46 +++++++++++++++++-------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 5ea2a62af6..9ddbe419f0 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -604,7 +604,11 @@ def _set_common_output_data( ) _calculate_responses_token_usage( - input, response, span, None, integration.count_tokens + input=input, + response=response, + span=span, + streaming_message_responses=None, + count_tokens=integration.count_tokens, ) if finish_span: @@ -871,11 +875,11 @@ def _wrap_synchronous_responses_event_iterator( span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.response.model) _calculate_responses_token_usage( - input, - x.response, - span, - None, - integration.count_tokens, + input=input, + response=x.response, + span=span, + streaming_message_responses=None, + count_tokens=integration.count_tokens, ) count_tokens_manually = False @@ -892,11 +896,11 @@ def _wrap_synchronous_responses_event_iterator( set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) if count_tokens_manually: _calculate_responses_token_usage( - input, - response, - span, - all_responses, - integration.count_tokens, + input=input, + response=response, + span=span, + streaming_message_responses=all_responses, + count_tokens=integration.count_tokens, ) if finish_span: @@ -934,11 +938,11 @@ async def _wrap_asynchronous_responses_event_iterator( span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.response.model) _calculate_responses_token_usage( - input, - x.response, - span, - None, - integration.count_tokens, + input=input, + response=x.response, + span=span, + streaming_message_responses=None, + count_tokens=integration.count_tokens, ) count_tokens_manually = False @@ -955,11 +959,11 @@ async def _wrap_asynchronous_responses_event_iterator( set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) if count_tokens_manually: _calculate_responses_token_usage( - input, - response, - span, - all_responses, - integration.count_tokens, + input=input, + response=response, + span=span, + streaming_message_responses=all_responses, + count_tokens=integration.count_tokens, ) if finish_span: span.__exit__(None, None, None) From d8677247094a824647bb8e172bd37bee4ebdead2 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Wed, 1 Apr 2026 14:50:08 +0200 Subject: [PATCH 3/9] fix(openai): Extract cached and reasoning tokens in Completions token usage The refactor that split _calculate_token_usage into separate Completions and Responses functions dropped extraction of prompt_tokens_details.cached_tokens and completion_tokens_details.reasoning_tokens from the Completions path. This restores those fields so spans for cached prompts and reasoning models (e.g. o1/o3) report complete token usage metrics. Also fixes streaming usage priority: streaming_message_token_usage now correctly takes precedence over response.usage via elif. Co-Authored-By: Claude Opus 4.6 (1M context) --- sentry_sdk/integrations/openai.py | 19 +++++++++- tests/integrations/openai/test_openai.py | 48 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 9ddbe419f0..6c1d38e145 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -158,23 +158,34 @@ def _calculate_completions_token_usage( ) -> None: """Extract and record token usage from a Chat Completions API response.""" input_tokens: "Optional[int]" = 0 + input_tokens_cached: "Optional[int]" = 0 output_tokens: "Optional[int]" = 0 + output_tokens_reasoning: "Optional[int]" = 0 total_tokens: "Optional[int]" = 0 usage = None if streaming_message_token_usage: usage = streaming_message_token_usage - - if hasattr(response, "usage"): + elif hasattr(response, "usage"): usage = response.usage if usage is not None: if hasattr(usage, "prompt_tokens") and isinstance(usage.prompt_tokens, int): input_tokens = usage.prompt_tokens + if hasattr(usage, "prompt_tokens_details"): + cached = getattr(usage.prompt_tokens_details, "cached_tokens", None) + if isinstance(cached, int): + input_tokens_cached = cached if hasattr(usage, "completion_tokens") and isinstance( usage.completion_tokens, int ): output_tokens = usage.completion_tokens + if hasattr(usage, "completion_tokens_details"): + reasoning = getattr( + usage.completion_tokens_details, "reasoning_tokens", None + ) + if isinstance(reasoning, int): + output_tokens_reasoning = reasoning if hasattr(usage, "total_tokens") and isinstance(usage.total_tokens, int): total_tokens = usage.total_tokens @@ -204,13 +215,17 @@ def _calculate_completions_token_usage( # Do not set token data if it is 0 input_tokens = input_tokens or None + input_tokens_cached = input_tokens_cached or None output_tokens = output_tokens or None + output_tokens_reasoning = output_tokens_reasoning or None total_tokens = total_tokens or None record_token_usage( span, input_tokens=input_tokens, + input_tokens_cached=input_tokens_cached, output_tokens=output_tokens, + output_tokens_reasoning=output_tokens_reasoning, total_tokens=total_tokens, ) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 4f1456a9f0..b345cb74fc 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -1970,7 +1970,47 @@ def count_tokens(msg): mock_record_token_usage.assert_called_once_with( span, input_tokens=20, + input_tokens_cached=None, + output_tokens=10, + output_tokens_reasoning=None, + total_tokens=30, + ) + + +def test_completions_token_usage_with_detailed_fields(): + """Cached and reasoning token counts are extracted from prompt_tokens_details and completion_tokens_details.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = mock.MagicMock() + response.usage.prompt_tokens = 20 + response.usage.prompt_tokens_details = mock.MagicMock() + response.usage.prompt_tokens_details.cached_tokens = 5 + response.usage.completion_tokens = 10 + response.usage.completion_tokens_details = mock.MagicMock() + response.usage.completion_tokens_details.reasoning_tokens = 8 + response.usage.total_tokens = 30 + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_completions_token_usage( + messages=[], + response=response, + span=span, + streaming_message_responses=[], + streaming_message_token_usage=None, + count_tokens=count_tokens, + ) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=20, + input_tokens_cached=5, output_tokens=10, + output_tokens_reasoning=8, total_tokens=30, ) @@ -2007,7 +2047,9 @@ def count_tokens(msg): mock_record_token_usage.assert_called_once_with( span, input_tokens=11, + input_tokens_cached=None, output_tokens=10, + output_tokens_reasoning=None, total_tokens=10, ) @@ -2044,7 +2086,9 @@ def count_tokens(msg): mock_record_token_usage.assert_called_once_with( span, input_tokens=20, + input_tokens_cached=None, output_tokens=11, + output_tokens_reasoning=None, total_tokens=20, ) @@ -2082,7 +2126,9 @@ def count_tokens(msg): mock_record_token_usage.assert_called_once_with( span, input_tokens=20, + input_tokens_cached=None, output_tokens=None, + output_tokens_reasoning=None, total_tokens=20, ) @@ -2112,7 +2158,9 @@ def count_tokens(msg): mock_record_token_usage.assert_called_once_with( span, input_tokens=None, + input_tokens_cached=None, output_tokens=None, + output_tokens_reasoning=None, total_tokens=None, ) From 9f4b4fb2f4f462c3aa8ae0004e9e06c14de4be9c Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Wed, 1 Apr 2026 15:10:21 +0200 Subject: [PATCH 4/9] fix(openai): Use realistic Choice objects in manual output counting test The test used MagicMock(message="one") where message was a plain string, but the real OpenAI API returns Choice objects with message.content. The counting code checks hasattr(choice.message, "content"), which failed on strings, so manual token counting was never exercised. Use real Choice and ChatCompletionMessage objects and fix the expected output_tokens. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/integrations/openai/test_openai.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index b345cb74fc..1a3960f534 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -2105,12 +2105,24 @@ def count_tokens(msg): response.usage.prompt_tokens = 20 response.usage.total_tokens = 20 response.choices = [ - mock.MagicMock(message="one"), - mock.MagicMock(message="two"), - mock.MagicMock(message="three"), + Choice( + index=0, + finish_reason="stop", + message=ChatCompletionMessage(role="assistant", content="one"), + ), + Choice( + index=1, + finish_reason="stop", + message=ChatCompletionMessage(role="assistant", content="two"), + ), + Choice( + index=2, + finish_reason="stop", + message=ChatCompletionMessage(role="assistant", content="three"), + ), ] messages = [] - streaming_message_responses = [] + streaming_message_responses = None with mock.patch( "sentry_sdk.integrations.openai.record_token_usage" @@ -2127,7 +2139,7 @@ def count_tokens(msg): span, input_tokens=20, input_tokens_cached=None, - output_tokens=None, + output_tokens=11, output_tokens_reasoning=None, total_tokens=20, ) From 6d4810025f3d84b37c068bf5b027fd673e91212a Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Wed, 1 Apr 2026 15:47:55 +0200 Subject: [PATCH 5/9] fix(openai): Skip stream_options tests on OpenAI SDK <=1.1.0 The stream_options parameter was not available in early versions of the OpenAI Python SDK, causing TypeError on v1.0.1 CI runs. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/integrations/openai/test_openai.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 1a3960f534..550fb892be 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -611,6 +611,10 @@ def test_streaming_chat_completion_no_prompts( pass # if tiktoken is not installed, we can't guarantee token usage will be calculated properly +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) def test_streaming_chat_completion_with_usage_in_stream( sentry_init, capture_events, @@ -689,6 +693,10 @@ def test_streaming_chat_completion_with_usage_in_stream( assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) @pytest.mark.asyncio async def test_streaming_chat_completion_async_with_usage_in_stream( sentry_init, From d9b2b07d11831fa056cde0fcba954e5bb48d01e5 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Thu, 2 Apr 2026 10:13:59 +0200 Subject: [PATCH 6/9] ref(openai): Rename streaming_message_token_usage to streaming_message_total_token_usage Clarify that this variable holds the total token usage from streaming responses. Also fix the None check to use `is not None` instead of truthiness, preventing false negatives when usage has zero-valued fields. Co-Authored-By: Claude Opus 4.6 (1M context) --- sentry_sdk/integrations/openai.py | 22 +++++++++++----------- tests/integrations/openai/test_openai.py | 13 +++++++------ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 6c1d38e145..ea7668c3a8 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -153,7 +153,7 @@ def _calculate_completions_token_usage( response: "Any", span: "Span", streaming_message_responses: "Optional[List[str]]", - streaming_message_token_usage: "Optional[CompletionUsage]", + streaming_message_total_token_usage: "Optional[CompletionUsage]", count_tokens: "Callable[..., Any]", ) -> None: """Extract and record token usage from a Chat Completions API response.""" @@ -164,8 +164,8 @@ def _calculate_completions_token_usage( total_tokens: "Optional[int]" = 0 usage = None - if streaming_message_token_usage: - usage = streaming_message_token_usage + if streaming_message_total_token_usage is not None: + usage = streaming_message_total_token_usage elif hasattr(response, "usage"): usage = response.usage @@ -579,7 +579,7 @@ def _set_common_output_data( response=response, span=span, streaming_message_responses=None, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=integration.count_tokens, ) @@ -635,7 +635,7 @@ def _set_common_output_data( response=response, span=span, streaming_message_responses=None, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=integration.count_tokens, ) if finish_span: @@ -755,7 +755,7 @@ def _wrap_synchronous_completions_chunk_iterator( """ ttft = None data_buf: "list[list[str]]" = [] # one for each choice - streaming_message_token_usage = None + streaming_message_total_token_usage = None for x in old_iterator: span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model) @@ -773,7 +773,7 @@ def _wrap_synchronous_completions_chunk_iterator( data_buf[choice_index].append(content or "") choice_index += 1 if hasattr(x, "usage"): - streaming_message_token_usage = x.usage + streaming_message_total_token_usage = x.usage yield x @@ -791,7 +791,7 @@ def _wrap_synchronous_completions_chunk_iterator( response=response, span=span, streaming_message_responses=all_responses, - streaming_message_token_usage=streaming_message_token_usage, + streaming_message_total_token_usage=streaming_message_total_token_usage, count_tokens=integration.count_tokens, ) @@ -815,7 +815,7 @@ async def _wrap_asynchronous_completions_chunk_iterator( """ ttft = None data_buf: "list[list[str]]" = [] # one for each choice - streaming_message_token_usage = None + streaming_message_total_token_usage = None async for x in old_iterator: span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, x.model) @@ -833,7 +833,7 @@ async def _wrap_asynchronous_completions_chunk_iterator( data_buf[choice_index].append(content or "") choice_index += 1 if hasattr(x, "usage"): - streaming_message_token_usage = x.usage + streaming_message_total_token_usage = x.usage yield x @@ -851,7 +851,7 @@ async def _wrap_asynchronous_completions_chunk_iterator( response=response, span=span, streaming_message_responses=all_responses, - streaming_message_token_usage=streaming_message_token_usage, + streaming_message_total_token_usage=streaming_message_total_token_usage, count_tokens=integration.count_tokens, ) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 550fb892be..82fbb72ae8 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -1972,7 +1972,7 @@ def count_tokens(msg): response=response, span=span, streaming_message_responses=streaming_message_responses, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( @@ -2010,7 +2010,7 @@ def count_tokens(msg): response=response, span=span, streaming_message_responses=[], - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( @@ -2049,7 +2049,7 @@ def count_tokens(msg): response=response, span=span, streaming_message_responses=streaming_message_responses, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( @@ -2088,7 +2088,7 @@ def count_tokens(msg): response=response, span=span, streaming_message_responses=streaming_message_responses, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( @@ -2140,7 +2140,7 @@ def count_tokens(msg): response=response, span=span, streaming_message_responses=streaming_message_responses, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( @@ -2172,7 +2172,7 @@ def count_tokens(msg): response=response, span=span, streaming_message_responses=streaming_message_responses, - streaming_message_token_usage=None, + streaming_message_total_token_usage=None, count_tokens=count_tokens, ) mock_record_token_usage.assert_called_once_with( @@ -2227,6 +2227,7 @@ def count_tokens(msg): return len(str(msg)) response = mock.MagicMock() + response.usage = None input = [] streaming_message_responses = None From ba471e198a5ceea3c320444e1017e8c006d72945 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Thu, 2 Apr 2026 10:52:10 +0200 Subject: [PATCH 7/9] ref(openai): Extract _has_attr_and_is_int helper for token usage checks Replace inline hasattr+isinstance patterns in both _calculate_completions_token_usage and _calculate_responses_token_usage with a shared helper that uses getattr safely. Co-Authored-By: Claude Opus 4.6 (1M context) --- sentry_sdk/integrations/openai.py | 36 ++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index ea7668c3a8..defdc43a79 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -50,6 +50,7 @@ from sentry_sdk.tracing import Span from sentry_sdk._types import TextPart + from openai.types.responses.response_usage import ResponseUsage from openai.types.responses import ( ResponseInputParam, SequenceNotStr, @@ -148,6 +149,14 @@ def _capture_exception(exc: "Any", manual_span_cleanup: bool = True) -> None: sentry_sdk.capture_event(event, hint=hint) +def _has_attr_and_is_int( + token_usage: Union["CompletionUsage", "ResponseUsage"], attr_name: str +) -> bool: + return hasattr(token_usage, attr_name) and isinstance( + getattr(token_usage, attr_name, None), int + ) + + def _calculate_completions_token_usage( messages: "Optional[Iterable[ChatCompletionMessageParam]]", response: "Any", @@ -170,24 +179,24 @@ def _calculate_completions_token_usage( usage = response.usage if usage is not None: - if hasattr(usage, "prompt_tokens") and isinstance(usage.prompt_tokens, int): + if _has_attr_and_is_int(usage, "prompt_tokens"): input_tokens = usage.prompt_tokens + if _has_attr_and_is_int(usage, "completion_tokens"): + output_tokens = usage.completion_tokens + if _has_attr_and_is_int(usage, "total_tokens"): + total_tokens = usage.total_tokens + if hasattr(usage, "prompt_tokens_details"): cached = getattr(usage.prompt_tokens_details, "cached_tokens", None) if isinstance(cached, int): input_tokens_cached = cached - if hasattr(usage, "completion_tokens") and isinstance( - usage.completion_tokens, int - ): - output_tokens = usage.completion_tokens + if hasattr(usage, "completion_tokens_details"): reasoning = getattr( usage.completion_tokens_details, "reasoning_tokens", None ) if isinstance(reasoning, int): output_tokens_reasoning = reasoning - if hasattr(usage, "total_tokens") and isinstance(usage.total_tokens, int): - total_tokens = usage.total_tokens # Manually count input tokens if input_tokens == 0: @@ -246,20 +255,23 @@ def _calculate_responses_token_usage( if hasattr(response, "usage"): usage = response.usage - if hasattr(usage, "input_tokens") and isinstance(usage.input_tokens, int): + + if _has_attr_and_is_int(usage, "input_tokens"): input_tokens = usage.input_tokens + if _has_attr_and_is_int(usage, "output_tokens"): + output_tokens = usage.output_tokens + if _has_attr_and_is_int(usage, "total_tokens"): + total_tokens = usage.total_tokens + if hasattr(usage, "input_tokens_details"): cached = getattr(usage.input_tokens_details, "cached_tokens", None) if isinstance(cached, int): input_tokens_cached = cached - if hasattr(usage, "output_tokens") and isinstance(usage.output_tokens, int): - output_tokens = usage.output_tokens + if hasattr(usage, "output_tokens_details"): reasoning = getattr(usage.output_tokens_details, "reasoning_tokens", None) if isinstance(reasoning, int): output_tokens_reasoning = reasoning - if hasattr(usage, "total_tokens") and isinstance(usage.total_tokens, int): - total_tokens = usage.total_tokens # Manually count input tokens if input_tokens == 0: From cee1173acb9cfcd70dae48ecd594913b8b7432e9 Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Thu, 2 Apr 2026 10:58:09 +0200 Subject: [PATCH 8/9] Borked a typing import --- sentry_sdk/integrations/openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index defdc43a79..274e056e2e 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -150,7 +150,7 @@ def _capture_exception(exc: "Any", manual_span_cleanup: bool = True) -> None: def _has_attr_and_is_int( - token_usage: Union["CompletionUsage", "ResponseUsage"], attr_name: str + token_usage: "Union[CompletionUsage, ResponseUsage]", attr_name: str ) -> bool: return hasattr(token_usage, attr_name) and isinstance( getattr(token_usage, attr_name, None), int From 6e7fffca31eb3bde9a6cc3286ed1579ef325c0cd Mon Sep 17 00:00:00 2001 From: Erica Pisani Date: Thu, 2 Apr 2026 13:39:07 +0200 Subject: [PATCH 9/9] ref(openai): Fix token usage reporting for empty streams and non-streaming responses Move _calculate_completions_token_usage outside the data_buf check so token usage from stream metadata is recorded even when no content chunks are produced (e.g. content filter). Also count output tokens from response.output when streaming_message_responses is absent in the Responses API path. Co-Authored-By: Claude Opus 4.6 (1M context) --- sentry_sdk/integrations/openai.py | 43 +++-- tests/integrations/openai/test_openai.py | 194 +++++++++++++++++++++++ 2 files changed, 221 insertions(+), 16 deletions(-) diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 274e056e2e..480db9132d 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -293,6 +293,12 @@ def _calculate_responses_token_usage( if streaming_message_responses is not None: for message in streaming_message_responses: output_tokens += count_tokens(message) + elif hasattr(response, "output"): + for output_item in response.output: + if hasattr(output_item, "content"): + for content_item in output_item.content: + if hasattr(content_item, "text"): + output_tokens += count_tokens(content_item.text) # Do not set token data if it is 0 input_tokens = input_tokens or None @@ -794,18 +800,20 @@ def _wrap_synchronous_completions_chunk_iterator( set_data_normalized( span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft ) + all_responses = None if len(data_buf) > 0: all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) - _calculate_completions_token_usage( - messages=messages, - response=response, - span=span, - streaming_message_responses=all_responses, - streaming_message_total_token_usage=streaming_message_total_token_usage, - count_tokens=integration.count_tokens, - ) + + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=all_responses, + streaming_message_total_token_usage=streaming_message_total_token_usage, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) @@ -854,18 +862,20 @@ async def _wrap_asynchronous_completions_chunk_iterator( set_data_normalized( span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft ) + all_responses = None if len(data_buf) > 0: all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) - _calculate_completions_token_usage( - messages=messages, - response=response, - span=span, - streaming_message_responses=all_responses, - streaming_message_total_token_usage=streaming_message_total_token_usage, - count_tokens=integration.count_tokens, - ) + + _calculate_completions_token_usage( + messages=messages, + response=response, + span=span, + streaming_message_responses=all_responses, + streaming_message_total_token_usage=streaming_message_total_token_usage, + count_tokens=integration.count_tokens, + ) if finish_span: span.__exit__(None, None, None) @@ -921,6 +931,7 @@ def _wrap_synchronous_responses_event_iterator( all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: set_data_normalized(span, SPANDATA.GEN_AI_RESPONSE_TEXT, all_responses) + if count_tokens_manually: _calculate_responses_token_usage( input=input, diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 82fbb72ae8..ada2e633de 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -693,6 +693,136 @@ def test_streaming_chat_completion_with_usage_in_stream( assert span["data"]["gen_ai.usage.total_tokens"] == 30 +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) +def test_streaming_chat_completion_empty_content_preserves_token_usage( + sentry_init, + capture_events, + get_model_response, + server_side_event_chunks, +): + """Token usage from the stream is recorded even when no content is produced (e.g. content filter).""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = OpenAI(api_key="z") + returned_stream = get_model_response( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=0, + total_tokens=20, + ), + ), + ], + include_event_type=False, + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["data"] + assert span["data"]["gen_ai.usage.total_tokens"] == 20 + + +@pytest.mark.skipif( + OPENAI_VERSION <= (1, 1, 0), + reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", +) +@pytest.mark.asyncio +async def test_streaming_chat_completion_empty_content_preserves_token_usage_async( + sentry_init, + capture_events, + get_model_response, + async_iterator, + server_side_event_chunks, +): + """Token usage from the stream is recorded even when no content is produced - async variant.""" + sentry_init( + integrations=[OpenAIIntegration(include_prompts=False)], + traces_sample_rate=1.0, + send_default_pii=False, + ) + events = capture_events() + + client = AsyncOpenAI(api_key="z") + returned_stream = get_model_response( + async_iterator( + server_side_event_chunks( + [ + ChatCompletionChunk( + id="1", + choices=[], + created=100000, + model="model-id", + object="chat.completion.chunk", + usage=CompletionUsage( + prompt_tokens=20, + completion_tokens=0, + total_tokens=20, + ), + ), + ], + include_event_type=False, + ) + ) + ) + + with mock.patch.object( + client.chat._client._client, + "send", + return_value=returned_stream, + ): + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", + messages=[{"role": "user", "content": "hello"}], + stream=True, + stream_options={"include_usage": True}, + ) + async for _ in response_stream: + pass + + tx = events[0] + assert tx["type"] == "transaction" + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + assert span["data"]["gen_ai.usage.input_tokens"] == 20 + assert "gen_ai.usage.output_tokens" not in span["data"] + assert span["data"]["gen_ai.usage.total_tokens"] == 20 + + @pytest.mark.skipif( OPENAI_VERSION <= (1, 1, 0), reason="OpenAI versions <=1.1.0 do not support the stream_options parameter.", @@ -2247,6 +2377,70 @@ def count_tokens(msg): ) +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_responses_token_usage_manual_output_counting_response_output(): + """When output_tokens is missing, output tokens are counted from response.output.""" + span = mock.MagicMock() + + def count_tokens(msg): + return len(str(msg)) + + response = mock.MagicMock() + response.usage = mock.MagicMock() + response.usage.input_tokens = 20 + response.usage.total_tokens = 20 + response.output = [ + ResponseOutputMessage( + id="msg-1", + content=[ + ResponseOutputText( + annotations=[], + text="one", + type="output_text", + ), + ], + role="assistant", + status="completed", + type="message", + ), + ResponseOutputMessage( + id="msg-2", + content=[ + ResponseOutputText( + annotations=[], + text="two", + type="output_text", + ), + ResponseOutputText( + annotations=[], + text="three", + type="output_text", + ), + ], + role="assistant", + status="completed", + type="message", + ), + ] + input = [] + streaming_message_responses = None + + with mock.patch( + "sentry_sdk.integrations.openai.record_token_usage" + ) as mock_record_token_usage: + _calculate_responses_token_usage( + input, response, span, streaming_message_responses, count_tokens + ) + mock_record_token_usage.assert_called_once_with( + span, + input_tokens=20, + input_tokens_cached=None, + output_tokens=11, + output_tokens_reasoning=None, + total_tokens=20, + ) + + @pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") def test_ai_client_span_responses_api_no_pii(sentry_init, capture_events): sentry_init(