BerriAI · krrishdholakia · Oct 28, 2025 · Oct 25, 2025 · Oct 25, 2025
diff --git a/litellm/integrations/langfuse/langfuse_otel.py b/litellm/integrations/langfuse/langfuse_otel.py
@@ -48,11 +48,12 @@ def set_langfuse_otel_attributes(span: Span, kwargs, response_obj):
         _utils.set_attributes(span, kwargs, response_obj)
 
         #########################################################
-        # Set Langfuse specific attributes eg Langfuse Environment
+        # Set Langfuse specific attributes
         #########################################################
         LangfuseOtelLogger._set_langfuse_specific_attributes(
             span=span,
-            kwargs=kwargs
+            kwargs=kwargs,
+            response_obj=response_obj
         )
         return
 
@@ -86,7 +87,7 @@ def _extract_langfuse_metadata(kwargs: dict) -> dict:
         return metadata
 
     @staticmethod
-    def _set_langfuse_specific_attributes(span: Span, kwargs):
+    def _set_langfuse_specific_attributes(span: Span, kwargs, response_obj):
         """
         Sets Langfuse specific metadata attributes onto the OTEL span.
 
@@ -96,6 +97,7 @@ def _set_langfuse_specific_attributes(span: Span, kwargs):
         compatibility.
         """
         from litellm.integrations.arize._utils import safe_set_attribute
+        from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
 
         # 1) Environment variable override
         langfuse_environment = os.environ.get("LANGFUSE_TRACING_ENVIRONMENT")
@@ -141,6 +143,75 @@ def _set_langfuse_specific_attributes(span: Span, kwargs):
                         value = str(value)
                 safe_set_attribute(span, enum_attr.value, value)
 
+        # 3) Set observation input/output for better UI display
+        #
+        # These Langfuse-specific attributes carry the request and response as
+        # JSON strings, which matters most for tool calls and function calling.
+        # Observation input: the request messages, skipped when absent or empty.
+        messages = kwargs.get("messages")
+        if messages:
+            safe_set_attribute(
+                span,
+                LangfuseSpanAttributes.OBSERVATION_INPUT.value,
+                safe_dumps(messages),
+            )
+
+        # Set observation output from the first choice: tool_calls if present, else role/content
+        if response_obj and hasattr(response_obj, "get"):
+            choices = response_obj.get("choices", [])
+            if choices:
+                # Extract the first choice's message
+                first_choice = choices[0]
+                message = first_choice.get("message", {})
+
+                # Check if there are tool_calls
+                tool_calls = message.get("tool_calls")
+                if tool_calls:
+                    # Transform tool_calls to Langfuse-expected format
+                    transformed_tool_calls = []
+                    for tool_call in tool_calls:
+                        function = tool_call.get("function", {})
+                        arguments_str = function.get("arguments", "{}")
+
+                        # Parse arguments from JSON string to object; malformed JSON falls back to {}
+                        try:
+                            arguments_obj = json.loads(arguments_str) if isinstance(arguments_str, str) else arguments_str
+                        except json.JSONDecodeError:
+                            arguments_obj = {}
+
+                        # Create Langfuse-compatible tool call object
+                        langfuse_tool_call = {
+                            "id": response_obj.get("id", ""),
+                            "name": function.get("name", ""),
+                            "call_id": tool_call.get("id", ""),
+                            "type": "function_call",
+                            "arguments": arguments_obj,
+                        }
+                        transformed_tool_calls.append(langfuse_tool_call)
+
+                    # Set the observation output with transformed tool_calls
+                    safe_set_attribute(
+                        span,
+                        LangfuseSpanAttributes.OBSERVATION_OUTPUT.value,
+                        safe_dumps(transformed_tool_calls),
+                    )
+                else:
+                    # No tool_calls, use regular content-based output
+                    output_data = {}
+
+                    if message.get("role"):
+                        output_data["role"] = message.get("role")
+
+                    if message.get("content") is not None:
+                        output_data["content"] = message.get("content")
+
+                    if output_data:
+                        safe_set_attribute(
+                            span,
+                            LangfuseSpanAttributes.OBSERVATION_OUTPUT.value,
+                            safe_dumps(output_data),
+                        )
+
     @staticmethod
     def _get_langfuse_otel_host() -> Optional[str]:
         """

diff --git a/litellm/types/integrations/langfuse_otel.py b/litellm/types/integrations/langfuse_otel.py
@@ -24,6 +24,10 @@ class LangfuseSpanAttributes(str, Enum):
     MASK_INPUT = "langfuse.generation.mask_input"
     MASK_OUTPUT = "langfuse.generation.mask_output"
 
+    # ---- Observation input/output ----
+    OBSERVATION_INPUT = "langfuse.observation.input"
+    OBSERVATION_OUTPUT = "langfuse.observation.output"
+
     # ---- Trace-level metadata ----
     TRACE_USER_ID = "user.id"
     SESSION_ID = "session.id"

diff --git a/tests/test_litellm/integrations/test_langfuse_otel.py b/tests/test_litellm/integrations/test_langfuse_otel.py
@@ -92,7 +92,7 @@ def test_set_langfuse_environment_attribute(self):
 
         with patch.dict(os.environ, {'LANGFUSE_TRACING_ENVIRONMENT': test_env}):
             with patch('litellm.integrations.arize._utils.safe_set_attribute') as mock_safe_set_attribute:
-                LangfuseOtelLogger._set_langfuse_specific_attributes(mock_span, mock_kwargs)
+                LangfuseOtelLogger._set_langfuse_specific_attributes(mock_span, mock_kwargs, {})
 
                 # safe_set_attribute(span, key, value) → positional args
                 mock_safe_set_attribute.assert_called_once_with(
@@ -130,7 +130,7 @@ def add_metadata_from_header(litellm_params, metadata):
         assert extracted.get("foo") == "bar"
         assert extracted.get("enriched") is True
 
-    def test_set_langfuse_specific_attributes_full_mapping(self):
+    def test_set_langfuse_specific_attributes_metadata(self):
         """Verify every supported metadata key maps to the correct OTEL attribute and complex types are JSON-serialised."""
         # Build a sample metadata payload covering all mappings
         metadata = {
@@ -156,7 +156,7 @@ def test_set_langfuse_specific_attributes_full_mapping(self):
 
         # Capture calls to safe_set_attribute
         with patch('litellm.integrations.arize._utils.safe_set_attribute') as mock_safe_set_attribute:
-            LangfuseOtelLogger._set_langfuse_specific_attributes(MagicMock(), kwargs)
+            LangfuseOtelLogger._set_langfuse_specific_attributes(MagicMock(), kwargs, None)
 
             # Build expected calls manually for clarity
             from litellm.types.integrations.langfuse_otel import LangfuseSpanAttributes
@@ -189,6 +189,109 @@ def test_set_langfuse_specific_attributes_full_mapping(self):
 
             assert actual == expected, "Mismatch between expected and actual OTEL attribute mapping."
 
+    def test_set_langfuse_specific_attributes_with_content(self):
+        """Test that _set_langfuse_specific_attributes correctly sets observation.output with regular content response."""
+        from litellm.types.utils import Choices, ModelResponse
+        from litellm.types.integrations.langfuse_otel import LangfuseSpanAttributes
+
+        # Create response with content
+        response_obj = ModelResponse(
+            id='chatcmpl-test',
+            model='gpt-4o',
+            choices=[
+                Choices(
+                    finish_reason='stop',
+                    message={
+                        "role": "assistant",
+                        "content": "The weather in Tokyo is sunny."
+                    }
+                )
+            ],
+        )
+
+        kwargs = {
+            "messages": [{"role": "user", "content": "What's the weather in Tokyo?"}],
+        }
+
+        with patch('litellm.integrations.arize._utils.safe_set_attribute') as mock_safe_set_attribute:
+            LangfuseOtelLogger._set_langfuse_specific_attributes(MagicMock(), kwargs, response_obj)
+
+            expect_output = {
+                LangfuseSpanAttributes.OBSERVATION_INPUT.value: [
+                    {
+                        "role": "user",
+                        "content": "What's the weather in Tokyo?"
+                    }
+                ],
+                LangfuseSpanAttributes.OBSERVATION_OUTPUT.value: {
+                    "role": "assistant",
+                    "content": "The weather in Tokyo is sunny."
+                }
+            }
+
+            # Flatten the actual calls into {key: value}
+            actual = {
+                call.args[1]: json.loads(call.args[2])
+                for call in mock_safe_set_attribute.call_args_list
+            }
+
+            assert actual == expect_output, "Mismatch in observation input/output OTEL attributes."
+
+
+    def test_set_langfuse_specific_attributes_with_tool_calls(self):
+        """Test that _set_langfuse_specific_attributes correctly sets observation.output with tool calls in Langfuse format."""
+        from litellm.types.utils import Choices, Function, ChatCompletionMessageToolCall, ModelResponse
+        from litellm.types.integrations.langfuse_otel import LangfuseSpanAttributes
+
+        # Create response with tool calls
+        response_obj = ModelResponse(
+            id='chatcmpl-test',
+            model='gpt-4o',
+            choices=[
+                Choices(
+                    finish_reason='tool_calls',
+                    message={
+                        "role": "assistant",
+                        "content": None,
+                        "tool_calls": [
+                            ChatCompletionMessageToolCall(
+                                function=Function(
+                                    arguments='{"location":"Tokyo"}',
+                                    name='get_weather'
+                                ),
+                                id='call_123',
+                                type='function'
+                            )
+                        ]
+                    }
+                )
+            ],
+        )
+
+        with patch('litellm.integrations.arize._utils.safe_set_attribute') as mock_safe_set_attribute:
+            LangfuseOtelLogger._set_langfuse_specific_attributes(MagicMock(), {}, 
+            response_obj)
+
+            expected = {
+                LangfuseSpanAttributes.OBSERVATION_OUTPUT.value: [
+                        {
+                            "id": "chatcmpl-test",
+                            "name": "get_weather",
+                            "arguments": {"location": "Tokyo"},
+                            "call_id": "call_123",
+                            "type": "function_call"
+                        }
+                ]
+            }
+
+            # Flatten the actual calls into {key: value}
+            actual = {
+                call.args[1]: json.loads(call.args[2])
+                for call in mock_safe_set_attribute.call_args_list
+            }
+            assert actual == expected, "Mismatch in observation output OTEL attribute for tool calls."
+
+
     def test_construct_dynamic_otel_headers_with_langfuse_keys(self):
         """Test that construct_dynamic_otel_headers creates proper auth headers when langfuse keys are provided."""
         from litellm.types.utils import StandardCallbackDynamicParams
@@ -352,7 +455,7 @@ def test_responses_api_langfuse_specific_attributes(self):
         mock_span = MagicMock()
 
         with patch('litellm.integrations.arize._utils.safe_set_attribute') as mock_safe_set_attribute:
-            LangfuseOtelLogger._set_langfuse_specific_attributes(mock_span, kwargs)
+            LangfuseOtelLogger._set_langfuse_specific_attributes(mock_span, kwargs, {})
 
             # Verify specific attributes were set
             from litellm.types.integrations.langfuse_otel import LangfuseSpanAttributes
@@ -371,8 +474,6 @@ def test_responses_api_langfuse_specific_attributes(self):
             for expected_call in expected_calls:
                 mock_safe_set_attribute.assert_any_call(*expected_call)
 
-
-
 
 if __name__ == "__main__":
     pytest.main([__file__])