tpaulshippy · tpaulshippy · Aug 29, 2025 · Aug 25, 2025 · Aug 28, 2025 · Aug 28, 2025
diff --git a/lib/ruby_llm/image_attachment.rb b/lib/ruby_llm/image_attachment.rb
@@ -3,13 +3,15 @@
 module RubyLLM
   # A class representing a file attachment that is an image generated by an LLM.
   class ImageAttachment < Attachment
-    attr_reader :image, :content
+    attr_reader :image, :content, :id, :reasoning_id
 
-    def initialize(data:, mime_type:, model_id:)
+    # Builds the attachment from base64 image data: the data is handed to Image.new
+    # and also decoded (strict Base64) into @content.
+    # id and reasoning_id are optional and stored exactly as given (nil by default).
+    def initialize(data:, mime_type:, model_id:, id: nil, reasoning_id: nil)
       super(nil, filename: nil)
       @image = Image.new(data:, mime_type:, model_id:)
       @content = Base64.strict_decode64(data)
       @mime_type = mime_type
+      @id = id
+      @reasoning_id = reasoning_id
     end
 
     def image?

diff --git a/lib/ruby_llm/message.rb b/lib/ruby_llm/message.rb
@@ -6,7 +6,7 @@ class Message
     ROLES = %i[system user assistant tool].freeze
 
     attr_reader :role, :tool_calls, :tool_call_id, :input_tokens, :output_tokens, :model_id, :raw,
-                :cached_tokens, :cache_creation_tokens
+                :cached_tokens, :cache_creation_tokens, :reasoning_id
     attr_writer :content
 
     def initialize(options = {})
@@ -19,6 +19,7 @@ def initialize(options = {})
       @tool_call_id = options[:tool_call_id]
       @cached_tokens = options[:cached_tokens]
       @cache_creation_tokens = options[:cache_creation_tokens]
+      @reasoning_id = options[:reasoning_id]
       @raw = options[:raw]
 
       ensure_valid_role
@@ -54,7 +55,8 @@ def to_h
         output_tokens: output_tokens,
         model_id: model_id,
         cache_creation_tokens: cache_creation_tokens,
-        cached_tokens: cached_tokens
+        cached_tokens: cached_tokens,
+        reasoning_id: reasoning_id
       }.compact
     end
 

diff --git a/lib/ruby_llm/providers/openai/response.rb b/lib/ruby_llm/providers/openai/response.rb
@@ -40,39 +40,72 @@ def render_response_payload(messages, tools:, temperature:, model:, cache_prompt
           payload
         end
 
-        def format_input(messages) # rubocop:disable Metrics/PerceivedComplexity
+        # Converts the conversation into one flat array of input item hashes.
+        #
+        # Tool calls from every message are collected first so that tool-result
+        # messages can be matched to the call they answer.
+        #
+        # @param messages [Array<Message>] conversation messages, in order
+        # @return [Array<Hash>] input items, in message order
+        def format_input(messages)
           all_tool_calls = messages.flat_map do |m|
             m.tool_calls&.values || []
           end
-          messages.flat_map do |msg|
-            if msg.tool_call?
-              msg.tool_calls.map do |_, tc|
-                {
-                  type: 'function_call',
-                  call_id: tc.id,
-                  name: tc.name,
-                  arguments: JSON.generate(tc.arguments),
-                  status: 'completed'
-                }
-              end
-            elsif msg.role == :tool
-              {
-                type: 'function_call_output',
-                call_id: all_tool_calls.detect { |tc| tc.id == msg.tool_call_id }&.id,
-                output: msg.content,
-                status: 'completed'
-              }
-            else
-              {
-                type: 'message',
-                role: format_role(msg.role),
-                content: ResponseMedia.format_content(msg.content),
-                status: 'completed'
-              }.compact
-            end
+          # flat_map already splices the array results (tool calls, image items) and
+          # keeps single hashes as-is, so no further flatten pass is needed.
+          messages.flat_map { |msg| format_message_input(msg, all_tool_calls) }
+        end
+
+        # Chooses the formatter for one message. Branch order matters: a message that
+        # carries tool calls is handled before its role or content is looked at.
+        # Returns an Array of hashes for tool-call and image-generation messages and a
+        # single Hash for tool results and regular messages.
+        def format_message_input(msg, all_tool_calls)
+          if msg.tool_call?
+            format_tool_call_message(msg)
+          elsif msg.role == :tool
+            format_tool_response_message(msg, all_tool_calls)
+          elsif assistant_message_with_image_attachment?(msg)
+            format_image_generation_message(msg)
+          else
+            format_regular_message(msg)
           end
         end
 
+        # Emits one 'function_call' item per tool call stored on the message.
+        # Arguments are serialized to a JSON string.
+        def format_tool_call_message(msg)
+          msg.tool_calls.values.map do |call|
+            serialized_arguments = JSON.generate(call.arguments)
+
+            { type: 'function_call', call_id: call.id, name: call.name,
+              arguments: serialized_arguments, status: 'completed' }
+          end
+        end
+
+        # Builds the 'function_call_output' item for a tool-result message.
+        # call_id is nil when no collected tool call has the message's tool_call_id.
+        def format_tool_response_message(msg, all_tool_calls)
+          matching_call = all_tool_calls.find { |candidate| candidate.id == msg.tool_call_id }
+
+          {
+            type: 'function_call_output',
+            call_id: matching_call&.id,
+            output: msg.content,
+            status: 'completed'
+          }
+        end
+
+        # Re-creates the input items for an assistant turn that produced images.
+        #
+        # Emits one 'reasoning' item per distinct reasoning_id found on the image
+        # attachments, followed by one 'image_generation_call' item per image
+        # attachment, so a turn with several generated images is replayed in full
+        # rather than only its first image.
+        #
+        # @param msg [Message] assistant message whose content holds ImageAttachment objects
+        # @return [Array<Hash>] reasoning items first, then the image generation calls
+        def format_image_generation_message(msg)
+          # Only ImageAttachment is known to expose id and reasoning_id, so other
+          # attachment kinds are left out instead of being sent those messages.
+          image_attachments = msg.content.attachments.grep(RubyLLM::ImageAttachment)
+
+          reasoning_items = image_attachments.filter_map(&:reasoning_id).uniq.map do |reasoning_id|
+            { type: 'reasoning', id: reasoning_id, summary: [] }
+          end
+          image_items = image_attachments.map do |attachment|
+            { type: 'image_generation_call', id: attachment.id }
+          end
+
+          reasoning_items + image_items
+        end
+
+        # Builds a plain 'message' item; entries whose value is nil are dropped.
+        def format_regular_message(msg)
+          role = format_role(msg.role)
+          content = ResponseMedia.format_content(msg.content)
+
+          { type: 'message', role: role, content: content, status: 'completed' }.compact
+        end
+
         def format_role(role)
           case role
           when :system
@@ -93,23 +126,81 @@ def parse_respond_response(response)
 
           Message.new(
             role: :assistant,
-            content: all_output_text(outputs),
+            content: all_output_content(outputs),
             tool_calls: parse_response_tool_calls(outputs),
             input_tokens: data['usage']['input_tokens'],
             output_tokens: data['usage']['output_tokens'],
             cached_tokens: data.dig('usage', 'input_tokens_details', 'cached_tokens'),
             model_id: data['model'],
+            reasoning_id: extract_reasoning_id(outputs),
             raw: response
           )
         end
 
+        # Builds the assistant message content from a response's output items.
+        #
+        # @param outputs [Array<Hash>] output items of the parsed response
+        # @return [String, RubyLLM::Content] the joined text when no image was produced,
+        #   otherwise a Content holding that text plus one ImageAttachment per image
+        def all_output_content(outputs)
+          text_content = extract_text_content(outputs)
+          # An image call without a 'result' has no base64 data to decode, so it is skipped.
+          image_outputs = outputs.select { |o| o['type'] == 'image_generation_call' && o['result'] }
+
+          return text_content if image_outputs.empty?
+
+          # The reasoning id is passed along explicitly rather than stashed in an
+          # instance variable, so nothing from one response lingers for the next call.
+          build_content_with_images(text_content, image_outputs, extract_reasoning_id(outputs))
+        end
+
+        private
+
+        # Joins the text of every 'output_text' part of every 'message' item with newlines.
+        def extract_text_content(outputs)
+          outputs.select { |o| o['type'] == 'message' }.flat_map do |o|
+            # Array() tolerates a message item whose 'content' is missing or nil.
+            Array(o['content']).filter_map do |c|
+              c['type'] == 'output_text' && c['text']
+            end
+          end.join("\n")
+        end
+
+        # Wraps the text in a Content and attaches every image output to it.
+        # The same reasoning_id (nil when absent) is recorded on each attachment.
+        def build_content_with_images(text_content, image_outputs, reasoning_id = nil)
+          content = RubyLLM::Content.new(text_content)
+          image_outputs.each do |output|
+            attach_image_to_content(content, output, reasoning_id)
+          end
+          content
+        end
+
+        # Turns one 'image_generation_call' output into an ImageAttachment on content.
+        # The MIME type is derived from the output's 'output_format', defaulting to png.
+        def attach_image_to_content(content, output, reasoning_id)
+          image_format = output['output_format'] || 'png'
+          attachment = RubyLLM::ImageAttachment.new(
+            data: output['result'],
+            mime_type: "image/#{image_format}",
+            model_id: nil,
+            id: output['id'],
+            reasoning_id: reasoning_id
+          )
+
+          content.attach(attachment)
+        end
+
         def all_output_text(outputs)
           outputs.select { |o| o['type'] == 'message' }.flat_map do |o|
             o['content'].filter_map do |c|
               c['type'] == 'output_text' && c['text']
             end
           end.join("\n")
         end
+
+        # True when an assistant message's Content has an ImageAttachment as its
+        # first attachment.
+        def assistant_message_with_image_attachment?(msg)
+          return false unless msg.role == :assistant
+
+          content = msg.content
+          return false unless content.is_a?(RubyLLM::Content)
+
+          # first is nil for an empty list, and nil is never an ImageAttachment.
+          content.attachments.first.is_a?(RubyLLM::ImageAttachment)
+        end
+
+        # Returns the 'id' of the first output item of type 'reasoning', or nil
+        # when the outputs contain no such item.
+        def extract_reasoning_id(outputs)
+          outputs.each do |output|
+            return output['id'] if output['type'] == 'reasoning'
+          end
+          nil
+        end
       end
     end
   end

diff --git a/lib/ruby_llm/providers/openai/response_media.rb b/lib/ruby_llm/providers/openai/response_media.rb
@@ -7,12 +7,12 @@ class OpenAI
       module ResponseMedia
         module_function
 
-        def format_content(content)
+        def format_content(content) # rubocop:disable Metrics/PerceivedComplexity
           return content.to_json if content.is_a?(Hash) || content.is_a?(Array)
           return content unless content.is_a?(Content)
 
           parts = []
-          parts << format_text(content.text) if content.text
+          parts << format_text(content.text) if content.text && !content.text.empty?
 
           content.attachments.each do |attachment|
             case attachment.type

diff --git a/lib/ruby_llm/providers/openai/streaming.rb b/lib/ruby_llm/providers/openai/streaming.rb
@@ -27,42 +27,64 @@ def build_chunk(data)
+        # Dispatches one streamed event, keyed by data['type'], to the builder for
+        # that event type. Event types without a dedicated branch produce the
+        # empty chunk.
         def build_responses_chunk(data)
           case data['type']
           when 'response.output_text.delta'
-            Chunk.new(
-              role: :assistant,
-              model_id: nil,
-              content: data['delta'],
-              tool_calls: nil,
-              input_tokens: nil,
-              output_tokens: nil
-            )
+            build_text_delta_chunk(data)
           when 'response.function_call_arguments.delta'
             build_tool_call_delta_chunk(data)
+          when 'response.image_generation_call.partial_image'
+            build_partial_image_chunk(data)
           when 'response.output_item.added'
-            if data.dig('item', 'type') == 'function_call'
-              build_tool_call_start_chunk(data)
-            else
-              build_empty_chunk(data)
-            end
+            handle_output_item_added(data)
           when 'response.output_item.done'
-            if data.dig('item', 'type') == 'function_call'
-              build_tool_call_complete_chunk(data)
-            else
-              build_empty_chunk(data)
-            end
+            handle_output_item_done(data)
           when 'response.completed'
-            Chunk.new(
-              role: :assistant,
-              model_id: data.dig('response', 'model'),
-              content: nil,
-              tool_calls: nil,
-              input_tokens: data.dig('response', 'usage', 'input_tokens'),
-              output_tokens: data.dig('response', 'usage', 'output_tokens')
-            )
+            build_completion_chunk(data)
+          else
+            build_empty_chunk(data)
+          end
+        end
+
+        # Chunk carrying one text delta; model_id and token counts are left nil.
+        def build_text_delta_chunk(data)
+          text_delta = data['delta']
+
+          Chunk.new(role: :assistant, model_id: nil, content: text_delta,
+                    tool_calls: nil, input_tokens: nil, output_tokens: nil)
+        end
+
+        # Handles an 'output_item.added' event according to the new item's type:
+        # function calls start a tool call, reasoning items yield a reasoning chunk.
+        def handle_output_item_added(data)
+          case data.dig('item', 'type')
+          when 'function_call'
+            build_tool_call_start_chunk(data)
+          when 'reasoning'
+            build_reasoning_chunk(data)
           else
             build_empty_chunk(data)
           end
         end
 
+        # Handles an 'output_item.done' event according to the finished item's type:
+        # function calls complete a tool call, image generation calls yield an image chunk.
+        def handle_output_item_done(data)
+          case data.dig('item', 'type')
+          when 'function_call'
+            build_tool_call_complete_chunk(data)
+          when 'image_generation_call'
+            build_completed_image_chunk(data)
+          else
+            build_empty_chunk(data)
+          end
+        end
+
+        # Final chunk of a stream: no content, only the model id and token usage
+        # read from the event's 'response' object.
+        def build_completion_chunk(data)
+          model = data.dig('response', 'model')
+          tokens_in = data.dig('response', 'usage', 'input_tokens')
+          tokens_out = data.dig('response', 'usage', 'output_tokens')
+
+          Chunk.new(role: :assistant, model_id: model, content: nil,
+                    tool_calls: nil, input_tokens: tokens_in, output_tokens: tokens_out)
+        end
+
         def build_chat_completions_chunk(data)
           Chunk.new(
             role: :assistant,
@@ -145,6 +167,63 @@ def build_empty_chunk(_data)
           )
         end
 
+        # Chunk for a partial image event: the base64 payload in 'partial_image_b64'
+        # is wrapped as an image/png attachment with empty text.
+        def build_partial_image_chunk(data)
+          partial_image = build_image_content(data['partial_image_b64'], 'image/png', nil, nil)
+
+          Chunk.new(role: :assistant, model_id: nil, content: partial_image,
+                    tool_calls: nil, input_tokens: nil, output_tokens: nil)
+        end
+
+        # Builds the chunk for a finished 'image_generation_call' output item.
+        #
+        # The attachment keeps the item's id, as the non-streaming parser does, so a
+        # later turn can reference the generated image instead of sending an
+        # 'image_generation_call' item whose id is nil.
+        #
+        # @param data [Hash] the 'response.output_item.done' event
+        # @return [Chunk] the image chunk, or the empty chunk when the item has no 'result'
+        def build_completed_image_chunk(data)
+          item = data['item']
+          image_data = item['result']
+          # ImageAttachment decodes its data with strict Base64, which cannot take nil.
+          return build_empty_chunk(data) unless image_data
+
+          # The revised prompt, when present, becomes the text part of the content.
+          content = RubyLLM::Content.new(item['revised_prompt'] || '')
+          content.attach(
+            RubyLLM::ImageAttachment.new(
+              data: image_data,
+              mime_type: "image/#{item['output_format'] || 'png'}",
+              model_id: nil,
+              id: item['id']
+            )
+          )
+
+          Chunk.new(
+            role: :assistant,
+            model_id: nil,
+            content: content,
+            tool_calls: nil,
+            input_tokens: nil,
+            output_tokens: nil
+          )
+        end
+
+        # Content-less chunk whose only payload is the id of the event's item,
+        # passed on as reasoning_id.
+        def build_reasoning_chunk(data)
+          reasoning_item_id = data.dig('item', 'id')
+
+          Chunk.new(role: :assistant, model_id: nil, content: nil, tool_calls: nil,
+                    input_tokens: nil, output_tokens: nil, reasoning_id: reasoning_item_id)
+        end
+
+        # Wraps base64 image data in a Content with a single ImageAttachment.
+        # The revised prompt, or an empty string when none is given, becomes the text.
+        def build_image_content(base64_data, mime_type, model_id, revised_prompt = nil)
+          image = RubyLLM::ImageAttachment.new(data: base64_data, mime_type: mime_type, model_id: model_id)
+
+          RubyLLM::Content.new(revised_prompt || '').tap { |content| content.attach(image) }
+        end
+
         def create_streaming_tool_call(tool_call_data)
           ToolCall.new(
             id: tool_call_data['id'],