crmne · tpaulshippy · Jun 14, 2025 · Jun 14, 2025 · Jun 14, 2025 · Jun 14, 2025
diff --git a/lib/ruby_llm/content.rb b/lib/ruby_llm/content.rb
@@ -19,6 +19,11 @@ def add_attachment(source, filename: nil)
       self
     end
 
+    # Appends an already-built attachment object to this content's attachments.
+    #
+    # @param attachment [Object] the attachment to append; it is stored as given
+    # @return [self] the receiver, so calls can be chained
+    def attach(attachment)
+      tap { @attachments << attachment }
+    end
+
     def format
       if @text && @attachments.empty?
         @text

diff --git a/lib/ruby_llm/image_attachment.rb b/lib/ruby_llm/image_attachment.rb
@@ -0,0 +1,19 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  # An Attachment subclass for an image generated by an LLM and delivered as
+  # base64 data rather than read from a file or URL.
+  #
+  # The same payload is kept in two forms: wrapped in an Image (#image) and as
+  # decoded raw bytes (#content).
+  class ImageAttachment < Attachment
+    attr_reader :image, :content
+
+    # @param data [String] base64-encoded image payload
+    # @param mime_type [String] MIME type reported for the payload
+    # @param model_id [String] identifier of the model that produced the image
+    # @raise [ArgumentError] if +data+ is not strictly valid base64
+    def initialize(data:, mime_type:, model_id:)
+      # There is no source or filename for generated data.
+      # NOTE(review): presumably Attachment#initialize tolerates a nil source — confirm.
+      super(nil, filename: nil)
+      @mime_type = mime_type
+      @image = Image.new(data: data, mime_type: mime_type, model_id: model_id)
+      @content = Base64.strict_decode64(data)
+    end
+
+    # Instances of this class always report themselves as images.
+    #
+    # @return [true]
+    def image? = true
+  end
+end
diff --git a/lib/ruby_llm/providers/gemini/capabilities.rb b/lib/ruby_llm/providers/gemini/capabilities.rb
@@ -280,6 +280,9 @@ def modalities_for(model_id)
           # Embedding output
           modalities[:output] << 'embeddings' if model_id.match?(/embedding|gemini-embedding/)
 
+          # Image output
+          modalities[:output] << 'image' if model_id.match?(/image-generation/)
+
           modalities
         end
 

diff --git a/lib/ruby_llm/providers/gemini/chat.rb b/lib/ruby_llm/providers/gemini/chat.rb
@@ -16,7 +16,8 @@ def render_payload(messages, tools:, temperature:, model:, stream: false) # rubo
           payload = {
             contents: format_messages(messages),
             generationConfig: {
-              temperature: temperature
+              temperature: temperature,
+              responseModalities: capabilities.modalities_for(model)[:output]
             }
           }
           payload[:tools] = format_tools(tools) if tools.any?
@@ -79,21 +80,6 @@ def parse_completion_response(response)
           )
         end
 
-        def extract_content(data)
-          candidate = data.dig('candidates', 0)
-          return '' unless candidate
-
-          # Content will be empty for function calls
-          return '' if function_call?(candidate)
-
-          # Extract text content
-          parts = candidate.dig('content', 'parts')
-          text_parts = parts&.select { |p| p['text'] }
-          return '' unless text_parts&.any?
-
-          text_parts.map { |p| p['text'] }.join
-        end
-
         def function_call?(candidate)
           parts = candidate.dig('content', 'parts')
           parts&.any? { |p| p['functionCall'] }

diff --git a/lib/ruby_llm/providers/gemini/streaming.rb b/lib/ruby_llm/providers/gemini/streaming.rb
@@ -34,7 +34,21 @@ def extract_content(data)
           return nil unless parts
 
           text_parts = parts.select { |p| p['text'] }
-          text_parts.map { |p| p['text'] }.join if text_parts.any?
+          image_parts = parts.select { |p| p['inlineData'] }
+
+          content = RubyLLM::Content.new(text_parts.map { |p| p['text'] }.join)
+
+          image_parts.map do |p|
+            content.attach(
+              ImageAttachment.new(
+                data: p['inlineData']['data'],
+                mime_type: p['inlineData']['mimeType'],
+                model_id: data['modelVersion']
+              )
+            )
+          end
+
+          content
         end
 
         def extract_input_tokens(data)

diff --git a/...basic_functionality_gemini_gemini-2_0-flash-preview-image-generation_can_paint_images.yml b/...basic_functionality_gemini_gemini-2_0-flash-preview-image-generation_can_paint_images.yml
diff --git a/..._gemini_gemini-2_0-flash-preview-image-generation_can_refine_images_in_a_conversation.yml b/..._gemini_gemini-2_0-flash-preview-image-generation_can_refine_images_in_a_conversation.yml
diff --git a/spec/ruby_llm/image_to_image_spec.rb b/spec/ruby_llm/image_to_image_spec.rb
@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+require 'tempfile'
+
+# Saves the given image to a throwaway .png path and checks that a plausibly
+# sized file was written there.
+#
+# @param image [#save] object whose `save(path)` is expected to return the
+#   path it wrote to
+def save_and_verify_image(image)
+  # The Tempfile only reserves a unique path; its handle is closed right away
+  # so that image.save can write to that path itself.
+  temp_file = Tempfile.new(['image', '.png'])
+  temp_path = temp_file.path
+  temp_file.close
+
+  begin
+    saved_path = image.save(temp_path)
+    expect(saved_path).to eq(temp_path)
+    expect(File.exist?(temp_path)).to be true
+
+    file_size = File.size(temp_path)
+    expect(file_size).to be > 1000 # Any real image should be larger than 1KB
+  ensure
+    # Tempfile#unlink ignores an already-missing file, so cleanup cannot raise
+    # Errno::ENOENT and mask the expectation failure that brought us here.
+    temp_file.unlink
+  end
+end
+
+# Exercises chat-based image output: the model is given an input image and the
+# reply is expected to carry both text and a generated image attachment.
+#
+# The example descriptions are kept verbatim because recorded cassette file
+# names are derived from them.
+RSpec.describe RubyLLM::Image do
+  include_context 'with configured RubyLLM'
+
+  # One chat per example, so consecutive asks share a single conversation.
+  let(:chat) { RubyLLM.chat(model: 'gemini-2.0-flash-preview-image-generation') }
+
+  # Shared assertions for a reply that should contain text plus an image.
+  #
+  # @param response [#content] the message returned by chat.ask
+  def expect_text_with_generated_image(response)
+    content = response.content
+
+    expect(content.text).to include('ruby')
+    expect(content.attachments).to be_an(Array)
+    expect(content.attachments).not_to be_empty
+
+    image = content.attachments.first.image
+
+    expect(image.base64?).to be(true)
+    expect(image.data).to be_present
+    expect(image.mime_type).to include('image')
+
+    save_and_verify_image image
+  end
+
+  describe 'basic functionality' do
+    it 'gemini/gemini-2.0-flash-preview-image-generation can paint images' do
+      response = chat.ask('put this in a ring', with: 'spec/fixtures/ruby.png')
+
+      expect_text_with_generated_image(response)
+    end
+
+    it 'gemini/gemini-2.0-flash-preview-image-generation can refine images in a conversation' do
+      # The first turn only seeds the conversation; its reply is not inspected,
+      # so it is deliberately not assigned.
+      chat.ask('put this in a ring', with: 'spec/fixtures/ruby.png')
+      response = chat.ask('change the background to blue')
+
+      expect_text_with_generated_image(response)
+    end
+  end
+end