Skip to content

Image to image with gemini-2.0-flash-preview-image-generation #248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lib/ruby_llm/content.rb
Original file line number Diff line number Diff line change
@@ -19,6 +19,11 @@ def add_attachment(source, filename: nil)
self
end

# Appends an already-built attachment object to this content's attachment list.
#
# @param attachment [Object] the attachment to add; stored exactly as given
# @return [self] the receiver, so calls can be chained
def attach(attachment)
  tap { @attachments << attachment }
end

def format
if @text && @attachments.empty?
@text
19 changes: 19 additions & 0 deletions lib/ruby_llm/image_attachment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

module RubyLLM
  # An Attachment built from a Base64-encoded image payload. It exposes both
  # an Image wrapper around the payload and the decoded bytes.
  class ImageAttachment < Attachment
    attr_reader :image, :content

    # @param data [String] image payload in strict Base64 form
    # @param mime_type [String] MIME type stored on the attachment and passed to Image
    # @param model_id [String] forwarded unchanged to Image.new
    # @raise [ArgumentError] if data is not valid strict Base64
    def initialize(data:, mime_type:, model_id:)
      # There is no source or filename for this attachment, so the parent gets nils.
      super(nil, filename: nil)
      @image = Image.new(data: data, mime_type: mime_type, model_id: model_id)
      @content = Base64.strict_decode64(data)
      @mime_type = mime_type
    end

    # @return [true] always
    def image? = true
  end
end
3 changes: 3 additions & 0 deletions lib/ruby_llm/providers/gemini/capabilities.rb
Original file line number Diff line number Diff line change
@@ -280,6 +280,9 @@ def modalities_for(model_id)
# Embedding output
modalities[:output] << 'embeddings' if model_id.match?(/embedding|gemini-embedding/)

# Image output
modalities[:output] << 'image' if model_id.match?(/image-generation/)

modalities
end

18 changes: 2 additions & 16 deletions lib/ruby_llm/providers/gemini/chat.rb
Original file line number Diff line number Diff line change
@@ -16,7 +16,8 @@ def render_payload(messages, tools:, temperature:, model:, stream: false) # rubo
payload = {
contents: format_messages(messages),
generationConfig: {
temperature: temperature
temperature: temperature,
responseModalities: capabilities.modalities_for(model)[:output]
}
}
payload[:tools] = format_tools(tools) if tools.any?
@@ -79,21 +80,6 @@ def parse_completion_response(response)
)
end

# Returns the text of the first candidate in a parsed response hash.
#
# @param data [Hash] parsed response, read via 'candidates' -> 0 -> 'content' -> 'parts'
# @return [String] the joined 'text' values of the parts, or '' when there is
#   no candidate, the candidate is a function call, or no part carries text
def extract_content(data)
  candidate = data.dig('candidates', 0)
  # Function-call candidates have no text content to return
  return '' if !candidate || function_call?(candidate)

  texts = candidate.dig('content', 'parts')&.filter_map { |part| part['text'] }
  return '' if texts.nil? || texts.empty?

  texts.join
end

def function_call?(candidate)
parts = candidate.dig('content', 'parts')
parts&.any? { |p| p['functionCall'] }
16 changes: 15 additions & 1 deletion lib/ruby_llm/providers/gemini/streaming.rb
Original file line number Diff line number Diff line change
@@ -34,7 +34,21 @@ def extract_content(data)
return nil unless parts

text_parts = parts.select { |p| p['text'] }
text_parts.map { |p| p['text'] }.join if text_parts.any?
image_parts = parts.select { |p| p['inlineData'] }

content = RubyLLM::Content.new(text_parts.map { |p| p['text'] }.join)

image_parts.map do |p|
content.attach(
ImageAttachment.new(
data: p['inlineData']['data'],
mime_type: p['inlineData']['mimeType'],
model_id: data['modelVersion']
)
)
end

content
end

def extract_input_tokens(data)

Large diffs are not rendered by default.

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions spec/ruby_llm/image_to_image_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# frozen_string_literal: true

require 'spec_helper'
require 'tempfile'

# Saves +image+ to a temporary .png path and checks the result.
#
# Expects image.save(path) to return the path it was given, the file to exist
# afterwards, and the file to be larger than 1000 bytes. The temporary file is
# removed whether or not the expectations pass.
#
# @param image [#save] object whose #save takes a destination path
def save_and_verify_image(image)
  # Reserve a unique path, then close the handle so image.save writes to the path itself
  temp_file = Tempfile.new(['image', '.png'])
  temp_path = temp_file.path
  temp_file.close

  begin
    saved_path = image.save(temp_path)
    expect(saved_path).to eq(temp_path)
    expect(File.exist?(temp_path)).to be true

    file_size = File.size(temp_path)
    expect(file_size).to be > 1000 # Any real image should be larger than 1KB
  ensure
    # Tempfile#unlink ignores an already-missing file, so cleanup cannot
    # replace a failed expectation with Errno::ENOENT
    temp_file.unlink
  end
end

# Examples that send an image to a Gemini image-generation chat model and
# check that the response carries both text and an image attachment.
RSpec.describe RubyLLM::Image do
  include_context 'with configured RubyLLM'

  describe 'basic functionality' do
    it 'gemini/gemini-2.0-flash-preview-image-generation can paint images' do # rubocop:disable RSpec/MultipleExpectations, RSpec/ExampleLength
      chat = RubyLLM.chat(model: 'gemini-2.0-flash-preview-image-generation')
      response = chat.ask('put this in a ring', with: 'spec/fixtures/ruby.png')

      expect(response.content.text).to include('ruby')

      expect(response.content.attachments).to be_an(Array)
      expect(response.content.attachments).not_to be_empty

      image = response.content.attachments.first.image

      expect(image.base64?).to be(true)
      expect(image.data).to be_present
      expect(image.mime_type).to include('image')

      save_and_verify_image image
    end

    it 'gemini/gemini-2.0-flash-preview-image-generation can refine images in a conversation' do # rubocop:disable RSpec/MultipleExpectations, RSpec/ExampleLength
      chat = RubyLLM.chat(model: 'gemini-2.0-flash-preview-image-generation')
      # The first turn only seeds the conversation; only the follow-up response is checked
      chat.ask('put this in a ring', with: 'spec/fixtures/ruby.png')
      response = chat.ask('change the background to blue')

      expect(response.content.text).to include('ruby')

      expect(response.content.attachments).to be_an(Array)
      expect(response.content.attachments).not_to be_empty

      image = response.content.attachments.first.image

      expect(image.base64?).to be(true)
      expect(image.data).to be_present
      expect(image.mime_type).to include('image')

      save_and_verify_image image
    end
  end
end