Skip to content

Commit 4feaa21

Browse files
authored
Merge pull request #6 from tpaulshippy/image-tool
Support OpenAI's image_generation tool for multi-turn image editing
2 parents c5b13ce + e42417e commit 4feaa21

15 files changed

+1295
-125
lines changed

lib/ruby_llm/image_attachment.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
module RubyLLM
44
# A class representing a file attachment that is an image generated by an LLM.
55
class ImageAttachment < Attachment
6-
attr_reader :image, :content
6+
attr_reader :image, :content, :id, :reasoning_id
77

8-
# Builds an attachment wrapping an image that was generated by an LLM.
#
# @param data [String] Base64-encoded image payload; it is strictly decoded
#   into +content+ and also handed to Image.new unchanged
# @param mime_type [String] MIME type stored on the attachment and the Image
# @param model_id [String, nil] forwarded to Image.new
# @param id [String, nil] optional identifier kept on the attachment
# @param reasoning_id [String, nil] optional reasoning identifier kept on the attachment
def initialize(data:, mime_type:, model_id:, id: nil, reasoning_id: nil)
  # There is no source path or filename: the bytes come straight from the response.
  super(nil, filename: nil)

  @image = Image.new(data: data, mime_type: mime_type, model_id: model_id)
  @content = Base64.strict_decode64(data)
  @mime_type = mime_type
  @id = id
  @reasoning_id = reasoning_id
end
1416

1517
def image?

lib/ruby_llm/message.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ class Message
66
ROLES = %i[system user assistant tool].freeze
77

88
attr_reader :role, :tool_calls, :tool_call_id, :input_tokens, :output_tokens, :model_id, :raw,
9-
:cached_tokens, :cache_creation_tokens
9+
:cached_tokens, :cache_creation_tokens, :reasoning_id
1010
attr_writer :content
1111

1212
def initialize(options = {})
@@ -19,6 +19,7 @@ def initialize(options = {})
1919
@tool_call_id = options[:tool_call_id]
2020
@cached_tokens = options[:cached_tokens]
2121
@cache_creation_tokens = options[:cache_creation_tokens]
22+
@reasoning_id = options[:reasoning_id]
2223
@raw = options[:raw]
2324

2425
ensure_valid_role
@@ -54,7 +55,8 @@ def to_h
5455
output_tokens: output_tokens,
5556
model_id: model_id,
5657
cache_creation_tokens: cache_creation_tokens,
57-
cached_tokens: cached_tokens
58+
cached_tokens: cached_tokens,
59+
reasoning_id: reasoning_id
5860
}.compact
5961
end
6062

lib/ruby_llm/providers/openai/response.rb

Lines changed: 119 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -40,39 +40,72 @@ def render_response_payload(messages, tools:, temperature:, model:, cache_prompt
4040
payload
4141
end
4242

43-
# Converts a list of messages into the flat array of input items sent in the
# request payload.
#
# Each message is formatted by format_message_input, which returns either a
# single Hash or an Array of Hashes; flat_map merges both shapes into one
# flat list, so no additional flatten pass is needed.
#
# @param messages [Array] messages responding to +tool_calls+ (a Hash-like
#   object or nil) plus whatever format_message_input reads
# @return [Array<Hash>] input items in message order
def format_input(messages)
  # Collected up front so tool-result messages can be matched to their calls.
  all_tool_calls = messages.flat_map { |m| m.tool_calls&.values || [] }

  messages.flat_map { |msg| format_message_input(msg, all_tool_calls) }
end
49+
50+
# Picks the formatter that matches the kind of message being serialized.
#
# Checks run in this order: tool-call message, tool-result message,
# assistant message carrying a generated image, then any other message.
#
# @param msg message to serialize
# @param all_tool_calls [Array] every tool call in the conversation
# @return [Hash, Array<Hash>] one or more input items
def format_message_input(msg, all_tool_calls)
  return format_tool_call_message(msg) if msg.tool_call?
  return format_tool_response_message(msg, all_tool_calls) if msg.role == :tool
  return format_image_generation_message(msg) if assistant_message_with_image_attachment?(msg)

  format_regular_message(msg)
end
7561

62+
# Serializes every tool call on the message as a 'function_call' input item.
#
# @param msg message whose +tool_calls+ is a Hash of tool-call objects
#   (each responding to +id+, +name+ and +arguments+)
# @return [Array<Hash>] one item per tool call, in Hash order
def format_tool_call_message(msg)
  msg.tool_calls.values.map do |call|
    {
      type: 'function_call',
      call_id: call.id,
      name: call.name,
      # Arguments are sent as a JSON-encoded string.
      arguments: JSON.generate(call.arguments),
      status: 'completed'
    }
  end
end
73+
74+
# Serializes a tool-result message as a 'function_call_output' input item.
#
# The call_id is taken from the known tool call whose id equals the message's
# +tool_call_id+; when no such call exists the call_id is nil.
#
# @param msg tool-result message (responds to +tool_call_id+ and +content+)
# @param all_tool_calls [Array] every tool call in the conversation
# @return [Hash] the input item
def format_tool_response_message(msg, all_tool_calls)
  matching_call = all_tool_calls.find { |call| call.id == msg.tool_call_id }

  {
    type: 'function_call_output',
    call_id: matching_call&.id,
    output: msg.content,
    status: 'completed'
  }
end
82+
83+
# Serializes an assistant message that carries generated images as input
# items referencing those earlier generations by id.
#
# Every RubyLLM::ImageAttachment on the message is emitted, not only the
# first one, so a turn that produced several images keeps all of them.
# A 'reasoning' item (with an empty summary) is emitted once per distinct
# reasoning_id and placed before the 'image_generation_call' items.
#
# @param msg assistant message whose +content.attachments+ holds the images
# @return [Array<Hash>] reasoning items followed by image_generation_call items
def format_image_generation_message(msg)
  images = msg.content.attachments.grep(RubyLLM::ImageAttachment)

  reasoning_items = images.filter_map(&:reasoning_id).uniq.map do |reasoning_id|
    { type: 'reasoning', id: reasoning_id, summary: [] }
  end
  image_items = images.map do |image|
    { type: 'image_generation_call', id: image.id }
  end

  reasoning_items + image_items
end
99+
100+
# Serializes an ordinary message as a 'message' input item.
#
# The role is mapped through format_role and the content through
# ResponseMedia.format_content; keys whose value is nil are dropped.
#
# @param msg message responding to +role+ and +content+
# @return [Hash] the input item without nil-valued keys
def format_regular_message(msg)
  item = {
    type: 'message',
    role: format_role(msg.role),
    content: ResponseMedia.format_content(msg.content),
    status: 'completed'
  }
  item.compact
end
108+
76109
def format_role(role)
77110
case role
78111
when :system
@@ -93,23 +126,81 @@ def parse_respond_response(response)
93126

94127
Message.new(
95128
role: :assistant,
96-
content: all_output_text(outputs),
129+
content: all_output_content(outputs),
97130
tool_calls: parse_response_tool_calls(outputs),
98131
input_tokens: data['usage']['input_tokens'],
99132
output_tokens: data['usage']['output_tokens'],
100133
cached_tokens: data.dig('usage', 'input_tokens_details', 'cached_tokens'),
101134
model_id: data['model'],
135+
reasoning_id: extract_reasoning_id(outputs),
102136
raw: response
103137
)
104138
end
105139

140+
# Builds the assistant message content from the response's output items.
#
# Returns the joined text when the response contains no
# 'image_generation_call' items; otherwise returns a RubyLLM::Content holding
# that text plus one image attachment per generated image.
#
# The reasoning id is looked up once here and passed down explicitly, so no
# state is stashed in an instance variable between calls.
#
# @param outputs [Array<Hash>] output items of the response
# @return [String, RubyLLM::Content]
def all_output_content(outputs)
  text_content = extract_text_content(outputs)
  image_outputs = outputs.select { |o| o['type'] == 'image_generation_call' }
  return text_content if image_outputs.empty?

  build_content_with_images(text_content, image_outputs, extract_reasoning_id(outputs))
end

private

# Joins the text of every 'output_text' part found in 'message' output items,
# separated by newlines.
#
# @param outputs [Array<Hash>] output items of the response
# @return [String] joined text ("" when there is none)
def extract_text_content(outputs)
  outputs.select { |o| o['type'] == 'message' }.flat_map do |o|
    o['content'].filter_map do |c|
      c['type'] == 'output_text' && c['text']
    end
  end.join("\n")
end

# Wraps the text in a RubyLLM::Content and attaches every generated image.
#
# @param text_content [String] text of the response
# @param image_outputs [Array<Hash>] 'image_generation_call' output items
# @param reasoning_id [String, nil] id recorded on each attachment
# @return [RubyLLM::Content]
def build_content_with_images(text_content, image_outputs, reasoning_id = nil)
  content = RubyLLM::Content.new(text_content)
  image_outputs.each do |output|
    attach_image_to_content(content, output, reasoning_id)
  end
  content
end
168+
169+
# Attaches one generated image to the content as a RubyLLM::ImageAttachment.
#
# The MIME type is "image/<output_format>", falling back to png when the
# output item carries no 'output_format'.
#
# @param content object responding to +attach+
# @param output [Hash] an 'image_generation_call' output item
#   (reads 'result', 'output_format' and 'id')
# @param reasoning_id [String, nil] reasoning id recorded on the attachment
# @return whatever +content.attach+ returns
def attach_image_to_content(content, output, reasoning_id)
  extension = output['output_format'] || 'png'

  attachment = RubyLLM::ImageAttachment.new(
    data: output['result'],
    mime_type: "image/#{extension}",
    model_id: nil,
    id: output['id'],
    reasoning_id: reasoning_id
  )
  content.attach(attachment)
end
184+
106185
# Joins the text of every 'output_text' part found in 'message' output items,
# separated by newlines.
#
# @param outputs [Array<Hash>] output items of the response
# @return [String] joined text ("" when there is none)
def all_output_text(outputs)
  message_items = outputs.select { |item| item['type'] == 'message' }

  text_parts = message_items.flat_map do |item|
    # Non-text parts and parts without text are dropped by filter_map.
    item['content'].filter_map { |part| part['type'] == 'output_text' && part['text'] }
  end

  text_parts.join("\n")
end
192+
193+
# Tells whether the message is an assistant message whose content is a
# RubyLLM::Content with a RubyLLM::ImageAttachment as its first attachment.
#
# @param msg message responding to +role+ and +content+
# @return [Boolean]
def assistant_message_with_image_attachment?(msg)
  return false unless msg.role == :assistant

  content = msg.content
  return false unless content.is_a?(RubyLLM::Content)

  attachments = content.attachments
  attachments.any? && attachments.first.is_a?(RubyLLM::ImageAttachment)
end
199+
200+
# Returns the id of the first 'reasoning' output item.
#
# @param outputs [Array<Hash>] output items of the response
# @return [String, nil] the id, or nil when there is no reasoning item
def extract_reasoning_id(outputs)
  outputs.find { |item| item['type'] == 'reasoning' }&.dig('id')
end
113204
end
114205
end
115206
end

lib/ruby_llm/providers/openai/response_media.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ class OpenAI
77
module ResponseMedia
88
module_function
99

10-
def format_content(content)
10+
def format_content(content) # rubocop:disable Metrics/PerceivedComplexity
1111
return content.to_json if content.is_a?(Hash) || content.is_a?(Array)
1212
return content unless content.is_a?(Content)
1313

1414
parts = []
15-
parts << format_text(content.text) if content.text
15+
parts << format_text(content.text) if content.text && !content.text.empty?
1616

1717
content.attachments.each do |attachment|
1818
case attachment.type

lib/ruby_llm/providers/openai/streaming.rb

Lines changed: 105 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,42 +27,64 @@ def build_chunk(data)
2727
# Turns one streaming event into a chunk by dispatching on the event's
# 'type'. Event types without a dedicated handler yield an empty chunk.
#
# @param data [Hash] parsed streaming event
# @return the chunk built by the matching handler
def build_responses_chunk(data)
  event_type = data['type']

  case event_type
  when 'response.output_text.delta' then build_text_delta_chunk(data)
  when 'response.function_call_arguments.delta' then build_tool_call_delta_chunk(data)
  when 'response.image_generation_call.partial_image' then build_partial_image_chunk(data)
  when 'response.output_item.added' then handle_output_item_added(data)
  when 'response.output_item.done' then handle_output_item_done(data)
  when 'response.completed' then build_completion_chunk(data)
  else build_empty_chunk(data)
  end
end
45+
46+
# Builds an assistant chunk whose content is the event's text delta.
# Model id, tool calls and token counts are all nil on this chunk.
#
# @param data [Hash] streaming event; reads 'delta'
# @return [Chunk]
def build_text_delta_chunk(data)
  delta_text = data['delta']

  Chunk.new(role: :assistant, model_id: nil, content: delta_text,
            tool_calls: nil, input_tokens: nil, output_tokens: nil)
end
56+
57+
# Handles an 'output item added' event according to the item's type:
# function calls start a tool-call chunk, reasoning items produce a
# reasoning chunk, and everything else yields an empty chunk.
#
# @param data [Hash] streaming event; reads item.type
# @return the chunk built by the matching builder
def handle_output_item_added(data)
  case data.dig('item', 'type')
  when 'function_call' then build_tool_call_start_chunk(data)
  when 'reasoning' then build_reasoning_chunk(data)
  else build_empty_chunk(data)
  end
end
6566

67+
# Handles an 'output item done' event according to the item's type:
# function calls complete a tool-call chunk, image generation calls produce
# the finished image chunk, and everything else yields an empty chunk.
#
# @param data [Hash] streaming event; reads item.type
# @return the chunk built by the matching builder
def handle_output_item_done(data)
  case data.dig('item', 'type')
  when 'function_call' then build_tool_call_complete_chunk(data)
  when 'image_generation_call' then build_completed_image_chunk(data)
  else build_empty_chunk(data)
  end
end
76+
77+
# Builds the final assistant chunk of a stream: no content or tool calls,
# only the model id and token usage read from the event's 'response'.
#
# @param data [Hash] streaming event; reads response.model and
#   response.usage.{input_tokens,output_tokens}
# @return [Chunk]
def build_completion_chunk(data)
  model = data.dig('response', 'model')
  prompt_tokens = data.dig('response', 'usage', 'input_tokens')
  completion_tokens = data.dig('response', 'usage', 'output_tokens')

  Chunk.new(role: :assistant, model_id: model, content: nil, tool_calls: nil,
            input_tokens: prompt_tokens, output_tokens: completion_tokens)
end
87+
6688
def build_chat_completions_chunk(data)
6789
Chunk.new(
6890
role: :assistant,
@@ -145,6 +167,63 @@ def build_empty_chunk(_data)
145167
)
146168
end
147169

170+
# Builds an assistant chunk carrying a partially rendered image.
#
# The image is always labelled 'image/png' and is created without a model id
# or revised prompt.
#
# @param data [Hash] streaming event; reads 'partial_image_b64'
# @return [Chunk]
def build_partial_image_chunk(data)
  partial_image = build_image_content(data['partial_image_b64'], 'image/png', nil, nil)

  Chunk.new(role: :assistant, model_id: nil, content: partial_image,
            tool_calls: nil, input_tokens: nil, output_tokens: nil)
end
182+
183+
# Builds an assistant chunk carrying a finished generated image.
#
# The attachment records the item's 'id', matching what the non-streaming
# parser stores, so the image can be referenced again by id on a later turn.
# The content text is the item's 'revised_prompt' (or "" when absent) and the
# MIME type is "image/<output_format>", defaulting to png.
#
# @param data [Hash] streaming event; reads item.result, item.output_format,
#   item.revised_prompt and item.id
# @return [Chunk]
def build_completed_image_chunk(data)
  item = data['item']
  output_format = item['output_format'] || 'png'

  content = RubyLLM::Content.new(item['revised_prompt'] || '')
  content.attach(
    RubyLLM::ImageAttachment.new(
      data: item['result'],
      mime_type: "image/#{output_format}",
      model_id: nil,
      id: item['id']
    )
  )

  Chunk.new(
    role: :assistant,
    model_id: nil,
    content: content,
    tool_calls: nil,
    input_tokens: nil,
    output_tokens: nil
  )
end
201+
202+
# Builds a content-less assistant chunk that only carries the id of a
# reasoning item announced by the stream.
#
# @param data [Hash] streaming event; reads item.id
# @return [Chunk]
def build_reasoning_chunk(data)
  reasoning_item_id = data.dig('item', 'id')

  Chunk.new(role: :assistant, model_id: nil, content: nil, tool_calls: nil,
            input_tokens: nil, output_tokens: nil, reasoning_id: reasoning_item_id)
end
213+
214+
# Wraps a Base64 image in a RubyLLM::Content with a single image attachment.
#
# @param base64_data [String] Base64-encoded image payload
# @param mime_type [String] MIME type recorded on the attachment
# @param model_id [String, nil] model id recorded on the attachment
# @param revised_prompt [String, nil] used as the content text; "" when nil
# @return [RubyLLM::Content]
def build_image_content(base64_data, mime_type, model_id, revised_prompt = nil)
  RubyLLM::Content.new(revised_prompt || '').tap do |content|
    content.attach(
      RubyLLM::ImageAttachment.new(data: base64_data, mime_type: mime_type, model_id: model_id)
    )
  end
end
226+
148227
def create_streaming_tool_call(tool_call_data)
149228
ToolCall.new(
150229
id: tool_call_data['id'],

0 commit comments

Comments
 (0)