diff --git a/src/docs/src/AI/chat.md b/src/docs/src/AI/chat.md
index ec007d54fd..794a8405b2 100755
--- a/src/docs/src/AI/chat.md
+++ b/src/docs/src/AI/chat.md
@@ -33,7 +33,7 @@ An object containing the following properties:
 - `temperature` (Number) - A number between 0 and 2 indicating the randomness of the completion. Lower values make the output more focused and deterministic, while higher values make it more random. By default, the specific model's temperature is used.
 - `tools` (Array) (Optional) - Function definitions the AI can call. See [Function Calling](#function-calling) for details.
 - `reasoning_effort` / `reasoning.effort` (String) (Optional) - Controls how much effort reasoning models spend thinking. Supported values: `none`, `minimal`, `low`, `medium`, `high`, and `xhigh`. Lower values give faster responses with less reasoning. OpenAI models only.
-- `text` / `text_verbosity` (String) (Optional) - Controls how long or short responses are. Supported values: `low`, `medium`, and `high`. Lower values give shorter responses. OpenAI models only.
+- `verbosity` / `text.verbosity` (String) (Optional) - Controls how long or short responses are. Supported values: `low`, `medium`, and `high`. Lower values give shorter responses. OpenAI models only.
 
 #### `testMode` (Boolean) (Optional)
 
diff --git a/src/docs/src/AI/speech2txt.md b/src/docs/src/AI/speech2txt.md
index 8d6c607237..acc994eeb5 100644
--- a/src/docs/src/AI/speech2txt.md
+++ b/src/docs/src/AI/speech2txt.md
@@ -44,7 +44,7 @@ Fine-tune how transcription runs.
 - `chunking_strategy` (String): Required for `gpt-4o-transcribe-diarize` inputs longer than 30 seconds (recommend `"auto"`).
 - `known_speaker_names` / `known_speaker_references` (Array): Optional diarization references encoded as data URLs.
 - `extra_body` (Object): Forwarded verbatim to the OpenAI API for experimental flags.
-- `stream` (Boolean): Reserved for future streaming support. Currently rejected when `true`.
+- `stream` (Boolean): Reserved for future streaming support. Streaming is not currently supported.
 - `test_mode` (Boolean): When `true`, returns a sample response without using credits. Defaults to `false`.
 
 **xAI-specific options** (when `provider: 'xai'`):
@@ -65,8 +65,8 @@ When `true`, skips the live API call and returns a static sample transcript so y
 
 Returns a `Promise` that resolves to either:
 
-- A string (when `response_format: "text"` or you pass a shorthand `source` with no options), or
-- An object of [`Speech2TxtResult`](/Objects/speech2txtresult) containing the transcription payload (including diarization segments, timestamps, etc., depending on the selected model and format).
+- A string (when `response_format: "text"`), or
+- An object of [`Speech2TxtResult`](/Objects/speech2txtresult) containing the transcription payload (including diarization segments, timestamps, etc., depending on the selected model and format). This is the default, including when you pass a bare `source` with no options.
 
 ## Examples
 
@@ -79,7 +79,7 @@ Returns a `Promise` that resolves to either:
     <script>
         (async () => {
             const transcript = await puter.ai.speech2txt('https://assets.puter.site/example.mp3');
-            puter.print('Transcript:', transcript.text ?? transcript);
+            puter.print('Transcript:', transcript.text);
         })();
     </script>
 </body>
diff --git a/src/docs/src/AI/txt2speech.listEngines.md b/src/docs/src/AI/txt2speech.listEngines.md
index 74b521f8e6..95e2ee4951 100644
--- a/src/docs/src/AI/txt2speech.listEngines.md
+++ b/src/docs/src/AI/txt2speech.listEngines.md
@@ -32,14 +32,7 @@ Common aliases are also accepted (e.g. `'eleven'`, `'google'`, `'grok'`).
 
 ## Return value
 
-A `Promise` that resolves to an array of engine objects. Each object contains:
-
-| Field | Type | Description |
-|-------|------|-------------|
-| `id` | `String` | Engine/model identifier |
-| `name` | `String` | Human-readable engine name |
-| `provider` | `String` | Provider this engine belongs to |
-| `pricing_per_million_chars` | `Number` | Cost per million characters (may be absent) |
+A `Promise` that resolves to an array of [`TTSEngine`](/Objects/ttsengine) objects.
 
 Example response:
 
diff --git a/src/docs/src/AI/txt2speech.listVoices.md b/src/docs/src/AI/txt2speech.listVoices.md
index 78851f3e3d..0111db3126 100644
--- a/src/docs/src/AI/txt2speech.listVoices.md
+++ b/src/docs/src/AI/txt2speech.listVoices.md
@@ -26,19 +26,7 @@ When `options` is a plain string it is treated as an `engine` filter for the def
 
 ## Return value
 
-A `Promise` that resolves to an array of voice objects. Each object contains:
-
-| Field | Type | Description |
-|-------|------|-------------|
-| `id` | `String` | Voice identifier to pass to `txt2speech()` |
-| `name` | `String` | Human-readable voice name |
-| `provider` | `String` | Provider this voice belongs to |
-| `language` | `Object` | `{ name, code }` language info (may be absent) |
-| `description` | `String` | Short description of the voice (may be absent) |
-| `category` | `String` | Voice category, e.g. `'premade'` (may be absent) |
-| `labels` | `Object` | Provider-specific labels (may be absent) |
-| `supported_models` | `Array` | Model IDs this voice works with (may be absent) |
-| `supported_engines` | `Array` | Engine types this voice supports (may be absent) |
+A `Promise` that resolves to an array of [`TTSVoice`](/Objects/ttsvoice) objects.
 
 Example response:
 
diff --git a/src/docs/src/AI/txt2speech.md b/src/docs/src/AI/txt2speech.md
index 1824fbefff..68dcee450b 100755
--- a/src/docs/src/AI/txt2speech.md
+++ b/src/docs/src/AI/txt2speech.md
@@ -96,7 +96,7 @@ Available when `provider: 'xai'`:
 | `language` | `String` | BCP-47 language code. Defaults to `'en'`. Supports `'auto'` for auto-detection and 20+ languages |
 | `output_format` | `String` | Output codec. Available: `'mp3'` (default), `'wav'`, `'pcm'`, `'mulaw'`, `'alaw'` |
 
-Text supports inline speech tags like `[pause]`, `[laugh]` and wrapping tags like `<whisper>text</whisper>` for expressive delivery. Maximum 15,000 characters per request.
+Text supports inline speech tags like `[pause]`, `[laugh]` and wrapping tags like `<whisper>text</whisper>` for expressive delivery.
 
 For more details, see the [xAI TTS documentation](https://x.ai/news/grok-stt-and-tts-apis).
 
diff --git a/src/docs/src/Objects.md b/src/docs/src/Objects.md
index b945646eff..600fa6b558 100644
--- a/src/docs/src/Objects.md
+++ b/src/docs/src/Objects.md
@@ -17,6 +17,8 @@ Various object types and classes that represent different entities in the Puter
 - **[MonthlyUsage](/Objects/monthlyusage/)** - Represents user's monthly resource usage information
 - **[Speech2TxtResult](/Objects/speech2txtresult/)** - Represents speech-to-text transcription results
 - **[Subdomain](/Objects/subdomain/)** - Represents a subdomain
+- **[TTSEngine](/Objects/ttsengine/)** - Represents an available text-to-speech engine/model
+- **[TTSVoice](/Objects/ttsvoice/)** - Represents an available text-to-speech voice
 - **[ToolCall](/Objects/toolcall/)** - Represents a tool invocation request
 - **[User](/Objects/user/)** - Represents a Puter user
 - **[WorkerDeployment](/Objects/workerdeployment/)** - Represents a worker deployment result
diff --git a/src/docs/src/Objects/chatresponse.md b/src/docs/src/Objects/chatresponse.md
index 8d9a6c1feb..8d4305e5be 100644
--- a/src/docs/src/Objects/chatresponse.md
+++ b/src/docs/src/Objects/chatresponse.md
@@ -16,3 +16,9 @@ An object containing the chat message data.
 - `content` (String) - The content of the message.
 
 - `tool_calls` (Array) - An optional array of [`ToolCall`](/Objects/toolcall) objects if the model wants to call tools.
+
+- `tool_call_id` (String) - An optional identifier linking this message to the tool call it responds to.
+
+- `cache_control` (Object) - An optional object controlling prompt caching for this message. Contains a `type` (String) property.
+
+- `images` (Array) - An array of image content objects associated with the message. Each object contains a `type` (String) and an `image_url` object with a `url` (String) property.
diff --git a/src/docs/src/Objects/chatresponsechunk.md b/src/docs/src/Objects/chatresponsechunk.md
index 9a59abf387..5622481771 100644
--- a/src/docs/src/Objects/chatresponsechunk.md
+++ b/src/docs/src/Objects/chatresponsechunk.md
@@ -5,8 +5,44 @@ description: The ChatResponseChunk object containing a chunk of streaming chat r
 
 The `ChatResponseChunk` object containing a chunk of streaming chat response data.
 
+Each chunk has a `type` indicating its kind. The other attributes that are present depend on that `type`.
+
 ## Attributes
 
+#### `type` (String)
+
+The kind of chunk. One of:
+
+- `"text"` - A portion of the response text.
+- `"reasoning"` - A portion of the model's reasoning/thinking output.
+- `"tool_use"` - A tool/function the model wants to call.
+- `"extra_content"` - Provider-specific metadata.
+- `"usage"` - Token usage totals, emitted as the final chunk.
+
 #### `text` (String)
 
-A string containing a portion of the chat response text in streaming mode.
+A portion of the chat response text. Present on `text` chunks.
+
+#### `reasoning` (String)
+
+A portion of the model's reasoning output. Present on `reasoning` chunks.
+
+#### `id` (String)
+
+The unique identifier for the tool call. Present on `tool_use` chunks.
+
+#### `name` (String)
+
+The name of the function/tool to call. Present on `tool_use` chunks.
+
+#### `input` (Object)
+
+The parsed arguments for the tool call. Present on `tool_use` chunks.
+
+#### `extra_content`
+
+Provider-specific metadata attached to the stream.
+
+#### `usage` (Object)
+
+An object containing token usage totals. Present on the final `usage` chunk.
diff --git a/src/docs/src/Objects/speech2txtresult.md b/src/docs/src/Objects/speech2txtresult.md
index 48b4fcb007..8d3fa1e192 100644
--- a/src/docs/src/Objects/speech2txtresult.md
+++ b/src/docs/src/Objects/speech2txtresult.md
@@ -18,3 +18,16 @@ A string containing the detected or specified language of the audio.
 #### `segments` (Array)
 
 An optional array of segment objects containing detailed transcription information.
+
+#### `duration` (Number)
+
+An optional duration of the audio in seconds. Provider-dependent (e.g. returned by xAI).
+
+#### `words` (Array)
+
+An optional array of per-word timestamp objects. Provider-dependent (e.g. returned by xAI). Each word has:
+
+- `text` (String): The transcribed word.
+- `start` (Number): Start time of the word in seconds.
+- `end` (Number): End time of the word in seconds.
+- `speaker` (String): Detected speaker, present when `diarize: true`.
diff --git a/src/docs/src/Objects/ttsengine.md b/src/docs/src/Objects/ttsengine.md
new file mode 100644
index 0000000000..3978d783a1
--- /dev/null
+++ b/src/docs/src/Objects/ttsengine.md
@@ -0,0 +1,24 @@
+---
+title: TTSEngine
+description: The TTSEngine object describing an available text-to-speech engine/model.
+---
+
+The `TTSEngine` object describes a text-to-speech engine/model available from a provider, including pricing metadata where available. Arrays of these objects are returned by [`puter.ai.txt2speech.listEngines()`](/AI/txt2speech.listEngines).
+
+## Attributes
+
+#### `id` (String)
+
+The engine/model identifier.
+
+#### `name` (String)
+
+A human-readable engine name.
+
+#### `provider` (String)
+
+The provider this engine belongs to, e.g. `'aws-polly'`, `'openai'`, `'elevenlabs'`, `'gemini'`, `'xai'`.
+
+#### `pricing_per_million_chars` (Number)
+
+An optional cost per million characters. May be absent when the provider does not expose pricing.
diff --git a/src/docs/src/Objects/ttsvoice.md b/src/docs/src/Objects/ttsvoice.md
new file mode 100644
index 0000000000..fc6cea76ae
--- /dev/null
+++ b/src/docs/src/Objects/ttsvoice.md
@@ -0,0 +1,44 @@
+---
+title: TTSVoice
+description: The TTSVoice object describing an available text-to-speech voice.
+---
+
+The `TTSVoice` object describes a text-to-speech voice available from a provider, including metadata such as language, category, and supported models/engines. Arrays of these objects are returned by [`puter.ai.txt2speech.listVoices()`](/AI/txt2speech.listVoices).
+
+## Attributes
+
+#### `id` (String)
+
+The voice identifier to pass to [`puter.ai.txt2speech()`](/AI/txt2speech).
+
+#### `name` (String)
+
+A human-readable voice name.
+
+#### `provider` (String)
+
+The provider this voice belongs to, e.g. `'aws-polly'`, `'openai'`, `'elevenlabs'`, `'gemini'`, `'xai'`.
+
+#### `language` (Object)
+
+An optional object describing the voice's language. Contains a `name` (String) and a `code` (String) property. May be absent.
+
+#### `description` (String)
+
+An optional short description of the voice. May be absent.
+
+#### `category` (String)
+
+An optional voice category, e.g. `'premade'`. May be absent.
+
+#### `labels` (Object)
+
+An optional object of provider-specific labels. May be absent.
+
+#### `supported_models` (Array)
+
+An optional array of model IDs (Strings) this voice works with. May be absent.
+
+#### `supported_engines` (Array)
+
+An optional array of engine types (Strings) this voice supports. May be absent.
diff --git a/src/docs/src/playground/examples/ai-speech2txt.html b/src/docs/src/playground/examples/ai-speech2txt.html
index c21d758891..ba367f24cb 100644
--- a/src/docs/src/playground/examples/ai-speech2txt.html
+++ b/src/docs/src/playground/examples/ai-speech2txt.html
@@ -4,7 +4,7 @@
     <script>
     (async () => {
         const transcript = await puter.ai.speech2txt('https://assets.puter.site/example.mp3');
-        puter.print('Transcript:', transcript.text ?? transcript);
+        puter.print('Transcript:', transcript.text);
     })();
     </script>
 </body>
diff --git a/src/docs/src/sidebar.js b/src/docs/src/sidebar.js
index db265a1617..3957a332c3 100755
--- a/src/docs/src/sidebar.js
+++ b/src/docs/src/sidebar.js
@@ -1243,6 +1243,20 @@ let sidebar = [
                 source: '/Objects/subdomain.md',
                 path: '/Objects/subdomain',
             },
+            {
+                title: '<code>TTSEngine</code>',
+                title_tag: 'TTSEngine',
+                icon: '/assets/img/object.svg',
+                source: '/Objects/ttsengine.md',
+                path: '/Objects/ttsengine',
+            },
+            {
+                title: '<code>TTSVoice</code>',
+                title_tag: 'TTSVoice',
+                icon: '/assets/img/object.svg',
+                source: '/Objects/ttsvoice.md',
+                path: '/Objects/ttsvoice',
+            },
             {
                 title: '<code>ToolCall</code>',
                 title_tag: 'ToolCall',
diff --git a/src/puter-js/types/modules/ai.d.ts b/src/puter-js/types/modules/ai.d.ts
index b2e30e2b9d..9d1883d2a8 100644
--- a/src/puter-js/types/modules/ai.d.ts
+++ b/src/puter-js/types/modules/ai.d.ts
@@ -1,9 +1,17 @@
 export type AIMessageContent = string | { image_url?: { url: string } } | { video_url?: { url: string } } | Record<string, unknown>;
 
+export interface ImageContent {
+    type: string;
+    image_url: { url: string };
+}
+
 export interface ChatMessage {
     role?: string;
     content: AIMessageContent | AIMessageContent[];
     tool_calls?: ToolCall[];
+    /** Identifier linking this message to the tool call it responds to. */ tool_call_id?: string;
+    /** Prompt-caching control for this message. */ cache_control?: { type: string };
+    /** Image content objects associated with the message. Optional so plain `{ role, content }` messages still type-check. */ images?: ImageContent[];
 }
 
 export interface ToolCall {
@@ -11,18 +19,55 @@ export interface ToolCall {
     function: { name: string, arguments: string };
 }
 
+export interface Tool { // One function/tool definition the model can call (element type of `ChatOptions.tools`).
+    type: string;
+    /** The callable function. Only `name` is required; `description`, `parameters` and `strict` may be omitted. */ function: { name: string, description?: string, parameters?: object, strict?: boolean };
+}
+
+/**
+ * Options for a chat completion request.
+ */
 export interface ChatOptions {
+    /** The model to use for the completion. Defaults to `gpt-5-nano` if not specified. */
     model?: string;
+    /** Sampling temperature between 0 and 2. Lower values are more focused and deterministic, higher values more random. Defaults to the model's own default. */
     temperature?: number;
     max_tokens?: number;
     vision?: boolean;
     driver?: string;
-    tools?: unknown;
+    /** The provider to route the request through. */
+    provider?: string;
+    /** Function/tool definitions the model can call. See Function Calling. */
+    tools?: Tool[];
     response?: unknown;
-    reasoning?: unknown;
+    /**
+     * Controls how much effort reasoning models spend thinking. Flat form.
+     * Accepted values: `none`, `minimal`, `low`, `medium`, `high`, `xhigh`
+     * (availability varies by model; default `medium` on newer GPT-5.x models).
+     * Reasoning models only.
+     */
     reasoning_effort?: string;
-    text?: unknown;
-    verbosity?: unknown;
+    /**
+     * Nested form of `reasoning_effort`. The `effort` value accepts the same
+     * values as `reasoning_effort`. Reasoning models only.
+     */
+    reasoning?: { effort: string};
+    /**
+     * Controls how long or short responses are. Flat form. Accepted values:
+     * `low`, `medium`, `high`. Reasoning models only.
+     */
+    verbosity?: string;
+    /**
+     * Nested form of `verbosity` — it lives under `text`. The `verbosity` value
+     * accepts the same values as `verbosity`. Reasoning models only.
+     */
+    text?: { verbosity: string};
+    /**
+     * Controls image output for image-capable models.
+     * - `aspect_ratio`: aspect ratio of the generated image, e.g. `"16:9"`, `"1:1"`, `"9:16"`.
+     * - `image_size`: output quality/resolution; must be one of the model's supported quality levels.
+     */
+    image_config?: { aspect_ratio: string, image_size: string };
 }
 
 export interface StreamingChatOptions extends ChatOptions {
@@ -34,9 +79,27 @@ export interface ChatResponse {
     choices?: unknown;
 }
 
+/**
+ * A single chunk of a streaming chat response. Each chunk has a `type`
+ * discriminator; which other fields are present depends on that `type`.
+ */
 export interface ChatResponseChunk {
+    /** The kind of chunk: `"text"`, `"reasoning"`, `"tool_use"`, `"extra_content"`, or `"usage"`. */
+    type: string;
+    /** Text delta. Present on `"text"` chunks. */
     text?: string;
+    /** Reasoning/thinking delta. Present on `"reasoning"` chunks. */
     reasoning?: string;
+    /** Tool call id. Present on `"tool_use"` chunks. */
+    id?: string;
+    /** Tool/function name. Present on `"tool_use"` chunks. */
+    name?: string;
+    /** Parsed tool call arguments. Present on `"tool_use"` chunks. */
+    input?: unknown;
+    /** Provider-specific extra metadata. */
+    extra_content?: unknown;
+    /** Token usage totals. Present on the final `"usage"` chunk. */
+    usage?: Record<string, number>;
 }
 
 export interface Img2TxtOptions {
@@ -53,30 +116,88 @@ export interface Img2TxtOptions {
 }
 
 export interface Txt2ImgOptions {
+    /** Text description of the image to generate. */
     prompt?: string;
+    /**
+     * Image model to use (provider-specific). Defaults to `'gpt-image-1-mini'`
+     * (OpenAI), or `'grok-2-image'` when `provider` is `'xai'`.
+     */
     model?: string;
+    /**
+     * Image quality / output size tier. Interpretation is provider- and
+     * model-specific:
+     * - OpenAI GPT models: `'high'` | `'medium'` | `'low'` (default `'low'`);
+     *   `gpt-image-2` also accepts `'auto'`.
+     * - OpenAI DALL-E 3: `'hd'` | `'standard'` (default `'standard'`).
+     * - Gemini: output size tier `'512'` | `'1K'` | `'2K'` | `'4K'`
+     *   (availability varies by model).
+     */
     quality?: string;
+    /**
+     * An input image for image-to-image generation. Replicate expects a URL;
+     * Gemini expects a base64-encoded image.
+     */
     input_image?: string;
+    /**
+     * Multiple input images for image-to-image / multi-image generation.
+     * Gemini expects base64-encoded images; Replicate expects image URLs.
+     */
+    input_images?: string[];
+    /**
+     * MIME type of the input image(s), e.g. `'image/png'`. Used as a fallback
+     * when the type cannot be auto-detected (Gemini).
+     */
     input_image_mime_type?: string;
     driver?: string;
     provider?: string;
     service?: string;
+    /**
+     * Aspect ratio as `{ w, h }` (e.g. `{ w: 16, h: 9 }`). Supported by OpenAI,
+     * Gemini, and Replicate.
+     */
     ratio?: { w: number; h: number };
+    /** Width of the image to generate, in pixels (Together). Default `1024`. */
     width?: number;
+    /** Height of the image to generate, in pixels (Together). Default `1024`. */
     height?: number;
+    /** Alternative way to specify the aspect ratio (Together). */
     aspect_ratio?: string;
+    /**
+     * Number of generation/inference steps (Together, default `20`; Replicate
+     * `flux-schnell`).
+     */
     steps?: number;
+    /** Seed used for generation; reuse to reproduce results (Together, Replicate). */
     seed?: number;
+    /** Prompt describing what NOT to guide the image generation toward (Together). */
     negative_prompt?: string;
+    /** Number of image results to generate (Together). Default `1`. */
     n?: number;
+    /** URL of an input image for models that support it (Together). */
     image_url?: string;
+    /** Base64-encoded input image for image-to-image generation (Together). */
     image_base64?: string;
+    /** URL of a mask image for inpainting (Together). */
     mask_image_url?: string;
+    /** Base64-encoded mask image for inpainting (Together). */
     mask_image_base64?: string;
+    /** How strongly the prompt influences the output (Together). */
     prompt_strength?: number;
+    /** When `true`, disables the safety checker (Together, Replicate). */
     disable_safety_checker?: boolean;
+    /**
+     * Format of the image response. Together: `'base64'` | `'url'`. Replicate:
+     * output format, e.g. `'webp'` | `'jpg'` | `'png'`.
+     */
     response_format?: string;
+    /** When `true`, returns a sample image without using credits. */
     test_mode?: boolean;
+    /**
+     * When set, the generated image is saved to this path on the Puter
+     * filesystem. Relative paths resolve against the app's data directory
+     * (`~/AppData/<appID>/`) when called from an app, or `~/` otherwise. The
+     * caller must have write permission to the destination.
+     */
     puter_output_path?: string;
 }
 
@@ -111,24 +232,114 @@ export interface Txt2VidOptions {
 }
 
 export interface Txt2SpeechOptions {
+    /** Text to synthesize. Must be less than 3000 characters. */
     text?: string;
+    /** Language code. For AWS Polly defaults to `'en-US'`; for xAI a BCP-47 code defaulting to `'en'` (supports `'auto'`). */
     language?: string;
+    /** Voice ID used for synthesis (provider-specific). Defaults to `'Joanna'` (aws-polly), `'alloy'` (openai), `'21m00Tcm4TlvDq8ikWAM'` (elevenlabs), `'Kore'` (gemini), `'eve'` (xai). */
     voice?: string;
+    /** AWS Polly synthesis engine: `'standard'` (default), `'neural'`, `'long-form'`, or `'generative'`. */
     engine?: string;
+    /** TTS provider: `'aws-polly'` (default), `'openai'`, `'elevenlabs'`, `'gemini'`, or `'xai'`. */
     provider?: string;
+    /** Model identifier (provider-specific). */
     model?: string;
+    /** OpenAI output format: `'mp3'` (default), `'wav'`, `'opus'`, `'aac'`, `'flac'`, or `'pcm'`. */
     response_format?: string;
+    /** Output format for ElevenLabs (defaults to `'mp3_44100_128'`) and xAI (`'mp3'` default, `'wav'`, `'pcm'`, `'mulaw'`, `'alaw'`). */
     output_format?: string;
+    /** Natural-language guidance for voice style such as tone, speed, and mood (OpenAI and Gemini). */
     instructions?: string;
+    /** ElevenLabs voice tuning options (e.g. stability, similarity boost, speed). */
     voice_settings?: Record<string, unknown>;
+    /** When `true`, AWS Polly treats `text` as SSML markup. */
     ssml?: boolean;
+    /** When `true`, returns a sample audio without using credits. */
     test_mode?: boolean;
 }
 
+export interface ListTTSEnginesOptions {
+    /** TTS provider to query. Defaults to `'aws-polly'`. */
+    provider?: string;
+}
+
+/** A TTS engine/model as returned by `txt2speech.listEngines()`. */
+export interface TTSEngine {
+    /** Engine/model identifier. */
+    id: string;
+    /** Human-readable engine name. */
+    name: string;
+    /** Provider this engine belongs to. */
+    provider: string;
+    /** Cost per million characters (may be absent). */
+    pricing_per_million_chars?: number;
+}
+
+export interface ListTTSVoicesOptions {
+    /** TTS provider to query. Defaults to `'aws-polly'`. */
+    provider?: string;
+    /** Engine/model filter (provider-specific, ignored by some providers). */
+    engine?: string;
+}
+
+/** A TTS voice as returned by `txt2speech.listVoices()`. */
+export interface TTSVoice {
+    /** Voice identifier to pass to `txt2speech()`. */
+    id: string;
+    /** Human-readable voice name. */
+    name: string;
+    /** Provider this voice belongs to. */
+    provider: string;
+    /** Language info (may be absent). */
+    language?: { name: string; code: string };
+    /** Short description of the voice (may be absent). */
+    description?: string;
+    /** Voice category, e.g. `'premade'` (may be absent). */
+    category?: string;
+    /** Provider-specific labels (may be absent). */
+    labels?: Record<string, unknown>;
+    /** Model IDs this voice works with (may be absent). */
+    supported_models?: string[];
+    /** Engine types this voice supports (may be absent). */
+    supported_engines?: string[];
+}
+
+/**
+ * Converts text to speech. Callable directly, with `listEngines` and
+ * `listVoices` helpers attached for discovering available engines and voices.
+ */
+export interface Txt2Speech {
+    (text: string, testMode?: boolean): Promise<HTMLAudioElement>;
+    (text: string, options: Txt2SpeechOptions, testMode?: boolean): Promise<HTMLAudioElement>;
+    (text: string, language: string, testMode?: boolean): Promise<HTMLAudioElement>;
+    (text: string, language: string, voice: string, testMode?: boolean): Promise<HTMLAudioElement>;
+    (text: string, language: string, voice: string, engine: string, testMode?: boolean): Promise<HTMLAudioElement>;
+
+    /** List available TTS engines/models with pricing information. */
+    listEngines (provider?: string): Promise<TTSEngine[]>;
+    listEngines (options?: ListTTSEnginesOptions): Promise<TTSEngine[]>;
+
+    /** List available TTS voices, optionally filtered by provider/engine. */
+    listVoices (engine?: string): Promise<TTSVoice[]>;
+    listVoices (options?: ListTTSVoicesOptions): Promise<TTSVoice[]>;
+}
+
+export interface Speech2TxtWord {
+    text: string;
+    start: number;
+    end: number;
+    /** Detected speaker, present when `diarize: true` (xAI). */
+    speaker?: string;
+}
+
 export interface Speech2TxtResult {
     text: string;
     language: string;
     segments?: Record<string, unknown>[];
+    /** Duration of the audio in seconds (provider-dependent, e.g. xAI). */
+    duration?: number;
+    /** Per-word timestamps (provider-dependent, e.g. xAI). */
+    words?: Speech2TxtWord[];
 }
 
 interface BaseSpeech2TxtOptions {
@@ -211,7 +422,7 @@ export class AI {
     txt2vid (prompt: string, options: Txt2VidOptions): Promise<HTMLVideoElement>;
     txt2vid (options: Txt2VidOptions, testMode?: boolean): Promise<HTMLVideoElement>;
 
-    speech2txt (source: string | File | Blob, testMode?: boolean): Promise<string>;
+    speech2txt (source: string | File | Blob, testMode?: boolean): Promise<Speech2TxtResult>;
     speech2txt (source: string | File | Blob, options: TextFormatSpeech2TxtOptions, testMode?: boolean): Promise<string>;
     speech2txt (source: string | File | Blob, options: Speech2TxtOptions, testMode?: boolean): Promise<Speech2TxtResult>;
     speech2txt (options: TextFormatSpeech2TxtOptions, testMode?: boolean): Promise<string>;
@@ -221,11 +432,7 @@ export class AI {
     speech2speech (source: string | File | Blob, options: Speech2SpeechOptions, testMode?: boolean): Promise<HTMLAudioElement>;
     speech2speech (options: Speech2SpeechOptions, testMode?: boolean): Promise<HTMLAudioElement>;
 
-    txt2speech (text: string, testMode?: boolean): Promise<HTMLAudioElement>;
-    txt2speech (text: string, options: Txt2SpeechOptions, testMode?: boolean): Promise<HTMLAudioElement>;
-    txt2speech (text: string, language: string, testMode?: boolean): Promise<HTMLAudioElement>;
-    txt2speech (text: string, language: string, voice: string, testMode?: boolean): Promise<HTMLAudioElement>;
-    txt2speech (text: string, language: string, voice: string, engine: string, testMode?: boolean): Promise<HTMLAudioElement>;
+    txt2speech: Txt2Speech;
 }
 
 // NOTE: AI responses contain provider-specific payloads that are not fully typed here because