diff --git a/src/docs/src/AI/chat.md b/src/docs/src/AI/chat.md index ec007d54fd..794a8405b2 100755 --- a/src/docs/src/AI/chat.md +++ b/src/docs/src/AI/chat.md @@ -33,7 +33,7 @@ An object containing the following properties: - `temperature` (Number) - A number between 0 and 2 indicating the randomness of the completion. Lower values make the output more focused and deterministic, while higher values make it more random. By default, the specific model's temperature is used. - `tools` (Array) (Optional) - Function definitions the AI can call. See [Function Calling](#function-calling) for details. - `reasoning_effort` / `reasoning.effort` (String) (Optional) - Controls how much effort reasoning models spend thinking. Supported values: `none`, `minimal`, `low`, `medium`, `high`, and `xhigh`. Lower values give faster responses with less reasoning. OpenAI models only. -- `text` / `text_verbosity` (String) (Optional) - Controls how long or short responses are. Supported values: `low`, `medium`, and `high`. Lower values give shorter responses. OpenAI models only. +- `verbosity` / `text.verbosity` (String) (Optional) - Controls how long or short responses are. Supported values: `low`, `medium`, and `high`. Lower values give shorter responses. OpenAI models only. #### `testMode` (Boolean) (Optional) diff --git a/src/docs/src/AI/speech2txt.md b/src/docs/src/AI/speech2txt.md index 8d6c607237..acc994eeb5 100644 --- a/src/docs/src/AI/speech2txt.md +++ b/src/docs/src/AI/speech2txt.md @@ -44,7 +44,7 @@ Fine-tune how transcription runs. - `chunking_strategy` (String): Required for `gpt-4o-transcribe-diarize` inputs longer than 30 seconds (recommend `"auto"`). - `known_speaker_names` / `known_speaker_references` (Array): Optional diarization references encoded as data URLs. - `extra_body` (Object): Forwarded verbatim to the OpenAI API for experimental flags. -- `stream` (Boolean): Reserved for future streaming support. Currently rejected when `true`. 
+- `stream` (Boolean): Reserved for future streaming support. Streaming is not currently supported. - `test_mode` (Boolean): When `true`, returns a sample response without using credits. Defaults to `false`. **xAI-specific options** (when `provider: 'xai'`): @@ -65,8 +65,8 @@ When `true`, skips the live API call and returns a static sample transcript so y Returns a `Promise` that resolves to either: -- A string (when `response_format: "text"` or you pass a shorthand `source` with no options), or -- An object of [`Speech2TxtResult`](/Objects/speech2txtresult) containing the transcription payload (including diarization segments, timestamps, etc., depending on the selected model and format). +- A string (when `response_format: "text"`), or +- A [`Speech2TxtResult`](/Objects/speech2txtresult) object containing the transcription payload (including diarization segments, timestamps, etc., depending on the selected model and format). This is the default, including when you pass a bare `source` with no options. ## Examples @@ -79,7 +79,7 @@ Returns a `Promise` that resolves to either: diff --git a/src/docs/src/AI/txt2speech.listEngines.md b/src/docs/src/AI/txt2speech.listEngines.md index 74b521f8e6..95e2ee4951 100644 --- a/src/docs/src/AI/txt2speech.listEngines.md +++ b/src/docs/src/AI/txt2speech.listEngines.md @@ -32,14 +32,7 @@ Common aliases are also accepted (e.g. `'eleven'`, `'google'`, `'grok'`). ## Return value -A `Promise` that resolves to an array of engine objects. Each object contains: - -| Field | Type | Description | -|-------|------|-------------| -| `id` | `String` | Engine/model identifier | -| `name` | `String` | Human-readable engine name | -| `provider` | `String` | Provider this engine belongs to | -| `pricing_per_million_chars` | `Number` | Cost per million characters (may be absent) | +A `Promise` that resolves to an array of [`TTSEngine`](/Objects/ttsengine) objects.
Example response: diff --git a/src/docs/src/AI/txt2speech.listVoices.md b/src/docs/src/AI/txt2speech.listVoices.md index 78851f3e3d..0111db3126 100644 --- a/src/docs/src/AI/txt2speech.listVoices.md +++ b/src/docs/src/AI/txt2speech.listVoices.md @@ -26,19 +26,7 @@ When `options` is a plain string it is treated as an `engine` filter for the def ## Return value -A `Promise` that resolves to an array of voice objects. Each object contains: - -| Field | Type | Description | -|-------|------|-------------| -| `id` | `String` | Voice identifier to pass to `txt2speech()` | -| `name` | `String` | Human-readable voice name | -| `provider` | `String` | Provider this voice belongs to | -| `language` | `Object` | `{ name, code }` language info (may be absent) | -| `description` | `String` | Short description of the voice (may be absent) | -| `category` | `String` | Voice category, e.g. `'premade'` (may be absent) | -| `labels` | `Object` | Provider-specific labels (may be absent) | -| `supported_models` | `Array` | Model IDs this voice works with (may be absent) | -| `supported_engines` | `Array` | Engine types this voice supports (may be absent) | +A `Promise` that resolves to an array of [`TTSVoice`](/Objects/ttsvoice) objects. Example response: diff --git a/src/docs/src/AI/txt2speech.md b/src/docs/src/AI/txt2speech.md index 1824fbefff..68dcee450b 100755 --- a/src/docs/src/AI/txt2speech.md +++ b/src/docs/src/AI/txt2speech.md @@ -96,7 +96,7 @@ Available when `provider: 'xai'`: | `language` | `String` | BCP-47 language code. Defaults to `'en'`. Supports `'auto'` for auto-detection and 20+ languages | | `output_format` | `String` | Output codec. Available: `'mp3'` (default), `'wav'`, `'pcm'`, `'mulaw'`, `'alaw'` | -Text supports inline speech tags like `[pause]`, `[laugh]` and wrapping tags like `text` for expressive delivery. Maximum 15,000 characters per request. +Text supports inline speech tags like `[pause]`, `[laugh]` and wrapping tags like `text` for expressive delivery. 
For more details, see the [xAI TTS documentation](https://x.ai/news/grok-stt-and-tts-apis). diff --git a/src/docs/src/Objects.md b/src/docs/src/Objects.md index b945646eff..600fa6b558 100644 --- a/src/docs/src/Objects.md +++ b/src/docs/src/Objects.md @@ -17,6 +17,8 @@ Various object types and classes that represent different entities in the Puter - **[MonthlyUsage](/Objects/monthlyusage/)** - Represents user's monthly resource usage information - **[Speech2TxtResult](/Objects/speech2txtresult/)** - Represents speech-to-text transcription results - **[Subdomain](/Objects/subdomain/)** - Represents a subdomain +- **[TTSEngine](/Objects/ttsengine/)** - Represents an available text-to-speech engine/model +- **[TTSVoice](/Objects/ttsvoice/)** - Represents an available text-to-speech voice - **[ToolCall](/Objects/toolcall/)** - Represents a tool invocation request - **[User](/Objects/user/)** - Represents a Puter user - **[WorkerDeployment](/Objects/workerdeployment/)** - Represents a worker deployment result diff --git a/src/docs/src/Objects/chatresponse.md b/src/docs/src/Objects/chatresponse.md index 8d9a6c1feb..8d4305e5be 100644 --- a/src/docs/src/Objects/chatresponse.md +++ b/src/docs/src/Objects/chatresponse.md @@ -16,3 +16,9 @@ An object containing the chat message data. - `content` (String) - The content of the message. - `tool_calls` (Array) - An optional array of [`ToolCall`](/Objects/toolcall) objects if the model wants to call tools. + +- `tool_call_id` (String) - An optional identifier linking this message to the tool call it responds to. + +- `cache_control` (Object) - An optional object controlling prompt caching for this message. Contains a `type` (String) property. + +- `images` (Array) - An array of image content objects associated with the message. Each object contains a `type` (String) and an `image_url` object with a `url` (String) property. 
diff --git a/src/docs/src/Objects/chatresponsechunk.md b/src/docs/src/Objects/chatresponsechunk.md index 9a59abf387..5622481771 100644 --- a/src/docs/src/Objects/chatresponsechunk.md +++ b/src/docs/src/Objects/chatresponsechunk.md @@ -5,8 +5,44 @@ description: The ChatResponseChunk object containing a chunk of streaming chat r The `ChatResponseChunk` object containing a chunk of streaming chat response data. +Each chunk has a `type` indicating its kind. The other attributes that are present depend on that `type`. + ## Attributes +#### `type` (String) + +The kind of chunk. One of: + +- `"text"` - A portion of the response text. +- `"reasoning"` - A portion of the model's reasoning/thinking output. +- `"tool_use"` - A tool/function the model wants to call. +- `"extra_content"` - Provider-specific metadata. +- `"usage"` - Token usage totals, emitted as the final chunk. + #### `text` (String) -A string containing a portion of the chat response text in streaming mode. +A portion of the chat response text. Present on `text` chunks. + +#### `reasoning` (String) + +A portion of the model's reasoning output. Present on `reasoning` chunks. + +#### `id` (String) + +The unique identifier for the tool call. Present on `tool_use` chunks. + +#### `name` (String) + +The name of the function/tool to call. Present on `tool_use` chunks. + +#### `input` (Object) + +The parsed arguments for the tool call. Present on `tool_use` chunks. + +#### `extra_content` + +Provider-specific metadata attached to the stream. + +#### `usage` (Object) + +An object containing token usage totals. Present on the final `usage` chunk. diff --git a/src/docs/src/Objects/speech2txtresult.md b/src/docs/src/Objects/speech2txtresult.md index 48b4fcb007..8d3fa1e192 100644 --- a/src/docs/src/Objects/speech2txtresult.md +++ b/src/docs/src/Objects/speech2txtresult.md @@ -18,3 +18,16 @@ A string containing the detected or specified language of the audio. 
#### `segments` (Array) An optional array of segment objects containing detailed transcription information. + +#### `duration` (Number) + +An optional duration of the audio in seconds. Provider-dependent (e.g. returned by xAI). + +#### `words` (Array) + +An optional array of per-word timestamp objects. Provider-dependent (e.g. returned by xAI). Each word has: + +- `text` (String): The transcribed word. +- `start` (Number): Start time of the word in seconds. +- `end` (Number): End time of the word in seconds. +- `speaker` (String): Detected speaker, present when `diarize: true`. diff --git a/src/docs/src/Objects/ttsengine.md b/src/docs/src/Objects/ttsengine.md new file mode 100644 index 0000000000..3978d783a1 --- /dev/null +++ b/src/docs/src/Objects/ttsengine.md @@ -0,0 +1,24 @@ +--- +title: TTSEngine +description: The TTSEngine object describing an available text-to-speech engine/model. +--- + +The `TTSEngine` object describes a text-to-speech engine/model available from a provider, including pricing metadata where available. Arrays of these objects are returned by [`puter.ai.txt2speech.listEngines()`](/AI/txt2speech.listEngines). + +## Attributes + +#### `id` (String) + +The engine/model identifier. + +#### `name` (String) + +A human-readable engine name. + +#### `provider` (String) + +The provider this engine belongs to, e.g. `'aws-polly'`, `'openai'`, `'elevenlabs'`, `'gemini'`, `'xai'`. + +#### `pricing_per_million_chars` (Number) + +An optional cost per million characters. May be absent when the provider does not expose pricing. diff --git a/src/docs/src/Objects/ttsvoice.md b/src/docs/src/Objects/ttsvoice.md new file mode 100644 index 0000000000..fc6cea76ae --- /dev/null +++ b/src/docs/src/Objects/ttsvoice.md @@ -0,0 +1,44 @@ +--- +title: TTSVoice +description: The TTSVoice object describing an available text-to-speech voice. 
+--- + +The `TTSVoice` object describes a text-to-speech voice available from a provider, including metadata such as language, category, and supported models/engines. Arrays of these objects are returned by [`puter.ai.txt2speech.listVoices()`](/AI/txt2speech.listVoices). + +## Attributes + +#### `id` (String) + +The voice identifier to pass to [`puter.ai.txt2speech()`](/AI/txt2speech). + +#### `name` (String) + +A human-readable voice name. + +#### `provider` (String) + +The provider this voice belongs to, e.g. `'aws-polly'`, `'openai'`, `'elevenlabs'`, `'gemini'`, `'xai'`. + +#### `language` (Object) + +An optional object describing the voice's language. Contains a `name` (String) and a `code` (String) property. May be absent. + +#### `description` (String) + +An optional short description of the voice. May be absent. + +#### `category` (String) + +An optional voice category, e.g. `'premade'`. May be absent. + +#### `labels` (Object) + +An optional object of provider-specific labels. May be absent. + +#### `supported_models` (Array) + +An optional array of model IDs (Strings) this voice works with. May be absent. + +#### `supported_engines` (Array) + +An optional array of engine types (Strings) this voice supports. May be absent. 
diff --git a/src/docs/src/playground/examples/ai-speech2txt.html b/src/docs/src/playground/examples/ai-speech2txt.html index c21d758891..ba367f24cb 100644 --- a/src/docs/src/playground/examples/ai-speech2txt.html +++ b/src/docs/src/playground/examples/ai-speech2txt.html @@ -4,7 +4,7 @@ diff --git a/src/docs/src/sidebar.js b/src/docs/src/sidebar.js index db265a1617..3957a332c3 100755 --- a/src/docs/src/sidebar.js +++ b/src/docs/src/sidebar.js @@ -1243,6 +1243,20 @@ let sidebar = [ source: '/Objects/subdomain.md', path: '/Objects/subdomain', }, + { + title: 'TTSEngine', + title_tag: 'TTSEngine', + icon: '/assets/img/object.svg', + source: '/Objects/ttsengine.md', + path: '/Objects/ttsengine', + }, + { + title: 'TTSVoice', + title_tag: 'TTSVoice', + icon: '/assets/img/object.svg', + source: '/Objects/ttsvoice.md', + path: '/Objects/ttsvoice', + }, { title: 'ToolCall', title_tag: 'ToolCall', diff --git a/src/puter-js/types/modules/ai.d.ts b/src/puter-js/types/modules/ai.d.ts index b2e30e2b9d..9d1883d2a8 100644 --- a/src/puter-js/types/modules/ai.d.ts +++ b/src/puter-js/types/modules/ai.d.ts @@ -1,9 +1,17 @@ export type AIMessageContent = string | { image_url?: { url: string } } | { video_url?: { url: string } } | Record; +export interface ImageContent { + type: string; + image_url: { url: string }; +} + export interface ChatMessage { role?: string; content: AIMessageContent | AIMessageContent[]; tool_calls?: ToolCall[]; + tool_call_id?: string; + cache_control?: { type: string }; + /** Image content objects associated with the message, when present. */ images?: ImageContent[]; } export interface ToolCall { @@ -11,18 +19,55 @@ export interface ToolCall { function: { name: string, arguments: string }; } +export interface Tool { + type: string; + function: { name: string, description: string, parameters: object, strict?: boolean }; +} + +/** + * Options for a chat completion request. + */ export interface ChatOptions { + /** The model to use for the completion. Defaults to `gpt-5-nano` if not specified.
*/ model?: string; + /** Sampling temperature between 0 and 2. Lower values are more focused and deterministic, higher values more random. Defaults to the model's own default. */ temperature?: number; max_tokens?: number; vision?: boolean; driver?: string; - tools?: unknown; + /** The provider to route the request through. */ + provider?: string; + /** Function/tool definitions the model can call. See Function Calling. */ + tools?: Tool[]; response?: unknown; - reasoning?: unknown; + /** + * Controls how much effort reasoning models spend thinking. Flat form. + * Accepted values: `none`, `minimal`, `low`, `medium`, `high`, `xhigh` + * (availability varies by model; default `medium` on newer GPT-5.x models). + * Reasoning models only. + */ reasoning_effort?: string; - text?: unknown; - verbosity?: unknown; + /** + * Nested form of `reasoning_effort`. The `effort` value accepts the same + * values as `reasoning_effort`. Reasoning models only. + */ + reasoning?: { effort: string}; + /** + * Controls how long or short responses are. Flat form. Accepted values: + * `low`, `medium`, `high`. Reasoning models only. + */ + verbosity?: string; + /** + * Nested form of `verbosity` — it lives under `text`. The `verbosity` value + * accepts the same values as `verbosity`. Reasoning models only. + */ + text?: { verbosity: string}; + /** + * Controls image output for image-capable models. + * - `aspect_ratio`: aspect ratio of the generated image, e.g. `"16:9"`, `"1:1"`, `"9:16"`. + * - `image_size`: output quality/resolution; must be one of the model's supported quality levels. + */ + image_config?: { aspect_ratio: string, image_size: string }; } export interface StreamingChatOptions extends ChatOptions { @@ -34,9 +79,27 @@ export interface ChatResponse { choices?: unknown; } +/** + * A single chunk of a streaming chat response. Each chunk has a `type` + * discriminator; which other fields are present depends on that `type`. 
+ */ export interface ChatResponseChunk { + /** The kind of chunk: `"text"`, `"reasoning"`, `"tool_use"`, `"extra_content"`, or `"usage"`. */ + type: string; + /** Text delta. Present on `"text"` chunks. */ text?: string; + /** Reasoning/thinking delta. Present on `"reasoning"` chunks. */ reasoning?: string; + /** Tool call id. Present on `"tool_use"` chunks. */ + id?: string; + /** Tool/function name. Present on `"tool_use"` chunks. */ + name?: string; + /** Parsed tool call arguments. Present on `"tool_use"` chunks. */ + input?: unknown; + /** Provider-specific extra metadata. */ + extra_content?: unknown; + /** Token usage totals. Present on the final `"usage"` chunk. */ + usage?: Record; } export interface Img2TxtOptions { @@ -53,30 +116,88 @@ export interface Img2TxtOptions { } export interface Txt2ImgOptions { + /** Text description of the image to generate. */ prompt?: string; + /** + * Image model to use (provider-specific). Defaults to `'gpt-image-1-mini'` + * (OpenAI), or `'grok-2-image'` when `provider` is `'xai'`. + */ model?: string; + /** + * Image quality / output size tier. Interpretation is provider- and + * model-specific: + * - OpenAI GPT models: `'high'` | `'medium'` | `'low'` (default `'low'`); + * `gpt-image-2` also accepts `'auto'`. + * - OpenAI DALL-E 3: `'hd'` | `'standard'` (default `'standard'`). + * - Gemini: output size tier `'512'` | `'1K'` | `'2K'` | `'4K'` + * (availability varies by model). + */ quality?: string; + /** + * An input image for image-to-image generation. Replicate expects a URL; + * Gemini expects a base64-encoded image. + */ input_image?: string; + /** + * Multiple input images for image-to-image / multi-image generation. + * Gemini expects base64-encoded images; Replicate expects image URLs. + */ + input_images?: string[]; + /** + * MIME type of the input image(s), e.g. `'image/png'`. Used as a fallback + * when the type cannot be auto-detected (Gemini). 
+ */ input_image_mime_type?: string; driver?: string; provider?: string; service?: string; + /** + * Aspect ratio as `{ w, h }` (e.g. `{ w: 16, h: 9 }`). Supported by OpenAI, + * Gemini, and Replicate. + */ ratio?: { w: number; h: number }; + /** Width of the image to generate, in pixels (Together). Default `1024`. */ width?: number; + /** Height of the image to generate, in pixels (Together). Default `1024`. */ height?: number; + /** Alternative way to specify the aspect ratio (Together). */ aspect_ratio?: string; + /** + * Number of generation/inference steps (Together, default `20`; Replicate + * `flux-schnell`). + */ steps?: number; + /** Seed used for generation; reuse to reproduce results (Together, Replicate). */ seed?: number; + /** Prompt describing what NOT to guide the image generation toward (Together). */ negative_prompt?: string; + /** Number of image results to generate (Together). Default `1`. */ n?: number; + /** URL of an input image for models that support it (Together). */ image_url?: string; + /** Base64-encoded input image for image-to-image generation (Together). */ image_base64?: string; + /** URL of a mask image for inpainting (Together). */ mask_image_url?: string; + /** Base64-encoded mask image for inpainting (Together). */ mask_image_base64?: string; + /** How strongly the prompt influences the output (Together). */ prompt_strength?: number; + /** When `true`, disables the safety checker (Together, Replicate). */ disable_safety_checker?: boolean; + /** + * Format of the image response. Together: `'base64'` | `'url'`. Replicate: + * output format, e.g. `'webp'` | `'jpg'` | `'png'`. + */ response_format?: string; + /** When `true`, returns a sample image without using credits. */ test_mode?: boolean; + /** + * When set, the generated image is saved to this path on the Puter + * filesystem. Relative paths resolve against the app's data directory + * (`~/AppData//`) when called from an app, or `~/` otherwise. 
The + * caller must have write permission to the destination. + */ puter_output_path?: string; } @@ -111,24 +232,114 @@ export interface Txt2VidOptions { } export interface Txt2SpeechOptions { + /** Text to synthesize. Must be less than 3000 characters. */ text?: string; + /** Language code. For AWS Polly defaults to `'en-US'`; for xAI a BCP-47 code defaulting to `'en'` (supports `'auto'`). */ language?: string; + /** Voice ID used for synthesis (provider-specific). Defaults to `'Joanna'` (aws-polly), `'alloy'` (openai), `'21m00Tcm4TlvDq8ikWAM'` (elevenlabs), `'Kore'` (gemini), `'eve'` (xai). */ voice?: string; + /** AWS Polly synthesis engine: `'standard'` (default), `'neural'`, `'long-form'`, or `'generative'`. */ engine?: string; + /** TTS provider: `'aws-polly'` (default), `'openai'`, `'elevenlabs'`, `'gemini'`, or `'xai'`. */ provider?: string; + /** Model identifier (provider-specific). */ model?: string; + /** OpenAI output format: `'mp3'` (default), `'wav'`, `'opus'`, `'aac'`, `'flac'`, or `'pcm'`. */ response_format?: string; + /** Output format for ElevenLabs (defaults to `'mp3_44100_128'`) and xAI (`'mp3'` default, `'wav'`, `'pcm'`, `'mulaw'`, `'alaw'`). */ output_format?: string; + /** Natural-language guidance for voice style such as tone, speed, and mood (OpenAI and Gemini). */ instructions?: string; + /** ElevenLabs voice tuning options (e.g. stability, similarity boost, speed). */ voice_settings?: Record; + /** When `true`, AWS Polly treats `text` as SSML markup. */ ssml?: boolean; + /** When `true`, returns a sample audio without using credits. */ test_mode?: boolean; } +export interface ListTTSEnginesOptions { + /** TTS provider to query. Defaults to `'aws-polly'`. */ + provider?: string; +} + +/** A TTS engine/model as returned by `txt2speech.listEngines()`. */ +export interface TTSEngine { + /** Engine/model identifier. */ + id: string; + /** Human-readable engine name. */ + name: string; + /** Provider this engine belongs to. 
*/ + provider: string; + /** Cost per million characters (may be absent). */ + pricing_per_million_chars?: number; +} + +export interface ListTTSVoicesOptions { + /** TTS provider to query. Defaults to `'aws-polly'`. */ + provider?: string; + /** Engine/model filter (provider-specific, ignored by some providers). */ + engine?: string; +} + +/** A TTS voice as returned by `txt2speech.listVoices()`. */ +export interface TTSVoice { + /** Voice identifier to pass to `txt2speech()`. */ + id: string; + /** Human-readable voice name. */ + name: string; + /** Provider this voice belongs to. */ + provider: string; + /** Language info (may be absent). */ + language?: { name: string; code: string }; + /** Short description of the voice (may be absent). */ + description?: string; + /** Voice category, e.g. `'premade'` (may be absent). */ + category?: string; + /** Provider-specific labels (may be absent). */ + labels?: Record; + /** Model IDs this voice works with (may be absent). */ + supported_models?: string[]; + /** Engine types this voice supports (may be absent). */ + supported_engines?: string[]; +} + +/** + * Converts text to speech. Callable directly, with `listEngines` and + * `listVoices` helpers attached for discovering available engines and voices. + */ +export interface Txt2Speech { + (text: string, testMode?: boolean): Promise; + (text: string, options: Txt2SpeechOptions, testMode?: boolean): Promise; + (text: string, language: string, testMode?: boolean): Promise; + (text: string, language: string, voice: string, testMode?: boolean): Promise; + (text: string, language: string, voice: string, engine: string, testMode?: boolean): Promise; + + /** List available TTS engines/models with pricing information. */ + listEngines (provider?: string): Promise; + listEngines (options?: ListTTSEnginesOptions): Promise; + + /** List available TTS voices, optionally filtered by provider/engine. 
*/ + listVoices (engine?: string): Promise; + listVoices (options?: ListTTSVoicesOptions): Promise; +} + +export interface Speech2TxtWord { + text: string; + start: number; + end: number; + /** Detected speaker, present when `diarize: true` (xAI). */ + speaker?: string; +} + export interface Speech2TxtResult { text: string; language: string; segments?: Record[]; + /** Duration of the audio in seconds (provider-dependent, e.g. xAI). */ + duration?: number; + /** Per-word timestamps (provider-dependent, e.g. xAI). */ + words?: Speech2TxtWord[]; } interface BaseSpeech2TxtOptions { @@ -211,7 +422,7 @@ export class AI { txt2vid (prompt: string, options: Txt2VidOptions): Promise; txt2vid (options: Txt2VidOptions, testMode?: boolean): Promise; - speech2txt (source: string | File | Blob, testMode?: boolean): Promise; + speech2txt (source: string | File | Blob, testMode?: boolean): Promise; speech2txt (source: string | File | Blob, options: TextFormatSpeech2TxtOptions, testMode?: boolean): Promise; speech2txt (source: string | File | Blob, options: Speech2TxtOptions, testMode?: boolean): Promise; speech2txt (options: TextFormatSpeech2TxtOptions, testMode?: boolean): Promise; @@ -221,11 +432,7 @@ export class AI { speech2speech (source: string | File | Blob, options: Speech2SpeechOptions, testMode?: boolean): Promise; speech2speech (options: Speech2SpeechOptions, testMode?: boolean): Promise; - txt2speech (text: string, testMode?: boolean): Promise; - txt2speech (text: string, options: Txt2SpeechOptions, testMode?: boolean): Promise; - txt2speech (text: string, language: string, testMode?: boolean): Promise; - txt2speech (text: string, language: string, voice: string, testMode?: boolean): Promise; - txt2speech (text: string, language: string, voice: string, engine: string, testMode?: boolean): Promise; + txt2speech: Txt2Speech; } // NOTE: AI responses contain provider-specific payloads that are not fully typed here because