diff --git a/README.md b/README.md
index 62792e2..2359967 100644
--- a/README.md
+++ b/README.md
@@ -382,10 +382,32 @@ const vttOutput = webvtt(result);
 
 # Text to Speech
 
+## Rest
+
 ```js
 const { result } = await deepgram.speak.request({ text }, { model: "aura-asteria-en" });
 ```
 
+## Websocket
+
+```js
+const dgConnection = deepgram.speak.live({ model: "aura-asteria-en" });
+
+dgConnection.on(LiveTTSEvents.Open, () => {
+  console.log("Connection opened");
+
+  // Send text data for TTS synthesis
+  dgConnection.sendText(text);
+
+  // Send Flush message to the server after sending the text
+  dgConnection.flush();
+
+  dgConnection.on(LiveTTSEvents.Close, () => {
+    console.log("Connection closed");
+  });
+});
+```
+
 [See our API reference for more info](https://developers.deepgram.com/reference/text-to-speech-api).
 
 # Text Intelligence
diff --git a/examples/node-speak-live/index.js b/examples/node-speak-live/index.js
new file mode 100644
index 0000000..4cc1ceb
--- /dev/null
+++ b/examples/node-speak-live/index.js
@@ -0,0 +1,62 @@
+const fs = require("fs");
+const { createClient, LiveTTSEvents } = require("../../dist/main/index");
+
+const live = async () => {
+  const text = "Hello, how can I help you today?";
+
+  const deepgram = createClient(process.env.DEEPGRAM_API_KEY);
+
+  const dgConnection = deepgram.speak.live({ model: "aura-asteria-en" });
+
+  let audioBuffer = Buffer.alloc(0);
+
+  dgConnection.on(LiveTTSEvents.Open, () => {
+    console.log("Connection opened");
+
+    // Send text data for TTS synthesis
+    dgConnection.sendText(text);
+
+    // Send Flush message to the server after sending the text
+    dgConnection.flush();
+
+    dgConnection.on(LiveTTSEvents.Close, () => {
+      console.log("Connection closed");
+    });
+
+    dgConnection.on(LiveTTSEvents.Metadata, (data) => {
+      console.dir(data, { depth: null });
+    });
+
+    dgConnection.on(LiveTTSEvents.Audio, (data) => {
+      console.log("Deepgram audio data received");
+      // Concatenate the audio chunks into a single buffer
+      const buffer = Buffer.from(data);
+      audioBuffer = Buffer.concat([audioBuffer, buffer]);
+    });
+
+    dgConnection.on(LiveTTSEvents.Flushed, () => {
+      console.log("Deepgram Flushed");
+      // Write the buffered audio data to a file when the flush event is received
+      writeFile();
+    });
+
+    dgConnection.on(LiveTTSEvents.Error, (err) => {
+      console.error(err);
+    });
+  });
+
+  const writeFile = () => {
+    if (audioBuffer.length > 0) {
+      fs.writeFile("output.mp3", audioBuffer, (err) => {
+        if (err) {
+          console.error("Error writing audio file:", err);
+        } else {
+          console.log("Audio file saved as output.mp3");
+        }
+      });
+      audioBuffer = Buffer.alloc(0); // Reset buffer after writing
+    }
+  };
+};
+
+live();
diff --git a/src/lib/enums/LiveTTSEvents.ts b/src/lib/enums/LiveTTSEvents.ts
new file mode 100644
index 0000000..6c39a47
--- /dev/null
+++ b/src/lib/enums/LiveTTSEvents.ts
@@ -0,0 +1,37 @@
+/**
+ * Enumeration of events related to live text-to-speech synthesis.
+ *
+ * - `Open`: Built-in socket event for when the connection is opened.
+ * - `Close`: Built-in socket event for when the connection is closed.
+ * - `Error`: Built-in socket event for when an error occurs.
+ * - `Metadata`: Event for when metadata is received.
+ * - `Flushed`: Event for when the server has flushed the buffer.
+ * - `Warning`: Event for when a warning is received.
+ * - `Audio`: Event for when binary audio data is received.
+ * - `Unhandled`: Catch-all event for any other message event.
+ */
+export enum LiveTTSEvents {
+  /**
+   * Built in socket events.
+   */
+  Open = "Open",
+  Close = "Close",
+  Error = "Error",
+
+  /**
+   * Message { type: string }
+   */
+  Metadata = "Metadata",
+  Flushed = "Flushed",
+  Warning = "Warning",
+
+  /**
+   * Audio data event.
+   */
+  Audio = "Audio",
+
+  /**
+   * Catch all for any other message event
+   */
+  Unhandled = "Unhandled",
+}
diff --git a/src/lib/enums/index.ts b/src/lib/enums/index.ts
index 42226c9..d85c2cd 100644
--- a/src/lib/enums/index.ts
+++ b/src/lib/enums/index.ts
@@ -1,2 +1,3 @@
 export * from "./LiveConnectionState";
 export * from "./LiveTranscriptionEvents";
+export * from "./LiveTTSEvents";
diff --git a/src/packages/SpeakClient.ts b/src/packages/SpeakClient.ts
new file mode 100644
index 0000000..097e070
--- /dev/null
+++ b/src/packages/SpeakClient.ts
@@ -0,0 +1,35 @@
+import { AbstractClient } from "./AbstractClient";
+import { SpeakLiveClient } from "./SpeakLiveClient";
+import { SpeakRestClient } from "./SpeakRestClient";
+import { SpeakSchema } from "../lib/types";
+import { TextSource } from "../lib/types";
+
+/**
+ * The `SpeakClient` class extends the `AbstractClient` class and provides access to the "speak" namespace.
+ * It exposes two methods:
+ *
+ * 1. `request()`: Creates a `SpeakRestClient` and returns the result of calling its `request()` method with the given source, options and endpoint.
+ * 2. `live(ttsOptions: SpeakSchema = {}, endpoint = ":version/speak")`: Returns a `SpeakLiveClient` instance for interacting with the live speak API, with the provided TTS options and endpoint.
+ */
+export class SpeakClient extends AbstractClient {
+  public namespace: string = "speak";
+
+  /**
+   * Creates a `SpeakRestClient` with this client's options and returns the result of its `request()` call.
+   */
+  public request(source: TextSource, options?: SpeakSchema, endpoint = ":version/speak") {
+    const client = new SpeakRestClient(this.options);
+
+    return client.request(source, options, endpoint);
+  }
+
+  /**
+   * Returns a `SpeakLiveClient` instance for interacting with the live speak API, with the provided TTS options and endpoint.
+   * @param {SpeakSchema} [ttsOptions={}] - The TTS options to use for the live speak API.
+   * @param {string} [endpoint=":version/speak"] - The endpoint to use for the live speak API.
+   * @returns {SpeakLiveClient} - A `SpeakLiveClient` instance for interacting with the live speak API.
+   */
+  public live(ttsOptions: SpeakSchema = {}, endpoint: string = ":version/speak"): SpeakLiveClient {
+    return new SpeakLiveClient(this.options, ttsOptions, endpoint);
+  }
+}
diff --git a/src/packages/SpeakLiveClient.ts b/src/packages/SpeakLiveClient.ts
new file mode 100644
index 0000000..47b3b2f
--- /dev/null
+++ b/src/packages/SpeakLiveClient.ts
@@ -0,0 +1,165 @@
+import { AbstractLiveClient } from "./AbstractLiveClient";
+import { LiveTTSEvents } from "../lib/enums";
+import type { SpeakSchema, DeepgramClientOptions } from "../lib/types";
+
+/**
+ * The `SpeakLiveClient` class extends the `AbstractLiveClient` class and provides functionality for setting up and managing a WebSocket connection for live text-to-speech synthesis.
+ *
+ * The constructor takes in `DeepgramClientOptions` and an optional `SpeakSchema` object, as well as an optional `endpoint` string. It then calls the `connect` method of the parent `AbstractLiveClient` class to establish the WebSocket connection.
+ *
+ * The `setupConnection` method is responsible for handling the various events that can occur on the WebSocket connection, such as opening, closing, and receiving messages. It sets up event handlers for these events and emits the appropriate events based on the message type.
+ *
+ * The `sendText`, `flush` and `reset` methods send `Speak`, `Flush` and `Reset` messages to the server over the connection.
+ *
+ * The `requestClose` method requests the server to close the connection.
+ */
+export class SpeakLiveClient extends AbstractLiveClient {
+  public namespace: string = "speak";
+
+  /**
+   * Constructs a new `SpeakLiveClient` instance with the provided options.
+   *
+   * @param options - The `DeepgramClientOptions` to use for the client connection.
+   * @param speakOptions - An optional `SpeakSchema` object containing additional configuration options for the text-to-speech.
+   * @param endpoint - An optional string representing the WebSocket endpoint to connect to. Defaults to `:version/speak`.
+   */
+  constructor(
+    options: DeepgramClientOptions,
+    speakOptions: SpeakSchema = {},
+    endpoint: string = ":version/speak"
+  ) {
+    super(options);
+
+    this.connect(speakOptions, endpoint);
+  }
+
+  /**
+   * Sets up the connection event handlers.
+   * This method is responsible for handling the various events that can occur on the WebSocket connection, such as opening, closing, and receiving data.
+   * - When the connection is opened, it emits the `LiveTTSEvents.Open` event.
+   * - When the connection is closed, it emits the `LiveTTSEvents.Close` event.
+   * - When an error occurs on the connection, it emits the `LiveTTSEvents.Error` event.
+   * - When a message is received, it parses the message and emits the appropriate event based on the message type, such as `LiveTTSEvents.Metadata`, `LiveTTSEvents.Flushed`, and `LiveTTSEvents.Warning`.
+   */
+  public setupConnection(): void {
+    if (this.conn) {
+      this.conn.onopen = () => {
+        this.emit(LiveTTSEvents.Open, this);
+      };
+
+      this.conn.onclose = (event: any) => {
+        this.emit(LiveTTSEvents.Close, event);
+      };
+
+      this.conn.onerror = (event: ErrorEvent) => {
+        this.emit(LiveTTSEvents.Error, event);
+      };
+
+      this.conn.onmessage = (event: MessageEvent) => {
+        this.handleMessage(event);
+      };
+    }
+  }
+
+  /**
+   * Handles text messages received from the WebSocket connection.
+   * @param data - The parsed JSON data.
+   */
+  protected handleTextMessage(data: any): void {
+    if (data.type === LiveTTSEvents.Metadata) {
+      this.emit(LiveTTSEvents.Metadata, data);
+    } else if (data.type === LiveTTSEvents.Flushed) {
+      this.emit(LiveTTSEvents.Flushed, data);
+    } else if (data.type === LiveTTSEvents.Warning) {
+      this.emit(LiveTTSEvents.Warning, data);
+    } else {
+      this.emit(LiveTTSEvents.Unhandled, data);
+    }
+  }
+
+  /**
+   * Handles binary messages received from the WebSocket connection.
+   * @param data - The binary data.
+   */
+  protected handleBinaryMessage(data: ArrayBuffer): void {
+    this.emit(LiveTTSEvents.Audio, data);
+  }
+
+  /**
+   * Sends a text input message to the server.
+   *
+   * @param {string} text - The text to convert to speech.
+   */
+  public sendText(text: string): void {
+    this.send(
+      JSON.stringify({
+        type: "Speak",
+        text,
+      })
+    );
+  }
+
+  /**
+   * Requests the server flush the current buffer and return generated audio.
+   */
+  public flush(): void {
+    this.send(
+      JSON.stringify({
+        type: "Flush",
+      })
+    );
+  }
+
+  /**
+   * Requests the server reset the current buffer.
+   */
+  public reset(): void {
+    this.send(
+      JSON.stringify({
+        type: "Reset",
+      })
+    );
+  }
+
+  /**
+   * Requests the server close the connection.
+   */
+  public requestClose(): void {
+    this.send(
+      JSON.stringify({
+        type: "Close",
+      })
+    );
+  }
+
+  /**
+   * Handles incoming messages from the WebSocket connection.
+   * @param event - The MessageEvent object representing the received message.
+   */
+  protected handleMessage(event: MessageEvent): void {
+    if (typeof event.data === "string") {
+      try {
+        const data = JSON.parse(event.data);
+        this.handleTextMessage(data);
+      } catch (error) {
+        this.emit(LiveTTSEvents.Error, {
+          event,
+          message: "Unable to parse `data` as JSON.",
+          error,
+        });
+      }
+    } else if (event.data instanceof ArrayBuffer) {
+      this.handleBinaryMessage(event.data);
+    } else if (Buffer.isBuffer(event.data)) {
+      // A Node `Buffer` may be a view into a larger backing `ArrayBuffer`; copy only its own bytes.
+      const { buffer, byteOffset, byteLength } = event.data;
+      this.handleBinaryMessage(buffer.slice(byteOffset, byteOffset + byteLength));
+    } else {
+      console.log("Received unknown data type", event.data);
+      this.emit(LiveTTSEvents.Error, {
+        event,
+        message: "Received unknown data type.",
+      });
+    }
+  }
+}
diff --git a/src/packages/SpeakRestClient.ts b/src/packages/SpeakRestClient.ts
index 0695e96..30db0a9 100644
--- a/src/packages/SpeakRestClient.ts
+++ b/src/packages/SpeakRestClient.ts
@@ -75,5 +75,3 @@ export class SpeakRestClient extends AbstractRestClient {
     return this.result.headers;
   }
 }
-
-export { SpeakRestClient as SpeakClient };
diff --git a/src/packages/index.ts b/src/packages/index.ts
index 92d7bdf..ce580f8 100644
--- a/src/packages/index.ts
+++ b/src/packages/index.ts
@@ -7,4 +7,6 @@ export * from "./ListenRestClient";
 export * from "./ManageRestClient";
 export * from "./ReadRestClient";
 export * from "./SelfHostedRestClient";
+export * from "./SpeakClient";
+export * from "./SpeakLiveClient";
 export * from "./SpeakRestClient";