forked from Significant-Gravitas/AutoGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(rnd,blocks): Add D-ID Block (Significant-Gravitas#7798)
* talking head * linting * remove clip id, not needed * add more descriptive name * add min requirement to polling attempts and intervals * add docs and link to docs * remove extra space * force new tab * fix linting * add did key to .env.template
- Loading branch information
1 parent
efcd0f9
commit 012bad7
Showing
7 changed files
with
299 additions
and
157 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Find available voices for D-ID | ||
|
||
1. **ElevenLabs** | ||
- Select any voice from the voice list: https://api.elevenlabs.io/v1/voices | ||
- Copy the voice_id | ||
- Use it as a string in the voice_id field in the CreateTalkingAvatarClip Block | ||
|
||
2. **Microsoft Azure Voices** | ||
- Select any voice from the voice gallery: https://speech.microsoft.com/portal/voicegallery | ||
- Click on the "Sample code" tab on the right | ||
- Copy the voice name, for example: config.SpeechSynthesisVoiceName ="en-GB-AbbiNeural" | ||
- Use this string en-GB-AbbiNeural in the voice_id field in the CreateTalkingAvatarClip Block | ||
|
||
3. **Amazon Polly Voices** | ||
- Select any voice from the voice list: https://docs.aws.amazon.com/polly/latest/dg/available-voices.html | ||
- Copy the voice name / ID | ||
- Use it as string in the voice_id field in the CreateTalkingAvatarClip Block |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,4 +16,7 @@ DISCORD_BOT_TOKEN= | |
SMTP_SERVER= | ||
SMTP_PORT= | ||
SMTP_USERNAME= | ||
SMTP_PASSWORD= | ||
SMTP_PASSWORD= | ||
|
||
# D-ID | ||
DID_API_KEY= |
146 changes: 146 additions & 0 deletions
146
rnd/autogpt_server/autogpt_server/blocks/talking_head.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
import time | ||
from typing import Literal | ||
|
||
import requests | ||
|
||
from autogpt_server.data.block import Block, BlockCategory, BlockOutput, BlockSchema | ||
from autogpt_server.data.model import BlockSecret, SchemaField, SecretField | ||
|
||
|
||
class CreateTalkingAvatarClipBlock(Block): | ||
class Input(BlockSchema): | ||
api_key: BlockSecret = SecretField( | ||
key="did_api_key", description="D-ID API Key" | ||
) | ||
script_input: str = SchemaField( | ||
description="The text input for the script", default="Welcome to AutoGPT" | ||
) | ||
provider: Literal["microsoft", "elevenlabs", "amazon"] = SchemaField( | ||
description="The voice provider to use", default="microsoft" | ||
) | ||
voice_id: str = SchemaField( | ||
description="The voice ID to use, get list of voices [here](https://docs.agpt.co/server/d_id)", | ||
default="en-US-JennyNeural", | ||
) | ||
presenter_id: str = SchemaField( | ||
description="The presenter ID to use", default="amy-Aq6OmGZnMt" | ||
) | ||
driver_id: str = SchemaField( | ||
description="The driver ID to use", default="Vcq0R4a8F0" | ||
) | ||
result_format: Literal["mp4", "gif", "wav"] = SchemaField( | ||
description="The desired result format", default="mp4" | ||
) | ||
crop_type: Literal["wide", "square", "vertical"] = SchemaField( | ||
description="The crop type for the presenter", default="wide" | ||
) | ||
subtitles: bool = SchemaField( | ||
description="Whether to include subtitles", default=False | ||
) | ||
ssml: bool = SchemaField(description="Whether the input is SSML", default=False) | ||
max_polling_attempts: int = SchemaField( | ||
description="Maximum number of polling attempts", default=30, ge=5 | ||
) | ||
polling_interval: int = SchemaField( | ||
description="Interval between polling attempts in seconds", default=10, ge=5 | ||
) | ||
|
||
class Output(BlockSchema): | ||
video_url: str = SchemaField(description="The URL of the created video") | ||
error: str = SchemaField(description="Error message if the request failed") | ||
|
||
def __init__(self): | ||
super().__init__( | ||
id="98c6f503-8c47-4b1c-a96d-351fc7c87dab", | ||
description="This block integrates with D-ID to create video clips and retrieve their URLs.", | ||
categories={BlockCategory.AI}, | ||
input_schema=CreateTalkingAvatarClipBlock.Input, | ||
output_schema=CreateTalkingAvatarClipBlock.Output, | ||
test_input={ | ||
"api_key": "your_test_api_key", | ||
"script_input": "Welcome to AutoGPT", | ||
"voice_id": "en-US-JennyNeural", | ||
"presenter_id": "amy-Aq6OmGZnMt", | ||
"driver_id": "Vcq0R4a8F0", | ||
"result_format": "mp4", | ||
"crop_type": "wide", | ||
"subtitles": False, | ||
"ssml": False, | ||
"max_polling_attempts": 5, | ||
"polling_interval": 5, | ||
}, | ||
test_output=[ | ||
( | ||
"video_url", | ||
"https://d-id.com/api/clips/abcd1234-5678-efgh-ijkl-mnopqrstuvwx/video", | ||
), | ||
], | ||
test_mock={ | ||
"create_clip": lambda *args, **kwargs: { | ||
"id": "abcd1234-5678-efgh-ijkl-mnopqrstuvwx", | ||
"status": "created", | ||
}, | ||
"get_clip_status": lambda *args, **kwargs: { | ||
"status": "done", | ||
"result_url": "https://d-id.com/api/clips/abcd1234-5678-efgh-ijkl-mnopqrstuvwx/video", | ||
}, | ||
}, | ||
) | ||
|
||
def create_clip(self, api_key: str, payload: dict) -> dict: | ||
url = "https://api.d-id.com/clips" | ||
headers = { | ||
"accept": "application/json", | ||
"content-type": "application/json", | ||
"authorization": f"Basic {api_key}", | ||
} | ||
response = requests.post(url, json=payload, headers=headers) | ||
response.raise_for_status() | ||
return response.json() | ||
|
||
def get_clip_status(self, api_key: str, clip_id: str) -> dict: | ||
url = f"https://api.d-id.com/clips/{clip_id}" | ||
headers = {"accept": "application/json", "authorization": f"Basic {api_key}"} | ||
response = requests.get(url, headers=headers) | ||
response.raise_for_status() | ||
return response.json() | ||
|
||
def run(self, input_data: Input) -> BlockOutput: | ||
try: | ||
# Create the clip | ||
payload = { | ||
"script": { | ||
"type": "text", | ||
"subtitles": str(input_data.subtitles).lower(), | ||
"provider": { | ||
"type": input_data.provider, | ||
"voice_id": input_data.voice_id, | ||
}, | ||
"ssml": str(input_data.ssml).lower(), | ||
"input": input_data.script_input, | ||
}, | ||
"config": {"result_format": input_data.result_format}, | ||
"presenter_config": {"crop": {"type": input_data.crop_type}}, | ||
"presenter_id": input_data.presenter_id, | ||
"driver_id": input_data.driver_id, | ||
} | ||
|
||
response = self.create_clip(input_data.api_key.get_secret_value(), payload) | ||
clip_id = response["id"] | ||
|
||
# Poll for clip status | ||
for _ in range(input_data.max_polling_attempts): | ||
status_response = self.get_clip_status( | ||
input_data.api_key.get_secret_value(), clip_id | ||
) | ||
if status_response["status"] == "done": | ||
yield "video_url", status_response["result_url"] | ||
return | ||
elif status_response["status"] == "error": | ||
yield "error", f"Clip creation failed: {status_response.get('error', 'Unknown error')}" | ||
return | ||
time.sleep(input_data.polling_interval) | ||
|
||
yield "error", "Clip creation timed out" | ||
except Exception as e: | ||
yield "error", str(e) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters