From 0ce9b705c644effbb94474c0fb111e691e5ec612 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Fri, 11 Oct 2024 02:55:31 -0500 Subject: [PATCH 01/10] add ollama server and model checks --- src/llms/llama.ts | 2 +- src/llms/ollama.ts | 149 ++++++++++++++++++++++++++++++++++++++++----- src/types.ts | 18 ++++++ 3 files changed, 153 insertions(+), 16 deletions(-) diff --git a/src/llms/llama.ts b/src/llms/llama.ts index c1ea1ac..a849e4d 100644 --- a/src/llms/llama.ts +++ b/src/llms/llama.ts @@ -80,7 +80,7 @@ export const callLlama: LLMFunction = async ( // Write the response to the temporary file await writeFile(tempPath, response) - log(wait(' \nLLM processing completed')) + log(wait('\n LLM processing completed')) } catch (error) { console.error(`Error in callLlama: ${error instanceof Error ? (error as Error).message : String(error)}`) throw error diff --git a/src/llms/ollama.ts b/src/llms/ollama.ts index 56431a0..6722764 100644 --- a/src/llms/ollama.ts +++ b/src/llms/ollama.ts @@ -4,13 +4,14 @@ import { writeFile } from 'node:fs/promises' import { env } from 'node:process' import { OLLAMA_MODELS } from '../models.js' import { log, wait } from '../models.js' +import { spawn } from 'child_process' -import type { LLMFunction, OllamaModelType, OllamaResponse } from '../types.js' +import type { LLMFunction, OllamaModelType, OllamaResponse, OllamaTagsResponse } from '../types.js' /** * Main function to call the Llama model using the Ollama REST API. - * This function checks if the model is available, pulls it if necessary, - * and then proceeds with the chat. + * This function ensures the Ollama server is running, checks if the model is available, + * and then proceeds with the chat using a streaming response. * @param promptAndTranscript - The combined prompt and transcript content. * @param tempPath - The temporary file path to write the LLM output. * @param modelName - The name of the model to use. @@ -21,14 +22,101 @@ export const callOllama: LLMFunction = async (promptAndTranscript: string, tempP try { // Map the model name to the Ollama model identifier const ollamaModelName = OLLAMA_MODELS[modelName as OllamaModelType] || 'llama3.2:1b' + log(wait(` - modelName: ${modelName}\n - ollamaModelName: ${ollamaModelName}`)) // Get host and port from environment variables or use defaults const ollamaHost = env.OLLAMA_HOST || 'localhost' const ollamaPort = env.OLLAMA_PORT || '11434' - log(wait(` - Using Ollama model: ${ollamaModelName} at http://${ollamaHost}:${ollamaPort}`)) - // Call the Ollama chat API - log(wait(` - Sending chat request to Ollama...`)) + // Check if Ollama server is running, start if not + async function checkServer(): Promise { + try { + const serverResponse = await fetch(`http://${ollamaHost}:${ollamaPort}`) + return serverResponse.ok + } catch (error) { + return false + } + } + + if (await checkServer()) { + log(wait(' - Ollama server is already running.')) + } else { + log(wait(' - Ollama server is not running. 
Attempting to start...')) + const ollamaProcess = spawn('ollama', ['serve'], { + detached: true, + stdio: 'ignore' + }) + ollamaProcess.unref() + + // Wait for the server to be ready + let attempts = 0 + while (attempts < 30) { // Increased to 30 attempts, 30 seconds total + if (await checkServer()) { + log(wait(' - Ollama server is now ready.')) + break + } + await new Promise(resolve => setTimeout(resolve, 1000)) + attempts++ + } + if (attempts === 30) { + throw new Error('Ollama server failed to become ready in time.') + } + } + + // Check if the model is available, pull if not + try { + const tagsResponse = await fetch(`http://${ollamaHost}:${ollamaPort}/api/tags`) + if (!tagsResponse.ok) { + throw new Error(`HTTP error! status: ${tagsResponse.status}`) + } + const tagsData = await tagsResponse.json() as OllamaTagsResponse + const isModelAvailable = tagsData.models.some(model => model.name === ollamaModelName) + if (!isModelAvailable) { + log(wait(`\n Model ${ollamaModelName} is not available, pulling the model...`)) + const pullResponse = await fetch(`http://${ollamaHost}:${ollamaPort}/api/pull`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ name: ollamaModelName }), + }) + if (!pullResponse.ok) { + throw new Error(`Failed to initiate pull for model ${ollamaModelName}`) + } + if (!pullResponse.body) { + throw new Error('Response body is null') + } + const reader = pullResponse.body.getReader() + const decoder = new TextDecoder() + while (true) { + const { done, value } = await reader.read() + if (done) break + const chunk = decoder.decode(value) + const lines = chunk.split('\n') + for (const line of lines) { + if (line.trim() === '') continue + try { + const response = JSON.parse(line) + if (response.status === 'success') { + log(wait(` - Model ${ollamaModelName} has been pulled successfully.`)) + break + } + } catch (parseError) { + console.error(`Error parsing JSON: ${parseError}`) + } + } + } + } else { + log(wait(`\n Model ${ollamaModelName} is already available...`)) + } + } catch (error) { + console.error(`Error checking/pulling model: ${error instanceof Error ? error.message : String(error)}`) + throw error + } + + log(wait(` - Sending chat request to http://${ollamaHost}:${ollamaPort} using ${ollamaModelName} model`)) + + // Call the Ollama chat API with streaming enabled const response = await fetch(`http://${ollamaHost}:${ollamaPort}/api/chat`, { method: 'POST', headers: { @@ -37,7 +125,7 @@ export const callOllama: LLMFunction = async (promptAndTranscript: string, tempP body: JSON.stringify({ model: ollamaModelName, messages: [{ role: 'user', content: promptAndTranscript }], - stream: false, + stream: true, }), }) @@ -45,15 +133,46 @@ export const callOllama: LLMFunction = async (promptAndTranscript: string, tempP throw new Error(`HTTP error! 
status: ${response.status}`) } - // Type assertion to enforce the structure of the response - // const data = await response.json() as any - const data = await response.json() as OllamaResponse + if (!response.body) { + throw new Error('Response body is null') + } + + const reader = response.body.getReader() + const decoder = new TextDecoder() + let fullContent = '' + let isFirstChunk = true + + while (true) { + const { done, value } = await reader.read() + if (done) break + + const chunk = decoder.decode(value) + const lines = chunk.split('\n') + + for (const line of lines) { + if (line.trim() === '') continue + + try { + const parsedResponse = JSON.parse(line) as OllamaResponse + if (parsedResponse.message?.content) { + if (isFirstChunk) { + log(wait(` - Receiving streaming response from Ollama...`)) + isFirstChunk = false + } + fullContent += parsedResponse.message.content + } + + if (parsedResponse.done) { + log(wait(` - Completed receiving response from Ollama.`)) + } + } catch (parseError) { + console.error(`Error parsing JSON: ${parseError}`) + } + } + } - // Extract the assistant's reply and write the response to the output file - const assistantReply = data.message.content - log(wait(` - Received response from Ollama.`)) - await writeFile(tempPath, assistantReply) - log(wait(`\n Transcript saved to temporary file:\n - ${tempPath}`)) + // Write the full content to the output file + await writeFile(tempPath, fullContent) } catch (error) { console.error(`Error in callOllama: ${error instanceof Error ? (error as Error).message : String(error)}`) console.error(`Stack Trace: ${error instanceof Error ? error.stack : 'No stack trace available'}`) diff --git a/src/types.ts b/src/types.ts index 2136e4f..4357970 100644 --- a/src/types.ts +++ b/src/types.ts @@ -343,6 +343,24 @@ export type OllamaResponse = { eval_duration: number } +export type OllamaTagsResponse = { + models: Array<{ + name: string + model: string + modified_at: string + size: number + digest: string + details: { + parent_model: string + format: string + family: string + families: string[] + parameter_size: string + quantization_level: string + } + }> +} + /** * Represents the function signature for cleaning up temporary files. 
*/ From 9c59254bb1e9ac28fc2d3e054b70d42c625936f4 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Fri, 11 Oct 2024 02:56:04 -0500 Subject: [PATCH 02/10] check for empty whisper model case --- src/transcription/whisper.ts | 21 +++++++++++++-------- src/transcription/whisperDocker.ts | 11 ++++++++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/transcription/whisper.ts b/src/transcription/whisper.ts index 2e25dcf..63d2b8f 100644 --- a/src/transcription/whisper.ts +++ b/src/transcription/whisper.ts @@ -6,7 +6,7 @@ import { promisify } from 'node:util' import { existsSync } from 'node:fs' import { WHISPER_MODELS } from '../models.js' import { log, wait } from '../models.js' -import type { ProcessingOptions } from '../types.js' +import type { ProcessingOptions, WhisperModelType } from '../types.js' const execPromise = promisify(exec) @@ -21,29 +21,34 @@ export async function callWhisper(options: ProcessingOptions, finalPath: string) log(wait('\n Using Whisper for transcription...')) try { // Get the whisper model from options or use 'base' as default - const whisperModel = options.whisper || 'base' - + let whisperModel = 'base' + if (typeof options.whisper === 'string') { + whisperModel = options.whisper + } else if (options.whisper !== true) { + throw new Error('Invalid whisper option') + } + if (!(whisperModel in WHISPER_MODELS)) { throw new Error(`Unknown model type: ${whisperModel}`) } // Get the model ggml file name - const modelGGMLName = WHISPER_MODELS[whisperModel] + const modelGGMLName = WHISPER_MODELS[whisperModel as WhisperModelType] log(wait(`\n - whisperModel: ${whisperModel}\n - modelGGMLName: ${modelGGMLName}`)) // Setup Whisper if (!existsSync('./whisper.cpp')) { - log(`\nNo whisper.cpp repo found, running git clone and make...\n`) + log(`\n No whisper.cpp repo found, running git clone and make...\n`) await execPromise('git clone https://github.com/ggerganov/whisper.cpp.git && make -C whisper.cpp && cp .github/whisper.Dockerfile whisper.cpp/Dockerfile') - log(`\nwhisper.cpp clone and make commands complete.\n`) + log(`\n - whisper.cpp clone and make commands complete.\n`) } // Ensure model is downloaded if (!existsSync(`./whisper.cpp/models/ggml-${whisperModel}.bin`)) { - log(wait(` Model not found, downloading...\n - ${whisperModel}\n`)) + log(wait(`\n Model not found, downloading...\n - ${whisperModel}\n`)) await execPromise(`bash ./whisper.cpp/models/download-ggml-model.sh ${whisperModel}`) - log(wait(' Model download completed, running transcription...\n')) + log(wait(' - Model download completed, running transcription...\n')) } // Run transcription diff --git a/src/transcription/whisperDocker.ts b/src/transcription/whisperDocker.ts index 22b2566..ac11458 100644 --- a/src/transcription/whisperDocker.ts +++ b/src/transcription/whisperDocker.ts @@ -6,7 +6,7 @@ import { promisify } from 'node:util' import { join } from 'node:path' import { WHISPER_MODELS } from '../models.js' import { log, wait } from '../models.js' -import type { ProcessingOptions } from '../types.js' +import type { ProcessingOptions, WhisperModelType } from '../types.js' const execPromise = promisify(exec) @@ -21,14 +21,19 @@ export async function callWhisperDocker(options: ProcessingOptions, finalPath: s log(wait('\n Using Whisper Docker for transcription...')) try { // Get the whisper model from options or use 'base' as default - const whisperModel = options.whisperDocker || 'base' + let whisperModel = 'base' + if (typeof 
options.whisperDocker === 'string') { + whisperModel = options.whisperDocker + } else if (options.whisperDocker !== true) { + throw new Error('Invalid whisperDocker option') + } if (!(whisperModel in WHISPER_MODELS)) { throw new Error(`Unknown model type: ${whisperModel}`) } // Get the model ggml file name - const modelGGMLName = WHISPER_MODELS[whisperModel] + const modelGGMLName = WHISPER_MODELS[whisperModel as WhisperModelType] const CONTAINER_NAME = 'autoshow-whisper-1' const modelPathContainer = `/app/models/${modelGGMLName}` From af5d34ddc1a88ee0eca3825e5cb33a4658794bd2 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Fri, 11 Oct 2024 03:05:14 -0500 Subject: [PATCH 03/10] update setup script, local tests, and deps --- package.json | 44 ++++++++++++------------ scripts/setup.sh | 85 +++++++++++++++++++++++++++++++--------------- test/local.test.js | 51 ++++++++++------------------ 3 files changed, 97 insertions(+), 83 deletions(-) diff --git a/package.json b/package.json index f3abd1e..8d4a65e 100644 --- a/package.json +++ b/package.json @@ -40,29 +40,29 @@ "deno-as": "deno run --allow-sys --allow-read --allow-run --allow-write --allow-env src/autoshow.ts" }, "dependencies": { - "@anthropic-ai/sdk": "^0.26.0", - "@deepgram/sdk": "^3.5.1", - "@fastify/cors": "^10.0.1", - "@google/generative-ai": "^0.17.1", - "@mistralai/mistralai": "^1.0.2", - "@octoai/sdk": "^1.5.1", - "assemblyai": "^4.6.1", - "chalk": "^5.3.0", - "cohere-ai": "^7.12.0", - "commander": "^12.1.0", - "fast-xml-parser": "^4.4.1", - "fastify": "^5.0.0", - "ffmpeg-static": "^5.2.0", - "file-type": "^19.4.1", - "inquirer": "^10.2.2", - "node-llama-cpp": "^3.1.0", - "ollama": "^0.5.9", - "openai": "^4.55.7" + "@anthropic-ai/sdk": "0.29.0", + "@deepgram/sdk": "3.8.1", + "@fastify/cors": "10.0.1", + "@google/generative-ai": "0.21.0", + "@mistralai/mistralai": "1.1.0", + "@octoai/sdk": "1.11.0", + "assemblyai": "4.7.1", + "chalk": "5.3.0", + "cohere-ai": "7.14.0", + "commander": "12.1.0", + "fast-xml-parser": "4.5.0", + "fastify": "5.0.0", + "ffmpeg-static": "5.2.0", + "file-type": "19.5.0", + "inquirer": "12.0.0", + "node-llama-cpp": "3.1.1", + "ollama": "0.5.9", + "openai": "4.67.3" }, "devDependencies": { - "@types/inquirer": "^9.0.7", - "@types/node": "^22.7.5", - "tsx": "^4.19.1", - "typescript": "^5.6.2" + "@types/inquirer": "9.0.7", + "@types/node": "22.7.5", + "tsx": "4.19.1", + "typescript": "5.6.3" } } diff --git a/scripts/setup.sh b/scripts/setup.sh index 55022b4..3a4a1df 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -1,4 +1,5 @@ #!/bin/bash +# scripts/setup.sh # Function to check if a command exists command_exists() { @@ -21,45 +22,73 @@ else echo "yt-dlp is already installed." fi +# Function to check if Ollama server is running +check_ollama_server() { + if curl -s "http://127.0.0.1:11434" &> /dev/null; then + echo "Ollama server is already running." + else + echo "Ollama server is not running. Starting Ollama server..." + ollama serve > ollama.log 2>&1 & + OLLAMA_PID=$! + echo "Ollama server started with PID $OLLAMA_PID" + sleep 5 + fi +} + +# Function to check if a model is available, and pull it if not +check_and_pull_model() { + local model=$1 + if ollama list | grep -q "$model"; then + echo "Model $model is already available." + else + echo "Model $model is not available. Pulling the model..." + ollama pull "$model" + fi +} + # Check if Ollama is installed if ! 
command_exists ollama; then echo "Ollama is not installed, refer to installation instructions here:" echo "https://github.com/ollama/ollama" else echo "Ollama is installed." -fi - -# Check if Ollama server is running -if ! curl -s "http://127.0.0.1:11434" &> /dev/null; then - echo "Ollama server is not running. Starting Ollama server..." - ollama serve > ollama.log 2>&1 & - OLLAMA_PID=$! - echo "Ollama server started with PID $OLLAMA_PID" - sleep 5 -else - echo "Ollama server is already running." + + # Check if Ollama server is running + check_ollama_server + + # Check and pull required models + check_and_pull_model "llama3.2:1b" fi # Install npm dependencies npm i -# Clone whisper.cpp repository -git clone https://github.com/ggerganov/whisper.cpp.git - -# Download whisper models -bash ./whisper.cpp/models/download-ggml-model.sh base -bash ./whisper.cpp/models/download-ggml-model.sh large-v2 - -# Compile whisper.cpp -make -C whisper.cpp - -# Copy Dockerfile -cp .github/whisper.Dockerfile whisper.cpp/Dockerfile - -# Download Qwen 2.5 1.5B model for Llama.cpp -curl -L "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q6_k.gguf" -o "./src/llms/models/qwen2.5-1.5b-instruct-q6_k.gguf" +# Check if whisper.cpp directory exists +if [ -d "whisper.cpp" ]; then + echo "whisper.cpp directory already exists. Skipping clone and setup." +else + echo "Cloning whisper.cpp repository..." + git clone https://github.com/ggerganov/whisper.cpp.git + + # Download whisper models + echo "Downloading whisper models..." + bash ./whisper.cpp/models/download-ggml-model.sh base + + # Compile whisper.cpp + echo "Compiling whisper.cpp..." + make -C whisper.cpp + + # Copy Dockerfile + echo "Copying Dockerfile..." + cp .github/whisper.Dockerfile whisper.cpp/Dockerfile +fi -# Pull Llama 3.1 1B model using Ollama -ollama pull llama3.2:1b +# Check if Qwen 2.5 1.5B model exists +if [ -f "./src/llms/models/qwen2.5-1.5b-instruct-q6_k.gguf" ]; then + echo "Qwen 2.5 1.5B model already exists. Skipping download." +else + echo "Downloading Qwen 2.5 1.5B model..." + curl -L "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q6_k.gguf" -o "./src/llms/models/qwen2.5-1.5b-instruct-q6_k.gguf" +fi echo "Setup completed successfully!" 
\ No newline at end of file diff --git a/test/local.test.js b/test/local.test.js index 2d7a520..bc99858 100644 --- a/test/local.test.js +++ b/test/local.test.js @@ -37,68 +37,53 @@ const commands = [ newName: 'FILE_05.md' }, { - cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --llama', - expectedFile: '2024-09-24-ep0-fsjam-podcast-llama-shownotes.md', - newName: 'FILE_06.md' - }, - { - cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --ollama', + cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --ollama LLAMA_3_2_3B', expectedFile: '2024-09-24-ep0-fsjam-podcast-ollama-shownotes.md', - newName: 'FILE_07.md' + newName: 'FILE_06.md' }, { cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --whisper tiny', expectedFile: '2024-09-24-ep0-fsjam-podcast-prompt.md', - newName: 'FILE_08.md' + newName: 'FILE_07.md' }, + // { + // cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --whisperDocker tiny', + // expectedFile: '2024-09-24-ep0-fsjam-podcast-prompt.md', + // newName: 'FILE_08.md' + // }, { - cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --whisperDocker tiny', + cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --prompt titles summary mediumChapters takeaways questions', expectedFile: '2024-09-24-ep0-fsjam-podcast-prompt.md', newName: 'FILE_09.md' }, { - cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --prompt titles', - expectedFile: '2024-09-24-ep0-fsjam-podcast-prompt.md', + cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --prompt titles summary shortChapters takeaways questions --whisper tiny --ollama', + expectedFile: '2024-09-24-ep0-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_10.md' }, { - cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --prompt titles summary mediumChapters takeaways questions', - expectedFile: '2024-09-24-ep0-fsjam-podcast-prompt.md', - newName: 'FILE_11.md' - }, - { - cmd: 'npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --prompt titles summary shortChapters takeaways questions --whisper tiny --llama', - expectedFile: '2024-09-24-ep0-fsjam-podcast-llama-shownotes.md', - newName: 'FILE_12.md' - }, - { - cmd: 'npm run as -- --playlist "https://www.youtube.com/playlist?list=PLCVnrVv4KhXPz0SoAVu8Rc1emAdGPbSbr" --prompt titles --whisper tiny --llama', + cmd: 'npm run as -- --playlist "https://www.youtube.com/playlist?list=PLCVnrVv4KhXPz0SoAVu8Rc1emAdGPbSbr" --prompt titles --whisper tiny --ollama', expectedFiles: [ - { file: '2024-09-24-ep1-fsjam-podcast-llama-shownotes.md', newName: 'FILE_13A.md' }, - { file: '2024-09-24-ep0-fsjam-podcast-llama-shownotes.md', newName: 'FILE_13B.md' } + { file: '2024-09-24-ep1-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_11A.md' }, + { file: '2024-09-24-ep0-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_11B.md' } ] }, { cmd: 'npm run as -- --urls "content/example-urls.md" --prompt titles --whisper tiny --ollama', expectedFiles: [ - { file: '2024-09-24-ep1-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_14A.md' }, - { file: '2024-09-24-ep0-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_14B.md' } + { file: '2024-09-24-ep1-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_12A.md' }, + { file: '2024-09-24-ep0-fsjam-podcast-ollama-shownotes.md', newName: 'FILE_12B.md' } ] }, { cmd: 'npm run as -- --rss "https://ajcwebdev.substack.com/feed"', 
expectedFile: '2021-05-10-thoughts-on-lambda-school-layoffs-prompt.md', - newName: 'FILE_15.md' - }, - { - cmd: 'npm run as -- --rss "https://ajcwebdev.substack.com/feed" --item "https://api.substack.com/feed/podcast/36236609/fd1f1532d9842fe1178de1c920442541.mp3" --whisper tiny --llama --prompt titles summary longChapters takeaways questions', - expectedFile: '2021-05-10-thoughts-on-lambda-school-layoffs-llama-shownotes.md', - newName: 'FILE_16.md', + newName: 'FILE_13.md' }, { cmd: 'npm run as -- --rss "https://ajcwebdev.substack.com/feed" --info', expectedFile: 'rss_info.json', - newName: 'FILE_17_rss_info.json', + newName: 'FILE_14_rss_info.json', } ] From dacb7c07fe5dca1bce1b13a982852654b0a90406 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Sun, 13 Oct 2024 02:00:11 -0500 Subject: [PATCH 04/10] clean content directory script --- package.json | 1 + scripts/cleanContent.ts | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 scripts/cleanContent.ts diff --git a/package.json b/package.json index 8d4a65e..821bb2a 100644 --- a/package.json +++ b/package.json @@ -36,6 +36,7 @@ "t": "npm run test-local", "test-local": "node --test test/local.test.js", "test-all": "node --test test/all.test.js", + "clean": "tsx scripts/cleanContent.ts", "bun-as": "bun --env-file=.env --no-warnings src/autoshow.ts", "deno-as": "deno run --allow-sys --allow-read --allow-run --allow-write --allow-env src/autoshow.ts" }, diff --git a/scripts/cleanContent.ts b/scripts/cleanContent.ts new file mode 100644 index 0000000..7f3ef32 --- /dev/null +++ b/scripts/cleanContent.ts @@ -0,0 +1,26 @@ +// scripts/cleanContent.ts + +import { exec } from 'child_process' +import { promisify } from 'util' + +const execAsync = promisify(exec) + +async function cleanContent() { + try { + const { stdout, stderr } = await execAsync( + 'find content -type f -not \\( -name ".gitkeep" -o -name "audio.mp3" -o -name "example-urls.md" \\) -delete' + ) + if (stderr) { + console.error('Error:', stderr) + return + } + console.log('Files deleted successfully') + if (stdout) { + console.log('Output:', stdout) + } + } catch (error) { + console.error('Execution error:', error) + } +} + +cleanContent() \ No newline at end of file From e4c6fe5be3bd35cf1cdfc3f50e9b019e1985b4d5 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Sun, 13 Oct 2024 04:10:05 -0500 Subject: [PATCH 05/10] add whisper python version for turbo --- src/autoshow.ts | 1 + src/models.ts | 24 +++++++- src/transcription/whisperPython.ts | 90 ++++++++++++++++++++++++++++++ src/types.ts | 6 +- src/utils/runTranscription.ts | 87 ++++++++++------------------- 5 files changed, 145 insertions(+), 63 deletions(-) create mode 100644 src/transcription/whisperPython.ts diff --git a/src/autoshow.ts b/src/autoshow.ts index d374438..427c7bd 100644 --- a/src/autoshow.ts +++ b/src/autoshow.ts @@ -45,6 +45,7 @@ program .option('--info', 'Generate JSON file with RSS feed information instead of processing items') .option('--whisper [model]', 'Use Whisper.cpp for transcription with optional model specification') .option('--whisperDocker [model]', 'Use Whisper.cpp in Docker for transcription with optional model specification') + .option('--whisperPython [model]', 'Use openai-whisper for transcription with optional model specification') .option('--deepgram', 'Use Deepgram for transcription') .option('--assembly', 'Use AssemblyAI for transcription') .option('--speakerLabels', 
'Use speaker labels for AssemblyAI transcription') diff --git a/src/models.ts b/src/models.ts index 76b7492..5ea7929 100644 --- a/src/models.ts +++ b/src/models.ts @@ -15,13 +15,13 @@ export const log: typeof console.log = console.log export const ACTION_OPTIONS = ['video', 'playlist', 'urls', 'file', 'rss'] export const LLM_OPTIONS = ['chatgpt', 'claude', 'cohere', 'mistral', 'octo', 'llama', 'ollama', 'gemini'] -export const TRANSCRIPT_OPTIONS = ['whisper', 'whisperDocker', 'deepgram', 'assembly'] +export const TRANSCRIPT_OPTIONS = ['whisper', 'whisperDocker', 'whisperPython', 'deepgram', 'assembly'] /** - * Define available Whisper models + * Define available Whisper models for whisper.cpp * @type {Record} */ -export const WHISPER_MODELS: Record = { +export const WHISPER_MODELS: Record = { 'tiny': 'ggml-tiny.bin', 'tiny.en': 'ggml-tiny.en.bin', 'base': 'ggml-base.bin', @@ -34,6 +34,24 @@ export const WHISPER_MODELS: Record = { 'large-v2': 'ggml-large-v2.bin', } +/** + * Define available Whisper models for openai-whisper + * @type {Record} + */ +export const WHISPER_PYTHON_MODELS: Record = { + tiny: 'tiny', + 'tiny.en': 'tiny.en', + base: 'base', + 'base.en': 'base.en', + small: 'small', + 'small.en': 'small.en', + medium: 'medium', + 'medium.en': 'medium.en', + 'large-v1': 'large-v1', + 'large-v2': 'large-v2', + turbo: 'turbo', +} + /** * Map of ChatGPT model identifiers to their API names * @type {Record} diff --git a/src/transcription/whisperPython.ts b/src/transcription/whisperPython.ts new file mode 100644 index 0000000..8667f1e --- /dev/null +++ b/src/transcription/whisperPython.ts @@ -0,0 +1,90 @@ +// src/transcription/whisperPython.ts + +import { readFile, writeFile, unlink } from 'node:fs/promises' +import { exec } from 'node:child_process' +import { promisify } from 'node:util' +import { log, wait } from '../models.js' +import type { ProcessingOptions } from '../types.js' +import { WHISPER_PYTHON_MODELS } from '../models.js' + +const execPromise = promisify(exec) + +/** + * Main function to handle transcription using openai-whisper Python library. + * @param {ProcessingOptions} options - Additional processing options. + * @param {string} finalPath - The base path for the files. + * @returns {Promise} - Returns the formatted transcript content. + * @throws {Error} - If an error occurs during transcription. 
+ */ +export async function callWhisperPython(options: ProcessingOptions, finalPath: string): Promise { + log(wait('\n Using openai-whisper Python library for transcription...')) + + try { + // Get the whisper model from options or use 'base' as default + let whisperModel: string = 'base' + if (typeof options.whisperPython === 'string') { + whisperModel = options.whisperPython + } else if (options.whisperPython !== true) { + throw new Error('Invalid whisperPython option') + } + + if (!(whisperModel in WHISPER_PYTHON_MODELS)) { + throw new Error(`Unknown model type: ${whisperModel}`) + } + + log(wait(`\n - whisperModel: ${whisperModel}`)) + + // Check if ffmpeg is installed + try { + await execPromise('ffmpeg -version') + } catch (error) { + throw new Error('ffmpeg is not installed or not available in PATH') + } + + // Check if Python is installed + try { + await execPromise('python3 --version') + } catch (error) { + throw new Error('Python is not installed or not available in PATH') + } + + // Check if the openai-whisper package is installed + try { + // await execPromise('python3 -c "import whisper"') + await execPromise('which whisper') + } catch (error) { + log(wait('\n openai-whisper not found, installing...')) + // await execPromise('pip install -U openai-whisper') + await execPromise('brew install openai-whisper') + log(wait(' - openai-whisper installed')) + } + + // Prepare the command to run the transcription + const command = `whisper "${finalPath}.wav" --model ${whisperModel} --output_dir "content" --output_format vtt --language en --word_timestamps True` + + log(wait(`\n Running transcription with command:\n ${command}\n`)) + + // Execute the command + await execPromise(command) + + // Read the generated transcript file + const transcriptContent = await readFile(`${finalPath}.vtt`, 'utf8') + + // Write the transcript to the expected output file + await writeFile(`${finalPath}.txt`, transcriptContent) + + // Create an empty LRC file to prevent cleanup errors and unlink VTT file + await writeFile(`${finalPath}.lrc`, '') + log(wait(`\n Empty LRC file created:\n - ${finalPath}.lrc\n`)) + await unlink(`${finalPath}.vtt`) + log(wait(`\n VTT file deleted:\n - ${finalPath}.vtt\n`)) + + log(wait(`\n Transcript successfully completed:\n - ${finalPath}.txt\n`)) + + return transcriptContent + + } catch (error) { + console.error('Error in callWhisperPython:', (error as Error).message) + process.exit(1) + } +} diff --git a/src/types.ts b/src/types.ts index 4357970..27d79af 100644 --- a/src/types.ts +++ b/src/types.ts @@ -26,6 +26,8 @@ export type ProcessingOptions = { noCleanUp?: boolean /** The Whisper model to use (e.g., 'tiny', 'base'). */ whisper?: WhisperModelType + /** The Whisper Python model to use (e.g., 'tiny', 'base'). */ + whisperPython?: WhisperModelType /** The Whisper model to use with Docker (e.g., 'tiny', 'base'). */ whisperDocker?: WhisperModelType /** Flag to use Deepgram for transcription. */ @@ -231,7 +233,7 @@ export type SupportedFileType = 'wav' | 'mp3' | 'm4a' | 'aac' | 'ogg' | 'flac' | * - deepgram: Use Deepgram's transcription service. * - assembly: Use AssemblyAI's transcription service. */ -export type TranscriptServices = 'whisper' | 'whisperDocker' | 'deepgram' | 'assembly' +export type TranscriptServices = 'whisper' | 'whisperDocker' | 'whisperPython' | 'deepgram' | 'assembly' /** * Represents the available Whisper model types. 
@@ -247,7 +249,7 @@ export type TranscriptServices = 'whisper' | 'whisperDocker' | 'deepgram' | 'ass * - large-v1: Large multilingual model version 1. * - large-v2: Large multilingual model version 2. */ -export type WhisperModelType = 'tiny' | 'tiny.en' | 'base' | 'base.en' | 'small' | 'small.en' | 'medium' | 'medium.en' | 'large-v1' | 'large-v2' +export type WhisperModelType = 'tiny' | 'tiny.en' | 'base' | 'base.en' | 'small' | 'small.en' | 'medium' | 'medium.en' | 'large-v1' | 'large-v2' | 'turbo' /** * Represents the object containing the different prompts, their instructions to the LLM, and their expected example output. diff --git a/src/utils/runTranscription.ts b/src/utils/runTranscription.ts index dfde1cc..a5cf2dc 100644 --- a/src/utils/runTranscription.ts +++ b/src/utils/runTranscription.ts @@ -1,75 +1,46 @@ // src/utils/runTranscription.ts -import { readFile, writeFile } from 'node:fs/promises' import { callWhisper } from '../transcription/whisper.js' +import { callWhisperPython } from '../transcription/whisperPython.js' import { callWhisperDocker } from '../transcription/whisperDocker.js' import { callDeepgram } from '../transcription/deepgram.js' import { callAssembly } from '../transcription/assembly.js' -import { log, step, success, wait } from '../models.js' -import type { TranscriptServices, ProcessingOptions } from '../types.js' +import { log, step } from '../models.js' +import { TranscriptServices, ProcessingOptions } from '../types.js' /** - * Main function to run transcription. + * Manages the transcription process based on the selected service. + * @param {ProcessingOptions} options - The processing options. + * * @param {TranscriptServices} transcriptServices - The transcription service to use. * @param {string} finalPath - The base path for the files. - * @param {string} frontMatter - Optional front matter content for the markdown file. - * @param {TranscriptServices} transcriptServices - The transcription service to use. - * @param {ProcessingOptions} [options] - Additional processing options. - * @returns {Promise} - Returns the final content including markdown and transcript. - * @throws {Error} - If the transcription service fails or an error occurs during processing. 
+ * @returns {Promise} */ export async function runTranscription( options: ProcessingOptions, finalPath: string, frontMatter: string, transcriptServices?: TranscriptServices -): Promise { - log(step(`\nStep 3 - Running transcription on audio file...`)) - try { - let txtContent: string - - // Choose the transcription service based on the provided option - switch (transcriptServices) { - case 'deepgram': - txtContent = await callDeepgram(options, finalPath) - break - - case 'assembly': - txtContent = await callAssembly(options, finalPath) - break - - case 'whisperDocker': - txtContent = await callWhisperDocker(options, finalPath) - break - - case 'whisper': - default: - txtContent = await callWhisper(options, finalPath) - break - } - - let mdContent = frontMatter - try { - // Attempt to read existing markdown content - const existingContent = await readFile(`${finalPath}.md`, 'utf8') - mdContent += existingContent - } catch (error) { - if ((error as NodeJS.ErrnoException).code !== 'ENOENT') { - console.error(`Error reading markdown file: ${(error as Error).message}`) - throw error - } - // If the file does not exist, proceed without appending - } - - // Combine existing markdown content with the transcript - const finalContent = `${mdContent}\n## Transcript\n\n${txtContent}` - - // Write final markdown file, including existing content and the new transcript - await writeFile(`${finalPath}.md`, finalContent) - log(success(` Markdown file successfully updated with transcript:\n - ${finalPath}.md`)) - - return finalContent - } catch (error) { - console.error(`Error in transcription process: ${(error as Error).message}`) - throw error +): Promise { + log(step(`\nStep 3 - Running transcription on audio file using ${transcriptServices}...`)) + + // Choose the transcription service based on the provided option + switch (transcriptServices) { + case 'deepgram': + await callDeepgram(options, finalPath) + break + case 'assembly': + await callAssembly(options, finalPath) + break + case 'whisper': + await callWhisper(options, finalPath) + break + case 'whisperDocker': + await callWhisperDocker(options, finalPath) + break + case 'whisperPython': + await callWhisperPython(options, finalPath) + break + default: + throw new Error(`Unknown transcription service: ${transcriptServices}`) } } \ No newline at end of file From 1c61f0d64fdfa51a481c0c2f28e9137fe716753d Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Mon, 14 Oct 2024 21:58:47 -0500 Subject: [PATCH 06/10] add fireworks and together options --- docs/examples.md | 12 ++++++ src/autoshow.ts | 2 + src/llms/fireworks.ts | 88 +++++++++++++++++++++++++++++++++++++++++ src/llms/together.ts | 91 +++++++++++++++++++++++++++++++++++++++++++ src/models.ts | 2 +- src/types.ts | 6 ++- src/utils/runLLM.ts | 4 ++ 7 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 src/llms/fireworks.ts create mode 100644 src/llms/together.ts diff --git a/docs/examples.md b/docs/examples.md index cac96c9..fcb76e3 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -245,6 +245,18 @@ npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --octo NOUS_ npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --octo WIZARD_2_8X_22B ``` +### Fireworks + +```bash +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks +``` + +### Together + +```bash +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together +``` + ### Llama.cpp ```bash diff --git 
a/src/autoshow.ts b/src/autoshow.ts index 427c7bd..53be20e 100644 --- a/src/autoshow.ts +++ b/src/autoshow.ts @@ -54,6 +54,8 @@ program .option('--cohere [model]', 'Use Cohere for processing with optional model specification') .option('--mistral [model]', 'Use Mistral for processing') .option('--octo [model]', 'Use Octo for processing') + .option('--fireworks [model]', 'Use Fireworks AI for processing with optional model specification') + .option('--together [model]', 'Use Together AI for processing with optional model specification') .option('--llama [model]', 'Use Node Llama for processing with optional model specification') .option('--ollama [model]', 'Use Ollama for processing with optional model specification') .option('--gemini [model]', 'Use Gemini for processing with optional model specification') diff --git a/src/llms/fireworks.ts b/src/llms/fireworks.ts new file mode 100644 index 0000000..c7c2ed4 --- /dev/null +++ b/src/llms/fireworks.ts @@ -0,0 +1,88 @@ +// src/llms/fireworks.ts + +import { writeFile } from 'node:fs/promises' +import { env } from 'node:process' +import fetch from 'node-fetch' +import { log, wait } from '../models.js' +import type { LLMFunction } from '../types.js' + +/** + * Main function to call Fireworks AI API. + * @param promptAndTranscript - The combined prompt and transcript text to process. + * @param tempPath - The temporary file path to write the LLM output. + * @param model - The Fireworks model to use. + * @returns A Promise that resolves when the API call is complete. + * @throws {Error} - If an error occurs during the API call. + */ +export const callFireworks: LLMFunction = async ( + promptAndTranscript: string, + tempPath: string, + model: string = 'accounts/fireworks/models/llama-v3p2-3b-instruct' +): Promise => { + // Check if the FIREWORKS_API_KEY environment variable is set + if (!env.FIREWORKS_API_KEY) { + throw new Error('FIREWORKS_API_KEY environment variable is not set. 
Please set it to your Fireworks API key.') + } + + try { + log(wait(`\n Using Fireworks model:\n - ${model}`)) + + // Prepare the request body + const requestBody = { + model: "accounts/fireworks/models/llama-v3p2-3b-instruct", + messages: [ + { + role: 'user', + content: promptAndTranscript, + }, + ], + } + + // Make API call to Fireworks AI + const response = await fetch('https://api.fireworks.ai/inference/v1/chat/completions', { + method: 'POST', + headers: { + Authorization: `Bearer ${env.FIREWORKS_API_KEY}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify(requestBody), + }) + + // Check if the response is OK + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Fireworks API error: ${response.status} ${response.statusText} - ${errorText}`) + } + + const data = await response.json() + + // Extract the generated content + const content = data.choices[0]?.message?.content + const finishReason = data.choices[0]?.finish_reason + const usedModel = data.model + const usage = data.usage + + if (!content) { + throw new Error('No content generated from the Fireworks API') + } + + // Write the generated content to the specified output file + await writeFile(tempPath, content) + log(wait(`\n Fireworks response saved to ${tempPath}`)) + + // Log finish reason, used model, and token usage + log(wait(`\n Finish Reason: ${finishReason}\n Model Used: ${usedModel}`)) + if (usage) { + const { prompt_tokens, completion_tokens, total_tokens } = usage + log( + wait( + ` Token Usage:\n - ${prompt_tokens} prompt tokens\n - ${completion_tokens} completion tokens\n - ${total_tokens} total tokens` + ) + ) + } + } catch (error) { + // Log any errors that occur during the process + console.error(`Error in callFireworks: ${(error as Error).message}`) + throw error // Re-throw the error for handling by the caller + } +} \ No newline at end of file diff --git a/src/llms/together.ts b/src/llms/together.ts new file mode 100644 index 0000000..d7e4142 --- /dev/null +++ b/src/llms/together.ts @@ -0,0 +1,91 @@ +// src/llms/together.ts + +import { writeFile } from 'node:fs/promises' +import { env } from 'node:process' +import fetch from 'node-fetch' +import { log, wait } from '../models.js' +import type { LLMFunction } from '../types.js' + +/** + * Main function to call Together AI API. + * @param promptAndTranscript - The combined prompt and transcript text to process. + * @param tempPath - The temporary file path to write the LLM output. + * @param model - The Together AI model to use. + * @returns A Promise that resolves when the API call is complete. + * @throws {Error} - If an error occurs during the API call. + */ +export const callTogether: LLMFunction = async ( + promptAndTranscript: string, + tempPath: string, + model: string = 'meta-llama/Llama-3.2-3B-Instruct-Turbo' +): Promise => { + // Check if the TOGETHER_API_KEY environment variable is set + if (!env.TOGETHER_API_KEY) { + throw new Error('TOGETHER_API_KEY environment variable is not set. 
Please set it to your Together AI API key.') + } + + try { + log(wait(`\n Using Together AI model:\n - ${model}`)) + + // Prepare the request body + const requestBody = { + model: 'meta-llama/Llama-3.2-3B-Instruct-Turbo', + messages: [ + { + role: 'user', + content: promptAndTranscript, + }, + ], + max_tokens: 2000, + temperature: 0.7, + } + + // Make API call to Together AI + const response = await fetch('https://api.together.xyz/v1/chat/completions', { + method: 'POST', + headers: { + accept: 'application/json', + 'content-type': 'application/json', + authorization: `Bearer ${env.TOGETHER_API_KEY}`, + }, + body: JSON.stringify(requestBody), + }) + + // Check if the response is OK + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Together AI API error: ${response.status} ${response.statusText} - ${errorText}`) + } + + const data = await response.json() + + // Extract the generated content + const content = data.choices[0]?.message?.content + const finishReason = data.choices[0]?.finish_reason + const usedModel = data.model + const usage = data.usage + + if (!content) { + throw new Error('No content generated from the Together AI API') + } + + // Write the generated content to the specified output file + await writeFile(tempPath, content) + log(wait(`\n Together AI response saved to ${tempPath}`)) + + // Log finish reason, used model, and token usage + log(wait(`\n Finish Reason: ${finishReason}\n Model Used: ${usedModel}`)) + if (usage) { + const { prompt_tokens, completion_tokens, total_tokens } = usage + log( + wait( + ` Token Usage:\n - ${prompt_tokens} prompt tokens\n - ${completion_tokens} completion tokens\n - ${total_tokens} total tokens` + ) + ) + } + } catch (error) { + // Log any errors that occur during the process + console.error(`Error in callTogether: ${(error as Error).message}`) + throw error // Re-throw the error for handling by the caller + } +} \ No newline at end of file diff --git a/src/models.ts b/src/models.ts index 5ea7929..1684fce 100644 --- a/src/models.ts +++ b/src/models.ts @@ -14,7 +14,7 @@ export const final: ChalkInstance = chalk.bold.italic export const log: typeof console.log = console.log export const ACTION_OPTIONS = ['video', 'playlist', 'urls', 'file', 'rss'] -export const LLM_OPTIONS = ['chatgpt', 'claude', 'cohere', 'mistral', 'octo', 'llama', 'ollama', 'gemini'] +export const LLM_OPTIONS = ['chatgpt', 'claude', 'cohere', 'mistral', 'octo', 'llama', 'ollama', 'gemini', 'fireworks', 'together'] export const TRANSCRIPT_OPTIONS = ['whisper', 'whisperDocker', 'whisperPython', 'deepgram', 'assembly'] /** diff --git a/src/types.ts b/src/types.ts index 27d79af..2c68cd8 100644 --- a/src/types.ts +++ b/src/types.ts @@ -46,6 +46,10 @@ export type ProcessingOptions = { mistral?: string /** OctoAI model to use (e.g., 'LLAMA_3_1_8B'). */ octo?: string + /** Fireworks model to use (e.g., ''). */ + fireworks?: string + /** Together model to use (e.g., ''). */ + together?: string /** Ollama model to use for local inference (e.g., 'LLAMA_3_2_1B'). */ ollama?: string /** Llama model to use for local inference (e.g., 'LLAMA_3_1_8B'). */ @@ -273,7 +277,7 @@ export type PromptSection = { * - ollama: Use Ollama for processing. * - gemini: Use Google's Gemini models. 
*/ -export type LLMServices = 'chatgpt' | 'claude' | 'cohere' | 'mistral' | 'octo' | 'llama' | 'ollama' | 'gemini' +export type LLMServices = 'chatgpt' | 'claude' | 'cohere' | 'mistral' | 'octo' | 'llama' | 'ollama' | 'gemini' | 'fireworks' | 'together' /** * Represents the options for LLM processing. diff --git a/src/utils/runLLM.ts b/src/utils/runLLM.ts index 419b5a0..1514142 100644 --- a/src/utils/runLLM.ts +++ b/src/utils/runLLM.ts @@ -9,6 +9,8 @@ import { callGemini } from '../llms/gemini.js' import { callCohere } from '../llms/cohere.js' import { callMistral } from '../llms/mistral.js' import { callOcto } from '../llms/octo.js' +import { callFireworks } from '../llms/fireworks.js' +import { callTogether } from '../llms/together.js' import { generatePrompt } from '../llms/prompt.js' import { log, step, success, wait } from '../models.js' import type { LLMServices, ProcessingOptions, LLMFunction, LLMFunctions } from '../types.js' @@ -38,6 +40,8 @@ export async function runLLM( cohere: callCohere, mistral: callMistral, octo: callOcto, + fireworks: callFireworks, + together: callTogether, } try { From 8e0bc4a820e4944a7425ad3f0c91882007485d80 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Mon, 14 Oct 2024 21:59:07 -0500 Subject: [PATCH 07/10] fix package scripts --- package.json | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/package.json b/package.json index 821bb2a..c1be6ee 100644 --- a/package.json +++ b/package.json @@ -24,18 +24,18 @@ "docker": "docker compose run --remove-orphans autoshow", "docker-up": "docker compose up --build -d --remove-orphans --no-start", "ds": "docker compose images && docker compose ls", - "v": "node --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --video", - "u": "node --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --urls", - "p": "node --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --playlist", - "f": "node --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --file", - "r": "node --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --rss", - "last3": "node --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --last 3 --rss", - "serve": "node --env-file=.env --no-warnings --watch packages/server/index.ts", - "fetch-local": "node --env-file=.env --no-warnings packages/server/tests/fetch-local.ts", - "fetch-all": "node --env-file=.env --no-warnings packages/server/tests/fetch-all.ts", + "v": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --video", + "u": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --urls", + "p": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --playlist", + "f": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --file", + "r": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --rss", + "last3": "tsx --env-file=.env --no-warnings src/autoshow.ts --whisper large-v2 --last 3 --rss", + "serve": "tsx --env-file=.env --no-warnings --watch packages/server/index.ts", + "fetch-local": "tsx --env-file=.env --no-warnings packages/server/tests/fetch-local.ts", + "fetch-all": "tsx --env-file=.env --no-warnings packages/server/tests/fetch-all.ts", "t": "npm run test-local", - "test-local": "node --test test/local.test.js", - "test-all": "node --test test/all.test.js", + "test-local": "tsx --test test/local.test.js", + "test-all": "tsx --test test/all.test.js", 
"clean": "tsx scripts/cleanContent.ts", "bun-as": "bun --env-file=.env --no-warnings src/autoshow.ts", "deno-as": "deno run --allow-sys --allow-read --allow-run --allow-write --allow-env src/autoshow.ts" From d44a5450a2e3121d2e21823a71ab0b12ccb816d5 Mon Sep 17 00:00:00 2001 From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com> Date: Mon, 14 Oct 2024 23:55:45 -0500 Subject: [PATCH 08/10] select fireworks and together model --- docs/examples.md | 24 ++++++++ src/llms/fireworks.ts | 13 ++--- src/llms/together.ts | 17 +++--- src/models.ts | 30 +++++++++- src/transcription/deepgram.ts | 36 +----------- src/types.ts | 104 ++++++++++++++++++++++++++++++++++ 6 files changed, 172 insertions(+), 52 deletions(-) diff --git a/docs/examples.md b/docs/examples.md index fcb76e3..64fec6d 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -251,12 +251,36 @@ npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --octo WIZAR npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks ``` +Select Fireworks model: + +```bash +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks LLAMA_3_1_405B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks LLAMA_3_1_70B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks LLAMA_3_1_8B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks LLAMA_3_2_3B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks LLAMA_3_2_1B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --fireworks QWEN_2_5_72B +``` + ### Together ```bash npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together ``` +Select Together model: + +```bash +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together LLAMA_3_2_3B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together LLAMA_3_1_405B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together LLAMA_3_1_70B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together LLAMA_3_1_8B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together GEMMA_2_27B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together GEMMA_2_9B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together QWEN_2_5_72B +npm run as -- --video "https://www.youtube.com/watch?v=MORMZXEaONk" --together QWEN_2_5_7B +``` + ### Llama.cpp ```bash diff --git a/src/llms/fireworks.ts b/src/llms/fireworks.ts index c7c2ed4..060d5df 100644 --- a/src/llms/fireworks.ts +++ b/src/llms/fireworks.ts @@ -2,9 +2,8 @@ import { writeFile } from 'node:fs/promises' import { env } from 'node:process' -import fetch from 'node-fetch' -import { log, wait } from '../models.js' -import type { LLMFunction } from '../types.js' +import { log, wait, FIREWORKS_MODELS } from '../models.js' +import type { LLMFunction, FireworksModelType, FireworksResponse } from '../types.js' /** * Main function to call Fireworks AI API. 
@@ -17,7 +16,7 @@ import type { LLMFunction } from '../types.js' export const callFireworks: LLMFunction = async ( promptAndTranscript: string, tempPath: string, - model: string = 'accounts/fireworks/models/llama-v3p2-3b-instruct' + model: string = 'LLAMA_3_2_3B' ): Promise => { // Check if the FIREWORKS_API_KEY environment variable is set if (!env.FIREWORKS_API_KEY) { @@ -25,11 +24,11 @@ export const callFireworks: LLMFunction = async ( } try { - log(wait(`\n Using Fireworks model:\n - ${model}`)) + const actualModel = FIREWORKS_MODELS[model as FireworksModelType] || FIREWORKS_MODELS.LLAMA_3_2_3B // Prepare the request body const requestBody = { - model: "accounts/fireworks/models/llama-v3p2-3b-instruct", + model: actualModel, messages: [ { role: 'user', @@ -54,7 +53,7 @@ export const callFireworks: LLMFunction = async ( throw new Error(`Fireworks API error: ${response.status} ${response.statusText} - ${errorText}`) } - const data = await response.json() + const data = await response.json() as FireworksResponse // Extract the generated content const content = data.choices[0]?.message?.content diff --git a/src/llms/together.ts b/src/llms/together.ts index d7e4142..dff3207 100644 --- a/src/llms/together.ts +++ b/src/llms/together.ts @@ -2,9 +2,8 @@ import { writeFile } from 'node:fs/promises' import { env } from 'node:process' -import fetch from 'node-fetch' -import { log, wait } from '../models.js' -import type { LLMFunction } from '../types.js' +import { log, wait, TOGETHER_MODELS } from '../models.js' +import type { LLMFunction, TogetherModelType, TogetherResponse } from '../types.js' /** * Main function to call Together AI API. @@ -17,7 +16,7 @@ import type { LLMFunction } from '../types.js' export const callTogether: LLMFunction = async ( promptAndTranscript: string, tempPath: string, - model: string = 'meta-llama/Llama-3.2-3B-Instruct-Turbo' + model: string = 'LLAMA_3_2_3B' ): Promise => { // Check if the TOGETHER_API_KEY environment variable is set if (!env.TOGETHER_API_KEY) { @@ -25,19 +24,19 @@ export const callTogether: LLMFunction = async ( } try { - log(wait(`\n Using Together AI model:\n - ${model}`)) + const actualModel = TOGETHER_MODELS[model as TogetherModelType] || TOGETHER_MODELS.LLAMA_3_2_3B // Prepare the request body const requestBody = { - model: 'meta-llama/Llama-3.2-3B-Instruct-Turbo', + model: actualModel, messages: [ { role: 'user', content: promptAndTranscript, }, ], - max_tokens: 2000, - temperature: 0.7, + // max_tokens: 2000, + // temperature: 0.7, } // Make API call to Together AI @@ -57,7 +56,7 @@ export const callTogether: LLMFunction = async ( throw new Error(`Together AI API error: ${response.status} ${response.statusText} - ${errorText}`) } - const data = await response.json() + const data = await response.json() as TogetherResponse // Extract the generated content const content = data.choices[0]?.message?.content diff --git a/src/models.ts b/src/models.ts index 1684fce..cc452ee 100644 --- a/src/models.ts +++ b/src/models.ts @@ -2,7 +2,7 @@ import chalk from 'chalk' import type { ChalkInstance } from 'chalk' -import type { WhisperModelType, ChatGPTModelType, ClaudeModelType, CohereModelType, GeminiModelType, MistralModelType, OctoModelType, LlamaModelType, OllamaModelType } from './types.js' +import type { WhisperModelType, ChatGPTModelType, ClaudeModelType, CohereModelType, GeminiModelType, MistralModelType, OctoModelType, LlamaModelType, OllamaModelType, TogetherModelType, FireworksModelType } from './types.js' export const step: ChalkInstance = 
chalk.bold.underline export const dim: ChalkInstance = chalk.dim @@ -118,6 +118,34 @@ export const OCTO_MODELS: Record = { WIZARD_2_8X_22B: "wizardlm-2-8x22b", } +/** + * Map of Fireworks model identifiers to their API names + * @type {Record} + */ +export const FIREWORKS_MODELS: Record = { + LLAMA_3_1_405B: "accounts/fireworks/models/llama-v3p1-405b-instruct", + LLAMA_3_1_70B: "accounts/fireworks/models/llama-v3p1-70b-instruct", + LLAMA_3_1_8B: "accounts/fireworks/models/llama-v3p1-8b-instruct", + LLAMA_3_2_3B: "accounts/fireworks/models/llama-v3p2-3b-instruct", + LLAMA_3_2_1B: "accounts/fireworks/models/llama-v3p2-1b-instruct", + QWEN_2_5_72B: "accounts/fireworks/models/qwen2p5-72b-instruct", +} + +/** + * Map of Together model identifiers to their API names + * @type {Record} + */ +export const TOGETHER_MODELS: Record = { + LLAMA_3_2_3B: "meta-llama/Llama-3.2-3B-Instruct-Turbo", + LLAMA_3_1_405B: "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", + LLAMA_3_1_70B: "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", + LLAMA_3_1_8B: "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", + GEMMA_2_27B: "google/gemma-2-27b-it", + GEMMA_2_9B: "google/gemma-2-9b-it", + QWEN_2_5_72B: "Qwen/Qwen2.5-72B-Instruct-Turbo", + QWEN_2_5_7B: "Qwen/Qwen2.5-7B-Instruct-Turbo", +} + /** * Map of local model identifiers to their filenames and URLs * @type {Record} diff --git a/src/transcription/deepgram.ts b/src/transcription/deepgram.ts index 0b7950f..14e5fd8 100644 --- a/src/transcription/deepgram.ts +++ b/src/transcription/deepgram.ts @@ -3,41 +3,7 @@ import { writeFile, readFile } from 'node:fs/promises' import { env } from 'node:process' import { log, wait } from '../models.js' -import type { ProcessingOptions } from '../types.js' - -// Define types for Deepgram API response -type DeepgramResponse = { - metadata: { - transaction_key: string - request_id: string - sha256: string - created: string - duration: number - channels: number - models: string[] - model_info: { - [key: string]: { - name: string - version: string - arch: string - } - } - } - results: { - channels: Array<{ - alternatives: Array<{ - transcript: string - confidence: number - words: Array<{ - word: string - start: number - end: number - confidence: number - }> - }> - }> - } -} +import type { ProcessingOptions, DeepgramResponse } from '../types.js' /** * Main function to handle transcription using Deepgram API. diff --git a/src/types.ts b/src/types.ts index 2c68cd8..23b856c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -326,11 +326,81 @@ export type GeminiModelType = 'GEMINI_1_5_FLASH' | 'GEMINI_1_5_PRO' export type MistralModelType = 'MIXTRAL_8x7b' | 'MIXTRAL_8x22b' | 'MISTRAL_LARGE' | 'MISTRAL_NEMO' /** Define available OctoAI models. */ export type OctoModelType = 'LLAMA_3_1_8B' | 'LLAMA_3_1_70B' | 'LLAMA_3_1_405B' | 'MISTRAL_7B' | 'MIXTRAL_8X_7B' | 'NOUS_HERMES_MIXTRAL_8X_7B' | 'WIZARD_2_8X_22B' +/** Define available Fireworks models. */ +export type FireworksModelType = 'LLAMA_3_1_405B' | 'LLAMA_3_1_70B' | 'LLAMA_3_1_8B' | 'LLAMA_3_2_3B' | 'LLAMA_3_2_1B' | 'QWEN_2_5_72B' +/** Define available Together models. */ +export type TogetherModelType = 'LLAMA_3_2_3B' | 'LLAMA_3_1_405B' | 'LLAMA_3_1_70B' | 'LLAMA_3_1_8B' | 'GEMMA_2_27B' | 'GEMMA_2_9B' | 'QWEN_2_5_72B' | 'QWEN_2_5_7B' /** Define local model configurations. */ export type LlamaModelType = 'QWEN_2_5_1B' | 'QWEN_2_5_3B' | 'PHI_3_5' | 'LLAMA_3_2_1B' | 'GEMMA_2_2B' /** Define local model with Ollama. 
diff --git a/src/types.ts b/src/types.ts
index 2c68cd8..23b856c 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -326,11 +326,81 @@ export type GeminiModelType = 'GEMINI_1_5_FLASH' | 'GEMINI_1_5_PRO'
 export type MistralModelType = 'MIXTRAL_8x7b' | 'MIXTRAL_8x22b' | 'MISTRAL_LARGE' | 'MISTRAL_NEMO'
 /** Define available OctoAI models. */
 export type OctoModelType = 'LLAMA_3_1_8B' | 'LLAMA_3_1_70B' | 'LLAMA_3_1_405B' | 'MISTRAL_7B' | 'MIXTRAL_8X_7B' | 'NOUS_HERMES_MIXTRAL_8X_7B' | 'WIZARD_2_8X_22B'
+/** Define available Fireworks models. */
+export type FireworksModelType = 'LLAMA_3_1_405B' | 'LLAMA_3_1_70B' | 'LLAMA_3_1_8B' | 'LLAMA_3_2_3B' | 'LLAMA_3_2_1B' | 'QWEN_2_5_72B'
+/** Define available Together models. */
+export type TogetherModelType = 'LLAMA_3_2_3B' | 'LLAMA_3_1_405B' | 'LLAMA_3_1_70B' | 'LLAMA_3_1_8B' | 'GEMMA_2_27B' | 'GEMMA_2_9B' | 'QWEN_2_5_72B' | 'QWEN_2_5_7B'
 /** Define local model configurations. */
 export type LlamaModelType = 'QWEN_2_5_1B' | 'QWEN_2_5_3B' | 'PHI_3_5' | 'LLAMA_3_2_1B' | 'GEMMA_2_2B'
 /** Define local model with Ollama. */
 export type OllamaModelType = 'LLAMA_3_2_1B' | 'LLAMA_3_2_3B' | 'GEMMA_2_2B' | 'PHI_3_5' | 'QWEN_2_5_1B' | 'QWEN_2_5_3B'
 
+export type FireworksResponse = {
+  id: string
+  object: string
+  created: number
+  model: string
+  prompt: any[]
+  choices: {
+    finish_reason: string
+    index: number
+    message: {
+      role: string
+      content: string
+      tool_calls: {
+        id: string
+        type: string
+        function: {
+          name: string
+          arguments: string
+        }
+      }[]
+    }
+  }[]
+  usage: {
+    prompt_tokens: number
+    completion_tokens: number
+    total_tokens: number
+  }
+}
+
+export type TogetherResponse = {
+  id: string
+  object: string
+  created: number
+  model: string
+  prompt: any[]
+  choices: {
+    text: string
+    finish_reason: string
+    seed: number
+    index: number
+    message: {
+      role: string
+      content: string
+      tool_calls: {
+        index: number
+        id: string
+        type: string
+        function: {
+          name: string
+          arguments: string
+        }
+      }[]
+    }
+    logprobs: {
+      token_ids: number[]
+      tokens: string[]
+      token_logprobs: number[]
+    }
+  }[]
+  usage: {
+    prompt_tokens: number
+    completion_tokens: number
+    total_tokens: number
+  }
+}
+
 // Define the expected structure of the response from Ollama API
 export type OllamaResponse = {
   model: string
@@ -367,6 +437,40 @@ export type OllamaTagsResponse = {
   }>
 }
 
+// Define types for Deepgram API response
+export type DeepgramResponse = {
+  metadata: {
+    transaction_key: string
+    request_id: string
+    sha256: string
+    created: string
+    duration: number
+    channels: number
+    models: string[]
+    model_info: {
+      [key: string]: {
+        name: string
+        version: string
+        arch: string
+      }
+    }
+  }
+  results: {
+    channels: Array<{
+      alternatives: Array<{
+        transcript: string
+        confidence: number
+        words: Array<{
+          word: string
+          start: number
+          end: number
+          confidence: number
+        }>
+      }>
+    }>
+  }
+}
+
 /**
  * Represents the function signature for cleaning up temporary files.
  */
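
The FireworksResponse and TogetherResponse shapes above make the chat-completion payloads self-describing. A rough sketch of how a caller might read one, assuming a parsed FireworksResponse; the summary helper is hypothetical, the only access the patch itself performs is data.choices[0]?.message?.content:

    import type { FireworksResponse } from '../types.js'

    // Illustrative sketch: pull the generated text and token accounting out of a response.
    function summarizeFireworksResponse(data: FireworksResponse): string {
      const content = data.choices[0]?.message?.content ?? ''
      const { prompt_tokens, completion_tokens, total_tokens } = data.usage
      return `${content}\n\n(tokens: ${prompt_tokens} prompt + ${completion_tokens} completion = ${total_tokens} total)`
    }
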
From 659557b383b9cb86a9b02afa9f548afba607ed9e Mon Sep 17 00:00:00 2001
From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com>
Date: Tue, 15 Oct 2024 03:12:16 -0500
Subject: [PATCH 09/10] add whisper diarization option

---
 .gitignore                              |  4 +-
 scripts/setup-python.sh                 | 17 +++++
 src/autoshow.ts                         |  1 +
 src/models.ts                           |  2 +-
 src/transcription/whisperDiarization.ts | 90 +++++++++++++++++++++++++
 src/transcription/whisperPython.ts      | 10 +--
 src/types.ts                            |  4 +-
 src/utils/runTranscription.ts           |  4 ++
 8 files changed, 124 insertions(+), 8 deletions(-)
 create mode 100755 scripts/setup-python.sh
 create mode 100644 src/transcription/whisperDiarization.ts

diff --git a/.gitignore b/.gitignore
index ccb1788..8720a32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@ node_modules
 .DS_Store
 /content
 whisper.cpp
+whisper-diarization
 package-lock.json
 .env
 src/llms/models
@@ -12,4 +13,5 @@ out
 types
 dist
 NEW.md
-TODO.md
\ No newline at end of file
+TODO.md
+nemo_msdd_configs
\ No newline at end of file
diff --git a/scripts/setup-python.sh b/scripts/setup-python.sh
new file mode 100755
index 0000000..634719b
--- /dev/null
+++ b/scripts/setup-python.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Clone the repository
+git clone https://github.com/MahmoudAshraf97/whisper-diarization.git
+
+# Create and activate virtual environment
+python3.12 -m venv whisper-diarization/venv
+source whisper-diarization/venv/bin/activate
+
+# Install the requirements
+pip install -c whisper-diarization/constraints.txt -r whisper-diarization/requirements.txt
+
+echo "Setup complete. To activate this environment in the future, run:"
+echo "source whisper-diarization/venv/bin/activate"
+echo ""
+echo "To deactivate this environment, run:"
+echo "deactivate"
\ No newline at end of file
diff --git a/src/autoshow.ts b/src/autoshow.ts
index 53be20e..625f68c 100644
--- a/src/autoshow.ts
+++ b/src/autoshow.ts
@@ -46,6 +46,7 @@ program
   .option('--whisper [model]', 'Use Whisper.cpp for transcription with optional model specification')
   .option('--whisperDocker [model]', 'Use Whisper.cpp in Docker for transcription with optional model specification')
   .option('--whisperPython [model]', 'Use openai-whisper for transcription with optional model specification')
+  .option('--whisperDiarization [model]', 'Use whisper-diarization for transcription with optional model specification')
   .option('--deepgram', 'Use Deepgram for transcription')
   .option('--assembly', 'Use AssemblyAI for transcription')
   .option('--speakerLabels', 'Use speaker labels for AssemblyAI transcription')
diff --git a/src/models.ts b/src/models.ts
index cc452ee..97942dc 100644
--- a/src/models.ts
+++ b/src/models.ts
@@ -15,7 +15,7 @@ export const log: typeof console.log = console.log
 
 export const ACTION_OPTIONS = ['video', 'playlist', 'urls', 'file', 'rss']
 export const LLM_OPTIONS = ['chatgpt', 'claude', 'cohere', 'mistral', 'octo', 'llama', 'ollama', 'gemini', 'fireworks', 'together']
-export const TRANSCRIPT_OPTIONS = ['whisper', 'whisperDocker', 'whisperPython', 'deepgram', 'assembly']
+export const TRANSCRIPT_OPTIONS = ['whisper', 'whisperDocker', 'whisperPython', 'whisperDiarization', 'deepgram', 'assembly']
 
 /**
  * Define available Whisper models for whisper.cpp
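
TRANSCRIPT_OPTIONS now advertises whisperDiarization alongside the existing services. A small sketch of validating a requested service against that list, assuming the imports resolve from a sibling module; the helper is hypothetical and not a function in the repository:

    import { TRANSCRIPT_OPTIONS } from '../models.js'
    import type { TranscriptServices } from '../types.js'

    // Illustrative sketch: narrow an arbitrary string to a known transcription service.
    function asTranscriptService(value: string): TranscriptServices {
      if (!TRANSCRIPT_OPTIONS.includes(value)) {
        throw new Error(`Unknown transcription service: ${value}`)
      }
      return value as TranscriptServices
    }
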
diff --git a/src/transcription/whisperDiarization.ts b/src/transcription/whisperDiarization.ts
new file mode 100644
index 0000000..0c78570
--- /dev/null
+++ b/src/transcription/whisperDiarization.ts
@@ -0,0 +1,90 @@
+// src/transcription/whisperDiarization.ts
+
+import { readFile, writeFile, unlink } from 'node:fs/promises'
+import { exec } from 'node:child_process'
+import { promisify } from 'node:util'
+// import { existsSync } from 'node:fs'
+import { log, wait } from '../models.js'
+import type { ProcessingOptions } from '../types.js'
+import { WHISPER_PYTHON_MODELS } from '../models.js'
+
+const execPromise = promisify(exec)
+
+/**
+ * Main function to handle transcription using the whisper-diarization Python project.
+ * @param {ProcessingOptions} options - Additional processing options.
+ * @param {string} finalPath - The base path for the files.
+ * @returns {Promise<string>} - Returns the formatted transcript content.
+ * @throws {Error} - If an error occurs during transcription.
+ */
+export async function callWhisperDiarization(options: ProcessingOptions, finalPath: string): Promise<string> {
+  log(wait('\n Using whisper-diarization for transcription...'))
+
+  try {
+    // Get the whisper model from options or use 'base' as default
+    let whisperModel: string = 'base'
+    if (typeof options.whisperDiarization === 'string') {
+      whisperModel = options.whisperDiarization
+    } else if (options.whisperDiarization !== true) {
+      throw new Error('Invalid whisperDiarization option')
+    }
+
+    if (!(whisperModel in WHISPER_PYTHON_MODELS)) {
+      throw new Error(`Unknown model type: ${whisperModel}`)
+    }
+
+    log(wait(`\n - whisperModel: ${whisperModel}`))
+
+    // // Check if ffmpeg is installed
+    // try {
+    //   await execPromise('ffmpeg -version')
+    // } catch (error) {
+    //   throw new Error('ffmpeg is not installed or not available in PATH')
+    // }
+
+    // // Check if Python is installed
+    // try {
+    //   await execPromise('python3 --version')
+    // } catch (error) {
+    //   throw new Error('Python is not installed or not available in PATH')
+    // }
+
+    // // Check if the whisper-diarization repo is cloned
+    // if (!existsSync('./whisper-diarization')) {
+    //   log(`\n No whisper-diarization repo found, running git clone...\n`)
+    //   await execPromise('git clone https://github.com/MahmoudAshraf97/whisper-diarization.git')
+    //   log(`\n - whisper-diarization clone complete.\n`)
+    // }
+
+    // Prepare the command to run the transcription
+    const command = `python whisper-diarization/diarize.py -a ${finalPath}.wav --whisper-model ${whisperModel}`
+
+    log(wait(`\n Running transcription with command:\n ${command}\n`))
+
+    // Execute the command
+    await execPromise(command)
+
+    await unlink(`${finalPath}.txt`)
+    log(wait(`\n Extra TXT file deleted:\n - ${finalPath}.txt\n`))
+
+    // Read the generated transcript file
+    const transcriptContent = await readFile(`${finalPath}.srt`, 'utf8')
+
+    // Write the transcript to the expected output file
+    await writeFile(`${finalPath}.txt`, transcriptContent)
+
+    // Create an empty LRC file to prevent cleanup errors and unlink SRT file
+    await writeFile(`${finalPath}.lrc`, '')
+    log(wait(`\n Empty LRC file created:\n - ${finalPath}.lrc\n`))
+    await unlink(`${finalPath}.srt`)
+    log(wait(`\n SRT file deleted:\n - ${finalPath}.srt\n`))
+
+    log(wait(`\n Transcript successfully completed:\n - ${finalPath}.txt\n`))
+
+    return transcriptContent
+
+  } catch (error) {
+    console.error('Error in callWhisperDiarization:', (error as Error).message)
+    process.exit(1)
+  }
+}
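
For reference, here is roughly what the template literal in callWhisperDiarization produces; the base path and model below are made-up sample values:

    // Illustrative sketch: sample inputs for the command built above.
    const finalPath = 'content/2024-10-15-episode'  // hypothetical base path
    const whisperModel = 'medium'                    // hypothetical model choice
    const command = `python whisper-diarization/diarize.py -a ${finalPath}.wav --whisper-model ${whisperModel}`
    // -> python whisper-diarization/diarize.py -a content/2024-10-15-episode.wav --whisper-model medium
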
diff --git a/src/transcription/whisperPython.ts b/src/transcription/whisperPython.ts
index 8667f1e..aa3de8b 100644
--- a/src/transcription/whisperPython.ts
+++ b/src/transcription/whisperPython.ts
@@ -60,7 +60,7 @@ export async function callWhisperPython(options: ProcessingOptions, finalPath: s
     }
 
     // Prepare the command to run the transcription
-    const command = `whisper "${finalPath}.wav" --model ${whisperModel} --output_dir "content" --output_format vtt --language en --word_timestamps True`
+    const command = `whisper "${finalPath}.wav" --model ${whisperModel} --output_dir "content" --output_format srt --language en --word_timestamps True`
 
     log(wait(`\n Running transcription with command:\n ${command}\n`))
 
@@ -68,16 +68,16 @@ export async function callWhisperPython(options: ProcessingOptions, finalPath: s
     await execPromise(command)
 
     // Read the generated transcript file
-    const transcriptContent = await readFile(`${finalPath}.vtt`, 'utf8')
+    const transcriptContent = await readFile(`${finalPath}.srt`, 'utf8')
 
     // Write the transcript to the expected output file
     await writeFile(`${finalPath}.txt`, transcriptContent)
 
-    // Create an empty LRC file to prevent cleanup errors and unlink VTT file
+    // Create an empty LRC file to prevent cleanup errors and unlink SRT file
     await writeFile(`${finalPath}.lrc`, '')
     log(wait(`\n Empty LRC file created:\n - ${finalPath}.lrc\n`))
-    await unlink(`${finalPath}.vtt`)
-    log(wait(`\n VTT file deleted:\n - ${finalPath}.vtt\n`))
+    await unlink(`${finalPath}.srt`)
+    log(wait(`\n SRT file deleted:\n - ${finalPath}.srt\n`))
 
     log(wait(`\n Transcript successfully completed:\n - ${finalPath}.txt\n`))
 
diff --git a/src/types.ts b/src/types.ts
index 23b856c..fc2ab85 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -28,6 +28,8 @@ export type ProcessingOptions = {
   whisper?: WhisperModelType
   /** The Whisper Python model to use (e.g., 'tiny', 'base'). */
   whisperPython?: WhisperModelType
+  /** The Whisper Diarization model to use (e.g., 'tiny', 'base'). */
+  whisperDiarization?: WhisperModelType
   /** The Whisper model to use with Docker (e.g., 'tiny', 'base'). */
   whisperDocker?: WhisperModelType
   /** Flag to use Deepgram for transcription. */
@@ -237,7 +239,7 @@ export type SupportedFileType = 'wav' | 'mp3' | 'm4a' | 'aac' | 'ogg' | 'flac' |
  * - deepgram: Use Deepgram's transcription service.
  * - assembly: Use AssemblyAI's transcription service.
  */
-export type TranscriptServices = 'whisper' | 'whisperDocker' | 'whisperPython' | 'deepgram' | 'assembly'
+export type TranscriptServices = 'whisper' | 'whisperDocker' | 'whisperPython' | 'whisperDiarization' | 'deepgram' | 'assembly'
 
 /**
  * Represents the available Whisper model types.
diff --git a/src/utils/runTranscription.ts b/src/utils/runTranscription.ts
index a5cf2dc..21099de 100644
--- a/src/utils/runTranscription.ts
+++ b/src/utils/runTranscription.ts
@@ -3,6 +3,7 @@
 import { callWhisper } from '../transcription/whisper.js'
 import { callWhisperPython } from '../transcription/whisperPython.js'
 import { callWhisperDocker } from '../transcription/whisperDocker.js'
+import { callWhisperDiarization } from '../transcription/whisperDiarization.js'
 import { callDeepgram } from '../transcription/deepgram.js'
 import { callAssembly } from '../transcription/assembly.js'
 import { log, step } from '../models.js'
@@ -40,6 +41,9 @@ export async function runTranscription(
       case 'whisperPython':
         await callWhisperPython(options, finalPath)
         break
+      case 'whisperDiarization':
+        await callWhisperDiarization(options, finalPath)
+        break
       default:
         throw new Error(`Unknown transcription service: ${transcriptServices}`)
     }
From 9c8b6e025e0a3b1a8de7b77eb0584b831a1c4272 Mon Sep 17 00:00:00 2001
From: Anthony Campolo <12433465+ajcwebdev@users.noreply.github.com>
Date: Tue, 15 Oct 2024 03:28:51 -0500
Subject: [PATCH 10/10] format srt files

---
 src/transcription/whisperDiarization.ts | 49 +++++++++++++++++++------
 src/transcription/whisperPython.ts      | 46 ++++++++++++++++++-----
 2 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/src/transcription/whisperDiarization.ts b/src/transcription/whisperDiarization.ts
index 0c78570..0b42c4b 100644
--- a/src/transcription/whisperDiarization.ts
+++ b/src/transcription/whisperDiarization.ts
@@ -64,24 +64,49 @@ export async function callWhisperDiarization(options: ProcessingOptions, finalPa
     // Execute the command
     await execPromise(command)
 
-    await unlink(`${finalPath}.txt`)
-    log(wait(`\n Extra TXT file deleted:\n - ${finalPath}.txt\n`))
-
-    // Read the generated transcript file
-    const transcriptContent = await readFile(`${finalPath}.srt`, 'utf8')
-
-    // Write the transcript to the expected output file
-    await writeFile(`${finalPath}.txt`, transcriptContent)
-
+    // Read the generated transcript file
+    const srtContent = await readFile(`${finalPath}.srt`, 'utf8')
+
+    // Process and format the SRT content
+    const blocks = srtContent.split('\n\n')
+
+    const txtContent = blocks
+      .map(block => {
+        const lines = block.split('\n').filter(line => line.trim() !== '')
+        if (lines.length >= 2) {
+          // lines[0] is the sequence number
+          // lines[1] is the timestamp line
+          // lines[2...] are the subtitle text lines
+          const timestampLine = lines[1]
+          const textLines = lines.slice(2)
+          const match = timestampLine.match(/(\d{2}):(\d{2}):(\d{2}),\d{3}/)
+          if (match) {
+            const hours = parseInt(match[1], 10)
+            const minutes = parseInt(match[2], 10)
+            const seconds = match[3]
+            const totalMinutes = hours * 60 + minutes
+            const timestamp = `[${String(totalMinutes).padStart(2, '0')}:${seconds}]`
+            const text = textLines.join(' ')
+            return `${timestamp} ${text}`
+          }
+        }
+        return null
+      })
+      .filter(line => line !== null)
+      .join('\n')
+
+    // Write the formatted content to a text file
+    await writeFile(`${finalPath}.txt`, txtContent)
+    log(wait(`\n Transcript transformation successfully completed...\n - ${finalPath}.txt\n`))
+
     // Create an empty LRC file to prevent cleanup errors and unlink SRT file
     await writeFile(`${finalPath}.lrc`, '')
     log(wait(`\n Empty LRC file created:\n - ${finalPath}.lrc\n`))
     await unlink(`${finalPath}.srt`)
     log(wait(`\n SRT file deleted:\n - ${finalPath}.srt\n`))
-
-    log(wait(`\n Transcript successfully completed:\n - ${finalPath}.txt\n`))
-
-    return transcriptContent
+
+    // Return the processed content
+    return txtContent
 
   } catch (error) {
     console.error('Error in callWhisperDiarization:', (error as Error).message)
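
To make the SRT-to-text transformation concrete, here is what the .map() callback above produces for a single cue; the cue text is invented for illustration, the format is standard SRT:

    // Illustrative sketch: one SRT block in...
    const block = '12\n01:01:05,320 --> 01:01:08,110\nWelcome back to the show.'
    // ...and the line returned for it:
    //   '[61:05] Welcome back to the show.'
    // Hours are folded into minutes (1 h 1 min becomes 61) and milliseconds are dropped,
    // so each cue keeps a compact [MM:SS]-style prefix in the plain-text transcript.
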
diff --git a/src/transcription/whisperPython.ts b/src/transcription/whisperPython.ts
index aa3de8b..6472f3d 100644
--- a/src/transcription/whisperPython.ts
+++ b/src/transcription/whisperPython.ts
@@ -68,20 +68,48 @@ export async function callWhisperPython(options: ProcessingOptions, finalPath: s
     await execPromise(command)
 
     // Read the generated transcript file
-    const transcriptContent = await readFile(`${finalPath}.srt`, 'utf8')
-
-    // Write the transcript to the expected output file
-    await writeFile(`${finalPath}.txt`, transcriptContent)
-
+    const srtContent = await readFile(`${finalPath}.srt`, 'utf8')
+
+    // Process and format the SRT content
+    const blocks = srtContent.split('\n\n')
+
+    const txtContent = blocks
+      .map(block => {
+        const lines = block.split('\n').filter(line => line.trim() !== '')
+        if (lines.length >= 2) {
+          // lines[0] is the sequence number
+          // lines[1] is the timestamp line
+          // lines[2...] are the subtitle text lines
+          const timestampLine = lines[1]
+          const textLines = lines.slice(2)
+          const match = timestampLine.match(/(\d{2}):(\d{2}):(\d{2}),\d{3}/)
+          if (match) {
+            const hours = parseInt(match[1], 10)
+            const minutes = parseInt(match[2], 10)
+            const seconds = match[3]
+            const totalMinutes = hours * 60 + minutes
+            const timestamp = `[${String(totalMinutes).padStart(2, '0')}:${seconds}]`
+            const text = textLines.join(' ')
+            return `${timestamp} ${text}`
+          }
+        }
+        return null
+      })
+      .filter(line => line !== null)
+      .join('\n')
+
+    // Write the formatted content to a text file
+    await writeFile(`${finalPath}.txt`, txtContent)
+    log(wait(`\n Transcript transformation successfully completed...\n - ${finalPath}.txt\n`))
+
     // Create an empty LRC file to prevent cleanup errors and unlink SRT file
     await writeFile(`${finalPath}.lrc`, '')
     log(wait(`\n Empty LRC file created:\n - ${finalPath}.lrc\n`))
     await unlink(`${finalPath}.srt`)
     log(wait(`\n SRT file deleted:\n - ${finalPath}.srt\n`))
-
-    log(wait(`\n Transcript successfully completed:\n - ${finalPath}.txt\n`))
-
-    return transcriptContent
+
+    // Return the processed content
+    return txtContent
 
   } catch (error) {
     console.error('Error in callWhisperPython:', (error as Error).message)