Commit 9ffde6e
fix: image download and flow
chetan authored and chetan committed Oct 29, 2023
1 parent 72d4bd2 commit 9ffde6e
Showing 14 changed files with 111 additions and 93 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -6,4 +6,3 @@ basicaudio.mp3
 basicaudio.wav
 basicaudio.wav.srt
 shorts
-bg.mp3
Binary file added base.mp4 (not shown)
Binary file added bg.mp3 (not shown)
Binary file removed new.mp4 (not shown)
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -25,7 +25,7 @@
     "fs-extra": "^11.1.1",
     "googleapis": "^126.0.1",
     "langchain": "0.0.167",
-    "nodejs-whisper": "^0.1.4",
+    "nodejs-whisper": "^0.1.6",
     "openai": "^3.3.0",
     "shelljs": "^0.8.5"
   },
6 changes: 3 additions & 3 deletions src/Editing/getImageQuerys.ts
@@ -8,16 +8,16 @@ const configuration = new Configuration({
 })
 const openai = new OpenAIApi(configuration)
 
-export const getImageQuerys = async () => {
+export const getImageQuerys = async (subtitlesPath: string) => {
   // fs.readFile('/home/chetan/code/ts-content-gpt/basicaudio.wav.srt', 'utf8', async function (err, data) {
   //   if (err) throw err
 
   // })
 
-  const file = fs.readFileSync('/home/chetan/code/ts-content-gpt/basicaudio.wav.vtt', 'utf8')
+  const file = fs.readFileSync(subtitlesPath, 'utf8')
 
   const chatCompletion: any = await openai.createChatCompletion({
-    model: 'gpt-3.5-turbo',
+    model: 'gpt-4',
     messages: [{ role: 'system', content: tryy(file, 5) }],
   })
   console.log('chatCompletion: ', chatCompletion.data.choices[0].message.content)
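The function now receives the subtitles path from its caller instead of reading a hard-coded file, and queries gpt-4 rather than gpt-3.5-turbo. A minimal usage sketch under the new signature (the resolved path below is an assumption, not from this commit):

import path from 'path'
import { getImageQuerys } from './src/Editing/getImageQuerys'

// Assumed location of the Whisper-generated .vtt; pass whatever whisper() wrote.
const subtitlesPath = path.join(__dirname, 'basicaudio.wav.vtt')

getImageQuerys(subtitlesPath).then(queries => {
  // `queries` is the seconds -> image-query map consumed by downloadImages
  console.log(queries)
})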
2 changes: 1 addition & 1 deletion src/audio/elevenAudio.ts
@@ -10,7 +10,7 @@ export const createAudio = async ({
   language,
   stabilityValue = 0.2,
   similarityBoostValue = 0.1,
-  voice = 'VR6AewLTigWG4xSOukaG',
+  voice = 'wViXBPUzp2ZZixB1xQuM',
   outputFilePath = 'basicaudio.mp3',
 }: {
   script: string
2 changes: 1 addition & 1 deletion src/images/downloadImages.ts
@@ -13,7 +13,7 @@ export const downloadImages = async (queries: any) => {
 
   const url = data.photos[0].src.small
 
-  const newHeight = 200
+  const newHeight = 300
 
   // Use string.replace() to replace the height value
   const modifiedUrl = url.replace(/h=\d+/, `h=${newHeight}`)
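Raising newHeight from 200 to 300 requests a taller render from Pexels, whose thumbnail URLs carry the height as an h= query parameter that the regex rewrites in place. A quick sketch of that rewrite (the sample URL is illustrative):

// Pexels thumbnail URLs encode the requested height as an `h=` parameter.
const url = 'https://images.pexels.com/photos/12345/photo.jpeg?auto=compress&h=130&w=250'

const newHeight = 300
const modifiedUrl = url.replace(/h=\d+/, `h=${newHeight}`)

console.log(modifiedUrl)
// https://images.pexels.com/photos/12345/photo.jpeg?auto=compress&h=300&w=250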
21 changes: 12 additions & 9 deletions src/images/imagesProccessing.ts
@@ -3,9 +3,9 @@ import { PromptTemplate } from 'langchain/prompts'
 import ffmpeg from 'fluent-ffmpeg'
 import { LLMChain } from 'langchain/chains'
 import { imageTemp, tryy } from '../promptTemplates/image'
-import path from 'ffmpeg-static'
+// import path from 'ffmpeg-static'
 
-ffmpeg.setFfmpegPath(path!)
+// ffmpeg.setFfmpegPath(path!)
 
 import { Configuration, OpenAIApi } from 'openai'
 import fs from 'fs'
@@ -25,10 +25,9 @@ export const imageProccessing = async ({
   topic?: string
   queries: any
 }) => {
-  const inputVideoPath = '/home/chetan/code/ts-content-gpt/tryyyyyyyyy.mp4'
-  const inputImagePath = '/home/chetan/code/ts-content-gpt/oo.jpg'
-  const outputVideoPath = '/home/chetan/code/ts-content-gpt/yyooooooo.mp4'
-  const targetTimestamp = '00:00:10.000'
+  const inputVideoPath = '/Users/chetan/Developer/code/short-video-automation/shorts/test.mp4'
+
+  const outputVideoPath = '/Users/chetan/Developer/code/short-video-automation/final.mp4'
 
   interface IQuery {
     Query: string
@@ -39,7 +38,7 @@
   for (const key in queries) {
     let value = queries[key]
 
-    const newKey = `${Number(key)}-${Number(key) + 3}`
+    const newKey = `${Number(key)}-${Number(key) + 1}`
 
     const obj = {
       Query: value,
@@ -63,7 +62,9 @@
   let imgPath = ''
 
   queryArr.forEach((query: IQuery, index: any) => {
-    imgPath += ' -i ' + `/home/chetan/code/ts-content-gpt/${query.Query.split(' ').join('')}.jpg`
+    imgPath +=
+      ' -i ' +
+      `/Users/chetan/Developer/code/short-video-automation/${query.Query.split(' ').join('')}.jpg`
   })
 
   filter += `${imgPath} -filter_complex "[${prevIndex}:v][${currIndex}:v]overlay=(W-w)/2:(H-h)/2:enable='between(t,${startingTime.trim()},${endTime.trim()})'[v${currIndex}];[v${currIndex}]`
@@ -81,7 +82,7 @@
 
   // console.log('filter: ', filter)
 
-  const mainFilter = `${path} -i ${inputVideoPath} ${filter} -map "[v]" -map 0:a -c:v libx264 -c:a copy ${outputVideoPath} `
+  const mainFilter = `ffmpeg -i ${inputVideoPath} ${filter} -map "[v]" -map 0:a -c:v libx264 -c:a copy ${outputVideoPath} `
 
   console.log('mainFilter: ', mainFilter)
 
@@ -94,6 +95,8 @@
 
   console.log('Video processing done')
 
+  // return new Promise.all()
+
   // queryArr.forEach((query: IQuery, index: any) => {
   //   fs.unlink(`/home/chetan/code/ts-content-gpt/${query.Query.split(' ').join('')}.jpg`, err => {
   //     if (err) {
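Each query contributes one extra -i input plus one overlay node gated by enable='between(t,start,end)', and the shortened key span means each image now stays on screen for one second instead of three. A simplified sketch of how such a command is assembled (file names and timestamps are illustrative, not from this commit):

// Build an ffmpeg overlay chain: image N is composited centre-screen and
// only enabled inside its [start, end] window.
const images = [
  { path: 'happyperson.jpg', start: 10, end: 11 },
  { path: 'sadperson.jpg', start: 15, end: 16 },
]

let inputs = '-i base.mp4'
let chain = ''
let prev = '0:v'
images.forEach((img, i) => {
  const cur = i + 1
  inputs += ` -i ${img.path}`
  chain += `[${prev}][${cur}:v]overlay=(W-w)/2:(H-h)/2:enable='between(t,${img.start},${img.end})'[v${cur}];`
  prev = `v${cur}`
})

// Drop the trailing ';' and map the last overlay label as the video stream.
const cmd = `ffmpeg ${inputs} -filter_complex "${chain.slice(0, -1)}" -map "[v${images.length}]" -map 0:a -c:v libx264 -c:a copy out.mp4`
console.log(cmd)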
70 changes: 44 additions & 26 deletions src/index.ts
@@ -3,71 +3,89 @@ dotenv.config()
 import { createShortScript } from './videoScript'
 import { convertToWav, createAudio } from './audio/elevenAudio'
 import { whisper } from './transcript/transcribe'
+
+import fs from 'fs'
+import express from 'express'
+const app = express()
 import path from 'path'
 
 import { mergeAudio } from './video/video'
 
-import { uploadVideos } from './upoad/upload'
-import uploadFile from './upoad/azureUpload'
+// import { uploadVideos } from './upoad/upload'
+// import uploadFile from './upoad/azureUpload'
 import { imageProccessing } from './images/imagesProccessing'
 import { downloadImages } from './images/downloadImages'
 import { getImageQuerys } from './Editing/getImageQuerys'
 const inputFilePath = path.join(__dirname, '..', 'basicaudio.mp3')
 
 const outputFilePath = path.join(__dirname, '..', 'basicaudio.wav')
 
-const videoFilePath = path.join(__dirname, '..', 'new.mp4')
+const videoFilePath = path.join(__dirname, '..', 'base.mp4')
 
 const outputVideoFilePath = path.join(__dirname, '..', 'shorts', 'test.mp4')
 
 const generateYoutubeShort = async (language: string, topic: string) => {
   try {
-    const script = await createShortScript({ language: language, topic: topic })
+    // const script = await createShortScript({ language: language, topic: topic })
 
-    console.log('SCRIPT GENERATED: ', script)
+    const res = fs
+      .readFileSync('/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt', 'utf8')
+      .replace(',', '')
+      .replace('.', '')
 
-    if (!script) throw new Error('Script not generated')
+    console.log('RES: ', res)
 
-    await createAudio({ script, language, outputFilePath: inputFilePath })
+    // console.log('SCRIPT GENERATED: ', script)
 
-    console.log('AUDIO GENERATED SUCCESSFULLY', 'basicaudio.mp3')
+    // if (!script) throw new Error('Script not generated')
 
-    await convertToWav(inputFilePath, outputFilePath)
+    // await createAudio({ script, language, outputFilePath: inputFilePath })
 
-    await whisper(outputFilePath)
+    // console.log('AUDIO GENERATED SUCCESSFULLY', 'basicaudio.mp3')
 
-    // return
+    // await convertToWav(inputFilePath, outputFilePath)
+
+    // const currentDir = process.cwd()
+
+    // await whisper(outputFilePath)
+
+    // process.chdir(currentDir)
+    // // return
 
-    console.log('MERGING AUDIO AND VIDEO')
+    // console.log('MERGING AUDIO AND VIDEO')
 
-    await mergeAudio({
-      videoFilePath,
-      audioFilePath: outputFilePath,
-      outputVideoPath: outputVideoFilePath,
-    })
+    // await mergeAudio({
+    //   videoFilePath,
+    //   audioFilePath: outputFilePath,
+    //   outputVideoPath: outputVideoFilePath,
+    // })
 
     // return
 
-    const queries: any = await getImageQuerys()
+    // const queries: any = await getImageQuerys(
+    //   '/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt'
+    // )
 
-    if (!queries) throw new Error('Queries not generated')
+    // if (!queries) throw new Error('Queries not generated')
 
     // console.log('QUERIES: ', typeof queries)
 
-    await downloadImages(Object.values(queries))
+    // await downloadImages(Object.values(queries))
 
-    await imageProccessing({
-      language: '',
-      queries: queries,
-    })
+    // await imageProccessing({
+    //   language: '',
+    //   queries: queries,
    // })
 
     return
     // Upload to youtube
-    uploadFile('videos', Math.random() + 'new.mp4', outputVideoFilePath).catch(console.error)
+    // uploadFile('videos', Math.random() + 'new.mp4', outputVideoFilePath).catch(console.error)
   } catch (error) {
     console.log('Error in createShortScript: ', error)
   }
 }
 
-generateYoutubeShort('en', 'earth fact')
+generateYoutubeShort('en', 'earth fact').finally(() => {
+  console.log('DIR 2 ', process.cwd())
+})
+
+console.log('DIR', process.cwd())
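With most stages commented out, the entry point currently just re-reads an existing .vtt while the subtitle and overlay steps are debugged; read top to bottom, though, the file still encodes the whole pipeline. A condensed sketch with every stage enabled, using only the helpers and constants imported above:

// Condensed end-to-end flow; call shapes match the commented-out code above.
const runPipeline = async (language: string, topic: string) => {
  const script = await createShortScript({ language, topic })
  if (!script) throw new Error('Script not generated')

  await createAudio({ script, language, outputFilePath: inputFilePath }) // TTS narration
  await convertToWav(inputFilePath, outputFilePath)
  await whisper(outputFilePath) // writes the .vtt subtitles

  await mergeAudio({
    videoFilePath,
    audioFilePath: outputFilePath,
    outputVideoPath: outputVideoFilePath,
  })

  const queries: any = await getImageQuerys(path.join(__dirname, '..', 'basicaudio.wav.vtt'))
  if (!queries) throw new Error('Queries not generated')

  await downloadImages(Object.values(queries))
  await imageProccessing({ language: '', queries })
}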
27 changes: 12 additions & 15 deletions src/promptTemplates/image.ts
@@ -45,26 +45,23 @@ You will output in parsable JSON object , all these json will be in a array.

export const tryy = (timestamp: string, totalQuerys: any) => `
You are a shorts video editor. Your audience is people from 18 yo to 40yo. Your style of editing is pretty simple, you take the transcript of your short and put a very simple google image to illustrate the narrated sentances.
Create a script for a video editor who makes brief, interactive, and illustrative shorts for an audience aged 18-40.
Each google image is searched with a short query of two words maximum. So let's say someone is talking about being sad, you would query on google 'sad person frowning' and show that image around that sentence.
The task includes mapping timestamps of a video to relevant keywords or phrases (maximum two words) which can be used to search for fitting illustrations on Google Images. These queried images should serve as meaningful visual accompaniments to specific sentences in the video's transcript. For example, if a sentence mentions sadness, the corresponding image might be searched using the phrase 'frowning person'.
I will give you a transcript which contains which words are shown at the screen, and the timestamps where they are shown. Understand the transcript, and time images at timestamps and, write me the query for each image. For the image queries you have two choices: concrete objects, like 'cash', 'old table', and other objects, or people in situations like 'sad person', 'happy family', ect... Generate a maximum of <<NUMBER>> image queries equally distributed in the video.
Below are some parameters to guide the image query selections:
Avoid depicting shocking or nude / crude images, since your video will get demonetized. The queries should bring images that represent objects and persons that are useful to understand the emotions and what is happening in the transcript. The queries should describe OBJECTS or PERSONS. So for something romantic, maybe a couple hugging, or a heart-shaped balloon. For the image queries you have two choices: concrete objects, like 'cash', 'old table', and other objects, or people in situations like 'sad person', 'happy family', ect..
Queries should focus on tangible objects (e.g., 'cash', 'old table') or individuals in distinct situations (e.g., 'sad person', 'happy family').
Avoid anything that could be deemed explicit, shocking, or inappropriate as this may lead to video demonetization.
Avoid overly generic or abstract words in queries that may yield unsuitable images.
The designated image should accurately depict the ongoing narrative in the video and help viewers understand the context.
The total number of image queries should not exceed <<${totalQuerys}>> and should be evenly distributed throughout the video's duration.
Here is the specific transcript to work with: <<${timestamp}>>
The images should be an image representation of what is happening. Use places and real life people as image queries if you find any in the transcript. Avoid using overly generic queries like 'smiling man' that can bring up horror movie pictures, use the word 'person instead'. Instead, try to use more specific words that describe the action or emotion in the scene. Also, try to avoid queries that don't represent anything in images, such as abstract concepts, ideas, or feelings. MAKE SURE THAT THE QUERIES ARE VERY DESCRIPTIVE AND VISUAL AND CAN BE DRAWN AND NEVER USE WORDS THAT ONLY DESCRIBE AN ABSTRACT IDEA. NEVER USE ABSTRACT NOUNS IN THE QUERIES. ALWAYS USE REAL OBJECTS OR PERSONS IN THE QUERIES.
Please review the transcript and generate <<${totalQuerys}>> image queries corresponding to this content. The queries should be formatted in JSON object (Example: { "10": "happy person", "15": "sad person", ...}). The key should be the second part of the timestamp i.e., if timestamp is 00:00:10,000 --> 00:00:15,000, then key should be 10. Make sure that each query is unique and the total count is equal to <<${totalQuerys}>>."
Transcript:
Note: The emphasis here is on selecting image queries that are concrete, descriptive, visually representable, and relevant to the video's context. Abstract concepts or ideas should be converted into visual or tangible representations wherever possible.
<<${timestamp}>>
Every few transcript captions, find an image that can be shown. Really understand the context and emotions for the image to be good ! The queries should describe OBJECTS or PERSONS. Write it in a dictionary with timestamp to query format like { "1": "happy person", "3": "sad person", ...} . DON'T GENERATE A QUERY FOR EACH CAPTION. Generate <<${totalQuerys}>> image queries and time them accordingly in the video. NEVER use the same search query for multiple captions. Make sure that the timestamps make sense.
NEVER USE ABSTRACT NOUNS IN THE QUERIES. ALWAYS USE REAL OBJECTS OR PERSONS IN THE QUERIES.
In timestamp time will be like this : 00:00:01,530 here 01 us secounds , so you only have to return seconds part of timestamp , for example if timestamp is 00:00:10,000 --> 00:00:15,000 , you only have to return 10 , so i can show image after 10 seconds of video
Makr sure to return a parsable JSON object.
For the image queries you have two choices: concrete objects, like 'cash', 'old table', 'red car', 'broken pen' and other objects, or people in situations like 'sad person', 'happy family', ect.. Choose more objects than people.
The <<${totalQuerys}>> generated image queries and their timestamps, make sure to respect the number <<${totalQuerys}>>:
ONLY RETURN THE PARSABLE JSON OBJECTS IN THE RESPONSE. DO NOT RETURN ANYTHING ELSE.
`
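Whatever else changed between the two prompt revisions, the output contract is the same: a flat, parsable JSON object whose keys are a caption's starting second and whose values are short, concrete image queries. A sketch of consuming such a response (sample values are illustrative):

// Example shape of the model's reply, per the prompt's own example format.
const raw = '{ "10": "happy person", "15": "old table", "21": "red car" }'

const queries: Record<string, string> = JSON.parse(raw)

// Downstream code (imagesProccessing) treats each key as the second at which
// the queried image should appear.
for (const [second, query] of Object.entries(queries)) {
  console.log(`show "${query}" at ${second}s`)
}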
63 changes: 32 additions & 31 deletions src/video/video.ts
@@ -5,11 +5,6 @@ import ffmpegPath from 'ffmpeg-static'
 import ffmpegProb from 'ffprobe-static'
 import { exec } from 'child_process'
 
-if (ffmpegPath) {
-  ffmpeg.setFfprobePath(ffmpegProb.path)
-  ffmpeg.setFfmpegPath(ffmpegPath)
-}
-
 export const mergeAudio = async ({
   videoFilePath,
   audioFilePath,
@@ -62,46 +57,52 @@ export const mergeAudio = async ({
   const out = path.join(__dirname, '..', '..', 'tryyyyyyyyy.mp4')
   const subtitleStyle =
     "force_style='Alignment=6,FontName=Trebuchet,FontSize=18,PrimaryColour=&Hffffff&,OutlineColour=&H00000000&,MarginV=25'"
-  const backgroundAudiocommand = `${ffmpegPath} -i ${outputVideoPath} -i ${backgroundMusicFilePath} -filter_complex "[0:a]volume=1[a1];[1:a]volume=0.4[b1];[a1][b1]amix=inputs=2[aout]" -map 0:v -map "[aout]" -c:v copy -c:a aac -shortest ${out}`
+  const tiktokFilterWithSubtitles =
+    "scale=-1:1920:force_original_aspect_ratio=decrease,crop=1080:1920,subtitles=/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt:force_style='Alignment=10,FontName=Trebuchet,FontSize=18,PrimaryColour=&Hffffff&,OutlineColour=&H00000000&,MarginV=25'"
+  const sortVideFilterWithSubtitles =
+    "scale=-1:1920:force_original_aspect_ratio=decrease,crop=1080:1920,subtitles=/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt:force_style='Alignment=6,FontName=Trebuchet,FontSize=18,PrimaryColour=&Hffffff&,OutlineColour=&H00000000&,MarginV=25'"
+  const backgroundAudiocommand = `ffmpeg -i ${outputVideoPath} -i ${backgroundMusicFilePath} -filter_complex "[0:a]volume=1[a1];[1:a]volume=0.4[b1];[a1][b1]amix=inputs=2[aout]" -map 0:v -map "[aout]" -c:v copy -c:a aac -shortest ${out} `
 
   return new Promise((resolve, reject) => {
-    ffmpeg(videoFilePath)
+    // continue with the same part before
+
+    ffmpeg()
+      .input(videoFilePath)
       .inputOptions(`-t ${adjustedTrimDuration}`)
       .input(audioFilePath)
-
-      .videoFilter(videoFilter)
-      .audioFilter(audioFilter)
+      .input(backgroundMusicFilePath)
 
-      .outputOptions([
-        '-map',
-        '0:v',
-        '-map',
-        '1:a',
-        '-c:v libx264',
-        '-c:a aac',
-        '-vf',
-        `subtitles=${newSrtFilePath}:${subtitleStyle}`,
-        // '-apad',
-      ])
+      .videoFilter(tiktokFilterWithSubtitles)
+      .complexFilter([
+        {
+          filter: 'volume',
+          options: 1,
+          inputs: '1:a',
+          outputs: 'volumeAdjustedAudio',
+        },
+        {
+          filter: 'volume',
+          options: 0.1,
+          inputs: '2:a',
+          outputs: 'volumeAdjustedBGM',
+        },
+        {
+          filter: 'amix',
+          options: { inputs: 2, duration: 'shortest' },
+          inputs: ['volumeAdjustedAudio', 'volumeAdjustedBGM'],
+          outputs: 'amixed',
+        },
+      ])
+      .outputOptions(['-map', '0:v', '-map', '[amixed]', '-c:v libx264', '-c:a aac'])
      .output(outputVideoPath)
       .on('start', commandLine => {
         console.log('Spawned Ffmpeg with command: ' + commandLine)
       })
       .on('end', () => {
         console.log('Audio added to video complete!')
 
-        exec(backgroundAudiocommand, (error, stdout, stderr) => {
-          if (error) {
-            console.error('Error:', error)
-            reject(error)
-          }
-          resolve(stdout ? stdout : stderr)
-        })
+        resolve('done')
       })
       .on('error', err => {
-        console.error('Error during audio adding to video:', err.message)
+        console.error('Error during audio adding to video:', err)
         reject(err)
       })
       .run()
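The rewrite folds what was previously a second exec'd ffmpeg pass into the single fluent-ffmpeg pipeline: narration (input 1) and background music (input 2) each pass through a volume filter, then amix with duration 'shortest' produces the one [amixed] track that gets mapped. A usage sketch matching the call shape in src/index.ts (paths are illustrative):

import path from 'path'
import { mergeAudio } from './src/video/video'

// Illustrative paths; mergeAudio resolves 'done' once the subtitled,
// music-mixed vertical video has been written.
mergeAudio({
  videoFilePath: path.join(__dirname, 'base.mp4'),
  audioFilePath: path.join(__dirname, 'basicaudio.wav'),
  outputVideoPath: path.join(__dirname, 'shorts', 'test.mp4'),
})
  .then(() => console.log('merge finished'))
  .catch(console.error)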
2 changes: 1 addition & 1 deletion src/videoScript.ts
@@ -25,7 +25,7 @@ export const createShortScript = async ({ language, topic }: { language: string;
 
   if (!JSON.parse(res.text).script) throw new Error('Error in Script not generated')
 
-  console.log('Script: ', JSON.parse(res.text))
+  // console.log('Script: ', JSON.parse(res.text))
 
   return JSON.parse(res.text).script
 } catch (error) {
