Commit 9ffde6e
fix: image download and flow
chetan authored and chetan committed Oct 29, 2023
1 parent 72d4bd2 commit 9ffde6e
Showing 14 changed files with 111 additions and 93 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -6,4 +6,3 @@ basicaudio.mp3
 basicaudio.wav
 basicaudio.wav.srt
 shorts
-bg.mp3
Binary file added base.mp4 (not shown)
Binary file added bg.mp3 (not shown)
Binary file removed new.mp4 (not shown)
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -25,7 +25,7 @@
     "fs-extra": "^11.1.1",
     "googleapis": "^126.0.1",
     "langchain": "0.0.167",
-    "nodejs-whisper": "^0.1.4",
+    "nodejs-whisper": "^0.1.6",
     "openai": "^3.3.0",
     "shelljs": "^0.8.5"
   },
6 changes: 3 additions & 3 deletions src/Editing/getImageQuerys.ts
@@ -8,16 +8,16 @@ const configuration = new Configuration({
 })
 const openai = new OpenAIApi(configuration)
 
-export const getImageQuerys = async () => {
+export const getImageQuerys = async (subtitlesPath: string) => {
   // fs.readFile('/home/chetan/code/ts-content-gpt/basicaudio.wav.srt', 'utf8', async function (err, data) {
   //   if (err) throw err
 
   // })
 
-  const file = fs.readFileSync('/home/chetan/code/ts-content-gpt/basicaudio.wav.vtt', 'utf8')
+  const file = fs.readFileSync(subtitlesPath, 'utf8')
 
   const chatCompletion: any = await openai.createChatCompletion({
-    model: 'gpt-3.5-turbo',
+    model: 'gpt-4',
     messages: [{ role: 'system', content: tryy(file, 5) }],
   })
   console.log('chatCompletion: ', chatCompletion.data.choices[0].message.content)
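The function now receives the subtitles path from its caller instead of reading a hard-coded file, and queries gpt-4 rather than gpt-3.5-turbo. A minimal usage sketch under the new signature (the resolved path below is an assumption, not from this commit):

import path from 'path'
import { getImageQuerys } from './src/Editing/getImageQuerys'

// Assumed location of the Whisper-generated .vtt; pass whatever whisper() wrote.
const subtitlesPath = path.join(__dirname, 'basicaudio.wav.vtt')

getImageQuerys(subtitlesPath).then(queries => {
  // `queries` is the seconds -> image-query map consumed by downloadImages
  console.log(queries)
})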
2 changes: 1 addition & 1 deletion src/audio/elevenAudio.ts
@@ -10,7 +10,7 @@ export const createAudio = async ({
   language,
   stabilityValue = 0.2,
   similarityBoostValue = 0.1,
-  voice = 'VR6AewLTigWG4xSOukaG',
+  voice = 'wViXBPUzp2ZZixB1xQuM',
   outputFilePath = 'basicaudio.mp3',
 }: {
   script: string
2 changes: 1 addition & 1 deletion src/images/downloadImages.ts
@@ -13,7 +13,7 @@ export const downloadImages = async (queries: any) => {
 
   const url = data.photos[0].src.small
 
-  const newHeight = 200
+  const newHeight = 300
 
   // Use string.replace() to replace the height value
   const modifiedUrl = url.replace(/h=\d+/, `h=${newHeight}`)
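Raising newHeight from 200 to 300 requests a taller render from Pexels, whose thumbnail URLs carry the height as an h= query parameter that the regex rewrites in place. A quick sketch of that rewrite (the sample URL is illustrative):

// Pexels thumbnail URLs encode the requested height as an `h=` parameter.
const url = 'https://images.pexels.com/photos/12345/photo.jpeg?auto=compress&h=130&w=250'

const newHeight = 300
const modifiedUrl = url.replace(/h=\d+/, `h=${newHeight}`)

console.log(modifiedUrl)
// https://images.pexels.com/photos/12345/photo.jpeg?auto=compress&h=300&w=250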
21 changes: 12 additions & 9 deletions src/images/imagesProccessing.ts
@@ -3,9 +3,9 @@ import { PromptTemplate } from 'langchain/prompts'
 import ffmpeg from 'fluent-ffmpeg'
 import { LLMChain } from 'langchain/chains'
 import { imageTemp, tryy } from '../promptTemplates/image'
-import path from 'ffmpeg-static'
+// import path from 'ffmpeg-static'
 
-ffmpeg.setFfmpegPath(path!)
+// ffmpeg.setFfmpegPath(path!)
 
 import { Configuration, OpenAIApi } from 'openai'
 import fs from 'fs'
@@ -25,10 +25,9 @@ export const imageProccessing = async ({
   topic?: string
   queries: any
 }) => {
-  const inputVideoPath = '/home/chetan/code/ts-content-gpt/tryyyyyyyyy.mp4'
-  const inputImagePath = '/home/chetan/code/ts-content-gpt/oo.jpg'
-  const outputVideoPath = '/home/chetan/code/ts-content-gpt/yyooooooo.mp4'
-  const targetTimestamp = '00:00:10.000'
+  const inputVideoPath = '/Users/chetan/Developer/code/short-video-automation/shorts/test.mp4'
+
+  const outputVideoPath = '/Users/chetan/Developer/code/short-video-automation/final.mp4'
 
   interface IQuery {
     Query: string
@@ -39,7 +38,7 @@
   for (const key in queries) {
     let value = queries[key]
 
-    const newKey = `${Number(key)}-${Number(key) + 3}`
+    const newKey = `${Number(key)}-${Number(key) + 1}`
 
     const obj = {
       Query: value,
@@ -63,7 +62,9 @@
   let imgPath = ''
 
   queryArr.forEach((query: IQuery, index: any) => {
-    imgPath += ' -i ' + `/home/chetan/code/ts-content-gpt/${query.Query.split(' ').join('')}.jpg`
+    imgPath +=
+      ' -i ' +
+      `/Users/chetan/Developer/code/short-video-automation/${query.Query.split(' ').join('')}.jpg`
   })
 
   filter += `${imgPath} -filter_complex "[${prevIndex}:v][${currIndex}:v]overlay=(W-w)/2:(H-h)/2:enable='between(t,${startingTime.trim()},${endTime.trim()})'[v${currIndex}];[v${currIndex}]`
@@ -81,7 +82,7 @@
 
   // console.log('filter: ', filter)
 
-  const mainFilter = `${path} -i ${inputVideoPath} ${filter} -map "[v]" -map 0:a -c:v libx264 -c:a copy ${outputVideoPath} `
+  const mainFilter = `ffmpeg -i ${inputVideoPath} ${filter} -map "[v]" -map 0:a -c:v libx264 -c:a copy ${outputVideoPath} `
 
   console.log('mainFilter: ', mainFilter)
 
@@ -94,6 +95,8 @@
 
   console.log('Video processing done')
 
+  // return new Promise.all()
+
   // queryArr.forEach((query: IQuery, index: any) => {
   //   fs.unlink(`/home/chetan/code/ts-content-gpt/${query.Query.split(' ').join('')}.jpg`, err => {
   //     if (err) {
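Each query contributes one extra -i input plus one overlay node gated by enable='between(t,start,end)', and the shortened key span means each image now stays on screen for one second instead of three. A simplified sketch of how such a command is assembled (file names and timestamps are illustrative, not from this commit):

// Build an ffmpeg overlay chain: image N is composited centre-screen and
// only enabled inside its [start, end] window.
const images = [
  { path: 'happyperson.jpg', start: 10, end: 11 },
  { path: 'sadperson.jpg', start: 15, end: 16 },
]

let inputs = '-i base.mp4'
let chain = ''
let prev = '0:v'
images.forEach((img, i) => {
  const cur = i + 1
  inputs += ` -i ${img.path}`
  chain += `[${prev}][${cur}:v]overlay=(W-w)/2:(H-h)/2:enable='between(t,${img.start},${img.end})'[v${cur}];`
  prev = `v${cur}`
})

// Drop the trailing ';' and map the last overlay label as the video stream.
const cmd = `ffmpeg ${inputs} -filter_complex "${chain.slice(0, -1)}" -map "[v${images.length}]" -map 0:a -c:v libx264 -c:a copy out.mp4`
console.log(cmd)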
70 changes: 44 additions & 26 deletions src/index.ts
@@ -3,71 +3,89 @@ dotenv.config()
 import { createShortScript } from './videoScript'
 import { convertToWav, createAudio } from './audio/elevenAudio'
 import { whisper } from './transcript/transcribe'
+
+import fs from 'fs'
+import express from 'express'
+const app = express()
 import path from 'path'
 
 import { mergeAudio } from './video/video'
 
-import { uploadVideos } from './upoad/upload'
-import uploadFile from './upoad/azureUpload'
+// import { uploadVideos } from './upoad/upload'
+// import uploadFile from './upoad/azureUpload'
 import { imageProccessing } from './images/imagesProccessing'
 import { downloadImages } from './images/downloadImages'
 import { getImageQuerys } from './Editing/getImageQuerys'
 const inputFilePath = path.join(__dirname, '..', 'basicaudio.mp3')
 
 const outputFilePath = path.join(__dirname, '..', 'basicaudio.wav')
 
-const videoFilePath = path.join(__dirname, '..', 'new.mp4')
+const videoFilePath = path.join(__dirname, '..', 'base.mp4')
 
 const outputVideoFilePath = path.join(__dirname, '..', 'shorts', 'test.mp4')
 
 const generateYoutubeShort = async (language: string, topic: string) => {
   try {
-    const script = await createShortScript({ language: language, topic: topic })
+    // const script = await createShortScript({ language: language, topic: topic })
 
-    console.log('SCRIPT GENERATED: ', script)
+    const res = fs
+      .readFileSync('/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt', 'utf8')
+      .replace(',', '')
+      .replace('.', '')
 
-    if (!script) throw new Error('Script not generated')
+    console.log('RES: ', res)
 
-    await createAudio({ script, language, outputFilePath: inputFilePath })
+    // console.log('SCRIPT GENERATED: ', script)
 
-    console.log('AUDIO GENERATED SUCCESSFULLY', 'basicaudio.mp3')
+    // if (!script) throw new Error('Script not generated')
 
-    await convertToWav(inputFilePath, outputFilePath)
+    // await createAudio({ script, language, outputFilePath: inputFilePath })
 
-    await whisper(outputFilePath)
+    // console.log('AUDIO GENERATED SUCCESSFULLY', 'basicaudio.mp3')
 
-    // return
+    // await convertToWav(inputFilePath, outputFilePath)
+
+    // const currentDir = process.cwd()
+
+    // await whisper(outputFilePath)
+
+    // process.chdir(currentDir)
+    // // return
 
-    console.log('MERGING AUDIO AND VIDEO')
+    // console.log('MERGING AUDIO AND VIDEO')
 
-    await mergeAudio({
-      videoFilePath,
-      audioFilePath: outputFilePath,
-      outputVideoPath: outputVideoFilePath,
-    })
+    // await mergeAudio({
+    //   videoFilePath,
+    //   audioFilePath: outputFilePath,
+    //   outputVideoPath: outputVideoFilePath,
+    // })
 
     // return
 
-    const queries: any = await getImageQuerys()
+    // const queries: any = await getImageQuerys(
+    //   '/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt'
+    // )
 
-    if (!queries) throw new Error('Queries not generated')
+    // if (!queries) throw new Error('Queries not generated')
 
     // console.log('QUERIES: ', typeof queries)
 
-    await downloadImages(Object.values(queries))
+    // await downloadImages(Object.values(queries))
 
-    await imageProccessing({
-      language: '',
-      queries: queries,
-    })
+    // await imageProccessing({
+    //   language: '',
+    //   queries: queries,
    // })
 
     return
     // Upload to youtube
-    uploadFile('videos', Math.random() + 'new.mp4', outputVideoFilePath).catch(console.error)
+    // uploadFile('videos', Math.random() + 'new.mp4', outputVideoFilePath).catch(console.error)
   } catch (error) {
     console.log('Error in createShortScript: ', error)
   }
 }
 
-generateYoutubeShort('en', 'earth fact')
+generateYoutubeShort('en', 'earth fact').finally(() => {
+  console.log('DIR 2 ', process.cwd())
+})
+
+console.log('DIR', process.cwd())
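With most stages commented out, the entry point currently just re-reads an existing .vtt while the subtitle and overlay steps are debugged; read top to bottom, though, the file still encodes the whole pipeline. A condensed sketch with every stage enabled, using only the helpers and constants imported above:

// Condensed end-to-end flow; call shapes match the commented-out code above.
const runPipeline = async (language: string, topic: string) => {
  const script = await createShortScript({ language, topic })
  if (!script) throw new Error('Script not generated')

  await createAudio({ script, language, outputFilePath: inputFilePath }) // TTS narration
  await convertToWav(inputFilePath, outputFilePath)
  await whisper(outputFilePath) // writes the .vtt subtitles

  await mergeAudio({
    videoFilePath,
    audioFilePath: outputFilePath,
    outputVideoPath: outputVideoFilePath,
  })

  const queries: any = await getImageQuerys(path.join(__dirname, '..', 'basicaudio.wav.vtt'))
  if (!queries) throw new Error('Queries not generated')

  await downloadImages(Object.values(queries))
  await imageProccessing({ language: '', queries })
}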
27 changes: 12 additions & 15 deletions src/promptTemplates/image.ts
@@ -45,26 +45,23 @@ You will output in parsable JSON object , all these json will be in a array.

export const tryy = (timestamp: string, totalQuerys: any) => `
You are a shorts video editor. Your audience is people from 18 yo to 40yo. Your style of editing is pretty simple, you take the transcript of your short and put a very simple google image to illustrate the narrated sentances.
Create a script for a video editor who makes brief, interactive, and illustrative shorts for an audience aged 18-40.
Each google image is searched with a short query of two words maximum. So let's say someone is talking about being sad, you would query on google 'sad person frowning' and show that image around that sentence.
The task includes mapping timestamps of a video to relevant keywords or phrases (maximum two words) which can be used to search for fitting illustrations on Google Images. These queried images should serve as meaningful visual accompaniments to specific sentences in the video's transcript. For example, if a sentence mentions sadness, the corresponding image might be searched using the phrase 'frowning person'.
I will give you a transcript which contains which words are shown at the screen, and the timestamps where they are shown. Understand the transcript, and time images at timestamps and, write me the query for each image. For the image queries you have two choices: concrete objects, like 'cash', 'old table', and other objects, or people in situations like 'sad person', 'happy family', ect... Generate a maximum of <<NUMBER>> image queries equally distributed in the video.
Below are some parameters to guide the image query selections:
Avoid depicting shocking or nude / crude images, since your video will get demonetized. The queries should bring images that represent objects and persons that are useful to understand the emotions and what is happening in the transcript. The queries should describe OBJECTS or PERSONS. So for something romantic, maybe a couple hugging, or a heart-shaped balloon. For the image queries you have two choices: concrete objects, like 'cash', 'old table', and other objects, or people in situations like 'sad person', 'happy family', ect..
Queries should focus on tangible objects (e.g., 'cash', 'old table') or individuals in distinct situations (e.g., 'sad person', 'happy family').
Avoid anything that could be deemed explicit, shocking, or inappropriate as this may lead to video demonetization.
Avoid overly generic or abstract words in queries that may yield unsuitable images.
The designated image should accurately depict the ongoing narrative in the video and help viewers understand the context.
The total number of image queries should not exceed <<${totalQuerys}>> and should be evenly distributed throughout the video's duration.
Here is the specific transcript to work with: <<${timestamp}>>
The images should be an image representation of what is happening. Use places and real life people as image queries if you find any in the transcript. Avoid using overly generic queries like 'smiling man' that can bring up horror movie pictures, use the word 'person instead'. Instead, try to use more specific words that describe the action or emotion in the scene. Also, try to avoid queries that don't represent anything in images, such as abstract concepts, ideas, or feelings. MAKE SURE THAT THE QUERIES ARE VERY DESCRIPTIVE AND VISUAL AND CAN BE DRAWN AND NEVER USE WORDS THAT ONLY DESCRIBE AN ABSTRACT IDEA. NEVER USE ABSTRACT NOUNS IN THE QUERIES. ALWAYS USE REAL OBJECTS OR PERSONS IN THE QUERIES.
Please review the transcript and generate <<${totalQuerys}>> image queries corresponding to this content. The queries should be formatted in JSON object (Example: { "10": "happy person", "15": "sad person", ...}). The key should be the second part of the timestamp i.e., if timestamp is 00:00:10,000 --> 00:00:15,000, then key should be 10. Make sure that each query is unique and the total count is equal to <<${totalQuerys}>>."
Transcript:
Note: The emphasis here is on selecting image queries that are concrete, descriptive, visually representable, and relevant to the video's context. Abstract concepts or ideas should be converted into visual or tangible representations wherever possible.
<<${timestamp}>>
Every few transcript captions, find an image that can be shown. Really understand the context and emotions for the image to be good ! The queries should describe OBJECTS or PERSONS. Write it in a dictionary with timestamp to query format like { "1": "happy person", "3": "sad person", ...} . DON'T GENERATE A QUERY FOR EACH CAPTION. Generate <<${totalQuerys}>> image queries and time them accordingly in the video. NEVER use the same search query for multiple captions. Make sure that the timestamps make sense.
NEVER USE ABSTRACT NOUNS IN THE QUERIES. ALWAYS USE REAL OBJECTS OR PERSONS IN THE QUERIES.
In timestamp time will be like this : 00:00:01,530 here 01 us secounds , so you only have to return seconds part of timestamp , for example if timestamp is 00:00:10,000 --> 00:00:15,000 , you only have to return 10 , so i can show image after 10 seconds of video
Makr sure to return a parsable JSON object.
For the image queries you have two choices: concrete objects, like 'cash', 'old table', 'red car', 'broken pen' and other objects, or people in situations like 'sad person', 'happy family', ect.. Choose more objects than people.
The <<${totalQuerys}>> generated image queries and their timestamps, make sure to respect the number <<${totalQuerys}>>:
ONLY RETURN THE PARSABLE JSON OBJECTS IN THE RESPONSE. DO NOT RETURN ANYTHING ELSE.
`
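Whatever else changed between the two prompt revisions, the output contract is the same: a flat, parsable JSON object whose keys are a caption's starting second and whose values are short, concrete image queries. A sketch of consuming such a response (sample values are illustrative):

// Example shape of the model's reply, per the prompt's own example format.
const raw = '{ "10": "happy person", "15": "old table", "21": "red car" }'

const queries: Record<string, string> = JSON.parse(raw)

// Downstream code (imagesProccessing) treats each key as the second at which
// the queried image should appear.
for (const [second, query] of Object.entries(queries)) {
  console.log(`show "${query}" at ${second}s`)
}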
63 changes: 32 additions & 31 deletions src/video/video.ts
@@ -5,11 +5,6 @@ import ffmpegPath from 'ffmpeg-static'
 import ffmpegProb from 'ffprobe-static'
 import { exec } from 'child_process'
 
-if (ffmpegPath) {
-  ffmpeg.setFfprobePath(ffmpegProb.path)
-  ffmpeg.setFfmpegPath(ffmpegPath)
-}
-
 export const mergeAudio = async ({
   videoFilePath,
   audioFilePath,
@@ -62,46 +57,52 @@ export const mergeAudio = async ({
   const out = path.join(__dirname, '..', '..', 'tryyyyyyyyy.mp4')
   const subtitleStyle =
     "force_style='Alignment=6,FontName=Trebuchet,FontSize=18,PrimaryColour=&Hffffff&,OutlineColour=&H00000000&,MarginV=25'"
-  const backgroundAudiocommand = `${ffmpegPath} -i ${outputVideoPath} -i ${backgroundMusicFilePath} -filter_complex "[0:a]volume=1[a1];[1:a]volume=0.4[b1];[a1][b1]amix=inputs=2[aout]" -map 0:v -map "[aout]" -c:v copy -c:a aac -shortest ${out}`
+  const tiktokFilterWithSubtitles =
+    "scale=-1:1920:force_original_aspect_ratio=decrease,crop=1080:1920,subtitles=/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt:force_style='Alignment=10,FontName=Trebuchet,FontSize=18,PrimaryColour=&Hffffff&,OutlineColour=&H00000000&,MarginV=25'"
+  const sortVideFilterWithSubtitles =
+    "scale=-1:1920:force_original_aspect_ratio=decrease,crop=1080:1920,subtitles=/Users/chetan/Developer/code/short-video-automation/basicaudio.wav.vtt:force_style='Alignment=6,FontName=Trebuchet,FontSize=18,PrimaryColour=&Hffffff&,OutlineColour=&H00000000&,MarginV=25'"
+  const backgroundAudiocommand = `ffmpeg -i ${outputVideoPath} -i ${backgroundMusicFilePath} -filter_complex "[0:a]volume=1[a1];[1:a]volume=0.4[b1];[a1][b1]amix=inputs=2[aout]" -map 0:v -map "[aout]" -c:v copy -c:a aac -shortest ${out} `
 
   return new Promise((resolve, reject) => {
-    ffmpeg(videoFilePath)
+    // continue with the same part before
+
+    ffmpeg()
+      .input(videoFilePath)
       .inputOptions(`-t ${adjustedTrimDuration}`)
       .input(audioFilePath)
-
-      .videoFilter(videoFilter)
-      .audioFilter(audioFilter)
+      .input(backgroundMusicFilePath)
 
-      .outputOptions([
-        '-map',
-        '0:v',
-        '-map',
-        '1:a',
-        '-c:v libx264',
-        '-c:a aac',
-        '-vf',
-        `subtitles=${newSrtFilePath}:${subtitleStyle}`,
-        // '-apad',
-      ])
+      .videoFilter(tiktokFilterWithSubtitles)
+      .complexFilter([
+        {
+          filter: 'volume',
+          options: 1,
+          inputs: '1:a',
+          outputs: 'volumeAdjustedAudio',
+        },
+        {
+          filter: 'volume',
+          options: 0.1,
+          inputs: '2:a',
+          outputs: 'volumeAdjustedBGM',
+        },
+        {
+          filter: 'amix',
+          options: { inputs: 2, duration: 'shortest' },
+          inputs: ['volumeAdjustedAudio', 'volumeAdjustedBGM'],
+          outputs: 'amixed',
+        },
+      ])
+      .outputOptions(['-map', '0:v', '-map', '[amixed]', '-c:v libx264', '-c:a aac'])
      .output(outputVideoPath)
       .on('start', commandLine => {
         console.log('Spawned Ffmpeg with command: ' + commandLine)
       })
       .on('end', () => {
         console.log('Audio added to video complete!')
 
-        exec(backgroundAudiocommand, (error, stdout, stderr) => {
-          if (error) {
-            console.error('Error:', error)
-            reject(error)
-          }
-          resolve(stdout ? stdout : stderr)
-        })
+        resolve('done')
       })
       .on('error', err => {
-        console.error('Error during audio adding to video:', err.message)
+        console.error('Error during audio adding to video:', err)
         reject(err)
       })
       .run()
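The rewrite folds what was previously a second exec'd ffmpeg pass into the single fluent-ffmpeg pipeline: narration (input 1) and background music (input 2) each pass through a volume filter, then amix with duration 'shortest' produces the one [amixed] track that gets mapped. A usage sketch matching the call shape in src/index.ts (paths are illustrative):

import path from 'path'
import { mergeAudio } from './src/video/video'

// Illustrative paths; mergeAudio resolves 'done' once the subtitled,
// music-mixed vertical video has been written.
mergeAudio({
  videoFilePath: path.join(__dirname, 'base.mp4'),
  audioFilePath: path.join(__dirname, 'basicaudio.wav'),
  outputVideoPath: path.join(__dirname, 'shorts', 'test.mp4'),
})
  .then(() => console.log('merge finished'))
  .catch(console.error)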
2 changes: 1 addition & 1 deletion src/videoScript.ts
@@ -25,7 +25,7 @@ export const createShortScript = async ({ language, topic }: { language: string;
 
   if (!JSON.parse(res.text).script) throw new Error('Error in Script not generated')
 
-  console.log('Script: ', JSON.parse(res.text))
+  // console.log('Script: ', JSON.parse(res.text))
 
   return JSON.parse(res.text).script
 } catch (error) {
