Suggestion: Currently, the timing of the subtitles appearing is not accurate. #2

nangonghan · 2023-06-24T13:23:00Z

Hi there, thank you very much for releasing this version. Currently, I have also implemented a Node version based on your code. However, there is a problem that the subtitles appear before the speaker starts talking, which is quite troubling. I noticed that whisper_cpp itself has this problem, and someone has already implemented a Python version that seems to have fixed this issue.

Currently, the solution I can think of is to use ffmpeg's silence detection and dynamically adjust the timing of the SRT subtitles.

nangonghan · 2023-06-24T13:24:02Z

const fs = require("fs");
const path = require("path");
const { spawnSync } = require("child_process");
const { spawn } = require("child_process");
const os = require('os');
const util = require('util');
const { promisify } = require('util');
const exec = util.promisify(require("child_process").exec);
/**
 * Build the path of the WAV file that corresponds to a media file.
 *
 * Only the trailing extension is swapped; directory names that happen to
 * contain the same extension text are left alone. A path without an
 * extension simply gets ".wav" appended.
 *
 * @param {string} filePath - Path of the source media file.
 * @returns {string} The same path with its extension replaced by ".wav".
 */
function changeFileExtensionToWav(filePath) {
    const ext = path.extname(filePath);
    // Cut the extension off the END of the path. String#replace(ext, ...) would
    // rewrite the first occurrence instead (e.g. inside "clips.mp4/intro.mp4"),
    // and with an empty extension it would prepend ".wav" to the whole path.
    const stem = ext.length > 0 ? filePath.slice(0, filePath.length - ext.length) : filePath;
    return `${stem}.wav`;
}

/**
 * Ask the bundled `mediainfo` binary for the duration of a media file.
 *
 * The binary is invoked without a shell, so the path is passed as a single
 * argument and characters such as quotes or `$` in file names are harmless.
 *
 * @param {string} filePath - Path of the media file to inspect.
 * @returns {Promise<number>} The `Duration` value reported by mediainfo
 *   (callers treat it as seconds), or NaN when no track reports one.
 * @throws {Error} When mediainfo cannot be run, prints invalid JSON, or
 *   reports no tracks at all.
 */
async function getVideoDuration(filePath) {
    // Local require: the file-level imports only expose the shell-based `exec`.
    const { execFile } = require("child_process");
    const execFileAsync = util.promisify(execFile);

    const mediainfoPath = path.join(__dirname, 'bin', 'mediainfo');
    const { stdout } = await execFileAsync(mediainfoPath, ['--Output=JSON', filePath]);
    console.log(filePath)

    const mediaInfo = JSON.parse(stdout);
    const tracks = mediaInfo && mediaInfo.media ? mediaInfo.media.track : undefined;
    if (!Array.isArray(tracks) || tracks.length === 0) {
        throw new Error(`mediainfo reported no tracks for "${filePath}"`);
    }

    // Keep the historical preference for the second track, but fall back to any
    // other track that carries a usable Duration instead of failing on a
    // missing index.
    const candidates = [tracks[1], ...tracks];
    for (const track of candidates) {
        const duration = track ? parseFloat(track.Duration) : NaN;
        if (Number.isFinite(duration)) {
            return duration;
        }
    }
    return NaN;
}
/**
 * Convert a media file to a 16 kHz WAV file with the bundled ffmpeg, then
 * hand the result to `renameFile`.
 *
 * NOTE(review): when the input already ends in ".wav" the output path equals
 * the input path; ffmpeg presumably rejects that, so such inputs end up in
 * the failure branch. Confirm and handle separately if WAV inputs matter.
 *
 * @param {string} inputFilePath - Path of the source audio/video file.
 * @returns {Promise<string|null|undefined>} Whatever `renameFile` returns for
 *   the converted file, or null when ffmpeg did not exit with status 0.
 */
async function file_to_wav(inputFilePath) {
    const wavFilePath = changeFileExtensionToWav(inputFilePath);
    const ffmpegPath = path.join(__dirname, 'bin', "ffmpeg");

    // spawnSync blocks until ffmpeg exits and returns a plain object, so there
    // is nothing to await here.
    const result = spawnSync(ffmpegPath, [
        "-i",
        inputFilePath,
        "-ar",
        "16000",
        "-y",
        wavFilePath,
    ]);

    if (result.status !== 0) {
        console.error(`Task failed with status: ${result.status}`);
        // `error` is set when the process could not be started at all (status
        // is null then), which the status line alone does not explain.
        if (result.error) {
            console.error(result.error);
        }
        return null;
    }

    const newWavPath = await renameFile(wavFilePath);
    console.log(`change ${wavFilePath} to wav`)
    return newWavPath;
}
/**
 * Work out how far transcription has progressed from a "start --> end" range.
 *
 * @param {string|null} times - Text of a timestamp range, or null when none
 *   was found.
 * @param {number} totalDuraton - Value to return when no range is available.
 * @returns {number} The end of the range converted by `getTimeInSeconds`, or
 *   `totalDuraton` when `times` is null or holds no " --> " separator.
 */
function getLastedTime(times, totalDuraton) {
    const rangeSeparator = ' --> ';
    const hasRange = times !== null && times.includes(rangeSeparator);
    if (!hasRange) {
        return totalDuraton;
    }
    const rangeParts = times.split(rangeSeparator);
    return getTimeInSeconds(rangeParts[1]);
}

/**
 * Run the bundled whisper binary on a WAV file and report progress while it
 * transcribes.
 *
 * Progress is derived from the "[start --> end]" ranges the binary prints on
 * stdout: the end of the newest range is compared with `totalDuration`.
 *
 * @param {string} selectedModel - Key into `modelsMapping`.
 * @param {string} selectedLanguage - Key into `languagesMapping`.
 * @param {string} outputWavFilePath - Audio file handed to whisper via `-f`.
 * @param {number} totalDuration - Duration used as the 100 % mark.
 * @param {Date} startDate - Reference time for the elapsed-time display.
 * @returns {Promise<string>} `outputWavFilePath + ".srt"`, resolved once the
 *   process has closed (also on a non-zero exit code, which is only logged).
 *   Rejects when the process cannot be started.
 */
async function whisper_cpp(selectedModel, selectedLanguage, outputWavFilePath, totalDuration, startDate) {
    const mainPath = path.join(__dirname, 'bin', 'whisper');
    const modelPath = path.join(__dirname, 'model', `${modelsMapping[selectedModel]}.bin`);
    const selectedLanguageShortCut = languagesMapping[selectedLanguage];
    const fileName = getPathName(outputWavFilePath);

    // All CLI arguments are passed as explicit strings.
    // NOTE(review): "-ot 200" presumably is whisper.cpp's time offset in
    // milliseconds — confirm against the bundled binary's help output.
    const task = spawn(mainPath, ['-m', modelPath, '-l', selectedLanguageShortCut, '-pp', '-osrt', '-f', outputWavFilePath, '-ot', '200'], {
        stdio: ['pipe', 'pipe', 'pipe']
    });

    // stderr is piped but nobody reads it; keep it flowing so a full pipe
    // buffer can never stall the child process.
    task.stderr.resume();

    // Returns the text inside the first [...] pair of `str`, or null.
    function extractContentBetweenBrackets(str) {
        const regex = /\[(.*?)\]/;
        const result = str.match(regex);
        return result ? result[1] : null;
    }

    // Elapsed time since `startDate`, in minutes with two decimals.
    function getSpendTime(startDate) {
        const currentTime = new Date(); // current time
        const timeDiff = currentTime.getTime() - startDate.getTime();
        return `${Math.round(timeDiff / 1000 / 60 * 100) / 100} min`; // difference in minutes
    }

    task.stdout.on('data', (data) => {
        // A chunk may hold several lines, a partial line, or no trailing
        // newline, so search backwards for the newest complete timestamp range
        // instead of assuming it is the second-to-last entry.
        const lines = data.toString().split('\n');
        let times = null;
        for (let i = lines.length - 1; i >= 0 && times === null; i -= 1) {
            const candidate = extractContentBetweenBrackets(lines[i]);
            if (candidate !== null && candidate.includes(' --> ')) {
                times = candidate;
            }
        }
        if (times === null) {
            // Nothing to report; logging here would claim 100 % progress.
            return;
        }

        const lastedTime = getLastedTime(times, totalDuration)
        const progress = lastedTime / totalDuration * 100
        const spendTime = getSpendTime(startDate)
        console.log(fileName, "当前进度:", progress.toFixed(2), "%", "花费时间:", spendTime)
    })

    const srtFilePath =
        await new Promise((resolve, reject) => {
            // Without this handler a failed spawn (e.g. missing binary) would
            // surface as an uncaught 'error' event.
            task.on('error', reject);
            task.on('close', (code) => {
                if (code !== 0) {
                    console.error(`${fileName}: whisper exited with code ${code}`);
                }
                resolve(outputWavFilePath + ".srt");
            });
        });

    return srtFilePath;
}
/**
 * Convert a clock-style timestamp into seconds.
 *
 * Accepts "HH:MM", "HH:MM:SS" and "HH:MM:SS.mmm"; the fraction may be
 * separated by "." or by "," (the SRT convention). Hours are not limited to
 * 0-23, so recordings of a day or longer are handled too. Fractions are
 * truncated to millisecond precision.
 *
 * @param {string} timeString - Timestamp such as "00:01:02.500".
 * @returns {number} The timestamp in seconds, or NaN when it cannot be parsed.
 */
function getTimeInSeconds(timeString) {
    // Parsed by hand rather than through Date: a Date-based parse yields NaN for
    // hours >= 24 and for comma-separated milliseconds.
    const match = /^\s*(\d+):([0-5]?\d)(?::([0-5]?\d)(?:[.,](\d+))?)?\s*$/.exec(String(timeString));
    if (match === null) {
        return NaN;
    }

    const hours = Number(match[1]);
    const minutes = Number(match[2]);
    const seconds = match[3] === undefined ? 0 : Number(match[3]);
    const millis = match[4] === undefined ? 0 : Number(match[4].slice(0, 3).padEnd(3, '0'));

    // Sum in whole milliseconds first so the result matches an integer-ms / 1000
    // computation exactly.
    const totalMillis = ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis;
    return totalMillis / 1000;
}

/**
 * Convert an SRT file to FCPXML with the bundled `srt2fcpxml` binary.
 *
 * @param {string} srtFilePath - Path of the subtitle file to convert.
 * @param {number|string} frameRate - Value for `-fd`. Per the tool's notes:
 *   23.98, 24, 25, 29.97, 30, 50, 59.94 or 60 (tool default "25").
 * @param {number|string} width - Value for `-width` (tool default 1920).
 * @param {number|string} height - Value for `-height` (tool default 1080).
 * @returns {string|null} The SRT path with its ".srt" suffix replaced by
 *   ".fcpxml", or null when the tool did not exit with status 0.
 */
function srtToFcpxml(srtFilePath, frameRate, width, height) {
    const srt2fcpxmlPath = path.join(__dirname, 'bin', 'srt2fcpxml');

    // Command-line arguments are strings; convert the numeric settings
    // explicitly instead of relying on implicit conversion.
    const result = spawnSync(srt2fcpxmlPath, [
        "-fd",
        String(frameRate),
        "-width",
        String(width),
        "-height",
        String(height),
        "-srt",
        srtFilePath
    ]);

    if (result.status !== 0) {
        console.error(`srt2fcpxml failed with status: ${result.status}`);
        if (result.error) {
            console.error(result.error);
        }
        return null;
    }

    const srtSuffix = ".srt";
    if (srtFilePath.endsWith(srtSuffix)) {
        // Swap only the trailing extension; a plain replace() would rewrite the
        // first ".srt" it meets, which may sit in a directory name.
        return srtFilePath.slice(0, srtFilePath.length - srtSuffix.length) + ".fcpxml";
    }
    // Paths that do not end in ".srt" keep the previous behaviour.
    return srtFilePath.replace(srtSuffix, ".fcpxml");
}
// Promise-returning wrapper around fs.unlink.
const unlinkAsync = util.promisify(fs.unlink);

/**
 * Remove a file, reporting (but not propagating) any failure.
 *
 * @param {string} filePath - Path of the file to delete.
 * @returns {Promise<void>} Settles once the deletion attempt has finished;
 *   never rejects because of an unlink error.
 */
async function deleteFile(filePath) {
    await unlinkAsync(filePath).catch((reason) => {
        console.error(`Failed to delete file "${filePath}"`);
        console.error(reason);
    });
}

/**
 * Return the base name of a path without directory or extension.
 *
 * @param {string} filePath - Any file path.
 * @returns {string} The `name` component produced by `path.parse`.
 */
function getPathName(filePath) {
    const { name } = path.parse(filePath);
    return name;
}

/**
 * Tell whether a path ends in one of the supported audio/video extensions.
 * The comparison ignores letter case.
 *
 * @param {string} filePath - Path or file name to check.
 * @returns {boolean} True when the path ends with a supported extension.
 */
function isMediaFile(filePath) {
    const supportedExtensions = ['mp4', 'mov', 'avi', 'mp3', 'wav', 'flac', 'aac', 'm4a'];
    const extensionPattern = new RegExp(`\\.(${supportedExtensions.join('|')})$`, 'i');
    return extensionPattern.test(filePath);
}

// Promise-returning wrapper around fs.rename.
const renameAsync = promisify(fs.rename);

/**
 * Rename a file by dropping a trailing ".wav" from its name.
 *
 * Only the suffix is removed; a ".wav" that appears earlier in the file name
 * is kept. A name without that suffix is renamed onto itself.
 *
 * @param {string} filePath - Path of the file to rename.
 * @returns {Promise<string|undefined>} The new path, or undefined when the
 *   rename failed (the error is logged, not rethrown).
 */
async function renameFile(filePath) {
    try {
        // path.basename with a suffix strips it from the END of the name only,
        // unlike String#replace, which removes the first ".wav" it finds.
        const newFileName = path.basename(filePath, '.wav');
        const newFilePath = path.join(path.dirname(filePath), newFileName);

        await renameAsync(filePath, newFilePath);

        return newFilePath;
    } catch (err) {
        console.error(`Failed to rename file "${filePath}"`);
        console.error(err);
        return undefined;
    }
}

/**
 * Run the whole pipeline for one media file: convert it to WAV, transcribe it
 * with whisper, delete the temporary WAV and convert the subtitles to FCPXML.
 *
 * @param {string} filePathString - Path of the media file to process.
 * @param {Date} startDate - Batch start time, forwarded for progress output.
 * @returns {Promise<string|undefined>} "Done" after processing, or undefined
 *   when the path is not a supported media file.
 * @throws {Error} When the WAV conversion yields no file, or when a later step
 *   rejects.
 */
async function videoToSrtAndFcpxml(filePathString, startDate) {
    // Check before starting the timer so skipped files leave no dangling label.
    if (!isMediaFile(filePathString)) return

    console.time(filePathString); // start timing
    try {
        // change video or audio files into wave
        const wavFilePath = await file_to_wav(filePathString)
        if (!wavFilePath) {
            throw new Error(`Could not convert "${filePathString}" to WAV`);
        }

        const fileName = getPathName(filePathString)
        let srtFilePath;
        try {
            const waveDuration = await getVideoDuration(wavFilePath)
            console.log(fileName, "文件路径", filePathString)
            console.log(fileName, "总时长(mins)", Math.round(waveDuration / 60))
            srtFilePath = await whisper_cpp(selectedModel, selectedLanguage, wavFilePath, waveDuration, startDate)
        } finally {
            // Remove the temporary audio even when a step above failed.
            await deleteFile(wavFilePath)
        }

        // srtToFcpxml is synchronous; it returns null when the conversion fails.
        const fcpxmlPath = srtToFcpxml(srtFilePath, frameRate, width, height)
        if (fcpxmlPath === null) {
            console.error(`${fileName}: FCPXML conversion failed`);
        }
        console.log(fileName, "Done")
        return "Done"
    } finally {
        console.timeEnd(filePathString); // stop timing
    }
}

/**
 * Recursively list every file below a directory.
 *
 * @param {string} dirPath - Directory to walk.
 * @returns {Promise<string[]>} Paths of all files found, each built as
 *   `<parent>/<entry name>`. Rejects when a directory cannot be read.
 */
async function getAllFilesInFolder(dirPath) {
    const entries = await fs.promises.readdir(dirPath, { withFileTypes: true });

    // Files contribute their path directly; directories contribute a promise
    // for their own (already flattened) listing.
    const pending = [];
    for (const entry of entries) {
        const entryPath = `${dirPath}/${entry.name}`;
        pending.push(entry.isDirectory() ? getAllFilesInFolder(entryPath) : entryPath);
    }

    const resolved = await Promise.all(pending);
    return resolved.filter(Boolean).flat();
}
// Language names. Not referenced by any other code visible in this file; the
// same names are the keys of `languagesMapping` below.
const languages = ["Arabic", "Azerbaijani", "Armenian", "Albanian", "Afrikaans", "Amharic", "Assamese", "Bulgarian", "Bengali", "Breton", "Basque", "Bosnian", "Belarusian", "Bashkir", "Chinese", "Catalan", "Czech", "Croatian", "Dutch", "Danish", "English", "Estonian", "French", "Finnish", "Faroese", "German", "Greek", "Galician", "Georgian", "Gujarati", "Hindi", "Hebrew", "Hungarian", "Haitian creole", "Hawaiian", "Hausa", "Italian", "Indonesian", "Icelandic", "Japanese", "Javanese", "Korean", "Kannada", "Kazakh", "Khmer", "Lithuanian", "Latin", "Latvian", "Lao", "Luxembourgish", "Lingala", "Malay", "Maori", "Malayalam", "Macedonian", "Mongolian", "Marathi", "Maltese", "Myanmar", "Malagasy", "Norwegian", "Nepali", "Nynorsk", "Occitan", "Portuguese", "Polish", "Persian", "Punjabi", "Pashto", "Russian", "Romanian", "Spanish", "Swedish", "Slovak", "Serbian", "Slovenian", "Swahili", "Sinhala", "Shona", "Somali", "Sindhi", "Sanskrit", "Sundanese", "Turkish", "Tamil", "Thai", "Telugu", "Tajik", "Turkmen", "Tibetan", "Tagalog", "Tatar", "Ukrainian", "Urdu", "Uzbek", "Vietnamese", "Welsh", "Yoruba", "Yiddish"]
// Language name -> short code. whisper_cpp looks the selected language up here
// and passes the code to the whisper binary after "-l".
const languagesMapping = { "Arabic": "ar", "Azerbaijani": "az", "Armenian": "hy", "Albanian": "sq", "Afrikaans": "af", "Amharic": "am", "Assamese": "as", "Bulgarian": "bg", "Bengali": "bn", "Breton": "br", "Basque": "eu", "Bosnian": "bs", "Belarusian": "be", "Bashkir": "ba", "Chinese": "zh", "Catalan": "ca", "Czech": "cs", "Croatian": "hr", "Dutch": "nl", "Danish": "da", "English": "en", "Estonian": "et", "French": "fr", "Finnish": "fi", "Faroese": "fo", "German": "de", "Greek": "el", "Galician": "gl", "Georgian": "ka", "Gujarati": "gu", "Hindi": "hi", "Hebrew": "he", "Hungarian": "hu", "Haitian creole": "ht", "Hawaiian": "haw", "Hausa": "ha", "Italian": "it", "Indonesian": "id", "Icelandic": "is", "Japanese": "ja", "Javanese": "jw", "Korean": "ko", "Kannada": "kn", "Kazakh": "kk", "Khmer": "km", "Lithuanian": "lt", "Latin": "la", "Latvian": "lv", "Lao": "lo", "Luxembourgish": "lb", "Lingala": "ln", "Malay": "ms", "Maori": "mi", "Malayalam": "ml", "Macedonian": "mk", "Mongolian": "mn", "Marathi": "mr", "Maltese": "mt", "Myanmar": "my", "Malagasy": "mg", "Norwegian": "no", "Nepali": "ne", "Nynorsk": "nn", "Occitan": "oc", "Portuguese": "pt", "Polish": "pl", "Persian": "fa", "Punjabi": "pa", "Pashto": "ps", "Russian": "ru", "Romanian": "ro", "Spanish": "es", "Swedish": "sv", "Slovak": "sk", "Serbian": "sr", "Slovenian": "sl", "Swahili": "sw", "Sinhala": "si", "Shona": "sn", "Somali": "so", "Sindhi": "sd", "Sanskrit": "sa", "Sundanese": "su", "Turkish": "tr", "Tamil": "ta", "Thai": "th", "Telugu": "te", "Tajik": "tg", "Turkmen": "tk", "Tibetan": "bo", "Tagalog": "tl", "Tatar": "tt", "Ukrainian": "uk", "Urdu": "ur", "Uzbek": "uz", "Vietnamese": "vi", "Welsh": "cy", "Yoruba": "yo", "Yiddish": "yi" }
// Model names. Not referenced by any other code visible in this file; the same
// names are the keys of `modelsMapping` below.
const models = ["Large", "Medium", "Small", "Base", "Tiny"]
// Model name -> base name of the model file; whisper_cpp loads
// "model/<base name>.bin" next to this script.
const modelsMapping = { "Large": "ggml-large", "Medium": "ggml-medium", "Small": "ggml-small", "Base": "ggml-base", "Tiny": "ggml-tiny" }
// Model and language that videoToSrtAndFcpxml passes to whisper_cpp.
const selectedModel = "Base"
const selectedLanguage = "English"
// Output settings that videoToSrtAndFcpxml passes to srtToFcpxml.
var frameRate = 25;
var width = 1920;
var height = 1080;
/**
 * Create subtitles for every supported media file below a folder.
 *
 * Files are processed one after another. A failure on one file is logged and
 * the batch carries on with the next file.
 *
 * @param {string} folderPath - Root folder that is searched recursively.
 * @returns {Promise<void>} Resolves when every file has been attempted.
 *   Rejects only when the folder itself cannot be listed.
 */
async function changeMediaIntosubtitle(folderPath) {
    console.time('changeMediaIntosubtitle'); // start timing
    try {
        const startDate = new Date()
        const files = await getAllFilesInFolder(folderPath);
        const mediaFiles = files.filter(item => isMediaFile(item))
        console.log(mediaFiles)
        for (const item of mediaFiles) {
            console.log(item)
            try {
                // Deliberately awaited inside the loop: one transcription at a time.
                await videoToSrtAndFcpxml(item, startDate);
            } catch (err) {
                // Keep going; the remaining files are independent of this one.
                console.error(`Failed to process "${item}"`);
                console.error(err);
            }
        }
    } finally {
        console.timeEnd('changeMediaIntosubtitle'); // stop timing
    }
}

// Script entry point. The returned promise is handled explicitly so that a
// failure is reported and reflected in the exit code instead of surfacing as
// an unhandled rejection.
changeMediaIntosubtitle('/Users/xiaoxuanxuan/Desktop/code').catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

shaishaicookie · 2023-06-28T15:50:54Z

Hello nangong, I just tested whisper.cpp with a few audio clips again and discovered the issue of subtitles appearing before the actual voice. Thank you for reaching out, I will test the original Python version of Whisper later to see if the issue exists there as well. I'll also take a look at the implementation of whisper.cpp. Previously, I was directly using the executable file of whisper.cpp without checking the cpp code in detail. However, currently, this application still relies on the C++ version to run fast. For Macs (without cuda), whisper.cpp's processing speed is at least 5 times faster than the native Python version by OpenAI. Thank you for suggesting the use of ffmpeg for silence detection. I would like to see if adding some offset can align the subtitles perfectly with the audio. I will also research other methods. Once I figure it out, I will provide an update.

You mentioned that you are working on a Node.js implementation, and for the conversion of SRT files to editable subtitle style FCPXML files. In addition to the Swift code in this app, I have previously developed a CLI tool called srt2subtitles using Node.js (https://github.com/shaishaicookie/srt2subtitles-cli). I hope this tool can be helpful for your app.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Suggestion: Currently, the timing of the subtitles appearing is not accurate. #2

Suggestion: Currently, the timing of the subtitles appearing is not accurate. #2

nangonghan commented Jun 24, 2023

nangonghan commented Jun 24, 2023

shaishaicookie commented Jun 28, 2023

Suggestion: Currently, the timing of the subtitles appearing is not accurate. #2

Suggestion: Currently, the timing of the subtitles appearing is not accurate. #2

Comments

nangonghan commented Jun 24, 2023

nangonghan commented Jun 24, 2023

shaishaicookie commented Jun 28, 2023