From 6de86deea4896d4de8f022c67c17a85a4cc6185b Mon Sep 17 00:00:00 2001 From: Sevag H Date: Sun, 3 Mar 2024 14:57:16 -0500 Subject: [PATCH] Threaded inference (#10) --- .clang-format | 2 +- .github/SDR_scores.md | 50 ++++++ CMakeLists.txt | 16 +- README.md | 42 ++++- cli-apps/demucs_ft.cpp | 5 +- cli-apps/demucs_ft_mt.cpp | 286 ++++++++++++++++++++++++++++++++ cli-apps/demucs_mt.cpp | 229 +++++++++++++++++++++++++ cli-apps/threaded_inference.hpp | 190 +++++++++++++++++++++ src/crosstransformer.cpp | 21 +-- src/crosstransformer.hpp | 2 +- src/encdec.cpp | 8 +- src/encdec.hpp | 17 +- src/layers.cpp | 2 +- src/layers.hpp | 5 +- src/model.hpp | 7 +- src/model_apply.cpp | 43 +++-- src/model_inference.cpp | 2 +- 17 files changed, 864 insertions(+), 63 deletions(-) create mode 100644 cli-apps/demucs_ft_mt.cpp create mode 100644 cli-apps/demucs_mt.cpp create mode 100644 cli-apps/threaded_inference.hpp diff --git a/.clang-format b/.clang-format index 6af95b9..9e30a2f 100644 --- a/.clang-format +++ b/.clang-format @@ -3,4 +3,4 @@ IndentWidth: 4 BreakBeforeBraces: Allman AllowShortIfStatementsOnASingleLine: false IndentCaseLabels: false -ColumnLimit: 80 \ No newline at end of file +ColumnLimit: 80 diff --git a/.github/SDR_scores.md b/.github/SDR_scores.md index 5e3ee3a..c9e061e 100644 --- a/.github/SDR_scores.md +++ b/.github/SDR_scores.md @@ -59,3 +59,53 @@ drums ==> SDR: 10.463 SIR: 19.782 ISR: 17.144 SAR: 11.132 bass ==> SDR: 4.584 SIR: 9.359 ISR: 9.068 SAR: 4.885 other ==> SDR: 7.426 SIR: 12.793 ISR: 12.975 SAR: 7.830 ``` + +### Performance of multi-threaded inference + +Zeno - Signs, Demucs 4s multi-threaded using the same strategy used in . + +Optimal performance: `export OMP_NUM_THREADS=4` + 4 threads via cli args for a total of 16 physical cores on my 5950X. 
+ +This should be identical in SDR but still worth testing since multi-threaded large waveform segmentation may still impact demixing quality: +``` +vocals ==> SDR: 8.317 SIR: 18.089 ISR: 15.887 SAR: 8.391 +drums ==> SDR: 9.987 SIR: 18.579 ISR: 16.997 SAR: 10.755 +bass ==> SDR: 4.039 SIR: 12.531 ISR: 6.822 SAR: 3.090 +other ==> SDR: 7.405 SIR: 11.246 ISR: 14.186 SAR: 8.099 +``` + +Multi-threaded fine-tuned: +``` +``` + +### Time measurements + +Regular, big threads = 1, OMP threads = 16: +``` +real 10m23.201s +user 29m42.190s +sys 4m17.248s +``` + +Fine-tuned, big threads = 1, OMP threads = 16: probably 4x the above, since it's just tautologically 4 Demucs models. + +Mt, big threads = 4, OMP threads = 4 (4x4 = 16): +``` +real 4m9.331s +user 18m59.731s +sys 3m28.465s +``` + +Ft Mt, big threads = 4, OMP threads = 4 (4x4 = 16): +``` +real 16m30.252s +user 74m27.250s +sys 14m40.643s +``` + +Mt, big threads = 8, OMP threads = 16: +``` +real 4m9.304s +user 43m21.830s +sys 10m15.712s +``` diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d92148..d3be5b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ endif() set(CMAKE_CXX_FLAGS "-Wall -Wextra") set(CMAKE_CXX_FLAGS_DEBUG "-g -DEIGEN_FAST_MATH=0 -O0") -set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native -fno-unsafe-math-optimizations -fassociative-math -freciprocal-math -fno-signed-zeros") +set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native -fno-unsafe-math-optimizations -freciprocal-math -fno-signed-zeros") # define a macro NDEBUG for Eigen3 release builds set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG") @@ -91,14 +91,24 @@ add_executable(demucs_ft.cpp.main "cli-apps/demucs_ft.cpp") target_include_directories(demucs_ft.cpp.main PRIVATE vendor/libnyquist/include) target_link_libraries(demucs_ft.cpp.main demucs.cpp.lib libnyquist) -file(GLOB SOURCES_TO_LINT "src/*.cpp" "src/*.hpp" "cli-apps/*.cpp") +add_executable(demucs_mt.cpp.main "cli-apps/demucs_mt.cpp") 
+target_include_directories(demucs_mt.cpp.main PRIVATE vendor/libnyquist/include) +target_include_directories(demucs_mt.cpp.main PRIVATE cli-apps) +target_link_libraries(demucs_mt.cpp.main demucs.cpp.lib libnyquist) + +add_executable(demucs_ft_mt.cpp.main "cli-apps/demucs_ft_mt.cpp") +target_include_directories(demucs_ft_mt.cpp.main PRIVATE vendor/libnyquist/include) +target_include_directories(demucs_ft_mt.cpp.main PRIVATE cli-apps) +target_link_libraries(demucs_ft_mt.cpp.main demucs.cpp.lib libnyquist) + +file(GLOB SOURCES_TO_LINT "src/*.cpp" "src/*.hpp" "cli-apps/*.cpp" "cli-apps/*.hpp") # add target to run standard lints and formatters add_custom_target(lint COMMAND clang-format -i ${SOURCES_TO_LINT} --style=file # add clang-tidy command # add include dirs to clang-tidy - COMMAND cppcheck --enable=all --suppress=missingIncludeSystem ${SOURCES_TO_LINT} --std=c++17 + COMMAND cppcheck -I"src/" -I"cli-apps/" --enable=all --suppress=missingIncludeSystem ${SOURCES_TO_LINT} --std=c++17 COMMAND scan-build -o ${CMAKE_BINARY_DIR}/scan-build-report make -C ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} ) diff --git a/README.md b/README.md index 2042a00..b5c7cc2 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,17 @@ C++17 implementation of the [Demucs v4 hybrid transformer](https://github.com/facebookresearch/demucs), a PyTorch neural network for music demixing. Similar project to [umx.cpp](https://github.com/sevagh/umx.cpp). This code powers my site <https://freemusicdemixer.com>. -It uses [libnyquist](https://github.com/ddiakopoulos/libnyquist) to load audio files, the [ggml](https://github.com/ggerganov/ggml) file format to serialize the PyTorch weights of `htdemucs`, `htdemucs_6s`, and `htdemucs_ft` (4-source, 6-source, fine-tuned) to a binary file format, and [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) (+ OpenMP) to implement the inference. 
+It uses [libnyquist](https://github.com/ddiakopoulos/libnyquist) to load audio files, the [ggml](https://github.com/ggerganov/ggml) file format to serialize the PyTorch weights of `htdemucs`, `htdemucs_6s`, and `htdemucs_ft` (4-source, 6-source, fine-tuned) to a binary file format, and [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) (+ OpenMP) to implement the inference. There are also programs for multi-threaded Demucs inference using C++11's `std::thread`. **All Hybrid-Transformer weights** (4-source, 6-source, fine-tuned) are supported. See the [Convert weights](#convert-weights) section below. Demixing quality is nearly identical to PyTorch as shown in the [SDR scores doc](./.github/SDR_scores.md). ### Directory structure -`src` contains the library for Demucs inference, and `cli-apps` contains two driver programs, which compile to: +`src` contains the library for Demucs inference, and `cli-apps` contains four driver programs, which compile to: 1. `demucs.cpp.main`: run a single model (4s, 6s, or a single fine-tuned model) -2. `demucs_ft.cpp.main`: run all 4 fine-tuned models for `htdemucs_ft` inference, same as the BagOfModels idea of PyTorch Demucs +1. `demucs_ft.cpp.main`: run all four fine-tuned models for `htdemucs_ft` inference, same as the BagOfModels idea of PyTorch Demucs +1. `demucs_mt.cpp.main`: run a single model, multi-threaded +1. `demucs_ft_mt.cpp.main`: run all four fine-tuned models, multi-threaded ### Multi-core, OpenMP, BLAS, etc. @@ -21,6 +23,40 @@ If you have OpenMP and OpenBLAS installed, OpenBLAS might automatically use all See the [BLAS benchmarks doc](./.github/BLAS_benchmarks.md) for more details. +### Multi-threading + +There are two new programs, `demucs_mt.cpp.main` and `demucs_ft_mt.cpp.main` that use C++11 [std::threads](https://en.cppreference.com/w/cpp/thread/thread). 
+ +In the single-threaded programs: + +* User supplies a waveform of length N seconds +* Waveform is split into 7.8-second segments for Demucs inference +* Segments are processed sequentially, where each segment inference can use >1 core with `OMP_NUM_THREADS` + +In the multi-threaded programs: +* User supplies a waveform of length N seconds and a `num_threads` argument +* Waveform is split into `num_threads` sub-waveforms (of length M < N) to process in parallel with a 0.75-second overlap + * We always need overlapping segments in audio applications to eliminate [boundary artifacts](https://freemusicdemixer.com/under-the-hood/2024/02/23/Demucs-segmentation#boundary-artifacts-and-the-overlap-add-method) +* `num_threads` threads are launched to perform Demucs inference on the sub-waveforms in parallel +* Within each thread, the sub-waveform is split into 7.8-second segments +* Segments within a thread are still processed sequentially, where each segment inference can use >1 core with `OMP_NUM_THREADS` + +For the single-threaded `demucs.cpp.main`, my suggestion is `OMP_NUM_THREADS=$num_physical_cores`. On my 5950X system with 16 cores, execution time for a 4-minute song: +``` +real 10m23.201s +user 29m42.190s +sys 4m17.248s +``` + +For the multi-threaded `demucs_mt.cpp.main`, using 4 `std::thread` and OMP threads = 4 (4x4 = 16 physical cores): +``` +real 4m9.331s +user 18m59.731s +sys 3m28.465s +``` + +More than 2x faster for 4 threads. This is inspired by the parallelism strategy used in <https://freemusicdemixer.com>. 
+ ## Instructions ### Build C++ code diff --git a/cli-apps/demucs_ft.cpp b/cli-apps/demucs_ft.cpp index 0509875..7ccccf4 100644 --- a/cli-apps/demucs_ft.cpp +++ b/cli-apps/demucs_ft.cpp @@ -133,7 +133,6 @@ int main(int argc, const char **argv) // iterate over all files in model_dir // and load the model - std::string model_file; for (const auto &entry : std::filesystem::directory_iterator(model_dir)) { bool ret = false; @@ -167,6 +166,10 @@ int main(int argc, const char **argv) std::cout << "Loading ft model " << entry.path().string() << " for vocals" << std::endl; } + else + { + continue; + } // debug some members of model std::cout << "demucs_model_load returned " << (ret ? "true" : "false") diff --git a/cli-apps/demucs_ft_mt.cpp b/cli-apps/demucs_ft_mt.cpp new file mode 100644 index 0000000..c50f22e --- /dev/null +++ b/cli-apps/demucs_ft_mt.cpp @@ -0,0 +1,286 @@ +#include "dsp.hpp" +#include "model.hpp" +#include "tensor.hpp" +#include "threaded_inference.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace demucscpp; +using namespace nqr; + +static Eigen::MatrixXf load_audio_file(std::string filename) +{ + // load a wav file with libnyquist + std::shared_ptr fileData = std::make_shared(); + + NyquistIO loader; + + loader.Load(fileData.get(), filename); + + if (fileData->sampleRate != demucscpp::SUPPORTED_SAMPLE_RATE) + { + std::cerr << "[ERROR] demucs.cpp only supports the following sample " + "rate (Hz): " + << SUPPORTED_SAMPLE_RATE << std::endl; + exit(1); + } + + std::cout << "Input samples: " + << fileData->samples.size() / fileData->channelCount << std::endl; + std::cout << "Length in seconds: " << fileData->lengthSeconds << std::endl; + std::cout << "Number of channels: " << fileData->channelCount << std::endl; + + if (fileData->channelCount != 2 && fileData->channelCount != 1) + { + std::cerr << "[ERROR] demucs.cpp only supports mono and stereo 
audio" + << std::endl; + exit(1); + } + + // number of samples per channel + size_t N = fileData->samples.size() / fileData->channelCount; + + // create a struct to hold two float vectors for left and right channels + Eigen::MatrixXf ret(2, N); + + if (fileData->channelCount == 1) + { + // Mono case + for (size_t i = 0; i < N; ++i) + { + ret(0, i) = fileData->samples[i]; // left channel + ret(1, i) = fileData->samples[i]; // right channel + } + } + else + { + // Stereo case + for (size_t i = 0; i < N; ++i) + { + ret(0, i) = fileData->samples[2 * i]; // left channel + ret(1, i) = fileData->samples[2 * i + 1]; // right channel + } + } + + return ret; +} + +// write a function to write a StereoWaveform to a wav file +static void write_audio_file(const Eigen::MatrixXf &waveform, + std::string filename) +{ + // create a struct to hold the audio data + std::shared_ptr fileData = std::make_shared(); + + // set the sample rate + fileData->sampleRate = SUPPORTED_SAMPLE_RATE; + + // set the number of channels + fileData->channelCount = 2; + + // set the number of samples + fileData->samples.resize(waveform.cols() * 2); + + // write the left channel + for (long int i = 0; i < waveform.cols(); ++i) + { + fileData->samples[2 * i] = waveform(0, i); + fileData->samples[2 * i + 1] = waveform(1, i); + } + + int encoderStatus = + encode_wav_to_disk({fileData->channelCount, PCM_FLT, DITHER_TRIANGLE}, + fileData.get(), filename); + std::cout << "Encoder Status: " << encoderStatus << std::endl; +} + +int main(int argc, const char **argv) +{ + if (argc != 5) + { + std::cerr << "Usage: " << argv[0] + << " " + << std::endl; + exit(1); + } + + std::cout << "demucs_ft_mt.cpp (Multi-threaded Fine-tuned) driver program" + << std::endl; + + // load model passed as argument + std::string model_dir = argv[1]; + + // load audio passed as argument + std::string wav_file = argv[2]; + + // output dir passed as argument + std::string out_dir = argv[3]; + + // get num threads from user parameter 
argv[4] + // cast it to int + int num_threads = std::stoi(argv[4]); + + Eigen::MatrixXf audio = load_audio_file(wav_file); + Eigen::Tensor3dXf out_targets; + + // initialize nested 4 fine-tuned struct demucs_model + std::array models = { + demucs_model(), demucs_model(), demucs_model(), demucs_model()}; + + // iterate over all files in model_dir + // and load the model + for (const auto &entry : std::filesystem::directory_iterator(model_dir)) + { + bool ret = false; + + // check if entry contains the name "htdemucs_ft_drums" + if (entry.path().string().find("htdemucs_ft_drums") != + std::string::npos) + { + ret = load_demucs_model(entry.path().string(), &models[0]); + std::cout << "Loading ft model " << entry.path().string() + << " for drums" << std::endl; + } + else if (entry.path().string().find("htdemucs_ft_bass") != + std::string::npos) + { + ret = load_demucs_model(entry.path().string(), &models[1]); + std::cout << "Loading ft model " << entry.path().string() + << " for bass" << std::endl; + } + else if (entry.path().string().find("htdemucs_ft_other") != + std::string::npos) + { + ret = load_demucs_model(entry.path().string(), &models[2]); + std::cout << "Loading ft model " << entry.path().string() + << " for other" << std::endl; + } + else if (entry.path().string().find("htdemucs_ft_vocals") != + std::string::npos) + { + ret = load_demucs_model(entry.path().string(), &models[3]); + std::cout << "Loading ft model " << entry.path().string() + << " for vocals" << std::endl; + } + else + { + continue; + } + + // debug some members of model + std::cout << "demucs_model_load returned " << (ret ? 
"true" : "false") + << std::endl; + if (!ret) + { + std::cerr << "Error loading model" << std::endl; + exit(1); + } + } + + const int nb_sources = 4; + + std::cout << "Starting Demucs fine-tuned (" << std::to_string(nb_sources) + << "-source) inference" << std::endl; + + // create 4 audio matrix same size, to hold output + Eigen::Tensor3dXf drums_targets = demucscppthreaded::threaded_inference( + models[0], audio, num_threads, "DRUMS\t "); + + Eigen::Tensor3dXf bass_targets = demucscppthreaded::threaded_inference( + models[1], audio, num_threads, "BASS\t "); + + Eigen::Tensor3dXf other_targets = demucscppthreaded::threaded_inference( + models[2], audio, num_threads, "OTHER\t "); + + Eigen::Tensor3dXf vocals_targets = demucscppthreaded::threaded_inference( + models[3], audio, num_threads, "VOCALS\t "); + + out_targets = Eigen::Tensor3dXf(drums_targets.dimension(0), + drums_targets.dimension(1), + drums_targets.dimension(2)); + + // simply use the respective stem from each independent fine-tuned model + out_targets.chip<0>(0) = drums_targets.chip<0>(0); + out_targets.chip<0>(1) = bass_targets.chip<0>(1); + out_targets.chip<0>(2) = other_targets.chip<0>(2); + out_targets.chip<0>(3) = vocals_targets.chip<0>(3); + + const int nb_out_sources = 4; + + for (int target = 0; target < nb_out_sources; ++target) + { + // now write the 4 audio waveforms to files in the output dir + // using libnyquist + // join out_dir with "/target_0.wav" + // using std::filesystem::path; + + std::filesystem::path p = out_dir; + // make sure the directory exists + std::filesystem::create_directories(p); + + auto p_target = p / "target_0.wav"; + + // target 0,1,2,3 map to drums,bass,other,vocals + + std::string target_name; + + switch (target) + { + case 0: + target_name = "drums"; + break; + case 1: + target_name = "bass"; + break; + case 2: + target_name = "other"; + break; + case 3: + target_name = "vocals"; + break; + case 4: + target_name = "guitar"; + break; + case 5: + target_name = 
"piano"; + break; + default: + std::cerr << "Error: target " << target << " not supported" + << std::endl; + exit(1); + } + + // insert target_name into the path after the digit + // e.g. target_name_0_drums.wav + p_target.replace_filename("target_" + std::to_string(target) + "_" + + target_name + ".wav"); + + std::cout << "Writing wav file " << p_target << std::endl; + + Eigen::MatrixXf target_waveform(2, audio.cols()); + + // copy the input stereo wav file into all 4 targets + for (int channel = 0; channel < 2; ++channel) + { + for (int sample = 0; sample < audio.cols(); ++sample) + { + target_waveform(channel, sample) = + out_targets(target, channel, sample); + } + } + + write_audio_file(target_waveform, p_target); + } +} diff --git a/cli-apps/demucs_mt.cpp b/cli-apps/demucs_mt.cpp new file mode 100644 index 0000000..876dbd9 --- /dev/null +++ b/cli-apps/demucs_mt.cpp @@ -0,0 +1,229 @@ +#include "dsp.hpp" +#include "model.hpp" +#include "tensor.hpp" +#include "threaded_inference.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace demucscpp; +using namespace nqr; + +static Eigen::MatrixXf load_audio_file(std::string filename) +{ + // load a wav file with libnyquist + std::shared_ptr fileData = std::make_shared(); + + NyquistIO loader; + + loader.Load(fileData.get(), filename); + + if (fileData->sampleRate != demucscpp::SUPPORTED_SAMPLE_RATE) + { + std::cerr << "[ERROR] demucs_mt.cpp only supports the following sample " + "rate (Hz): " + << SUPPORTED_SAMPLE_RATE << std::endl; + exit(1); + } + + std::cout << "Input samples: " + << fileData->samples.size() / fileData->channelCount << std::endl; + std::cout << "Length in seconds: " << fileData->lengthSeconds << std::endl; + std::cout << "Number of channels: " << fileData->channelCount << std::endl; + + if (fileData->channelCount != 2 && fileData->channelCount != 1) + { + std::cerr << "[ERROR] demucs_mt.cpp 
only supports mono and stereo audio" + << std::endl; + exit(1); + } + + // number of samples per channel + size_t N = fileData->samples.size() / fileData->channelCount; + + // create a struct to hold two float vectors for left and right channels + Eigen::MatrixXf ret(2, N); + + if (fileData->channelCount == 1) + { + // Mono case + for (size_t i = 0; i < N; ++i) + { + ret(0, i) = fileData->samples[i]; // left channel + ret(1, i) = fileData->samples[i]; // right channel + } + } + else + { + // Stereo case + for (size_t i = 0; i < N; ++i) + { + ret(0, i) = fileData->samples[2 * i]; // left channel + ret(1, i) = fileData->samples[2 * i + 1]; // right channel + } + } + + return ret; +} + +// write a function to write a StereoWaveform to a wav file +static void write_audio_file(const Eigen::MatrixXf &waveform, + std::string filename) +{ + // create a struct to hold the audio data + std::shared_ptr fileData = std::make_shared(); + + // set the sample rate + fileData->sampleRate = SUPPORTED_SAMPLE_RATE; + + // set the number of channels + fileData->channelCount = 2; + + // set the number of samples + fileData->samples.resize(waveform.cols() * 2); + + // write the left channel + for (long int i = 0; i < waveform.cols(); ++i) + { + fileData->samples[2 * i] = waveform(0, i); + fileData->samples[2 * i + 1] = waveform(1, i); + } + + int encoderStatus = + encode_wav_to_disk({fileData->channelCount, PCM_FLT, DITHER_TRIANGLE}, + fileData.get(), filename); + std::cout << "Encoder Status: " << encoderStatus << std::endl; +} + +int main(int argc, const char **argv) +{ + if (argc != 5) + { + std::cerr << "Usage: " << argv[0] + << " " + << std::endl; + exit(1); + } + + std::cout << "demucs_mt.cpp (Multi-threaded) driver program" << std::endl; + + // load model passed as argument + std::string model_file = argv[1]; + + // load audio passed as argument + std::string wav_file = argv[2]; + + // output dir passed as argument + std::string out_dir = argv[3]; + + // get num threads from user 
parameter argv[4] + // cast it to int + int num_threads = std::stoi(argv[4]); + + Eigen::MatrixXf audio = load_audio_file(wav_file); + Eigen::Tensor3dXf out_targets; + + // initialize a struct demucs_model + struct demucs_model model + { + }; + + // debug some members of model + auto ret = load_demucs_model(model_file, &model); + std::cout << "demucs_model_load returned " << (ret ? "true" : "false") + << std::endl; + if (!ret) + { + std::cerr << "Error loading model" << std::endl; + exit(1); + } + + int nb_sources = model.is_4sources ? 4 : 6; + + std::cout << "Starting Demucs (" << std::to_string(nb_sources) + << "-source) inference" << std::endl; + + // create 4 audio matrix same size, to hold output + Eigen::Tensor3dXf audio_targets = + demucscppthreaded::threaded_inference(model, audio, num_threads); + + out_targets = audio_targets; + + int nb_out_sources = model.is_4sources ? 4 : 6; + + for (int target = 0; target < nb_out_sources; ++target) + { + // now write the 4 audio waveforms to files in the output dir + // using libnyquist + // join out_dir with "/target_0.wav" + // using std::filesystem::path; + + std::filesystem::path p = out_dir; + // make sure the directory exists + std::filesystem::create_directories(p); + + auto p_target = p / "target_0.wav"; + + // target 0,1,2,3 map to drums,bass,other,vocals + + std::string target_name; + + switch (target) + { + case 0: + target_name = "drums"; + break; + case 1: + target_name = "bass"; + break; + case 2: + target_name = "other"; + break; + case 3: + target_name = "vocals"; + break; + case 4: + target_name = "guitar"; + break; + case 5: + target_name = "piano"; + break; + default: + std::cerr << "Error: target " << target << " not supported" + << std::endl; + exit(1); + } + + // insert target_name into the path after the digit + // e.g. 
target_name_0_drums.wav + p_target.replace_filename("target_" + std::to_string(target) + "_" + + target_name + ".wav"); + + std::cout << "Writing wav file " << p_target << std::endl; + + Eigen::MatrixXf target_waveform(2, audio.cols()); + + // copy the input stereo wav file into all 4 targets + for (int channel = 0; channel < 2; ++channel) + { + for (int sample = 0; sample < audio.cols(); ++sample) + { + target_waveform(channel, sample) = + out_targets(target, channel, sample); + } + } + + write_audio_file(target_waveform, p_target); + } +} diff --git a/cli-apps/threaded_inference.hpp b/cli-apps/threaded_inference.hpp new file mode 100644 index 0000000..1eafcea --- /dev/null +++ b/cli-apps/threaded_inference.hpp @@ -0,0 +1,190 @@ +#include "model.hpp" +#include "tensor.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +/* + this is a multithreaded driver program of demucs.cpp + which splits the input song into N segments and processes each independently + + javascript code here: + https://github.com/sevagh/free-music-demixer/blob/main/docs/main.js#L23 + + also similar to src/model_apply.cpp which implements the real + demucs 7.8-second segmentation +*/ +namespace demucscppthreaded +{ +// bigger overlap from free-music-demixer +const int SAMPLE_RATE = 44100; +const float OVERLAP = 0.75; +const int OVERLAP_SAMPLES = ::floorf(SAMPLE_RATE * OVERLAP); + +Eigen::Tensor3dXf +threaded_inference(const struct demucscpp::demucs_model &model, + const Eigen::MatrixXf &full_audio, int num_threads, + const std::string &prefix = "") +{ + // set output precision to 3 decimal places + std::cout << std::fixed << std::setprecision(3); + + // create vector of progresscallbacks per-thread + std::vector cbs; + for (int i = 0; i < num_threads; ++i) + { + cbs.push_back( + [i, prefix](float progress, const std::string &log_message) + { + std::cout << prefix << "[THREAD " << i << "] (" << std::setw(3) + << std::setfill(' ') << progress * 100.0f << "%) " + 
<< log_message << std::endl; + }); + } + + // calculate segment length by dividing n_samples by num_threads + int total_length = full_audio.cols(); + int segment_length = ::ceilf((float)total_length / (float)num_threads); + + std::vector segments; + // split the full audio into segments + for (int i = 0; i < num_threads; ++i) + { + int start = i * segment_length; + int end = std::min(total_length, start + segment_length); + + // Create a new segment with padding for overlap + Eigen::MatrixXf segment = + Eigen::MatrixXf::Zero(2, end - start + 2 * OVERLAP_SAMPLES); + + // Overlap-padding for the left and right channels + // For the first segment, no padding at the start + if (i == 0) + { + segment.block(0, 0, 2, OVERLAP_SAMPLES).colwise() = + full_audio.col(0); + } + else + { + segment.block(0, 0, 2, OVERLAP_SAMPLES) = full_audio.block( + 0, start - OVERLAP_SAMPLES, 2, OVERLAP_SAMPLES); + } + + // For the last segment, no padding at the end + if (i == num_threads - 1) + { + int remaining_samples = total_length - end; + segment.block(0, end - start + OVERLAP_SAMPLES, 2, + remaining_samples) = + full_audio.block(0, end, 2, remaining_samples); + } + else + { + segment.block(0, end - start + OVERLAP_SAMPLES, 2, + OVERLAP_SAMPLES) = + full_audio.block(0, end, 2, OVERLAP_SAMPLES); + } + + // Assign the original segment data + segment.block(0, OVERLAP_SAMPLES, 2, end - start) = + full_audio.block(0, start, 2, end - start); + segments.push_back(segment); + } + + // insert parallel processing here + // pretend like segment_outs contains: + // (4, 2, segment_samples) + // which are 4 targets, stereo/2 channels, and the above segment length + // and we want this to be recombined into a single tensor + // i.e. 
Eigen::Tensor3dXf(4, 2, total_length) + std::vector segment_outs(num_threads); + + // This vector will hold the threads + std::vector threads; + + for (int i = 0; i < num_threads; ++i) + { + threads.emplace_back( + [&model, &segments, &segment_outs, i, &cbs]() { + segment_outs[i] = + demucscpp::demucs_inference(model, segments[i], cbs[i]); + }); + } + + // Wait for all threads to finish + for (auto &thread : threads) + { + thread.join(); + } + + // Calculate total output size and create the output tensor + Eigen::Tensor3dXf final_output(4, 2, total_length); + final_output.setZero(); + + Eigen::VectorXf ramp(segment_length); + for (int i = 0; i < segment_length; ++i) + { + ramp(i) = std::min(i + 1, segment_length - i); + } + ramp /= ramp.maxCoeff(); // Normalize the ramp + + Eigen::VectorXf sum_weight = Eigen::VectorXf::Zero(total_length); + + for (size_t i = 0; i < segment_outs.size(); ++i) + { + int segment_start = i * segment_length; + for (int t = 0; t < 4; ++t) + { // For each target + for (int ch = 0; ch < 2; ++ch) + { // For each channel + for (int j = 0; j < segment_length + 2 * OVERLAP_SAMPLES; ++j) + { + int global_idx = segment_start + j - OVERLAP_SAMPLES; + if (global_idx >= 0 && global_idx < total_length) + { + float weight = 1.0; + // Apply ramp weights at the beginning and end of the + // segment + if (j < OVERLAP_SAMPLES) + { + weight = ramp(j); + } + else if (j >= segment_length) + { + weight = ramp(segment_length + 2 * OVERLAP_SAMPLES - + j - 1); + } + final_output(t, ch, global_idx) += + segment_outs[i](t, ch, j) * weight; + sum_weight(global_idx) += weight; + } + } + } + } + } + + // Normalize the output by the sum of weights + for (int t = 0; t < 4; ++t) + { + for (int ch = 0; ch < 2; ++ch) + { + for (int i = 0; i < total_length; ++i) + { + if (sum_weight(i) > 0) + { + // account for summing per-target by dividing by n targets, + // 2 channels + final_output(t, ch, i) /= (sum_weight(i) / (2.0f * 4.0f)); + } + } + } + } + + return final_output; 
+} +}; // namespace demucscppthreaded diff --git a/src/crosstransformer.cpp b/src/crosstransformer.cpp index 59622eb..d649b95 100644 --- a/src/crosstransformer.cpp +++ b/src/crosstransformer.cpp @@ -76,9 +76,10 @@ static Eigen::Tensor3dXf create_sin_embedding(int length, int dim, return pos_emb; } -static void my_transformer_encoder_layer(struct demucscpp::demucs_model &model, - Eigen::Tensor3dXf &x, int freq_or_time, - int weight_idx, float eps = 1e-5) +static void +my_transformer_encoder_layer(const struct demucscpp::demucs_model &model, + Eigen::Tensor3dXf &x, int freq_or_time, + int weight_idx, float eps = 1e-5) { demucscpp::common_encoder_layer( x, // pass x as q @@ -135,7 +136,7 @@ static void my_transformer_encoder_layer(struct demucscpp::demucs_model &model, } static void -cross_transformer_encoder_layer(struct demucscpp::demucs_model &model, +cross_transformer_encoder_layer(const struct demucscpp::demucs_model &model, Eigen::Tensor3dXf &q, // q = x = frequency const Eigen::Tensor3dXf &k, // k = xt = time int freq_or_time, int weight_idx, @@ -201,12 +202,12 @@ cross_transformer_encoder_layer(struct demucscpp::demucs_model &model, eps); } -void demucscpp::apply_crosstransformer(struct demucscpp::demucs_model &model, - Eigen::Tensor3dXf &x, // frequency branch - Eigen::Tensor3dXf &xt, // time branch - demucscpp::ProgressCallback cb, - float current_progress, - float segment_progress) +void demucscpp::apply_crosstransformer( + const struct demucscpp::demucs_model &model, + Eigen::Tensor3dXf &x, // frequency branch + Eigen::Tensor3dXf &xt, // time branch + demucscpp::ProgressCallback cb, float current_progress, + float segment_progress) { cb(current_progress + segment_progress * 8.0f / 26.0f, "Applying crosstransformer"); diff --git a/src/crosstransformer.hpp b/src/crosstransformer.hpp index 33279e6..5085dd9 100644 --- a/src/crosstransformer.hpp +++ b/src/crosstransformer.hpp @@ -8,7 +8,7 @@ namespace demucscpp { void apply_crosstransformer( - struct 
demucscpp::demucs_model &model, + const struct demucscpp::demucs_model &model, Eigen::Tensor3dXf &x, // frequency branch Eigen::Tensor3dXf &xt, // time branch with leading dim (1, ...) ProgressCallback cb, float current_progress, float segment_progress); diff --git a/src/encdec.cpp b/src/encdec.cpp index 535d10e..4b4265c 100644 --- a/src/encdec.cpp +++ b/src/encdec.cpp @@ -6,7 +6,7 @@ #include // forward declaration to apply a frequency encoder -void demucscpp::apply_freq_encoder(struct demucscpp::demucs_model &model, +void demucscpp::apply_freq_encoder(const struct demucscpp::demucs_model &model, int encoder_idx, const Eigen::Tensor3dXf &x_in, Eigen::Tensor3dXf &x_out) @@ -81,7 +81,7 @@ void demucscpp::apply_freq_encoder(struct demucscpp::demucs_model &model, } // forward declaration to apply a time encoder -void demucscpp::apply_time_encoder(struct demucscpp::demucs_model &model, +void demucscpp::apply_time_encoder(const struct demucscpp::demucs_model &model, int tencoder_idx, const Eigen::Tensor3dXf &xt_in, Eigen::Tensor3dXf &xt_out) @@ -166,7 +166,7 @@ void demucscpp::apply_time_encoder(struct demucscpp::demucs_model &model, } // forward declaration to apply a frequency decoder -void demucscpp::apply_freq_decoder(struct demucscpp::demucs_model &model, +void demucscpp::apply_freq_decoder(const struct demucscpp::demucs_model &model, int decoder_idx, const Eigen::Tensor3dXf &x_in, Eigen::Tensor3dXf &x_out, @@ -262,7 +262,7 @@ void demucscpp::apply_freq_decoder(struct demucscpp::demucs_model &model, } // forward declaration to apply a time decoder -void demucscpp::apply_time_decoder(struct demucscpp::demucs_model &model, +void demucscpp::apply_time_decoder(const struct demucscpp::demucs_model &model, int tdecoder_idx, const Eigen::Tensor3dXf &xt_in, Eigen::Tensor3dXf &xt_out, diff --git a/src/encdec.hpp b/src/encdec.hpp index 5ec5ff0..23a4fd6 100644 --- a/src/encdec.hpp +++ b/src/encdec.hpp @@ -7,23 +7,24 @@ namespace demucscpp { -void apply_freq_encoder(struct 
demucscpp::demucs_model &model, int encoder_idx, - const Eigen::Tensor3dXf &x_in, +void apply_freq_encoder(const struct demucscpp::demucs_model &model, + int encoder_idx, const Eigen::Tensor3dXf &x_in, Eigen::Tensor3dXf &x_out); // forward declaration to apply a frequency decoder -void apply_freq_decoder(struct demucscpp::demucs_model &model, int decoder_idx, - const Eigen::Tensor3dXf &x_in, Eigen::Tensor3dXf &x_out, +void apply_freq_decoder(const struct demucscpp::demucs_model &model, + int decoder_idx, const Eigen::Tensor3dXf &x_in, + Eigen::Tensor3dXf &x_out, const Eigen::Tensor3dXf &skip); // forward declaration to apply a time encoder -void apply_time_encoder(struct demucscpp::demucs_model &model, int encoder_idx, - const Eigen::Tensor3dXf &xt_in, +void apply_time_encoder(const struct demucscpp::demucs_model &model, + int encoder_idx, const Eigen::Tensor3dXf &xt_in, Eigen::Tensor3dXf &xt_out); // forward declaration to apply a time decoder -void apply_time_decoder(struct demucscpp::demucs_model &model, int decoder_idx, - const Eigen::Tensor3dXf &xt_in, +void apply_time_decoder(const struct demucscpp::demucs_model &model, + int decoder_idx, const Eigen::Tensor3dXf &xt_in, Eigen::Tensor3dXf &xt_out, const Eigen::Tensor3dXf &skip); } // namespace demucscpp diff --git a/src/layers.cpp b/src/layers.cpp index 50b7f17..01239aa 100644 --- a/src/layers.cpp +++ b/src/layers.cpp @@ -147,7 +147,7 @@ Eigen::Tensor3dXf demucscpp::layer_norm(const Eigen::Tensor3dXf &x, return y_out; } -void demucscpp::apply_dconv(struct demucscpp::demucs_model &model, +void demucscpp::apply_dconv(const struct demucscpp::demucs_model &model, Eigen::Tensor3dXf &y, int freq_idx, int encdec_idx, int layer_idx, int mid_crop) { diff --git a/src/layers.hpp b/src/layers.hpp index 8b8e9c3..d84e804 100644 --- a/src/layers.hpp +++ b/src/layers.hpp @@ -11,8 +11,9 @@ namespace demucscpp { -void apply_dconv(struct demucscpp::demucs_model &model, Eigen::Tensor3dXf &y, - int freq_idx, int encdec_idx, int 
layer_idx, int mid_crop); +void apply_dconv(const struct demucscpp::demucs_model &model, + Eigen::Tensor3dXf &y, int freq_idx, int encdec_idx, + int layer_idx, int mid_crop); // used for implementing both self-attention and cross-attention // let's not modify the second argument diff --git a/src/model.hpp b/src/model.hpp index c2597a6..0e92b33 100644 --- a/src/model.hpp +++ b/src/model.hpp @@ -551,9 +551,6 @@ struct demucs_model Eigen::MatrixXf freq_emb_embedding_weight{Eigen::MatrixXf(512, 48)}; std::unique_ptr crosstransformer; - - float inference_progress; - float load_progress; }; inline std::unique_ptr @@ -660,11 +657,11 @@ const float MAX_SHIFT_SECS = 0.5; // max shift const float OVERLAP = 0.25; // overlap between segments const float TRANSITION_POWER = 1.0; // transition between segments -Eigen::Tensor3dXf demucs_inference(struct demucs_model &model, +Eigen::Tensor3dXf demucs_inference(const struct demucs_model &model, const Eigen::MatrixXf &full_audio, ProgressCallback cb); -void model_inference(struct demucs_model &model, +void model_inference(const struct demucs_model &model, struct demucscpp::demucs_segment_buffers &buffers, struct demucscpp::stft_buffers &stft_buf, ProgressCallback cb, float current_progress, diff --git a/src/model_apply.cpp b/src/model_apply.cpp index 1f9885d..bef25de 100644 --- a/src/model_apply.cpp +++ b/src/model_apply.cpp @@ -43,21 +43,21 @@ symmetric_zero_padding(Eigen::MatrixXf &padded, const Eigen::MatrixXf &original, } // forward declaration of inner fns -static Eigen::Tensor3dXf shift_inference(struct demucscpp::demucs_model &model, - Eigen::MatrixXf &full_audio, - demucscpp::ProgressCallback cb); +static Eigen::Tensor3dXf +shift_inference(const struct demucscpp::demucs_model &model, + Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb); -static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model, - Eigen::MatrixXf &full_audio, - demucscpp::ProgressCallback cb); +static Eigen::Tensor3dXf 
+split_inference(const struct demucscpp::demucs_model &model, + Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb); static Eigen::Tensor3dXf segment_inference( - struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk, + const struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk, int segment_sample, struct demucscpp::demucs_segment_buffers &buffers, struct demucscpp::stft_buffers &stft_buf, demucscpp::ProgressCallback cb, float current_progress, float segment_progress); -Eigen::Tensor3dXf demucscpp::demucs_inference(struct demucs_model &model, +Eigen::Tensor3dXf demucscpp::demucs_inference(const struct demucs_model &model, const Eigen::MatrixXf &audio, demucscpp::ProgressCallback cb) { @@ -90,9 +90,9 @@ Eigen::Tensor3dXf demucscpp::demucs_inference(struct demucs_model &model, return waveform_outputs; } -static Eigen::Tensor3dXf shift_inference(struct demucscpp::demucs_model &model, - Eigen::MatrixXf &full_audio, - demucscpp::ProgressCallback cb) +static Eigen::Tensor3dXf +shift_inference(const struct demucscpp::demucs_model &model, + Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb) { // first, apply shifts for time invariance // we simply only support shift=1, the demucs default @@ -137,12 +137,10 @@ static Eigen::Tensor3dXf shift_inference(struct demucscpp::demucs_model &model, return trimmed_waveform_outputs; } -static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model, - Eigen::MatrixXf &full_audio, - demucscpp::ProgressCallback cb) +static Eigen::Tensor3dXf +split_inference(const struct demucscpp::demucs_model &model, + Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb) { - std::cout << "in split inference!" 
<< std::endl; - // calculate segment in samples int segment_samples = (int)(demucscpp::SEGMENT_LEN_SECS * demucscpp::SUPPORTED_SAMPLE_RATE); @@ -186,6 +184,7 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model, // i prefer using `std::ceilf` but :shrug: int total_chunks = ::ceilf((float)length / (float)stride_samples); float increment_per_chunk = 1.0f / (float)total_chunks; + float inference_progress = 0.0f; for (int offset = 0; offset < length; offset += stride_samples) { @@ -198,9 +197,9 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model, << ", chunk shape: (" << chunk.rows() << ", " << chunk.cols() << ")" << std::endl; - Eigen::Tensor3dXf chunk_out = segment_inference( - model, chunk, segment_samples, buffers, stft_buf, cb, - model.inference_progress, increment_per_chunk); + Eigen::Tensor3dXf chunk_out = + segment_inference(model, chunk, segment_samples, buffers, stft_buf, + cb, inference_progress, increment_per_chunk); // add the weighted chunk to the output // out[..., offset:offset + segment] += (weight[:chunk_length] * @@ -232,7 +231,7 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model, sum_weight(offset + k) += weight(k % chunk_length); } - model.inference_progress += increment_per_chunk; + inference_progress += increment_per_chunk; } for (int i = 0; i < nb_out_sources; ++i) @@ -249,13 +248,11 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model, } static Eigen::Tensor3dXf segment_inference( - struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk, + const struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk, int segment_samples, struct demucscpp::demucs_segment_buffers &buffers, struct demucscpp::stft_buffers &stft_buf, demucscpp::ProgressCallback cb, float current_progress, float segment_progress) { - std::cout << "in segment inference!" 
<< std::endl; - int chunk_length = chunk.cols(); // copy chunk into buffers.mix with symmetric zero-padding diff --git a/src/model_inference.cpp b/src/model_inference.cpp index 7a5278c..418b360 100644 --- a/src/model_inference.cpp +++ b/src/model_inference.cpp @@ -46,7 +46,7 @@ static void reflect_padding(Eigen::MatrixXf &padded_mix, } void demucscpp::model_inference( - struct demucscpp::demucs_model &model, + const struct demucscpp::demucs_model &model, struct demucscpp::demucs_segment_buffers &buffers, struct demucscpp::stft_buffers &stft_buf, demucscpp::ProgressCallback cb, float current_progress, float segment_progress)