From 6de86deea4896d4de8f022c67c17a85a4cc6185b Mon Sep 17 00:00:00 2001
From: Sevag H <sevagh@protonmail.com>
Date: Sun, 3 Mar 2024 14:57:16 -0500
Subject: [PATCH] Threaded inference (#10)

---
 .clang-format                   |   2 +-
 .github/SDR_scores.md           |  50 ++++++
 CMakeLists.txt                  |  16 +-
 README.md                       |  42 ++++-
 cli-apps/demucs_ft.cpp          |   5 +-
 cli-apps/demucs_ft_mt.cpp       | 286 ++++++++++++++++++++++++++++++++
 cli-apps/demucs_mt.cpp          | 229 +++++++++++++++++++++++++
 cli-apps/threaded_inference.hpp | 190 +++++++++++++++++++++
 src/crosstransformer.cpp        |  21 +--
 src/crosstransformer.hpp        |   2 +-
 src/encdec.cpp                  |   8 +-
 src/encdec.hpp                  |  17 +-
 src/layers.cpp                  |   2 +-
 src/layers.hpp                  |   5 +-
 src/model.hpp                   |   7 +-
 src/model_apply.cpp             |  43 +++--
 src/model_inference.cpp         |   2 +-
 17 files changed, 864 insertions(+), 63 deletions(-)
 create mode 100644 cli-apps/demucs_ft_mt.cpp
 create mode 100644 cli-apps/demucs_mt.cpp
 create mode 100644 cli-apps/threaded_inference.hpp

diff --git a/.clang-format b/.clang-format
index 6af95b9..9e30a2f 100644
--- a/.clang-format
+++ b/.clang-format
@@ -3,4 +3,4 @@ IndentWidth: 4
 BreakBeforeBraces: Allman
 AllowShortIfStatementsOnASingleLine: false
 IndentCaseLabels: false
-ColumnLimit: 80
\ No newline at end of file
+ColumnLimit: 80
diff --git a/.github/SDR_scores.md b/.github/SDR_scores.md
index 5e3ee3a..c9e061e 100644
--- a/.github/SDR_scores.md
+++ b/.github/SDR_scores.md
@@ -59,3 +59,53 @@ drums           ==> SDR:  10.463  SIR:  19.782  ISR:  17.144  SAR:  11.132
 bass            ==> SDR:   4.584  SIR:   9.359  ISR:   9.068  SAR:   4.885
 other           ==> SDR:   7.426  SIR:  12.793  ISR:  12.975  SAR:   7.830
 ```
+
+### Performance of multi-threaded inference
+
+Zeno - Signs, Demucs 4s multi-threaded using the same strategy used in <https://freemusicdemixer.com>.
+
+Optimal performance: `export OMP_NUM_THREADS=4` + 4 threads via cli args for a total of 16 physical cores on my 5950X.
+
+In theory the SDR should be identical to single-threaded inference, but it is still worth measuring, since splitting the waveform into large per-thread segments may affect demixing quality:
+```
+vocals          ==> SDR:   8.317  SIR:  18.089  ISR:  15.887  SAR:   8.391
+drums           ==> SDR:   9.987  SIR:  18.579  ISR:  16.997  SAR:  10.755
+bass            ==> SDR:   4.039  SIR:  12.531  ISR:   6.822  SAR:   3.090
+other           ==> SDR:   7.405  SIR:  11.246  ISR:  14.186  SAR:   8.099
+```
+
+Multi-threaded fine-tuned (scores not recorded yet):
+```
+```
+
+### Time measurements
+
+Regular, big threads = 1, OMP threads = 16:
+```
+real    10m23.201s
+user    29m42.190s
+sys     4m17.248s
+```
+
+Fine-tuned, big threads = 1, OMP threads = 16: not measured; probably about 4x the above, since it simply runs 4 Demucs models.
+
+Mt, big threads = 4, OMP threads = 4 (4x4 = 16):
+```
+real    4m9.331s
+user    18m59.731s
+sys     3m28.465s
+```
+
+Ft Mt, big threads = 4, OMP threads = 4 (4x4 = 16):
+```
+real    16m30.252s
+user    74m27.250s
+sys     14m40.643s
+```
+
+Mt, big threads = 8, OMP threads = 16:
+```
+real    4m9.304s
+user    43m21.830s
+sys     10m15.712s
+```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d92148..d3be5b1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ endif()
 set(CMAKE_CXX_FLAGS "-Wall -Wextra")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -DEIGEN_FAST_MATH=0 -O0")
 
-set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native -fno-unsafe-math-optimizations -fassociative-math -freciprocal-math -fno-signed-zeros")
+set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native -fno-unsafe-math-optimizations -freciprocal-math -fno-signed-zeros")
 
 # define a macro NDEBUG for Eigen3 release builds
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG")
@@ -91,14 +91,24 @@ add_executable(demucs_ft.cpp.main "cli-apps/demucs_ft.cpp")
 target_include_directories(demucs_ft.cpp.main PRIVATE vendor/libnyquist/include)
 target_link_libraries(demucs_ft.cpp.main demucs.cpp.lib libnyquist)
 
-file(GLOB SOURCES_TO_LINT "src/*.cpp" "src/*.hpp" "cli-apps/*.cpp")
+add_executable(demucs_mt.cpp.main "cli-apps/demucs_mt.cpp")
+target_include_directories(demucs_mt.cpp.main PRIVATE vendor/libnyquist/include)
+target_include_directories(demucs_mt.cpp.main PRIVATE cli-apps)
+target_link_libraries(demucs_mt.cpp.main demucs.cpp.lib libnyquist)
+
+add_executable(demucs_ft_mt.cpp.main "cli-apps/demucs_ft_mt.cpp")
+target_include_directories(demucs_ft_mt.cpp.main PRIVATE vendor/libnyquist/include)
+target_include_directories(demucs_ft_mt.cpp.main PRIVATE cli-apps)
+target_link_libraries(demucs_ft_mt.cpp.main demucs.cpp.lib libnyquist)
+
+file(GLOB SOURCES_TO_LINT "src/*.cpp" "src/*.hpp" "cli-apps/*.cpp" "cli-apps/*.hpp")
 
 # add target to run standard lints and formatters
 add_custom_target(lint
     COMMAND clang-format -i ${SOURCES_TO_LINT} --style=file
     # add clang-tidy command
     # add include dirs to clang-tidy
-    COMMAND cppcheck --enable=all --suppress=missingIncludeSystem ${SOURCES_TO_LINT} --std=c++17
+    COMMAND cppcheck -I"src/" -I"cli-apps/" --enable=all --suppress=missingIncludeSystem ${SOURCES_TO_LINT} --std=c++17
     COMMAND scan-build -o ${CMAKE_BINARY_DIR}/scan-build-report make -C ${CMAKE_BINARY_DIR}
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
 )
diff --git a/README.md b/README.md
index 2042a00..b5c7cc2 100644
--- a/README.md
+++ b/README.md
@@ -2,15 +2,17 @@
 
 C++17 implementation of the [Demucs v4 hybrid transformer](https://github.com/facebookresearch/demucs), a PyTorch neural network for music demixing. Similar project to [umx.cpp](https://github.com/sevagh/umx.cpp). This code powers my site <https://freemusicdemixer.com>.
 
-It uses [libnyquist](https://github.com/ddiakopoulos/libnyquist) to load audio files, the [ggml](https://github.com/ggerganov/ggml) file format to serialize the PyTorch weights of `htdemucs`, `htdemucs_6s`, and `htdemucs_ft` (4-source, 6-source, fine-tuned) to a binary file format, and [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) (+ OpenMP) to implement the inference.
+It uses [libnyquist](https://github.com/ddiakopoulos/libnyquist) to load audio files, the [ggml](https://github.com/ggerganov/ggml) file format to serialize the PyTorch weights of `htdemucs`, `htdemucs_6s`, and `htdemucs_ft` (4-source, 6-source, fine-tuned) to a binary file format, and [Eigen](https://eigen.tuxfamily.org/index.php?title=Main_Page) (+ OpenMP) to implement the inference. There are also programs for multi-threaded Demucs inference using C++11's `std::thread`.
 
 **All Hybrid-Transformer weights** (4-source, 6-source, fine-tuned) are supported. See the [Convert weights](#convert-weights) section below. Demixing quality is nearly identical to PyTorch as shown in the [SDR scores doc](./.github/SDR_scores.md).
 
 ### Directory structure
 
-`src` contains the library for Demucs inference, and `cli-apps` contains two driver programs, which compile to:
+`src` contains the library for Demucs inference, and `cli-apps` contains four driver programs, which compile to:
 1. `demucs.cpp.main`: run a single model (4s, 6s, or a single fine-tuned model)
-2. `demucs_ft.cpp.main`: run all 4 fine-tuned models for `htdemucs_ft` inference, same as the BagOfModels idea of PyTorch Demucs
+1. `demucs_ft.cpp.main`: run all four fine-tuned models for `htdemucs_ft` inference, same as the BagOfModels idea of PyTorch Demucs
+1. `demucs_mt.cpp.main`: run a single model, multi-threaded
+1. `demucs_ft_mt.cpp.main`: run all four fine-tuned models, multi-threaded
 
 ### Multi-core, OpenMP, BLAS, etc.
 
@@ -21,6 +23,40 @@ If you have OpenMP and OpenBLAS installed, OpenBLAS might automatically use all
 
 See the [BLAS benchmarks doc](./.github/BLAS_benchmarks.md) for more details.
 
+### Multi-threading
+
+There are two new programs, `demucs_mt.cpp.main` and `demucs_ft_mt.cpp.main`, that use C++11 [`std::thread`](https://en.cppreference.com/w/cpp/thread/thread).
+
+In the single-threaded programs:
+
+* User supplies a waveform of length N seconds
+* Waveform is split into 7.8-second segments for Demucs inference
+* Segments are processed sequentially, where each segment inference can use >1 core with `OMP_NUM_THREADS`
+
+In the multi-threaded programs:
+* User supplies a waveform of length N seconds and a `num_threads` argument
+* Waveform is split into `num_threads` sub-waveforms (of length M < N) to process in parallel with a 0.75-second overlap
+    * We always need overlapping segments in audio applications to eliminate [boundary artifacts](https://freemusicdemixer.com/under-the-hood/2024/02/23/Demucs-segmentation#boundary-artifacts-and-the-overlap-add-method)
+* `num_threads` threads are launched to perform Demucs inference on the sub-waveforms in parallel
+* Within each thread, the sub-waveform is split into 7.8-second segments
+* Segments within a thread are still processed sequentially, where each segment inference can use >1 core with `OMP_NUM_THREADS`
+
+For the single-threaded `demucs.cpp.main`, my suggestion is `OMP_NUM_THREADS=$num_physical_cores`. On my 5950X system with 16 cores, execution time for a 4-minute song:
+```
+real    10m23.201s
+user    29m42.190s
+sys     4m17.248s
+```
+
+For the multi-threaded `demucs_mt.cpp.main`, using 4 `std::thread` and OMP threads = 4 (4x4 = 16 physical cores):
+```
+real    4m9.331s
+user    18m59.731s
+sys     3m28.465s
+```
+
+More than 2x faster for 4 threads. This is inspired by the parallelism strategy used in <https://freemusicdemixer.com>.
+
 ## Instructions
 
 ### Build C++ code
diff --git a/cli-apps/demucs_ft.cpp b/cli-apps/demucs_ft.cpp
index 0509875..7ccccf4 100644
--- a/cli-apps/demucs_ft.cpp
+++ b/cli-apps/demucs_ft.cpp
@@ -133,7 +133,6 @@ int main(int argc, const char **argv)
 
     // iterate over all files in model_dir
     // and load the model
-    std::string model_file;
     for (const auto &entry : std::filesystem::directory_iterator(model_dir))
     {
         bool ret = false;
@@ -167,6 +166,10 @@ int main(int argc, const char **argv)
             std::cout << "Loading ft model " << entry.path().string()
                       << " for vocals" << std::endl;
         }
+        else
+        {
+            continue;
+        }
 
         // debug some members of model
         std::cout << "demucs_model_load returned " << (ret ? "true" : "false")
diff --git a/cli-apps/demucs_ft_mt.cpp b/cli-apps/demucs_ft_mt.cpp
new file mode 100644
index 0000000..c50f22e
--- /dev/null
+++ b/cli-apps/demucs_ft_mt.cpp
@@ -0,0 +1,286 @@
+#include "dsp.hpp"
+#include "model.hpp"
+#include "tensor.hpp"
+#include "threaded_inference.hpp"
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <cassert>
+#include <filesystem>
+#include <iomanip>
+#include <iostream>
+#include <libnyquist/Common.h>
+#include <libnyquist/Decoders.h>
+#include <libnyquist/Encoders.h>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unsupported/Eigen/FFT>
+#include <vector>
+
+using namespace demucscpp;
+using namespace nqr;
+
+// Load an audio file from disk and return it as a 2 x N matrix
+// (row 0 = left, row 1 = right), where N is the per-channel sample count.
+// Mono input is duplicated into both rows. File properties are printed to
+// stdout.
+// The process exits with status 1 when the sample rate differs from
+// SUPPORTED_SAMPLE_RATE or when the channel count is neither 1 nor 2.
+static Eigen::MatrixXf load_audio_file(std::string filename)
+{
+    // Decode the whole input file into memory with libnyquist.
+    auto decoded = std::make_shared<AudioData>();
+    NyquistIO decoder;
+    decoder.Load(decoded.get(), filename);
+
+    // Reject any sample rate other than the single supported one.
+    if (decoded->sampleRate != demucscpp::SUPPORTED_SAMPLE_RATE)
+    {
+        std::cerr << "[ERROR] demucs.cpp only supports the following sample "
+                     "rate (Hz): "
+                  << SUPPORTED_SAMPLE_RATE << std::endl;
+        exit(1);
+    }
+
+    // Report basic properties of the decoded audio.
+    std::cout << "Input samples: "
+              << decoded->samples.size() / decoded->channelCount << std::endl;
+    std::cout << "Length in seconds: " << decoded->lengthSeconds << std::endl;
+    std::cout << "Number of channels: " << decoded->channelCount << std::endl;
+
+    // Only mono and stereo layouts can be mapped onto the stereo output.
+    const bool is_mono = decoded->channelCount == 1;
+    const bool is_stereo = decoded->channelCount == 2;
+    if (!is_mono && !is_stereo)
+    {
+        std::cerr << "[ERROR] demucs.cpp only supports mono and stereo audio"
+                  << std::endl;
+        exit(1);
+    }
+
+    // Samples per channel; the matrix has one row per output channel and
+    // one column per sample.
+    const size_t n_frames = decoded->samples.size() / decoded->channelCount;
+    Eigen::MatrixXf stereo(2, n_frames);
+
+    // Stereo data is read as interleaved pairs [2 * i, 2 * i + 1]; a mono
+    // sample [i] is written to both rows.
+    const size_t stride = is_mono ? 1 : 2;
+    const size_t right_offset = is_mono ? 0 : 1;
+    for (size_t frame = 0; frame < n_frames; ++frame)
+    {
+        const size_t base = stride * frame;
+        stereo(0, frame) = decoded->samples[base];                // left
+        stereo(1, frame) = decoded->samples[base + right_offset]; // right
+    }
+
+    return stereo;
+}
+
+// Encode a 2 x N waveform (row 0 = left, row 1 = right) as a stereo WAV
+// file (PCM_FLT, SUPPORTED_SAMPLE_RATE) at filename using libnyquist.
+// The encoder's status code is printed to stdout.
+static void write_audio_file(const Eigen::MatrixXf &waveform,
+                             std::string filename)
+{
+    const long int n_frames = waveform.cols();
+
+    // Describe the output stream: stereo at the supported sample rate.
+    auto encoded = std::make_shared<AudioData>();
+    encoded->sampleRate = SUPPORTED_SAMPLE_RATE;
+    encoded->channelCount = 2;
+
+    // Interleave both rows as L, R, L, R, ...
+    encoded->samples.resize(n_frames * 2);
+    for (long int frame = 0; frame < n_frames; ++frame)
+    {
+        const long int base = 2 * frame;
+        encoded->samples[base] = waveform(0, frame);
+        encoded->samples[base + 1] = waveform(1, frame);
+    }
+
+    // Encode to disk; the status is reported but not otherwise acted on.
+    const int status =
+        encode_wav_to_disk({encoded->channelCount, PCM_FLT, DITHER_TRIANGLE},
+                           encoded.get(), filename);
+    std::cout << "Encoder Status: " << status << std::endl;
+}
+
+// Multi-threaded driver for the fine-tuned (htdemucs_ft) set of four models.
+//
+// Usage: <program> <model dir> <wav file> <out dir> <num threads>
+//
+//   model dir    scanned for files whose path contains htdemucs_ft_drums,
+//                htdemucs_ft_bass, htdemucs_ft_other or htdemucs_ft_vocals
+//   wav file     input audio, read by load_audio_file()
+//   out dir      created if missing; receives target_<n>_<stem>.wav
+//   num threads  positive integer passed to threaded_inference()
+//
+// Exits with status 1 on a usage error, an invalid thread count, a missing
+// model directory, empty audio, a model that fails to load, or a stem
+// without a model.
+int main(int argc, const char **argv)
+{
+    if (argc != 5)
+    {
+        std::cerr << "Usage: " << argv[0]
+                  << " <model dir> <wav file> <out dir> <num threads>"
+                  << std::endl;
+        exit(1);
+    }
+
+    std::cout << "demucs_ft_mt.cpp (Multi-threaded Fine-tuned) driver program"
+              << std::endl;
+
+    // directory holding the four fine-tuned model files
+    const std::string model_dir = argv[1];
+
+    // input audio file
+    const std::string wav_file = argv[2];
+
+    // directory that receives the separated stems
+    const std::string out_dir = argv[3];
+
+    // Thread count. Parsed with a stream rather than std::stoi so that a
+    // non-numeric argument is reported as an error instead of leaving main()
+    // as an uncaught exception; values below 1 cannot split the audio.
+    int num_threads = 0;
+    std::istringstream num_threads_arg(argv[4]);
+    if (!(num_threads_arg >> num_threads) || num_threads < 1)
+    {
+        std::cerr << "Error: <num threads> must be a positive integer, got \""
+                  << argv[4] << "\"" << std::endl;
+        exit(1);
+    }
+
+    // fail with a clear message instead of a filesystem exception below
+    if (!std::filesystem::is_directory(model_dir))
+    {
+        std::cerr << "Error: model dir " << model_dir
+                  << " is not a directory" << std::endl;
+        exit(1);
+    }
+
+    Eigen::MatrixXf audio = load_audio_file(wav_file);
+    if (audio.cols() == 0)
+    {
+        std::cerr << "Error: input audio contains no samples" << std::endl;
+        exit(1);
+    }
+
+    // Stem order used throughout: index 0,1,2,3 = drums,bass,other,vocals.
+    const int nb_sources = 4;
+    const std::array<std::string, 4> stem_names = {"drums", "bass", "other",
+                                                   "vocals"};
+
+    // One fine-tuned model per stem, plus a flag recording whether a file
+    // for that stem was found and loaded.
+    std::array<struct demucs_model, 4> models = {
+        demucs_model(), demucs_model(), demucs_model(), demucs_model()};
+    std::array<bool, 4> model_loaded = {false, false, false, false};
+
+    // iterate over all files in model_dir and load every file whose path
+    // contains "htdemucs_ft_<stem>"; other entries are ignored
+    for (const auto &entry : std::filesystem::directory_iterator(model_dir))
+    {
+        const std::string model_path = entry.path().string();
+
+        // first stem (in the order above) whose tag occurs in the path
+        int stem = -1;
+        for (int candidate = 0; candidate < nb_sources; ++candidate)
+        {
+            const std::string tag = "htdemucs_ft_" + stem_names[candidate];
+            if (model_path.find(tag) != std::string::npos)
+            {
+                stem = candidate;
+                break;
+            }
+        }
+        if (stem < 0)
+        {
+            continue;
+        }
+
+        const bool ret = load_demucs_model(model_path, &models[stem]);
+        std::cout << "Loading ft model " << model_path << " for "
+                  << stem_names[stem] << std::endl;
+
+        // debug some members of model
+        std::cout << "demucs_model_load returned " << (ret ? "true" : "false")
+                  << std::endl;
+        if (!ret)
+        {
+            std::cerr << "Error loading model" << std::endl;
+            exit(1);
+        }
+        model_loaded[stem] = true;
+    }
+
+    // Every stem needs its own loaded model; stop here rather than running
+    // inference with a model that was never loaded.
+    for (int stem = 0; stem < nb_sources; ++stem)
+    {
+        if (!model_loaded[stem])
+        {
+            std::cerr << "Error: no htdemucs_ft_" << stem_names[stem]
+                      << " model found in " << model_dir << std::endl;
+            exit(1);
+        }
+    }
+
+    std::cout << "Starting Demucs fine-tuned (" << std::to_string(nb_sources)
+              << "-source) inference" << std::endl;
+
+    // prefixes that identify the model in the per-thread progress output
+    const std::array<std::string, 4> log_prefixes = {"DRUMS\t ", "BASS\t ",
+                                                     "OTHER\t ", "VOCALS\t "};
+
+    // Run the four models one after another. From each result only the stem
+    // at that model's own index is kept, so the rest is released right away.
+    Eigen::Tensor3dXf out_targets;
+    for (int stem = 0; stem < nb_sources; ++stem)
+    {
+        const Eigen::Tensor3dXf stem_targets =
+            demucscppthreaded::threaded_inference(
+                models[stem], audio, num_threads, log_prefixes[stem]);
+
+        if (stem == 0)
+        {
+            // size the combined output after the first model's result
+            out_targets = Eigen::Tensor3dXf(stem_targets.dimension(0),
+                                            stem_targets.dimension(1),
+                                            stem_targets.dimension(2));
+        }
+
+        out_targets.chip<0>(stem) = stem_targets.chip<0>(stem);
+    }
+
+    // make sure the output directory exists before writing into it
+    const std::filesystem::path out_path = out_dir;
+    std::filesystem::create_directories(out_path);
+
+    for (int target = 0; target < nb_sources; ++target)
+    {
+        // e.g. <out dir>/target_0_drums.wav
+        const std::filesystem::path p_target =
+            out_path / ("target_" + std::to_string(target) + "_" +
+                        stem_names[target] + ".wav");
+
+        std::cout << "Writing wav file " << p_target << std::endl;
+
+        // copy this target's two channels out of the output tensor
+        Eigen::MatrixXf target_waveform(2, audio.cols());
+        for (int channel = 0; channel < 2; ++channel)
+        {
+            for (int sample = 0; sample < audio.cols(); ++sample)
+            {
+                target_waveform(channel, sample) =
+                    out_targets(target, channel, sample);
+            }
+        }
+
+        write_audio_file(target_waveform, p_target.string());
+    }
+
+    return 0;
+}
diff --git a/cli-apps/demucs_mt.cpp b/cli-apps/demucs_mt.cpp
new file mode 100644
index 0000000..876dbd9
--- /dev/null
+++ b/cli-apps/demucs_mt.cpp
@@ -0,0 +1,229 @@
+#include "dsp.hpp"
+#include "model.hpp"
+#include "tensor.hpp"
+#include "threaded_inference.hpp"
+#include <Eigen/Core>
+#include <Eigen/Dense>
+#include <cassert>
+#include <filesystem>
+#include <iomanip>
+#include <iostream>
+#include <libnyquist/Common.h>
+#include <libnyquist/Decoders.h>
+#include <libnyquist/Encoders.h>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unsupported/Eigen/FFT>
+#include <vector>
+
+using namespace demucscpp;
+using namespace nqr;
+
+// Load an audio file from disk and return it as a 2 x N matrix
+// (row 0 = left, row 1 = right), where N is the per-channel sample count.
+// Mono input is duplicated into both rows. File properties are printed to
+// stdout.
+// The process exits with status 1 when the sample rate differs from
+// SUPPORTED_SAMPLE_RATE or when the channel count is neither 1 nor 2.
+static Eigen::MatrixXf load_audio_file(std::string filename)
+{
+    // Decode the whole input file into memory with libnyquist.
+    auto decoded = std::make_shared<AudioData>();
+    NyquistIO decoder;
+    decoder.Load(decoded.get(), filename);
+
+    // Reject any sample rate other than the single supported one.
+    if (decoded->sampleRate != demucscpp::SUPPORTED_SAMPLE_RATE)
+    {
+        std::cerr << "[ERROR] demucs_mt.cpp only supports the following sample "
+                     "rate (Hz): "
+                  << SUPPORTED_SAMPLE_RATE << std::endl;
+        exit(1);
+    }
+
+    // Report basic properties of the decoded audio.
+    std::cout << "Input samples: "
+              << decoded->samples.size() / decoded->channelCount << std::endl;
+    std::cout << "Length in seconds: " << decoded->lengthSeconds << std::endl;
+    std::cout << "Number of channels: " << decoded->channelCount << std::endl;
+
+    // Only mono and stereo layouts can be mapped onto the stereo output.
+    const bool is_mono = decoded->channelCount == 1;
+    const bool is_stereo = decoded->channelCount == 2;
+    if (!is_mono && !is_stereo)
+    {
+        std::cerr << "[ERROR] demucs_mt.cpp only supports mono and stereo audio"
+                  << std::endl;
+        exit(1);
+    }
+
+    // Samples per channel; the matrix has one row per output channel and
+    // one column per sample.
+    const size_t n_frames = decoded->samples.size() / decoded->channelCount;
+    Eigen::MatrixXf stereo(2, n_frames);
+
+    // Stereo data is read as interleaved pairs [2 * i, 2 * i + 1]; a mono
+    // sample [i] is written to both rows.
+    const size_t stride = is_mono ? 1 : 2;
+    const size_t right_offset = is_mono ? 0 : 1;
+    for (size_t frame = 0; frame < n_frames; ++frame)
+    {
+        const size_t base = stride * frame;
+        stereo(0, frame) = decoded->samples[base];                // left
+        stereo(1, frame) = decoded->samples[base + right_offset]; // right
+    }
+
+    return stereo;
+}
+
+// Encode a 2 x N waveform (row 0 = left, row 1 = right) as a stereo WAV
+// file (PCM_FLT, SUPPORTED_SAMPLE_RATE) at filename using libnyquist.
+// The encoder's status code is printed to stdout.
+static void write_audio_file(const Eigen::MatrixXf &waveform,
+                             std::string filename)
+{
+    const long int n_frames = waveform.cols();
+
+    // Describe the output stream: stereo at the supported sample rate.
+    auto encoded = std::make_shared<AudioData>();
+    encoded->sampleRate = SUPPORTED_SAMPLE_RATE;
+    encoded->channelCount = 2;
+
+    // Interleave both rows as L, R, L, R, ...
+    encoded->samples.resize(n_frames * 2);
+    for (long int frame = 0; frame < n_frames; ++frame)
+    {
+        const long int base = 2 * frame;
+        encoded->samples[base] = waveform(0, frame);
+        encoded->samples[base + 1] = waveform(1, frame);
+    }
+
+    // Encode to disk; the status is reported but not otherwise acted on.
+    const int status =
+        encode_wav_to_disk({encoded->channelCount, PCM_FLT, DITHER_TRIANGLE},
+                           encoded.get(), filename);
+    std::cout << "Encoder Status: " << status << std::endl;
+}
+
+// Multi-threaded driver for a single Demucs model (4-source or 6-source).
+//
+// Usage: <program> <model file> <wav file> <out dir> <num threads>
+//
+//   model file   model weights, read by load_demucs_model()
+//   wav file     input audio, read by load_audio_file()
+//   out dir      created if missing; receives target_<n>_<name>.wav
+//   num threads  positive integer passed to
+//                demucscppthreaded::threaded_inference()
+//
+// Exits with status 1 on a usage error, an invalid thread count, empty
+// audio, a model that fails to load, or an inference result with fewer
+// targets than the model type requires.
+int main(int argc, const char **argv)
+{
+    // exactly four arguments are required
+    if (argc != 5)
+    {
+        std::cerr << "Usage: " << argv[0]
+                  << " <model file> <wav file> <out dir> <num threads>"
+                  << std::endl;
+        exit(1);
+    }
+
+    std::cout << "demucs_mt.cpp (Multi-threaded) driver program" << std::endl;
+
+    // model weights file
+    const std::string model_file = argv[1];
+
+    // input audio file
+    const std::string wav_file = argv[2];
+
+    // directory that receives the separated targets
+    const std::string out_dir = argv[3];
+
+    // Thread count. Parsed with a stream rather than std::stoi so that a
+    // non-numeric argument is reported as an error instead of leaving main()
+    // as an uncaught exception. Values below 1 are rejected because the
+    // audio is split into num_threads segments.
+    int num_threads = 0;
+    std::istringstream num_threads_arg(argv[4]);
+    if (!(num_threads_arg >> num_threads) || num_threads < 1)
+    {
+        std::cerr << "Error: <num threads> must be a positive integer, got \""
+                  << argv[4] << "\"" << std::endl;
+        exit(1);
+    }
+
+    // decoded input as a 2 x N matrix (row 0 = left, row 1 = right)
+    Eigen::MatrixXf audio = load_audio_file(wav_file);
+    if (audio.cols() == 0)
+    {
+        std::cerr << "Error: input audio contains no samples" << std::endl;
+        exit(1);
+    }
+
+    // initialize a struct demucs_model
+    struct demucs_model model
+    {
+    };
+
+    // load the model weights and report the outcome
+    const bool ret = load_demucs_model(model_file, &model);
+    std::cout << "demucs_model_load returned " << (ret ? "true" : "false")
+              << std::endl;
+    if (!ret)
+    {
+        std::cerr << "Error loading model" << std::endl;
+        exit(1);
+    }
+
+    // target order: the four common targets first, then the two that are
+    // only written for a 6-source model
+    const char *const target_names[6] = {"drums",  "bass",   "other",
+                                         "vocals", "guitar", "piano"};
+    const int nb_sources = model.is_4sources ? 4 : 6;
+
+    std::cout << "Starting Demucs (" << std::to_string(nb_sources)
+              << "-source) inference" << std::endl;
+
+    // (targets, channels, samples) result for the whole input
+    const Eigen::Tensor3dXf out_targets =
+        demucscppthreaded::threaded_inference(model, audio, num_threads);
+
+    // never index past the targets that inference actually returned
+    if (out_targets.dimension(0) < nb_sources)
+    {
+        std::cerr << "Error: inference returned " << out_targets.dimension(0)
+                  << " targets, expected " << nb_sources << std::endl;
+        exit(1);
+    }
+
+    // make sure the output directory exists before writing into it
+    const std::filesystem::path out_path = out_dir;
+    std::filesystem::create_directories(out_path);
+
+    for (int target = 0; target < nb_sources; ++target)
+    {
+        // e.g. <out dir>/target_0_drums.wav
+        const std::filesystem::path p_target =
+            out_path / ("target_" + std::to_string(target) + "_" +
+                        target_names[target] + ".wav");
+
+        std::cout << "Writing wav file " << p_target << std::endl;
+
+        // copy this target's two channels out of the output tensor
+        Eigen::MatrixXf target_waveform(2, audio.cols());
+        for (int channel = 0; channel < 2; ++channel)
+        {
+            for (int sample = 0; sample < audio.cols(); ++sample)
+            {
+                target_waveform(channel, sample) =
+                    out_targets(target, channel, sample);
+            }
+        }
+
+        write_audio_file(target_waveform, p_target.string());
+    }
+
+    return 0;
+}
diff --git a/cli-apps/threaded_inference.hpp b/cli-apps/threaded_inference.hpp
new file mode 100644
index 0000000..1eafcea
--- /dev/null
+++ b/cli-apps/threaded_inference.hpp
@@ -0,0 +1,190 @@
+#include "model.hpp"
+#include "tensor.hpp"
+#include <Eigen/Dense>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+/*
+    this is a multithreaded driver program of demucs.cpp
+    which splits the input song into N segments and processes each independently
+
+    javascript code here:
+    https://github.com/sevagh/free-music-demixer/blob/main/docs/main.js#L23
+
+    also similar to src/model_apply.cpp which implements the real
+    demucs 7.8-second segmentation
+*/
+namespace demucscppthreaded
+{
+// bigger overlap from free-music-demixer
+const int SAMPLE_RATE = 44100;
+const float OVERLAP = 0.75;
+const int OVERLAP_SAMPLES = ::floorf(SAMPLE_RATE * OVERLAP);
+
+// Run Demucs inference on full_audio (2 x N, rows = left/right) in parallel.
+//
+// The input is cut into num_threads consecutive segments, each extended by
+// OVERLAP_SAMPLES on both sides. Every padded segment is passed to
+// demucscpp::demucs_inference on its own std::thread and the outputs are
+// blended into a (targets, 2, N) tensor as a weighted average. num_threads
+// is clamped to [1, N / (2 * OVERLAP_SAMPLES)] (at least 1); prefix is
+// printed in front of every progress line. The number of targets is taken
+// from the inference output, so 4- and 6-source models both work.
+inline Eigen::Tensor3dXf
+threaded_inference(const struct demucscpp::demucs_model &model,
+                   const Eigen::MatrixXf &full_audio, int num_threads,
+                   const std::string &prefix = "")
+{
+    // set output precision to 3 decimal places
+    std::cout << std::fixed << std::setprecision(3);
+
+    const int total_length = full_audio.cols();
+    if (total_length == 0)
+    {
+        // nothing to separate; avoid building zero-length segments
+        return Eigen::Tensor3dXf(model.is_4sources ? 4 : 6, 2, 0);
+    }
+
+    // The blending window indexes the ramp up to 2 * OVERLAP_SAMPLES - 1, so
+    // keep segments at least that long; never use fewer than one thread.
+    const int max_threads = std::max(1, total_length / (2 * OVERLAP_SAMPLES));
+    num_threads = std::min(std::max(1, num_threads), max_threads);
+
+    std::vector<demucscpp::ProgressCallback> cbs;
+    for (int i = 0; i < num_threads; ++i)
+    {
+        cbs.push_back(
+            [i, prefix](float progress, const std::string &log_message)
+            {
+                std::cout << prefix << "[THREAD " << i << "] (" << std::setw(3)
+                          << std::setfill(' ') << progress * 100.0f << "%) "
+                          << log_message << std::endl;
+            });
+    }
+
+    // segment length = integer ceil(total_length / num_threads), exact
+    const int segment_length = (total_length + num_threads - 1) / num_threads;
+    const int padded_length = segment_length + 2 * OVERLAP_SAMPLES;
+
+    std::vector<Eigen::MatrixXf> segments;
+    for (int i = 0; i < num_threads; ++i)
+    {
+        const int start = std::min(total_length, i * segment_length);
+        const int end = std::min(total_length, start + segment_length);
+
+        Eigen::MatrixXf segment =
+            Eigen::MatrixXf::Zero(2, end - start + 2 * OVERLAP_SAMPLES);
+
+        if (i == 0)
+        {
+            // no audio precedes the first segment: repeat the first sample
+            segment.block(0, 0, 2, OVERLAP_SAMPLES).colwise() =
+                full_audio.col(0);
+        }
+
+        // copy the segment plus the real audio available in both overlap
+        // regions, clipped to the input bounds; the rest stays zero
+        const int src_begin = std::max(0, start - OVERLAP_SAMPLES);
+        const int src_end = std::min(total_length, end + OVERLAP_SAMPLES);
+        segment.block(0, src_begin - (start - OVERLAP_SAMPLES), 2,
+                      src_end - src_begin) =
+            full_audio.block(0, src_begin, 2, src_end - src_begin);
+        segments.push_back(segment);
+    }
+
+    // one (targets, 2, samples) output per thread
+    std::vector<Eigen::Tensor3dXf> segment_outs(num_threads);
+    std::vector<std::thread> threads;
+    for (int i = 0; i < num_threads; ++i)
+    {
+        threads.emplace_back(
+            [&model, &segments, &segment_outs, i, &cbs]() {
+                segment_outs[i] =
+                    demucscpp::demucs_inference(model, segments[i], cbs[i]);
+            });
+    }
+
+    // Wait for all threads to finish
+    for (auto &thread : threads)
+    {
+        thread.join();
+    }
+
+    // take the target count from the model output instead of assuming 4
+    const int nb_targets = segment_outs[0].dimension(0);
+    Eigen::Tensor3dXf final_output(nb_targets, 2, total_length);
+    final_output.setZero();
+
+    // triangular ramp over one segment length, normalized to a peak of 1
+    Eigen::VectorXf ramp(segment_length);
+    for (int i = 0; i < segment_length; ++i)
+    {
+        ramp(i) = std::min(i + 1, segment_length - i);
+    }
+    ramp /= ramp.maxCoeff(); // Normalize the ramp
+
+    // Weight of each sample of a padded segment: ramp values over the first
+    // OVERLAP_SAMPLES and from segment_length onwards, 1 in between. Ramp
+    // indices are clamped so inputs shorter than the overlap stay in range.
+    Eigen::VectorXf window = Eigen::VectorXf::Ones(padded_length);
+    for (int j = 0; j < padded_length; ++j)
+    {
+        const bool leading = j < OVERLAP_SAMPLES;
+        if (leading || j >= segment_length)
+        {
+            const int k = leading ? j : padded_length - j - 1;
+            window(j) = ramp(std::min(k, segment_length - 1));
+        }
+    }
+
+    Eigen::VectorXf sum_weight = Eigen::VectorXf::Zero(total_length);
+    for (int i = 0; i < num_threads; ++i)
+    {
+        // visit only samples inside the output and inside this segment
+        const int offset =
+            std::min(total_length, i * segment_length) - OVERLAP_SAMPLES;
+        const int out_length = segment_outs[i].dimension(2);
+        const int j_begin = std::max(0, -offset);
+        const int j_end = std::min(std::min(padded_length, out_length),
+                                   total_length - offset);
+
+        for (int j = j_begin; j < j_end; ++j)
+        {
+            sum_weight(offset + j) += window(j);
+        }
+        for (int t = 0; t < nb_targets; ++t)
+        {
+            for (int ch = 0; ch < 2; ++ch)
+            {
+                for (int j = j_begin; j < j_end; ++j)
+                {
+                    final_output(t, ch, offset + j) +=
+                        segment_outs[i](t, ch, j) * window(j);
+                }
+            }
+        }
+    }
+
+    // turn the weighted sum into a weighted average
+    for (int t = 0; t < nb_targets; ++t)
+    {
+        for (int ch = 0; ch < 2; ++ch)
+        {
+            for (int i = 0; i < total_length; ++i)
+            {
+                if (sum_weight(i) > 0)
+                {
+                    final_output(t, ch, i) /= sum_weight(i);
+                }
+            }
+        }
+    }
+
+    return final_output;
+}
+}; // namespace demucscppthreaded
diff --git a/src/crosstransformer.cpp b/src/crosstransformer.cpp
index 59622eb..d649b95 100644
--- a/src/crosstransformer.cpp
+++ b/src/crosstransformer.cpp
@@ -76,9 +76,10 @@ static Eigen::Tensor3dXf create_sin_embedding(int length, int dim,
     return pos_emb;
 }
 
-static void my_transformer_encoder_layer(struct demucscpp::demucs_model &model,
-                                         Eigen::Tensor3dXf &x, int freq_or_time,
-                                         int weight_idx, float eps = 1e-5)
+static void
+my_transformer_encoder_layer(const struct demucscpp::demucs_model &model,
+                             Eigen::Tensor3dXf &x, int freq_or_time,
+                             int weight_idx, float eps = 1e-5)
 {
     demucscpp::common_encoder_layer(
         x, // pass x as q
@@ -135,7 +136,7 @@ static void my_transformer_encoder_layer(struct demucscpp::demucs_model &model,
 }
 
 static void
-cross_transformer_encoder_layer(struct demucscpp::demucs_model &model,
+cross_transformer_encoder_layer(const struct demucscpp::demucs_model &model,
                                 Eigen::Tensor3dXf &q,       // q = x = frequency
                                 const Eigen::Tensor3dXf &k, // k = xt = time
                                 int freq_or_time, int weight_idx,
@@ -201,12 +202,12 @@ cross_transformer_encoder_layer(struct demucscpp::demucs_model &model,
         eps);
 }
 
-void demucscpp::apply_crosstransformer(struct demucscpp::demucs_model &model,
-                                       Eigen::Tensor3dXf &x, // frequency branch
-                                       Eigen::Tensor3dXf &xt, // time branch
-                                       demucscpp::ProgressCallback cb,
-                                       float current_progress,
-                                       float segment_progress)
+void demucscpp::apply_crosstransformer(
+    const struct demucscpp::demucs_model &model,
+    Eigen::Tensor3dXf &x,  // frequency branch
+    Eigen::Tensor3dXf &xt, // time branch
+    demucscpp::ProgressCallback cb, float current_progress,
+    float segment_progress)
 {
     cb(current_progress + segment_progress * 8.0f / 26.0f,
        "Applying crosstransformer");
diff --git a/src/crosstransformer.hpp b/src/crosstransformer.hpp
index 33279e6..5085dd9 100644
--- a/src/crosstransformer.hpp
+++ b/src/crosstransformer.hpp
@@ -8,7 +8,7 @@
 namespace demucscpp
 {
 void apply_crosstransformer(
-    struct demucscpp::demucs_model &model,
+    const struct demucscpp::demucs_model &model,
     Eigen::Tensor3dXf &x,  // frequency branch
     Eigen::Tensor3dXf &xt, // time branch with leading dim (1, ...)
     ProgressCallback cb, float current_progress, float segment_progress);
diff --git a/src/encdec.cpp b/src/encdec.cpp
index 535d10e..4b4265c 100644
--- a/src/encdec.cpp
+++ b/src/encdec.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 // forward declaration to apply a frequency encoder
-void demucscpp::apply_freq_encoder(struct demucscpp::demucs_model &model,
+void demucscpp::apply_freq_encoder(const struct demucscpp::demucs_model &model,
                                    int encoder_idx,
                                    const Eigen::Tensor3dXf &x_in,
                                    Eigen::Tensor3dXf &x_out)
@@ -81,7 +81,7 @@ void demucscpp::apply_freq_encoder(struct demucscpp::demucs_model &model,
 }
 
 // forward declaration to apply a time encoder
-void demucscpp::apply_time_encoder(struct demucscpp::demucs_model &model,
+void demucscpp::apply_time_encoder(const struct demucscpp::demucs_model &model,
                                    int tencoder_idx,
                                    const Eigen::Tensor3dXf &xt_in,
                                    Eigen::Tensor3dXf &xt_out)
@@ -166,7 +166,7 @@ void demucscpp::apply_time_encoder(struct demucscpp::demucs_model &model,
 }
 
 // forward declaration to apply a frequency decoder
-void demucscpp::apply_freq_decoder(struct demucscpp::demucs_model &model,
+void demucscpp::apply_freq_decoder(const struct demucscpp::demucs_model &model,
                                    int decoder_idx,
                                    const Eigen::Tensor3dXf &x_in,
                                    Eigen::Tensor3dXf &x_out,
@@ -262,7 +262,7 @@ void demucscpp::apply_freq_decoder(struct demucscpp::demucs_model &model,
 }
 
 // forward declaration to apply a time decoder
-void demucscpp::apply_time_decoder(struct demucscpp::demucs_model &model,
+void demucscpp::apply_time_decoder(const struct demucscpp::demucs_model &model,
                                    int tdecoder_idx,
                                    const Eigen::Tensor3dXf &xt_in,
                                    Eigen::Tensor3dXf &xt_out,
diff --git a/src/encdec.hpp b/src/encdec.hpp
index 5ec5ff0..23a4fd6 100644
--- a/src/encdec.hpp
+++ b/src/encdec.hpp
@@ -7,23 +7,24 @@
 
 namespace demucscpp
 {
-void apply_freq_encoder(struct demucscpp::demucs_model &model, int encoder_idx,
-                        const Eigen::Tensor3dXf &x_in,
+void apply_freq_encoder(const struct demucscpp::demucs_model &model,
+                        int encoder_idx, const Eigen::Tensor3dXf &x_in,
                         Eigen::Tensor3dXf &x_out);
 
 // forward declaration to apply a frequency decoder
-void apply_freq_decoder(struct demucscpp::demucs_model &model, int decoder_idx,
-                        const Eigen::Tensor3dXf &x_in, Eigen::Tensor3dXf &x_out,
+void apply_freq_decoder(const struct demucscpp::demucs_model &model,
+                        int decoder_idx, const Eigen::Tensor3dXf &x_in,
+                        Eigen::Tensor3dXf &x_out,
                         const Eigen::Tensor3dXf &skip);
 
 // forward declaration to apply a time encoder
-void apply_time_encoder(struct demucscpp::demucs_model &model, int encoder_idx,
-                        const Eigen::Tensor3dXf &xt_in,
+void apply_time_encoder(const struct demucscpp::demucs_model &model,
+                        int encoder_idx, const Eigen::Tensor3dXf &xt_in,
                         Eigen::Tensor3dXf &xt_out);
 
 // forward declaration to apply a time decoder
-void apply_time_decoder(struct demucscpp::demucs_model &model, int decoder_idx,
-                        const Eigen::Tensor3dXf &xt_in,
+void apply_time_decoder(const struct demucscpp::demucs_model &model,
+                        int decoder_idx, const Eigen::Tensor3dXf &xt_in,
                         Eigen::Tensor3dXf &xt_out,
                         const Eigen::Tensor3dXf &skip);
 } // namespace demucscpp
diff --git a/src/layers.cpp b/src/layers.cpp
index 50b7f17..01239aa 100644
--- a/src/layers.cpp
+++ b/src/layers.cpp
@@ -147,7 +147,7 @@ Eigen::Tensor3dXf demucscpp::layer_norm(const Eigen::Tensor3dXf &x,
     return y_out;
 }
 
-void demucscpp::apply_dconv(struct demucscpp::demucs_model &model,
+void demucscpp::apply_dconv(const struct demucscpp::demucs_model &model,
                             Eigen::Tensor3dXf &y, int freq_idx, int encdec_idx,
                             int layer_idx, int mid_crop)
 {
diff --git a/src/layers.hpp b/src/layers.hpp
index 8b8e9c3..d84e804 100644
--- a/src/layers.hpp
+++ b/src/layers.hpp
@@ -11,8 +11,9 @@
 namespace demucscpp
 {
 
-void apply_dconv(struct demucscpp::demucs_model &model, Eigen::Tensor3dXf &y,
-                 int freq_idx, int encdec_idx, int layer_idx, int mid_crop);
+void apply_dconv(const struct demucscpp::demucs_model &model,
+                 Eigen::Tensor3dXf &y, int freq_idx, int encdec_idx,
+                 int layer_idx, int mid_crop);
 
 // used for implementing both self-attention and cross-attention
 // let's not modify the second argument
diff --git a/src/model.hpp b/src/model.hpp
index c2597a6..0e92b33 100644
--- a/src/model.hpp
+++ b/src/model.hpp
@@ -551,9 +551,6 @@ struct demucs_model
     Eigen::MatrixXf freq_emb_embedding_weight{Eigen::MatrixXf(512, 48)};
 
     std::unique_ptr<crosstransformer_base> crosstransformer;
-
-    float inference_progress;
-    float load_progress;
 };
 
 inline std::unique_ptr<crosstransformer_base>
@@ -660,11 +657,11 @@ const float MAX_SHIFT_SECS = 0.5;        // max shift
 const float OVERLAP = 0.25;              // overlap between segments
 const float TRANSITION_POWER = 1.0;      // transition between segments
 
-Eigen::Tensor3dXf demucs_inference(struct demucs_model &model,
+Eigen::Tensor3dXf demucs_inference(const struct demucs_model &model,
                                    const Eigen::MatrixXf &full_audio,
                                    ProgressCallback cb);
 
-void model_inference(struct demucs_model &model,
+void model_inference(const struct demucs_model &model,
                      struct demucscpp::demucs_segment_buffers &buffers,
                      struct demucscpp::stft_buffers &stft_buf,
                      ProgressCallback cb, float current_progress,
diff --git a/src/model_apply.cpp b/src/model_apply.cpp
index 1f9885d..bef25de 100644
--- a/src/model_apply.cpp
+++ b/src/model_apply.cpp
@@ -43,21 +43,21 @@ symmetric_zero_padding(Eigen::MatrixXf &padded, const Eigen::MatrixXf &original,
 }
 
 // forward declaration of inner fns
-static Eigen::Tensor3dXf shift_inference(struct demucscpp::demucs_model &model,
-                                         Eigen::MatrixXf &full_audio,
-                                         demucscpp::ProgressCallback cb);
+static Eigen::Tensor3dXf
+shift_inference(const struct demucscpp::demucs_model &model,
+                Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb);
 
-static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model,
-                                         Eigen::MatrixXf &full_audio,
-                                         demucscpp::ProgressCallback cb);
+static Eigen::Tensor3dXf
+split_inference(const struct demucscpp::demucs_model &model,
+                Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb);
 
 static Eigen::Tensor3dXf segment_inference(
-    struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk,
+    const struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk,
     int segment_sample, struct demucscpp::demucs_segment_buffers &buffers,
     struct demucscpp::stft_buffers &stft_buf, demucscpp::ProgressCallback cb,
     float current_progress, float segment_progress);
 
-Eigen::Tensor3dXf demucscpp::demucs_inference(struct demucs_model &model,
+Eigen::Tensor3dXf demucscpp::demucs_inference(const struct demucs_model &model,
                                               const Eigen::MatrixXf &audio,
                                               demucscpp::ProgressCallback cb)
 {
@@ -90,9 +90,9 @@ Eigen::Tensor3dXf demucscpp::demucs_inference(struct demucs_model &model,
     return waveform_outputs;
 }
 
-static Eigen::Tensor3dXf shift_inference(struct demucscpp::demucs_model &model,
-                                         Eigen::MatrixXf &full_audio,
-                                         demucscpp::ProgressCallback cb)
+static Eigen::Tensor3dXf
+shift_inference(const struct demucscpp::demucs_model &model,
+                Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb)
 {
     // first, apply shifts for time invariance
     // we simply only support shift=1, the demucs default
@@ -137,12 +137,10 @@ static Eigen::Tensor3dXf shift_inference(struct demucscpp::demucs_model &model,
     return trimmed_waveform_outputs;
 }
 
-static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model,
-                                         Eigen::MatrixXf &full_audio,
-                                         demucscpp::ProgressCallback cb)
+static Eigen::Tensor3dXf
+split_inference(const struct demucscpp::demucs_model &model,
+                Eigen::MatrixXf &full_audio, demucscpp::ProgressCallback cb)
 {
-    std::cout << "in split inference!" << std::endl;
-
     // calculate segment in samples
     int segment_samples =
         (int)(demucscpp::SEGMENT_LEN_SECS * demucscpp::SUPPORTED_SAMPLE_RATE);
@@ -186,6 +184,7 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model,
     // i prefer using `std::ceilf` but :shrug:
     int total_chunks = ::ceilf((float)length / (float)stride_samples);
     float increment_per_chunk = 1.0f / (float)total_chunks;
+    float inference_progress = 0.0f;
 
     for (int offset = 0; offset < length; offset += stride_samples)
     {
@@ -198,9 +197,9 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model,
                   << ", chunk shape: (" << chunk.rows() << ", " << chunk.cols()
                   << ")" << std::endl;
 
-        Eigen::Tensor3dXf chunk_out = segment_inference(
-            model, chunk, segment_samples, buffers, stft_buf, cb,
-            model.inference_progress, increment_per_chunk);
+        Eigen::Tensor3dXf chunk_out =
+            segment_inference(model, chunk, segment_samples, buffers, stft_buf,
+                              cb, inference_progress, increment_per_chunk);
 
         // add the weighted chunk to the output
         // out[..., offset:offset + segment] += (weight[:chunk_length] *
@@ -232,7 +231,7 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model,
             sum_weight(offset + k) += weight(k % chunk_length);
         }
 
-        model.inference_progress += increment_per_chunk;
+        inference_progress += increment_per_chunk;
     }
 
     for (int i = 0; i < nb_out_sources; ++i)
@@ -249,13 +248,11 @@ static Eigen::Tensor3dXf split_inference(struct demucscpp::demucs_model &model,
 }
 
 static Eigen::Tensor3dXf segment_inference(
-    struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk,
+    const struct demucscpp::demucs_model &model, Eigen::MatrixXf chunk,
     int segment_samples, struct demucscpp::demucs_segment_buffers &buffers,
     struct demucscpp::stft_buffers &stft_buf, demucscpp::ProgressCallback cb,
     float current_progress, float segment_progress)
 {
-    std::cout << "in segment inference!" << std::endl;
-
     int chunk_length = chunk.cols();
 
     // copy chunk into buffers.mix with symmetric zero-padding
diff --git a/src/model_inference.cpp b/src/model_inference.cpp
index 7a5278c..418b360 100644
--- a/src/model_inference.cpp
+++ b/src/model_inference.cpp
@@ -46,7 +46,7 @@ static void reflect_padding(Eigen::MatrixXf &padded_mix,
 }
 
 void demucscpp::model_inference(
-    struct demucscpp::demucs_model &model,
+    const struct demucscpp::demucs_model &model,
     struct demucscpp::demucs_segment_buffers &buffers,
     struct demucscpp::stft_buffers &stft_buf, demucscpp::ProgressCallback cb,
     float current_progress, float segment_progress)