From cd31498cdc4844a14e94ca943d7c6a12dbc8a8cf Mon Sep 17 00:00:00 2001
From: Bartosz Hanc <bartosz.hanc02@gmail.com>
Date: Tue, 7 Apr 2026 19:06:49 +0200
Subject: [PATCH 1/5] feat: add inference mutex for thread safety in VAD, Text
 Embeddings and Text-to-Image

---
 .../rnexecutorch/models/embeddings/text/TextEmbeddings.cpp  | 6 ++++++
 .../rnexecutorch/models/embeddings/text/TextEmbeddings.h    | 3 +++
 .../rnexecutorch/models/text_to_image/TextToImage.cpp       | 2 ++
 .../common/rnexecutorch/models/text_to_image/TextToImage.h  | 2 ++
 .../voice_activity_detection/VoiceActivityDetection.h       | 1 +
 5 files changed, 14 insertions(+)
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
index d645d6afa3..0c9b997914 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
@@ -35,8 +35,14 @@ TokenIdsWithAttentionMask TextEmbeddings::preprocess(const std::string &input) {
   return {.inputIds = inputIds64, .attentionMask = attentionMask};
 }
 
+void TextEmbeddings::unload() noexcept {
+  std::scoped_lock lock(generate_mutex_);
+  BaseModel::unload();
+}
+
 std::shared_ptr<OwningArrayBuffer>
 TextEmbeddings::generate(const std::string input) {
+  std::scoped_lock lock(generate_mutex_);
   auto preprocessed = preprocess(input);
 
   std::vector<int32_t> tokenIdsShape = {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
index 28dacca365..8f2b8ea72b 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "rnexecutorch/metaprogramming/ConstructorHelpers.h"
+#include <mutex>
 #include <rnexecutorch/TokenizerModule.h>
 #include <rnexecutorch/models/embeddings/BaseEmbeddings.h>
 
@@ -20,8 +21,10 @@ class TextEmbeddings final : public BaseEmbeddings {
   [[nodiscard(
       "Registered non-void function")]] std::shared_ptr<OwningArrayBuffer>
   generate(const std::string input);
+  void unload() noexcept;
 
 private:
+  mutable std::mutex generate_mutex_;
   std::vector<std::vector<int32_t>> inputShapes;
   TokenIdsWithAttentionMask preprocess(const std::string &input);
   std::unique_ptr<TokenizerModule> tokenizer;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
index e8de58b708..568f7738f8 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
@@ -58,6 +58,7 @@ std::shared_ptr<OwningArrayBuffer>
 TextToImage::generate(std::string input, int32_t imageSize,
                       size_t numInferenceSteps, int32_t seed,
                       std::shared_ptr<jsi::Function> callback) {
+  std::scoped_lock lock(generate_mutex_);
   setImageSize(imageSize);
   setSeed(seed);
 
@@ -137,6 +138,7 @@ size_t TextToImage::getMemoryLowerBound() const noexcept {
 }
 
 void TextToImage::unload() noexcept {
+  std::scoped_lock lock(generate_mutex_);
   encoder->unload();
   unet->unload();
   decoder->unload();
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
index 18316217cd..1b69bc58f4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory>
+#include <mutex>
 #include <string>
 #include <vector>
 
@@ -49,6 +50,7 @@ class TextToImage final {
   static constexpr float guidanceScale = 7.5f;
   static constexpr float latentsScale = 0.18215f;
   bool interrupted = false;
+  mutable std::mutex generate_mutex_;
 
   std::shared_ptr<react::CallInvoker> callInvoker;
   std::unique_ptr<Scheduler> scheduler;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
index c756bb6d3c..6cb39852d9 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
@@ -23,6 +23,7 @@ class VoiceActivityDetection : public BaseModel {
                          std::shared_ptr<react::CallInvoker> callInvoker);
   [[nodiscard("Registered non-void function")]] std::vector<types::Segment>
   generate(std::span<float> waveform) const;
+  void unload() noexcept;
 
   void unload() noexcept;
 

From c82605bfa5e6adc3e9a18500359398697c59c70a Mon Sep 17 00:00:00 2001
From: Bartosz Hanc <bartosz.hanc02@gmail.com>
Date: Tue, 7 Apr 2026 19:28:51 +0200
Subject: [PATCH 2/5] refactor: rename generate_mutex_ to inference_mutex_

---
 .../rnexecutorch/models/embeddings/text/TextEmbeddings.cpp    | 4 ++--
 .../rnexecutorch/models/embeddings/text/TextEmbeddings.h      | 2 +-
 .../common/rnexecutorch/models/text_to_image/TextToImage.cpp  | 4 ++--
 .../common/rnexecutorch/models/text_to_image/TextToImage.h    | 2 +-
 .../models/voice_activity_detection/VoiceActivityDetection.h  | 1 -
 5 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
index 0c9b997914..f0f4108543 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.cpp
@@ -36,13 +36,13 @@ TokenIdsWithAttentionMask TextEmbeddings::preprocess(const std::string &input) {
 }
 
 void TextEmbeddings::unload() noexcept {
-  std::scoped_lock lock(generate_mutex_);
+  std::scoped_lock lock(inference_mutex_);
   BaseModel::unload();
 }
 
 std::shared_ptr<OwningArrayBuffer>
 TextEmbeddings::generate(const std::string input) {
-  std::scoped_lock lock(generate_mutex_);
+  std::scoped_lock lock(inference_mutex_);
   auto preprocessed = preprocess(input);
 
   std::vector<int32_t> tokenIdsShape = {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
index 8f2b8ea72b..93d0988c04 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/text/TextEmbeddings.h
@@ -24,7 +24,7 @@ class TextEmbeddings final : public BaseEmbeddings {
   void unload() noexcept;
 
 private:
-  mutable std::mutex generate_mutex_;
+  mutable std::mutex inference_mutex_;
   std::vector<std::vector<int32_t>> inputShapes;
   TokenIdsWithAttentionMask preprocess(const std::string &input);
   std::unique_ptr<TokenizerModule> tokenizer;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
index 568f7738f8..22ad6f2ad8 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.cpp
@@ -58,7 +58,7 @@ std::shared_ptr<OwningArrayBuffer>
 TextToImage::generate(std::string input, int32_t imageSize,
                       size_t numInferenceSteps, int32_t seed,
                       std::shared_ptr<jsi::Function> callback) {
-  std::scoped_lock lock(generate_mutex_);
+  std::scoped_lock lock(inference_mutex_);
   setImageSize(imageSize);
   setSeed(seed);
 
@@ -138,7 +138,7 @@ size_t TextToImage::getMemoryLowerBound() const noexcept {
 }
 
 void TextToImage::unload() noexcept {
-  std::scoped_lock lock(generate_mutex_);
+  std::scoped_lock lock(inference_mutex_);
   encoder->unload();
   unet->unload();
   decoder->unload();
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
index 1b69bc58f4..e071a0c2ee 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_image/TextToImage.h
@@ -50,7 +50,7 @@ class TextToImage final {
   static constexpr float guidanceScale = 7.5f;
   static constexpr float latentsScale = 0.18215f;
   bool interrupted = false;
-  mutable std::mutex generate_mutex_;
+  mutable std::mutex inference_mutex_;
 
   std::shared_ptr<react::CallInvoker> callInvoker;
   std::unique_ptr<Scheduler> scheduler;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
index 6cb39852d9..5bdcbd9352 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
@@ -29,7 +29,6 @@ class VoiceActivityDetection : public BaseModel {
 
 private:
   mutable std::mutex inference_mutex_;
-
   std::vector<std::array<float, constants::kPaddedWindowSize>>
   preprocess(std::span<float> waveform) const;
   std::vector<types::Segment> postprocess(const std::vector<float> &scores,

From bb52a43daf1fc98ddc017e3bf0a8b0e29af64db3 Mon Sep 17 00:00:00 2001
From: Bartosz Hanc <bartosz.hanc02@gmail.com>
Date: Tue, 7 Apr 2026 20:09:07 +0200
Subject: [PATCH 3/5] fix: capture model and callInvoker by value in
 GlobalThreadPool detach

---
 .../common/rnexecutorch/host_objects/ModelHostObject.h        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index abfdb40dfc..04b0accd34 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -375,7 +375,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
             // We need to dispatch a thread if we want the function to be
             // asynchronous. In this thread all accesses to jsi::Runtime need to
             // be done via the callInvoker.
-            threads::GlobalThreadPool::detach([this, promise,
+            threads::GlobalThreadPool::detach([model = this->model,
+                                               callInvoker = this->callInvoker,
+                                               promise,
                                                argsConverted =
                                                    std::move(argsConverted)]() {
               try {

From eb00055c7e7c4a8c670e0eb566c0af6b14bfda12 Mon Sep 17 00:00:00 2001
From: Bartosz Hanc <bartosz.hanc02@gmail.com>
Date: Wed, 8 Apr 2026 11:40:11 +0200
Subject: [PATCH 4/5] refactor: remove duplicate unload() declaration in
 VoiceActivityDetection

---
 .../models/voice_activity_detection/VoiceActivityDetection.h   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
index 5bdcbd9352..4b1c1ed163 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
@@ -25,10 +25,9 @@ class VoiceActivityDetection : public BaseModel {
   generate(std::span<float> waveform) const;
   void unload() noexcept;
 
-  void unload() noexcept;
-
 private:
   mutable std::mutex inference_mutex_;
+
   std::vector<std::array<float, constants::kPaddedWindowSize>>
   preprocess(std::span<float> waveform) const;
   std::vector<types::Segment> postprocess(const std::vector<float> &scores,

From dcad33b6adde67fb5a07fc0f091a45e9f0125fcb Mon Sep 17 00:00:00 2001
From: Bartosz Hanc <bartosz.hanc02@gmail.com>
Date: Wed, 8 Apr 2026 11:40:58 +0200
Subject: [PATCH 5/5] style: add blank line before unload() declaration in
 VoiceActivityDetection

---
 .../models/voice_activity_detection/VoiceActivityDetection.h     | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
index 4b1c1ed163..c756bb6d3c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/voice_activity_detection/VoiceActivityDetection.h
@@ -23,6 +23,7 @@ class VoiceActivityDetection : public BaseModel {
                          std::shared_ptr<react::CallInvoker> callInvoker);
   [[nodiscard("Registered non-void function")]] std::vector<types::Segment>
   generate(std::span<float> waveform) const;
+
   void unload() noexcept;
 
 private: