From 37c47d412004044550e75a8f28e42a8a4b6ee5df Mon Sep 17 00:00:00 2001
From: Mergen Nachin
Date: Tue, 2 Dec 2025 15:11:24 -0800
Subject: [PATCH 1/9] [CUDA]: GPU Device Caching for Encoder Output in CUDA Backend

Summary:
In encoder-decoder models like Whisper, the encoder output tensor is used as
input to every decoder iteration, which leads to unnecessary
CPU->GPU->CPU->GPU copies.

Implemented a "keep on device" caching mechanism in the CUDA backend that:
- Caches encoder output in persistent GPU memory after the encoder runs
- Uses fast GPU-to-GPU copies during decoder iterations instead of slow
  CPU-to-GPU copies

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
---
 backends/cuda/runtime/cuda_backend.cpp | 158 ++++++++++++++++++++++++-
 extension/asr/runner/runner.cpp        |  26 ++++
 2 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 0cef859ddfb..e482cf29e29 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -16,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include

 // Include our shim layer headers
@@ -46,9 +48,27 @@
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;

+// Structure to hold cached GPU tensor data for "keep on device" optimization
+struct CachedGpuData {
+  void* data_ptr; // GPU memory pointer
+  size_t size_bytes; // Total size in bytes
+  int32_t scalar_type; // Data type
+  std::vector sizes; // Original shape
+};
+
+// Global device cache - maps name to cached GPU data
+// Using raw GPU pointers instead of tensor handles for format independence
+static std::unordered_map g_device_cache;
+
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // Cache control options (set via set_option before execute)
+  mutable int cache_output_slot_ = -1; // Which output slot to cache (-1 = none)
+  mutable std::string cache_output_name_; // Name to cache output under
+  mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none)
+  mutable std::string use_cache_input_name_; // Name of cached tensor to use
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -91,6 +111,51 @@ class ET_EXPERIMENTAL CudaBackend final
     return 1;
   }

+  Error set_option(
+      __ET_UNUSED executorch::runtime::BackendOptionContext& context,
+      const executorch::runtime::Span&
+          backend_options) override {
+    for (size_t i = 0; i < backend_options.size(); i++) {
+      const auto& option = backend_options[i];
+      // Handle cache_output: "slot:name" format (e.g., "0:encoder_output")
+      if (strcmp(option.key, "cache_output") == 0) {
+        if (auto* arr = std::get_if<
+                std::array>(
+                &option.value)) {
+          std::string val(arr->data());
+          auto colon_pos = val.find(':');
+          if (colon_pos != std::string::npos) {
+            cache_output_slot_ = std::stoi(val.substr(0, colon_pos));
+            cache_output_name_ = val.substr(colon_pos + 1);
+          }
+        }
+      }
+      // Handle use_cache_input: "slot:name" format (e.g., "1:encoder_output")
+      else if (strcmp(option.key, "use_cache_input") == 0) {
+        if (auto* arr = std::get_if<
+                std::array>(
+                &option.value)) {
+          std::string val(arr->data());
+          auto colon_pos = val.find(':');
+          if (colon_pos != std::string::npos) {
+            use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos));
+            use_cache_input_name_ = val.substr(colon_pos + 1);
+          }
+        }
+      }
+      // Handle
clear_cache_input: reset input cache settings + else if (strcmp(option.key, "clear_cache_input") == 0) { + if (auto* val = std::get_if(&option.value)) { + if (*val) { + use_cache_input_slot_ = -1; + use_cache_input_name_.clear(); + } + } + } + } + return Error::Ok; + } + // Once per loaded binary blob Result init( BackendInitContext& context, @@ -223,14 +288,14 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs); // GPU tensors for kernel output // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies + // copies. For cached inputs, use GPU-to-GPU copy instead of CPU-to-GPU. for (int i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); auto scalar_type = cpu_tensor->scalar_type(); - // Create GPU tensor with same shape + // Create GPU tensor with same shape (always needed for AOTI format) std::vector sizes_vec(sizes.begin(), sizes.end()); AOTITensorHandle gpu_input_handle; @@ -251,7 +316,43 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Copy data from CPU to GPU + // Check if this input slot should use cached GPU data + if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { + auto cache_it = g_device_cache.find(use_cache_input_name_); + if (cache_it != g_device_cache.end()) { + const CachedGpuData& cached = cache_it->second; + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + size_t numel = gpu_inputs[i]->numel(); + size_t elem_size = gpu_inputs[i]->element_size(); + size_t copy_bytes = numel * elem_size; + + ET_CHECK_OR_RETURN_ERROR( + copy_bytes == cached.size_bytes, + Internal, + "Cached tensor size mismatch: expected %zu bytes, got %zu", + copy_bytes, + cached.size_bytes); + + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + cached.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for cached input %d: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } + // Cache miss: fall through to normal CPU-to-GPU copy + } + + // Copy data from CPU to GPU (normal path) ET_CHECK_OR_RETURN_ERROR( aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, Internal, @@ -303,6 +404,57 @@ class ET_EXPERIMENTAL CudaBackend final "AOTInductorModelContainerRun failed with error code %d", error); + // Cache output GPU tensor data if requested + // We store the raw GPU pointer for later GPU-to-GPU copy + if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast(n_outputs) && + !cache_output_name_.empty()) { + auto* gpu_tensor = gpu_outputs[cache_output_slot_]; + size_t numel = gpu_tensor->numel(); + size_t elem_size = gpu_tensor->element_size(); + size_t size_bytes = numel * elem_size; + + // Allocate persistent GPU memory for the cache + void* cache_ptr = nullptr; + cudaError_t alloc_err = cudaMalloc(&cache_ptr, size_bytes); + ET_CHECK_OR_RETURN_ERROR( + alloc_err == cudaSuccess, + Internal, + "Failed to allocate GPU cache memory: %s", + cudaGetErrorString(alloc_err)); + + // Copy from tensor to cache (GPU-to-GPU) + cudaError_t copy_err = cudaMemcpy( + cache_ptr, + gpu_tensor->data_ptr(), + size_bytes, + cudaMemcpyDeviceToDevice); + ET_CHECK_OR_RETURN_ERROR( + copy_err == cudaSuccess, + Internal, + "Failed to copy output to GPU cache: %s", + cudaGetErrorString(copy_err)); + + // Free old cache if exists + auto old_it = 
g_device_cache.find(cache_output_name_); + if (old_it != g_device_cache.end()) { + cudaFree(old_it->second.data_ptr); + g_device_cache.erase(old_it); + } + + // Store in cache + CachedGpuData cached; + cached.data_ptr = cache_ptr; + cached.size_bytes = size_bytes; + cached.scalar_type = static_cast(gpu_tensor->scalar_type()); + auto sizes = gpu_tensor->sizes(); + cached.sizes.assign(sizes.begin(), sizes.end()); + g_device_cache[cache_output_name_] = std::move(cached); + + // Reset cache_output settings after caching + cache_output_slot_ = -1; + cache_output_name_.clear(); + } + // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 4f2523989c1..ebd5d29bdb3 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -196,6 +198,17 @@ Result> AsrRunner::transcribe( } } + // Tell CUDA backend to cache encoder output (slot 0) as "encoder_output" + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("cache_output", "0:encoder_output"); + auto err = + ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Failed to set cache_output option (backend may not support caching)"); + } + } + auto encoder_result = module_->execute(kEncoderMethodName, preprocessed_features); ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error()); @@ -249,6 +262,19 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(decoder_input_ptr); decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); + + // Tell CUDA backend to use cached encoder output for decoder input slot 2 + // Note: Decoder input order in AOTI is: input_ids[0], cache_position[1], encoder_output[2] + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("use_cache_input", "2:encoder_output"); + auto err = + ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Failed to set use_cache_input option (backend may not support caching)"); + } + } + // Add some green coloring for the first generated token // token_callback("\033[1;32m"); while (generated_tokens < config.max_new_tokens) { From 5da254e7e6f657e0caa6dbf896f3ebabfc71895c Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 2 Dec 2025 16:49:02 -0800 Subject: [PATCH 2/9] Fix based on Copilot recommendations --- backends/cuda/runtime/cuda_backend.cpp | 77 ++++++++++++++++++++++---- extension/asr/runner/runner.cpp | 23 +++++++- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index e482cf29e29..5cabb51c86d 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -58,8 +58,28 @@ struct CachedGpuData { // Global device cache - maps name to cached GPU data // Using raw GPU pointers instead of tensor handles for format independence +// Note: This cache is NOT thread-safe. Callers must ensure execute() is called +// from a single thread. 
static std::unordered_map g_device_cache; +// Helper function to clear all cached GPU memory +// Should be called during backend cleanup +static void clear_device_cache() { + for (auto& pair : g_device_cache) { + if (pair.second.data_ptr != nullptr) { + cudaError_t err = cudaFree(pair.second.data_ptr); + if (err != cudaSuccess) { + ET_LOG( + Warning, + "Failed to free cached GPU memory for '%s': %s", + pair.first.c_str(), + cudaGetErrorString(err)); + } + } + } + g_device_cache.clear(); +} + class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: @@ -125,8 +145,17 @@ class ET_EXPERIMENTAL CudaBackend final std::string val(arr->data()); auto colon_pos = val.find(':'); if (colon_pos != std::string::npos) { - cache_output_slot_ = std::stoi(val.substr(0, colon_pos)); - cache_output_name_ = val.substr(colon_pos + 1); + try { + cache_output_slot_ = std::stoi(val.substr(0, colon_pos)); + cache_output_name_ = val.substr(colon_pos + 1); + } catch (const std::exception& e) { + ET_LOG( + Error, + "Invalid cache_output format '%s': %s", + val.c_str(), + e.what()); + return Error::InvalidArgument; + } } } } @@ -138,8 +167,17 @@ class ET_EXPERIMENTAL CudaBackend final std::string val(arr->data()); auto colon_pos = val.find(':'); if (colon_pos != std::string::npos) { - use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos)); - use_cache_input_name_ = val.substr(colon_pos + 1); + try { + use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos)); + use_cache_input_name_ = val.substr(colon_pos + 1); + } catch (const std::exception& e) { + ET_LOG( + Error, + "Invalid use_cache_input format '%s': %s", + val.c_str(), + e.what()); + return Error::InvalidArgument; + } } } } @@ -428,16 +466,27 @@ class ET_EXPERIMENTAL CudaBackend final gpu_tensor->data_ptr(), size_bytes, cudaMemcpyDeviceToDevice); - ET_CHECK_OR_RETURN_ERROR( - copy_err == cudaSuccess, - Internal, - "Failed to copy output to GPU cache: %s", - cudaGetErrorString(copy_err)); + if (copy_err != cudaSuccess) { + // Free allocated memory before returning error + cudaFree(cache_ptr); + ET_LOG( + Error, + "Failed to copy output to GPU cache: %s", + cudaGetErrorString(copy_err)); + return Error::Internal; + } // Free old cache if exists auto old_it = g_device_cache.find(cache_output_name_); if (old_it != g_device_cache.end()) { - cudaFree(old_it->second.data_ptr); + cudaError_t free_err = cudaFree(old_it->second.data_ptr); + if (free_err != cudaSuccess) { + ET_LOG( + Warning, + "Failed to free old cached GPU memory for '%s': %s", + cache_output_name_.c_str(), + cudaGetErrorString(free_err)); + } g_device_cache.erase(old_it); } @@ -469,6 +518,11 @@ class ET_EXPERIMENTAL CudaBackend final i); } + // Note: use_cache_input settings are intentionally NOT reset here. + // They persist across execute() calls to support decoder loops that + // reuse cached encoder output. The caller should explicitly clear + // these settings using the "clear_cache_input" option when done. 
+ return Error::Ok; } @@ -478,6 +532,9 @@ class ET_EXPERIMENTAL CudaBackend final } AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + // Clear all cached GPU memory + clear_device_cache(); + // Destroy the CUDA stream if it exists if (handle->cuda_stream != nullptr) { cudaStream_t cuda_stream = static_cast(handle->cuda_stream); diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index ebd5d29bdb3..e92bfc242f7 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -263,8 +263,18 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); - // Tell CUDA backend to use cached encoder output for decoder input slot 2 - // Note: Decoder input order in AOTI is: input_ids[0], cache_position[1], encoder_output[2] + // Tell CUDA backend to use cached encoder output for decoder input slot 2. + // + // Why slot 2? AOTI reorders inputs alphabetically by name during compilation, + // so the decoder receives inputs in this order: + // slot 0: input_ids (decoder token) + // slot 1: cache_position (position in KV cache) + // slot 2: encoder_output (audio features from encoder) + // + // This differs from the order we pass them in decoder_inputs above. The + // "2:encoder_output" format tells the backend to use the GPU-cached tensor + // named "encoder_output" for AOTI input slot 2, avoiding a CPU->GPU copy + // on each of the ~100+ decoder iterations. { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("use_cache_input", "2:encoder_output"); @@ -330,6 +340,15 @@ Result> AsrRunner::transcribe( break; } } + + // Clear cache input settings after decoder loop completes + // This prevents stale cache from being used in subsequent transcribe() calls + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("clear_cache_input", true); + ::executorch::runtime::set_option("CudaBackend", opts.view()); + } + // Reset coloring // token_callback("\033[0m"); // Update stats and print report From ffbfbe7020b8b09e98e3f4afa91900bb8fad6017 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 2 Dec 2025 17:19:40 -0800 Subject: [PATCH 3/9] Remove caching --- backends/cuda/runtime/cuda_backend.cpp | 143 ++++++++++--------------- extension/asr/runner/runner.cpp | 15 ++- 2 files changed, 65 insertions(+), 93 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 5cabb51c86d..17a17883c3f 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -48,36 +48,26 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -// Structure to hold cached GPU tensor data for "keep on device" optimization -struct CachedGpuData { - void* data_ptr; // GPU memory pointer +// Structure to hold a reference to a GPU tensor for "keep on device" optimization. +// Owns the tensor handle - must be deleted when no longer needed. +struct GpuTensorRef { + AOTITensorHandle handle; // Tensor handle (owned, for later deletion) + void* data_ptr; // GPU memory pointer (for D2D copy) size_t size_bytes; // Total size in bytes - int32_t scalar_type; // Data type - std::vector sizes; // Original shape }; -// Global device cache - maps name to cached GPU data -// Using raw GPU pointers instead of tensor handles for format independence -// Note: This cache is NOT thread-safe. Callers must ensure execute() is called -// from a single thread. 
-static std::unordered_map g_device_cache; - -// Helper function to clear all cached GPU memory -// Should be called during backend cleanup -static void clear_device_cache() { - for (auto& pair : g_device_cache) { - if (pair.second.data_ptr != nullptr) { - cudaError_t err = cudaFree(pair.second.data_ptr); - if (err != cudaSuccess) { - ET_LOG( - Warning, - "Failed to free cached GPU memory for '%s': %s", - pair.first.c_str(), - cudaGetErrorString(err)); - } +// Global map of named GPU tensor references. +// Note: NOT thread-safe. Callers must ensure execute() is called from a single thread. +static std::unordered_map g_gpu_tensors; + +// Helper to clear stored GPU tensors and free their memory +static void clear_gpu_tensors() { + for (auto& pair : g_gpu_tensors) { + if (pair.second.handle != nullptr) { + aoti_torch_delete_tensor_object(pair.second.handle); } } - g_device_cache.clear(); + g_gpu_tensors.clear(); } class ET_EXPERIMENTAL CudaBackend final @@ -354,40 +344,40 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Check if this input slot should use cached GPU data + // Check if this input slot should use a stored GPU tensor if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { - auto cache_it = g_device_cache.find(use_cache_input_name_); - if (cache_it != g_device_cache.end()) { - const CachedGpuData& cached = cache_it->second; + auto it = g_gpu_tensors.find(use_cache_input_name_); + if (it != g_gpu_tensors.end()) { + const GpuTensorRef& ref = it->second; // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format size_t numel = gpu_inputs[i]->numel(); size_t elem_size = gpu_inputs[i]->element_size(); size_t copy_bytes = numel * elem_size; ET_CHECK_OR_RETURN_ERROR( - copy_bytes == cached.size_bytes, + copy_bytes == ref.size_bytes, Internal, - "Cached tensor size mismatch: expected %zu bytes, got %zu", + "Stored tensor size mismatch: expected %zu bytes, got %zu", copy_bytes, - cached.size_bytes); + ref.size_bytes); cudaError_t cuda_err = cudaMemcpy( gpu_inputs[i]->data_ptr(), - cached.data_ptr, + ref.data_ptr, copy_bytes, cudaMemcpyDeviceToDevice); ET_CHECK_OR_RETURN_ERROR( cuda_err == cudaSuccess, Internal, - "Failed GPU-to-GPU copy for cached input %d: %s", + "Failed GPU-to-GPU copy for input %d: %s", i, cudaGetErrorString(cuda_err)); // Skip the CPU-to-GPU copy below continue; } - // Cache miss: fall through to normal CPU-to-GPU copy + // Not found: fall through to normal CPU-to-GPU copy } // Copy data from CPU to GPU (normal path) @@ -442,8 +432,8 @@ class ET_EXPERIMENTAL CudaBackend final "AOTInductorModelContainerRun failed with error code %d", error); - // Cache output GPU tensor data if requested - // We store the raw GPU pointer for later GPU-to-GPU copy + // Store reference to output GPU tensor if requested. + // The tensor will be kept alive for later D2D copy to decoder inputs. 
if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast(n_outputs) && !cache_output_name_.empty()) { auto* gpu_tensor = gpu_outputs[cache_output_slot_]; @@ -451,53 +441,18 @@ class ET_EXPERIMENTAL CudaBackend final size_t elem_size = gpu_tensor->element_size(); size_t size_bytes = numel * elem_size; - // Allocate persistent GPU memory for the cache - void* cache_ptr = nullptr; - cudaError_t alloc_err = cudaMalloc(&cache_ptr, size_bytes); - ET_CHECK_OR_RETURN_ERROR( - alloc_err == cudaSuccess, - Internal, - "Failed to allocate GPU cache memory: %s", - cudaGetErrorString(alloc_err)); - - // Copy from tensor to cache (GPU-to-GPU) - cudaError_t copy_err = cudaMemcpy( - cache_ptr, - gpu_tensor->data_ptr(), - size_bytes, - cudaMemcpyDeviceToDevice); - if (copy_err != cudaSuccess) { - // Free allocated memory before returning error - cudaFree(cache_ptr); - ET_LOG( - Error, - "Failed to copy output to GPU cache: %s", - cudaGetErrorString(copy_err)); - return Error::Internal; - } - - // Free old cache if exists - auto old_it = g_device_cache.find(cache_output_name_); - if (old_it != g_device_cache.end()) { - cudaError_t free_err = cudaFree(old_it->second.data_ptr); - if (free_err != cudaSuccess) { - ET_LOG( - Warning, - "Failed to free old cached GPU memory for '%s': %s", - cache_output_name_.c_str(), - cudaGetErrorString(free_err)); - } - g_device_cache.erase(old_it); + // Delete old tensor if overwriting + auto old_it = g_gpu_tensors.find(cache_output_name_); + if (old_it != g_gpu_tensors.end() && old_it->second.handle != nullptr) { + aoti_torch_delete_tensor_object(old_it->second.handle); } - // Store in cache - CachedGpuData cached; - cached.data_ptr = cache_ptr; - cached.size_bytes = size_bytes; - cached.scalar_type = static_cast(gpu_tensor->scalar_type()); - auto sizes = gpu_tensor->sizes(); - cached.sizes.assign(sizes.begin(), sizes.end()); - g_device_cache[cache_output_name_] = std::move(cached); + // Store tensor reference (we now own this tensor) + GpuTensorRef ref; + ref.handle = gpu_tensor; + ref.data_ptr = gpu_tensor->data_ptr(); + ref.size_bytes = size_bytes; + g_gpu_tensors[cache_output_name_] = ref; // Reset cache_output settings after caching cache_output_slot_ = -1; @@ -523,6 +478,26 @@ class ET_EXPERIMENTAL CudaBackend final // reuse cached encoder output. The caller should explicitly clear // these settings using the "clear_cache_input" option when done. + // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. + // Input tensors are no longer needed after AOTI execution. + for (size_t i = 0; i < n_inputs; i++) { + aoti_torch_delete_tensor_object(gpu_inputs[i]); + } + // Output tensors are no longer needed after copying to CPU, + // EXCEPT for tensors stored in g_gpu_tensors (for later D2D copy). 
+ for (size_t i = 0; i < n_outputs; i++) { + bool is_stored = false; + for (const auto& pair : g_gpu_tensors) { + if (pair.second.handle == gpu_outputs[i]) { + is_stored = true; + break; + } + } + if (!is_stored) { + aoti_torch_delete_tensor_object(gpu_outputs[i]); + } + } + return Error::Ok; } @@ -532,8 +507,8 @@ class ET_EXPERIMENTAL CudaBackend final } AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - // Clear all cached GPU memory - clear_device_cache(); + // Delete stored GPU tensors + clear_gpu_tensors(); // Destroy the CUDA stream if it exists if (handle->cuda_stream != nullptr) { diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index e92bfc242f7..b8df8e71db0 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -265,16 +265,13 @@ Result> AsrRunner::transcribe( // Tell CUDA backend to use cached encoder output for decoder input slot 2. // - // Why slot 2? AOTI reorders inputs alphabetically by name during compilation, - // so the decoder receives inputs in this order: - // slot 0: input_ids (decoder token) - // slot 1: cache_position (position in KV cache) - // slot 2: encoder_output (audio features from encoder) + // Why slot 2? The AOTI-compiled decoder receives inputs in a different order + // than we pass them in decoder_inputs above. The AOTI input order was + // determined empirically by examining tensor shapes during execution. // - // This differs from the order we pass them in decoder_inputs above. The - // "2:encoder_output" format tells the backend to use the GPU-cached tensor - // named "encoder_output" for AOTI input slot 2, avoiding a CPU->GPU copy - // on each of the ~100+ decoder iterations. + // The "2:encoder_output" format tells the backend to use the stored GPU + // tensor named "encoder_output" for AOTI input slot 2. This avoids redundant + // CPU->GPU copies on each decoder iteration. { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("use_cache_input", "2:encoder_output"); From 03d27e7af9e3e6a5126c6c32d48d352118edbe96 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 05:55:19 -0800 Subject: [PATCH 4/9] Address Copilot comments --- backends/cuda/runtime/cuda_backend.cpp | 178 +++++++++++++++++-------- extension/asr/runner/runner.cpp | 8 +- 2 files changed, 128 insertions(+), 58 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 17a17883c3f..efaefd19c99 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -56,18 +57,40 @@ struct GpuTensorRef { size_t size_bytes; // Total size in bytes }; -// Global map of named GPU tensor references. -// Note: NOT thread-safe. Callers must ensure execute() is called from a single thread. -static std::unordered_map g_gpu_tensors; +// Parses "slot:name" format string. Returns true on success. +// Uses character-by-character parsing to avoid std::stoi exceptions. 
+static bool parse_slot_name( + const std::string& val, + int& out_slot, + std::string& out_name) { + auto colon_pos = val.find(':'); + if (colon_pos == std::string::npos || colon_pos == 0) { + return false; + } -// Helper to clear stored GPU tensors and free their memory -static void clear_gpu_tensors() { - for (auto& pair : g_gpu_tensors) { - if (pair.second.handle != nullptr) { - aoti_torch_delete_tensor_object(pair.second.handle); + // Parse slot number manually to avoid exceptions + int slot = 0; + for (size_t i = 0; i < colon_pos; i++) { + char c = val[i]; + if (c < '0' || c > '9') { + return false; // Non-digit character + } + int digit = c - '0'; + // Check for overflow + if (slot > (INT_MAX - digit) / 10) { + return false; } + slot = slot * 10 + digit; } - g_gpu_tensors.clear(); + + std::string name = val.substr(colon_pos + 1); + if (name.empty()) { + return false; + } + + out_slot = slot; + out_name = std::move(name); + return true; } class ET_EXPERIMENTAL CudaBackend final @@ -79,6 +102,29 @@ class ET_EXPERIMENTAL CudaBackend final mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none) mutable std::string use_cache_input_name_; // Name of cached tensor to use + // Per-instance map of named GPU tensor references. + // Mutable because execute() is const but needs to modify this. + // + // LIFETIME CONTRACT: + // - Stored tensors are valid until overwritten or destroy() is called. + // - Caller must ensure the producing execute() call (e.g., encoder) completes + // before any consuming execute() call (e.g., decoder) begins. + // - Caller must not call destroy() while execute() is in progress. + // - Overwriting a tensor (same cache name) deletes the old tensor immediately, + // so caller must ensure no concurrent execute() is using it. + mutable std::unordered_map gpu_tensors_; + + // Helper to clear stored GPU tensors and free their memory. + // Only call when no execute() is in progress. 
+ void clear_gpu_tensors() const { + for (auto& pair : gpu_tensors_) { + if (pair.second.handle != nullptr) { + aoti_torch_delete_tensor_object(pair.second.handle); + } + } + gpu_tensors_.clear(); + } + Error load_function_pointers_into_handle( void* so_handle, AOTIDelegateHandle* handle) const { @@ -133,46 +179,38 @@ class ET_EXPERIMENTAL CudaBackend final std::array>( &option.value)) { std::string val(arr->data()); - auto colon_pos = val.find(':'); - if (colon_pos != std::string::npos) { - try { - cache_output_slot_ = std::stoi(val.substr(0, colon_pos)); - cache_output_name_ = val.substr(colon_pos + 1); - } catch (const std::exception& e) { - ET_LOG( - Error, - "Invalid cache_output format '%s': %s", - val.c_str(), - e.what()); - return Error::InvalidArgument; - } + int slot; + std::string name; + if (parse_slot_name(val, slot, name)) { + cache_output_slot_ = slot; + cache_output_name_ = std::move(name); + } else { + ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str()); + return Error::InvalidArgument; } } } - // Handle use_cache_input: "slot:name" format (e.g., "1:encoder_output") + // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output") else if (strcmp(option.key, "use_cache_input") == 0) { if (auto* arr = std::get_if< std::array>( &option.value)) { std::string val(arr->data()); - auto colon_pos = val.find(':'); - if (colon_pos != std::string::npos) { - try { - use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos)); - use_cache_input_name_ = val.substr(colon_pos + 1); - } catch (const std::exception& e) { - ET_LOG( - Error, - "Invalid use_cache_input format '%s': %s", - val.c_str(), - e.what()); - return Error::InvalidArgument; - } + int slot; + std::string name; + if (parse_slot_name(val, slot, name)) { + use_cache_input_slot_ = slot; + use_cache_input_name_ = std::move(name); + } else { + ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str()); + return Error::InvalidArgument; } } } - // Handle clear_cache_input: reset input cache settings - else if (strcmp(option.key, "clear_cache_input") == 0) { + // Handle reset_cache_input: disable cache input for subsequent execute() calls. + // Note: This only resets the slot/name settings. The stored GPU tensor + // remains in memory until overwritten or destroy() is called. 
+ else if (strcmp(option.key, "reset_cache_input") == 0) { if (auto* val = std::get_if(&option.value)) { if (*val) { use_cache_input_slot_ = -1; @@ -308,6 +346,28 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs, args.size()) + // Validate cache slot indices if set + if (use_cache_input_slot_ >= 0 && + use_cache_input_slot_ >= static_cast(n_inputs)) { + ET_LOG( + Warning, + "use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring", + use_cache_input_slot_, + n_inputs); + use_cache_input_slot_ = -1; + use_cache_input_name_.clear(); + } + if (cache_output_slot_ >= 0 && + cache_output_slot_ >= static_cast(n_outputs)) { + ET_LOG( + Warning, + "cache_output slot %d is out of bounds (n_outputs=%zu), ignoring", + cache_output_slot_, + n_outputs); + cache_output_slot_ = -1; + cache_output_name_.clear(); + } + // NOTE: ExecuTorch tensors are always on CPU/host memory // We need to create GPU copies for CUDA kernel execution std::vector gpu_inputs( @@ -346,8 +406,8 @@ class ET_EXPERIMENTAL CudaBackend final // Check if this input slot should use a stored GPU tensor if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { - auto it = g_gpu_tensors.find(use_cache_input_name_); - if (it != g_gpu_tensors.end()) { + auto it = gpu_tensors_.find(use_cache_input_name_); + if (it != gpu_tensors_.end()) { const GpuTensorRef& ref = it->second; // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format size_t numel = gpu_inputs[i]->numel(); @@ -358,8 +418,8 @@ class ET_EXPERIMENTAL CudaBackend final copy_bytes == ref.size_bytes, Internal, "Stored tensor size mismatch: expected %zu bytes, got %zu", - copy_bytes, - ref.size_bytes); + ref.size_bytes, + copy_bytes); cudaError_t cuda_err = cudaMemcpy( gpu_inputs[i]->data_ptr(), @@ -434,17 +494,21 @@ class ET_EXPERIMENTAL CudaBackend final // Store reference to output GPU tensor if requested. // The tensor will be kept alive for later D2D copy to decoder inputs. - if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast(n_outputs) && - !cache_output_name_.empty()) { + // (Bounds already validated at start of execute()) + if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) { auto* gpu_tensor = gpu_outputs[cache_output_slot_]; size_t numel = gpu_tensor->numel(); size_t elem_size = gpu_tensor->element_size(); size_t size_bytes = numel * elem_size; - // Delete old tensor if overwriting - auto old_it = g_gpu_tensors.find(cache_output_name_); - if (old_it != g_gpu_tensors.end() && old_it->second.handle != nullptr) { - aoti_torch_delete_tensor_object(old_it->second.handle); + // Delete old tensor if overwriting (erase first to prevent double-free) + auto old_it = gpu_tensors_.find(cache_output_name_); + if (old_it != gpu_tensors_.end()) { + AOTITensorHandle old_handle = old_it->second.handle; + gpu_tensors_.erase(old_it); // Remove from map before deleting + if (old_handle != nullptr) { + aoti_torch_delete_tensor_object(old_handle); + } } // Store tensor reference (we now own this tensor) @@ -452,7 +516,7 @@ class ET_EXPERIMENTAL CudaBackend final ref.handle = gpu_tensor; ref.data_ptr = gpu_tensor->data_ptr(); ref.size_bytes = size_bytes; - g_gpu_tensors[cache_output_name_] = ref; + gpu_tensors_[cache_output_name_] = ref; // Reset cache_output settings after caching cache_output_slot_ = -1; @@ -473,10 +537,14 @@ class ET_EXPERIMENTAL CudaBackend final i); } - // Note: use_cache_input settings are intentionally NOT reset here. - // They persist across execute() calls to support decoder loops that - // reuse cached encoder output. 
The caller should explicitly clear - // these settings using the "clear_cache_input" option when done. + // Memory management notes: + // - use_cache_input settings persist across execute() calls to support + // decoder loops that reuse the stored encoder output. + // - Stored GPU tensors (in gpu_tensors_) remain in memory until: + // (a) overwritten by a new tensor with the same name, or + // (b) destroy() is called, which frees all stored tensors. + // - The "reset_cache_input" option only resets the input slot/name settings, + // NOT the stored GPU tensors themselves. // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. // Input tensors are no longer needed after AOTI execution. @@ -484,10 +552,10 @@ class ET_EXPERIMENTAL CudaBackend final aoti_torch_delete_tensor_object(gpu_inputs[i]); } // Output tensors are no longer needed after copying to CPU, - // EXCEPT for tensors stored in g_gpu_tensors (for later D2D copy). + // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy). for (size_t i = 0; i < n_outputs; i++) { bool is_stored = false; - for (const auto& pair : g_gpu_tensors) { + for (const auto& pair : gpu_tensors_) { if (pair.second.handle == gpu_outputs[i]) { is_stored = true; break; diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index b8df8e71db0..af921b7a540 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -338,11 +338,13 @@ Result> AsrRunner::transcribe( } } - // Clear cache input settings after decoder loop completes - // This prevents stale cache from being used in subsequent transcribe() calls + // Reset cache input settings after decoder loop completes. + // This disables the D2D copy optimization for subsequent execute() calls. + // Note: The stored GPU tensor remains in memory until the next encoder run + // (which overwrites it) or until the backend is destroyed. { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("clear_cache_input", true); + opts.set_option("reset_cache_input", true); ::executorch::runtime::set_option("CudaBackend", opts.view()); } From e535aeece17bc265371c0f75fd9e7ba952750706 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 06:14:54 -0800 Subject: [PATCH 5/9] More simplification --- backends/cuda/runtime/cuda_backend.cpp | 184 ++++++++----------------- extension/asr/runner/runner.cpp | 31 ++--- 2 files changed, 68 insertions(+), 147 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index efaefd19c99..58820bb7b69 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -57,50 +57,12 @@ struct GpuTensorRef { size_t size_bytes; // Total size in bytes }; -// Parses "slot:name" format string. Returns true on success. -// Uses character-by-character parsing to avoid std::stoi exceptions. 
-static bool parse_slot_name( - const std::string& val, - int& out_slot, - std::string& out_name) { - auto colon_pos = val.find(':'); - if (colon_pos == std::string::npos || colon_pos == 0) { - return false; - } - - // Parse slot number manually to avoid exceptions - int slot = 0; - for (size_t i = 0; i < colon_pos; i++) { - char c = val[i]; - if (c < '0' || c > '9') { - return false; // Non-digit character - } - int digit = c - '0'; - // Check for overflow - if (slot > (INT_MAX - digit) / 10) { - return false; - } - slot = slot * 10 + digit; - } - - std::string name = val.substr(colon_pos + 1); - if (name.empty()) { - return false; - } - - out_slot = slot; - out_name = std::move(name); - return true; -} - class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: - // Cache control options (set via set_option before execute) - mutable int cache_output_slot_ = -1; // Which output slot to cache (-1 = none) - mutable std::string cache_output_name_; // Name to cache output under - mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none) - mutable std::string use_cache_input_name_; // Name of cached tensor to use + // Storage control options (set via set_option before execute) + mutable std::string store_output_name_; // Name to store output under (empty = none) + mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none) // Per-instance map of named GPU tensor references. // Mutable because execute() is const but needs to modify this. @@ -110,7 +72,7 @@ class ET_EXPERIMENTAL CudaBackend final // - Caller must ensure the producing execute() call (e.g., encoder) completes // before any consuming execute() call (e.g., decoder) begins. // - Caller must not call destroy() while execute() is in progress. - // - Overwriting a tensor (same cache name) deletes the old tensor immediately, + // - Overwriting a tensor (same name) deletes the old tensor immediately, // so caller must ensure no concurrent execute() is using it. 
mutable std::unordered_map gpu_tensors_; @@ -173,49 +135,39 @@ class ET_EXPERIMENTAL CudaBackend final backend_options) override { for (size_t i = 0; i < backend_options.size(); i++) { const auto& option = backend_options[i]; - // Handle cache_output: "slot:name" format (e.g., "0:encoder_output") - if (strcmp(option.key, "cache_output") == 0) { + // Handle store_output: expects a string name (e.g., "encoder_output") + if (strcmp(option.key, "store_output") == 0) { if (auto* arr = std::get_if< std::array>( &option.value)) { - std::string val(arr->data()); - int slot; - std::string name; - if (parse_slot_name(val, slot, name)) { - cache_output_slot_ = slot; - cache_output_name_ = std::move(name); - } else { - ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str()); - return Error::InvalidArgument; - } + store_output_name_ = std::string(arr->data()); + } else { + ET_LOG(Warning, "store_output option expects a string value"); + return Error::InvalidArgument; } } - // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output") - else if (strcmp(option.key, "use_cache_input") == 0) { + // Handle use_stored_input: expects a string name (e.g., "encoder_output") + else if (strcmp(option.key, "use_stored_input") == 0) { if (auto* arr = std::get_if< std::array>( &option.value)) { - std::string val(arr->data()); - int slot; - std::string name; - if (parse_slot_name(val, slot, name)) { - use_cache_input_slot_ = slot; - use_cache_input_name_ = std::move(name); - } else { - ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str()); - return Error::InvalidArgument; - } + use_stored_input_name_ = std::string(arr->data()); + } else { + ET_LOG(Warning, "use_stored_input option expects a string value"); + return Error::InvalidArgument; } } - // Handle reset_cache_input: disable cache input for subsequent execute() calls. - // Note: This only resets the slot/name settings. The stored GPU tensor + // Handle reset_stored_input: expects a boolean value + // Note: This only resets the name setting. The stored GPU tensor // remains in memory until overwritten or destroy() is called. 
- else if (strcmp(option.key, "reset_cache_input") == 0) { + else if (strcmp(option.key, "reset_stored_input") == 0) { if (auto* val = std::get_if(&option.value)) { if (*val) { - use_cache_input_slot_ = -1; - use_cache_input_name_.clear(); + use_stored_input_name_.clear(); } + } else { + ET_LOG(Warning, "reset_stored_input option expects a boolean value"); + return Error::InvalidArgument; } } } @@ -346,28 +298,6 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs, args.size()) - // Validate cache slot indices if set - if (use_cache_input_slot_ >= 0 && - use_cache_input_slot_ >= static_cast(n_inputs)) { - ET_LOG( - Warning, - "use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring", - use_cache_input_slot_, - n_inputs); - use_cache_input_slot_ = -1; - use_cache_input_name_.clear(); - } - if (cache_output_slot_ >= 0 && - cache_output_slot_ >= static_cast(n_outputs)) { - ET_LOG( - Warning, - "cache_output slot %d is out of bounds (n_outputs=%zu), ignoring", - cache_output_slot_, - n_outputs); - cache_output_slot_ = -1; - cache_output_name_.clear(); - } - // NOTE: ExecuTorch tensors are always on CPU/host memory // We need to create GPU copies for CUDA kernel execution std::vector gpu_inputs( @@ -404,40 +334,35 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Check if this input slot should use a stored GPU tensor - if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { - auto it = gpu_tensors_.find(use_cache_input_name_); + // Check if this input matches a stored GPU tensor (by size) + if (!use_stored_input_name_.empty()) { + auto it = gpu_tensors_.find(use_stored_input_name_); if (it != gpu_tensors_.end()) { const GpuTensorRef& ref = it->second; - // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format size_t numel = gpu_inputs[i]->numel(); size_t elem_size = gpu_inputs[i]->element_size(); size_t copy_bytes = numel * elem_size; - ET_CHECK_OR_RETURN_ERROR( - copy_bytes == ref.size_bytes, - Internal, - "Stored tensor size mismatch: expected %zu bytes, got %zu", - ref.size_bytes, - copy_bytes); - - cudaError_t cuda_err = cudaMemcpy( - gpu_inputs[i]->data_ptr(), - ref.data_ptr, - copy_bytes, - cudaMemcpyDeviceToDevice); - - ET_CHECK_OR_RETURN_ERROR( - cuda_err == cudaSuccess, - Internal, - "Failed GPU-to-GPU copy for input %d: %s", - i, - cudaGetErrorString(cuda_err)); - - // Skip the CPU-to-GPU copy below - continue; + // Match by size: use stored tensor if sizes match + if (copy_bytes == ref.size_bytes) { + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + ref.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for input %d: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } } - // Not found: fall through to normal CPU-to-GPU copy } // Copy data from CPU to GPU (normal path) @@ -493,16 +418,16 @@ class ET_EXPERIMENTAL CudaBackend final error); // Store reference to output GPU tensor if requested. + // Always uses gpu_outputs[0] (encoder has single output). // The tensor will be kept alive for later D2D copy to decoder inputs. 
- // (Bounds already validated at start of execute()) - if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) { - auto* gpu_tensor = gpu_outputs[cache_output_slot_]; + if (!store_output_name_.empty() && n_outputs > 0) { + auto* gpu_tensor = gpu_outputs[0]; size_t numel = gpu_tensor->numel(); size_t elem_size = gpu_tensor->element_size(); size_t size_bytes = numel * elem_size; // Delete old tensor if overwriting (erase first to prevent double-free) - auto old_it = gpu_tensors_.find(cache_output_name_); + auto old_it = gpu_tensors_.find(store_output_name_); if (old_it != gpu_tensors_.end()) { AOTITensorHandle old_handle = old_it->second.handle; gpu_tensors_.erase(old_it); // Remove from map before deleting @@ -516,11 +441,10 @@ class ET_EXPERIMENTAL CudaBackend final ref.handle = gpu_tensor; ref.data_ptr = gpu_tensor->data_ptr(); ref.size_bytes = size_bytes; - gpu_tensors_[cache_output_name_] = ref; + gpu_tensors_[store_output_name_] = ref; - // Reset cache_output settings after caching - cache_output_slot_ = -1; - cache_output_name_.clear(); + // Reset store_output name after storing + store_output_name_.clear(); } // Copy GPU output results back to CPU output tensors @@ -538,12 +462,12 @@ class ET_EXPERIMENTAL CudaBackend final } // Memory management notes: - // - use_cache_input settings persist across execute() calls to support + // - use_stored_input setting persists across execute() calls to support // decoder loops that reuse the stored encoder output. // - Stored GPU tensors (in gpu_tensors_) remain in memory until: // (a) overwritten by a new tensor with the same name, or // (b) destroy() is called, which frees all stored tensors. - // - The "reset_cache_input" option only resets the input slot/name settings, + // - The "reset_stored_input" option only resets the input name setting, // NOT the stored GPU tensors themselves. // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index af921b7a540..05b4c7e437e 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -198,14 +198,14 @@ Result> AsrRunner::transcribe( } } - // Tell CUDA backend to cache encoder output (slot 0) as "encoder_output" + // Tell CUDA backend to store encoder output as "encoder_output" { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("cache_output", "0:encoder_output"); + opts.set_option("store_output", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Info, "Failed to set cache_output option (backend may not support caching)"); + ET_LOG(Warning, "Failed to set store_output option (backend may not support storage)"); } } @@ -263,22 +263,15 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); - // Tell CUDA backend to use cached encoder output for decoder input slot 2. - // - // Why slot 2? The AOTI-compiled decoder receives inputs in a different order - // than we pass them in decoder_inputs above. The AOTI input order was - // determined empirically by examining tensor shapes during execution. - // - // The "2:encoder_output" format tells the backend to use the stored GPU - // tensor named "encoder_output" for AOTI input slot 2. This avoids redundant - // CPU->GPU copies on each decoder iteration. + // Tell CUDA backend to use stored encoder output for matching decoder inputs. 
+ // The backend matches by tensor size, avoiding redundant CPU->GPU copies. { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("use_cache_input", "2:encoder_output"); + opts.set_option("use_stored_input", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Info, "Failed to set use_cache_input option (backend may not support caching)"); + ET_LOG(Warning, "Failed to set use_stored_input option (backend may not support storage)"); } } @@ -338,14 +331,18 @@ Result> AsrRunner::transcribe( } } - // Reset cache input settings after decoder loop completes. + // Reset stored input settings after decoder loop completes. // This disables the D2D copy optimization for subsequent execute() calls. // Note: The stored GPU tensor remains in memory until the next encoder run // (which overwrites it) or until the backend is destroyed. { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("reset_cache_input", true); - ::executorch::runtime::set_option("CudaBackend", opts.view()); + opts.set_option("reset_stored_input", true); + auto err = + ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Warning, "Failed to set reset_stored_input option"); + } } // Reset coloring From 9e1a3cc719df006a2f145703e98db39121e68643 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 06:24:16 -0800 Subject: [PATCH 6/9] RAII --- backends/cuda/runtime/cuda_backend.cpp | 96 +++++++++++++++++--------- extension/asr/runner/runner.cpp | 17 ++--- 2 files changed, 72 insertions(+), 41 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 58820bb7b69..750a8a85795 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -49,20 +48,22 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -// Structure to hold a reference to a GPU tensor for "keep on device" optimization. -// Owns the tensor handle - must be deleted when no longer needed. +// Structure to hold a reference to a GPU tensor for "keep on device" +// optimization. Owns the tensor handle - must be deleted when no longer needed. struct GpuTensorRef { - AOTITensorHandle handle; // Tensor handle (owned, for later deletion) - void* data_ptr; // GPU memory pointer (for D2D copy) - size_t size_bytes; // Total size in bytes + AOTITensorHandle handle; // Tensor handle (owned, for later deletion) + void* data_ptr; // GPU memory pointer (for D2D copy) + size_t size_bytes; // Total size in bytes }; class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: // Storage control options (set via set_option before execute) - mutable std::string store_output_name_; // Name to store output under (empty = none) - mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none) + mutable std::string + store_output_name_; // Name to store output under (empty = none) + mutable std::string + use_stored_input_name_; // Name of stored tensor to use (empty = none) // Per-instance map of named GPU tensor references. // Mutable because execute() is const but needs to modify this. 
@@ -305,8 +306,41 @@ class ET_EXPERIMENTAL CudaBackend final std::vector gpu_outputs( n_outputs); // GPU tensors for kernel output + // RAII helper to ensure GPU tensors are cleaned up on all exit paths. + // Prevents memory leaks when errors occur during execute(). + struct TensorCleanup { + std::vector& inputs; + std::vector& outputs; + const std::unordered_map& stored_tensors; + + ~TensorCleanup() { + // Clean up input tensors + for (auto* handle : inputs) { + if (handle != nullptr) { + aoti_torch_delete_tensor_object(handle); + } + } + // Clean up output tensors, except those that are stored + for (auto* handle : outputs) { + if (handle != nullptr) { + bool is_stored = false; + for (const auto& pair : stored_tensors) { + if (pair.second.handle == handle) { + is_stored = true; + break; + } + } + if (!is_stored) { + aoti_torch_delete_tensor_object(handle); + } + } + } + } + }; + TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_}; + // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies. For cached inputs, use GPU-to-GPU copy instead of CPU-to-GPU. + // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. for (int i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); @@ -334,7 +368,10 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Check if this input matches a stored GPU tensor (by size) + // Check if this input matches a stored GPU tensor (by size). + // Note: Size-based matching assumes only one input will match. If + // multiple inputs have the same byte size as the stored tensor, the first + // match wins. if (!use_stored_input_name_.empty()) { auto it = gpu_tensors_.find(use_stored_input_name_); if (it != gpu_tensors_.end()) { @@ -345,6 +382,13 @@ class ET_EXPERIMENTAL CudaBackend final // Match by size: use stored tensor if sizes match if (copy_bytes == ref.size_bytes) { + ET_LOG( + Debug, + "Using stored tensor '%s' for input %d (%zu bytes, D2D copy)", + use_stored_input_name_.c_str(), + i, + copy_bytes); + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format cudaError_t cuda_err = cudaMemcpy( gpu_inputs[i]->data_ptr(), @@ -418,9 +462,14 @@ class ET_EXPERIMENTAL CudaBackend final error); // Store reference to output GPU tensor if requested. - // Always uses gpu_outputs[0] (encoder has single output). // The tensor will be kept alive for later D2D copy to decoder inputs. - if (!store_output_name_.empty() && n_outputs > 0) { + if (!store_output_name_.empty()) { + ET_CHECK_OR_RETURN_ERROR( + n_outputs == 1, + InvalidArgument, + "store_output only supports single-output methods, got %zu outputs", + n_outputs); + auto* gpu_tensor = gpu_outputs[0]; size_t numel = gpu_tensor->numel(); size_t elem_size = gpu_tensor->element_size(); @@ -430,7 +479,7 @@ class ET_EXPERIMENTAL CudaBackend final auto old_it = gpu_tensors_.find(store_output_name_); if (old_it != gpu_tensors_.end()) { AOTITensorHandle old_handle = old_it->second.handle; - gpu_tensors_.erase(old_it); // Remove from map before deleting + gpu_tensors_.erase(old_it); // Remove from map before deleting if (old_handle != nullptr) { aoti_torch_delete_tensor_object(old_handle); } @@ -462,6 +511,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Memory management notes: + // - GPU tensor cleanup is handled by TensorCleanup RAII guard above. 
// - use_stored_input setting persists across execute() calls to support // decoder loops that reuse the stored encoder output. // - Stored GPU tensors (in gpu_tensors_) remain in memory until: @@ -470,26 +520,6 @@ class ET_EXPERIMENTAL CudaBackend final // - The "reset_stored_input" option only resets the input name setting, // NOT the stored GPU tensors themselves. - // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. - // Input tensors are no longer needed after AOTI execution. - for (size_t i = 0; i < n_inputs; i++) { - aoti_torch_delete_tensor_object(gpu_inputs[i]); - } - // Output tensors are no longer needed after copying to CPU, - // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy). - for (size_t i = 0; i < n_outputs; i++) { - bool is_stored = false; - for (const auto& pair : gpu_tensors_) { - if (pair.second.handle == gpu_outputs[i]) { - is_stored = true; - break; - } - } - if (!is_stored) { - aoti_torch_delete_tensor_object(gpu_outputs[i]); - } - } - return Error::Ok; } diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 05b4c7e437e..c7363e44239 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -202,10 +202,11 @@ Result> AsrRunner::transcribe( { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("store_output", "encoder_output"); - auto err = - ::executorch::runtime::set_option("CudaBackend", opts.view()); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to set store_output option (backend may not support storage)"); + ET_LOG( + Warning, + "Failed to set store_output option (backend may not support storage)"); } } @@ -268,10 +269,11 @@ Result> AsrRunner::transcribe( { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("use_stored_input", "encoder_output"); - auto err = - ::executorch::runtime::set_option("CudaBackend", opts.view()); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to set use_stored_input option (backend may not support storage)"); + ET_LOG( + Warning, + "Failed to set use_stored_input option (backend may not support storage)"); } } @@ -338,8 +340,7 @@ Result> AsrRunner::transcribe( { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("reset_stored_input", true); - auto err = - ::executorch::runtime::set_option("CudaBackend", opts.view()); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { ET_LOG(Warning, "Failed to set reset_stored_input option"); } From 3d0b62105608fd13cac783a34c5d4bc7666f7e6b Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 07:02:10 -0800 Subject: [PATCH 7/9] Add clear_stored_tensor option --- backends/cuda/runtime/cuda_backend.cpp | 60 ++++++++++++++++++++++++++ extension/asr/runner/runner.cpp | 11 +++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 750a8a85795..aa0d358396c 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -59,6 +59,47 @@ struct GpuTensorRef { class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: + // ============================================================================ + // GPU Tensor Storage for 
D2D Copy Optimization + // ============================================================================ + // + // This backend supports storing GPU tensors between execute() calls to enable + // device-to-device (D2D) copies instead of slower host-to-device (H2D) copies. + // This is useful for encoder-decoder models where the encoder output is reused + // across many decoder iterations. + // + // SUPPORTED OPTIONS (via set_option): + // + // "store_output" (string): Store the output tensor under this name after + // the next execute() call. The tensor remains on GPU until cleared. + // Only supports single-output methods. + // Example: opts.set_option("store_output", "encoder_output"); + // + // "use_stored_input" (string): For inputs matching the stored tensor's size, + // use D2D copy from the stored tensor instead of H2D copy from CPU. + // This setting persists across execute() calls until reset. + // Example: opts.set_option("use_stored_input", "encoder_output"); + // + // "reset_stored_input" (bool): Clear the use_stored_input setting. + // Does NOT delete the stored tensor - only stops using it for D2D. + // Example: opts.set_option("reset_stored_input", true); + // + // "clear_stored_tensor" (string): Delete the named tensor from storage, + // freeing GPU memory. Use after decoder loop completes. + // Example: opts.set_option("clear_stored_tensor", "encoder_output"); + // + // TYPICAL USAGE PATTERN (encoder-decoder model): + // + // 1. Before encoder: set_option("store_output", "encoder_output") + // 2. Execute encoder (output is stored on GPU) + // 3. Before decoder loop: set_option("use_stored_input", "encoder_output") + // 4. Execute decoder N times (D2D copies for encoder output input) + // 5. After decoder loop: + // set_option("reset_stored_input", true) + // set_option("clear_stored_tensor", "encoder_output") + // + // ============================================================================ + // Storage control options (set via set_option before execute) mutable std::string store_output_name_; // Name to store output under (empty = none) @@ -171,6 +212,25 @@ class ET_EXPERIMENTAL CudaBackend final return Error::InvalidArgument; } } + // Handle clear_stored_tensor: expects a string name + // Deletes the named GPU tensor from storage, freeing GPU memory. + else if (strcmp(option.key, "clear_stored_tensor") == 0) { + if (auto* arr = std::get_if< + std::array>( + &option.value)) { + std::string name(arr->data()); + auto it = gpu_tensors_.find(name); + if (it != gpu_tensors_.end()) { + if (it->second.handle != nullptr) { + aoti_torch_delete_tensor_object(it->second.handle); + } + gpu_tensors_.erase(it); + } + } else { + ET_LOG(Warning, "clear_stored_tensor option expects a string value"); + return Error::InvalidArgument; + } + } } return Error::Ok; } diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index c7363e44239..7610f3b89f5 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -333,16 +333,15 @@ Result> AsrRunner::transcribe( } } - // Reset stored input settings after decoder loop completes. - // This disables the D2D copy optimization for subsequent execute() calls. - // Note: The stored GPU tensor remains in memory until the next encoder run - // (which overwrites it) or until the backend is destroyed. + // Reset stored input settings and free GPU memory after decoder loop completes. + // This disables the D2D copy optimization and releases the stored encoder output. 
{ - ::executorch::runtime::BackendOptions<1> opts; + ::executorch::runtime::BackendOptions<2> opts; opts.set_option("reset_stored_input", true); + opts.set_option("clear_stored_tensor", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to set reset_stored_input option"); + ET_LOG(Warning, "Failed to reset stored input settings"); } } From bc560b3859d1e300d0ff8e5eefb4eb7703b1e308 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 07:08:49 -0800 Subject: [PATCH 8/9] Minor nit --- backends/cuda/runtime/cuda_backend.cpp | 15 ++++++++------- extension/asr/runner/runner.cpp | 5 +++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index aa0d358396c..ac82cd7a3f6 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -64,9 +64,9 @@ class ET_EXPERIMENTAL CudaBackend final // ============================================================================ // // This backend supports storing GPU tensors between execute() calls to enable - // device-to-device (D2D) copies instead of slower host-to-device (H2D) copies. - // This is useful for encoder-decoder models where the encoder output is reused - // across many decoder iterations. + // device-to-device (D2D) copies instead of slower host-to-device (H2D) + // copies. This is useful for encoder-decoder models where the encoder output + // is reused across many decoder iterations. // // SUPPORTED OPTIONS (via set_option): // @@ -75,7 +75,8 @@ class ET_EXPERIMENTAL CudaBackend final // Only supports single-output methods. // Example: opts.set_option("store_output", "encoder_output"); // - // "use_stored_input" (string): For inputs matching the stored tensor's size, + // "use_stored_input" (string): For inputs matching the stored tensor's + // size, // use D2D copy from the stored tensor instead of H2D copy from CPU. // This setting persists across execute() calls until reset. // Example: opts.set_option("use_stored_input", "encoder_output"); @@ -401,7 +402,7 @@ class ET_EXPERIMENTAL CudaBackend final // Process input tensors: ExecuTorch provides CPU tensors, create GPU // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. 
- for (int i = 0; i < n_inputs; i++) { + for (size_t i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); @@ -478,7 +479,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Process output tensors: create GPU counterparts for ExecuTorch CPU // tensors - for (int i = 0; i < n_outputs; i++) { + for (size_t i = 0; i < n_outputs; i++) { // Get output tensor dimensions from ExecuTorch CPU tensor auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); @@ -557,7 +558,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Copy GPU output results back to CPU output tensors - for (int i = 0; i < n_outputs; i++) { + for (size_t i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); // For DYNAMIC_BOUND tensors we try to resize ET_CHECK_OK_OR_RETURN_ERROR( diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 7610f3b89f5..93907b13549 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -333,8 +333,9 @@ Result> AsrRunner::transcribe( } } - // Reset stored input settings and free GPU memory after decoder loop completes. - // This disables the D2D copy optimization and releases the stored encoder output. + // Reset stored input settings and free GPU memory after decoder loop + // completes. This disables the D2D copy optimization and releases the stored + // encoder output. { ::executorch::runtime::BackendOptions<2> opts; opts.set_option("reset_stored_input", true); From 137e6da75290b69aa0b12ed0b4b6e653c967413b Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 07:25:06 -0800 Subject: [PATCH 9/9] Minor nits --- backends/cuda/runtime/cuda_backend.cpp | 87 +++++++++++++++----------- extension/asr/runner/runner.cpp | 6 +- 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index ac82cd7a3f6..059d6c0ea29 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -185,7 +185,7 @@ class ET_EXPERIMENTAL CudaBackend final &option.value)) { store_output_name_ = std::string(arr->data()); } else { - ET_LOG(Warning, "store_output option expects a string value"); + ET_LOG(Error, "store_output option expects a string value"); return Error::InvalidArgument; } } @@ -196,7 +196,7 @@ class ET_EXPERIMENTAL CudaBackend final &option.value)) { use_stored_input_name_ = std::string(arr->data()); } else { - ET_LOG(Warning, "use_stored_input option expects a string value"); + ET_LOG(Error, "use_stored_input option expects a string value"); return Error::InvalidArgument; } } @@ -209,7 +209,7 @@ class ET_EXPERIMENTAL CudaBackend final use_stored_input_name_.clear(); } } else { - ET_LOG(Warning, "reset_stored_input option expects a boolean value"); + ET_LOG(Error, "reset_stored_input option expects a boolean value"); return Error::InvalidArgument; } } @@ -228,7 +228,7 @@ class ET_EXPERIMENTAL CudaBackend final gpu_tensors_.erase(it); } } else { - ET_LOG(Warning, "clear_stored_tensor option expects a string value"); + ET_LOG(Error, "clear_stored_tensor option expects a string value"); return Error::InvalidArgument; } } @@ -400,6 +400,10 @@ class ET_EXPERIMENTAL CudaBackend final }; TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_}; + // Track which input index was matched for D2D copy (for duplicate + // detection) + ssize_t 
matched_input_idx = -1; + // Process input tensors: ExecuTorch provides CPU tensors, create GPU // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. for (size_t i = 0; i < n_inputs; i++) { @@ -424,15 +428,12 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( create_err == Error::Ok, Internal, - "Failed to create GPU tensor for input %d", + "Failed to create GPU tensor for input %zu", i); gpu_inputs[i] = gpu_input_handle; // Check if this input matches a stored GPU tensor (by size). - // Note: Size-based matching assumes only one input will match. If - // multiple inputs have the same byte size as the stored tensor, the first - // match wins. if (!use_stored_input_name_.empty()) { auto it = gpu_tensors_.find(use_stored_input_name_); if (it != gpu_tensors_.end()) { @@ -443,29 +444,45 @@ class ET_EXPERIMENTAL CudaBackend final // Match by size: use stored tensor if sizes match if (copy_bytes == ref.size_bytes) { - ET_LOG( - Debug, - "Using stored tensor '%s' for input %d (%zu bytes, D2D copy)", - use_stored_input_name_.c_str(), - i, - copy_bytes); - - // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format - cudaError_t cuda_err = cudaMemcpy( - gpu_inputs[i]->data_ptr(), - ref.data_ptr, - copy_bytes, - cudaMemcpyDeviceToDevice); - - ET_CHECK_OR_RETURN_ERROR( - cuda_err == cudaSuccess, - Internal, - "Failed GPU-to-GPU copy for input %d: %s", - i, - cudaGetErrorString(cuda_err)); - - // Skip the CPU-to-GPU copy below - continue; + if (matched_input_idx >= 0) { + // Another input already matched - warn about ambiguity + ET_LOG( + Error, + "Multiple inputs match stored tensor '%s' size (%zu bytes): " + "input %zd was used, input %zu also matches. " + "Consider using unique tensor sizes or a different matching strategy.", + use_stored_input_name_.c_str(), + copy_bytes, + matched_input_idx, + i); + } else { + // First match - perform D2D copy + matched_input_idx = static_cast(i); + + ET_LOG( + Debug, + "Using stored tensor '%s' for input %zu (%zu bytes, D2D copy)", + use_stored_input_name_.c_str(), + i, + copy_bytes); + + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + ref.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for input %zu: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } } } } @@ -474,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, Internal, - "Failed to copy input %d from CPU to GPU", + "Failed to copy input %zu from CPU to GPU", i); } // Process output tensors: create GPU counterparts for ExecuTorch CPU @@ -501,7 +518,7 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( create_err == Error::Ok, Internal, - "Failed to create GPU tensor for output %d", + "Failed to create GPU tensor for output %zu", i); gpu_outputs[i] = gpu_output_handle; @@ -563,11 +580,11 @@ class ET_EXPERIMENTAL CudaBackend final // For DYNAMIC_BOUND tensors we try to resize ET_CHECK_OK_OR_RETURN_ERROR( resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", + "Error resizing tensor at output index %zu", i); ET_CHECK_OK_OR_RETURN_ERROR( aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", + "Failed to copy GPU output %zu back to CPU", i); } 
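// Illustrative sketch (not part of the patch): the ambiguity handling added in
// the hunk above reduces to a first-match-wins policy over input byte sizes.
// A minimal standalone version of that policy follows; the names InputDesc and
// pick_stored_match are hypothetical and do not exist in the backend.
#include <cstddef>
#include <cstdio>
#include <vector>

struct InputDesc {
  size_t size_bytes;
};

// Returns the index of the first input whose byte size equals the stored
// tensor's size; any later input with the same size only produces a
// diagnostic, mirroring the backend behavior above.
int pick_stored_match(const std::vector<InputDesc>& inputs, size_t stored_bytes) {
  int matched = -1;
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].size_bytes != stored_bytes) {
      continue;
    }
    if (matched < 0) {
      matched = static_cast<int>(i); // first match wins: this slot gets the D2D copy
    } else {
      std::fprintf(
          stderr,
          "ambiguous match: input %zu has the same byte size as input %d\n",
          i,
          matched);
    }
  }
  return matched;
}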
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 93907b13549..b0f7f139307 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -205,7 +205,7 @@ Result> AsrRunner::transcribe( auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { ET_LOG( - Warning, + Debug, "Failed to set store_output option (backend may not support storage)"); } } @@ -272,7 +272,7 @@ Result> AsrRunner::transcribe( auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { ET_LOG( - Warning, + Debug, "Failed to set use_stored_input option (backend may not support storage)"); } } @@ -342,7 +342,7 @@ Result> AsrRunner::transcribe( opts.set_option("clear_stored_tensor", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to reset stored input settings"); + ET_LOG(Error, "Failed to reset stored input settings"); } }
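For reference, the caller-side flow that the runner.cpp changes in this series implement is sketched below. This is an illustrative condensation, not part of the patch: the wrapper function name is hypothetical, encoder and decoder execution are elided, and only the CudaBackend option traffic already shown in runner.cpp is reproduced.

// (ExecuTorch runtime headers providing BackendOptions and set_option are
// assumed to be included; exact paths are omitted here.)
using ::executorch::runtime::BackendOptions;
using ::executorch::runtime::Error;
using ::executorch::runtime::set_option;

// Hypothetical wrapper; everything other than the option keys and the
// runtime calls shown in runner.cpp is a placeholder.
void transcribe_with_gpu_cached_encoder_output() {
  // 1) Ask CudaBackend to keep the next (single) output on the GPU.
  {
    BackendOptions<1> opts;
    opts.set_option("store_output", "encoder_output");
    if (set_option("CudaBackend", opts.view()) != Error::Ok) {
      // Backend may not support storage; execution falls back to H2D copies.
    }
  }
  // ... run the encoder once (its output is now stored on the GPU) ...

  // 2) Route the size-matching decoder input through a D2D copy of the
  //    stored encoder output.
  {
    BackendOptions<1> opts;
    opts.set_option("use_stored_input", "encoder_output");
    (void)set_option("CudaBackend", opts.view());
  }
  // ... run the decoder for N iterations ...

  // 3) Stop the D2D routing and free the cached tensor's GPU memory.
  {
    BackendOptions<2> opts;
    opts.set_option("reset_stored_input", true);
    opts.set_option("clear_stored_tensor", "encoder_output");
    (void)set_option("CudaBackend", opts.view());
  }
}

Steps 1 through 3 correspond one-to-one to the three option blocks in AsrRunner::transcribe() that this series touches.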