From 37c47d412004044550e75a8f28e42a8a4b6ee5df Mon Sep 17 00:00:00 2001
From: Mergen Nachin
Date: Tue, 2 Dec 2025 15:11:24 -0800
Subject: [PATCH 1/9] [CUDA]: GPU Device Caching for Encoder Output in CUDA Backend

Summary:
In encoder-decoder models like Whisper, the encoder output tensor is used as
input to every decoder iteration, which leads to unnecessary
CPU->GPU->CPU->GPU copies.

Implemented a "keep on device" caching mechanism in the CUDA backend that:
- Caches encoder output in persistent GPU memory after the encoder runs
- Uses fast GPU-to-GPU copies during decoder iterations instead of slow
  CPU-to-GPU copies

Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
---
 backends/cuda/runtime/cuda_backend.cpp | 158 ++++++++++++++++++++++++-
 extension/asr/runner/runner.cpp        |  26 ++++
 2 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
index 0cef859ddfb..e482cf29e29 100644
--- a/backends/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -16,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include

 // Include our shim layer headers
@@ -46,9 +48,27 @@
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;

+// Structure to hold cached GPU tensor data for "keep on device" optimization
+struct CachedGpuData {
+  void* data_ptr; // GPU memory pointer
+  size_t size_bytes; // Total size in bytes
+  int32_t scalar_type; // Data type
+  std::vector sizes; // Original shape
+};
+
+// Global device cache - maps name to cached GPU data
+// Using raw GPU pointers instead of tensor handles for format independence
+static std::unordered_map g_device_cache;
+
 class ET_EXPERIMENTAL CudaBackend final
     : public ::executorch::runtime::BackendInterface {
  private:
+  // Cache control options (set via set_option before execute)
+  mutable int cache_output_slot_ = -1; // Which output slot to cache (-1 = none)
+  mutable std::string cache_output_name_; // Name to cache output under
+  mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none)
+  mutable std::string use_cache_input_name_; // Name of cached tensor to use
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -91,6 +111,51 @@ class ET_EXPERIMENTAL CudaBackend final
     return 1;
   }

+  Error set_option(
+      __ET_UNUSED executorch::runtime::BackendOptionContext& context,
+      const executorch::runtime::Span&
+          backend_options) override {
+    for (size_t i = 0; i < backend_options.size(); i++) {
+      const auto& option = backend_options[i];
+      // Handle cache_output: "slot:name" format (e.g., "0:encoder_output")
+      if (strcmp(option.key, "cache_output") == 0) {
+        if (auto* arr = std::get_if<
+                std::array>(
+                &option.value)) {
+          std::string val(arr->data());
+          auto colon_pos = val.find(':');
+          if (colon_pos != std::string::npos) {
+            cache_output_slot_ = std::stoi(val.substr(0, colon_pos));
+            cache_output_name_ = val.substr(colon_pos + 1);
+          }
+        }
+      }
+      // Handle use_cache_input: "slot:name" format (e.g., "1:encoder_output")
+      else if (strcmp(option.key, "use_cache_input") == 0) {
+        if (auto* arr = std::get_if<
+                std::array>(
+                &option.value)) {
+          std::string val(arr->data());
+          auto colon_pos = val.find(':');
+          if (colon_pos != std::string::npos) {
+            use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos));
+            use_cache_input_name_ = val.substr(colon_pos + 1);
+          }
+        }
+      }
+      // Handle
clear_cache_input: reset input cache settings + else if (strcmp(option.key, "clear_cache_input") == 0) { + if (auto* val = std::get_if(&option.value)) { + if (*val) { + use_cache_input_slot_ = -1; + use_cache_input_name_.clear(); + } + } + } + } + return Error::Ok; + } + // Once per loaded binary blob Result init( BackendInitContext& context, @@ -223,14 +288,14 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs); // GPU tensors for kernel output // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies + // copies. For cached inputs, use GPU-to-GPU copy instead of CPU-to-GPU. for (int i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); auto scalar_type = cpu_tensor->scalar_type(); - // Create GPU tensor with same shape + // Create GPU tensor with same shape (always needed for AOTI format) std::vector sizes_vec(sizes.begin(), sizes.end()); AOTITensorHandle gpu_input_handle; @@ -251,7 +316,43 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Copy data from CPU to GPU + // Check if this input slot should use cached GPU data + if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { + auto cache_it = g_device_cache.find(use_cache_input_name_); + if (cache_it != g_device_cache.end()) { + const CachedGpuData& cached = cache_it->second; + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + size_t numel = gpu_inputs[i]->numel(); + size_t elem_size = gpu_inputs[i]->element_size(); + size_t copy_bytes = numel * elem_size; + + ET_CHECK_OR_RETURN_ERROR( + copy_bytes == cached.size_bytes, + Internal, + "Cached tensor size mismatch: expected %zu bytes, got %zu", + copy_bytes, + cached.size_bytes); + + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + cached.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for cached input %d: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } + // Cache miss: fall through to normal CPU-to-GPU copy + } + + // Copy data from CPU to GPU (normal path) ET_CHECK_OR_RETURN_ERROR( aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, Internal, @@ -303,6 +404,57 @@ class ET_EXPERIMENTAL CudaBackend final "AOTInductorModelContainerRun failed with error code %d", error); + // Cache output GPU tensor data if requested + // We store the raw GPU pointer for later GPU-to-GPU copy + if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast(n_outputs) && + !cache_output_name_.empty()) { + auto* gpu_tensor = gpu_outputs[cache_output_slot_]; + size_t numel = gpu_tensor->numel(); + size_t elem_size = gpu_tensor->element_size(); + size_t size_bytes = numel * elem_size; + + // Allocate persistent GPU memory for the cache + void* cache_ptr = nullptr; + cudaError_t alloc_err = cudaMalloc(&cache_ptr, size_bytes); + ET_CHECK_OR_RETURN_ERROR( + alloc_err == cudaSuccess, + Internal, + "Failed to allocate GPU cache memory: %s", + cudaGetErrorString(alloc_err)); + + // Copy from tensor to cache (GPU-to-GPU) + cudaError_t copy_err = cudaMemcpy( + cache_ptr, + gpu_tensor->data_ptr(), + size_bytes, + cudaMemcpyDeviceToDevice); + ET_CHECK_OR_RETURN_ERROR( + copy_err == cudaSuccess, + Internal, + "Failed to copy output to GPU cache: %s", + cudaGetErrorString(copy_err)); + + // Free old cache if exists + auto old_it = 
g_device_cache.find(cache_output_name_); + if (old_it != g_device_cache.end()) { + cudaFree(old_it->second.data_ptr); + g_device_cache.erase(old_it); + } + + // Store in cache + CachedGpuData cached; + cached.data_ptr = cache_ptr; + cached.size_bytes = size_bytes; + cached.scalar_type = static_cast(gpu_tensor->scalar_type()); + auto sizes = gpu_tensor->sizes(); + cached.sizes.assign(sizes.begin(), sizes.end()); + g_device_cache[cache_output_name_] = std::move(cached); + + // Reset cache_output settings after caching + cache_output_slot_ = -1; + cache_output_name_.clear(); + } + // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 4f2523989c1..ebd5d29bdb3 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -196,6 +198,17 @@ Result> AsrRunner::transcribe( } } + // Tell CUDA backend to cache encoder output (slot 0) as "encoder_output" + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("cache_output", "0:encoder_output"); + auto err = + ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Failed to set cache_output option (backend may not support caching)"); + } + } + auto encoder_result = module_->execute(kEncoderMethodName, preprocessed_features); ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error()); @@ -249,6 +262,19 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(decoder_input_ptr); decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); + + // Tell CUDA backend to use cached encoder output for decoder input slot 2 + // Note: Decoder input order in AOTI is: input_ids[0], cache_position[1], encoder_output[2] + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("use_cache_input", "2:encoder_output"); + auto err = + ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Info, "Failed to set use_cache_input option (backend may not support caching)"); + } + } + // Add some green coloring for the first generated token // token_callback("\033[1;32m"); while (generated_tokens < config.max_new_tokens) { From 5da254e7e6f657e0caa6dbf896f3ebabfc71895c Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 2 Dec 2025 16:49:02 -0800 Subject: [PATCH 2/9] Fix based on Copilot recommendations --- backends/cuda/runtime/cuda_backend.cpp | 77 ++++++++++++++++++++++---- extension/asr/runner/runner.cpp | 23 +++++++- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index e482cf29e29..5cabb51c86d 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -58,8 +58,28 @@ struct CachedGpuData { // Global device cache - maps name to cached GPU data // Using raw GPU pointers instead of tensor handles for format independence +// Note: This cache is NOT thread-safe. Callers must ensure execute() is called +// from a single thread. 
static std::unordered_map g_device_cache; +// Helper function to clear all cached GPU memory +// Should be called during backend cleanup +static void clear_device_cache() { + for (auto& pair : g_device_cache) { + if (pair.second.data_ptr != nullptr) { + cudaError_t err = cudaFree(pair.second.data_ptr); + if (err != cudaSuccess) { + ET_LOG( + Warning, + "Failed to free cached GPU memory for '%s': %s", + pair.first.c_str(), + cudaGetErrorString(err)); + } + } + } + g_device_cache.clear(); +} + class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: @@ -125,8 +145,17 @@ class ET_EXPERIMENTAL CudaBackend final std::string val(arr->data()); auto colon_pos = val.find(':'); if (colon_pos != std::string::npos) { - cache_output_slot_ = std::stoi(val.substr(0, colon_pos)); - cache_output_name_ = val.substr(colon_pos + 1); + try { + cache_output_slot_ = std::stoi(val.substr(0, colon_pos)); + cache_output_name_ = val.substr(colon_pos + 1); + } catch (const std::exception& e) { + ET_LOG( + Error, + "Invalid cache_output format '%s': %s", + val.c_str(), + e.what()); + return Error::InvalidArgument; + } } } } @@ -138,8 +167,17 @@ class ET_EXPERIMENTAL CudaBackend final std::string val(arr->data()); auto colon_pos = val.find(':'); if (colon_pos != std::string::npos) { - use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos)); - use_cache_input_name_ = val.substr(colon_pos + 1); + try { + use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos)); + use_cache_input_name_ = val.substr(colon_pos + 1); + } catch (const std::exception& e) { + ET_LOG( + Error, + "Invalid use_cache_input format '%s': %s", + val.c_str(), + e.what()); + return Error::InvalidArgument; + } } } } @@ -428,16 +466,27 @@ class ET_EXPERIMENTAL CudaBackend final gpu_tensor->data_ptr(), size_bytes, cudaMemcpyDeviceToDevice); - ET_CHECK_OR_RETURN_ERROR( - copy_err == cudaSuccess, - Internal, - "Failed to copy output to GPU cache: %s", - cudaGetErrorString(copy_err)); + if (copy_err != cudaSuccess) { + // Free allocated memory before returning error + cudaFree(cache_ptr); + ET_LOG( + Error, + "Failed to copy output to GPU cache: %s", + cudaGetErrorString(copy_err)); + return Error::Internal; + } // Free old cache if exists auto old_it = g_device_cache.find(cache_output_name_); if (old_it != g_device_cache.end()) { - cudaFree(old_it->second.data_ptr); + cudaError_t free_err = cudaFree(old_it->second.data_ptr); + if (free_err != cudaSuccess) { + ET_LOG( + Warning, + "Failed to free old cached GPU memory for '%s': %s", + cache_output_name_.c_str(), + cudaGetErrorString(free_err)); + } g_device_cache.erase(old_it); } @@ -469,6 +518,11 @@ class ET_EXPERIMENTAL CudaBackend final i); } + // Note: use_cache_input settings are intentionally NOT reset here. + // They persist across execute() calls to support decoder loops that + // reuse cached encoder output. The caller should explicitly clear + // these settings using the "clear_cache_input" option when done. 
+ return Error::Ok; } @@ -478,6 +532,9 @@ class ET_EXPERIMENTAL CudaBackend final } AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + // Clear all cached GPU memory + clear_device_cache(); + // Destroy the CUDA stream if it exists if (handle->cuda_stream != nullptr) { cudaStream_t cuda_stream = static_cast(handle->cuda_stream); diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index ebd5d29bdb3..e92bfc242f7 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -263,8 +263,18 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); - // Tell CUDA backend to use cached encoder output for decoder input slot 2 - // Note: Decoder input order in AOTI is: input_ids[0], cache_position[1], encoder_output[2] + // Tell CUDA backend to use cached encoder output for decoder input slot 2. + // + // Why slot 2? AOTI reorders inputs alphabetically by name during compilation, + // so the decoder receives inputs in this order: + // slot 0: input_ids (decoder token) + // slot 1: cache_position (position in KV cache) + // slot 2: encoder_output (audio features from encoder) + // + // This differs from the order we pass them in decoder_inputs above. The + // "2:encoder_output" format tells the backend to use the GPU-cached tensor + // named "encoder_output" for AOTI input slot 2, avoiding a CPU->GPU copy + // on each of the ~100+ decoder iterations. { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("use_cache_input", "2:encoder_output"); @@ -330,6 +340,15 @@ Result> AsrRunner::transcribe( break; } } + + // Clear cache input settings after decoder loop completes + // This prevents stale cache from being used in subsequent transcribe() calls + { + ::executorch::runtime::BackendOptions<1> opts; + opts.set_option("clear_cache_input", true); + ::executorch::runtime::set_option("CudaBackend", opts.view()); + } + // Reset coloring // token_callback("\033[0m"); // Update stats and print report From ffbfbe7020b8b09e98e3f4afa91900bb8fad6017 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Tue, 2 Dec 2025 17:19:40 -0800 Subject: [PATCH 3/9] Remove caching --- backends/cuda/runtime/cuda_backend.cpp | 143 ++++++++++--------------- extension/asr/runner/runner.cpp | 15 ++- 2 files changed, 65 insertions(+), 93 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 5cabb51c86d..17a17883c3f 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -48,36 +48,26 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -// Structure to hold cached GPU tensor data for "keep on device" optimization -struct CachedGpuData { - void* data_ptr; // GPU memory pointer +// Structure to hold a reference to a GPU tensor for "keep on device" optimization. +// Owns the tensor handle - must be deleted when no longer needed. +struct GpuTensorRef { + AOTITensorHandle handle; // Tensor handle (owned, for later deletion) + void* data_ptr; // GPU memory pointer (for D2D copy) size_t size_bytes; // Total size in bytes - int32_t scalar_type; // Data type - std::vector sizes; // Original shape }; -// Global device cache - maps name to cached GPU data -// Using raw GPU pointers instead of tensor handles for format independence -// Note: This cache is NOT thread-safe. Callers must ensure execute() is called -// from a single thread. 
-static std::unordered_map g_device_cache; - -// Helper function to clear all cached GPU memory -// Should be called during backend cleanup -static void clear_device_cache() { - for (auto& pair : g_device_cache) { - if (pair.second.data_ptr != nullptr) { - cudaError_t err = cudaFree(pair.second.data_ptr); - if (err != cudaSuccess) { - ET_LOG( - Warning, - "Failed to free cached GPU memory for '%s': %s", - pair.first.c_str(), - cudaGetErrorString(err)); - } +// Global map of named GPU tensor references. +// Note: NOT thread-safe. Callers must ensure execute() is called from a single thread. +static std::unordered_map g_gpu_tensors; + +// Helper to clear stored GPU tensors and free their memory +static void clear_gpu_tensors() { + for (auto& pair : g_gpu_tensors) { + if (pair.second.handle != nullptr) { + aoti_torch_delete_tensor_object(pair.second.handle); } } - g_device_cache.clear(); + g_gpu_tensors.clear(); } class ET_EXPERIMENTAL CudaBackend final @@ -354,40 +344,40 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Check if this input slot should use cached GPU data + // Check if this input slot should use a stored GPU tensor if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { - auto cache_it = g_device_cache.find(use_cache_input_name_); - if (cache_it != g_device_cache.end()) { - const CachedGpuData& cached = cache_it->second; + auto it = g_gpu_tensors.find(use_cache_input_name_); + if (it != g_gpu_tensors.end()) { + const GpuTensorRef& ref = it->second; // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format size_t numel = gpu_inputs[i]->numel(); size_t elem_size = gpu_inputs[i]->element_size(); size_t copy_bytes = numel * elem_size; ET_CHECK_OR_RETURN_ERROR( - copy_bytes == cached.size_bytes, + copy_bytes == ref.size_bytes, Internal, - "Cached tensor size mismatch: expected %zu bytes, got %zu", + "Stored tensor size mismatch: expected %zu bytes, got %zu", copy_bytes, - cached.size_bytes); + ref.size_bytes); cudaError_t cuda_err = cudaMemcpy( gpu_inputs[i]->data_ptr(), - cached.data_ptr, + ref.data_ptr, copy_bytes, cudaMemcpyDeviceToDevice); ET_CHECK_OR_RETURN_ERROR( cuda_err == cudaSuccess, Internal, - "Failed GPU-to-GPU copy for cached input %d: %s", + "Failed GPU-to-GPU copy for input %d: %s", i, cudaGetErrorString(cuda_err)); // Skip the CPU-to-GPU copy below continue; } - // Cache miss: fall through to normal CPU-to-GPU copy + // Not found: fall through to normal CPU-to-GPU copy } // Copy data from CPU to GPU (normal path) @@ -442,8 +432,8 @@ class ET_EXPERIMENTAL CudaBackend final "AOTInductorModelContainerRun failed with error code %d", error); - // Cache output GPU tensor data if requested - // We store the raw GPU pointer for later GPU-to-GPU copy + // Store reference to output GPU tensor if requested. + // The tensor will be kept alive for later D2D copy to decoder inputs. 
if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast(n_outputs) && !cache_output_name_.empty()) { auto* gpu_tensor = gpu_outputs[cache_output_slot_]; @@ -451,53 +441,18 @@ class ET_EXPERIMENTAL CudaBackend final size_t elem_size = gpu_tensor->element_size(); size_t size_bytes = numel * elem_size; - // Allocate persistent GPU memory for the cache - void* cache_ptr = nullptr; - cudaError_t alloc_err = cudaMalloc(&cache_ptr, size_bytes); - ET_CHECK_OR_RETURN_ERROR( - alloc_err == cudaSuccess, - Internal, - "Failed to allocate GPU cache memory: %s", - cudaGetErrorString(alloc_err)); - - // Copy from tensor to cache (GPU-to-GPU) - cudaError_t copy_err = cudaMemcpy( - cache_ptr, - gpu_tensor->data_ptr(), - size_bytes, - cudaMemcpyDeviceToDevice); - if (copy_err != cudaSuccess) { - // Free allocated memory before returning error - cudaFree(cache_ptr); - ET_LOG( - Error, - "Failed to copy output to GPU cache: %s", - cudaGetErrorString(copy_err)); - return Error::Internal; - } - - // Free old cache if exists - auto old_it = g_device_cache.find(cache_output_name_); - if (old_it != g_device_cache.end()) { - cudaError_t free_err = cudaFree(old_it->second.data_ptr); - if (free_err != cudaSuccess) { - ET_LOG( - Warning, - "Failed to free old cached GPU memory for '%s': %s", - cache_output_name_.c_str(), - cudaGetErrorString(free_err)); - } - g_device_cache.erase(old_it); + // Delete old tensor if overwriting + auto old_it = g_gpu_tensors.find(cache_output_name_); + if (old_it != g_gpu_tensors.end() && old_it->second.handle != nullptr) { + aoti_torch_delete_tensor_object(old_it->second.handle); } - // Store in cache - CachedGpuData cached; - cached.data_ptr = cache_ptr; - cached.size_bytes = size_bytes; - cached.scalar_type = static_cast(gpu_tensor->scalar_type()); - auto sizes = gpu_tensor->sizes(); - cached.sizes.assign(sizes.begin(), sizes.end()); - g_device_cache[cache_output_name_] = std::move(cached); + // Store tensor reference (we now own this tensor) + GpuTensorRef ref; + ref.handle = gpu_tensor; + ref.data_ptr = gpu_tensor->data_ptr(); + ref.size_bytes = size_bytes; + g_gpu_tensors[cache_output_name_] = ref; // Reset cache_output settings after caching cache_output_slot_ = -1; @@ -523,6 +478,26 @@ class ET_EXPERIMENTAL CudaBackend final // reuse cached encoder output. The caller should explicitly clear // these settings using the "clear_cache_input" option when done. + // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. + // Input tensors are no longer needed after AOTI execution. + for (size_t i = 0; i < n_inputs; i++) { + aoti_torch_delete_tensor_object(gpu_inputs[i]); + } + // Output tensors are no longer needed after copying to CPU, + // EXCEPT for tensors stored in g_gpu_tensors (for later D2D copy). 
+ for (size_t i = 0; i < n_outputs; i++) { + bool is_stored = false; + for (const auto& pair : g_gpu_tensors) { + if (pair.second.handle == gpu_outputs[i]) { + is_stored = true; + break; + } + } + if (!is_stored) { + aoti_torch_delete_tensor_object(gpu_outputs[i]); + } + } + return Error::Ok; } @@ -532,8 +507,8 @@ class ET_EXPERIMENTAL CudaBackend final } AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - // Clear all cached GPU memory - clear_device_cache(); + // Delete stored GPU tensors + clear_gpu_tensors(); // Destroy the CUDA stream if it exists if (handle->cuda_stream != nullptr) { diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index e92bfc242f7..b8df8e71db0 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -265,16 +265,13 @@ Result> AsrRunner::transcribe( // Tell CUDA backend to use cached encoder output for decoder input slot 2. // - // Why slot 2? AOTI reorders inputs alphabetically by name during compilation, - // so the decoder receives inputs in this order: - // slot 0: input_ids (decoder token) - // slot 1: cache_position (position in KV cache) - // slot 2: encoder_output (audio features from encoder) + // Why slot 2? The AOTI-compiled decoder receives inputs in a different order + // than we pass them in decoder_inputs above. The AOTI input order was + // determined empirically by examining tensor shapes during execution. // - // This differs from the order we pass them in decoder_inputs above. The - // "2:encoder_output" format tells the backend to use the GPU-cached tensor - // named "encoder_output" for AOTI input slot 2, avoiding a CPU->GPU copy - // on each of the ~100+ decoder iterations. + // The "2:encoder_output" format tells the backend to use the stored GPU + // tensor named "encoder_output" for AOTI input slot 2. This avoids redundant + // CPU->GPU copies on each decoder iteration. { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("use_cache_input", "2:encoder_output"); From 03d27e7af9e3e6a5126c6c32d48d352118edbe96 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 05:55:19 -0800 Subject: [PATCH 4/9] Address Copilot comments --- backends/cuda/runtime/cuda_backend.cpp | 178 +++++++++++++++++-------- extension/asr/runner/runner.cpp | 8 +- 2 files changed, 128 insertions(+), 58 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 17a17883c3f..efaefd19c99 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -56,18 +57,40 @@ struct GpuTensorRef { size_t size_bytes; // Total size in bytes }; -// Global map of named GPU tensor references. -// Note: NOT thread-safe. Callers must ensure execute() is called from a single thread. -static std::unordered_map g_gpu_tensors; +// Parses "slot:name" format string. Returns true on success. +// Uses character-by-character parsing to avoid std::stoi exceptions. 
+static bool parse_slot_name( + const std::string& val, + int& out_slot, + std::string& out_name) { + auto colon_pos = val.find(':'); + if (colon_pos == std::string::npos || colon_pos == 0) { + return false; + } -// Helper to clear stored GPU tensors and free their memory -static void clear_gpu_tensors() { - for (auto& pair : g_gpu_tensors) { - if (pair.second.handle != nullptr) { - aoti_torch_delete_tensor_object(pair.second.handle); + // Parse slot number manually to avoid exceptions + int slot = 0; + for (size_t i = 0; i < colon_pos; i++) { + char c = val[i]; + if (c < '0' || c > '9') { + return false; // Non-digit character + } + int digit = c - '0'; + // Check for overflow + if (slot > (INT_MAX - digit) / 10) { + return false; } + slot = slot * 10 + digit; } - g_gpu_tensors.clear(); + + std::string name = val.substr(colon_pos + 1); + if (name.empty()) { + return false; + } + + out_slot = slot; + out_name = std::move(name); + return true; } class ET_EXPERIMENTAL CudaBackend final @@ -79,6 +102,29 @@ class ET_EXPERIMENTAL CudaBackend final mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none) mutable std::string use_cache_input_name_; // Name of cached tensor to use + // Per-instance map of named GPU tensor references. + // Mutable because execute() is const but needs to modify this. + // + // LIFETIME CONTRACT: + // - Stored tensors are valid until overwritten or destroy() is called. + // - Caller must ensure the producing execute() call (e.g., encoder) completes + // before any consuming execute() call (e.g., decoder) begins. + // - Caller must not call destroy() while execute() is in progress. + // - Overwriting a tensor (same cache name) deletes the old tensor immediately, + // so caller must ensure no concurrent execute() is using it. + mutable std::unordered_map gpu_tensors_; + + // Helper to clear stored GPU tensors and free their memory. + // Only call when no execute() is in progress. 
+ void clear_gpu_tensors() const { + for (auto& pair : gpu_tensors_) { + if (pair.second.handle != nullptr) { + aoti_torch_delete_tensor_object(pair.second.handle); + } + } + gpu_tensors_.clear(); + } + Error load_function_pointers_into_handle( void* so_handle, AOTIDelegateHandle* handle) const { @@ -133,46 +179,38 @@ class ET_EXPERIMENTAL CudaBackend final std::array>( &option.value)) { std::string val(arr->data()); - auto colon_pos = val.find(':'); - if (colon_pos != std::string::npos) { - try { - cache_output_slot_ = std::stoi(val.substr(0, colon_pos)); - cache_output_name_ = val.substr(colon_pos + 1); - } catch (const std::exception& e) { - ET_LOG( - Error, - "Invalid cache_output format '%s': %s", - val.c_str(), - e.what()); - return Error::InvalidArgument; - } + int slot; + std::string name; + if (parse_slot_name(val, slot, name)) { + cache_output_slot_ = slot; + cache_output_name_ = std::move(name); + } else { + ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str()); + return Error::InvalidArgument; } } } - // Handle use_cache_input: "slot:name" format (e.g., "1:encoder_output") + // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output") else if (strcmp(option.key, "use_cache_input") == 0) { if (auto* arr = std::get_if< std::array>( &option.value)) { std::string val(arr->data()); - auto colon_pos = val.find(':'); - if (colon_pos != std::string::npos) { - try { - use_cache_input_slot_ = std::stoi(val.substr(0, colon_pos)); - use_cache_input_name_ = val.substr(colon_pos + 1); - } catch (const std::exception& e) { - ET_LOG( - Error, - "Invalid use_cache_input format '%s': %s", - val.c_str(), - e.what()); - return Error::InvalidArgument; - } + int slot; + std::string name; + if (parse_slot_name(val, slot, name)) { + use_cache_input_slot_ = slot; + use_cache_input_name_ = std::move(name); + } else { + ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str()); + return Error::InvalidArgument; } } } - // Handle clear_cache_input: reset input cache settings - else if (strcmp(option.key, "clear_cache_input") == 0) { + // Handle reset_cache_input: disable cache input for subsequent execute() calls. + // Note: This only resets the slot/name settings. The stored GPU tensor + // remains in memory until overwritten or destroy() is called. 
+ else if (strcmp(option.key, "reset_cache_input") == 0) { if (auto* val = std::get_if(&option.value)) { if (*val) { use_cache_input_slot_ = -1; @@ -308,6 +346,28 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs, args.size()) + // Validate cache slot indices if set + if (use_cache_input_slot_ >= 0 && + use_cache_input_slot_ >= static_cast(n_inputs)) { + ET_LOG( + Warning, + "use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring", + use_cache_input_slot_, + n_inputs); + use_cache_input_slot_ = -1; + use_cache_input_name_.clear(); + } + if (cache_output_slot_ >= 0 && + cache_output_slot_ >= static_cast(n_outputs)) { + ET_LOG( + Warning, + "cache_output slot %d is out of bounds (n_outputs=%zu), ignoring", + cache_output_slot_, + n_outputs); + cache_output_slot_ = -1; + cache_output_name_.clear(); + } + // NOTE: ExecuTorch tensors are always on CPU/host memory // We need to create GPU copies for CUDA kernel execution std::vector gpu_inputs( @@ -346,8 +406,8 @@ class ET_EXPERIMENTAL CudaBackend final // Check if this input slot should use a stored GPU tensor if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { - auto it = g_gpu_tensors.find(use_cache_input_name_); - if (it != g_gpu_tensors.end()) { + auto it = gpu_tensors_.find(use_cache_input_name_); + if (it != gpu_tensors_.end()) { const GpuTensorRef& ref = it->second; // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format size_t numel = gpu_inputs[i]->numel(); @@ -358,8 +418,8 @@ class ET_EXPERIMENTAL CudaBackend final copy_bytes == ref.size_bytes, Internal, "Stored tensor size mismatch: expected %zu bytes, got %zu", - copy_bytes, - ref.size_bytes); + ref.size_bytes, + copy_bytes); cudaError_t cuda_err = cudaMemcpy( gpu_inputs[i]->data_ptr(), @@ -434,17 +494,21 @@ class ET_EXPERIMENTAL CudaBackend final // Store reference to output GPU tensor if requested. // The tensor will be kept alive for later D2D copy to decoder inputs. - if (cache_output_slot_ >= 0 && cache_output_slot_ < static_cast(n_outputs) && - !cache_output_name_.empty()) { + // (Bounds already validated at start of execute()) + if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) { auto* gpu_tensor = gpu_outputs[cache_output_slot_]; size_t numel = gpu_tensor->numel(); size_t elem_size = gpu_tensor->element_size(); size_t size_bytes = numel * elem_size; - // Delete old tensor if overwriting - auto old_it = g_gpu_tensors.find(cache_output_name_); - if (old_it != g_gpu_tensors.end() && old_it->second.handle != nullptr) { - aoti_torch_delete_tensor_object(old_it->second.handle); + // Delete old tensor if overwriting (erase first to prevent double-free) + auto old_it = gpu_tensors_.find(cache_output_name_); + if (old_it != gpu_tensors_.end()) { + AOTITensorHandle old_handle = old_it->second.handle; + gpu_tensors_.erase(old_it); // Remove from map before deleting + if (old_handle != nullptr) { + aoti_torch_delete_tensor_object(old_handle); + } } // Store tensor reference (we now own this tensor) @@ -452,7 +516,7 @@ class ET_EXPERIMENTAL CudaBackend final ref.handle = gpu_tensor; ref.data_ptr = gpu_tensor->data_ptr(); ref.size_bytes = size_bytes; - g_gpu_tensors[cache_output_name_] = ref; + gpu_tensors_[cache_output_name_] = ref; // Reset cache_output settings after caching cache_output_slot_ = -1; @@ -473,10 +537,14 @@ class ET_EXPERIMENTAL CudaBackend final i); } - // Note: use_cache_input settings are intentionally NOT reset here. - // They persist across execute() calls to support decoder loops that - // reuse cached encoder output. 
The caller should explicitly clear - // these settings using the "clear_cache_input" option when done. + // Memory management notes: + // - use_cache_input settings persist across execute() calls to support + // decoder loops that reuse the stored encoder output. + // - Stored GPU tensors (in gpu_tensors_) remain in memory until: + // (a) overwritten by a new tensor with the same name, or + // (b) destroy() is called, which frees all stored tensors. + // - The "reset_cache_input" option only resets the input slot/name settings, + // NOT the stored GPU tensors themselves. // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. // Input tensors are no longer needed after AOTI execution. @@ -484,10 +552,10 @@ class ET_EXPERIMENTAL CudaBackend final aoti_torch_delete_tensor_object(gpu_inputs[i]); } // Output tensors are no longer needed after copying to CPU, - // EXCEPT for tensors stored in g_gpu_tensors (for later D2D copy). + // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy). for (size_t i = 0; i < n_outputs; i++) { bool is_stored = false; - for (const auto& pair : g_gpu_tensors) { + for (const auto& pair : gpu_tensors_) { if (pair.second.handle == gpu_outputs[i]) { is_stored = true; break; diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index b8df8e71db0..af921b7a540 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -338,11 +338,13 @@ Result> AsrRunner::transcribe( } } - // Clear cache input settings after decoder loop completes - // This prevents stale cache from being used in subsequent transcribe() calls + // Reset cache input settings after decoder loop completes. + // This disables the D2D copy optimization for subsequent execute() calls. + // Note: The stored GPU tensor remains in memory until the next encoder run + // (which overwrites it) or until the backend is destroyed. { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("clear_cache_input", true); + opts.set_option("reset_cache_input", true); ::executorch::runtime::set_option("CudaBackend", opts.view()); } From e535aeece17bc265371c0f75fd9e7ba952750706 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 06:14:54 -0800 Subject: [PATCH 5/9] More simplification --- backends/cuda/runtime/cuda_backend.cpp | 184 ++++++++----------------- extension/asr/runner/runner.cpp | 31 ++--- 2 files changed, 68 insertions(+), 147 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index efaefd19c99..58820bb7b69 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -57,50 +57,12 @@ struct GpuTensorRef { size_t size_bytes; // Total size in bytes }; -// Parses "slot:name" format string. Returns true on success. -// Uses character-by-character parsing to avoid std::stoi exceptions. 
-static bool parse_slot_name( - const std::string& val, - int& out_slot, - std::string& out_name) { - auto colon_pos = val.find(':'); - if (colon_pos == std::string::npos || colon_pos == 0) { - return false; - } - - // Parse slot number manually to avoid exceptions - int slot = 0; - for (size_t i = 0; i < colon_pos; i++) { - char c = val[i]; - if (c < '0' || c > '9') { - return false; // Non-digit character - } - int digit = c - '0'; - // Check for overflow - if (slot > (INT_MAX - digit) / 10) { - return false; - } - slot = slot * 10 + digit; - } - - std::string name = val.substr(colon_pos + 1); - if (name.empty()) { - return false; - } - - out_slot = slot; - out_name = std::move(name); - return true; -} - class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: - // Cache control options (set via set_option before execute) - mutable int cache_output_slot_ = -1; // Which output slot to cache (-1 = none) - mutable std::string cache_output_name_; // Name to cache output under - mutable int use_cache_input_slot_ = -1; // Which input slot to use cache for (-1 = none) - mutable std::string use_cache_input_name_; // Name of cached tensor to use + // Storage control options (set via set_option before execute) + mutable std::string store_output_name_; // Name to store output under (empty = none) + mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none) // Per-instance map of named GPU tensor references. // Mutable because execute() is const but needs to modify this. @@ -110,7 +72,7 @@ class ET_EXPERIMENTAL CudaBackend final // - Caller must ensure the producing execute() call (e.g., encoder) completes // before any consuming execute() call (e.g., decoder) begins. // - Caller must not call destroy() while execute() is in progress. - // - Overwriting a tensor (same cache name) deletes the old tensor immediately, + // - Overwriting a tensor (same name) deletes the old tensor immediately, // so caller must ensure no concurrent execute() is using it. 
mutable std::unordered_map gpu_tensors_; @@ -173,49 +135,39 @@ class ET_EXPERIMENTAL CudaBackend final backend_options) override { for (size_t i = 0; i < backend_options.size(); i++) { const auto& option = backend_options[i]; - // Handle cache_output: "slot:name" format (e.g., "0:encoder_output") - if (strcmp(option.key, "cache_output") == 0) { + // Handle store_output: expects a string name (e.g., "encoder_output") + if (strcmp(option.key, "store_output") == 0) { if (auto* arr = std::get_if< std::array>( &option.value)) { - std::string val(arr->data()); - int slot; - std::string name; - if (parse_slot_name(val, slot, name)) { - cache_output_slot_ = slot; - cache_output_name_ = std::move(name); - } else { - ET_LOG(Error, "Invalid cache_output format: '%s'", val.c_str()); - return Error::InvalidArgument; - } + store_output_name_ = std::string(arr->data()); + } else { + ET_LOG(Warning, "store_output option expects a string value"); + return Error::InvalidArgument; } } - // Handle use_cache_input: "slot:name" format (e.g., "2:encoder_output") - else if (strcmp(option.key, "use_cache_input") == 0) { + // Handle use_stored_input: expects a string name (e.g., "encoder_output") + else if (strcmp(option.key, "use_stored_input") == 0) { if (auto* arr = std::get_if< std::array>( &option.value)) { - std::string val(arr->data()); - int slot; - std::string name; - if (parse_slot_name(val, slot, name)) { - use_cache_input_slot_ = slot; - use_cache_input_name_ = std::move(name); - } else { - ET_LOG(Error, "Invalid use_cache_input format: '%s'", val.c_str()); - return Error::InvalidArgument; - } + use_stored_input_name_ = std::string(arr->data()); + } else { + ET_LOG(Warning, "use_stored_input option expects a string value"); + return Error::InvalidArgument; } } - // Handle reset_cache_input: disable cache input for subsequent execute() calls. - // Note: This only resets the slot/name settings. The stored GPU tensor + // Handle reset_stored_input: expects a boolean value + // Note: This only resets the name setting. The stored GPU tensor // remains in memory until overwritten or destroy() is called. 
- else if (strcmp(option.key, "reset_cache_input") == 0) { + else if (strcmp(option.key, "reset_stored_input") == 0) { if (auto* val = std::get_if(&option.value)) { if (*val) { - use_cache_input_slot_ = -1; - use_cache_input_name_.clear(); + use_stored_input_name_.clear(); } + } else { + ET_LOG(Warning, "reset_stored_input option expects a boolean value"); + return Error::InvalidArgument; } } } @@ -346,28 +298,6 @@ class ET_EXPERIMENTAL CudaBackend final n_outputs, args.size()) - // Validate cache slot indices if set - if (use_cache_input_slot_ >= 0 && - use_cache_input_slot_ >= static_cast(n_inputs)) { - ET_LOG( - Warning, - "use_cache_input slot %d is out of bounds (n_inputs=%zu), ignoring", - use_cache_input_slot_, - n_inputs); - use_cache_input_slot_ = -1; - use_cache_input_name_.clear(); - } - if (cache_output_slot_ >= 0 && - cache_output_slot_ >= static_cast(n_outputs)) { - ET_LOG( - Warning, - "cache_output slot %d is out of bounds (n_outputs=%zu), ignoring", - cache_output_slot_, - n_outputs); - cache_output_slot_ = -1; - cache_output_name_.clear(); - } - // NOTE: ExecuTorch tensors are always on CPU/host memory // We need to create GPU copies for CUDA kernel execution std::vector gpu_inputs( @@ -404,40 +334,35 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Check if this input slot should use a stored GPU tensor - if (i == use_cache_input_slot_ && !use_cache_input_name_.empty()) { - auto it = gpu_tensors_.find(use_cache_input_name_); + // Check if this input matches a stored GPU tensor (by size) + if (!use_stored_input_name_.empty()) { + auto it = gpu_tensors_.find(use_stored_input_name_); if (it != gpu_tensors_.end()) { const GpuTensorRef& ref = it->second; - // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format size_t numel = gpu_inputs[i]->numel(); size_t elem_size = gpu_inputs[i]->element_size(); size_t copy_bytes = numel * elem_size; - ET_CHECK_OR_RETURN_ERROR( - copy_bytes == ref.size_bytes, - Internal, - "Stored tensor size mismatch: expected %zu bytes, got %zu", - ref.size_bytes, - copy_bytes); - - cudaError_t cuda_err = cudaMemcpy( - gpu_inputs[i]->data_ptr(), - ref.data_ptr, - copy_bytes, - cudaMemcpyDeviceToDevice); - - ET_CHECK_OR_RETURN_ERROR( - cuda_err == cudaSuccess, - Internal, - "Failed GPU-to-GPU copy for input %d: %s", - i, - cudaGetErrorString(cuda_err)); - - // Skip the CPU-to-GPU copy below - continue; + // Match by size: use stored tensor if sizes match + if (copy_bytes == ref.size_bytes) { + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + ref.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for input %d: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } } - // Not found: fall through to normal CPU-to-GPU copy } // Copy data from CPU to GPU (normal path) @@ -493,16 +418,16 @@ class ET_EXPERIMENTAL CudaBackend final error); // Store reference to output GPU tensor if requested. + // Always uses gpu_outputs[0] (encoder has single output). // The tensor will be kept alive for later D2D copy to decoder inputs. 
- // (Bounds already validated at start of execute()) - if (cache_output_slot_ >= 0 && !cache_output_name_.empty()) { - auto* gpu_tensor = gpu_outputs[cache_output_slot_]; + if (!store_output_name_.empty() && n_outputs > 0) { + auto* gpu_tensor = gpu_outputs[0]; size_t numel = gpu_tensor->numel(); size_t elem_size = gpu_tensor->element_size(); size_t size_bytes = numel * elem_size; // Delete old tensor if overwriting (erase first to prevent double-free) - auto old_it = gpu_tensors_.find(cache_output_name_); + auto old_it = gpu_tensors_.find(store_output_name_); if (old_it != gpu_tensors_.end()) { AOTITensorHandle old_handle = old_it->second.handle; gpu_tensors_.erase(old_it); // Remove from map before deleting @@ -516,11 +441,10 @@ class ET_EXPERIMENTAL CudaBackend final ref.handle = gpu_tensor; ref.data_ptr = gpu_tensor->data_ptr(); ref.size_bytes = size_bytes; - gpu_tensors_[cache_output_name_] = ref; + gpu_tensors_[store_output_name_] = ref; - // Reset cache_output settings after caching - cache_output_slot_ = -1; - cache_output_name_.clear(); + // Reset store_output name after storing + store_output_name_.clear(); } // Copy GPU output results back to CPU output tensors @@ -538,12 +462,12 @@ class ET_EXPERIMENTAL CudaBackend final } // Memory management notes: - // - use_cache_input settings persist across execute() calls to support + // - use_stored_input setting persists across execute() calls to support // decoder loops that reuse the stored encoder output. // - Stored GPU tensors (in gpu_tensors_) remain in memory until: // (a) overwritten by a new tensor with the same name, or // (b) destroy() is called, which frees all stored tensors. - // - The "reset_cache_input" option only resets the input slot/name settings, + // - The "reset_stored_input" option only resets the input name setting, // NOT the stored GPU tensors themselves. // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index af921b7a540..05b4c7e437e 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -198,14 +198,14 @@ Result> AsrRunner::transcribe( } } - // Tell CUDA backend to cache encoder output (slot 0) as "encoder_output" + // Tell CUDA backend to store encoder output as "encoder_output" { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("cache_output", "0:encoder_output"); + opts.set_option("store_output", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Info, "Failed to set cache_output option (backend may not support caching)"); + ET_LOG(Warning, "Failed to set store_output option (backend may not support storage)"); } } @@ -263,22 +263,15 @@ Result> AsrRunner::transcribe( decoder_inputs.emplace_back(encoder_output_ptr); decoder_inputs.emplace_back(cache_position_ptr); - // Tell CUDA backend to use cached encoder output for decoder input slot 2. - // - // Why slot 2? The AOTI-compiled decoder receives inputs in a different order - // than we pass them in decoder_inputs above. The AOTI input order was - // determined empirically by examining tensor shapes during execution. - // - // The "2:encoder_output" format tells the backend to use the stored GPU - // tensor named "encoder_output" for AOTI input slot 2. This avoids redundant - // CPU->GPU copies on each decoder iteration. + // Tell CUDA backend to use stored encoder output for matching decoder inputs. 
+ // The backend matches by tensor size, avoiding redundant CPU->GPU copies. { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("use_cache_input", "2:encoder_output"); + opts.set_option("use_stored_input", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Info, "Failed to set use_cache_input option (backend may not support caching)"); + ET_LOG(Warning, "Failed to set use_stored_input option (backend may not support storage)"); } } @@ -338,14 +331,18 @@ Result> AsrRunner::transcribe( } } - // Reset cache input settings after decoder loop completes. + // Reset stored input settings after decoder loop completes. // This disables the D2D copy optimization for subsequent execute() calls. // Note: The stored GPU tensor remains in memory until the next encoder run // (which overwrites it) or until the backend is destroyed. { ::executorch::runtime::BackendOptions<1> opts; - opts.set_option("reset_cache_input", true); - ::executorch::runtime::set_option("CudaBackend", opts.view()); + opts.set_option("reset_stored_input", true); + auto err = + ::executorch::runtime::set_option("CudaBackend", opts.view()); + if (err != ::executorch::runtime::Error::Ok) { + ET_LOG(Warning, "Failed to set reset_stored_input option"); + } } // Reset coloring From 9e1a3cc719df006a2f145703e98db39121e68643 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 06:24:16 -0800 Subject: [PATCH 6/9] RAII --- backends/cuda/runtime/cuda_backend.cpp | 96 +++++++++++++++++--------- extension/asr/runner/runner.cpp | 17 ++--- 2 files changed, 72 insertions(+), 41 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 58820bb7b69..750a8a85795 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -49,20 +48,22 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -// Structure to hold a reference to a GPU tensor for "keep on device" optimization. -// Owns the tensor handle - must be deleted when no longer needed. +// Structure to hold a reference to a GPU tensor for "keep on device" +// optimization. Owns the tensor handle - must be deleted when no longer needed. struct GpuTensorRef { - AOTITensorHandle handle; // Tensor handle (owned, for later deletion) - void* data_ptr; // GPU memory pointer (for D2D copy) - size_t size_bytes; // Total size in bytes + AOTITensorHandle handle; // Tensor handle (owned, for later deletion) + void* data_ptr; // GPU memory pointer (for D2D copy) + size_t size_bytes; // Total size in bytes }; class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: // Storage control options (set via set_option before execute) - mutable std::string store_output_name_; // Name to store output under (empty = none) - mutable std::string use_stored_input_name_; // Name of stored tensor to use (empty = none) + mutable std::string + store_output_name_; // Name to store output under (empty = none) + mutable std::string + use_stored_input_name_; // Name of stored tensor to use (empty = none) // Per-instance map of named GPU tensor references. // Mutable because execute() is const but needs to modify this. 
@@ -305,8 +306,41 @@ class ET_EXPERIMENTAL CudaBackend final std::vector gpu_outputs( n_outputs); // GPU tensors for kernel output + // RAII helper to ensure GPU tensors are cleaned up on all exit paths. + // Prevents memory leaks when errors occur during execute(). + struct TensorCleanup { + std::vector& inputs; + std::vector& outputs; + const std::unordered_map& stored_tensors; + + ~TensorCleanup() { + // Clean up input tensors + for (auto* handle : inputs) { + if (handle != nullptr) { + aoti_torch_delete_tensor_object(handle); + } + } + // Clean up output tensors, except those that are stored + for (auto* handle : outputs) { + if (handle != nullptr) { + bool is_stored = false; + for (const auto& pair : stored_tensors) { + if (pair.second.handle == handle) { + is_stored = true; + break; + } + } + if (!is_stored) { + aoti_torch_delete_tensor_object(handle); + } + } + } + } + }; + TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_}; + // Process input tensors: ExecuTorch provides CPU tensors, create GPU - // copies. For cached inputs, use GPU-to-GPU copy instead of CPU-to-GPU. + // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. for (int i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); @@ -334,7 +368,10 @@ class ET_EXPERIMENTAL CudaBackend final gpu_inputs[i] = gpu_input_handle; - // Check if this input matches a stored GPU tensor (by size) + // Check if this input matches a stored GPU tensor (by size). + // Note: Size-based matching assumes only one input will match. If + // multiple inputs have the same byte size as the stored tensor, the first + // match wins. if (!use_stored_input_name_.empty()) { auto it = gpu_tensors_.find(use_stored_input_name_); if (it != gpu_tensors_.end()) { @@ -345,6 +382,13 @@ class ET_EXPERIMENTAL CudaBackend final // Match by size: use stored tensor if sizes match if (copy_bytes == ref.size_bytes) { + ET_LOG( + Debug, + "Using stored tensor '%s' for input %d (%zu bytes, D2D copy)", + use_stored_input_name_.c_str(), + i, + copy_bytes); + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format cudaError_t cuda_err = cudaMemcpy( gpu_inputs[i]->data_ptr(), @@ -418,9 +462,14 @@ class ET_EXPERIMENTAL CudaBackend final error); // Store reference to output GPU tensor if requested. - // Always uses gpu_outputs[0] (encoder has single output). // The tensor will be kept alive for later D2D copy to decoder inputs. - if (!store_output_name_.empty() && n_outputs > 0) { + if (!store_output_name_.empty()) { + ET_CHECK_OR_RETURN_ERROR( + n_outputs == 1, + InvalidArgument, + "store_output only supports single-output methods, got %zu outputs", + n_outputs); + auto* gpu_tensor = gpu_outputs[0]; size_t numel = gpu_tensor->numel(); size_t elem_size = gpu_tensor->element_size(); @@ -430,7 +479,7 @@ class ET_EXPERIMENTAL CudaBackend final auto old_it = gpu_tensors_.find(store_output_name_); if (old_it != gpu_tensors_.end()) { AOTITensorHandle old_handle = old_it->second.handle; - gpu_tensors_.erase(old_it); // Remove from map before deleting + gpu_tensors_.erase(old_it); // Remove from map before deleting if (old_handle != nullptr) { aoti_torch_delete_tensor_object(old_handle); } @@ -462,6 +511,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Memory management notes: + // - GPU tensor cleanup is handled by TensorCleanup RAII guard above. 
// - use_stored_input setting persists across execute() calls to support // decoder loops that reuse the stored encoder output. // - Stored GPU tensors (in gpu_tensors_) remain in memory until: @@ -470,26 +520,6 @@ class ET_EXPERIMENTAL CudaBackend final // - The "reset_stored_input" option only resets the input name setting, // NOT the stored GPU tensors themselves. - // Cleanup: delete GPU tensors to avoid memory leak across execute() calls. - // Input tensors are no longer needed after AOTI execution. - for (size_t i = 0; i < n_inputs; i++) { - aoti_torch_delete_tensor_object(gpu_inputs[i]); - } - // Output tensors are no longer needed after copying to CPU, - // EXCEPT for tensors stored in gpu_tensors_ (for later D2D copy). - for (size_t i = 0; i < n_outputs; i++) { - bool is_stored = false; - for (const auto& pair : gpu_tensors_) { - if (pair.second.handle == gpu_outputs[i]) { - is_stored = true; - break; - } - } - if (!is_stored) { - aoti_torch_delete_tensor_object(gpu_outputs[i]); - } - } - return Error::Ok; } diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 05b4c7e437e..c7363e44239 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -202,10 +202,11 @@ Result> AsrRunner::transcribe( { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("store_output", "encoder_output"); - auto err = - ::executorch::runtime::set_option("CudaBackend", opts.view()); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to set store_output option (backend may not support storage)"); + ET_LOG( + Warning, + "Failed to set store_output option (backend may not support storage)"); } } @@ -268,10 +269,11 @@ Result> AsrRunner::transcribe( { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("use_stored_input", "encoder_output"); - auto err = - ::executorch::runtime::set_option("CudaBackend", opts.view()); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to set use_stored_input option (backend may not support storage)"); + ET_LOG( + Warning, + "Failed to set use_stored_input option (backend may not support storage)"); } } @@ -338,8 +340,7 @@ Result> AsrRunner::transcribe( { ::executorch::runtime::BackendOptions<1> opts; opts.set_option("reset_stored_input", true); - auto err = - ::executorch::runtime::set_option("CudaBackend", opts.view()); + auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { ET_LOG(Warning, "Failed to set reset_stored_input option"); } From 3d0b62105608fd13cac783a34c5d4bc7666f7e6b Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 07:02:10 -0800 Subject: [PATCH 7/9] Add clear_stored_tensor option --- backends/cuda/runtime/cuda_backend.cpp | 60 ++++++++++++++++++++++++++ extension/asr/runner/runner.cpp | 11 +++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 750a8a85795..aa0d358396c 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -59,6 +59,47 @@ struct GpuTensorRef { class ET_EXPERIMENTAL CudaBackend final : public ::executorch::runtime::BackendInterface { private: + // ============================================================================ + // GPU Tensor Storage for 
D2D Copy Optimization + // ============================================================================ + // + // This backend supports storing GPU tensors between execute() calls to enable + // device-to-device (D2D) copies instead of slower host-to-device (H2D) copies. + // This is useful for encoder-decoder models where the encoder output is reused + // across many decoder iterations. + // + // SUPPORTED OPTIONS (via set_option): + // + // "store_output" (string): Store the output tensor under this name after + // the next execute() call. The tensor remains on GPU until cleared. + // Only supports single-output methods. + // Example: opts.set_option("store_output", "encoder_output"); + // + // "use_stored_input" (string): For inputs matching the stored tensor's size, + // use D2D copy from the stored tensor instead of H2D copy from CPU. + // This setting persists across execute() calls until reset. + // Example: opts.set_option("use_stored_input", "encoder_output"); + // + // "reset_stored_input" (bool): Clear the use_stored_input setting. + // Does NOT delete the stored tensor - only stops using it for D2D. + // Example: opts.set_option("reset_stored_input", true); + // + // "clear_stored_tensor" (string): Delete the named tensor from storage, + // freeing GPU memory. Use after decoder loop completes. + // Example: opts.set_option("clear_stored_tensor", "encoder_output"); + // + // TYPICAL USAGE PATTERN (encoder-decoder model): + // + // 1. Before encoder: set_option("store_output", "encoder_output") + // 2. Execute encoder (output is stored on GPU) + // 3. Before decoder loop: set_option("use_stored_input", "encoder_output") + // 4. Execute decoder N times (D2D copies for encoder output input) + // 5. After decoder loop: + // set_option("reset_stored_input", true) + // set_option("clear_stored_tensor", "encoder_output") + // + // ============================================================================ + // Storage control options (set via set_option before execute) mutable std::string store_output_name_; // Name to store output under (empty = none) @@ -171,6 +212,25 @@ class ET_EXPERIMENTAL CudaBackend final return Error::InvalidArgument; } } + // Handle clear_stored_tensor: expects a string name + // Deletes the named GPU tensor from storage, freeing GPU memory. + else if (strcmp(option.key, "clear_stored_tensor") == 0) { + if (auto* arr = std::get_if< + std::array>( + &option.value)) { + std::string name(arr->data()); + auto it = gpu_tensors_.find(name); + if (it != gpu_tensors_.end()) { + if (it->second.handle != nullptr) { + aoti_torch_delete_tensor_object(it->second.handle); + } + gpu_tensors_.erase(it); + } + } else { + ET_LOG(Warning, "clear_stored_tensor option expects a string value"); + return Error::InvalidArgument; + } + } } return Error::Ok; } diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index c7363e44239..7610f3b89f5 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -333,16 +333,15 @@ Result> AsrRunner::transcribe( } } - // Reset stored input settings after decoder loop completes. - // This disables the D2D copy optimization for subsequent execute() calls. - // Note: The stored GPU tensor remains in memory until the next encoder run - // (which overwrites it) or until the backend is destroyed. + // Reset stored input settings and free GPU memory after decoder loop completes. + // This disables the D2D copy optimization and releases the stored encoder output. 
{ - ::executorch::runtime::BackendOptions<1> opts; + ::executorch::runtime::BackendOptions<2> opts; opts.set_option("reset_stored_input", true); + opts.set_option("clear_stored_tensor", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to set reset_stored_input option"); + ET_LOG(Warning, "Failed to reset stored input settings"); } } From bc560b3859d1e300d0ff8e5eefb4eb7703b1e308 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 07:08:49 -0800 Subject: [PATCH 8/9] Minor nit --- backends/cuda/runtime/cuda_backend.cpp | 15 ++++++++------- extension/asr/runner/runner.cpp | 5 +++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index aa0d358396c..ac82cd7a3f6 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -64,9 +64,9 @@ class ET_EXPERIMENTAL CudaBackend final // ============================================================================ // // This backend supports storing GPU tensors between execute() calls to enable - // device-to-device (D2D) copies instead of slower host-to-device (H2D) copies. - // This is useful for encoder-decoder models where the encoder output is reused - // across many decoder iterations. + // device-to-device (D2D) copies instead of slower host-to-device (H2D) + // copies. This is useful for encoder-decoder models where the encoder output + // is reused across many decoder iterations. // // SUPPORTED OPTIONS (via set_option): // @@ -75,7 +75,8 @@ class ET_EXPERIMENTAL CudaBackend final // Only supports single-output methods. // Example: opts.set_option("store_output", "encoder_output"); // - // "use_stored_input" (string): For inputs matching the stored tensor's size, + // "use_stored_input" (string): For inputs matching the stored tensor's + // size, // use D2D copy from the stored tensor instead of H2D copy from CPU. // This setting persists across execute() calls until reset. // Example: opts.set_option("use_stored_input", "encoder_output"); @@ -401,7 +402,7 @@ class ET_EXPERIMENTAL CudaBackend final // Process input tensors: ExecuTorch provides CPU tensors, create GPU // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. 
- for (int i = 0; i < n_inputs; i++) { + for (size_t i = 0; i < n_inputs; i++) { // Get tensor dimensions and properties from ExecuTorch CPU tensor auto cpu_tensor = &(args[i]->toTensor()); auto sizes = cpu_tensor->sizes(); @@ -478,7 +479,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Process output tensors: create GPU counterparts for ExecuTorch CPU // tensors - for (int i = 0; i < n_outputs; i++) { + for (size_t i = 0; i < n_outputs; i++) { // Get output tensor dimensions from ExecuTorch CPU tensor auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); auto sizes = cpu_output_tensor->sizes(); @@ -557,7 +558,7 @@ class ET_EXPERIMENTAL CudaBackend final } // Copy GPU output results back to CPU output tensors - for (int i = 0; i < n_outputs; i++) { + for (size_t i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); // For DYNAMIC_BOUND tensors we try to resize ET_CHECK_OK_OR_RETURN_ERROR( diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 7610f3b89f5..93907b13549 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -333,8 +333,9 @@ Result> AsrRunner::transcribe( } } - // Reset stored input settings and free GPU memory after decoder loop completes. - // This disables the D2D copy optimization and releases the stored encoder output. + // Reset stored input settings and free GPU memory after decoder loop + // completes. This disables the D2D copy optimization and releases the stored + // encoder output. { ::executorch::runtime::BackendOptions<2> opts; opts.set_option("reset_stored_input", true); From 137e6da75290b69aa0b12ed0b4b6e653c967413b Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Wed, 3 Dec 2025 07:25:06 -0800 Subject: [PATCH 9/9] Minor nits --- backends/cuda/runtime/cuda_backend.cpp | 87 +++++++++++++++----------- extension/asr/runner/runner.cpp | 6 +- 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index ac82cd7a3f6..059d6c0ea29 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -185,7 +185,7 @@ class ET_EXPERIMENTAL CudaBackend final &option.value)) { store_output_name_ = std::string(arr->data()); } else { - ET_LOG(Warning, "store_output option expects a string value"); + ET_LOG(Error, "store_output option expects a string value"); return Error::InvalidArgument; } } @@ -196,7 +196,7 @@ class ET_EXPERIMENTAL CudaBackend final &option.value)) { use_stored_input_name_ = std::string(arr->data()); } else { - ET_LOG(Warning, "use_stored_input option expects a string value"); + ET_LOG(Error, "use_stored_input option expects a string value"); return Error::InvalidArgument; } } @@ -209,7 +209,7 @@ class ET_EXPERIMENTAL CudaBackend final use_stored_input_name_.clear(); } } else { - ET_LOG(Warning, "reset_stored_input option expects a boolean value"); + ET_LOG(Error, "reset_stored_input option expects a boolean value"); return Error::InvalidArgument; } } @@ -228,7 +228,7 @@ class ET_EXPERIMENTAL CudaBackend final gpu_tensors_.erase(it); } } else { - ET_LOG(Warning, "clear_stored_tensor option expects a string value"); + ET_LOG(Error, "clear_stored_tensor option expects a string value"); return Error::InvalidArgument; } } @@ -400,6 +400,10 @@ class ET_EXPERIMENTAL CudaBackend final }; TensorCleanup cleanup{gpu_inputs, gpu_outputs, gpu_tensors_}; + // Track which input index was matched for D2D copy (for duplicate + // detection) + ssize_t 
matched_input_idx = -1; + // Process input tensors: ExecuTorch provides CPU tensors, create GPU // copies. For stored inputs, use GPU-to-GPU copy instead of CPU-to-GPU. for (size_t i = 0; i < n_inputs; i++) { @@ -424,15 +428,12 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( create_err == Error::Ok, Internal, - "Failed to create GPU tensor for input %d", + "Failed to create GPU tensor for input %zu", i); gpu_inputs[i] = gpu_input_handle; // Check if this input matches a stored GPU tensor (by size). - // Note: Size-based matching assumes only one input will match. If - // multiple inputs have the same byte size as the stored tensor, the first - // match wins. if (!use_stored_input_name_.empty()) { auto it = gpu_tensors_.find(use_stored_input_name_); if (it != gpu_tensors_.end()) { @@ -443,29 +444,45 @@ class ET_EXPERIMENTAL CudaBackend final // Match by size: use stored tensor if sizes match if (copy_bytes == ref.size_bytes) { - ET_LOG( - Debug, - "Using stored tensor '%s' for input %d (%zu bytes, D2D copy)", - use_stored_input_name_.c_str(), - i, - copy_bytes); - - // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format - cudaError_t cuda_err = cudaMemcpy( - gpu_inputs[i]->data_ptr(), - ref.data_ptr, - copy_bytes, - cudaMemcpyDeviceToDevice); - - ET_CHECK_OR_RETURN_ERROR( - cuda_err == cudaSuccess, - Internal, - "Failed GPU-to-GPU copy for input %d: %s", - i, - cudaGetErrorString(cuda_err)); - - // Skip the CPU-to-GPU copy below - continue; + if (matched_input_idx >= 0) { + // Another input already matched - warn about ambiguity + ET_LOG( + Error, + "Multiple inputs match stored tensor '%s' size (%zu bytes): " + "input %zd was used, input %zu also matches. " + "Consider using unique tensor sizes or a different matching strategy.", + use_stored_input_name_.c_str(), + copy_bytes, + matched_input_idx, + i); + } else { + // First match - perform D2D copy + matched_input_idx = static_cast(i); + + ET_LOG( + Debug, + "Using stored tensor '%s' for input %zu (%zu bytes, D2D copy)", + use_stored_input_name_.c_str(), + i, + copy_bytes); + + // GPU-to-GPU copy: fast DMA transfer, normalizes tensor format + cudaError_t cuda_err = cudaMemcpy( + gpu_inputs[i]->data_ptr(), + ref.data_ptr, + copy_bytes, + cudaMemcpyDeviceToDevice); + + ET_CHECK_OR_RETURN_ERROR( + cuda_err == cudaSuccess, + Internal, + "Failed GPU-to-GPU copy for input %zu: %s", + i, + cudaGetErrorString(cuda_err)); + + // Skip the CPU-to-GPU copy below + continue; + } } } } @@ -474,7 +491,7 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok, Internal, - "Failed to copy input %d from CPU to GPU", + "Failed to copy input %zu from CPU to GPU", i); } // Process output tensors: create GPU counterparts for ExecuTorch CPU @@ -501,7 +518,7 @@ class ET_EXPERIMENTAL CudaBackend final ET_CHECK_OR_RETURN_ERROR( create_err == Error::Ok, Internal, - "Failed to create GPU tensor for output %d", + "Failed to create GPU tensor for output %zu", i); gpu_outputs[i] = gpu_output_handle; @@ -563,11 +580,11 @@ class ET_EXPERIMENTAL CudaBackend final // For DYNAMIC_BOUND tensors we try to resize ET_CHECK_OK_OR_RETURN_ERROR( resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()), - "Error resizing tensor at output index %d", + "Error resizing tensor at output index %zu", i); ET_CHECK_OK_OR_RETURN_ERROR( aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0), - "Failed to copy GPU output %d back to CPU", + "Failed to copy GPU output %zu back to CPU", i); } 
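// Illustrative sketch (not part of the patch): the ambiguity handling added in
// the hunk above reduces to a first-match-wins policy over input byte sizes.
// A minimal standalone version of that policy follows; the names InputDesc and
// pick_stored_match are hypothetical and do not exist in the backend.
#include <cstddef>
#include <cstdio>
#include <vector>

struct InputDesc {
  size_t size_bytes;
};

// Returns the index of the first input whose byte size equals the stored
// tensor's size; any later input with the same size only produces a
// diagnostic, mirroring the backend behavior above.
int pick_stored_match(const std::vector<InputDesc>& inputs, size_t stored_bytes) {
  int matched = -1;
  for (size_t i = 0; i < inputs.size(); ++i) {
    if (inputs[i].size_bytes != stored_bytes) {
      continue;
    }
    if (matched < 0) {
      matched = static_cast<int>(i); // first match wins: this slot gets the D2D copy
    } else {
      std::fprintf(
          stderr,
          "ambiguous match: input %zu has the same byte size as input %d\n",
          i,
          matched);
    }
  }
  return matched;
}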
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp index 93907b13549..b0f7f139307 100644 --- a/extension/asr/runner/runner.cpp +++ b/extension/asr/runner/runner.cpp @@ -205,7 +205,7 @@ Result> AsrRunner::transcribe( auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { ET_LOG( - Warning, + Debug, "Failed to set store_output option (backend may not support storage)"); } } @@ -272,7 +272,7 @@ Result> AsrRunner::transcribe( auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { ET_LOG( - Warning, + Debug, "Failed to set use_stored_input option (backend may not support storage)"); } } @@ -342,7 +342,7 @@ Result> AsrRunner::transcribe( opts.set_option("clear_stored_tensor", "encoder_output"); auto err = ::executorch::runtime::set_option("CudaBackend", opts.view()); if (err != ::executorch::runtime::Error::Ok) { - ET_LOG(Warning, "Failed to reset stored input settings"); + ET_LOG(Error, "Failed to reset stored input settings"); } }
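For reference, the caller-side flow that the runner.cpp changes in this series implement is sketched below. This is an illustrative condensation, not part of the patch: the wrapper function name is hypothetical, encoder and decoder execution are elided, and only the CudaBackend option traffic already shown in runner.cpp is reproduced.

// (ExecuTorch runtime headers providing BackendOptions and set_option are
// assumed to be included; exact paths are omitted here.)
using ::executorch::runtime::BackendOptions;
using ::executorch::runtime::Error;
using ::executorch::runtime::set_option;

// Hypothetical wrapper; everything other than the option keys and the
// runtime calls shown in runner.cpp is a placeholder.
void transcribe_with_gpu_cached_encoder_output() {
  // 1) Ask CudaBackend to keep the next (single) output on the GPU.
  {
    BackendOptions<1> opts;
    opts.set_option("store_output", "encoder_output");
    if (set_option("CudaBackend", opts.view()) != Error::Ok) {
      // Backend may not support storage; execution falls back to H2D copies.
    }
  }
  // ... run the encoder once (its output is now stored on the GPU) ...

  // 2) Route the size-matching decoder input through a D2D copy of the
  //    stored encoder output.
  {
    BackendOptions<1> opts;
    opts.set_option("use_stored_input", "encoder_output");
    (void)set_option("CudaBackend", opts.view());
  }
  // ... run the decoder for N iterations ...

  // 3) Stop the D2D routing and free the cached tensor's GPU memory.
  {
    BackendOptions<2> opts;
    opts.set_option("reset_stored_input", true);
    opts.set_option("clear_stored_tensor", "encoder_output");
    (void)set_option("CudaBackend", opts.view());
  }
}

Steps 1 through 3 correspond one-to-one to the three option blocks in AsrRunner::transcribe() that this series touches.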