diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
index 13bde6d07a5e..bbb67db3e3c8 100644
--- a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
+++ b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -204,30 +204,6 @@ class RuntimeBuffers
         TensorPtr logits;
         //! Record the usage offset of the cacheGenerationLogits buffer
         SizeType32 offset{0};
-
-        //! Temporarily store the transposed results of multiple fragment logits, [maxBeamWidth, kCACHE_LENGTH]
-        TensorPtr transposedLogits;
-
-        //! Temporarily store logits buffer address during the transposing, [kCACHE_LENGTH]
-        TensorPtr fragmentPointerDevice;
-
-        //! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH]
-        TensorPtr fragmentPointerHost;
-
-        //! Cycling index for workspace
-        size_t workIdx{0};
-
-        void cycleWorkIdx()
-        {
-            workIdx = (workIdx + 1) % (fragmentPointerHost->getShape().d[0]);
-        }
-
-        [[nodiscard]] TensorPtr getFragmentPointerHost()
-        {
-            TensorPtr slice = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1);
-            cycleWorkIdx();
-            return slice;
-        };
     };
 
     GenerationLogitsCache generationLogitsCache;
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
index d1112439686e..cad7abcfad8a 100644
--- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -3945,16 +3945,19 @@ SizeType32 KVCacheManager::copyBlockOffsets(ITensor& output, SizeType32 outputSl
         {
             for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
             {
-                auto const beamBlockCount = cacheBlockIds[beamIdx].size();
-                auto const copyChunkSize = beamBlockCount * sizeof(tk::KVCacheIndex);
+                // For cross-KV (encoder features), all beams of a request share the same encoder output.
+                // Always use beam 0's block IDs so all beams attend to the correct encoder features.
+                auto const srcBeamIdx = isCrossKv() ? 0 : beamIdx;
+                auto const effectiveBlockCount = isCrossKv() ? cacheBlockIds[0].size() : cacheBlockIds[beamIdx].size();
+                auto const copyChunkSize = effectiveBlockCount * sizeof(tk::KVCacheIndex);
                 for (auto xIdx : {kIdx, vIdx})
                 {
-                    auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, beamIdx, xIdx, 0);
+                    auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, srcBeamIdx, xIdx, 0);
                     auto const dstIndex
                         = tc::flat_index(dstShape.d, absolutePoolIdx, outputSlotOffset + beamIdx, xIdx, 0);
                     std::memcpy(dstPtr + dstIndex, srcPtr + srcIndex, copyChunkSize);
                 }
-                maxBlockCount = std::max<SizeType32>(maxBlockCount, static_cast<SizeType32>(beamBlockCount));
+                maxBlockCount = std::max<SizeType32>(maxBlockCount, static_cast<SizeType32>(effectiveBlockCount));
             }
         }
     }
diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
index 691fb9c7efda..4da808f1c16d 100644
--- a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
+++ b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -146,16 +146,9 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
         auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
         auto const logitsType = engine.getTensorDataType(batch_manager::RuntimeBuffers::kLogitsTensorName);
 
-        generationLogitsCache.transposedLogits = manager.gpu(
-            ITensor::makeShape({maxBeamWidth, GenerationLogitsCache::kCACHE_LENGTH, vocabSizePadded}), logitsType);
         generationLogitsCache.logits = manager.gpu(
             ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH, maxBatchSize * maxBeamWidth, vocabSizePadded}),
             logitsType);
-
-        generationLogitsCache.fragmentPointerDevice
-            = manager.gpu(ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
-        generationLogitsCache.fragmentPointerHost = tensorrt_llm::runtime::BufferManager::pinnedPool(
-            ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64);
     }
 
     if (modelConfig.useCrossAttention())
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 05ed827a9511..1638a70f2e6d 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -1216,8 +1216,18 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests
         {
             for (auto const& llmReq : activeRequests)
             {
+                // Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration.
+                // terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds.
+                mInflightReqIds.erase(llmReq->mRequestId);
                 terminateRequest(llmReq);
             }
+            // Force buffer/decoder reset to clean up any partial state from the aborted batch
+            // (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing).
+            // This prevents subsequent requests from reusing stale RuntimeBuffers.
+            if (mWorldConfig.isLastPipelineParallelRank())
+            {
+                changeBeamWidth(mOperatingBeamWidth);
+            }
         }
         catch (std::exception const& e)
         {
diff --git a/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp b/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp
index bdb12886337c..f1b3934c4fb4 100644
--- a/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp
+++ b/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp
@@ -16,7 +16,6 @@
  */
 
 #include "inflightBatchingUtils.h"
-#include "tensorrt_llm/runtime/runtimeKernels.h"
 
 namespace tensorrt_llm::batch_manager::utils
 {
@@ -102,34 +101,39 @@ void copyGenerationLogits(RuntimeBuffers::GenerationLogitsCache& generationLogit
         "Dropped tokens have to be defined for all beams.");
 
     auto const fragmentSize = llmReq.getGenerationLogitsFragmentsSize();
+    auto const& fragments = llmReq.getGenerationLogitsFragments();
 
-    // Merge logits fragments on device
-    auto const& transposeBufferPtr = generationLogitsCache.transposedLogits;
-    auto const& cachePointerDevice = generationLogitsCache.fragmentPointerDevice;
-    auto const& cachePointerHost = generationLogitsCache.getFragmentPointerHost();
-    tensorrt_llm::runtime::kernels::mergeLogitsFragments(bufferManager, *transposeBufferPtr,
-        llmReq.getGenerationLogitsFragments(), *cachePointerDevice, *cachePointerHost, 0, 1, reqBeamWidth,
-        bufferManager.getStream(), 0);
-    llmReq.clearGenerationLogitsFragments();
-
-    // Copy logits to host
+    // Bypass mergeLogitsFragmentsKernel: copy each beam's logits directly from fragment GPU memory
+    // to the host, step by step. Each fragment has shape [1, beamWidth, vocabSizePadded] after
+    // unsqueeze(0) in HandleGenerationLogits; beam b's data starts at offset b*vocab from the
+    // fragment's base pointer. This avoids a kernel+pointer-indirection pattern that causes
+    // intermittent token corruption in gather_generation_logits+concurrent-beam-width scenarios.
     for (SizeType32 beam = 0; beam < reqBeamWidth; beam++)
     {
         auto const droppedSize = !numDroppedTokens.empty() ? numDroppedTokens.at(beam) : 0;
-        // Ignore logits of dropped tokens
         auto const beamFragmentSize = fragmentSize - droppedSize;
-        // If this function is called before the decoder, the request does not contain the generated token of the
-        // current iteration, so we add 1 to the number of tokens.
         auto const numGenerationToken
             = static_cast<SizeType32>(beforeDecoder) + llmReq.getNumTokens(beam) - llmReq.mPromptLen;
         auto const hostOffset = numGenerationToken - beamFragmentSize;
 
-        // [beamWidth, GENERATION_LOGITS_BUFFER_LENGTH, vocabSizePadded] -> [beamFragmentSize, vocabSizePadded]
-        auto beamDeviceTensorPtr = ITensor::slice(transposeBufferPtr, {beam, 0}, beamFragmentSize);
-        // [beamWidth, mMaxNewTokens, vocabSizePadded] -> [beamFragmentSize, vocabSizePadded]
-        auto beamHostTensorPtr = ITensor::slice(llmReq.getGenerationLogitsHost(), {beam, hostOffset}, beamFragmentSize);
-        bufferManager.copy(*beamDeviceTensorPtr, *beamHostTensorPtr);
+        SizeType32 constexpr kOneStep = 1;
+        for (SizeType32 stepIdx = 0; stepIdx < static_cast<SizeType32>(beamFragmentSize); ++stepIdx)
+        {
+            // frag shape: [1, beamWidth, vocabSizePadded]. Beam b starts at offset b*vocab.
+            auto const fragBeamSlice = ITensor::slice(fragments.at(stepIdx), {0, beam}, kOneStep);
+            // host shape: [beamWidth, mMaxNewTokens, vocabSizePadded]. Target: [beam, hostOffset+stepIdx, :].
+            auto const hostStepSlice
+                = ITensor::slice(llmReq.getGenerationLogitsHost(), {beam, hostOffset + stepIdx}, kOneStep);
+            bufferManager.copy(*fragBeamSlice, *hostStepSlice);
+        }
     }
+    // Clear the fragment list.  Although BufferManager::copy() enqueues async GPU-to-host
+    // transfers, no explicit stream synchronization is required here: clearing the list
+    // releases the fragment tensor *objects*, not the underlying GPU buffer.  The source
+    // memory lives inside generationLogitsCache.logits which is owned by RuntimeBuffers and
+    // remains valid until the next changeBeamWidth() call.  GPU-side ordering is ensured by
+    // the CUDA event that the decoder stream waits on before reading any shared state.
+    llmReq.clearGenerationLogitsFragments();
 
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu
index 3b3dbcac894a..9c806485bbcc 100644
--- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu
+++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
+ * SPDX-FileCopyrightText: Copyright (c) 1993-2026 NVIDIA CORPORATION &
  * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -232,75 +232,6 @@ void invokeTileTensor(ITensor& output, ITensor const& input, SizeType32 const be
         inputRowSize, outputRowSize, static_cast<uint32_t>(beamWidth));
 }
 
-// In the following kernel, we launch a grid with (microBatchSize * beamWidth, outputLen) blocks of threads. Each thread
-// block copies a `vocabSizePadded` length logits tensor from the "inputLogits (microBatchSize, beamWidth,
-// vocabSizePadded)" to the "outputGenerationLogits (batchSize, beamWidth, outputLen, vocabSizePadded)"
-template <typename T>
-__global__ void mergeLogitsFragmentsKernel(T* output, T** fragmentsVector, int const outputLen, int firstBatchSlotIdx,
-    int beamWidth, int vocabSizePadded, int stepOffset)
-{
-    // output: shape: [batchSize, beamWidth, outputLen, vocabSize]
-    // inputVecor.at(i): shape: [microBatchSize, beamWidth, vocabSize]
-
-    // Current step
-    int const curStep = blockIdx.y;
-
-    // The relatively batch slot index that this thread block in microBatchSize.
-    int const relativeBatchSlotIdx = blockIdx.x / beamWidth;
-
-    // The Absolute batch slot index in batchSize.
-    int const absoluteBatchSlotIdx = firstBatchSlotIdx + relativeBatchSlotIdx;
-
-    // The beam index that this thread block process
-    int const mbeamIdx = blockIdx.x % beamWidth;
-
-    // The output pointer
-    unsigned int const outputOffset
-        = (absoluteBatchSlotIdx * beamWidth * outputLen + mbeamIdx * outputLen + curStep + stepOffset)
-        * vocabSizePadded;
-
-    T* outputPtr = &output[outputOffset];
-
-    unsigned int const inputOffset = (relativeBatchSlotIdx * beamWidth + mbeamIdx) * vocabSizePadded;
-    // The input pointer.
-    T const* inputPtr = &fragmentsVector[curStep][inputOffset];
-
-    // The threads in the block collaborate to copy the logits.
-    for (int idx = threadIdx.x; idx < vocabSizePadded; idx += blockDim.x)
-    {
-        outputPtr[idx] = inputPtr[idx];
-    }
-}
-
-template <typename T>
-void invokeMergeLogitsFragments(BufferManager const& bufferManager, ITensor& output,
-    std::vector<TensorPtr> const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
-    SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream,
-    int stepOffset)
-{
-    size_t const fragmentsVectorSize = fragmentsVector.size();
-
-    auto cachePointerHostPtr = bufferCast<T*>(cachePointerHost);
-
-    for (int i = 0; i < fragmentsVectorSize; i++)
-    {
-        cachePointerHostPtr[i] = bufferCast<T>(*fragmentsVector.at(i));
-    }
-    bufferManager.copy(cachePointerHost, cachePointerDevice);
-
-    dim3 const blockSize(256);
-    dim3 const gridSize{(unsigned int) (microBatchSize * beamWidth), (unsigned int) (fragmentsVectorSize)};
-
-    auto const& outputShape = output.getShape();
-    auto const vocabSizePadded = static_cast<SizeType32>(outputShape.d[outputShape.nbDims - 1]);
-    auto const outputLen = static_cast<SizeType32>(outputShape.d[outputShape.nbDims - 2]);
-
-    TLLM_CHECK_WITH_INFO(outputLen >= fragmentsVectorSize, "Fragments size does not match outputLen size");
-
-    mergeLogitsFragmentsKernel<T><<<gridSize, blockSize, 0, stream.get()>>>(bufferCast<T>(output),
-        bufferCast<T*>(cachePointerDevice), outputLen, firstBatchSlotIdx, beamWidth, vocabSizePadded, stepOffset);
-}
-
 } // namespace
 
 template <typename T>
@@ -437,37 +368,6 @@ void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, Cud
     }
 }
 
-void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output,
-    std::vector<TensorPtr> const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
-    SizeType32 firstBatchSlotIdx, SizeType32 const microBatchSize, SizeType32 const beamWidth, CudaStream const& stream,
-    int stepOffset)
-{
-    switch (output.getDataType())
-    {
-    case nvinfer1::DataType::kFLOAT:
-        invokeMergeLogitsFragments<float>(bufferManager, output, fragmentsVector, cachePointerDevice, cachePointerHost,
-            firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset);
-        break;
-    case nvinfer1::DataType::kHALF:
-        invokeMergeLogitsFragments<half>(bufferManager, output, fragmentsVector, cachePointerDevice, cachePointerHost,
-            firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset);
-        break;
-#ifdef ENABLE_BF16
-    case nvinfer1::DataType::kBF16:
-        invokeMergeLogitsFragments<__nv_bfloat16>(bufferManager, output, fragmentsVector, cachePointerDevice,
-            cachePointerHost, firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset);
-        break;
-#endif // ENABLE_BF16
-#ifdef ENABLE_FP8
-    case nvinfer1::DataType::kFP8:
-        invokeMergeLogitsFragments<__nv_fp8_e4m3>(bufferManager, output, fragmentsVector, cachePointerDevice,
-            cachePointerHost, firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset);
-        break;
-#endif // ENABLE_FP8
-    default: TLLM_THROW("data type not supported");
-    }
-}
-
 void invokeUpdateKVBlockArrayDraftTokenLocation(ITensor const& seqAcceptedDraftTokenOffsets,
     ITensor const& packedAcceptedDraftTokensIndices, ITensor const& pastKeyValueLengths, void* const* pointerArray,
     ::tensorrt_llm::kernels::KVCacheIndex const* offsetArray, SizeType32 layerCount, SizeType32 seqCount,
diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.h b/cpp/tensorrt_llm/runtime/runtimeKernels.h
index ae251877e583..55824856af09 100644
--- a/cpp/tensorrt_llm/runtime/runtimeKernels.h
+++ b/cpp/tensorrt_llm/runtime/runtimeKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,11 +47,6 @@ void scatterTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth,
 
 void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, CudaStream const& stream);
 
-void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output,
-    std::vector<TensorPtr> const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost,
-    SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream,
-    int stepOffset);
-
 void invokeUpdateKVBlockArrayDraftTokenLocation(ITensor const& seqAcceptedDraftTokenOffsets,
     ITensor const& packedAcceptedDraftTokensIndices, ITensor const& pastKeyValueLengths, void* const* pointerArray,
     ::tensorrt_llm::kernels::KVCacheIndex const* offsetArray, SizeType32 layerCount, SizeType32 seqCount,
diff --git a/cpp/tests/unit_tests/batch_manager/CMakeLists.txt b/cpp/tests/unit_tests/batch_manager/CMakeLists.txt
index 2ee994f76226..eeb5ec34deef 100644
--- a/cpp/tests/unit_tests/batch_manager/CMakeLists.txt
+++ b/cpp/tests/unit_tests/batch_manager/CMakeLists.txt
@@ -31,3 +31,4 @@ add_gtest(rnnCacheFormatterTest rnnCacheFormatterTest.cpp)
 add_gtest(cudaGraphExecutorCacheTest cudaGraphExecutorCacheTest.cpp)
 add_gtest(agentTreeTest agentTreeTest.cpp)
 add_gtest(truncateBlocksTest truncateBlocksTest.cpp)
+add_gtest(encDecBeamSearchTest encDecBeamSearchTest.cpp)
diff --git a/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp b/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp
new file mode 100644
index 000000000000..1d95204fce2b
--- /dev/null
+++ b/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp
@@ -0,0 +1,219 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Regression tests for encoder-decoder beam-search fixes.
+//
+// Fix 1 — cross-KV copyBlockOffsets shares beam-0 blocks across all beams.
+//   Before: beams 1..N-1 received uninitialised physical blocks, causing
+//   degenerate output ("happ happ happ") when the decoder cross-attended to
+//   garbage encoder features.
+//   After:  KVCacheManager::copyBlockOffsets uses beam-0's block IDs for every
+//   beam when isCrossKv() is true, since the encoder output is identical for
+//   all beams of a request.
+//
+// Fix 3 — copyGenerationLogits direct-copy places each (beam, step) logit at
+//   the correct host offset.
+//   Before: the mergeLogitsFragmentsKernel pointer-indirection caused
+//   intermittent GPU-state corruption when gather_generation_logits=True was
+//   combined with concurrent mixed beam-width requests.
+//   After:  each fragment is copied directly to the host without the kernel.
+
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
+#include "tensorrt_llm/batch_manager/utils/inflightBatchingUtils.h"
+#include "tensorrt_llm/common/memoryUtils.h"
+#include "tensorrt_llm/kernels/kvCacheIndex.h"
+#include "tensorrt_llm/runtime/bufferManager.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+using namespace tensorrt_llm::batch_manager;
+using namespace tensorrt_llm::batch_manager::kv_cache_manager;
+namespace tr = tensorrt_llm::runtime;
+namespace tc = tensorrt_llm::common;
+namespace tk = tensorrt_llm::kernels;
+using SizeType32 = tr::SizeType32;
+
+// ============================================================================
+// Fix 1: KVCacheManager::copyBlockOffsets cross-KV beam sharing
+// ============================================================================
+
+// Verify that for a cross-KV cache with beam width > 1, copyBlockOffsets
+// places the same physical block IDs in every beam slot.
+//
+// This tests the isCrossKv() branch added by fix 1: when all beams share the
+// same encoder output, the output offset table must reflect that.  In the
+// production Whisper case the allocator gives each beam its own physical
+// blocks, so beams 1..N-1 would reference uninitialised GPU memory without
+// the fix.  This unit test uses a simple context-only sequence where the
+// allocator happens to share blocks across beams; the observable property
+// (all beam slots equal beam-0) still holds and constitutes a sanity check.
+TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks)
+{
+    // Encoder-decoder setup: 1 layer, 1 KV head, sizePerHead=4,
+    // tokensPerBlock=8, encoder output length=16 → 2 blocks per sequence,
+    // beam width=3.
+    auto stream = std::make_shared<tr::CudaStream>();
+
+    SizeType32 constexpr numLayers = 1;
+    SizeType32 constexpr numHeads = 1;
+    SizeType32 constexpr sizePerHead = 4;
+    SizeType32 constexpr tokensPerBlock = 8;
+    SizeType32 constexpr maxNumSequences = 1;
+    SizeType32 constexpr beamWidth = 3;
+    SizeType32 constexpr maxAttentionWindow = 16;
+    SizeType32 constexpr encoderLen = 16; // 2 blocks
+
+    // Reserve enough blocks for 1 sequence × beamWidth × (encoderLen / tokensPerBlock).
+    SizeType32 constexpr numBlocks = maxNumSequences * beamWidth * (encoderLen / tokensPerBlock);
+    BlocksPerWindow const blocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}};
+
+    KVCacheManager crossKvMgr(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences,
+        beamWidth, {maxAttentionWindow}, nvinfer1::DataType::kHALF,
+        /*sinkTokenLength=*/0, stream, maxAttentionWindow, maxAttentionWindow,
+        /*enableBlockReuse=*/false, CacheType::kCROSS);
+    crossKvMgr.allocatePools(false);
+
+    // Build a minimal LlmRequest and allocate a cross-KV sequence.
+    RequestIdType constexpr requestId = 1;
+    auto inputTokens = std::make_shared<VecTokens>(encoderLen, 0);
+    tr::SamplingConfig const samplingConfig{beamWidth};
+    auto llmReq = std::make_shared<LlmRequest>(requestId, /*maxNewTokens=*/0, inputTokens, samplingConfig, false);
+    crossKvMgr.addSequenceBatch({{{requestId, encoderLen, beamWidth}}}, {std::ref(*llmReq)});
+
+    // Allocate CPU output tensor: [numPools, maxNumSeq*beamWidth, 2, maxBlocksPerSeq].
+    auto const dims = crossKvMgr.getOffsetTableDimensions();
+    SizeType32 const numPools = dims.numPools;
+    SizeType32 const maxBlocksPerSeq = dims.maxBlocksPerSeq;
+    auto blockOffsets
+        = tr::BufferManager::cpu(tr::ITensor::makeShape({numPools, maxNumSequences * beamWidth, 2, maxBlocksPerSeq}),
+            tr::TRTDataType<tk::KVCacheIndex>::value);
+
+    // Fill with sentinel so we can detect un-written slots.
+    auto* const raw = tr::bufferCast<tk::KVCacheIndex>(*blockOffsets);
+    std::fill(raw, raw + blockOffsets->getSize(), tk::KVCacheIndex{tk::KVCacheIndex::kInvalidPoolIndex});
+
+    crossKvMgr.copyBlockOffsets(*blockOffsets, /*outputSlotOffset=*/0, requestId);
+
+    // Post-condition: for every (pool, K/V, block), beams 1..beamWidth-1
+    // must hold the same physical block index as beam 0, and beam 0 itself
+    // must be a valid (non-sentinel) index.
+    auto const& shape = blockOffsets->getShape();
+    for (SizeType32 pool = 0; pool < numPools; ++pool)
+    {
+        for (SizeType32 kv = 0; kv < 2; ++kv)
+        {
+            for (SizeType32 block = 0; block < maxBlocksPerSeq; ++block)
+            {
+                auto idx = [&](SizeType32 beam) { return tc::flat_index(shape.d, pool, beam, kv, block); };
+
+                EXPECT_NE(raw[idx(0)].get(), tk::KVCacheIndex::kInvalidPoolIndex)
+                    << "pool=" << pool << " beam=0 kv=" << kv << " block=" << block << ": not initialised";
+
+                for (SizeType32 beam = 1; beam < beamWidth; ++beam)
+                {
+                    EXPECT_EQ(raw[idx(beam)].get(), raw[idx(0)].get())
+                        << "pool=" << pool << " beam=" << beam << " kv=" << kv << " block=" << block
+                        << ": differs from beam 0 — cross-KV beam sharing is broken";
+                }
+            }
+        }
+    }
+}
+
+// ============================================================================
+// Fix 3: copyGenerationLogits direct-copy correctness
+// ============================================================================
+
+// Verify that the direct-copy implementation of copyGenerationLogits writes
+// each step's logits for each beam to the correct slot in the host buffer.
+TEST(CopyGenerationLogitsTest, DirectCopyPlacesEachBeamStepAtCorrectHostOffset)
+{
+    // Parameters.
+    SizeType32 constexpr beamWidth = 2;
+    SizeType32 constexpr numSteps = 4; // one full cache-length flush
+    SizeType32 constexpr vocabSize = 8;
+    SizeType32 constexpr promptLen = 1;
+
+    auto stream = std::make_shared<tr::CudaStream>();
+    tr::BufferManager bufferMgr{stream};
+
+    // Create a request: promptLen=1, maxNewTokens=numSteps.
+    RequestIdType constexpr requestId = 1;
+    auto inputTokens = std::make_shared<VecTokens>(promptLen, 0);
+    tr::SamplingConfig const samplingConfig{beamWidth};
+    auto llmReq = std::make_shared<LlmRequest>(requestId, numSteps, inputTokens, samplingConfig, /*isStreaming=*/false);
+
+    // Advance internal token count to simulate numSteps tokens generated so
+    // that (with beforeDecoder=false):
+    //   numGenerationToken = getNumTokens(beam) - mPromptLen = numSteps
+    //   hostOffset         = numGenerationToken - fragmentSize = 0
+    LlmRequest::BeamTokens const generatedTokens(beamWidth, VecTokens(numSteps, /*token=*/1));
+    llmReq->setGeneratedTokens(generatedTokens);
+    llmReq->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT);
+
+    // Build numSteps logit fragments, each of shape [1, beamWidth, vocabSize],
+    // filled with a unique per-(step, beam) sentinel value: step*100 + beam.
+    for (SizeType32 step = 0; step < numSteps; ++step)
+    {
+        tr::ITensor::SharedPtr frag = tr::BufferManager::pinnedPool(
+            tr::ITensor::makeShape({1, beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT);
+        auto* const fragData = tr::bufferCast<float>(*frag);
+        for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+        {
+            float const val = static_cast<float>(step * 100 + beam);
+            for (SizeType32 v = 0; v < vocabSize; ++v)
+            {
+                // Flat layout: [1][beam][v] → beam * vocabSize + v
+                fragData[beam * vocabSize + v] = val;
+            }
+        }
+        llmReq->addGenerationLogitsFragment(frag);
+    }
+    ASSERT_EQ(llmReq->getGenerationLogitsFragmentsSize(), numSteps);
+
+    // Dummy cache — not accessed by the direct-copy implementation.
+    RuntimeBuffers::GenerationLogitsCache dummyCache;
+
+    utils::copyGenerationLogits(dummyCache, bufferMgr, *llmReq, /*beforeDecoder=*/false, /*numDroppedTokens=*/{});
+
+    ASSERT_EQ(cudaStreamSynchronize(stream->get()), cudaSuccess);
+
+    // Post-condition: generationLogitsHost[beam, step, v] == step*100 + beam
+    // for all (beam, step, v).  Host shape: [beamWidth, maxNewTokens, vocab].
+    auto const* const hostData = tr::bufferCast<float>(*llmReq->getGenerationLogitsHost());
+    for (SizeType32 beam = 0; beam < beamWidth; ++beam)
+    {
+        for (SizeType32 step = 0; step < numSteps; ++step)
+        {
+            float const expected = static_cast<float>(step * 100 + beam);
+            for (SizeType32 v = 0; v < vocabSize; ++v)
+            {
+                SizeType32 const flatIdx = (beam * numSteps + step) * vocabSize + v;
+                EXPECT_FLOAT_EQ(hostData[flatIdx], expected) << "host[beam=" << beam << ", step=" << step << ", v=" << v
+                                                             << "]=" << hostData[flatIdx] << " expected " << expected;
+            }
+        }
+    }
+
+    // copyGenerationLogits must clear fragments after flushing.
+    EXPECT_EQ(llmReq->getGenerationLogitsFragmentsSize(), 0);
+}