diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h index 13bde6d07a5e..bbb67db3e3c8 100644 --- a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h +++ b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,30 +204,6 @@ class RuntimeBuffers TensorPtr logits; //! Record the usage offset of the cacheGenerationLogits buffer SizeType32 offset{0}; - - //! Temporarily store the transposed results of multiple fragment logits, [maxBeamWidth, kCACHE_LENGTH] - TensorPtr transposedLogits; - - //! Temporarily store logits buffer address during the transposing, [kCACHE_LENGTH] - TensorPtr fragmentPointerDevice; - - //! Temporarily store logits buffer address during the transposing, [maxBatchSize, kCACHE_LENGTH] - TensorPtr fragmentPointerHost; - - //! Cycling index for workspace - size_t workIdx{0}; - - void cycleWorkIdx() - { - workIdx = (workIdx + 1) % (fragmentPointerHost->getShape().d[0]); - } - - [[nodiscard]] TensorPtr getFragmentPointerHost() - { - TensorPtr slice = runtime::ITensor::slice(fragmentPointerHost, workIdx, 1); - cycleWorkIdx(); - return slice; - }; }; GenerationLogitsCache generationLogitsCache; diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index d1112439686e..cad7abcfad8a 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -3945,16 +3945,19 @@ SizeType32 KVCacheManager::copyBlockOffsets(ITensor& output, SizeType32 outputSl { for (SizeType32 beamIdx = 0; beamIdx < beamWidth; ++beamIdx) { - auto const beamBlockCount = cacheBlockIds[beamIdx].size(); - auto const copyChunkSize = beamBlockCount * sizeof(tk::KVCacheIndex); + // For cross-KV (encoder features), all beams of a request share the same encoder output. + // Always use beam 0's block IDs so all beams attend to the correct encoder features. + auto const srcBeamIdx = isCrossKv() ? 0 : beamIdx; + auto const effectiveBlockCount = isCrossKv() ? cacheBlockIds[0].size() : cacheBlockIds[beamIdx].size(); + auto const copyChunkSize = effectiveBlockCount * sizeof(tk::KVCacheIndex); for (auto xIdx : {kIdx, vIdx}) { - auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, beamIdx, xIdx, 0); + auto const srcIndex = tc::flat_index(srcShape.d, poolIdx, srcBeamIdx, xIdx, 0); auto const dstIndex = tc::flat_index(dstShape.d, absolutePoolIdx, outputSlotOffset + beamIdx, xIdx, 0); std::memcpy(dstPtr + dstIndex, srcPtr + srcIndex, copyChunkSize); } - maxBlockCount = std::max(maxBlockCount, static_cast(beamBlockCount)); + maxBlockCount = std::max(maxBlockCount, static_cast(effectiveBlockCount)); } } } diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp index 691fb9c7efda..4da808f1c16d 100644 --- a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -146,16 +146,9 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize()); auto const logitsType = engine.getTensorDataType(batch_manager::RuntimeBuffers::kLogitsTensorName); - generationLogitsCache.transposedLogits = manager.gpu( - ITensor::makeShape({maxBeamWidth, GenerationLogitsCache::kCACHE_LENGTH, vocabSizePadded}), logitsType); generationLogitsCache.logits = manager.gpu( ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH, maxBatchSize * maxBeamWidth, vocabSizePadded}), logitsType); - - generationLogitsCache.fragmentPointerDevice - = manager.gpu(ITensor::makeShape({GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64); - generationLogitsCache.fragmentPointerHost = tensorrt_llm::runtime::BufferManager::pinnedPool( - ITensor::makeShape({maxBatchSize, GenerationLogitsCache::kCACHE_LENGTH}), nvinfer1::DataType::kINT64); } if (modelConfig.useCrossAttention()) diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp index 05ed827a9511..1638a70f2e6d 100644 --- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp +++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp @@ -1216,8 +1216,18 @@ void TrtGptModelInflightBatching::forwardAsync(RequestList const& activeRequests { for (auto const& llmReq : activeRequests) { + // Remove from mInflightReqIds so changeBeamWidth can proceed on the next iteration. + // terminateRequest frees seqSlot/KV cache but does not clean up mInflightReqIds. + mInflightReqIds.erase(llmReq->mRequestId); terminateRequest(llmReq); } + // Force buffer/decoder reset to clean up any partial state from the aborted batch + // (e.g. partially-filled cross-KV block offsets from mid-context-chunk processing). + // This prevents subsequent requests from reusing stale RuntimeBuffers. + if (mWorldConfig.isLastPipelineParallelRank()) + { + changeBeamWidth(mOperatingBeamWidth); + } } catch (std::exception const& e) { diff --git a/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp b/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp index bdb12886337c..f1b3934c4fb4 100644 --- a/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp +++ b/cpp/tensorrt_llm/batch_manager/utils/inflightBatchingUtils.cpp @@ -16,7 +16,6 @@ */ #include "inflightBatchingUtils.h" -#include "tensorrt_llm/runtime/runtimeKernels.h" namespace tensorrt_llm::batch_manager::utils { @@ -102,34 +101,39 @@ void copyGenerationLogits(RuntimeBuffers::GenerationLogitsCache& generationLogit "Dropped tokens have to be defined for all beams."); auto const fragmentSize = llmReq.getGenerationLogitsFragmentsSize(); + auto const& fragments = llmReq.getGenerationLogitsFragments(); - // Merge logits fragments on device - auto const& transposeBufferPtr = generationLogitsCache.transposedLogits; - auto const& cachePointerDevice = generationLogitsCache.fragmentPointerDevice; - auto const& cachePointerHost = generationLogitsCache.getFragmentPointerHost(); - tensorrt_llm::runtime::kernels::mergeLogitsFragments(bufferManager, *transposeBufferPtr, - llmReq.getGenerationLogitsFragments(), *cachePointerDevice, *cachePointerHost, 0, 1, reqBeamWidth, - bufferManager.getStream(), 0); - llmReq.clearGenerationLogitsFragments(); - - // Copy logits to host + // Bypass mergeLogitsFragmentsKernel: copy each beam's logits directly from fragment GPU memory + // to the host, step by step. Each fragment has shape [1, beamWidth, vocabSizePadded] after + // unsqueeze(0) in HandleGenerationLogits; beam b's data starts at offset b*vocab from the + // fragment's base pointer. This avoids a kernel+pointer-indirection pattern that causes + // intermittent token corruption in gather_generation_logits+concurrent-beam-width scenarios. for (SizeType32 beam = 0; beam < reqBeamWidth; beam++) { auto const droppedSize = !numDroppedTokens.empty() ? numDroppedTokens.at(beam) : 0; - // Ignore logits of dropped tokens auto const beamFragmentSize = fragmentSize - droppedSize; - // If this function is called before the decoder, the request does not contain the generated token of the - // current iteration, so we add 1 to the number of tokens. auto const numGenerationToken = static_cast(beforeDecoder) + llmReq.getNumTokens(beam) - llmReq.mPromptLen; auto const hostOffset = numGenerationToken - beamFragmentSize; - // [beamWidth, GENERATION_LOGITS_BUFFER_LENGTH, vocabSizePadded] -> [beamFragmentSize, vocabSizePadded] - auto beamDeviceTensorPtr = ITensor::slice(transposeBufferPtr, {beam, 0}, beamFragmentSize); - // [beamWidth, mMaxNewTokens, vocabSizePadded] -> [beamFragmentSize, vocabSizePadded] - auto beamHostTensorPtr = ITensor::slice(llmReq.getGenerationLogitsHost(), {beam, hostOffset}, beamFragmentSize); - bufferManager.copy(*beamDeviceTensorPtr, *beamHostTensorPtr); + SizeType32 constexpr kOneStep = 1; + for (SizeType32 stepIdx = 0; stepIdx < static_cast(beamFragmentSize); ++stepIdx) + { + // frag shape: [1, beamWidth, vocabSizePadded]. Beam b starts at offset b*vocab. + auto const fragBeamSlice = ITensor::slice(fragments.at(stepIdx), {0, beam}, kOneStep); + // host shape: [beamWidth, mMaxNewTokens, vocabSizePadded]. Target: [beam, hostOffset+stepIdx, :]. + auto const hostStepSlice + = ITensor::slice(llmReq.getGenerationLogitsHost(), {beam, hostOffset + stepIdx}, kOneStep); + bufferManager.copy(*fragBeamSlice, *hostStepSlice); + } } + // Clear the fragment list. Although BufferManager::copy() enqueues async GPU-to-host + // transfers, no explicit stream synchronization is required here: clearing the list + // releases the fragment tensor *objects*, not the underlying GPU buffer. The source + // memory lives inside generationLogitsCache.logits which is owned by RuntimeBuffers and + // remains valid until the next changeBeamWidth() call. GPU-side ordering is ensured by + // the CUDA event that the decoder stream waits on before reading any shared state. + llmReq.clearGenerationLogitsFragments(); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu index 3b3dbcac894a..9c806485bbcc 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & + * SPDX-FileCopyrightText: Copyright (c) 1993-2026 NVIDIA CORPORATION & * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -232,75 +232,6 @@ void invokeTileTensor(ITensor& output, ITensor const& input, SizeType32 const be inputRowSize, outputRowSize, static_cast(beamWidth)); } -// In the following kernel, we launch a grid with (microBatchSize * beamWidth, outputLen) blocks of threads. Each thread -// block copies a `vocabSizePadded` length logits tensor from the "inputLogits (microBatchSize, beamWidth, -// vocabSizePadded)" to the "outputGenerationLogits (batchSize, beamWidth, outputLen, vocabSizePadded)" -template -__global__ void mergeLogitsFragmentsKernel(T* output, T** fragmentsVector, int const outputLen, int firstBatchSlotIdx, - int beamWidth, int vocabSizePadded, int stepOffset) -{ - // output: shape: [batchSize, beamWidth, outputLen, vocabSize] - // inputVecor.at(i): shape: [microBatchSize, beamWidth, vocabSize] - - // Current step - int const curStep = blockIdx.y; - - // The relatively batch slot index that this thread block in microBatchSize. - int const relativeBatchSlotIdx = blockIdx.x / beamWidth; - - // The Absolute batch slot index in batchSize. - int const absoluteBatchSlotIdx = firstBatchSlotIdx + relativeBatchSlotIdx; - - // The beam index that this thread block process - int const mbeamIdx = blockIdx.x % beamWidth; - - // The output pointer - unsigned int const outputOffset - = (absoluteBatchSlotIdx * beamWidth * outputLen + mbeamIdx * outputLen + curStep + stepOffset) - * vocabSizePadded; - - T* outputPtr = &output[outputOffset]; - - unsigned int const inputOffset = (relativeBatchSlotIdx * beamWidth + mbeamIdx) * vocabSizePadded; - // The input pointer. - T const* inputPtr = &fragmentsVector[curStep][inputOffset]; - - // The threads in the block collaborate to copy the logits. - for (int idx = threadIdx.x; idx < vocabSizePadded; idx += blockDim.x) - { - outputPtr[idx] = inputPtr[idx]; - } -} - -template -void invokeMergeLogitsFragments(BufferManager const& bufferManager, ITensor& output, - std::vector const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost, - SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream, - int stepOffset) -{ - size_t const fragmentsVectorSize = fragmentsVector.size(); - - auto cachePointerHostPtr = bufferCast(cachePointerHost); - - for (int i = 0; i < fragmentsVectorSize; i++) - { - cachePointerHostPtr[i] = bufferCast(*fragmentsVector.at(i)); - } - bufferManager.copy(cachePointerHost, cachePointerDevice); - - dim3 const blockSize(256); - dim3 const gridSize{(unsigned int) (microBatchSize * beamWidth), (unsigned int) (fragmentsVectorSize)}; - - auto const& outputShape = output.getShape(); - auto const vocabSizePadded = static_cast(outputShape.d[outputShape.nbDims - 1]); - auto const outputLen = static_cast(outputShape.d[outputShape.nbDims - 2]); - - TLLM_CHECK_WITH_INFO(outputLen >= fragmentsVectorSize, "Fragments size does not match outputLen size"); - - mergeLogitsFragmentsKernel<<>>(bufferCast(output), - bufferCast(cachePointerDevice), outputLen, firstBatchSlotIdx, beamWidth, vocabSizePadded, stepOffset); -} - } // namespace template @@ -437,37 +368,6 @@ void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, Cud } } -void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output, - std::vector const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost, - SizeType32 firstBatchSlotIdx, SizeType32 const microBatchSize, SizeType32 const beamWidth, CudaStream const& stream, - int stepOffset) -{ - switch (output.getDataType()) - { - case nvinfer1::DataType::kFLOAT: - invokeMergeLogitsFragments(bufferManager, output, fragmentsVector, cachePointerDevice, cachePointerHost, - firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset); - break; - case nvinfer1::DataType::kHALF: - invokeMergeLogitsFragments(bufferManager, output, fragmentsVector, cachePointerDevice, cachePointerHost, - firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset); - break; -#ifdef ENABLE_BF16 - case nvinfer1::DataType::kBF16: - invokeMergeLogitsFragments<__nv_bfloat16>(bufferManager, output, fragmentsVector, cachePointerDevice, - cachePointerHost, firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset); - break; -#endif // ENABLE_BF16 -#ifdef ENABLE_FP8 - case nvinfer1::DataType::kFP8: - invokeMergeLogitsFragments<__nv_fp8_e4m3>(bufferManager, output, fragmentsVector, cachePointerDevice, - cachePointerHost, firstBatchSlotIdx, microBatchSize, beamWidth, stream, stepOffset); - break; -#endif // ENABLE_FP8 - default: TLLM_THROW("data type not supported"); - } -} - void invokeUpdateKVBlockArrayDraftTokenLocation(ITensor const& seqAcceptedDraftTokenOffsets, ITensor const& packedAcceptedDraftTokensIndices, ITensor const& pastKeyValueLengths, void* const* pointerArray, ::tensorrt_llm::kernels::KVCacheIndex const* offsetArray, SizeType32 layerCount, SizeType32 seqCount, diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.h b/cpp/tensorrt_llm/runtime/runtimeKernels.h index ae251877e583..55824856af09 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.h +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,11 +47,6 @@ void scatterTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, void tileTensor(ITensor& output, ITensor const& input, SizeType32 beamWidth, CudaStream const& stream); -void mergeLogitsFragments(BufferManager const& bufferManager, ITensor& output, - std::vector const& fragmentsVector, ITensor& cachePointerDevice, ITensor& cachePointerHost, - SizeType32 firstBatchSlotIdx, SizeType32 microBatchSize, SizeType32 beamWidth, CudaStream const& stream, - int stepOffset); - void invokeUpdateKVBlockArrayDraftTokenLocation(ITensor const& seqAcceptedDraftTokenOffsets, ITensor const& packedAcceptedDraftTokensIndices, ITensor const& pastKeyValueLengths, void* const* pointerArray, ::tensorrt_llm::kernels::KVCacheIndex const* offsetArray, SizeType32 layerCount, SizeType32 seqCount, diff --git a/cpp/tests/unit_tests/batch_manager/CMakeLists.txt b/cpp/tests/unit_tests/batch_manager/CMakeLists.txt index 2ee994f76226..eeb5ec34deef 100644 --- a/cpp/tests/unit_tests/batch_manager/CMakeLists.txt +++ b/cpp/tests/unit_tests/batch_manager/CMakeLists.txt @@ -31,3 +31,4 @@ add_gtest(rnnCacheFormatterTest rnnCacheFormatterTest.cpp) add_gtest(cudaGraphExecutorCacheTest cudaGraphExecutorCacheTest.cpp) add_gtest(agentTreeTest agentTreeTest.cpp) add_gtest(truncateBlocksTest truncateBlocksTest.cpp) +add_gtest(encDecBeamSearchTest encDecBeamSearchTest.cpp) diff --git a/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp b/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp new file mode 100644 index 000000000000..1d95204fce2b --- /dev/null +++ b/cpp/tests/unit_tests/batch_manager/encDecBeamSearchTest.cpp @@ -0,0 +1,219 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Regression tests for encoder-decoder beam-search fixes. +// +// Fix 1 — cross-KV copyBlockOffsets shares beam-0 blocks across all beams. +// Before: beams 1..N-1 received uninitialised physical blocks, causing +// degenerate output ("happ happ happ") when the decoder cross-attended to +// garbage encoder features. +// After: KVCacheManager::copyBlockOffsets uses beam-0's block IDs for every +// beam when isCrossKv() is true, since the encoder output is identical for +// all beams of a request. +// +// Fix 3 — copyGenerationLogits direct-copy places each (beam, step) logit at +// the correct host offset. +// Before: the mergeLogitsFragmentsKernel pointer-indirection caused +// intermittent GPU-state corruption when gather_generation_logits=True was +// combined with concurrent mixed beam-width requests. +// After: each fragment is copied directly to the host without the kernel. + +#include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/batch_manager/runtimeBuffers.h" +#include "tensorrt_llm/batch_manager/utils/inflightBatchingUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/kernels/kvCacheIndex.h" +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/cudaStream.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/samplingConfig.h" +#include "gtest/gtest.h" +#include + +using namespace tensorrt_llm::batch_manager; +using namespace tensorrt_llm::batch_manager::kv_cache_manager; +namespace tr = tensorrt_llm::runtime; +namespace tc = tensorrt_llm::common; +namespace tk = tensorrt_llm::kernels; +using SizeType32 = tr::SizeType32; + +// ============================================================================ +// Fix 1: KVCacheManager::copyBlockOffsets cross-KV beam sharing +// ============================================================================ + +// Verify that for a cross-KV cache with beam width > 1, copyBlockOffsets +// places the same physical block IDs in every beam slot. +// +// This tests the isCrossKv() branch added by fix 1: when all beams share the +// same encoder output, the output offset table must reflect that. In the +// production Whisper case the allocator gives each beam its own physical +// blocks, so beams 1..N-1 would reference uninitialised GPU memory without +// the fix. This unit test uses a simple context-only sequence where the +// allocator happens to share blocks across beams; the observable property +// (all beam slots equal beam-0) still holds and constitutes a sanity check. +TEST(CrossKvBeamSharingTest, CopyBlockOffsetsAllBeamsShareBeam0Blocks) +{ + // Encoder-decoder setup: 1 layer, 1 KV head, sizePerHead=4, + // tokensPerBlock=8, encoder output length=16 → 2 blocks per sequence, + // beam width=3. + auto stream = std::make_shared(); + + SizeType32 constexpr numLayers = 1; + SizeType32 constexpr numHeads = 1; + SizeType32 constexpr sizePerHead = 4; + SizeType32 constexpr tokensPerBlock = 8; + SizeType32 constexpr maxNumSequences = 1; + SizeType32 constexpr beamWidth = 3; + SizeType32 constexpr maxAttentionWindow = 16; + SizeType32 constexpr encoderLen = 16; // 2 blocks + + // Reserve enough blocks for 1 sequence × beamWidth × (encoderLen / tokensPerBlock). + SizeType32 constexpr numBlocks = maxNumSequences * beamWidth * (encoderLen / tokensPerBlock); + BlocksPerWindow const blocksPerWindow{{maxAttentionWindow, {numBlocks, 0}}}; + + KVCacheManager crossKvMgr(numLayers, numHeads, sizePerHead, tokensPerBlock, blocksPerWindow, maxNumSequences, + beamWidth, {maxAttentionWindow}, nvinfer1::DataType::kHALF, + /*sinkTokenLength=*/0, stream, maxAttentionWindow, maxAttentionWindow, + /*enableBlockReuse=*/false, CacheType::kCROSS); + crossKvMgr.allocatePools(false); + + // Build a minimal LlmRequest and allocate a cross-KV sequence. + RequestIdType constexpr requestId = 1; + auto inputTokens = std::make_shared(encoderLen, 0); + tr::SamplingConfig const samplingConfig{beamWidth}; + auto llmReq = std::make_shared(requestId, /*maxNewTokens=*/0, inputTokens, samplingConfig, false); + crossKvMgr.addSequenceBatch({{{requestId, encoderLen, beamWidth}}}, {std::ref(*llmReq)}); + + // Allocate CPU output tensor: [numPools, maxNumSeq*beamWidth, 2, maxBlocksPerSeq]. + auto const dims = crossKvMgr.getOffsetTableDimensions(); + SizeType32 const numPools = dims.numPools; + SizeType32 const maxBlocksPerSeq = dims.maxBlocksPerSeq; + auto blockOffsets + = tr::BufferManager::cpu(tr::ITensor::makeShape({numPools, maxNumSequences * beamWidth, 2, maxBlocksPerSeq}), + tr::TRTDataType::value); + + // Fill with sentinel so we can detect un-written slots. + auto* const raw = tr::bufferCast(*blockOffsets); + std::fill(raw, raw + blockOffsets->getSize(), tk::KVCacheIndex{tk::KVCacheIndex::kInvalidPoolIndex}); + + crossKvMgr.copyBlockOffsets(*blockOffsets, /*outputSlotOffset=*/0, requestId); + + // Post-condition: for every (pool, K/V, block), beams 1..beamWidth-1 + // must hold the same physical block index as beam 0, and beam 0 itself + // must be a valid (non-sentinel) index. + auto const& shape = blockOffsets->getShape(); + for (SizeType32 pool = 0; pool < numPools; ++pool) + { + for (SizeType32 kv = 0; kv < 2; ++kv) + { + for (SizeType32 block = 0; block < maxBlocksPerSeq; ++block) + { + auto idx = [&](SizeType32 beam) { return tc::flat_index(shape.d, pool, beam, kv, block); }; + + EXPECT_NE(raw[idx(0)].get(), tk::KVCacheIndex::kInvalidPoolIndex) + << "pool=" << pool << " beam=0 kv=" << kv << " block=" << block << ": not initialised"; + + for (SizeType32 beam = 1; beam < beamWidth; ++beam) + { + EXPECT_EQ(raw[idx(beam)].get(), raw[idx(0)].get()) + << "pool=" << pool << " beam=" << beam << " kv=" << kv << " block=" << block + << ": differs from beam 0 — cross-KV beam sharing is broken"; + } + } + } + } +} + +// ============================================================================ +// Fix 3: copyGenerationLogits direct-copy correctness +// ============================================================================ + +// Verify that the direct-copy implementation of copyGenerationLogits writes +// each step's logits for each beam to the correct slot in the host buffer. +TEST(CopyGenerationLogitsTest, DirectCopyPlacesEachBeamStepAtCorrectHostOffset) +{ + // Parameters. + SizeType32 constexpr beamWidth = 2; + SizeType32 constexpr numSteps = 4; // one full cache-length flush + SizeType32 constexpr vocabSize = 8; + SizeType32 constexpr promptLen = 1; + + auto stream = std::make_shared(); + tr::BufferManager bufferMgr{stream}; + + // Create a request: promptLen=1, maxNewTokens=numSteps. + RequestIdType constexpr requestId = 1; + auto inputTokens = std::make_shared(promptLen, 0); + tr::SamplingConfig const samplingConfig{beamWidth}; + auto llmReq = std::make_shared(requestId, numSteps, inputTokens, samplingConfig, /*isStreaming=*/false); + + // Advance internal token count to simulate numSteps tokens generated so + // that (with beforeDecoder=false): + // numGenerationToken = getNumTokens(beam) - mPromptLen = numSteps + // hostOffset = numGenerationToken - fragmentSize = 0 + LlmRequest::BeamTokens const generatedTokens(beamWidth, VecTokens(numSteps, /*token=*/1)); + llmReq->setGeneratedTokens(generatedTokens); + llmReq->allocGenerationLogitsHost(vocabSize, nvinfer1::DataType::kFLOAT); + + // Build numSteps logit fragments, each of shape [1, beamWidth, vocabSize], + // filled with a unique per-(step, beam) sentinel value: step*100 + beam. + for (SizeType32 step = 0; step < numSteps; ++step) + { + tr::ITensor::SharedPtr frag = tr::BufferManager::pinnedPool( + tr::ITensor::makeShape({1, beamWidth, vocabSize}), nvinfer1::DataType::kFLOAT); + auto* const fragData = tr::bufferCast(*frag); + for (SizeType32 beam = 0; beam < beamWidth; ++beam) + { + float const val = static_cast(step * 100 + beam); + for (SizeType32 v = 0; v < vocabSize; ++v) + { + // Flat layout: [1][beam][v] → beam * vocabSize + v + fragData[beam * vocabSize + v] = val; + } + } + llmReq->addGenerationLogitsFragment(frag); + } + ASSERT_EQ(llmReq->getGenerationLogitsFragmentsSize(), numSteps); + + // Dummy cache — not accessed by the direct-copy implementation. + RuntimeBuffers::GenerationLogitsCache dummyCache; + + utils::copyGenerationLogits(dummyCache, bufferMgr, *llmReq, /*beforeDecoder=*/false, /*numDroppedTokens=*/{}); + + ASSERT_EQ(cudaStreamSynchronize(stream->get()), cudaSuccess); + + // Post-condition: generationLogitsHost[beam, step, v] == step*100 + beam + // for all (beam, step, v). Host shape: [beamWidth, maxNewTokens, vocab]. + auto const* const hostData = tr::bufferCast(*llmReq->getGenerationLogitsHost()); + for (SizeType32 beam = 0; beam < beamWidth; ++beam) + { + for (SizeType32 step = 0; step < numSteps; ++step) + { + float const expected = static_cast(step * 100 + beam); + for (SizeType32 v = 0; v < vocabSize; ++v) + { + SizeType32 const flatIdx = (beam * numSteps + step) * vocabSize + v; + EXPECT_FLOAT_EQ(hostData[flatIdx], expected) << "host[beam=" << beam << ", step=" << step << ", v=" << v + << "]=" << hostData[flatIdx] << " expected " << expected; + } + } + } + + // copyGenerationLogits must clear fragments after flushing. + EXPECT_EQ(llmReq->getGenerationLogitsFragmentsSize(), 0); +}