diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 622aca66a96de..ad26d3c516ff0 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -336,9 +336,9 @@ def allocate(self, seq_group: SequenceGroup) -> None:
 
         # Assign the self-attention block tables for each sequence.
         if len(wait_seqs) == 1:
-            self.block_tables[wait_seqs[0].seq_id] = block_table
+            self.block_tables[seq.seq_id] = block_table
         else:
-            for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
+            for seq in wait_seqs:
                 self.block_tables[seq.seq_id] = block_table.copy()
 
         # Allocate encoder sequence
diff --git a/vllm/sequence.py b/vllm/sequence.py
index ba477efc54dd6..fd2dc96566786 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -655,6 +655,9 @@ def get_unfinished_seqs(self) -> List[Sequence]:
         return [seq for seq in self.seqs if not seq.is_finished()]
 
     def get_finished_seqs(self) -> List[Sequence]:
+        if self.is_single_seq:
+            return self.seqs if self.seqs[0].is_finished() else []
+
         return [seq for seq in self.seqs if seq.is_finished()]
 
     def update_num_computed_tokens(self, num_new_computed_tokens: int):