Generate: fix assistant in different device (huggingface#33257)
gante authored Sep 2, 2024
1 parent 52a0213 commit 97c0f45
Showing 2 changed files with 2 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/transformers/generation/utils.py
@@ -3964,6 +3964,7 @@ def _assisted_decoding(

             # 1. Fetch candidate sequences from a `CandidateGenerator`
             candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
+            candidate_input_ids = candidate_input_ids.to(self.device)
             if candidate_logits is not None:
                 candidate_logits = candidate_logits.to(self.device)
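The added line mirrors the device handling already applied to `candidate_logits`: whatever the candidate generator returns is moved onto the main model's device before the verification step uses it. A minimal standalone sketch of that alignment, assuming the assistant produces its tensors on CPU while the main model sits on CUDA (the tensor shapes and values below are made up for illustration):

    import torch

    main_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Tensors roughly as a CPU-resident assistant might return them from `get_candidates`.
    candidate_input_ids = torch.tensor([[1, 5, 9, 2]], device="cpu")
    candidate_logits = torch.randn(1, 3, 32, device="cpu")

    # Same alignment as the patch: move both onto the main model's device
    # before they are compared against the main model's outputs.
    candidate_input_ids = candidate_input_ids.to(main_device)
    if candidate_logits is not None:
        candidate_logits = candidate_logits.to(main_device)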
2 changes: 1 addition & 1 deletion tests/generation/test_utils.py
@@ -3323,7 +3323,7 @@ def test_assisted_decoding_in_different_gpu(self):

     @slow
     @require_torch_gpu
-    def test_assisted_decoding_in_gpu_cpu(self):
+    def test_assisted_decoding_model_in_gpu_assistant_in_cpu(self):
         # PT-only test: TF doesn't support assisted decoding yet.
         model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to("cuda")
         assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-MistralForCausalLM").to(
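The renamed test exercises exactly this split placement: main model on GPU, assistant on CPU. A rough usage sketch of the same scenario, reusing the tiny checkpoint from the test (requires a CUDA GPU; the prompt and generation length are arbitrary):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    checkpoint = "hf-internal-testing/tiny-random-MistralForCausalLM"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint).to("cuda")
    assistant = AutoModelForCausalLM.from_pretrained(checkpoint).to("cpu")

    inputs = tokenizer("Hello world", return_tensors="pt").to("cuda")
    # With this commit, candidate ids produced by the CPU assistant are moved to
    # the main model's device inside `_assisted_decoding`, so assisted generation
    # no longer hits a device-mismatch error.
    outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=8)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))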
