diff --git a/tests/engine/v1/test_async_llm.py b/tests/engine/v1/test_async_llm.py
index 9f6e2906a4b6b..c6e9a08bef8ec 100644
--- a/tests/engine/v1/test_async_llm.py
+++ b/tests/engine/v1/test_async_llm.py
@@ -5,8 +5,13 @@
 
 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.platforms import current_platform
 from vllm.v1.engine.async_llm import AsyncLLM
 
+if not current_platform.is_cuda():
+    pytest.skip(reason="V1 currently only supported on CUDA.",
+                allow_module_level=True)
+
 ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B",
                               disable_log_requests=False)
 
diff --git a/tests/engine/v1/test_engine_core.py b/tests/engine/v1/test_engine_core.py
index 25c01c13306ca..3c0d49e866404 100644
--- a/tests/engine/v1/test_engine_core.py
+++ b/tests/engine/v1/test_engine_core.py
@@ -1,15 +1,21 @@
 import time
 import uuid
 
+import pytest
 from transformers import AutoTokenizer
 
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core import EngineCore
 
+if not current_platform.is_cuda():
+    pytest.skip(reason="V1 currently only supported on CUDA.",
+                allow_module_level=True)
+
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "Hello my name is Robert and I love quanitzation kernels"
diff --git a/tests/engine/v1/test_engine_core_client.py b/tests/engine/v1/test_engine_core_client.py
index 57ad8c84b0160..aa283ad8d9da2 100644
--- a/tests/engine/v1/test_engine_core_client.py
+++ b/tests/engine/v1/test_engine_core_client.py
@@ -8,11 +8,16 @@
 
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import EngineCoreClient
 
+if not current_platform.is_cuda():
+    pytest.skip(reason="V1 currently only supported on CUDA.",
+                allow_module_level=True)+
+
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "Hello my name is Robert and I love quanitzation kernels"
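
For context, a minimal self-contained sketch of the module-level skip pattern the patch applies to each V1 test file; the module name and placeholder test are hypothetical, and only pytest.skip(..., allow_module_level=True) and vllm.platforms.current_platform.is_cuda(), both shown in the diff above, are relied on.

# sketch_v1_cuda_skip.py -- hypothetical test module illustrating the pattern
import pytest

from vllm.platforms import current_platform

# Skip the whole module at collection time when not running on CUDA,
# since the V1 engine is currently CUDA-only.
if not current_platform.is_cuda():
    pytest.skip(reason="V1 currently only supported on CUDA.",
                allow_module_level=True)


def test_placeholder():
    # Hypothetical body; the real tests exercise AsyncLLM / EngineCore.
    assert current_platform.is_cuda()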