diff --git a/vllm/v1/engine/async_llm_engine.py b/vllm/v1/engine/async_llm_engine.py
index cec03015c7e25..329fcd369df94 100644
--- a/vllm/v1/engine/async_llm_engine.py
+++ b/vllm/v1/engine/async_llm_engine.py
@@ -268,45 +268,3 @@ async def run_output_handler(self):
             # List[RequestOutput] rather than pushing to the Queue at the
             # expense of doing another loop through List[RequestOutput] here.
             self.detokenizer.step_streaming(engine_core_outputs)
-
-    async def abort(self):
-        pass
-
-    async def check_health(self):
-        pass
-
-    async def dead_error(self):
-        pass
-
-    async def do_log_stats(self):
-        pass
-
-    async def encode(self):
-        pass
-
-    async def errored(self):
-        pass
-
-    async def get_decoding_config(self):
-        pass
-
-    async def get_model_config(self):
-        pass
-
-    async def get_tokenizer(self):
-        pass
-
-    async def is_running(self):
-        pass
-
-    async def is_stopped(self):
-        pass
-
-    async def is_tracing_enabled(self):
-        pass
-
-    async def start_profile(self):
-        pass
-
-    async def stop_profile(self):
-        pass
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 8f5e1f45a037c..b1d58bff1eb2c 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -17,13 +17,14 @@
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.detokenizer import Detokenizer
 from vllm.v1.engine.llm_engine_core import LLMEngineCore
+from vllm.v1.engine.protocol import LLMEngineProtocol
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.gpu_executor import GPUExecutor
 
 logger = init_logger(__name__)
 
 
-class LLMEngine:
+class LLMEngine(LLMEngineProtocol):
 
     def __init__(
         self,
@@ -141,9 +142,3 @@ def step(self) -> List[RequestOutput]:
 
         request_outputs = self.detokenizer.step(engine_core_outputs)
         return request_outputs
-
-    def get_num_unfinished_requests(self) -> int:
-        return self.detokenizer.get_num_unfinished_requests()
-
-    def has_unfinished_requests(self) -> bool:
-        return self.detokenizer.has_unfinished_requests()
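
The diff imports `LLMEngineProtocol` from `vllm/v1/engine/protocol.py`, but that file itself is not part of the hunks above. A minimal sketch of what such a shared protocol/base class could look like, assuming it simply hoists the request-bookkeeping helpers deleted from `LLMEngine` (the `Detokenizer` attribute and the two method names come from this diff; everything else is an illustrative guess, not the actual vLLM implementation):

```python
# Hypothetical sketch of vllm/v1/engine/protocol.py -- the real file is not
# shown in this diff. It illustrates one way the helpers removed from
# LLMEngine could live on a shared protocol/base class instead.
from typing import Protocol

from vllm.v1.engine.detokenizer import Detokenizer


class LLMEngineProtocol(Protocol):
    # Both the sync and async engines keep per-request state in a
    # Detokenizer, so the bookkeeping helpers can be written once
    # against that attribute.
    detokenizer: Detokenizer

    def get_num_unfinished_requests(self) -> int:
        # Requests that have not yet finished decoding/streaming.
        return self.detokenizer.get_num_unfinished_requests()

    def has_unfinished_requests(self) -> bool:
        # True while any request is still in flight.
        return self.detokenizer.has_unfinished_requests()
```

Because `LLMEngine` now subclasses the protocol explicitly, it would inherit any default implementations defined there, which is consistent with `get_num_unfinished_requests` and `has_unfinished_requests` being deleted from `llm_engine.py` rather than reimplemented.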