22import queue
33from multiprocessing .process import BaseProcess
44from threading import Thread
5- from typing import List , Tuple , Type
5+ from typing import List , Tuple , Type , Any
66
77import msgspec
88import zmq
1313from vllm .usage .usage_lib import UsageContext
1414from vllm .v1 .core .scheduler import Scheduler
1515from vllm .v1 .engine import (POLLING_TIMEOUT_MS , EngineCoreOutput ,
16- EngineCoreOutputs , EngineCoreRequest )
16+ EngineCoreOutputs , EngineCoreRequest ,
17+ EngineCoreRequestType )
1718from vllm .v1 .executor .gpu_executor import GPUExecutor
18- from vllm .v1 .request import Request
19+ from vllm .v1 .request import Request , RequestStatus
1920from vllm .version import __version__ as VLLM_VERSION
2021
2122logger = init_logger (__name__ )
@@ -123,6 +124,10 @@ def add_request(self, request: EngineCoreRequest):
123124 req = Request .from_engine_core_request (request )
124125 self .scheduler .add_request (req )
125126
def abort_requests(self, request_ids: List[str]):
    """Abort the requests identified by ``request_ids``.

    Asks the scheduler to finish each listed request with the
    ``RequestStatus.FINISHED_ABORTED`` status.
    """
    aborted_status = RequestStatus.FINISHED_ABORTED
    self.scheduler.finish_requests(request_ids, aborted_status)
130+
126131 def step (self ) -> List [EngineCoreOutput ]:
127132 """Schedule, execute, and make output."""
128133
@@ -153,7 +158,10 @@ def __init__(
153158 super ().__init__ (vllm_config , executor_class , usage_context )
154159
155160 self .msgpack_encoder = msgspec .msgpack .Encoder ()
156- self .msgpack_decoder = msgspec .msgpack .Decoder (EngineCoreRequest )
161+ self .msgpack_add_request_decoder = \
162+ msgspec .msgpack .Decoder (EngineCoreRequest )
163+ self .msgpack_abort_requests_decoder = \
164+ msgspec .msgpack .Decoder (list [str ])
157165
158166 self .ctx = zmq .Context () # type: ignore[attr-defined]
159167
@@ -182,10 +190,27 @@ def __init__(
182190 ready_socket .close (linger = 0 )
183191
def process_input_socket(self):
    """Receive requests from the input socket and enqueue them, forever.

    Every message is unpacked as exactly two frames: a request-type tag
    followed by a msgpack-encoded payload. The payload is decoded with the
    decoder that matches the tag, and the ``(request_type, payload)`` pair
    is put on ``self.input_queue``.

    Raises:
        ValueError: If the tag matches no known request type, or if a
            message does not consist of exactly two frames (unpacking).
    """

    def get_decoder_from_request_type(req_type: bytes) \
            -> msgspec.msgpack.Decoder:
        # Identify msgpack decoder based on request_type.
        if req_type == EngineCoreRequestType.AddRequest.value:
            return self.msgpack_add_request_decoder
        if req_type == EngineCoreRequestType.AbortRequest.value:
            return self.msgpack_abort_requests_decoder
        # Report the tag that was actually checked; the enclosing scope's
        # frame variable would print as an opaque zmq Frame object.
        raise ValueError(f"Unhandled request type {req_type}")

    while True:
        type_frame, data_frame = \
            self.input_socket.recv_multipart(copy=False)

        # Copy the small type tag out of the zmq frame so the queue carries
        # plain bytes, as the Tuple[bytes, Any] annotation on
        # _handle_request expects.
        request_type = bytes(type_frame.buffer)

        # Decode the payload with the decoder matching its type tag.
        msgpack_decoder = get_decoder_from_request_type(request_type)
        request_data: Any = msgpack_decoder.decode(data_frame.buffer)

        self.input_queue.put_nowait((request_type, request_data))
189214
190215 def process_output_socket (self ):
191216 while True :
@@ -267,8 +292,7 @@ def run_busy_loop(self):
267292 while True :
268293 # Poll the input socket until there is work to do.
269294 if not self .scheduler .has_unfinished_requests ():
270- request = self .input_queue .get ()
271- self ._handle_request (request )
295+ self ._handle_request (self .input_queue .get ())
272296
273297 # Handle new input from the socket.
274298 self ._handle_new_input ()
@@ -282,17 +306,27 @@ def run_busy_loop(self):
282306 def _handle_new_input (self ):
283307 """Handle new input from the AsyncLLMEngine for async mode."""
284308 while not self .input_queue .empty ():
285- request = self .input_queue .get_nowait ()
286- self ._handle_request (request )
309+ self ._handle_request (self .input_queue .get_nowait ())
310+
def _handle_request(self, request: Tuple[bytes, Any]):
    """Dispatch one queued request to the matching engine operation.

    Args:
        request: A ``(request_type, request_data)`` pair as placed on the
            input queue; ``request_type`` is compared against the
            ``EngineCoreRequestType`` values to pick the operation.

    Raises:
        ValueError: If ``request_type`` matches no known request type.
        AssertionError: If ``request_data`` is not of the type expected
            for its ``request_type``.
    """
    try:
        request_type, request_data = request
        # Process request_data based on request_type
        if request_type == EngineCoreRequestType.AddRequest.value:
            assert isinstance(request_data, EngineCoreRequest), \
                f'Unexpected datatype {type(request_data)}'
            self.add_request(request_data)
        elif request_type == EngineCoreRequestType.AbortRequest.value:
            assert isinstance(request_data, list), \
                f'Unexpected datatype {type(request_data)}'
            # Go through the public abort entry point, mirroring how the
            # add branch goes through add_request.
            self.abort_requests(request_data)
        else:
            raise ValueError(f"Unhandled request type {request_type}")

        # TODO: handle logits processors via cloudpickle
        # TODO: handle profiling
    except Exception:
        # TODO: handle gracefully
        # Bare raise keeps the original traceback intact.
        raise
0 commit comments