sgl-project · merrymercy · Dec 27, 2024 · Dec 13, 2024 · Dec 19, 2024 · Dec 19, 2024
@@ -15,7 +15,8 @@
 import json
 import logging
 from enum import IntEnum, auto
-from typing import List, Optional, Union
+from functools import lru_cache
+from typing import List, Optional, Set, Union
 
 import torch
 from transformers import PretrainedConfig
@@ -264,6 +265,14 @@ def _verify_quantization(self) -> None:
                     self.quantization,
                 )
 
+    @lru_cache()
+    def get_hf_eos_token_id(self) -> Optional[Set[int]]:
+        eos_ids = getattr(self.hf_config, "eos_token_id", None)
+        if eos_ids:
+            # it can be either int or list of int
+            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        return eos_ids
+
 
 def get_hf_text_config(config: PretrainedConfig):
     """Get the "sub" config relevant to llm for multi modal models.

@@ -29,7 +29,7 @@
 
 import dataclasses
 import logging
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional, Set, Tuple, Union
 
 import numpy as np
 import torch
@@ -209,6 +209,7 @@ def __init__(
         lora_path: Optional[str] = None,
         input_embeds: Optional[List[List[float]]] = None,
         session_id: Optional[str] = None,
+        eos_token_ids: Optional[Set[int]] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -236,6 +237,7 @@ def __init__(
         self.finished_reason = None
         self.to_abort = False
         self.stream = stream
+        self.eos_token_ids = eos_token_ids
 
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -395,18 +397,23 @@ def check_finished(self):
 
         last_token_id = self.output_ids[-1]
 
-        matched_eos = False
-
-        # Check stop token ids
-        if self.sampling_params.stop_token_ids:
-            matched_eos = last_token_id in self.sampling_params.stop_token_ids
-        if self.tokenizer is not None:
-            matched_eos |= last_token_id == self.tokenizer.eos_token_id
-            if self.tokenizer.additional_stop_token_ids:
-                matched_eos |= last_token_id in self.tokenizer.additional_stop_token_ids
-        if matched_eos and not self.sampling_params.ignore_eos:
-            self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
-            return
+        if not self.sampling_params.ignore_eos:
+            matched_eos = False
+
+            # Check stop token ids
+            if self.sampling_params.stop_token_ids:
+                matched_eos = last_token_id in self.sampling_params.stop_token_ids
+            if self.eos_token_ids:
+                matched_eos |= last_token_id in self.eos_token_ids
+            if self.tokenizer is not None:
+                matched_eos |= last_token_id == self.tokenizer.eos_token_id
+                if self.tokenizer.additional_stop_token_ids:
+                    matched_eos |= (
+                        last_token_id in self.tokenizer.additional_stop_token_ids
+                    )
+            if matched_eos:
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
+                return
 
         # Check stop strings
         if len(self.sampling_params.stop_strs) > 0:

@@ -520,6 +520,7 @@ def handle_generate_request(
                 stream=recv_req.stream,
                 lora_path=recv_req.lora_path,
                 input_embeds=recv_req.input_embeds,
+                eos_token_ids=self.model_config.get_hf_eos_token_id(),
             )
             req.tokenizer = self.tokenizer