Fix package loss for small models (#2717)

sgl-project · Jan 3, 2025 · ffb5816 · ffb5816
1 parent c7ae474
commit ffb5816
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 30 deletions.
diff --git a/docs/references/contribution_guide.md b/docs/references/contribution_guide.md
@@ -6,7 +6,7 @@ Welcome to **SGLang**! We appreciate your interest in contributing. This guide p
 
 ### Fork and Clone the Repository
 
-**Note**: New contributors do **not** have the write permission to push to SGLang. Please fork the repository under your GitHub account, then clone your fork locally.
+**Note**: New contributors do **not** have the write permission to push to the official SGLang repo. Please fork the repository under your GitHub account, then clone your fork locally.
 
 ```bash
 git clone https://github.com/<your_user_name>/sglang.git
@@ -36,7 +36,6 @@ SGLang uses Python's built-in [unittest](https://docs.python.org/3/library/unitt
 
 We recommend new contributors start by writing documentation, which helps you quickly understand the SGLang codebase. For more details, please refer to [docs/README.md](https://github.com/sgl-project/sglang/tree/main/docs/README.md).
 
-
 ## Tips for Newcomers
 
 If you want to contribute but don’t have a specific idea in mind, pick issues labeled [“good first issue” or “help wanted”](https://github.com/sgl-project/sglang/issues?q=is%3Aissue+label%3A%22good+first+issue%22%2C%22help+wanted%22). These tasks typically have lower complexity and provide an excellent introduction to the codebase. Also check out this [code walk-through](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/sglang/code-walk-through) for a deeper look into SGLang’s workflow.

diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
@@ -1364,11 +1364,11 @@ def stream_output(
             embeddings = []
             prompt_tokens = []
             for req in reqs:
-                assert req.finished()
-                rids.append(req.rid)
-                finished_reasons.append(req.finished_reason.to_json())
-                embeddings.append(req.embedding)
-                prompt_tokens.append(len(req.origin_input_ids))
+                if req.finished():
+                    rids.append(req.rid)
+                    finished_reasons.append(req.finished_reason.to_json())
+                    embeddings.append(req.embedding)
+                    prompt_tokens.append(len(req.origin_input_ids))
             self.send_to_detokenizer.send_pyobj(
                 BatchEmbeddingOut(rids, finished_reasons, embeddings, prompt_tokens)
             )

diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
@@ -222,10 +222,8 @@ async def generate_request(
             is_single = obj.is_single
             if is_single:
                 tokenized_obj = await self._tokenize_one_request(obj)
-                self.send_to_scheduler.send_pyobj(tokenized_obj)
-                async for response in self._wait_one_response(
-                    obj, request, created_time
-                ):
+                self._send_one_request(obj, tokenized_obj, created_time)
+                async for response in self._wait_one_response(obj, request):
                     yield response
             else:
                 async for response in self._handle_batch_request(
@@ -306,16 +304,24 @@ async def _tokenize_one_request(
 
         return tokenized_obj
 
-    async def _wait_one_response(
+    def _send_one_request(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
-        request: Optional[fastapi.Request] = None,
+        tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput],
         created_time: Optional[float] = None,
     ):
-        """Wait for the response of one request."""
         event = asyncio.Event()
         state = ReqState([], False, event, obj, created_time=created_time)
         self.rid_to_state[obj.rid] = state
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+
+    async def _wait_one_response(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        request: Optional[fastapi.Request] = None,
+    ):
+        """Wait for the response of one request."""
+        state = self.rid_to_state[obj.rid]
 
         while True:
             try:
@@ -361,10 +367,8 @@ async def _handle_batch_request(
             for i in range(batch_size):
                 tmp_obj = obj[i]
                 tokenized_obj = await self._tokenize_one_request(tmp_obj)
-                self.send_to_scheduler.send_pyobj(tokenized_obj)
-                generators.append(
-                    self._wait_one_response(tmp_obj, request, created_time)
-                )
+                self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                generators.append(self._wait_one_response(tmp_obj, request))
                 rids.append(tmp_obj.rid)
         else:
             # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
@@ -389,21 +393,17 @@ async def _handle_batch_request(
                 tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
                 tokenized_obj.sampling_params.max_new_tokens = 0
                 tokenized_obj.stream = False
-                self.send_to_scheduler.send_pyobj(tokenized_obj)
-                await self._wait_one_response(
-                    tmp_obj, request, created_time
-                ).__anext__()
+                self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                await self._wait_one_response(tmp_obj, request).__anext__()
 
             # Expand requests, assign new rids for them, and send them
             for i in range(batch_size):
                 for _ in range(obj.parallel_sample_num):
                     tmp_obj = copy.copy(objs[i])
                     tokenized_obj = copy.copy(tokenized_objs[i])
                     tokenized_obj.rid = tmp_obj.regenerate_rid()
-                    self.send_to_scheduler.send_pyobj(tokenized_obj)
-                    generators.append(
-                        self._wait_one_response(tmp_obj, request, created_time)
-                    )
+                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, request))
                     rids.append(tmp_obj.rid)
 
         # Wait for all requests

diff --git a/test/README.md b/test/README.md
@@ -13,7 +13,7 @@ python3 test_srt_endpoint.py
 python3 -m unittest test_srt_endpoint.TestSRTEndpoint.test_simple_decode
 
 # Run a suite with multiple files
-python3 run_suite.py --suite minimal
+python3 run_suite.py --suite per-commit
 ```
 
 ## Test Frontend Language
@@ -28,14 +28,14 @@ python3 test_openai_backend.py
 python3 -m unittest test_openai_backend.TestOpenAIBackend.test_few_shot_qa
 
 # Run a suite with multiple files
-python3 run_suite.py --suite minimal
+python3 run_suite.py --suite per-commit
 ```
 
 ## Adding or Updating Tests in CI
 
 - Create new test files under `test/srt` or `test/lang` depending on the type of test.
-- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py` or `test/lang/run_suite.py`) so they’re picked up in CI.
-- In CI, all tests run automatically. You may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows) to add custom test groups or extra checks.
+- Ensure they are referenced in the respective `run_suite.py` (e.g., `test/srt/run_suite.py` or `test/lang/run_suite.py`) so they’re picked up in CI. Most small test cases can be added to the `per-commit` suite.
+- The CI runs the `per-commit` and `nightly` suites automatically. If you need special setup or custom test groups, you may modify the workflows in [`.github/workflows/`](https://github.com/sgl-project/sglang/tree/main/.github/workflows).
 
 
 ## Writing Elegant Test Cases