diff --git a/vllm/config.py b/vllm/config.py
index 90d574bb3b2aa..9b92d9706c9dd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -164,9 +164,6 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
         total_num_attention_heads = self.hf_config.num_attention_heads
         return total_num_attention_heads // parallel_config.tensor_parallel_size
 
-    def get_max_model_len(self) -> int:
-        return self.max_model_len
-
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_config.num_hidden_layers
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size
@@ -378,10 +375,17 @@ def _get_and_verify_max_len(
         if max_len_key is not None:
             derived_max_model_len = min(derived_max_model_len, max_len_key)
     if derived_max_model_len == float("inf"):
-        raise ValueError(
-            "The model's config.json must contain one of the following keys "
-            "to determine the original maximum length of the model: "
-            f"{possible_keys}")
+        if max_model_len is not None:
+            # If max_model_len is specified, we use it.
+            return max_model_len
+
+        default_max_len = 2048
+        logger.warning(
+            "The model's config.json does not contain any of the following "
+            "keys to determine the original maximum length of the model: "
+            f"{possible_keys}. Assuming the model's maximum length is "
+            f"{default_max_len}.")
+        derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
     if rope_scaling is not None:
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 8951a98e4159a..1e163a2bfb6ad 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -184,7 +184,7 @@ def create_engine_configs(
                                          self.worker_use_ray)
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
-                                           model_config.get_max_model_len())
+                                           model_config.max_model_len)
         return model_config, cache_config, parallel_config, scheduler_config
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3e1026bfaefce..c1874f13f07da 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -77,6 +77,7 @@ def __init__(
             f"revision={model_config.revision}, "
             f"trust_remote_code={model_config.trust_remote_code}, "
             f"dtype={model_config.dtype}, "
+            f"max_seq_len={model_config.max_model_len}, "
             f"download_dir={model_config.download_dir!r}, "
             f"load_format={model_config.load_format}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 643dd06cb17dd..7ec155d2e488c 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -615,7 +615,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
     engine_model_config = asyncio.run(engine.get_model_config())
-    max_model_len = engine_model_config.get_max_model_len()
+    max_model_len = engine_model_config.max_model_len
 
     # A separate tokenizer to map token IDs to strings.
     tokenizer = get_tokenizer(engine_args.tokenizer,
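
For reviewers skimming the patch, a minimal standalone sketch of the fallback that the vllm/config.py hunk introduces: when none of the usual HF config keys is present, a user-supplied max_model_len is used as-is, otherwise a default of 2048 is assumed with a warning instead of raising. The helper name derive_max_model_len and the exact key list here are illustrative, not copied from vllm/config.py.

    import logging

    logger = logging.getLogger(__name__)

    def derive_max_model_len(hf_config, max_model_len=None, default_max_len=2048):
        # Keys that HF configs commonly use for the maximum sequence length.
        possible_keys = ["max_position_embeddings", "n_positions", "max_seq_len"]
        derived_max_model_len = float("inf")
        for key in possible_keys:
            value = getattr(hf_config, key, None)
            if value is not None:
                derived_max_model_len = min(derived_max_model_len, value)
        if derived_max_model_len == float("inf"):
            if max_model_len is not None:
                # An explicitly requested length takes precedence.
                return max_model_len
            logger.warning(
                "config.json contains none of %s; assuming a maximum length of %d.",
                possible_keys, default_max_len)
            derived_max_model_len = default_max_len
        return derived_max_model_len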