Skip to content

Commit 39b98a2

Browse files
authored
Merge pull request #11 from VectorInstitute/develop
v0.3.2
2 parents f43d7bf + 9a07db8 commit 39b98a2

File tree

4 files changed

+23
-14
lines changed

4 files changed

+23
-14
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "vec-inf"
3-
version = "0.3.1"
3+
version = "0.3.2"
44
description = "Efficient LLM inference on Slurm clusters using vLLM."
55
authors = ["Marshall Wang <[email protected]>"]
66
license = "MIT license"

vec_inf/cli/_cli.py

Lines changed: 20 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -63,6 +63,11 @@ def cli():
6363
type=str,
6464
help='Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS'
6565
)
66+
@click.option(
67+
"--vocab-size",
68+
type=int,
69+
help='Vocabulary size, this option is intended for custom models'
70+
)
6671
@click.option(
6772
"--data-type",
6873
type=str,
@@ -93,6 +98,7 @@ def launch(
9398
num_gpus: int=None,
9499
qos: str=None,
95100
time: str=None,
101+
vocab_size: int=None,
96102
data_type: str=None,
97103
venv: str=None,
98104
log_dir: str=None,
@@ -109,16 +115,20 @@ def launch(
109115

110116
models_df = load_models_df()
111117

112-
if model_name not in models_df['model_name'].values:
113-
raise ValueError(f"Model name {model_name} not found in available models")
114-
115-
default_args = load_default_args(models_df, model_name)
116-
117-
for arg in default_args:
118-
if arg in locals() and locals()[arg] is not None:
119-
default_args[arg] = locals()[arg]
120-
renamed_arg = arg.replace("_", "-")
121-
launch_cmd += f" --{renamed_arg} {default_args[arg]}"
118+
if model_name in models_df['model_name'].values:
119+
default_args = load_default_args(models_df, model_name)
120+
for arg in default_args:
121+
if arg in locals() and locals()[arg] is not None:
122+
default_args[arg] = locals()[arg]
123+
renamed_arg = arg.replace("_", "-")
124+
launch_cmd += f" --{renamed_arg} {default_args[arg]}"
125+
else:
126+
model_args = models_df.columns.tolist()
127+
excluded_keys = ['model_name', 'pipeline_parallelism']
128+
for arg in model_args:
129+
if arg not in excluded_keys and locals()[arg] is not None:
130+
renamed_arg = arg.replace("_", "-")
131+
launch_cmd += f" --{renamed_arg} {locals()[arg]}"
122132

123133
output = run_bash_command(launch_cmd)
124134

vec_inf/models/models.csv

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -42,4 +42,5 @@ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,
4242
Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
4343
Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
4444
Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
45-
Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
45+
Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
46+
Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false

vec_inf/multinode_vllm.slurm

Lines changed: 0 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -93,7 +93,6 @@ if [ "$VENV_BASE" = "singularity" ]; then
9393
--pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
9494
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
9595
--dtype ${VLLM_DATA_TYPE} \
96-
--load-format safetensors \
9796
--trust-remote-code \
9897
--max-logprobs ${VLLM_MAX_LOGPROBS} \
9998
--max-model-len ${VLLM_MAX_MODEL_LEN}
@@ -107,7 +106,6 @@ else
107106
--pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
108107
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
109108
--dtype ${VLLM_DATA_TYPE} \
110-
--load-format safetensors \
111109
--trust-remote-code \
112110
--max-logprobs ${VLLM_MAX_LOGPROBS} \
113111
--max-model-len ${VLLM_MAX_MODEL_LEN}

0 commit comments

Comments (0)