diff --git a/README.md b/README.md
index 994f89d..af7807e 100644
--- a/README.md
+++ b/README.md
@@ -58,9 +58,9 @@ curl http://localhost:18888/v1/chat/completions \
-| Model | Size | Context | Weights | Serving |
-|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
+| Model | Size | Context | Weights | Serving |
+|--------------------|------|---------|--------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
+| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray` |
For inference with Huggingface Transformers (slow and not recommended), follow the conversation template provided below:
@@ -276,10 +276,11 @@ To run the models on multiple GPUs with smaller VRAM, you can enable tensor para
OpenChat V3 (click to expand)
-| Model | Size | Context | Weights | Serving |
-|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
-| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.1_llama2 --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
+| Model | Size | Context | Weights | Serving |
+|--------------|------|---------|--------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
+| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray` |
+| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray` |
+
## Acknowledgements
diff --git a/ochat/config/model_config.py b/ochat/config/model_config.py
index be97cae..48ade5e 100644
--- a/ochat/config/model_config.py
+++ b/ochat/config/model_config.py
@@ -131,7 +131,7 @@ def _v3_condition(props):
MODEL_CONFIG_MAP = {
# OpenChat V3.2
"openchat_v3.2": ModelConfig(
- name="OpenChat V3.2",
+ name="OpenChat V3.2 Llama 2",
# Prompt
role_prefix=_v3_2_conditional_prefix,
@@ -174,8 +174,8 @@ def _v3_condition(props):
),
# OpenChat V2
- "openchat_v2": ModelConfig(
- name="OpenChat_v2",
+ "openchat_v2_llama2": ModelConfig(
+ name="OpenChat V2 Llama 2",
# Prompt
role_prefix=_v2_conditional_prefix,
@@ -184,7 +184,7 @@ def _v3_condition(props):
bos_token="",
# Tokenize
- model_max_context=2048,
+ model_max_context=4096,
model_create=partial(ochat.models.LlamaForCausalLM.from_pretrained,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16),
@@ -195,7 +195,7 @@ def _v3_condition(props):
# OpenChat
"openchat_llama2": ModelConfig(
- name="OpenChat Llama 2",
+ name="OpenChat V1 Llama 2",
# Prompt
role_prefix={
diff --git a/ochat/data/filter_sharegpt.py b/ochat/data/filter_sharegpt.py
index 0b77d97..bf58d93 100644
--- a/ochat/data/filter_sharegpt.py
+++ b/ochat/data/filter_sharegpt.py
@@ -10,6 +10,14 @@
import numpy as np
+def subsample_mask(seed: int, n: int, p: float):
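+    # Deterministic subsampling: permute [0, n) with the given seed and keep
+    # exactly round(n * p) indices, unlike a Bernoulli mask which only keeps ~p in expectation.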
+    mask = np.zeros((n, ), np.bool_)
+ perm = np.random.default_rng(seed=seed).permutation(n)
+
+ mask[perm[:round(n * p)]] = True
+ return mask
+
+
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--in-file", type=str, required=True)
@@ -37,7 +45,7 @@
# Subsampling
if args.subsample < 1.0:
- keep = np.random.default_rng(seed=args.subsample_seed).random(len(filtered_samples)) < args.subsample
+ keep = subsample_mask(args.subsample_seed, len(filtered_samples), args.subsample)
filtered_samples = [s for s, k in zip(filtered_samples, keep) if k]
# Print
diff --git a/ochat/models/unpadded_llama.py b/ochat/models/unpadded_llama.py
index d337a8f..ddd80ac 100644
--- a/ochat/models/unpadded_llama.py
+++ b/ochat/models/unpadded_llama.py
@@ -166,7 +166,7 @@ def forward(
nz_hidden_states: torch.Tensor,
nz_position_ids: torch.LongTensor,
cu_seqlens: torch.Tensor,
- max_seqlen: torch.Tensor
+ max_seqlen: int
) -> torch.Tensor:
# nz_hidden_states: [nnz, num_heads, head_dim]
# nz_position_ids: [nnz]
@@ -213,7 +213,7 @@ def forward(
nz_hidden_states: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
- max_seqlen: torch.Tensor
+ max_seqlen: int
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
# Self Attention
residual = nz_hidden_states
@@ -298,7 +298,7 @@ def forward(
nz_input_ids: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
- max_seqlen: torch.Tensor,
+ max_seqlen: int,
) -> torch.Tensor:
nz_hidden_states = self.embed_tokens(nz_input_ids)
cos_sin = self.rotary_emb()
@@ -375,7 +375,7 @@ def forward(
nz_input_ids: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
- max_seqlen: torch.Tensor,
+ max_seqlen: int,
# Unpadded labels
nz_shifted_label_ids: Optional[torch.Tensor] = None,
nz_shifted_loss_weights: Optional[torch.Tensor] = None
diff --git a/ochat/serving/openai_api_server.py b/ochat/serving/openai_api_server.py
index b92c717..3b08abc 100644
--- a/ochat/serving/openai_api_server.py
+++ b/ochat/serving/openai_api_server.py
@@ -29,6 +29,8 @@
from ochat.config.model_config import MODEL_CONFIG_MAP
from ochat.serving import openai_api_protocol, async_tokenizer
+from transformers.utils.hub import cached_file
+
TIMEOUT_KEEP_ALIVE = 5 # seconds
@@ -37,7 +39,6 @@
class ModelConfig:
name: str = None
- eot_token: str = None
max_length: int = None
stream_period: int = None
@@ -65,7 +66,7 @@ async def validation_exception_handler(request, exc): # pylint: disable=unused-
async def check_api_key(
auth: Optional[HTTPAuthorizationCredentials] = fastapi.Depends(HTTPBearer(auto_error=False)),
-) -> str:
+):
if not model.api_keys:
return
@@ -287,8 +288,6 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
parser = argparse.ArgumentParser(description="OpenChat OpenAI-Compatible RESTful API server.")
# Model
- parser.add_argument("--model-type", type=str, required=True, help="Type of model")
-
parser.add_argument("--stream-period", type=int, default=6, help="Number of tokens per stream event")
parser.add_argument("--api-keys", type=str, nargs="*", default=[], help="Allowed API Keys. Leave blank to not verify")
@@ -327,22 +326,28 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
backupCount=args.log_max_count)
)
- # Load model
- engine_args = AsyncEngineArgs.from_cli_args(args)
- engine = AsyncLLMEngine.from_engine_args(engine_args)
- engine_model_config = asyncio.run(engine.get_model_config())
+ # Load model type
+ with open(cached_file(path_or_repo_id=args.model, filename="openchat.json"), "r") as f:
+ model_type = json.load(f)["model_type"]
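+    # openchat.json is written next to the weights by the trainer (save_openchat_metadata),
+    # so --model-type no longer needs to be passed on the command line.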
# Load tokenizer
- tokenizer = async_tokenizer.AsyncTokenizer.remote(args.model_type, args.model)
+ tokenizer = async_tokenizer.AsyncTokenizer.remote(model_type, args.model)
# Model config
- model.name = args.model_type
- model.eot_token = MODEL_CONFIG_MAP[args.model_type].eot_token
- model.max_length = MODEL_CONFIG_MAP[args.model_type].model_max_context
+ model.name = model_type
+ model.max_length = MODEL_CONFIG_MAP[model_type].model_max_context
model.stream_period = args.stream_period
model.api_keys = args.api_keys
+ # Set max num batched tokens
+ args.max_num_batched_tokens = max(args.max_num_batched_tokens, model.max_length)
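+    # (ensures the engine can schedule at least one full-context sequence;
+    #  the README serving commands no longer pass --max-num-batched-tokens explicitly)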
+
+ # Load model engine
+ engine_args = AsyncEngineArgs.from_cli_args(args)
+ engine = AsyncLLMEngine.from_engine_args(engine_args)
+ engine_model_config = asyncio.run(engine.get_model_config())
+
# Run
uvicorn.run(app,
host=args.host,
diff --git a/ochat/training_deepspeed/train.py b/ochat/training_deepspeed/train.py
index 674817a..66b25db 100644
--- a/ochat/training_deepspeed/train.py
+++ b/ochat/training_deepspeed/train.py
@@ -1,6 +1,7 @@
import argparse
import os
import math
+import json
from functools import partial
import torch
@@ -41,7 +42,6 @@ def parse_args():
parser.add_argument("--local_rank", type=int, required=True)
# Model type and data
- parser.add_argument("--model_type", type=str, required=True)
parser.add_argument("--model_path", type=str, required=True)
parser.add_argument("--data_path", type=str, required=True)
parser.add_argument("--save_path", type=str, required=True)
@@ -77,6 +77,7 @@ def create_dataset(args, split_name):
_rank0_print (f"Skipping loading {split_name}")
return None
+ _rank0_print(f"Loading {split_name} data from {filename}...")
return ParquetDataset(filename)
@@ -111,20 +112,18 @@ def batch_to_tensor(batch, int_dtype=torch.long, loss_dtype=torch.bfloat16):
batch_tensor[k] = torch.from_numpy(np.concatenate(batch.column(k).to_numpy())).to(dtype)
# cu seqlens
- batch_tensor["max_seqlen"] = torch.max(batch_tensor["seqlens"])
batch_tensor["cu_seqlens"] = torch.nn.functional.pad(batch_tensor["seqlens"].cumsum(-1, dtype=torch.int32), (1, 0))
- del batch_tensor["seqlens"]
+ # batch info
+ batch_info = {"max_seqlen": torch.max(batch_tensor["seqlens"]).item()}
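+    # max_seqlen is kept as a plain Python int to match the unpadded model's
+    # `max_seqlen: int` signature, so it is not moved to device with the tensors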
# inputs
- return batch_tensor
+ del batch_tensor["seqlens"]
+ return batch_tensor, batch_info
-def create_distributed_dataloader(args, data):
- # Check data
- assert data.metadata["model_type"] == args.model_type, \
- f"The dataset is for {data.metadata['model_type']}, but you specified {args.model_type} for training."
+def create_distributed_dataloader(args, data):
# Multipack dataloader
args.batch_max_len = args.batch_size_per_gpu * MODEL_CONFIG_MAP[args.model_type].model_max_context
@@ -143,6 +142,8 @@ def create_distributed_dataloader(args, data):
def create_model(args):
global LOCAL_RANK
+ _rank0_print(f"Loading model {args.model_type} from {args.model_path}...")
+
# Create model + optimizer + lr scheduler
model = MODEL_CONFIG_MAP[args.model_type].model_create(args.model_path)
# Model to assigned cuda device
@@ -198,6 +199,14 @@ def save_tokenizer(args, save_path):
tokenizer.save_pretrained(save_path)
+def save_openchat_metadata(args, epoch, save_path):
+    metadata = dict(vars(args))  # copy so the parsed args namespace is not mutated
+ metadata["epoch"] = epoch
+
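+    # Record all training arguments plus the finished epoch; anything json cannot
+    # serialize falls back to an empty string via `default`.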
+ with open(os.path.join(save_path, "openchat.json"), "w") as f:
+ json.dump(metadata, f, default=lambda o: "")
+
+
def calculate_auto_lr(lr, batch_max_len, train_dataset):
if lr is not None:
return lr
@@ -227,10 +236,12 @@ def train():
LOCAL_RANK = args.local_rank
# Dataset
- _rank0_print("Loading data...")
train_dataset = create_dataset(args, "train")
eval_dataset = create_dataset(args, "eval")
+ # Load model type
+ args.model_type = train_dataset.metadata["model_type"]
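+    # The tokenized dataset records its model_type, so the old --model_type flag
+    # (and the dataset/flag consistency assertion) are no longer needed.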
+
# Data Loader
train_loader = create_distributed_dataloader(args, train_dataset)
train_total_steps = args.epochs * train_loader.num_batches()
@@ -243,7 +254,6 @@ def train():
args.lr = calculate_auto_lr(args.lr, args.batch_max_len, train_dataset)
# Model
- _rank0_print("Loading model...")
model_engine, optimizer = create_model(args)
# LR Scheduler
@@ -265,16 +275,16 @@ def train():
model_engine.train()
train_loader.set_epoch(epoch)
- for batch, all_numseq, cur_numseq in train_loader:
+ for (batch_tensor, batch_info), all_numseq, cur_numseq in train_loader:
step += 1
if step > train_total_steps: # At most train_total_steps
break
# To device
- batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
+ batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()}
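+            # batch_info only holds plain-Python scalars (max_seqlen) and is passed through as-is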
# Update
- loss = (1 / all_numseq) * model_engine(**batch).loss
+ loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss
model_engine.backward(loss)
@@ -304,12 +314,12 @@ def train():
eval_loader.set_epoch(epoch)
with torch.inference_mode():
- for batch, all_numseq, cur_numseq in eval_loader:
+ for (batch_tensor, batch_info), all_numseq, cur_numseq in eval_loader:
# To device
- batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
+ batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()}
# Eval
- eval_loss = (1 / all_numseq) * model_engine(**batch).loss
+ eval_loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss
# Accumulate eval loss
eval_total_loss.add_(eval_loss)
@@ -337,6 +347,9 @@ def train():
# Also save tokenizer from base model
save_tokenizer(args, save_path)
+ # Write metadata
+ save_openchat_metadata(args, epoch, save_path)
+
if __name__ == "__main__":
train()