From 0b21c92c240ad6fcbe133cea39763c52976be154 Mon Sep 17 00:00:00 2001 From: One Date: Fri, 15 Sep 2023 07:19:54 +0000 Subject: [PATCH 1/3] [train + serving] store and use model metadata --- README.md | 15 ++++++----- ochat/config/model_config.py | 10 ++++---- ochat/models/unpadded_llama.py | 8 +++--- ochat/serving/openai_api_server.py | 29 ++++++++++++--------- ochat/training_deepspeed/train.py | 41 ++++++++++++++++++++---------- 5 files changed, 61 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 994f89d..af7807e 100644 --- a/README.md +++ b/README.md @@ -58,9 +58,9 @@ curl http://localhost:18888/v1/chat/completions \ -| Model | Size | Context | Weights | Serving | -|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` | +| Model | Size | Context | Weights | Serving | +|--------------------|------|---------|--------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------| +| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray` | For inference with Huggingface Transformers (slow and not recommended), follow the conversation template provided below: @@ -276,10 +276,11 @@ To run the models on multiple GPUs with smaller VRAM, you can enable tensor para
OpenChat V3 (click to expand) -| Model | Size | Context | Weights | Serving | -|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` | -| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.1_llama2 --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` | +| Model | Size | Context | Weights | Serving | +|--------------|------|---------|--------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------| +| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray` | +| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray` | +
## Acknowledgements diff --git a/ochat/config/model_config.py b/ochat/config/model_config.py index be97cae..48ade5e 100644 --- a/ochat/config/model_config.py +++ b/ochat/config/model_config.py @@ -131,7 +131,7 @@ def _v3_condition(props): MODEL_CONFIG_MAP = { # OpenChat V3.2 "openchat_v3.2": ModelConfig( - name="OpenChat V3.2", + name="OpenChat V3.2 Llama 2", # Prompt role_prefix=_v3_2_conditional_prefix, @@ -174,8 +174,8 @@ def _v3_condition(props): ), # OpenChat V2 - "openchat_v2": ModelConfig( - name="OpenChat_v2", + "openchat_v2_llama2": ModelConfig( + name="OpenChat V2 Llama 2", # Prompt role_prefix=_v2_conditional_prefix, @@ -184,7 +184,7 @@ def _v3_condition(props): bos_token="", # Tokenize - model_max_context=2048, + model_max_context=4096, model_create=partial(ochat.models.LlamaForCausalLM.from_pretrained, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16), @@ -195,7 +195,7 @@ def _v3_condition(props): # OpenChat "openchat_llama2": ModelConfig( - name="OpenChat Llama 2", + name="OpenChat V1 Llama 2", # Prompt role_prefix={ diff --git a/ochat/models/unpadded_llama.py b/ochat/models/unpadded_llama.py index d337a8f..ddd80ac 100644 --- a/ochat/models/unpadded_llama.py +++ b/ochat/models/unpadded_llama.py @@ -166,7 +166,7 @@ def forward( nz_hidden_states: torch.Tensor, nz_position_ids: torch.LongTensor, cu_seqlens: torch.Tensor, - max_seqlen: torch.Tensor + max_seqlen: int ) -> torch.Tensor: # nz_hidden_states: [nnz, num_heads, head_dim] # nz_position_ids: [nnz] @@ -213,7 +213,7 @@ def forward( nz_hidden_states: torch.Tensor, nz_position_ids: torch.Tensor, cu_seqlens: torch.Tensor, - max_seqlen: torch.Tensor + max_seqlen: int ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: # Self Attention residual = nz_hidden_states @@ -298,7 +298,7 @@ def forward( nz_input_ids: torch.Tensor, nz_position_ids: torch.Tensor, cu_seqlens: torch.Tensor, - max_seqlen: torch.Tensor, + max_seqlen: int, ) -> torch.Tensor: nz_hidden_states = self.embed_tokens(nz_input_ids) cos_sin = self.rotary_emb() @@ -375,7 +375,7 @@ def forward( nz_input_ids: torch.Tensor, nz_position_ids: torch.Tensor, cu_seqlens: torch.Tensor, - max_seqlen: torch.Tensor, + max_seqlen: int, # Unpadded labels nz_shifted_label_ids: Optional[torch.Tensor] = None, nz_shifted_loss_weights: Optional[torch.Tensor] = None diff --git a/ochat/serving/openai_api_server.py b/ochat/serving/openai_api_server.py index b92c717..3b08abc 100644 --- a/ochat/serving/openai_api_server.py +++ b/ochat/serving/openai_api_server.py @@ -29,6 +29,8 @@ from ochat.config.model_config import MODEL_CONFIG_MAP from ochat.serving import openai_api_protocol, async_tokenizer +from transformers.utils.hub import cached_file + TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -37,7 +39,6 @@ class ModelConfig: name: str = None - eot_token: str = None max_length: int = None stream_period: int = None @@ -65,7 +66,7 @@ async def validation_exception_handler(request, exc): # pylint: disable=unused- async def check_api_key( auth: Optional[HTTPAuthorizationCredentials] = fastapi.Depends(HTTPBearer(auto_error=False)), -) -> str: +): if not model.api_keys: return @@ -287,8 +288,6 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: parser = argparse.ArgumentParser(description="OpenChat OpenAI-Compatible RESTful API server.") # Model - parser.add_argument("--model-type", type=str, required=True, help="Type of model") - parser.add_argument("--stream-period", type=int, default=6, help="Number of tokens per stream event") 
parser.add_argument("--api-keys", type=str, nargs="*", default=[], help="Allowed API Keys. Leave blank to not verify") @@ -327,22 +326,28 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]: backupCount=args.log_max_count) ) - # Load model - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args(engine_args) - engine_model_config = asyncio.run(engine.get_model_config()) + # Load model type + with open(cached_file(path_or_repo_id=args.model, filename="openchat.json"), "r") as f: + model_type = json.load(f)["model_type"] # Load tokenizer - tokenizer = async_tokenizer.AsyncTokenizer.remote(args.model_type, args.model) + tokenizer = async_tokenizer.AsyncTokenizer.remote(model_type, args.model) # Model config - model.name = args.model_type - model.eot_token = MODEL_CONFIG_MAP[args.model_type].eot_token - model.max_length = MODEL_CONFIG_MAP[args.model_type].model_max_context + model.name = model_type + model.max_length = MODEL_CONFIG_MAP[model_type].model_max_context model.stream_period = args.stream_period model.api_keys = args.api_keys + # Set max num batched tokens + args.max_num_batched_tokens = max(args.max_num_batched_tokens, model.max_length) + + # Load model engine + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = AsyncLLMEngine.from_engine_args(engine_args) + engine_model_config = asyncio.run(engine.get_model_config()) + # Run uvicorn.run(app, host=args.host, diff --git a/ochat/training_deepspeed/train.py b/ochat/training_deepspeed/train.py index 674817a..71bfdfe 100644 --- a/ochat/training_deepspeed/train.py +++ b/ochat/training_deepspeed/train.py @@ -1,6 +1,7 @@ import argparse import os import math +import json from functools import partial import torch @@ -41,7 +42,6 @@ def parse_args(): parser.add_argument("--local_rank", type=int, required=True) # Model type and data - parser.add_argument("--model_type", type=str, required=True) parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--data_path", type=str, required=True) parser.add_argument("--save_path", type=str, required=True) @@ -77,6 +77,7 @@ def create_dataset(args, split_name): _rank0_print (f"Skipping loading {split_name}") return None + _rank0_print(f"Loading {split_name} data from {filename}...") return ParquetDataset(filename) @@ -111,20 +112,18 @@ def batch_to_tensor(batch, int_dtype=torch.long, loss_dtype=torch.bfloat16): batch_tensor[k] = torch.from_numpy(np.concatenate(batch.column(k).to_numpy())).to(dtype) # cu seqlens - batch_tensor["max_seqlen"] = torch.max(batch_tensor["seqlens"]) batch_tensor["cu_seqlens"] = torch.nn.functional.pad(batch_tensor["seqlens"].cumsum(-1, dtype=torch.int32), (1, 0)) - del batch_tensor["seqlens"] + # batch info + batch_info = {"max_seqlen": torch.max(batch_tensor["seqlens"]).item()} # inputs - return batch_tensor + del batch_tensor["seqlens"] + return batch_tensor, batch_info -def create_distributed_dataloader(args, data): - # Check data - assert data.metadata["model_type"] == args.model_type, \ - f"The dataset is for {data.metadata['model_type']}, but you specified {args.model_type} for training." 
+def create_distributed_dataloader(args, data):
     # Multipack dataloader
     args.batch_max_len = args.batch_size_per_gpu * MODEL_CONFIG_MAP[args.model_type].model_max_context
 
@@ -143,6 +142,8 @@ def create_distributed_dataloader(args, data):
 def create_model(args):
     global LOCAL_RANK
 
+    _rank0_print(f"Loading model {args.model_type} from {args.model_path}...")
+
     # Create model + optimizer + lr scheduler
     model = MODEL_CONFIG_MAP[args.model_type].model_create(args.model_path)
 
     # Model to assigned cuda device
@@ -198,6 +199,14 @@ def save_tokenizer(args, save_path):
     tokenizer.save_pretrained(save_path)
 
 
+def save_openchat_metadata(args, epoch, save_path):
+    metadata = vars(args)
+    metadata["epoch"] = epoch
+
+    with open(os.path.join(save_path, "openchat.json"), "w") as f:
+        json.dump(metadata, f, default=lambda o: "")
+
+
 def calculate_auto_lr(lr, batch_max_len, train_dataset):
     if lr is not None:
         return lr
@@ -227,10 +236,12 @@ def train():
     LOCAL_RANK = args.local_rank
 
     # Dataset
-    _rank0_print("Loading data...")
     train_dataset = create_dataset(args, "train")
     eval_dataset = create_dataset(args, "eval")
 
+    # Load model type
+    args.model_type = train_dataset.metadata["model_type"]
+
     # Data Loader
     train_loader = create_distributed_dataloader(args, train_dataset)
     train_total_steps = args.epochs * train_loader.num_batches()
@@ -243,7 +254,6 @@ def train():
     args.lr = calculate_auto_lr(args.lr, args.batch_max_len, train_dataset)
 
     # Model
-    _rank0_print("Loading model...")
     model_engine, optimizer = create_model(args)
 
     # LR Scheduler
@@ -265,7 +275,7 @@ def train():
         model_engine.train()
 
         train_loader.set_epoch(epoch)
-        for batch, all_numseq, cur_numseq in train_loader:
+        for (batch, batch_info), all_numseq, cur_numseq in train_loader:
             step += 1
             if step > train_total_steps:  # At most train_total_steps
                 break
@@ -274,7 +284,7 @@ def train():
            # To device
             batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
 
             # Update
-            loss = (1 / all_numseq) * model_engine(**batch).loss
+            loss = (1 / all_numseq) * model_engine(**batch, **batch_info).loss
 
             model_engine.backward(loss)
 
@@ -304,12 +314,12 @@ def train():
 
             eval_loader.set_epoch(epoch)
             with torch.inference_mode():
-                for batch, all_numseq, cur_numseq in eval_loader:
+                for (batch, batch_info), all_numseq, cur_numseq in eval_loader:
                     # To device
                     batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
 
                     # Eval
-                    eval_loss = (1 / all_numseq) * model_engine(**batch).loss
+                    eval_loss = (1 / all_numseq) * model_engine(**batch, **batch_info).loss
 
                     # Accumulate eval loss
                     eval_total_loss.add_(eval_loss)
@@ -337,6 +347,9 @@ def train():
             # Also save tokenizer from base model
             save_tokenizer(args, save_path)
 
+            # Write metadata
+            save_openchat_metadata(args, epoch, save_path)
+
 
 if __name__ == "__main__":
     train()

From b39588acdf673e2be97f0bf3c29b9a027af2089e Mon Sep 17 00:00:00 2001
From: One
Date: Fri, 15 Sep 2023 07:20:24 +0000
Subject: [PATCH 2/3] [data] subsample for fixed number

---
 ochat/data/filter_sharegpt.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ochat/data/filter_sharegpt.py b/ochat/data/filter_sharegpt.py
index 0b77d97..bf58d93 100644
--- a/ochat/data/filter_sharegpt.py
+++ b/ochat/data/filter_sharegpt.py
@@ -10,6 +10,14 @@
 import numpy as np
 
 
+def subsample_mask(seed: int, n: int, p: float):
+    mask = np.zeros((n, ), np.bool_)
+    perm = np.random.default_rng(seed=seed).permutation(n)
+
+    mask[perm[:round(n * p)]] = True
+    return mask
+
+
 if __name__ == "__main__":
     parser = 
argparse.ArgumentParser() parser.add_argument("--in-file", type=str, required=True) @@ -37,7 +45,7 @@ # Subsampling if args.subsample < 1.0: - keep = np.random.default_rng(seed=args.subsample_seed).random(len(filtered_samples)) < args.subsample + keep = subsample_mask(args.subsample_seed, len(filtered_samples), args.subsample) filtered_samples = [s for s, k in zip(filtered_samples, keep) if k] # Print From f55561c3ba7f22d97451ff7dcbb16ab8e7a09db6 Mon Sep 17 00:00:00 2001 From: One Date: Fri, 15 Sep 2023 07:27:26 +0000 Subject: [PATCH 3/3] [training] change batch to batch_tensor --- ochat/training_deepspeed/train.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ochat/training_deepspeed/train.py b/ochat/training_deepspeed/train.py index 71bfdfe..66b25db 100644 --- a/ochat/training_deepspeed/train.py +++ b/ochat/training_deepspeed/train.py @@ -275,16 +275,16 @@ def train(): model_engine.train() train_loader.set_epoch(epoch) - for (batch, batch_info), all_numseq, cur_numseq in train_loader: + for (batch_tensor, batch_info), all_numseq, cur_numseq in train_loader: step += 1 if step > train_total_steps: # At most train_total_steps break # To device - batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()} + batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()} # Update - loss = (1 / all_numseq) * model_engine(**batch, **batch_info).loss + loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss model_engine.backward(loss) @@ -314,12 +314,12 @@ def train(): eval_loader.set_epoch(epoch) with torch.inference_mode(): - for (batch, batch_info), all_numseq, cur_numseq in eval_loader: + for (batch_tensor, batch_info), all_numseq, cur_numseq in eval_loader: # To device - batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()} + batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()} # Eval - eval_loss = (1 / all_numseq) * model_engine(**batch, **batch_info).loss + eval_loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss # Accumulate eval loss eval_total_loss.add_(eval_loss)
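
A note on how the metadata introduced in PATCH 1/3 travels: the model type now rides along with the dataset (ParquetDataset metadata), is copied into args during training, and is written next to the checkpoint by save_openchat_metadata() as openchat.json; the API server then resolves that file with transformers' cached_file() and uses its "model_type" field to index MODEL_CONFIG_MAP, which is why the README serving commands no longer need --model-type. The snippet below is only an illustrative sketch of that round trip, not part of the patches; the directory name and the fields besides "model_type" and "epoch" are invented for the example.

import json
import os

from transformers.utils.hub import cached_file

# "Training" side, mirroring save_openchat_metadata(): vars(args) plus the epoch -> openchat.json
save_path = "/tmp/openchat_checkpoint"        # hypothetical checkpoint directory
metadata = {
    "model_type": "openchat_v3.2",            # the field the server actually needs
    "model_path": "path/to/base_model",       # hypothetical leftover argparse fields
    "epochs": 5,
    "epoch": 4,
}
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, "openchat.json"), "w") as f:
    json.dump(metadata, f, default=lambda o: "")   # non-serializable args become "", as in the patch

# "Serving" side, mirroring openai_api_server: --model may be a local dir or a Hub repo id
with open(cached_file(path_or_repo_id=save_path, filename="openchat.json"), "r") as f:
    model_type = json.load(f)["model_type"]

print(model_type)   # "openchat_v3.2" -> key into MODEL_CONFIG_MAP

Because the server also clamps --max-num-batched-tokens to at least the model's context length, that flag disappears from the README serving commands as well.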
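
On PATCH 2/3: the removed line kept each conversation independently with probability --subsample (a Bernoulli draw), so the retained count drifted around n * p from seed to seed; subsample_mask() instead permutes the indices with a seeded generator and keeps exactly round(n * p) of them. A small self-contained check of that difference (a sketch for illustration only; the threshold line mirrors the removed code):

import numpy as np


def subsample_mask(seed: int, n: int, p: float):
    # Deterministically keep exactly round(n * p) of n items.
    mask = np.zeros((n, ), np.bool_)
    perm = np.random.default_rng(seed=seed).permutation(n)

    mask[perm[:round(n * p)]] = True
    return mask


n, p, seed = 10_000, 0.1, 0
old_keep = np.random.default_rng(seed=seed).random(n) < p   # old behaviour: count is only about n * p
new_keep = subsample_mask(seed, n, p)                       # new behaviour: count is exactly round(n * p)

print(int(old_keep.sum()))   # varies with the seed, roughly 1000
print(int(new_keep.sum()))   # always exactly 1000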