Merge pull request #53 from imoneoi/metadata_fix

Metadata fix

imoneoi authored Sep 15, 2023
2 parents df03b57 + f55561c commit 9e3293d
Showing 6 changed files with 72 additions and 45 deletions.
15 changes: 8 additions & 7 deletions README.md
@@ -58,9 +58,9 @@ curl http://localhost:18888/v1/chat/completions \

</details>

| Model | Size | Context | Weights | Serving |
|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
| Model | Size | Context | Weights | Serving |
|--------------------|------|---------|--------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 SUPER | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2_super) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2_super --engine-use-ray --worker-use-ray` |
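For a quick end-to-end check of a server launched with the command above, a minimal request sketch in Python (assumptions: the default port 18888 shown in the hunk header above, no `--api-keys` configured, and `openchat_v3.2` as the model name; adjust to your deployment):

```python
# Minimal sketch: query the OpenAI-compatible endpoint started by the serving
# command above. Host/port, model name, and the absence of an API key are
# assumptions, not guarantees.
import requests

response = requests.post(
    "http://localhost:18888/v1/chat/completions",
    json={
        "model": "openchat_v3.2",
        "messages": [{"role": "user", "content": "Hello"}],
    },
    timeout=60,
)
print(response.json()["choices"][0]["message"]["content"])
```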

For inference with Huggingface Transformers (slow and not recommended), follow the conversation template provided below:

@@ -276,10 +276,11 @@ To run the models on multiple GPUs with smaller VRAM, you can enable tensor parallelism
<details>
<summary>OpenChat V3 (click to expand)</summary>

| Model | Size | Context | Weights | Serving |
|--------------|------|---------|--------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.2 --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model-type openchat_v3.1_llama2 --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray --max-num-batched-tokens 5120` |
| Model | Size | Context | Weights | Serving |
|--------------|------|---------|--------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------|
| OpenChat 3.2 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.2) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.2 --engine-use-ray --worker-use-ray` |
| OpenChat 3.1 | 13B | 4096 | [Huggingface](https://huggingface.co/openchat/openchat_v3.1) | `python -m ochat.serving.openai_api_server --model openchat/openchat_v3.1 --engine-use-ray --worker-use-ray` |

</details>

## Acknowledgements

10 changes: 5 additions & 5 deletions ochat/config/model_config.py
@@ -131,7 +131,7 @@ def _v3_condition(props):
MODEL_CONFIG_MAP = {
# OpenChat V3.2
"openchat_v3.2": ModelConfig(
name="OpenChat V3.2",
name="OpenChat V3.2 Llama 2",

# Prompt
role_prefix=_v3_2_conditional_prefix,
@@ -174,8 +174,8 @@ def _v3_condition(props):
),

# OpenChat V2
"openchat_v2": ModelConfig(
name="OpenChat_v2",
"openchat_v2_llama2": ModelConfig(
name="OpenChat V2 Llama 2",

# Prompt
role_prefix=_v2_conditional_prefix,
@@ -184,7 +184,7 @@ def _v3_condition(props):
bos_token="<s>",

# Tokenize
model_max_context=2048,
model_max_context=4096,
model_create=partial(ochat.models.LlamaForCausalLM.from_pretrained,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16),
@@ -195,7 +195,7 @@ def _v3_condition(props):

# OpenChat
"openchat_llama2": ModelConfig(
name="OpenChat Llama 2",
name="OpenChat V1 Llama 2",

# Prompt
role_prefix={

10 changes: 9 additions & 1 deletion ochat/data/filter_sharegpt.py
@@ -10,6 +10,14 @@
import numpy as np


def subsample_mask(seed: int, n: int, p: float):
mask = np.zeros((n, ), np.bool_)
perm = np.random.default_rng(seed=seed).permutation(n)

mask[perm[:round(n * p)]] = True
return mask
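The permutation-based mask keeps exactly `round(n * p)` samples, whereas the per-sample random threshold it replaces (see the hunk below) only kept roughly a `p` fraction. A self-contained sketch with illustrative numbers:

```python
# Standalone illustration of the subsampling behavior introduced above.
import numpy as np

def subsample_mask(seed: int, n: int, p: float):
    # Keep exactly round(n * p) randomly chosen indices, reproducibly.
    mask = np.zeros((n, ), np.bool_)
    perm = np.random.default_rng(seed=seed).permutation(n)
    mask[perm[:round(n * p)]] = True
    return mask

keep = subsample_mask(seed=42, n=10, p=0.3)
assert keep.sum() == 3  # exactly round(10 * 0.3) samples are kept
```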


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--in-file", type=str, required=True)
@@ -37,7 +45,7 @@

# Subsampling
if args.subsample < 1.0:
keep = np.random.default_rng(seed=args.subsample_seed).random(len(filtered_samples)) < args.subsample
keep = subsample_mask(args.subsample_seed, len(filtered_samples), args.subsample)
filtered_samples = [s for s, k in zip(filtered_samples, keep) if k]

# Print

8 changes: 4 additions & 4 deletions ochat/models/unpadded_llama.py
@@ -166,7 +166,7 @@ def forward(
nz_hidden_states: torch.Tensor,
nz_position_ids: torch.LongTensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor
max_seqlen: int
) -> torch.Tensor:
# nz_hidden_states: [nnz, num_heads, head_dim]
# nz_position_ids: [nnz]
@@ -213,7 +213,7 @@ def forward(
nz_hidden_states: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor
max_seqlen: int
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
# Self Attention
residual = nz_hidden_states
@@ -298,7 +298,7 @@ def forward(
nz_input_ids: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor,
max_seqlen: int,
) -> torch.Tensor:
nz_hidden_states = self.embed_tokens(nz_input_ids)
cos_sin = self.rotary_emb()
@@ -375,7 +375,7 @@ def forward(
nz_input_ids: torch.Tensor,
nz_position_ids: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: torch.Tensor,
max_seqlen: int,
# Unpadded labels
nz_shifted_label_ids: Optional[torch.Tensor] = None,
nz_shifted_loss_weights: Optional[torch.Tensor] = None

29 changes: 17 additions & 12 deletions ochat/serving/openai_api_server.py
@@ -29,6 +29,8 @@
from ochat.config.model_config import MODEL_CONFIG_MAP
from ochat.serving import openai_api_protocol, async_tokenizer

from transformers.utils.hub import cached_file


TIMEOUT_KEEP_ALIVE = 5 # seconds

@@ -37,7 +39,6 @@
class ModelConfig:
name: str = None

eot_token: str = None
max_length: int = None
stream_period: int = None

@@ -65,7 +66,7 @@ async def validation_exception_handler(request, exc): # pylint: disable=unused-

async def check_api_key(
auth: Optional[HTTPAuthorizationCredentials] = fastapi.Depends(HTTPBearer(auto_error=False)),
) -> str:
):
if not model.api_keys:
return

@@ -287,8 +288,6 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
parser = argparse.ArgumentParser(description="OpenChat OpenAI-Compatible RESTful API server.")

# Model
parser.add_argument("--model-type", type=str, required=True, help="Type of model")

parser.add_argument("--stream-period", type=int, default=6, help="Number of tokens per stream event")
parser.add_argument("--api-keys", type=str, nargs="*", default=[], help="Allowed API Keys. Leave blank to not verify")

@@ -327,22 +326,28 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
backupCount=args.log_max_count)
)

# Load model
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())
# Load model type
with open(cached_file(path_or_repo_id=args.model, filename="openchat.json"), "r") as f:
model_type = json.load(f)["model_type"]
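For context, a hypothetical sketch of the `openchat.json` metadata this replaces the `--model-type` flag with. Only `model_type` is read here; the remaining keys come from `save_openchat_metadata` in `ochat/training_deepspeed/train.py` (further down in this diff), which dumps the training arguments, so everything besides `model_type` below is illustrative:

```python
# Hypothetical metadata written at training time and read back at serving time;
# keys other than "model_type" are illustrative placeholders.
example_openchat_json = {
    "model_type": "openchat_v3.2",      # the only key the server uses here
    "model_path": "<base model path>",  # illustrative
    "epochs": 5,                        # illustrative
    "epoch": 4,                         # illustrative
}
```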

# Load tokenizer
tokenizer = async_tokenizer.AsyncTokenizer.remote(args.model_type, args.model)
tokenizer = async_tokenizer.AsyncTokenizer.remote(model_type, args.model)

# Model config
model.name = args.model_type
model.eot_token = MODEL_CONFIG_MAP[args.model_type].eot_token
model.max_length = MODEL_CONFIG_MAP[args.model_type].model_max_context
model.name = model_type
model.max_length = MODEL_CONFIG_MAP[model_type].model_max_context

model.stream_period = args.stream_period
model.api_keys = args.api_keys

# Set max num batched tokens
args.max_num_batched_tokens = max(args.max_num_batched_tokens, model.max_length)

# Load model engine
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())

# Run
uvicorn.run(app,
host=args.host,

45 changes: 29 additions & 16 deletions ochat/training_deepspeed/train.py
@@ -1,6 +1,7 @@
import argparse
import os
import math
import json
from functools import partial

import torch
@@ -41,7 +42,6 @@ def parse_args():
parser.add_argument("--local_rank", type=int, required=True)

# Model type and data
parser.add_argument("--model_type", type=str, required=True)
parser.add_argument("--model_path", type=str, required=True)
parser.add_argument("--data_path", type=str, required=True)
parser.add_argument("--save_path", type=str, required=True)
@@ -77,6 +77,7 @@ def create_dataset(args, split_name):
_rank0_print (f"Skipping loading {split_name}")
return None

_rank0_print(f"Loading {split_name} data from {filename}...")
return ParquetDataset(filename)


@@ -111,20 +112,18 @@ def batch_to_tensor(batch, int_dtype=torch.long, loss_dtype=torch.bfloat16):
batch_tensor[k] = torch.from_numpy(np.concatenate(batch.column(k).to_numpy())).to(dtype)

# cu seqlens
batch_tensor["max_seqlen"] = torch.max(batch_tensor["seqlens"])
batch_tensor["cu_seqlens"] = torch.nn.functional.pad(batch_tensor["seqlens"].cumsum(-1, dtype=torch.int32), (1, 0))

del batch_tensor["seqlens"]
# batch info
batch_info = {"max_seqlen": torch.max(batch_tensor["seqlens"]).item()}

# inputs
return batch_tensor
del batch_tensor["seqlens"]

return batch_tensor, batch_info
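A minimal sketch (hypothetical sequence lengths) of the new return convention: GPU-bound tensors stay in `batch_tensor`, while `max_seqlen` becomes a plain Python `int` in `batch_info` via `.item()`, matching the `int` annotations introduced in `ochat/models/unpadded_llama.py` above:

```python
import torch

# Two packed sequences of lengths 3 and 5 (illustrative numbers).
seqlens = torch.tensor([3, 5], dtype=torch.int32)
cu_seqlens = torch.nn.functional.pad(seqlens.cumsum(-1, dtype=torch.int32), (1, 0))
batch_info = {"max_seqlen": torch.max(seqlens).item()}

print(cu_seqlens)                # tensor([0, 3, 8], dtype=torch.int32)
print(batch_info["max_seqlen"])  # 5, a Python int rather than a 0-dim tensor
```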

def create_distributed_dataloader(args, data):
# Check data
assert data.metadata["model_type"] == args.model_type, \
f"The dataset is for {data.metadata['model_type']}, but you specified {args.model_type} for training."

def create_distributed_dataloader(args, data):
# Multipack dataloader
args.batch_max_len = args.batch_size_per_gpu * MODEL_CONFIG_MAP[args.model_type].model_max_context

@@ -143,6 +142,8 @@ def create_distributed_dataloader(args, data):
def create_model(args):
global LOCAL_RANK

_rank0_print(f"Loading model {args.model_type} from {args.model_path}...")

# Create model + optimizer + lr scheduler
model = MODEL_CONFIG_MAP[args.model_type].model_create(args.model_path)
# Model to assigned cuda device
@@ -198,6 +199,14 @@ def save_tokenizer(args, save_path):
tokenizer.save_pretrained(save_path)


def save_openchat_metadata(args, epoch, save_path):
metadata = vars(args)
metadata["epoch"] = epoch

with open(os.path.join(save_path, "openchat.json"), "w") as f:
json.dump(metadata, f, default=lambda o: "<non-serializable>")
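A tiny illustration (hypothetical arguments) of the `default=` fallback used above: any training argument that `json` cannot encode is written as a placeholder string rather than aborting the checkpoint save:

```python
import argparse
import json

# Hypothetical stand-in for the parsed training args.
args = argparse.Namespace(model_path="openchat/openchat_v3.2",
                          device=object())  # object() is not JSON-serializable

print(json.dumps(vars(args), default=lambda o: "<non-serializable>"))
# {"model_path": "openchat/openchat_v3.2", "device": "<non-serializable>"}
```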


def calculate_auto_lr(lr, batch_max_len, train_dataset):
if lr is not None:
return lr
@@ -227,10 +236,12 @@ def train():
LOCAL_RANK = args.local_rank

# Dataset
_rank0_print("Loading data...")
train_dataset = create_dataset(args, "train")
eval_dataset = create_dataset(args, "eval")

# Load model type
args.model_type = train_dataset.metadata["model_type"]

# Data Loader
train_loader = create_distributed_dataloader(args, train_dataset)
train_total_steps = args.epochs * train_loader.num_batches()
@@ -243,7 +254,6 @@ def train():
args.lr = calculate_auto_lr(args.lr, args.batch_max_len, train_dataset)

# Model
_rank0_print("Loading model...")
model_engine, optimizer = create_model(args)

# LR Scheduler
@@ -265,16 +275,16 @@ def train():
model_engine.train()

train_loader.set_epoch(epoch)
for batch, all_numseq, cur_numseq in train_loader:
for (batch_tensor, batch_info), all_numseq, cur_numseq in train_loader:
step += 1
if step > train_total_steps: # At most train_total_steps
break

# To device
batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()}

# Update
loss = (1 / all_numseq) * model_engine(**batch).loss
loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss

model_engine.backward(loss)

@@ -304,12 +314,12 @@

eval_loader.set_epoch(epoch)
with torch.inference_mode():
for batch, all_numseq, cur_numseq in eval_loader:
for (batch_tensor, batch_info), all_numseq, cur_numseq in eval_loader:
# To device
batch = {k: (v.to(args.device) if v is not None else None) for k, v in batch.items()}
batch_tensor = {k: (v.to(args.device) if v is not None else None) for k, v in batch_tensor.items()}

# Eval
eval_loss = (1 / all_numseq) * model_engine(**batch).loss
eval_loss = (1 / all_numseq) * model_engine(**batch_tensor, **batch_info).loss

# Accumulate eval loss
eval_total_loss.add_(eval_loss)
@@ -337,6 +347,9 @@ def train():
# Also save tokenizer from base model
save_tokenizer(args, save_path)

# Write metadata
save_openchat_metadata(args, epoch, save_path)


if __name__ == "__main__":
train()
