diff --git a/benchmarking/configs/inference_test.yml b/benchmarking/configs/inference_test.yml new file mode 100644 index 000000000..13e36aee5 --- /dev/null +++ b/benchmarking/configs/inference_test.yml @@ -0,0 +1,11 @@ +# GPT inference testing setup +models: + - EleutherAI/pythia-70m + - EleutherAI/pythia-160m + - EleutherAI/pythia-410m + - EleutherAI/pythia-1b + - EleutherAI/pythia-1.4b + +world_size: 1 +trials: 10 +max_tokens: 4 diff --git a/benchmarking/hf_ds_benchmark.py b/benchmarking/hf_ds_benchmark.py new file mode 100644 index 000000000..d859af510 --- /dev/null +++ b/benchmarking/hf_ds_benchmark.py @@ -0,0 +1,179 @@ +'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py''' + +import argparse +import io +import os +import subprocess +import time + +import deepspeed +from deepspeed.accelerator import get_accelerator +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from transformers import pipeline +import torch +import yaml + +PYTHIA_TO_OLD_SUFFIXES = { + "70m": "19M", + "160m": "125M", + "410m": "350M", + "1b": "800M", + "1.4b": "1-3B", + "2.8b": "2.7B", + "6.9b": "6-7B", + "12b": "13B", + "20b": "20B"} + +def benchmark_model( + model, output_dir, use_deepspeed, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials): + + deepspeed.init_distributed() + if local_rank == 0: + print("BENCHMARK SETTINGS:") + print(f"\tMODEL: {model}") + print(f"\tMAX_TOKENS: {max_tokens}") + print(f"\tDTYPE: {dtype}") + print(f"\tCUDA_GRAPHS: {graphs}") + print(f"\tKERNEL_INJECT: {kernel_inject}") + print(f"\tWORLD_SIZE: {world_size}") + + if dtype == "int8": + dtype = torch.int8 + elif dtype == "fp16": + dtype = torch.float16 + else: + dtype = torch.float32 + + pipe = pipeline("text-generation", model=model, framework="pt") + + if dtype == torch.float16: + pipe.model.half() + print("") + if use_deepspeed: + pipe.model = deepspeed.init_inference( + pipe.model, + dtype=dtype, + mp_size=world_size, + replace_with_kernel_inject=kernel_inject, + enable_cuda_graph=graphs, + ) + pipe.model.profile_model_time() + + responses = [] + times = [] + mtimes = [] + for i in range(trials): + get_accelerator().synchronize() + start = time.time() + r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=max_tokens) + get_accelerator().synchronize() + end = time.time() + responses.append(r) + times.append(end - start) # / (max_tokens - 3)) + if use_deepspeed: + mtimes.append(sum(pipe.model.model_times())) + + if use_deepspeed: + for_dataframe = np.vstack((times, mtimes, list(map(lambda t: t / (max_tokens - 3), times)))).T + columns = ["(e2e) latency", "(model-only) latency", "(e2e) per token latency"] + + else: + for_dataframe = np.vstack((times, list(map(lambda t: t / (max_tokens - 3), times)))).T + columns = ["(e2e) latency", "(e2e) per token latency"] + + df = pd.DataFrame( + for_dataframe, + columns = columns) + + if local_rank == 0: + + + deepspeed_str = "deepspeed" if use_deepspeed else "hf" + deepspeed_dir = os.path.join(output_dir, deepspeed_str) + max_tokens_dir = os.path.join(deepspeed_dir, "max_tokens_{}".format(max_tokens)) + world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size)) + + os.makedirs(world_size_dir, exist_ok=True) + + fname = os.path.join(world_size_dir, + "{}_{}_benchmark.csv".format(model.split('/')[-1], str(dtype).split('.')[1])) + + print("saving benchmark to {}".format(fname)) + + df.to_csv(fname, index=False) + return df + + +def main(models, output_dir, dtype, graphs, 
kernel_inject, max_tokens, local_rank, world_size, trials): + deepspeed_dfs = [] + hf_dfs = [] + print("Models to benchmark: {}".format(models)) + for model in models: + print("Benchmarking model: {}".format(model)) + # run using deepspeed + print("Running with deepspeed") + deepspeed_dfs.append(benchmark_model( + model, output_dir, True, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials)) + + # run using huggingface + print("Running with huggingface") + hf_dfs.append(benchmark_model( + model, output_dir, False, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials)) + + + print("plotting results") + # drop first 3 rows (warmup) + ds_means = [x["(e2e) latency"].iloc[3:].mean() for x in deepspeed_dfs] + ds_std = [x["(e2e) latency"].iloc[3:].std() for x in deepspeed_dfs] + hf_means = [x["(e2e) latency"].iloc[3:].mean() for x in hf_dfs] + hf_std = [x["(e2e) latency"].iloc[3:].std() for x in hf_dfs] + + + # plot results + fig, ax = plt.subplots(figsize=(12, 4)) + ax.bar( + np.arange(len(ds_means)) - 0.24, + ds_means, yerr=ds_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Deepspeed') + ax.bar( + np.arange(len(hf_means)) + 0.24, + hf_means, yerr=hf_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Huggingface') + ax.set_xticks(np.arange(len(models))) + ax.set_xticklabels(models) + ax.set_xlabel('Model') + ax.set_ylabel('Time (s)') + plt.legend() + plt.tight_layout() + plt.title("e2e latency (s), {} tokens, {} world size, {} trials".format(max_tokens, world_size, trials)) + plt.savefig(os.path.join(output_dir, "benchmark.png")) + print("plot saved to {}".format(os.path.join(output_dir, "benchmark.png"))) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output_dir", type=str, default='/home/mchorse/benchmarking/output', help="output_directory") + parser.add_argument("--config", type=str, default='configs/inference_test.yml') + parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "int8"], help="int8, fp16, or fp32") + parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") + parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") + parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") + args = parser.parse_args() + + with open(args.config, "r") as f: + config = yaml.safe_load(f) + + models = config["models"] + world_size = config["world_size"] + trials = config["trials"] + max_tokens = config["max_tokens"] + + main(models=models, + output_dir=args.output_dir, + dtype=args.dtype, + graphs=args.graphs, + kernel_inject=args.kernel_inject, + max_tokens=max_tokens, + local_rank=args.local_rank, + world_size=world_size, + trials=trials) + diff --git a/benchmarking/megatron_config.json b/benchmarking/megatron_config.json new file mode 100644 index 000000000..355906854 --- /dev/null +++ b/benchmarking/megatron_config.json @@ -0,0 +1 @@ +{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 32, "optimizer": {"type": "Adam", "params": {"lr": 0.0008, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, 
"contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 10, "hidden_size": 640, "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "rotary_pct": 0.25, "init_method": "small_init", "output_layer_init_method": "wang_init", "gpt_j_residual": true, "output_layer_parallelism": "column", "lr_decay_style": "cosine", "lr_decay_iters": 143000, "min_lr": 8e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.0008, "padded_vocab_size": 50304, "data_path": "../data/enwik8/enwik8_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"49M.yml": "{\n # parallelism settings\n \"pipe-parallel-size\": 2,\n \"model-parallel-size\": 1,\n\n # model settings\n \"num-layers\": 10,\n \"hidden-size\": 640,\n \"num-attention-heads\": 10,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"pos-emb\": \"rotary\",\n \"rotary-pct\": 0.25,\n \"no-weight-tying\": true,\n \"gpt-j-residual\": true,\n \"output-layer-parallelism\": \"column\",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \"scaled-upper-triang-masked-softmax-fusion\": false,\n \"bias-gelu-fusion\": false,\n\n # init methods\n \"init_method\": \"small_init\",\n \"output_layer_init_method\": \"wang_init\",\n\n # optimizer settings\n \"optimizer\": {\n \"type\": \"Adam\",\n \"params\": {\n \"lr\": 0.0008,\n \"betas\": [0.9, 0.95],\n \"eps\": 1.0e-8,\n }\n },\n \"min_lr\": 0.00008,\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": True,\n \"allgather_bucket_size\": 500000000,\n \"overlap_comm\": True,\n \"reduce_scatter\": True,\n \"reduce_bucket_size\": 500000000,\n \"contiguous_gradients\": True,\n },\n\n # batch / data settings\n \"train_micro_batch_size_per_gpu\": 32,\n \"gas\": 1,\n \"data-impl\": \"mmap\",\n \"num_workers\": 1,\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": true,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0.1,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": {\n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"loss_scale_window\": 1000,\n \"initial_scale_power\": 12,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1,\n },\n\n # misc. 
training settings\n \"train-iters\": 143000,\n \"lr-decay-iters\": 143000,\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"checkpoint-factor\": 1000,\n \"eval-interval\": 100000,\n \"eval-iters\": 10,\n\n # logging\n \"log-interval\": 10,\n \"steps_per_print\": 10,\n \"wall_clock_breakdown\": true,\n}\n", "benchmark_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n \"data-path\": \"../data/enwik8/enwik8_text_document\",\n\n # or for weighted datasets:\n # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"train-data-weights\": [1., 2.],\n # \"test-data-weights\": [2., 1.],\n # \"valid-data-weights\": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n # WARNING: setting this to True will override any user provided weights\n # \"weight_by_num_documents\": false,\n # \"weighted_sampler_alpha\": 0.3,\n\n \"vocab-file\": \"../data/gpt2-vocab.json\",\n \"merge-file\": \"../data/gpt2-merges.txt\",\n\n \"save\": \"checkpoints\",\n \"load\": \"checkpoints\",\n \"checkpoint_validation_with_forward_pass\": False,\n\n \"tensorboard-dir\": \"tensorboard\",\n \"log-dir\": \"logs\",\n \"use_wandb\": True,\n \"wandb_host\": \"https://api.wandb.ai\",\n \"wandb_project\": \"neox\"\n}\n", "benchmarking.yml": "# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n \"text-gen-type\": \"from_prompt\",\n\n # Params for all\n \"maximum_tokens\": 128,\n \"prompt_end\": \"\\n\",\n \"temperature\": 1.0,\n \"top_p\": 0.0,\n \"top_k\": 0,\n \"recompute\": false,\n\n # `unconditional`: samples\n \"num-samples\": 10,\n}\n"}, "load": "checkpoints", "checkpoint_factor": 1000, "batch_size": 32, "train_iters": 143000, "eval_iters": 10, "eval_interval": 100000, "vocab_file": "../data/gpt2-vocab.json", "merge_file": "../data/gpt2-merges.txt", "num_workers": 1, "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0.1, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 1, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "world_size": 2, "is_pipe_parallel": true, "use_wandb": true, "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 10, "text_gen_type": "from_prompt", "temperature": 1.0, "maximum_tokens": 128, "num_samples": 10, "local_rank": 0, "rank": 0, "save_iters": [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 
105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], "global_num_gpus": 8} \ No newline at end of file diff --git a/benchmarking/neox_benchmark.py b/benchmarking/neox_benchmark.py new file mode 100644 index 000000000..8b3e18e54 --- /dev/null +++ b/benchmarking/neox_benchmark.py @@ -0,0 +1,87 @@ +'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py''' + +import argparse +import os +import sys +sys.path.insert(0, os.path.abspath(os.getcwd())) + + +import tempfile +import time + +import deepspeed +from deepspeed.accelerator import get_accelerator +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from transformers import pipeline +import torch +import yaml + +from megatron.text_generation_utils import generate_samples_from_prompt +from megatron.utils import print_rank_0, setup_for_inference_or_eval + + +PYTHIA_TO_OLD_SUFFIXES = { + "70m": "19M", + "160m": "125M", + "410m": "350M", + "1b": "800M", + "1.4b": "1-3B", + "2.8b": "2.7B", + "6.9b": "6-7B", + "12b": "13B", + "20b": "20B"} + + +def main(): + model, neox_args = setup_for_inference_or_eval(use_cache=True) + max_tokens = 10 + print_rank_0("Finished loading model") + + prompts = ["DeepSpeed is" for x in range(100)] + + generated_texts = generate_samples_from_prompt( + neox_args=neox_args, + model=model, + text=prompts, + eos_token_id=0, + maximum_tokens=10, + recompute=neox_args.recompute, + temperature=neox_args.temperature, + top_k=neox_args.top_k, + top_p=neox_args.top_p, + ) + + times = [x["duration_seconds"] for x in generated_texts] + + for_dataframe = np.vstack((times, list(map(lambda t: t / (max_tokens - 3), times)))).T + columns = ["(e2e) latency", "(e2e) per token latency"] + + df = pd.DataFrame( + for_dataframe, + columns = columns) + + + # save dataframe to CSV inside the directory for world_size + # if local_rank == 0: + + # neox_dir = os.path.join(output_dir, "neox") + # max_tokens_dir = os.path.join(neox_dir, "max_tokens_{}".format(max_tokens)) + # world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size)) + + # os.makedirs(world_size_dir, exist_ok=True) + + # fname = os.path.join(world_size_dir, + # "{}_fp16_benchmark.csv".format(model.split('/')[-1])) + + # print("saving benchmark to {}".format(fname)) + # df.to_csv(fname, index=False) + print("Starting data generation...") + df.to_csv(sys.stdout, index=False) + print("Data generation complete!") + + +if __name__ == "__main__": + main() + diff --git a/benchmarking/neox_benchmark_input.txt b/benchmarking/neox_benchmark_input.txt new file mode 100644 index 000000000..eff01f7d4 --- /dev/null +++ b/benchmarking/neox_benchmark_input.txt @@ -0,0 +1,100 @@ +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is 
\n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n diff --git a/benchmarking/scrapbook.ipynb b/benchmarking/scrapbook.ipynb new file mode 100644 index 000000000..ceda84fb5 --- /dev/null +++ b/benchmarking/scrapbook.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "928fcf87-4202-4111-ab43-0382a389f37d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.2' currently installed).\n", + " from pandas.core.computation.check import NUMEXPR_INSTALLED\n" + ] + } + ], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import yaml\n", + "\n", + "with open(\"/home/mchorse/gpt-neox/benchmarking/configs/inference_test.yml\", \"r\") as f:\n", + " config = yaml.safe_load(f)\n", + "\n", + "models = config[\"models\"]\n", + "world_size = config[\"world_size\"]\n", + "trials = config[\"trials\"]\n", + "max_tokens = config[\"max_tokens\"]\n", + "\n", + "models = [x.split(\"/\")[-1] for x in models]\n", + "ds_files = [\"/home/mchorse/inference_benchmark/deepspeed/max_tokens_128/world_size_1/{}_float16_benchmark.csv\".format(x) for x in models]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0578b8d9-462d-4d12-b583-4197f3ea4ea4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "hf_files = [\"/home/mchorse/inference_benchmark/hf/max_tokens_128/world_size_1/{}_float16_benchmark.csv\".format(x) for x in models]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2e935d3d", + "metadata": {}, + "outputs": [], + "source": [ + "ds_dfs = [pd.read_csv(x) for x in ds_files]\n", + "hf_dfs = [pd.read_csv(x) for x in hf_files]\n", + "ds_means = [x[\"(e2e) latency\"].iloc[3:].mean() for x in ds_dfs]\n", + "ds_std = [x[\"(e2e) latency\"].iloc[3:].std() for x in ds_dfs]\n", + "hf_means = [x[\"(e2e) latency\"].iloc[3:].mean() for x in hf_dfs]\n", + "hf_std = [x[\"(e2e) latency\"].iloc[3:].std() for x in hf_dfs]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e681fa64", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAioAAAGwCAYAAACHJU4LAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA6T0lEQVR4nO3deViU9f7/8deIMiAyKCiKCmq5hJricvSH6Ulzw0otM60sMZes41ZkGacFadO0skUzK5c6qVlq5bdSsxJTO6a5m6ZJap7C6GiJICDC5/dHl3OacBkGmLmB5+O65rrmXj/v+cwAL+77M/dtM8YYAQAAWFAlXxcAAABwIQQVAABgWQQVAABgWQQVAABgWQQVAABgWQQVAABgWQQVAABgWZV9XUBxFBQU6Oeff1ZwcLBsNpuvywEAAG4wxujUqVOqW7euKlW6+DGTMh1Ufv75Z0VGRvq6DAAA4IGjR4+qfv36F12nTAeV4OBgSX+8UIfD4eNqAACAOzIyMhQZGen8O34xZTqonDvd43A4CCoAAJQx7gzbYDAtAACwLIIKAACwLIIKAACwLJ+OUZk8ebKSk5Nd5jVr1kzfffddibaTn5+vvLy8Et0nyr4qVarIz8/P12UAAC7C54NpW7Rooc8++8w5XblyyZVkjNGxY8f0+++/l9g+Ub5Ur15dderU4To8AGBRPg8qlStXVp06dUpl3+dCSnh4uKpWrcofIzgZY3T69Gmlp6dLkiIiInxcEQDgfHweVL7//nvVrVtXAQEBio2N1ZQpUxQVFXXedXNzc5Wbm+uczsjIuOB+8/PznSElLCysxOtG2RcYGChJSk9PV3h4OKeBAMCCfDqYtmPHjlqwYIFWrVql2bNn69ChQ+rSpYtOnTp13vWnTJmikJAQ5+NiV6U9NyalatWqpVI7yodznw/GMAGANdmMMcbXRZzz+++/q0GDBnr++ec1YsSIQsvPd0QlMjJSJ0+eLHTBt5ycHB06dEiNGjVSQEBAqdeOsonPCQB4X0ZGhkJCQs779/uvfH7q58+qV6+upk2b6uDBg+ddbrfbZbfbvVwVAADwFUtdRyUzM1OpqakMbAQAAJJ8fERl4sSJ6tu3rxo0aKCff/5ZSUlJ8vPz06233lqq7c5Yc6BU9/9n9/Vs6rW2KpLDhw+rUaNG2r59u2JiYnxdDgCglPj0iMp//vMf3XrrrWrWrJkGDRqksLAwbdq0SbVq1fJlWT43bNgw2Ww22Ww2ValSRbVr11bPnj01b948FRQU+Lo8AAC8xqdHVN555x1fNm9pcXFxmj9/vvLz8/XLL79o1apVmjBhgpYuXaoVK1aU6IXxAACwKkuNUcH/2O121alTR/Xq1VPbtm31z3/+Ux9++KFWrlypBQsWSPrjW1IjR45UrVq15HA4dM0112jnzp0u+/nwww/Vtm1bBQQE6LLLLlNycrLOnj3rXG6z2TR79mz16dNHgYGBuuyyy7R06VLn8jNnzmjs2LGKiIhQQECAGjRooClTpri9vSQdPXpUgwYNUvXq1RUaGqr+/fvr8OHDLuu88cYbio6OVkBAgK644gq98sorLss3b96sNm3aKCAgQO3bt9f27duL070AKqi0tDRt27bNa4+0tDRfv+Qyj3/Ly5BrrrlGrVu31vLlyzVy5EjdfPPNCgwM1MqVKxUSEqI5c+aoe/fuOnDggEJDQ7V+/XoNHTpUL730krp06aLU1FTdddddkqSkpCTnfh999FFNnTpVL774ov71r3/plltu0e7duxUdHa2XXnpJK1as0LvvvquoqCgdPXpUR48edanrYtvn5eWpd+/eio2N1fr161W5cmU9+eSTiouL065du+Tv76+FCxfqscce08yZM9WmTRtt375do0aNUlBQkOLj45WZmanrr79ePXv21Ntvv61Dhw5pwoQJXu17AOXDnDlzCt1jrjQlJSVp8uTJXmuvPCKolDFXXHGFdu3apQ0bNmjz5s1KT093fmX72Wef1QcffKClS5fqrrvuUnJysh566CHFx8dLki677DI98cQTevDBB12Cys0336yRI0dKkp544gmtWbNGL7/8sl555RX9+OOPatKkiTp37iybzaYGDRoUquli2y9ZskQFBQV64403nLcwmD9/vqpXr66UlBT16tVLSUlJeu655zRgwABJUqNGjbR3717NmTNH8fHxWrRokQoKCjR37lwFBASoRYsW+s9//qN77rmn9DoaQLk0evRo9evXz+31s7Oz1blzZ0nShg0bnFe0dhffYi0+gkoZY4yRzWbTzp07lZmZWej2ANnZ2UpNTZUk7dy5Uxs3btRTTz3lXJ6fn6+cnBydPn3aeVXW2NhYl33ExsZqx44dkv4Y2NuzZ081a9ZMcXFxuv7669WrV69C619o+507d+rgwYMKDg52WScnJ0epqanKyspSamqqRowYoVGjRjmXnz17ViEhIZKkffv2qVWrVi4XZPtrmwDgjoiIiCKFh6ysLOfzmJgYBQUFlUZZuAiCShmzb98+NWrUSJmZmYqIiFBKSkqhdapXry7pj+vSJCcnO49U/Jm7V2Ft27atDh06pJUrV+qzzz7ToEGD1KNHj0LjUC4kMzNT7dq108KFCwstq1WrljIzMyVJr7/+ujp27OiynHvvAAAIKmXIF198od27d+u+++5T/fr1dezYMVWuXFkNGzY87/pt27bV/v371bhx44vud9OmTRo6dKjLdJs2bZzTDodDgwcP1uDBgzVw4EDFxcXpxIkTCg0NveT2bdu21ZIlSxQeHn7eyySHhISobt26+uGHHzRkyJDz1hcdHa1//etfysnJcQasTZs2XfQ1AQDKB4KKReXm5urYsWMuX0+eMmWKrr/+eg0dOlSVKlVSbGysbrjhBk2bNk1NmzbVzz//rI8//lg33nij2rdvr8cee0zXX3+9oqKiNHDgQFWqVEk7d+7Unj179OSTTzrbeu+999S+fXt17txZCxcu1ObNmzV37lxJ0vPPP6+IiAi1adNGlSpV0nvvvac6deo4j9pcavshQ4Zo+vTp6t+/vx5//HHVr19fR44c0fLly/Xggw+qfv36Sk5O1vjx4xUSEqK4uDjl5ubqm2++0W+//aaEhATddtttevjhhzVq1CglJibq8OHDevbZZ736fgAAfMSUYSdPnjSSzMmTJwsty87ONnv37jXZ2dk+qKx44uPjjSQjyVSuXNnUqlXL9OjRw8ybN8/k5+c718vIyDDjxo0zdevWNVWqVDGRkZFmyJAh5scff3Sus2rVKtOpUycTGBhoHA6H6dChg3nttdecyyWZWbNmmZ49exq73W4aNmxolixZ4lz+2muvmZiYGBMUFGQcDofp3r272bZtm9vbG2NMWlqaGTp0qKlZs6ax2+3msssuM6NGjXJ53xYuXGhiYmKMv7+/qVGjhvn73/9uli9f7lz+73//27Ru3dr4+/ubmJgYs2zZMiPJbN++vVh9XZY/JwBKX2ZmpvP3cWZmpq/LKTcu9vf7ryx19+SiutjdF7krrntsNpvef/993XDDDT7Z3tf4nAC4mKysLF
WrVk3SH2PuGExbMopy92Qu+AYAACyLMSoAnNLS0rx6Jc2iflUUQMVDUKnginvmrwyfOcR5cNVOAFZDUAHgxFU7AVgNQQWAE1ftBGA1DKYFAACWRVABAACWRVABAACWRVDBJTVs2FAvvPBCie/3gw8+UOPGjeXn56d77723xPcPACj7KuZg2rVTvNdWt8QibzJs2DD9/vvv+uCDD1zmp6SkqFu3bvrtt99c7rVT2rZs2VIqgyRHjx6tO++8U+PHj1dwcHCJ7x8AUPZVzKCCIqlVq1aJ7zMzM1Pp6enq3bu36tatW+L7B4AZaw4Uex+52aedz1/+/HvZA6sWa3/39Wxa3JIqHE79lFGTJ09WTEyMy7wXXnhBDRs2dE6fPXtW48ePV/Xq1RUWFqZJkyYpPj7e5b48p06d0pAhQxQUFKSIiAjNmDFDXbt2dTkV89dTPzabTW+88YZuvPFGVa1aVU2aNNGKFStcalmxYoWaNGmigIAAdevWTW+++aZsNpt+//13paSkOI+gXHPNNbLZbEpJSdHx48d16623ql69eqpataquvPJKLV682GW/BQUFmjZtmho3biy73a6oqCg99dRTzuVHjx7VoEGDVL16dYWGhqp///46fPiwR30MAPA9gko59swzz2jhwoWaP3++Nm7cqIyMjEKnkxISErRx40atWLFCa9as0fr167Vt27ZL7js5OVmDBg3Srl27dO2112rIkCE6ceKEJOnQoUMaOHCgbrjhBu3cuVOjR4/Www8/7Ny2U6dO2r9/vyRp2bJlSktLU6dOnZSTk6N27drp448/1p49e3TXXXfpjjvu0ObNm53bJiYmaurUqXr00Ue1d+9eLVq0SLVr15Yk5eXlqXfv3goODtb69eu1ceNGVatWTXFxcTpz5kxxuxMA4AOc+rGojz76yHnHznPy8/OLtI+XX35ZiYmJuvHGGyVJM2fO1CeffOJcfurUKb355ptatGiRunfvLkmaP3++W6dihg0bpltvvVWS9PTTT+ull17S5s2bFRcXpzlz5qhZs2aaPn26JKlZs2bas2eP88iHv7+/wsPDJUmhoaGqU6eOJKlevXqaOHGis41x48Zp9erVevfdd9WhQwedOnVKL774ombOnKn4+HhJ0uWXX+68MuqSJUtUUFCgN954Qzabzfl6qlevrpSUFPXq1atI/QcA8D2CikV169ZNs2fPdpn39ddf6/bbb3dr+5MnT+qXX35Rhw4dnPP8/PzUrl07FRQUSJJ++OEH5eXluawTEhKiZs2aXXL/rVq1cj4PCgqSw+FQenq6JGn//v3629/+5rL+n9u4kPz8fD399NN699139dNPP+nMmTPKzc1V1ap/nBPet2+fcnNznaHqr3bu3KmDBw8WGpibk5Oj1NTUS7YPALAegopFBQUFqXHjxi7z/vOf/zifV6pUqdANAfPy8rxSmyRVqVLFZdpmszkDkKemT5+uF198US+88IKuvPJKBQUF6d5773WetrnUfWQyMzPVrl07LVy4sNCy0hgQDAAofYxRKaNq1aqlY8eOuYSVHTt2OJ+HhISodu3a2rJli3Nefn6+y/iTyy67TFWqVHFZ5+TJkzpwoHgj5Zs1a6ZvvvnGZd6f27iQjRs3qn///rr99tvVunVrXXbZZS61NGnSRIGBgfr888/Pu33btm31/fffKzw8XI0bN3Z5hISEFOs1AQB8g6BSRnXt2lW//vqrpk2bptTUVM2aNUsrV650WWfcuHGaMmWKPvzwQ+3fv18TJkzQb7/95hy/ERwcrPj4eD3wwANau3atvv32W40YMUKVKlVyruOJ0aNH67vvvtOkSZN04MABvfvuu1qwYIEkXXS/TZo00Zo1a/TVV19p3759Gj16tH755Rfn8oCAAE2aNEkPPvig3nrrLaWmpmrTpk2aO3euJGnIkCGqWbOm+vfvr/Xr1+vQoUNKSUnR+PHjXY5GAQDKDoJKGRUdHa1XXnlFs2bNUuvWrbV582aXgaiSNGnSJN16660aOnSoYmNjVa1aNfXu3VsBAQHOdZ5//nnFxsbq+uuvV48ePXTVVVcpOjraZZ2iatSokZYuXarly5erVatWmj17tvNbP3a7/YLbPfLII2rbtq169+6trl27qk6dOi5fpZakRx99VPfff78ee+wxRUdHa/Dgwc6xMVWrVtWXX36pqKgoDRgwQNHR0RoxYoRycnLkcDg8fj0AAN+xmb8OdChDMjIyFBISopMnTxb6Q5STk6NDhw6pUaNGxfqjW54UFBQoOjpagwYN0hNPPHHedbKyslSvXj0999xzGjFiRIm1/dRTT+nVV1/V0aNHS2yfJYHPSfFkZWU5v52WmZlZKlcwBjxVUhd8S+zfRpI05cPtXPCthFzs7/dfMZi2HDty5Ig+/fRTXX311crNzdXMmTN16NAh3Xbbbc51tm/fru+++04dOnTQyZMn9fjjj0uS+vfvX6y2X3nlFf3tb39TWFiYNm7cqOnTp2vs2LHF2icAoOIhqJRjlSpV0oIFCzRx4kQZY9SyZUt99tlnio6Odlnv2Wef1f79++Xv76927dpp/fr1qlmzZrHa/v777/Xkk0/qxIkTioqK0v3336/ExKLf9wgAULERVMqxyMhIbdy48aLrtGnTRlu3bi3xtmfMmKEZM2aU+H4BABULQQWooLhhG4CyoNx/66cMjxWGF/D5AABrK7dB5dyVU0+fPn2JNVGRnft8/PVKuwAAayi3p378/PxUvXp1l2tsFOciZihfjDE6ffq00tPTVb16dfn5+fm6JADAeZTboCLJeVfec2EF+Kvq1as7PycAyr+M4+nKOPGr2+ufyc1xPv8pdZ/87UW73pIjtJYcYeFF2gauynVQsdlsioiIUHh4uFdv2IeyoUqVKhxJASqYrz5eok/fnunRtjMTbrv0Sn/R6/axihs6zqP28IdyHVTO8fPz4w8SAECdrhuslrHXeK09Ryh3bi+uChFUAACQJEdYOKdiyphy+60fAABQ9hFUAACAZRFUAACAZRFUAACAZTGYFgAAi0pLS1NaWprX2ouIiFBERITX2nMHQQUAAIuaM2eOkpOTvdZeUlKSJk+e7LX23EFQAQDAokaPHq1+/fq5vX52drY6d+4sSdqwYYMCAwOL1J7VjqZIBBUAACyrqKdisrKynM9jYmIUFBRUGmV5FYNpAQCAZXFEBQAAq1g7pXjbZ5/53/N1z0qB/sXbnyR1Syz+PoqBIyoAAMCyCCoAAMCyOPUDwCnjeLoyTvzq9vpncnOcz39K3Sd/e0CR2nOE1uIGccBFpB3PUNrxU26vn52b53y+4+DPCrRXKVJ7EWHBighzFGmb0kZQAeD01cdL9OnbMz3admbCbUXeptftYxU3dJxH7QEVwZz/26zkt77waNvOE14r8jZJQ6/R5GE9PGqvtBBUADh1um6wWsZe47X2HKG1vNYWUBaN7ttB/TpFe629iLBgr7XlLoIKACdHWDinYgALiQhzWO5UjLcxmBYAAFgWQQUAAFgWp34AlFncWRYo/wgqAMos7iwLlH8EFQBlFneWBco/y
wSVqVOnKjExURMmTNALL7zg63IAlAHcWRYo/ywRVLZs2aI5c+aoVatWvi4FgJVxwzagwvH5t34yMzM1ZMgQvf7666pRo4avywEAABbi8yMqY8aM0XXXXacePXroySefvOi6ubm5ys3NdU5nZGSUdnkALIz7oADln0+DyjvvvKNt27Zpy5Ytbq0/ZcoUr47wB2Bt3AcFKP98FlSOHj2qCRMmaM2aNQoIcO+Oq4mJiUpISHBOZ2RkKDIysrRKBGBx3AcFKP98FlS2bt2q9PR0tW3b1jkvPz9fX375pWbOnKnc3Fz5+fm5bGO322W3271dKgCL4j4oQPnns6DSvXt37d6922XenXfeqSuuuEKTJk0qFFIAAEDF47OgEhwcrJYtW7rMCwoKUlhYWKH5AACgYvL515MBAAAuxOdfT/6zlJQUX5cAAAAshCMqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsir7ugAAqKjS0tKUlpbmtfYiIiIUERHhtfaAkkBQAQAfmTNnjpKTk73WXlJSkiZPnuy19oCSQFABAB8ZPXq0+vXr5/b62dnZ6ty5syRpw4YNCgwMLFJ7JXE0haNA8DaCCgD4SFH/CGdlZTmfx8TEKCgoqDTKuiiOAsHbCCoA4CUz1hwo1va52aedz1/+/HvZA6sWtyTd17NpkdYvi0eBULYRVAAAbiuLR4FQtvH1ZAAAYFkcUQEAH8k4nq6ME7+6vf6Z3Bzn859S98nfHlCk9hyhteQICy/SNoCvEVQAwEe++niJPn17pkfbzky4rcjb9Lp9rOKGjvOoPcBXCCoA4COdrhuslrHXeK09R2gtr7UFlBSCCgD4iCMsnFMxwCUwmBYAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWF3wDAJzf2inF30f2mf89X/esFOhfvP11Syze9ihzOKICAAAsi6ACAAAsi6ACAAAsi6ACAAAsi6ACAAAsi6ACAAAsy6dBZfbs2WrVqpUcDoccDodiY2O1cuVKX5YEAAAsxKdBpX79+po6daq2bt2qb775Rtdcc4369++vb7/91pdlAQAAi/DpBd/69u3rMv3UU09p9uzZ2rRpk1q0aOGjqgAAgFUUOagUFBRo3bp1Wr9+vY4cOaLTp0+rVq1aatOmjXr06KHIyEiPCsnPz9d7772nrKwsxcbGnned3Nxc5ebmOqczMjI8agsAAJQNbp/6yc7O1pNPPqnIyEhde+21WrlypX7//Xf5+fnp4MGDSkpKUqNGjXTttddq06ZNbhewe/duVatWTXa7XXfffbfef/99NW/e/LzrTpkyRSEhIc6Hp6EIAACUDW4fUWnatKliY2P1+uuvq2fPnqpSpUqhdY4cOaJFixbplltu0cMPP6xRo0Zdcr/NmjXTjh07dPLkSS1dulTx8fFat27decNKYmKiEhISnNMZGRmEFQAAyjG3g8qnn36q6Ojoi67ToEEDJSYmauLEifrxxx/d2q+/v78aN24sSWrXrp22bNmiF198UXPmzCm0rt1ul91ud7dkAABQxrl96udSIeXPqlSpossvv9yjggoKClzGoQAAgIrLo2/9rFq1StWqVVPnzp0lSbNmzdLrr7+u5s2ba9asWapRo4Zb+0lMTFSfPn0UFRWlU6dOadGiRUpJSdHq1as9KQsAUMrSjmco7fgpt9fPzs1zPt9x8GcF2gsPG7iYiLBgRYQ5irQNyhePgsoDDzygZ555RtIfg2Hvv/9+JSQkaO3atUpISND8+fPd2k96erqGDh2qtLQ0hYSEqFWrVlq9erV69uzpSVkAgFI25/82K/mtLzzatvOE14q8TdLQazR5WA+P2kP54FFQOXTokHOw67Jly3T99dfr6aef1rZt23Tttde6vZ+5c+d60jwAwEdG9+2gfp3cHwpQXBFhwV5rC9bkUVDx9/fX6dOnJUmfffaZhg4dKkkKDQ3l2iYAUI5FhDk4FQOv8iiodO7cWQkJCbrqqqu0efNmLVmyRJJ04MAB1a9fv0QLBAAAFZdH9/qZOXOmKleurKVLl2r27NmqV6+eJGnlypWKi4sr0QIBAEDF5dERlaioKH300UeF5s+YMaPYBQEAAJxTrJsSpqenKz09XQUFBS7zW7VqVayiAAAAJA+DytatWxUfH699+/bJGCNJstlsMsbIZrMpPz+/RIsEAAAVk0dBZfjw4WratKnmzp2r2rVry2azlXRdAAAAngWVH374QcuWLXPeowcAAKA0ePStn+7du2vnzp0lXQsAAIALj46ovPHGG4qPj9eePXvUsmVLVanieu+Gfv36lUhxAACgYvMoqPz73//Wxo0btXLlykLLGEwLAABKikenfsaNG6fbb79daWlpKigocHkQUgAAQEnxKKgcP35c9913n2rXrl3S9QAAADh5FFQGDBigtWvXlnQtAAAALjwao9K0aVMlJiZqw4YNuvLKKwsNph0/fnyJFAcAACo2j7/1U61aNa1bt07r1q1zWWaz2QgqAACgRHgUVA4dOlTSdQAAABTi0RgVAAAAb3A7qEydOlXZ2dlurfv111/r448/9rgoAAAAqQhBZe/evYqKitI//vEPrVy5Ur/++qtz2dmzZ7Vr1y698sor6tSpkwYPHqzg4OBSKRgAAFQcbo9Reeutt7Rz507NnDlTt912mzIyMuTn5ye73a7Tp09Lktq0aaORI0dq2LBhCggIKLWiAQBAxVCkwbStW7fW66+/rjlz5mjXrl06cuSIsrOzVbNmTcXExKhmzZqlVScAAKiAPPrWT6VKlRQTE6OYmJgSLgcAAOB/+NYPAACwLIIKAACwLIIKAACwLIIKAACwrGIFlYMHD2r16tXOC8EZY0qkKAAAAMnDoHL8+HH16NFDTZs21bXXXqu0tDRJ0ogRI3T//feXaIEAAKDi8iio3HfffapcubJ+/PFHVa1a1Tl/8ODBWrVqVYkVBwAAKjaPrqPy6aefavXq1apfv77L/CZNmujIkSMlUhgAAIBHR1SysrJcjqScc+LECdnt9mIXBQAAIHkYVLp06aK33nrLOW2z2VRQUKBp06apW7duJVYcAACo2Dw69TNt2jR1795d33zzjc6cOaMHH3xQ3377rU6cOKGNGzeWdI0AAKCC8uiISsuWLXXgwAF17txZ/fv3V1ZWlgYMGKDt27fr8ssvL+kaAQBABeXRERVJCgkJ0cMPP1yStQAAALjwOKjk5ORo165dSk9PV0FBgcuyfv36FbswAAAAj4LKqlWrNHToUP33v/8ttMxmsyk/P7/YhQEAAHg0RmXc
uHG6+eablZaWpoKCApcHIQUAAJQUj4LKL7/8ooSEBNWuXbuk6wEAAHDyKKgMHDhQKSkpJVwKAACAK4/GqMycOVM333yz1q9fryuvvFJVqlRxWT5+/PgSKQ4AAFRsHgWVxYsX69NPP1VAQIBSUlJks9mcy2w2G0EFAACUCI+CysMPP6zk5GQ99NBDqlTJo7NHAAAAl+RRUDlz5owGDx5MSAH+Ii0tTWlpaV5rLyIiQhEREV5rDwC8zaOgEh8fryVLluif//xnSdcDlGlz5sxRcnKy19pLSkrS5MmTvdYeAHibR0ElPz9f06ZN0+rVq9WqVatCg2mff/75EikOKGtGjx5dpCszZ2dnq3PnzpKkDRs2KDAwsEjtcTQFQHnnUVDZvXu32rRpI0nas2ePy7I/D6wFiqusnUop6vZZWVnO5zExMQoKCvK4bQAojzwKKmvXri3pOoDz4lQKAFRsHt+UEPAGTqUAQMXmdlAZMGCAFixYIIfDoQEDBlx03eXLlxe7MFQ8M9YcuMCSam7vIzf7f99EW388SPbAqkUr4vgpac8p5+R9PZsWbXsAQIlyO6iEhIQ4x5+EhISUWkHAn2UcT1fGiV/dXv9Mbo7z+U+p++RvDyhSe47QWnKEhRdpGwBA6XE7qMyfP1+PP/64Jk6cqPnz55dmTYDTVx8v0advz/Ro25kJtxV5m163j1Xc0HEetQcAKHlFGqOSnJysu+++W1WrFvFwOuChTtcNVsvYa7zWniO0ltfaAgBcWpGCijGmtOoAzssRFs6pGACowIp8DXyukwIAALylyF9Pbtq06SXDyokTJzwuCAAA4JwiB5Xk5GS+9QNI0topxd9H9pn/PV/3rBToX7z9dUss3vYAYDFFDiq33HKLwsMZMwAAAEpfkcaoMD4FAAB4U5GCCt/6AQAA3lSkoFJQUFCip32mTJmiv/3tbwoODlZ4eLhuuOEG7d+/v8T2DwAAyrYifz25JK1bt05jxozRpk2btGbNGuXl5alXr17KysryZVkAAMAifHr35FWrVrlML1iwQOHh4dq6dav+/ve/+6gqAABgFT4NKn918uRJSVJoaOh5l+fm5io3N9c5nZGR4ZW6AACAb/j01M+fFRQU6N5779VVV12lli1bnnedKVOmKCQkxPmIjIz0cpUAAMCbLBNUxowZoz179uidd9654DqJiYk6efKk83H06FEvVggAALzNEqd+xo4dq48++khffvml6tevf8H17Ha77Ha7FysDAAC+5NOgYozRuHHj9P777yslJUWNGjXyZTkAAMBifBpUxowZo0WLFunDDz9UcHCwjh07JkkKCQlRYGCgL0sDPJJ2PENpx0+5vX52bp7z+Y6DPyvQXqVI7UWEBSsizFGkbQCgLPFpUJk9e7YkqWvXri7z58+fr2HDhnm/IKCY5vzfZiW/9YVH23ae8FqRt0kaeo0mD+vhUXsAUBb4/NQPUJ6M7ttB/TpFe629iLBgr7UFAL5gicG0QHkREebgVAwAlCDLfD0ZAADgrwgqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsir7uoCyJC0tTWlpaV5rLyIiQhEREV5rDwAAqyGoFMGcOXOUnJzstfaSkpI0efJkr7UHAIDVEFSKYPTo0erXr5/b62dnZ6tz586SpA0bNigwMLBI7XE0BQBQ0RFUiqCop2KysrKcz2NiYhQUFFQaZQEAUG4xmBYAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgW11G5iBlrDhRr+9zs087nL3/+veyBVYtbku7r2bTY+wAAoKzgiAoAALAsggoAALAsggoAALAsggoAALAsggoAALAsggoAALAsggoAALAsggoAALAsLvhWBBnH05Vx4le31z+Tm+N8/lPqPvnbA4rUniO0lhxh4UXaBgCA8oSgUgRffbxEn74906NtZybcVuRtet0+VnFDx3nUHgAA5QFBpQg6XTdYLWOv8Vp7jtBaXmsLAAArIqgUgSMsnFMxAAB4EYNpAQCAZRFUAACAZRFUAACAZRFUAACAZTGYtpxLS0tTWlqa19qLiIhQRESE19oDAJRvBJVybs6cOUpOTvZae0lJSZo8ebLX2gMAlG8ElfJk7ZRCs26od1xN/jnI7V3knjmrEc8ulyTNnThAdv+ifURa1DvuWke3xCJtDwDAn/k0qHz55ZeaPn26tm7dqrS0NL3//vu64YYbfFlSufPBhr1KfusLj7Y9F1iKImnoNYppXNej9gAA+CufBpWsrCy1bt1aw4cP14ABA3xZSrk1um8H9esU7bX2IsKCvdYWAKD882lQ6dOnj/r06ePLEsq9iDCHIsIcvi4DAACPlKkxKrm5ucrNzXVOZ2Rk+LAaAABQ2srUdVSmTJmikJAQ5yMyMtLXJQEAgFJUpoJKYmKiTp486XwcPXrU1yUBAIBSVKZO/djtdtntdl+XAQAAvKRMHVEBAAAVi0+PqGRmZurgwYPO6UOHDmnHjh0KDQ1VVFSUDysDAABW4NOg8s0336hbt27O6YSEBElSfHy8FixY4KOqAACAVfg0qHTt2lXGGF+WAAAALIwxKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIsEVRmzZqlhg0bKiAgQB07dtTmzZt9XRIAALAAnweVJUuWKCEhQUlJSdq2bZtat26t3r17Kz093delAQAAH/N5UHn++ec1atQo3XnnnWrevLleffVVVa1aVfPmzfN1aQAAwMcq+7LxM2fOaOvWrUpMTHTOq1Spknr06KF///vfhdbPzc1Vbm6uc/rkyZOSpIyMjFKpLycrs1T2WxwXfa1ZOd4rxF1FeG/o7xJQnvtbos9LAZ9x7yrvn3H3d/nHPo0xl17Z+NBPP/1kJJmvvvrKZf4DDzxgOnToUGj
9pKQkI4kHDx48ePDgUQ4eR48evWRW8OkRlaJKTExUQkKCc7qgoEAnTpxQWFiYbDabDyvzvoyMDEVGRuro0aNyOBy+LqdCoM+9i/72PvrcuypyfxtjdOrUKdWtW/eS6/o0qNSsWVN+fn765ZdfXOb/8ssvqlOnTqH17Xa77Ha7y7zq1auXZomW53A4KtwH3Nfoc++iv72PPveuitrfISEhbq3n08G0/v7+ateunT7//HPnvIKCAn3++eeKjY31YWUAAMAKfH7qJyEhQfHx8Wrfvr06dOigF154QVlZWbrzzjt9XRoAAPAxnweVwYMH69dff9Vjjz2mY8eOKSYmRqtWrVLt2rV9XZql2e12JSUlFToVhtJDn3sX/e199Ll30d/usRnjzneDAAAAvM/nF3wDAAC4EIIKAACwLIIKAACwLIKKjxw+fFg2m007duy44DoLFiyo8NeJ8RT96130t7VZ6f1xp5aywEp96imbzaYPPvjA12VcEkHFC4YNG6YbbrihyNsNHjxYBw4cKFbb536Yzvd47733nOv9+OOPuu6661S1alWFh4frgQce0NmzZ4vVtrf4sn8l6bXXXlPXrl3lcDhks9n0+++/n3e9jz/+WB07dlRgYKBq1KhRqOay8h74ur/PMcaoT58+5/1lO378eLVr1052u10xMTHn3X7Xrl3q0qWLAgICFBkZqWnTppVYbb7k6/fH3Z+HsqSs9+nUqVNls9l07733FrsWXyCoWFhgYKDCw8OLtY/IyEilpaW5PJKTk1WtWjX16dNHkpSfn6/rrrtOZ86c0VdffaU333xTCxYs0GOPPVYSL8OySqJ/Jen06dOKi4vTP//5zwuus2zZMt1xxx268847tXPnTm3cuFG33Xabc3lFeA9Kqr/PeeGFFy5664zhw4dr8ODB512WkZGhXr16qUGDBtq6daumT5+uyZMn67XXXiux+soab/48VBRW6NMtW7Zozpw5atWqVbHr8JkSubtgOXf11VebMWPGmDFjxhiHw2HCwsLMI488YgoKCkxycrJp0aJFoW1at25tHnnkkfPeSHHt2rXm0KFDRpJZtmyZ6dq1qwkMDDStWrVyuUHj/PnzTUhIiHP64MGDpl+/fiY8PNwEBQWZ9u3bmzVr1hT59cTExJjhw4c7pz/55BNTqVIlc+zYMee82bNnG4fDYXJzc40xf9wQsnXr1mbu3LkmMjLSBAUFmXvuucecPXvWPPPMM6Z27dqmVq1a5sknnyxyPeWlf9euXWskmd9++81lfl5enqlXr5554403LritN9+D8tDf27dvN/Xq1TNpaWlGknn//ffPu965PvurV155xdSoUcPZt8YYM2nSJNOsWTPndHx8vOnfv7956qmnTHh4uAkJCTHJyckmLy/PTJw40dSoUcPUq1fPzJs3z62a3VUe3h9jLvzzcK6WxYsXm9jYWGO3202LFi1MSkpKkfvKXeW9Ty/k1KlTpkmTJmbNmjXm6quvNhMmTHBZLsm88sorJi4uzgQEBJhGjRqZ9957z+16vIWg4oarr77aVKtWzUyYMMF899135u233zZVq1Y1r732mjl69KipVKmS2bx5s3P9bdu2GZvNZlJTU82pU6fMoEGDTFxcnElLSzNpaWkmNzfX+SG/4oorzEcffWT2799vBg4caBo0aGDy8vKMMYU/5Dt27DCvvvqq2b17tzlw4IB55JFHTEBAgDly5Ijbr+Wbb74xkszGjRud8x599NFCv8x/+OEHI8ls27bNGPPHL/xq1aqZgQMHmm+//dasWLHC+Pv7m969e5tx48aZ7777zsybN89IMps2baqQ/XuhXyJff/21kWTmzZtnYmJiTJ06dUxcXJzZvXu3cx1vvgdlvb+zsrJMdHS0+eCDD4wxxqOgcscdd5j+/fu7zPviiy+MJHPixAljzB9BJTg42IwZM8Z89913Zu7cuUaS6d27t3nqqafMgQMHzBNPPGGqVKni1h1g3VXW359zLhVU6tevb5YuXWr27t1rRo4caYKDg81///vfYvff+ZT3Pr2QoUOHmnvvvdfZB+cLKmFhYeb11183+/fvN4888ojx8/Mze/fudWv/3kJQccPVV19toqOjTUFBgXPepEmTTHR0tDHGmD59+ph77rnHuWzcuHGma9euzulz/5n92bkP+Z//y/7222+NJLNv3z5jTOEP+fm0aNHCvPzyy26/lnvuucdZ9zmjRo0yvXr1cpmXlZVlJJlPPvnEGPPHL/yqVauajIwM5zq9e/c2DRs2NPn5+c55zZo1M1OmTHG7HmPKT/9e6JfI4sWLjSQTFRVlli5dar755htz6623mrCwMHP8+HFjjHffg7Le33fddZcZMWKEc9qToNKzZ09z1113ucw7V++5X9Lx8fGmQYMGhfq2S5cuzumzZ8+aoKAgs3jx4ovWXBRl/f0551JBZerUqc55eXl5pn79+uaZZ55xa99FVd779HwWL15sWrZsabKzs40xFw4qd999t8u8jh07uvSFFTBGxU3/7//9P5fz4bGxsfr++++Vn5+vUaNGafHixcrJydGZM2e0aNEiDR8+3K39/vm8YUREhCQpPT39vOtmZmZq4sSJio6OVvXq1VWtWjXt27dPP/74oyTp6aefVrVq1ZyPc/PPyc7O1qJFizRixIgivfZzGjZsqODgYOd07dq11bx5c1WqVMll3oXqv5jy0L8XUlBQIEl6+OGHddNNN6ldu3aaP39+oQHN7iip96Cs9veKFSv0xRdf6IUXXnCrnuJq0aJFob698sorndN+fn4KCwvz6DN/MWX1/SmKP994tnLlymrfvr327dtXpH0URUXo03OOHj2qCRMmaOHChQoICLjoun+9AXBsbGypvg+e8Pm9fsqDvn37ym636/3335e/v7/y8vI0cOBAt7atUqWK8/m5H6Jzf9j+auLEiVqzZo2effZZNW7cWIGBgRo4cKDOnDkjSbr77rs1aNAg5/p169Z12X7p0qU6ffq0hg4d6jK/Tp062rx5s8u8X375xbnsfLWeq/d88y5Uv6fKSv9eyLlfXs2bN3fOs9vtuuyyy5y/iKz0Hli5v59//nmlpqYW+srnTTfdpC5duiglJcWtOuvUqePs33Os9Jm/GCu/P2VVeevTrVu3Kj09XW3btnXOy8/P15dffqmZM2cqNzdXfn5+Hu3bFwgqbvr6669dpjdt2qQmTZo43+z4+HjNnz9f/v7+uuWWWxQYGOhc19/fX/n5+cWuYePGjRo2bJhuvPFGSX+k88OHDzuXh4aGKjQ09ILbz507V/369VOtWrVc5sfGxuqpp55Senq6c4T6mjVr5HA4XP64lqby0L8Xcu5rsvv371fnzp0lSXl5eTp8+LAaNGggyfvvQVnt74ceekgjR450mXfllVdqxowZ6tu3r9ttx8bG6uGHH1ZeXp7zD82aNWvUrFkz1ahRw8NXVHLK6vtTFJs2bdLf//53SdLZs2e1detWjR07tlg1X0xF6NNzunfvrt27d7vMu/POO3XFFVdo0qRJLiFl06ZNLv+8btq0SW3atCl2DSWJoO
KmH3/8UQkJCRo9erS2bduml19+Wc8995xz+ciRIxUdHS3pjw/jnzVs2FCrV6/W/v37FRYWppCQEI9qaNKkiZYvX66+ffvKZrPp0Ucfdfs/uYMHD+rLL7/UJ598UmhZr1691Lx5c91xxx2aNm2ajh07pkceeURjxozx2l09y3L/Hjt2TMeOHdPBgwclSbt371ZwcLCioqIUGhoqh8Ohu+++W0lJSYqMjFSDBg00ffp0SdLNN98syfvvQVnt7zp16rgc8TgnKipKjRo1ck4fPHhQmZmZOnbsmLKzs50X5WrevLn8/f112223KTk5WSNGjNCkSZO0Z88evfjii5oxY4ZHr6WkldX3R7r0z8M5s2bNUpMmTRQdHa0ZM2bot99+c/t0iyfKe592795dN954o8aOHavg4GC1bNnSZR9BQUEKCwsrNP+9995T+/bt1blzZy1cuFCbN2/W3LlzPXp9pYWg4qahQ4cqOztbHTp0kJ+fnyZMmKC77rrLubxJkybq1KmTTpw4oY4dO7psO2rUKKWkpKh9+/bKzMzU2rVr1bBhwyLX8Pzzz2v48OHq1KmTatasqUmTJikjI8OtbefNm6f69eurV69ehZb5+fnpo48+0j333KPY2FgFBQUpPj5ejz/+eJFr9FRZ7t9XX31VycnJzulz/yXOnz9fw4YNkyRNnz5dlStX1h133KHs7Gx17NhRX3zxhfO/d2+/B2W5v90xcuRIrVu3zjl97j/EQ4cOqWHDhgoJCdGnn36qMWPGqF27dqpZs6Yee+wxlz7wpbL8/rjz8yD9cRGyqVOnaseOHWrcuLFWrFihmjVrFrlOd5X3Pk1NTdV///vfIteUnJysd955R//4xz8UERGhxYsXe+1Iutt8PZq3LDjfaOm/KigoMJdffrl57rnnvFNUOUL/ehf9bW28PyWPPi3bOKJSAn799Ve98847OnbsmO68805fl1Pu0L/eRX9bG+9PyaNPrY2gUgLCw8NVs2ZNvfbaa5YYiFfe0L/eRX9bG+9PyaNPrc1mjDG+LgIAAOB8uOAbAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKgDIlJSVFNptNv//+u9vbNGzY0Gt3XAZQsggqAErUsGHDZLPZdPfddxdaNmbMGNlsNpdLqQPAxRBUAJS4yMhIvfPOO8rOznbOy8nJ0aJFixQVFeXDygCUNQQVACWubdu2ioyM1PLly53zli9frqioKJdbyOfm5mr8+PEKDw9XQECAOnfurC1btrjs65NPPlHTpk0VGBiobt266fDhw4Xa27Bhg7p06aLAwEBFRkZq/PjxysrKKrXXB8B7CCoASsXw4cM1f/585/S8efMK3UflwQcf1LJly/Tmm29q27Ztaty4sXr37q0TJ05Iko4ePaoBAwaob9++2rFjh0aOHKmHHnrIZR+pqamKi4vTTTfdpF27dmnJkiXasGGDxo4dW/ovEkCpI6gAKBW33367NmzYoCNHjujIkSPauHGjbr/9dufyrKwszZ49W9OnT1efPn3UvHlzvf766woMDNTcuXMlSbNnz9bll1+u5557Ts2aNdOQIUMKjW+ZMmWKhgwZonvvvVdNmjRRp06d9NJLL+mtt95STk6ON18ygFLATQkBlIpatWrpuuuu04IFC2SM0XXXXaeaNWs6l6empiovL09XXXWVc16VKlXUoUMH7du3T5K0b98+dezY0WW/sbGxLtM7d+7Url27tHDhQuc8Y4wKCgp06NAhRUdHl8bLA+AlBBUApWb48OHOUzCzZs0qlTYyMzM1evRojR8/vtAyBu4CZR9BBUCpiYuL05kzZ2Sz2dS7d2+XZZdffrn8/f21ceNGNWjQQJKUl5enLVu26N5775UkRUdHa8WKFS7bbdq0yWW6bdu22rt3rxo3blx6LwSAzzBGBUCp8fPz0759+7R37175+fm5LAsKCtI999yjBx54QKtWrdLevXs1atQonT59WiNGjJAk3X333fr+++/1wAMPaP/+/Vq0aJEWLFjgsp9Jkybpq6++0tixY7Vjxw59//33+vDDDxlMC5QTBBUApcrhcMjhcJx32dSpU3XTTTfpjjvuUNu2bXXw4EGtXr1aNWrUkPTHqZtly5bpgw8+UOvWrfXqq6/q6aefdtlHq1attG7dOh04cEBdunRRmzZt9Nhjj6lu3bql/toAlD6bMcb4uggAAIDz4YgKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwrP8POdGutrnzE2MAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create the figure and axes objects\n", + "fig, ax = plt.subplots()\n", + "# Create the bar plot with error bars\n", + "ax.bar(\n", + " np.arange(len(ds_means)) - 0.24,\n", + " ds_means, yerr=ds_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Deepspeed')\n", + "ax.bar(\n", + " np.arange(len(hf_means)) + 0.24,\n", + " hf_means, yerr=hf_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Huggingface')\n", + "\n", + "# Set the x-axis tick labels to be the index of the values list\n", + "ax.set_xticks(np.arange(len(models)))\n", + "ax.set_xticklabels(models)\n", + "\n", + "# Set the labels and title\n", + "ax.set_xlabel('Model')\n", + "ax.set_ylabel('Time (ms)')\n", + "\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b45dcc00-a07d-435a-b424-ce5abc3ff23a", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'data/gpt2-vocab.json'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 14\u001b[0m\n\u001b[1;32m 9\u001b[0m neox_args \u001b[39m=\u001b[39m NeoXArgs\u001b[39m.\u001b[39mfrom_ymls(\n\u001b[1;32m 10\u001b[0m [\u001b[39m\"\u001b[39m\u001b[39m/home/mchorse/gpt-neox/configs/19M.yml\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 11\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m/home/mchorse/gpt-neox/configs/local_setup.yml\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 12\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m/home/mchorse/gpt-neox/configs/benchmarking.yml\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 13\u001b[0m neox_args\u001b[39m.\u001b[39mconfigure_distributed_args()\n\u001b[0;32m---> 14\u001b[0m neox_args\u001b[39m.\u001b[39;49mbuild_tokenizer()\n", + "File \u001b[0;32m~/gpt-neox/megatron/neox_arguments/arguments.py:147\u001b[0m, in \u001b[0;36mNeoXArgs.build_tokenizer\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mbuild_tokenizer\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 147\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenizer \u001b[39m=\u001b[39m build_tokenizer(\u001b[39mself\u001b[39;49m)\n", + "File \u001b[0;32m~/gpt-neox/megatron/tokenizer/tokenizer.py:40\u001b[0m, in \u001b[0;36mbuild_tokenizer\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[39massert\u001b[39;00m args\u001b[39m.\u001b[39mvocab_file \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[39massert\u001b[39;00m args\u001b[39m.\u001b[39mmerge_file \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m---> 40\u001b[0m tokenizer \u001b[39m=\u001b[39m _GPT2BPETokenizer(args\u001b[39m.\u001b[39;49mvocab_file, args\u001b[39m.\u001b[39;49mmerge_file)\n\u001b[1;32m 41\u001b[0m \u001b[39melif\u001b[39;00m args\u001b[39m.\u001b[39mtokenizer_type\u001b[39m.\u001b[39mlower() \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mSPMTokenizer\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mlower():\n\u001b[1;32m 42\u001b[0m \u001b[39massert\u001b[39;00m args\u001b[39m.\u001b[39mvocab_file \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n", + "File 
\u001b[0;32m~/gpt-neox/megatron/tokenizer/tokenizer.py:157\u001b[0m, in \u001b[0;36m_GPT2BPETokenizer.__init__\u001b[0;34m(self, vocab_file, merge_file)\u001b[0m\n\u001b[1;32m 154\u001b[0m name \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mGPT2 BPE\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 155\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__init__\u001b[39m(name)\n\u001b[0;32m--> 157\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenizer \u001b[39m=\u001b[39m GPT2Tokenizer(\n\u001b[1;32m 158\u001b[0m vocab_file, merge_file, errors\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mreplace\u001b[39;49m\u001b[39m\"\u001b[39;49m, special_tokens\u001b[39m=\u001b[39;49m[], max_len\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m\n\u001b[1;32m 159\u001b[0m )\n\u001b[1;32m 160\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39meod_id \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenizer\u001b[39m.\u001b[39mencoder[\u001b[39m\"\u001b[39m\u001b[39m<|endoftext|>\u001b[39m\u001b[39m\"\u001b[39m]\n", + "File \u001b[0;32m~/gpt-neox/megatron/tokenizer/gpt2_tokenization.py:188\u001b[0m, in \u001b[0;36mGPT2Tokenizer.__init__\u001b[0;34m(self, vocab_file, merges_file, errors, special_tokens, max_len)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[1;32m 180\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 181\u001b[0m vocab_file,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 185\u001b[0m max_len\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 186\u001b[0m ):\n\u001b[1;32m 187\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmax_len \u001b[39m=\u001b[39m max_len \u001b[39mif\u001b[39;00m max_len \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mint\u001b[39m(\u001b[39m1e12\u001b[39m)\n\u001b[0;32m--> 188\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mencoder \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(\u001b[39mopen\u001b[39;49m(vocab_file))\n\u001b[1;32m 189\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecoder \u001b[39m=\u001b[39m {v: k \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mencoder\u001b[39m.\u001b[39mitems()}\n\u001b[1;32m 190\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merrors \u001b[39m=\u001b[39m errors \u001b[39m# how to handle errors in decoding\u001b[39;00m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/gpt2-vocab.json'" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.insert(0, '/home/mchorse/gpt-neox')\n", + "from megatron.text_generation_utils import generate_samples_from_prompt\n", + "from megatron.utils import print_rank_0, setup_for_inference_or_eval\n", + "from megatron.neox_arguments import NeoXArgs\n", + "from megatron.training import setup_model_and_optimizer\n", + "from megatron.initialize import initialize_megatron\n", + "\n", + "neox_args = NeoXArgs.from_ymls(\n", + " [\"/home/mchorse/gpt-neox/configs/19M.yml\",\n", + " \"/home/mchorse/gpt-neox/configs/local_setup.yml\",\n", + " \"/home/mchorse/gpt-neox/configs/benchmarking.yml\"])\n", + "neox_args.configure_distributed_args()\n", + "neox_args.build_tokenizer()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f1833a4c", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "unsupported operand type(s) for %: 'NoneType' and 'int'", + 
"output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m use_cache \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 2\u001b[0m initialize_megatron(neox_args)\n\u001b[0;32m----> 3\u001b[0m model, _, _ \u001b[39m=\u001b[39m setup_model_and_optimizer(\n\u001b[1;32m 4\u001b[0m neox_args\u001b[39m=\u001b[39;49mneox_args,\n\u001b[1;32m 5\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 6\u001b[0m iteration\u001b[39m=\u001b[39;49mneox_args\u001b[39m.\u001b[39;49miteration,\n\u001b[1;32m 7\u001b[0m ) \u001b[39m# we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed\u001b[39;00m\n\u001b[1;32m 8\u001b[0m print_rank_0(\u001b[39m\"\u001b[39m\u001b[39mFinished loading model\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 10\u001b[0m model\u001b[39m.\u001b[39mmodule\u001b[39m.\u001b[39minference_mode(use_cache\u001b[39m=\u001b[39muse_cache)\n", + "File \u001b[0;32m~/gpt-neox/megatron/training.py:600\u001b[0m, in \u001b[0;36msetup_model_and_optimizer\u001b[0;34m(neox_args, use_cache, iteration)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39msetup_model_and_optimizer\u001b[39m(neox_args, use_cache\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, iteration\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 599\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Setup model and optimizer.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m model \u001b[39m=\u001b[39m get_model(neox_args\u001b[39m=\u001b[39;49mneox_args, use_cache\u001b[39m=\u001b[39;49muse_cache)\n\u001b[1;32m 601\u001b[0m optimizer, param_groups \u001b[39m=\u001b[39m get_optimizer(model\u001b[39m=\u001b[39mmodel, neox_args\u001b[39m=\u001b[39mneox_args)\n\u001b[1;32m 602\u001b[0m lr_scheduler \u001b[39m=\u001b[39m get_learning_rate_scheduler(optimizer\u001b[39m=\u001b[39moptimizer, neox_args\u001b[39m=\u001b[39mneox_args)\n", + "File \u001b[0;32m~/gpt-neox/megatron/training.py:388\u001b[0m, in \u001b[0;36mget_model\u001b[0;34m(neox_args, use_cache)\u001b[0m\n\u001b[1;32m 386\u001b[0m old_use_mup \u001b[39m=\u001b[39m neox_args\u001b[39m.\u001b[39muse_mup\n\u001b[1;32m 387\u001b[0m neox_args\u001b[39m.\u001b[39muse_mup \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[0;32m--> 388\u001b[0m model \u001b[39m=\u001b[39m GPT2ModelPipe(\n\u001b[1;32m 389\u001b[0m neox_args\u001b[39m=\u001b[39;49mneox_args,\n\u001b[1;32m 390\u001b[0m num_tokentypes\u001b[39m=\u001b[39;49m\u001b[39m0\u001b[39;49m,\n\u001b[1;32m 391\u001b[0m parallel_output\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 392\u001b[0m topology\u001b[39m=\u001b[39;49mmpu\u001b[39m.\u001b[39;49mget_topology(),\n\u001b[1;32m 393\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 394\u001b[0m )\n\u001b[1;32m 396\u001b[0m \u001b[39m### soft prompt tuning stuff ###\u001b[39;00m\n\u001b[1;32m 397\u001b[0m \u001b[39mif\u001b[39;00m neox_args\u001b[39m.\u001b[39msoft_prompt_tuning \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m neox_args\u001b[39m.\u001b[39msoft_prompt_tuning\u001b[39m.\u001b[39mget(\n\u001b[1;32m 398\u001b[0m \u001b[39m\"\u001b[39m\u001b[39menabled\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 399\u001b[0m ):\n", + "File 
\u001b[0;32m~/gpt-neox/megatron/model/gpt2_model.py:123\u001b[0m, in \u001b[0;36mGPT2ModelPipe.__init__\u001b[0;34m(self, neox_args, num_tokentypes, parallel_output, topology, use_cache)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mspecs \u001b[39m=\u001b[39m []\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minit_specs() \u001b[39m# initializes the layer specs (basically a fancy nn.Sequential)\u001b[39;00m\n\u001b[0;32m--> 123\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__init__\u001b[39;49m(\n\u001b[1;32m 124\u001b[0m layers\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mspecs,\n\u001b[1;32m 125\u001b[0m loss_fn\u001b[39m=\u001b[39;49mpartial(cross_entropy, _fp16\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mfp16_lm_cross_entropy),\n\u001b[1;32m 126\u001b[0m topology\u001b[39m=\u001b[39;49mtopology,\n\u001b[1;32m 127\u001b[0m activation_checkpoint_interval\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mcheckpoint_num_layers\n\u001b[1;32m 128\u001b[0m \u001b[39mif\u001b[39;49;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mcheckpoint_activations\n\u001b[1;32m 129\u001b[0m \u001b[39melse\u001b[39;49;00m \u001b[39m0\u001b[39;49m,\n\u001b[1;32m 130\u001b[0m partition_method\u001b[39m=\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mpipe_partition_method,\n\u001b[1;32m 131\u001b[0m checkpointable_layers\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mGMLPBlock\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mParallelTransformerLayerPipe\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[1;32m 132\u001b[0m )\n", + "File \u001b[0;32m~/DeepSpeed/deepspeed/runtime/pipe/module.py:196\u001b[0m, in \u001b[0;36mPipelineModule.__init__\u001b[0;34m(self, layers, num_stages, topology, loss_fn, seed_layers, seed_fn, base_seed, partition_method, activation_checkpoint_interval, activation_checkpoint_func, checkpointable_layers)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtied_weight_attrs \u001b[39m=\u001b[39m {}\n\u001b[1;32m 191\u001b[0m \u001b[39m# Offset the random seed by the stage ID.\u001b[39;00m\n\u001b[1;32m 192\u001b[0m \u001b[39m#newseed = get_accelerator().initial_seed() + self._grid.get_stage_id()\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[39m#ds_utils.set_random_seed(newseed)\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \n\u001b[1;32m 195\u001b[0m \u001b[39m#with torch.random.fork_rng(devices=[get_accelerator().current_device_name()]):\u001b[39;00m\n\u001b[0;32m--> 196\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_build()\n\u001b[1;32m 197\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mto(get_accelerator()\u001b[39m.\u001b[39mdevice_name(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlocal_rank))\n\u001b[1;32m 199\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtied_comms \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_index_tied_modules()\n", + "File \u001b[0;32m~/DeepSpeed/deepspeed/runtime/pipe/module.py:243\u001b[0m, in \u001b[0;36mPipelineModule._build\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[39m# LayerSpec objects contain an nn.Module that should be allocated now.\u001b[39;00m\n\u001b[1;32m 242\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(layer, 
LayerSpec):\n\u001b[0;32m--> 243\u001b[0m module \u001b[39m=\u001b[39m layer\u001b[39m.\u001b[39;49mbuild()\n\u001b[1;32m 244\u001b[0m name \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(layer_idx)\n\u001b[1;32m 245\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mforward_funcs\u001b[39m.\u001b[39mappend(module)\n", + "File \u001b[0;32m~/DeepSpeed/deepspeed/runtime/pipe/module.py:70\u001b[0m, in \u001b[0;36mLayerSpec.build\u001b[0;34m(self, log)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[39mif\u001b[39;00m log:\n\u001b[1;32m 68\u001b[0m logger\u001b[39m.\u001b[39minfo(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mRANK=\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mglobal_rank\u001b[39m}\u001b[39;00m\u001b[39m building \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mrepr\u001b[39m(\u001b[39mself\u001b[39m)\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 70\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtypename(\u001b[39m*\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodule_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodule_kwargs)\n", + "File \u001b[0;32m~/gpt-neox/megatron/model/word_embeddings.py:58\u001b[0m, in \u001b[0;36mEmbedding.__init__\u001b[0;34m(self, neox_args, hidden_size, vocab_size, max_sequence_length, embedding_dropout_prob, init_method, num_tokentypes, use_pos_emb)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmup_rp_embedding_mult \u001b[39m=\u001b[39m neox_args\u001b[39m.\u001b[39mmup_rp_embedding_mult\n\u001b[1;32m 57\u001b[0m \u001b[39m# Word embeddings (parallel).\u001b[39;00m\n\u001b[0;32m---> 58\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mword_embeddings \u001b[39m=\u001b[39m mpu\u001b[39m.\u001b[39;49mVocabParallelEmbedding(\n\u001b[1;32m 59\u001b[0m neox_args\u001b[39m=\u001b[39;49mneox_args,\n\u001b[1;32m 60\u001b[0m num_embeddings\u001b[39m=\u001b[39;49mvocab_size,\n\u001b[1;32m 61\u001b[0m embedding_dim\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhidden_size,\n\u001b[1;32m 62\u001b[0m init_method\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49minit_method,\n\u001b[1;32m 63\u001b[0m )\n\u001b[1;32m 64\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_word_embeddings_key \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mword_embeddings\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 66\u001b[0m \u001b[39mif\u001b[39;00m neox_args\u001b[39m.\u001b[39muse_bnb_optimizer:\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/layers.py:126\u001b[0m, in \u001b[0;36mVocabParallelEmbedding.__init__\u001b[0;34m(self, neox_args, num_embeddings, embedding_dim, init_method)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel_parallel_size \u001b[39m=\u001b[39m get_model_parallel_world_size()\n\u001b[1;32m 122\u001b[0m \u001b[39m# Divide the weight matrix along the vocabulary dimension.\u001b[39;00m\n\u001b[1;32m 123\u001b[0m (\n\u001b[1;32m 124\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_start_index,\n\u001b[1;32m 125\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_end_index,\n\u001b[0;32m--> 126\u001b[0m ) \u001b[39m=\u001b[39m VocabUtility\u001b[39m.\u001b[39;49mvocab_range_from_global_vocab_size(\n\u001b[1;32m 127\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnum_embeddings, get_model_parallel_rank(), 
\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel_parallel_size\n\u001b[1;32m 128\u001b[0m )\n\u001b[1;32m 129\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnum_embeddings_per_partition \u001b[39m=\u001b[39m (\n\u001b[1;32m 130\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_end_index \u001b[39m-\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_start_index\n\u001b[1;32m 131\u001b[0m )\n\u001b[1;32m 132\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minit_method \u001b[39m=\u001b[39m init_method\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/utils.py:71\u001b[0m, in \u001b[0;36mVocabUtility.vocab_range_from_global_vocab_size\u001b[0;34m(global_vocab_size, rank, world_size)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[39m@staticmethod\u001b[39m\n\u001b[1;32m 70\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mvocab_range_from_global_vocab_size\u001b[39m(global_vocab_size, rank, world_size):\n\u001b[0;32m---> 71\u001b[0m per_partition_vocab_size \u001b[39m=\u001b[39m divide(global_vocab_size, world_size)\n\u001b[1;32m 72\u001b[0m \u001b[39mreturn\u001b[39;00m VocabUtility\u001b[39m.\u001b[39mvocab_range_from_per_partition_vocab_size(\n\u001b[1;32m 73\u001b[0m per_partition_vocab_size, rank, world_size\n\u001b[1;32m 74\u001b[0m )\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/utils.py:32\u001b[0m, in \u001b[0;36mdivide\u001b[0;34m(numerator, denominator)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdivide\u001b[39m(numerator, denominator):\n\u001b[1;32m 30\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Ensure that numerator is divisible by the denominator and return\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[39m the division value.\"\"\"\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m ensure_divisibility(numerator, denominator)\n\u001b[1;32m 33\u001b[0m \u001b[39mreturn\u001b[39;00m numerator \u001b[39m/\u001b[39m\u001b[39m/\u001b[39m denominator\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/utils.py:24\u001b[0m, in \u001b[0;36mensure_divisibility\u001b[0;34m(numerator, denominator)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mensure_divisibility\u001b[39m(numerator, denominator):\n\u001b[1;32m 23\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Ensure that numerator is divisible by the denominator.\"\"\"\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m \u001b[39massert\u001b[39;00m numerator \u001b[39m%\u001b[39;49m denominator \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m is not divisible by \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(\n\u001b[1;32m 25\u001b[0m numerator, denominator\n\u001b[1;32m 26\u001b[0m )\n", + "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for %: 'NoneType' and 'int'" + ] + } + ], + "source": [ + "use_cache = True\n", + "initialize_megatron(neox_args)\n", + "model, _, _ = setup_model_and_optimizer(\n", + " neox_args=neox_args,\n", + " use_cache=use_cache,\n", + " iteration=neox_args.iteration,\n", + " ) # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed\n", + "print_rank_0(\"Finished loading model\")\n", + "\n", + "model.module.inference_mode(use_cache=use_cache)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "59662052", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NeoXArgs(distributed_backend='nccl', local_rank=None, rank=None, lazy_mpu_init=False, short_seq_prob=0.1, eod_mask_loss=False, 
adlr_autoresume=False, adlr_autoresume_interval=1000, seed=1234, onnx_safe=False, deepscale=False, deepscale_config=None, deepspeed_mpi=False, deepspeed_slurm=False, user_script=None, iteration=None, do_train=None, do_valid=None, do_test=None, save_iters=[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], global_num_gpus=8, text_gen_type='unconditional', temperature=0.0, top_p=0.0, top_k=0, return_logits=False, maximum_tokens=64, prompt_end='\\n', sample_input_file=None, sample_output_file='samples.txt', num_samples=1, recompute=False, eval_results_prefix='', eval_tasks=None, use_wandb=True, wandb_group=None, wandb_team=None, wandb_project='neox', wandb_host='https://api.wandb.ai', wandb_init_all_ranks=False, git_hash='b0e9745', log_dir='logs', tensorboard_dir='tensorboard', log_interval=10, log_grad_pct_zeros=False, log_param_norm=False, log_grad_norm=False, log_optimizer_states=False, log_gradient_noise_scale=False, gradient_noise_scale_n_batches=5, gradient_noise_scale_cpu_offload=False, pipe_parallel_size=1, model_parallel_size=1, pipe_partition_method='type:transformer|mlp', world_size=None, is_pipe_parallel=False, data_path='data/enwik8/enwik8_text_document', use_shared_fs=True, train_data_paths=None, test_data_paths=None, valid_data_paths=None, train_data_weights=None, valid_data_weights=None, test_data_weights=None, weight_by_num_documents=False, weighted_sampler_alpha=0.3, data_impl='mmap', mmap_warmup=False, save='checkpoints', config_files={'19M.yml': '{\\n \"pipe-parallel-size\": 1,\\n \"model-parallel-size\": 1,\\n\\n # model settings\\n \"num-layers\": 6,\\n \"hidden-size\": 512,\\n \"num-attention-heads\": 8,\\n \"seq-length\": 2048,\\n \"max-position-embeddings\": 2048,\\n \"pos-emb\": \"rotary\",\\n \"no-weight-tying\": true,\\n \"gpt-j-residual\": false,\\n \"output-layer-parallelism\": \"column\",\\n\\n \"scaled-upper-triang-masked-softmax-fusion\": false,\\n \"bias-gelu-fusion\": false,\\n\\n # init methods\\n \"init_method\": \"small_init\",\\n \"output_layer_init_method\": \"wang_init\",\\n\\n \"optimizer\": {\\n \"type\": \"Adam\",\\n \"params\": {\\n \"lr\": 0.001,\\n \"betas\": [0.9, 0.95],\\n \"eps\": 1.0e-8,\\n }\\n },\\n \"min_lr\": 0.0001,\\n\\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\\n \"zero_optimization\": {\\n \"stage\": 1,\\n \"allgather_partitions\": True,\\n \"allgather_bucket_size\": 500000000,\\n \"overlap_comm\": True,\\n \"reduce_scatter\": True,\\n \"reduce_bucket_size\": 500000000,\\n 
\"contiguous_gradients\": True,\\n },\\n\\n \"train_micro_batch_size_per_gpu\": 4, #32,\\n \"gas\": 1,\\n \"data-impl\": \"mmap\",\\n \"num_workers\": 1,\\n\\n # activation checkpointing\\n \"checkpoint-activations\": true,\\n \"checkpoint-num-layers\": 1,\\n \"partition-activations\": true,\\n \"synchronize-each-layer\": true,\\n\\n # regularization\\n \"gradient_clipping\": 1.0,\\n \"weight-decay\": 0.1,\\n \"hidden-dropout\": 0,\\n \"attention-dropout\": 0,\\n\\n # precision settings\\n \"fp16\": {\\n \"fp16\": true,\\n \"enabled\": true,\\n \"loss_scale\": 0,\\n \"loss_scale_window\": 1000,\\n \"initial_scale_power\": 12,\\n \"hysteresis\": 2,\\n \"min_loss_scale\": 1,\\n },\\n\\n \"train-iters\": 143000,\\n \"lr-decay-iters\": 143000,\\n \"distributed-backend\": \"nccl\",\\n \"lr-decay-style\": \"cosine\",\\n \"warmup\": 0.01,\\n \"checkpoint-factor\": 1000,\\n \"eval-interval\": 100000,\\n \"eval-iters\": 10,\\n\\n \"log-interval\": 10,\\n \"steps_per_print\": 10,\\n \"wall_clock_breakdown\": true,\\n\\n # additional deepspeed args not specified above\\n \"deepspeed_extra_args\": {\\n \"comms_logger\": {\\n \"enabled\": true,\\n \"verbose\": true,\\n \"prof_all\": true,\\n \"debug\": false\\n },\\n }\\n\\n}\\n', 'local_setup.yml': '# Suggested data paths when using GPT-NeoX locally\\n{\\n \"data-path\": \"data/enwik8/enwik8_text_document\",\\n\\n # or for weighted datasets:\\n # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\\n # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\\n # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\\n # \"train-data-weights\": [1., 2.],\\n # \"test-data-weights\": [2., 1.],\\n # \"valid-data-weights\": [0.5, 0.4],\\n\\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\\n # WARNING: setting this to True will override any user provided weights\\n # \"weight_by_num_documents\": false,\\n # \"weighted_sampler_alpha\": 0.3,\\n\\n \"vocab-file\": \"data/gpt2-vocab.json\",\\n \"merge-file\": \"data/gpt2-merges.txt\",\\n\\n \"save\": \"checkpoints\",\\n \"load\": \"checkpoints\",\\n \"checkpoint_validation_with_forward_pass\": False,\\n\\n \"tensorboard-dir\": \"tensorboard\",\\n \"log-dir\": \"logs\",\\n \"use_wandb\": True,\\n \"wandb_host\": \"https://api.wandb.ai\",\\n \"wandb_project\": \"neox\"\\n}\\n'}, load='checkpoints', checkpoint_validation_with_forward_pass=False, checkpoint_scale='linear', checkpoint_factor=1000, extra_save_iters=None, no_save_optim=False, no_save_rng=False, no_load_optim=False, no_load_rng=False, finetune=False, batch_size=4, train_iters=143000, eval_iters=10, keep_last_n_checkpoints=None, eval_interval=100000, split='969, 30, 1', vocab_file='data/gpt2-vocab.json', merge_file='data/gpt2-merges.txt', num_workers=1, exit_interval=None, attention_dropout=0, hidden_dropout=0, weight_decay=0.1, checkpoint_activations=True, checkpoint_num_layers=1, deepspeed_activation_checkpointing=True, contiguous_checkpointing=False, checkpoint_in_cpu=False, synchronize_each_layer=True, profile_backward=False, partition_activations=True, gas=1, clip_grad=1.0, hysteresis=2, dynamic_loss_scale=True, loss_scale=None, loss_scale_window=1000.0, min_scale=1.0, char_level_ppl=False, use_mup=False, coord_check=False, save_base_shapes=False, base_shapes_file=None, mup_init_scale=1.0, 
mup_attn_temp=1.0, mup_output_temp=1.0, mup_embedding_mult=1.0, mup_rp_embedding_mult=1.0, mup_width_scale=2, tokenizer_type='GPT2BPETokenizer', padded_vocab_size=None, optimizer_type='Adam', use_bnb_optimizer=False, zero_stage=1, zero_reduce_scatter=True, zero_contiguous_gradients=True, zero_reduce_bucket_size=500000000, zero_allgather_bucket_size=500000000, lr=0.001, lr_decay_style='cosine', lr_decay_iters=143000, min_lr=0.0001, warmup=0.01, override_lr_scheduler=False, use_checkpoint_lr_scheduler=False, precision='fp16', num_layers=6, hidden_size=512, num_attention_heads=8, seq_length=2048, max_position_embeddings=2048, norm='layernorm', layernorm_epsilon=1e-05, rms_norm_epsilon=1e-08, scalenorm_epsilon=1e-08, pos_emb='rotary', rpe_num_buckets=32, rpe_max_distance=128, opt_pos_emb_offset=0, no_weight_tying=True, attention_config=['global', 'global', 'global', 'global', 'global', 'global'], sparsity_config={}, num_unique_layers=None, param_sharing_style='grouped', make_vocab_size_divisible_by=128, activation='gelu', scaled_upper_triang_masked_softmax_fusion=False, scaled_masked_softmax_fusion=False, bias_gelu_fusion=False, bias_dropout_fusion=False, fp16_lm_cross_entropy=False, init_method_std=0.02, apply_query_key_layer_scaling=False, use_cpu_initialization=False, attention_softmax_in_fp32=False, rotary_pct=1.0, rotary_emb_base=10000, init_method='small_init', output_layer_init_method='wang_init', gmlp_attn_dim=64, gpt_j_residual=False, gpt_j_tied=False, soft_prompt_tuning=None, output_layer_parallelism='column', deepspeed=True, train_batch_size=32, train_micro_batch_size_per_gpu=4, gradient_accumulation_steps=1, optimizer={'type': 'Adam', 'params': {'lr': 0.001, 'betas': [0.9, 0.95], 'eps': 1e-08}}, scheduler=None, fp32_allreduce=False, prescale_gradients=False, gradient_predivide_factor=1.0, sparse_gradients=False, fp16={'fp16': True, 'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 12, 'hysteresis': 2, 'min_loss_scale': 1}, amp=None, gradient_clipping=1.0, zero_optimization={'stage': 1, 'allgather_partitions': True, 'allgather_bucket_size': 500000000, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 500000000, 'contiguous_gradients': True}, curriculum_learning=None, curriculum_seqlen=0, steps_per_print=10, wall_clock_breakdown=True, dump_state=False, flops_profiler=None, communication_data_type=None, bf16=None, autotuning=None, activation_checkpointing=None, sparse_attention=None, data_efficiency=None, tensorboard=None, wandb=None, csv_monitor=None, elasticity=None, comms_logger=None, compression_training=None, checkpoint=None, data_types=None, deepspeed_extra_args={'comms_logger': {'enabled': True, 'verbose': True, 'prof_all': True, 'debug': False}}, hostfile=None, include=None, exclude=None, num_nodes=-1, num_gpus=None, master_port=29500, master_addr=None, launcher='pdsh', force_multi=False, detect_nvlink_pairs=False, autotuning_run=None, no_ssh_check=False, comment=None)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate_samples_from_prompt(neox_args, )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "49ff863b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'checkpoints'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neox_args.load" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72bf3192", + "metadata": {}, + "outputs": [], 
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 8de085895..72cf411e6 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = ce9bee3
+    Default = f4a9106

     current git hash of repository

@@ -926,7 +926,7 @@ Text Generation arguments

 - **prompt_end**: str

-    Default = 
+    Default = 

     a single prompt's end. Defaults to newline

@@ -968,7 +968,7 @@ Text Generation arguments

 - **eval_results_prefix**: str

-    Default = 
+    Default = 

     prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json

@@ -1686,7 +1686,7 @@ Args for deepspeed config

     Default = None

-    
+

@@ -1988,4 +1988,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
     Default = None

     Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard.
-
diff --git a/inference/HFvsDS_comparision.png b/inference/HFvsDS_comparision.png
new file mode 100644
index 000000000..5683ae483
Binary files /dev/null and b/inference/HFvsDS_comparision.png differ
diff --git a/inference/README.md b/inference/README.md
new file mode 100644
index 000000000..20fba80b0
--- /dev/null
+++ b/inference/README.md
@@ -0,0 +1,53 @@
+# NeoX Inference with DeepSpeed
+
+For inference of NeoX models we use the DeepSpeed-MII library. The installation and usage instructions are the same as in [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII#getting-started-with-mii).
+
+# Installation
+`pip install deepspeed-mii`
+
+# Inference Usage
+DeepSpeed MII incorporates both DS Inference and ZeRO Inference into one framework. The two serve different purposes and cannot be used together.
+
+## 1. DS Inference:
+This fits the entire model into GPU memory and is better suited to inference applications that are latency-sensitive or have small batch sizes.
+
+```
+# Deployment
+import mii
+mii_configs = {"tensor_parallel": 2, "dtype": "fp16", "load_with_sys_mem": True}
+mii.deploy(task="text-generation",
+           model="EleutherAI/gpt-neox-20b",
+           deployment_name="gpt-neox-20b-deploy",
+           mii_config=mii_configs)
+
+# Generation
+generator = mii.mii_query_handle("gpt-neox-20b-deploy")
+
+# Terminate (if you no longer want to infer)
+mii.terminate("gpt-neox-20b-deploy")
+```
+
+The NeoX-20B fp16 model requires more than 40GB of memory and cannot fit on a single A100 40GB GPU, so we set `tensor_parallel:2` to use two GPUs. If you have an 80GB GPU, you can set `tensor_parallel:1` to run NeoX-20B on a single GPU.
+
+## 2. Zero Inference:
+ZeRO Inference adapts and optimizes ZeRO-Infinity techniques for model inference on GPUs by hosting the model weights in CPU or NVMe memory, so that no weights (zero) are held on the GPU. It is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model, and it therefore has higher latency than DS Inference.
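+
+The key piece is a ZeRO stage 3 config with `offload_param` pointing at CPU (or NVMe), passed to `mii.deploy` with `enable_zero=True` and `enable_deepspeed=False`. Below is a minimal sketch of that deployment, adapted from `zero_inference_example.py` in this directory (the full script also sets the AIO and bucket-size knobs):
+
+```
+import mii
+
+# Minimal config sketch; zero_inference_example.py additionally sets aio and
+# bucket-size options. Weights are offloaded to CPU via ZeRO stage 3, so only
+# activations and a small working set of parameters occupy GPU memory.
+ds_config = {
+    "fp16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {"device": "cpu"},
+    },
+    "train_micro_batch_size_per_gpu": 1,
+}
+
+mii.deploy(task="text-generation",
+           model="EleutherAI/pythia-160m",
+           deployment_name="EleutherAI/pythia-160m_deploy",
+           mii_config={"dtype": "fp16"},
+           enable_deepspeed=False,
+           enable_zero=True,
+           ds_config=ds_config)
+```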
+
+Example usage:
+```
+# Deployment
+python zero_inference.py
+
+# Generation
+generator = mii.mii_query_handle("EleutherAI/pythia-160m_deploy")
+
+# Terminate (if you no longer want to infer)
+mii.terminate("EleutherAI/pythia-160m_deploy")
+```
+
+# Batch size
+Batch size at inference is not directly supported by DeepSpeed-MII. However, you can run with a few changes and caveats; note that a higher batch size does not necessarily decrease inference time. See the [issue](https://github.com/microsoft/DeepSpeed-MII/issues/133#issuecomment-1509534568) for more details.
+
+# HF vs DS Inference Comparison
+![HF vs DS Comparison plot](HFvsDS_comparision.png)
+
+Using benchmark.py, we benchmark several Pythia models along with the NeoX-20B model to compare HF and DeepSpeed inference. All runs use fp16 models on a single A100 40GB GPU for the Pythia models and two A100 40GB GPUs for NeoX-20B. The relative comparison between HF and DeepSpeed matters more than the absolute latency values in the plot.
diff --git a/inference/benchmark.py b/inference/benchmark.py
new file mode 100644
index 000000000..cd604e716
--- /dev/null
+++ b/inference/benchmark.py
@@ -0,0 +1,78 @@
+import torch
+import mii
+from transformers import pipeline
+import time
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--model', '-m', type=str, default='EleutherAI/pythia-160m', help='hf model name')
+parser.add_argument('--trials', type=int, default=50, help='number of trials')
+parser.add_argument('--dtype', type=str, default='fp16', help='Data type for model')
+parser.add_argument('--tensor_parallel', type=int, default=1, help='Tensor parallelism degree')
+parser.add_argument('--load_with_sys_mem', action='store_true', help='Load model with system memory')
+args = parser.parse_args()
+
+def hf_infer(model, torch_dtype, query=['Deepspeed is', 'Seattle is'], trials=1):
+
+    generator = pipeline('text-generation', model=model, device=0, torch_dtype=torch_dtype)
+    eos_token = generator.tokenizer.eos_token_id
+
+    start_time = time.time()
+    for i in range(trials):
+        hf_result = generator(query, max_new_tokens=100, pad_token_id=eos_token)
+    end_time = time.time()
+
+    hf_time = (end_time - start_time) / trials
+
+    generator = None
+    torch.cuda.empty_cache()
+
+    return eos_token, hf_result, hf_time
+
+def mii_infer(model, eos_token, query=['Deepspeed is', 'Seattle is'], trials=1):
+    generator = mii.mii_query_handle(model + '_deploy')
+    start_time = time.time()
+    for i in range(trials):
+        mii_result = generator.query({'query': query}, pad_token_id=eos_token, max_new_tokens=100)
+    end_time = time.time()
+    mii_time = (end_time - start_time) / trials
+
+    return mii_result, mii_time
+
+def main():
+
+    dtype_mapping = {
+        'fp16': torch.float16,
+        'fp32': torch.float32,
+        'fp64': torch.float64,
+        'int8': torch.int8,
+        'int16': torch.int16,
+        'int32': torch.int32,
+        'int64': torch.int64
+    }
+
+    torch_dtype = dtype_mapping[args.dtype]
+    load_with_sys_mem = args.load_with_sys_mem
+    tensor_parallel = args.tensor_parallel
+    trials = args.trials
+    model = args.model
+
+    eos_token, hf_result, hf_time = hf_infer(model, torch_dtype, trials=trials)
+
+    mii_configs = {'tensor_parallel': tensor_parallel, 'dtype': torch_dtype, 'load_with_sys_mem': load_with_sys_mem}
+    mii.deploy(task='text-generation',
+               model=model,
+               deployment_name=model + '_deploy',
+               mii_config=mii_configs)
+    mii_result, mii_time = mii_infer(model, eos_token, trials=trials)
+
+    print('HF sample output', hf_result)
+    print('HF Average Inference 
time: ', hf_time) + + print('MII sample output', mii_result) + print('MII Average Inference time: ', mii_time) + + mii.terminate(model + '_deploy') + +if __name__ == '__main__': + main() diff --git a/inference/zero_inference_example.py b/inference/zero_inference_example.py new file mode 100644 index 000000000..2502b5708 --- /dev/null +++ b/inference/zero_inference_example.py @@ -0,0 +1,47 @@ +import mii +from transformers import AutoConfig + +mii_config = {"dtype": "fp16"} + +name = "EleutherAI/pythia-160m" + +config = AutoConfig.from_pretrained(name) +model_hidden_size = config.hidden_size + +ds_config = { + "fp16": { + "enabled": True + }, + "bf16": { + "enabled": False + }, + "aio": { + "block_size": 262144, + "queue_depth": 32, + "thread_count": 1, + "single_submit": False, + "overlap_events": True + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.1 * model_hidden_size * model_hidden_size, + "stage3_max_live_parameters": 1e8, + "stage3_max_reuse_distance": 1e8, + "stage3_param_persistence_threshold": 10 * model_hidden_size + }, + "train_micro_batch_size_per_gpu": 1, +} + +mii.deploy(task='text-generation', + model=name, + deployment_name=name + "_deploy", + mii_config=mii_config, + enable_deepspeed=False, + enable_zero=True, + ds_config=ds_config) \ No newline at end of file diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 90488fa61..830326c3f 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -173,7 +173,6 @@ py::array build_sample_idx_int32(const py::array_t& sizes_, free_when_done); // numpy array references } - py::array build_sample_idx_int64(const py::array_t& sizes_, const py::array_t& doc_idx_, const int32_t seq_length, diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 386b5ed1e..80caa2d97 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -38,7 +38,7 @@ def load_fused_kernels(): print(e) print("=" * 100) print( - f'ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them' + f"ERROR: Fused kernels configured but not properly installed. 
Please run `pip install {str(srcpath)}` to install them" ) print("=" * 100) exit() diff --git a/megatron/model/flash_attention.py b/megatron/model/flash_attention.py index f9889b4c0..be3ebb14e 100644 --- a/megatron/model/flash_attention.py +++ b/megatron/model/flash_attention.py @@ -8,6 +8,12 @@ import flash_attn_cuda +def flash_attn_unpadded_unpacked_func_triton( + q, k, v, bias=None, causal=False, softmax_scale=None +): + return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) + + def _flash_attn_forward_cuda( q, k, @@ -186,7 +192,273 @@ def flash_attn_unpadded_qkvpacked_func_cuda( ) -def flash_attn_unpadded_qkvpacked_func_triton( - q, k, v, bias=None, causal=False, softmax_scale=None +class FlashAttnKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + q, + kv[:, 0], + kv[:, 1], + torch.empty_like(q), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward( + q, kv, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + kv, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dq = torch.empty_like(q) + dkv = torch.empty_like(kv) + _flash_attn_backward_cuda( + dout, + q, + kv[:, 0], + kv[:, 1], + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dq, dkv, None, None, None, None, None, None, None, None + + +def flash_attn_unpadded_kvpacked_func_cuda( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, ): - return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) + """dropout_p should be set to 0.0 during evaluation + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + kv: (total_k, 2, nheads, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. 
Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnKVPackedFunc.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + q, + k, + v, + torch.empty_like(q), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward( + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + k, + v, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_backward_cuda( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dq, dk, dv, None, None, None, None, None, None, None, None + + +def flash_attn_unpadded_func_cuda( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + k: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. + v: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. 
Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 4d57b063c..cc7b8be4a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -278,17 +278,23 @@ def __init__( if self.use_flash_attention: from megatron.model.flash_attention import ( flash_attn_unpadded_qkvpacked_func_cuda, - flash_attn_unpadded_qkvpacked_func_triton, + flash_attn_unpadded_kvpacked_func_cuda, + flash_attn_unpadded_unpacked_func_triton, ) if self.pos_emb == "alibi": self.flash_attention_function = ( - flash_attn_unpadded_qkvpacked_func_triton + flash_attn_unpadded_unpacked_func_triton ) else: - self.flash_attention_function = ( - flash_attn_unpadded_qkvpacked_func_cuda - ) + if self.training: + self.flash_attention_function = ( + flash_attn_unpadded_qkvpacked_func_cuda + ) + else: + self.flash_attention_function = ( + flash_attn_unpadded_kvpacked_func_cuda + ) else: self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.fp16, @@ -429,59 +435,101 @@ def flash_attention(self, query_layer, key_layer, value_layer): ) if self.pos_emb != "alibi": - # [s, b, np, hn] -> [b, s, np, hn] -> [b * s, 1, np, hn] - query_layer = query_layer.transpose(0, 1).reshape( - output_size[0] * output_size[2], 1, output_size[1], -1 - ) + + # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] key_layer = key_layer.transpose(0, 1).reshape( output_size[0] * output_size[3], 1, output_size[1], -1 ) value_layer = value_layer.transpose(0, 1).reshape( output_size[0] * output_size[3], 1, output_size[1], -1 ) - # Combined q/k/v into [b * s, 3, np, hn]. 
- qkv = torch.concat([query_layer, key_layer, value_layer], dim=1) batch_size = output_size[0] - seqlen = output_size[2] - max_s = seqlen + max_seqlen_q = output_size[2] + max_seqlen_k = output_size[3] - cu_seqlens = torch.arange( + cu_seqlens_q = torch.arange( 0, - (batch_size + 1) * seqlen, - step=seqlen, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, dtype=torch.int32, - device=qkv.device, + device=query_layer.device, ) - output = self.flash_attention_function( - qkv, - cu_seqlens, - max_s, - self.dropout_p if self.training else 0.0, - softmax_scale=None, - causal=True, + cu_seqlens_k = torch.arange( + 0, + (batch_size + 1) * max_seqlen_k, + step=max_seqlen_k, + dtype=torch.int32, + device=key_layer.device, ) + + if self.training: + + # [sq, b, np, hn] -> [b * sq, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], output_size[1], -1 + ) + + # Combined k/v into [b * sk, 2, np, hn]. + kv = torch.concat([key_layer, value_layer], dim=1) + + output = self.flash_attn_unpadded_kvpacked_func( + query_layer, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + + else: + + # [sq, b, np, hn] -> [b * sq, 1, np, hn] + query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], 1, output_size[1], -1 + ) + + # Combined q/k/v into [b * s, 3, np, hn]. + qkv = torch.concat([query_layer, key_layer, value_layer], dim=1) + + output = self.flash_attn_unpadded_qkvpacked_func( + qkv, + cu_seqlens_q, + max_seqlen_q, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + # [b * sq, np, hn] -> [b, sq, np, hn] matmul_result = output.view( output_size[0], output_size[2], output.shape[1], output.shape[2] ) # [b, sq, np, hn] -> [b, np, sq, hn] matmul_result = matmul_result.transpose(1, 2) + else: # [sq, b, np, hn] -> [b, sq, np, hn] sq = query_layer.size(0) b = query_layer.size(1) sk = key_layer.size(0) + query_layer = query_layer.transpose(0, 1) key_layer = key_layer.transpose(0, 1) value_layer = value_layer.transpose(0, 1) + bias = self.alibi_embed.bias(sq, sk, query_layer.device, query_layer.dtype) bias = bias.unsqueeze(0).tile((b, 1, 1, 1)) + matmul_result = self.flash_attention_function( query_layer, key_layer, value_layer, bias=bias, causal=True ) matmul_result = matmul_result.transpose(1, 2) + return matmul_result def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 73bcbdc75..6ac476fc1 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -989,7 +989,7 @@ def calculate_derived(self): # Update 'is pipe parallel' flag # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs - self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 2) + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) # Attention config if self.attention_config is None: diff --git a/tools/convert_sequential_to_hf.py b/tools/convert_sequential_to_hf.py index 5a66219bf..f2299fccf 100644 --- a/tools/convert_sequential_to_hf.py +++ b/tools/convert_sequential_to_hf.py @@ -155,9 +155,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_config = create_config(loaded_config) - hf_model = 
GPTNeoXForCausalLM( - hf_config - ) + hf_model = GPTNeoXForCausalLM(hf_config) # save model in FP16 if Deepspeed fp16 was used in config, else 32 bit fp16 = get_key(loaded_config, "fp16") @@ -177,7 +175,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_model.to(dtype=torch.bfloat16) print("Saving weights in bf16 precision...") except: - print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") + print( + "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." + ) mp_partitions = get_key(loaded_config, "model-parallel-size") diff --git a/tools/convert_v1.0_to_hf.py b/tools/convert_v1.0_to_hf.py index 905bdfa16..8f3537cd4 100644 --- a/tools/convert_v1.0_to_hf.py +++ b/tools/convert_v1.0_to_hf.py @@ -153,15 +153,19 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): try: # this conditional is quite messy because there were a number of ways to specify bf16 or fp16 training # in DeeperSpeed v1.0 . - if (fp16.get("fp16", None) or fp16["enabled"]) and not (fp16.get("type", None) == "bfloat16"): + if (fp16.get("fp16", None) or fp16["enabled"]) and not ( + fp16.get("type", None) == "bfloat16" + ): hf_model.half() print("Saving weights in fp16 precision...") elif fp16.get("type", None) == "bfloat16": hf_model.to(dtype=torch.bfloat16) print("Saving weights in bf16 precision...") except: - print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") - + print( + "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." + ) + mp_partitions = get_key(loaded_config, "model-parallel-size") ### Embedding layer ### diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py index e6e290016..c5d1e6255 100644 --- a/tools/merge_datasets.py +++ b/tools/merge_datasets.py @@ -2,8 +2,10 @@ import sys import json import argparse -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir))) + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) from megatron.data import indexed_dataset @@ -20,47 +22,63 @@ def main(args): if not os.path.isfile(os.path.join(args.input, basename)): continue - ext_pair = '.bin' if ext == '.idx' else '.idx' - assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \ - f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}' + ext_pair = ".bin" if ext == ".idx" else ".idx" + assert os.path.isfile( + os.path.join(args.input, prefix) + ext_pair + ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" prefixes.add(prefix) builder = None for prefix in sorted(prefixes): if builder is None: - dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer') + dataset = indexed_dataset.make_dataset( + os.path.join(args.input, prefix), "infer" + ) if isinstance(dataset, indexed_dataset.MMapIndexedDataset): - builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype) + builder = indexed_dataset.MMapIndexedDatasetBuilder( + args.output_prefix + ".bin", dtype=dataset._index.dtype + ) else: - builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin') + builder = indexed_dataset.IndexedDatasetBuilder( + args.output_prefix + ".bin" + ) del dataset builder.merge_file_(os.path.join(args.input, prefix)) - builder.finalize(args.output_prefix + '.idx') + builder.finalize(args.output_prefix + ".idx") -if __name__ == '__main__': +if __name__ == 
"__main__": parser = argparse.ArgumentParser() - group = parser.add_argument_group(title='input data') - group.add_argument('--input', type=str, required=True, - help='Path to directory containing all document files to merge') - - group = parser.add_argument_group(title='output data') - group.add_argument('--output-prefix', type=str, required=True, - help='Path to binary output file without suffix') + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to directory containing all document files to merge", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) args = parser.parse_args() - assert os.path.isdir(args.input), \ - f'ERROR: {args.input} is not a directory or does not exist' + assert os.path.isdir( + args.input + ), f"ERROR: {args.input} is not a directory or does not exist" - assert os.path.isdir(os.path.dirname(args.output_prefix)), \ - f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist' + assert os.path.isdir( + os.path.dirname(args.output_prefix) + ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" main(args) -