diff --git a/benchmarking/configs/inference_test.yml b/benchmarking/configs/inference_test.yml new file mode 100644 index 000000000..13e36aee5 --- /dev/null +++ b/benchmarking/configs/inference_test.yml @@ -0,0 +1,11 @@ +# GPT inference testing setup +models: + - EleutherAI/pythia-70m + - EleutherAI/pythia-160m + - EleutherAI/pythia-410m + - EleutherAI/pythia-1b + - EleutherAI/pythia-1.4b + +world_size: 1 +trials: 10 +max_tokens: 4 diff --git a/benchmarking/hf_ds_benchmark.py b/benchmarking/hf_ds_benchmark.py new file mode 100644 index 000000000..d859af510 --- /dev/null +++ b/benchmarking/hf_ds_benchmark.py @@ -0,0 +1,179 @@ +'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py''' + +import argparse +import io +import os +import subprocess +import time + +import deepspeed +from deepspeed.accelerator import get_accelerator +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from transformers import pipeline +import torch +import yaml + +PYTHIA_TO_OLD_SUFFIXES = { + "70m": "19M", + "160m": "125M", + "410m": "350M", + "1b": "800M", + "1.4b": "1-3B", + "2.8b": "2.7B", + "6.9b": "6-7B", + "12b": "13B", + "20b": "20B"} + +def benchmark_model( + model, output_dir, use_deepspeed, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials): + + deepspeed.init_distributed() + if local_rank == 0: + print("BENCHMARK SETTINGS:") + print(f"\tMODEL: {model}") + print(f"\tMAX_TOKENS: {max_tokens}") + print(f"\tDTYPE: {dtype}") + print(f"\tCUDA_GRAPHS: {graphs}") + print(f"\tKERNEL_INJECT: {kernel_inject}") + print(f"\tWORLD_SIZE: {world_size}") + + if dtype == "int8": + dtype = torch.int8 + elif dtype == "fp16": + dtype = torch.float16 + else: + dtype = torch.float32 + + pipe = pipeline("text-generation", model=model, framework="pt") + + if dtype == torch.float16: + pipe.model.half() + print("") + if use_deepspeed: + pipe.model = deepspeed.init_inference( + pipe.model, + dtype=dtype, + mp_size=world_size, + replace_with_kernel_inject=kernel_inject, + enable_cuda_graph=graphs, + ) + pipe.model.profile_model_time() + + responses = [] + times = [] + mtimes = [] + for i in range(trials): + get_accelerator().synchronize() + start = time.time() + r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=max_tokens) + get_accelerator().synchronize() + end = time.time() + responses.append(r) + times.append(end - start) # / (max_tokens - 3)) + if use_deepspeed: + mtimes.append(sum(pipe.model.model_times())) + + if use_deepspeed: + for_dataframe = np.vstack((times, mtimes, list(map(lambda t: t / (max_tokens - 3), times)))).T + columns = ["(e2e) latency", "(model-only) latency", "(e2e) per token latency"] + + else: + for_dataframe = np.vstack((times, list(map(lambda t: t / (max_tokens - 3), times)))).T + columns = ["(e2e) latency", "(e2e) per token latency"] + + df = pd.DataFrame( + for_dataframe, + columns = columns) + + if local_rank == 0: + + + deepspeed_str = "deepspeed" if use_deepspeed else "hf" + deepspeed_dir = os.path.join(output_dir, deepspeed_str) + max_tokens_dir = os.path.join(deepspeed_dir, "max_tokens_{}".format(max_tokens)) + world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size)) + + os.makedirs(world_size_dir, exist_ok=True) + + fname = os.path.join(world_size_dir, + "{}_{}_benchmark.csv".format(model.split('/')[-1], str(dtype).split('.')[1])) + + print("saving benchmark to {}".format(fname)) + + df.to_csv(fname, index=False) + return df + + +def main(models, output_dir, dtype, graphs, 
kernel_inject, max_tokens, local_rank, world_size, trials): + deepspeed_dfs = [] + hf_dfs = [] + print("Models to benchmark: {}".format(models)) + for model in models: + print("Benchmarking model: {}".format(model)) + # run using deepspeed + print("Running with deepspeed") + deepspeed_dfs.append(benchmark_model( + model, output_dir, True, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials)) + + # run using huggingface + print("Running with huggingface") + hf_dfs.append(benchmark_model( + model, output_dir, False, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials)) + + + print("plotting results") + # drop first 3 rows (warmup) + ds_means = [x["(e2e) latency"].iloc[3:].mean() for x in deepspeed_dfs] + ds_std = [x["(e2e) latency"].iloc[3:].std() for x in deepspeed_dfs] + hf_means = [x["(e2e) latency"].iloc[3:].mean() for x in hf_dfs] + hf_std = [x["(e2e) latency"].iloc[3:].std() for x in hf_dfs] + + + # plot results + fig, ax = plt.subplots(figsize=(12, 4)) + ax.bar( + np.arange(len(ds_means)) - 0.24, + ds_means, yerr=ds_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Deepspeed') + ax.bar( + np.arange(len(hf_means)) + 0.24, + hf_means, yerr=hf_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Huggingface') + ax.set_xticks(np.arange(len(models))) + ax.set_xticklabels(models) + ax.set_xlabel('Model') + ax.set_ylabel('Time (s)') + plt.legend() + plt.tight_layout() + plt.title("e2e latency (s), {} tokens, {} world size, {} trials".format(max_tokens, world_size, trials)) + plt.savefig(os.path.join(output_dir, "benchmark.png")) + print("plot saved to {}".format(os.path.join(output_dir, "benchmark.png"))) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output_dir", type=str, default='/home/mchorse/benchmarking/output', help="output_directory") + parser.add_argument("--config", type=str, default='configs/inference_test.yml') + parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "int8"], help="int8, fp16, or fp32") + parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") + parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") + parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") + args = parser.parse_args() + + with open(args.config, "r") as f: + config = yaml.safe_load(f) + + models = config["models"] + world_size = config["world_size"] + trials = config["trials"] + max_tokens = config["max_tokens"] + + main(models=models, + output_dir=args.output_dir, + dtype=args.dtype, + graphs=args.graphs, + kernel_inject=args.kernel_inject, + max_tokens=max_tokens, + local_rank=args.local_rank, + world_size=world_size, + trials=trials) + diff --git a/benchmarking/megatron_config.json b/benchmarking/megatron_config.json new file mode 100644 index 000000000..355906854 --- /dev/null +++ b/benchmarking/megatron_config.json @@ -0,0 +1 @@ +{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 32, "optimizer": {"type": "Adam", "params": {"lr": 0.0008, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, 
"contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 10, "hidden_size": 640, "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "rotary_pct": 0.25, "init_method": "small_init", "output_layer_init_method": "wang_init", "gpt_j_residual": true, "output_layer_parallelism": "column", "lr_decay_style": "cosine", "lr_decay_iters": 143000, "min_lr": 8e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.0008, "padded_vocab_size": 50304, "data_path": "../data/enwik8/enwik8_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"49M.yml": "{\n # parallelism settings\n \"pipe-parallel-size\": 2,\n \"model-parallel-size\": 1,\n\n # model settings\n \"num-layers\": 10,\n \"hidden-size\": 640,\n \"num-attention-heads\": 10,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"pos-emb\": \"rotary\",\n \"rotary-pct\": 0.25,\n \"no-weight-tying\": true,\n \"gpt-j-residual\": true,\n \"output-layer-parallelism\": \"column\",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \"scaled-upper-triang-masked-softmax-fusion\": false,\n \"bias-gelu-fusion\": false,\n\n # init methods\n \"init_method\": \"small_init\",\n \"output_layer_init_method\": \"wang_init\",\n\n # optimizer settings\n \"optimizer\": {\n \"type\": \"Adam\",\n \"params\": {\n \"lr\": 0.0008,\n \"betas\": [0.9, 0.95],\n \"eps\": 1.0e-8,\n }\n },\n \"min_lr\": 0.00008,\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": True,\n \"allgather_bucket_size\": 500000000,\n \"overlap_comm\": True,\n \"reduce_scatter\": True,\n \"reduce_bucket_size\": 500000000,\n \"contiguous_gradients\": True,\n },\n\n # batch / data settings\n \"train_micro_batch_size_per_gpu\": 32,\n \"gas\": 1,\n \"data-impl\": \"mmap\",\n \"num_workers\": 1,\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": true,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0.1,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": {\n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"loss_scale_window\": 1000,\n \"initial_scale_power\": 12,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1,\n },\n\n # misc. 
training settings\n \"train-iters\": 143000,\n \"lr-decay-iters\": 143000,\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"checkpoint-factor\": 1000,\n \"eval-interval\": 100000,\n \"eval-iters\": 10,\n\n # logging\n \"log-interval\": 10,\n \"steps_per_print\": 10,\n \"wall_clock_breakdown\": true,\n}\n", "benchmark_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n \"data-path\": \"../data/enwik8/enwik8_text_document\",\n\n # or for weighted datasets:\n # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"train-data-weights\": [1., 2.],\n # \"test-data-weights\": [2., 1.],\n # \"valid-data-weights\": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n # WARNING: setting this to True will override any user provided weights\n # \"weight_by_num_documents\": false,\n # \"weighted_sampler_alpha\": 0.3,\n\n \"vocab-file\": \"../data/gpt2-vocab.json\",\n \"merge-file\": \"../data/gpt2-merges.txt\",\n\n \"save\": \"checkpoints\",\n \"load\": \"checkpoints\",\n \"checkpoint_validation_with_forward_pass\": False,\n\n \"tensorboard-dir\": \"tensorboard\",\n \"log-dir\": \"logs\",\n \"use_wandb\": True,\n \"wandb_host\": \"https://api.wandb.ai\",\n \"wandb_project\": \"neox\"\n}\n", "benchmarking.yml": "# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n \"text-gen-type\": \"from_prompt\",\n\n # Params for all\n \"maximum_tokens\": 128,\n \"prompt_end\": \"\\n\",\n \"temperature\": 1.0,\n \"top_p\": 0.0,\n \"top_k\": 0,\n \"recompute\": false,\n\n # `unconditional`: samples\n \"num-samples\": 10,\n}\n"}, "load": "checkpoints", "checkpoint_factor": 1000, "batch_size": 32, "train_iters": 143000, "eval_iters": 10, "eval_interval": 100000, "vocab_file": "../data/gpt2-vocab.json", "merge_file": "../data/gpt2-merges.txt", "num_workers": 1, "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0.1, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 1, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "world_size": 2, "is_pipe_parallel": true, "use_wandb": true, "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 10, "text_gen_type": "from_prompt", "temperature": 1.0, "maximum_tokens": 128, "num_samples": 10, "local_rank": 0, "rank": 0, "save_iters": [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 
105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], "global_num_gpus": 8} \ No newline at end of file diff --git a/benchmarking/neox_benchmark.py b/benchmarking/neox_benchmark.py new file mode 100644 index 000000000..8b3e18e54 --- /dev/null +++ b/benchmarking/neox_benchmark.py @@ -0,0 +1,87 @@ +'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py''' + +import argparse +import os +import sys +sys.path.insert(0, os.path.abspath(os.getcwd())) + + +import tempfile +import time + +import deepspeed +from deepspeed.accelerator import get_accelerator +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from transformers import pipeline +import torch +import yaml + +from megatron.text_generation_utils import generate_samples_from_prompt +from megatron.utils import print_rank_0, setup_for_inference_or_eval + + +PYTHIA_TO_OLD_SUFFIXES = { + "70m": "19M", + "160m": "125M", + "410m": "350M", + "1b": "800M", + "1.4b": "1-3B", + "2.8b": "2.7B", + "6.9b": "6-7B", + "12b": "13B", + "20b": "20B"} + + +def main(): + model, neox_args = setup_for_inference_or_eval(use_cache=True) + max_tokens = 10 + print_rank_0("Finished loading model") + + prompts = ["DeepSpeed is" for x in range(100)] + + generated_texts = generate_samples_from_prompt( + neox_args=neox_args, + model=model, + text=prompts, + eos_token_id=0, + maximum_tokens=10, + recompute=neox_args.recompute, + temperature=neox_args.temperature, + top_k=neox_args.top_k, + top_p=neox_args.top_p, + ) + + times = [x["duration_seconds"] for x in generated_texts] + + for_dataframe = np.vstack((times, list(map(lambda t: t / (max_tokens - 3), times)))).T + columns = ["(e2e) latency", "(e2e) per token latency"] + + df = pd.DataFrame( + for_dataframe, + columns = columns) + + + # save dataframe to CSV inside the directory for world_size + # if local_rank == 0: + + # neox_dir = os.path.join(output_dir, "neox") + # max_tokens_dir = os.path.join(neox_dir, "max_tokens_{}".format(max_tokens)) + # world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size)) + + # os.makedirs(world_size_dir, exist_ok=True) + + # fname = os.path.join(world_size_dir, + # "{}_fp16_benchmark.csv".format(model.split('/')[-1])) + + # print("saving benchmark to {}".format(fname)) + # df.to_csv(fname, index=False) + print("Starting data generation...") + df.to_csv(sys.stdout, index=False) + print("Data generation complete!") + + +if __name__ == "__main__": + main() + diff --git a/benchmarking/neox_benchmark_input.txt b/benchmarking/neox_benchmark_input.txt new file mode 100644 index 000000000..eff01f7d4 --- /dev/null +++ b/benchmarking/neox_benchmark_input.txt @@ -0,0 +1,100 @@ +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is 
\n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n +Deepspeed is \n diff --git a/benchmarking/scrapbook.ipynb b/benchmarking/scrapbook.ipynb new file mode 100644 index 000000000..ceda84fb5 --- /dev/null +++ b/benchmarking/scrapbook.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "928fcf87-4202-4111-ab43-0382a389f37d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/pandas/core/computation/expressions.py:20: UserWarning: Pandas requires version '2.7.3' or newer of 'numexpr' (version '2.7.2' currently installed).\n", + " from pandas.core.computation.check import NUMEXPR_INSTALLED\n" + ] + } + ], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import yaml\n", + "\n", + "with open(\"/home/mchorse/gpt-neox/benchmarking/configs/inference_test.yml\", \"r\") as f:\n", + " config = yaml.safe_load(f)\n", + "\n", + "models = config[\"models\"]\n", + "world_size = config[\"world_size\"]\n", + "trials = config[\"trials\"]\n", + "max_tokens = config[\"max_tokens\"]\n", + "\n", + "models = [x.split(\"/\")[-1] for x in models]\n", + "ds_files = [\"/home/mchorse/inference_benchmark/deepspeed/max_tokens_128/world_size_1/{}_float16_benchmark.csv\".format(x) for x in models]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0578b8d9-462d-4d12-b583-4197f3ea4ea4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "hf_files = [\"/home/mchorse/inference_benchmark/hf/max_tokens_128/world_size_1/{}_float16_benchmark.csv\".format(x) for x in models]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2e935d3d", + "metadata": {}, + "outputs": [], + "source": [ + "ds_dfs = [pd.read_csv(x) for x in ds_files]\n", + "hf_dfs = [pd.read_csv(x) for x in hf_files]\n", + "ds_means = [x[\"(e2e) latency\"].iloc[3:].mean() for x in ds_dfs]\n", + "ds_std = [x[\"(e2e) latency\"].iloc[3:].std() for x in ds_dfs]\n", + "hf_means = [x[\"(e2e) latency\"].iloc[3:].mean() for x in hf_dfs]\n", + "hf_std = [x[\"(e2e) latency\"].iloc[3:].std() for x in hf_dfs]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e681fa64", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAioAAAGwCAYAAACHJU4LAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA6T0lEQVR4nO3deViU9f7/8deIMiAyKCiKCmq5hJricvSH6Ulzw0otM60sMZes41ZkGacFadO0skUzK5c6qVlq5bdSsxJTO6a5m6ZJap7C6GiJICDC5/dHl3OacBkGmLmB5+O65rrmXj/v+cwAL+77M/dtM8YYAQAAWFAlXxcAAABwIQQVAABgWQQVAABgWQQVAABgWQQVAABgWQQVAABgWQQVAABgWZV9XUBxFBQU6Oeff1ZwcLBsNpuvywEAAG4wxujUqVOqW7euKlW6+DGTMh1Ufv75Z0VGRvq6DAAA4IGjR4+qfv36F12nTAeV4OBgSX+8UIfD4eNqAACAOzIyMhQZGen8O34xZTqonDvd43A4CCoAAJQx7gzbYDAtAACwLIIKAACwLIIKAACwLJ+OUZk8ebKSk5Nd5jVr1kzfffddibaTn5+vvLy8Et0nyr4qVarIz8/P12UAAC7C54NpW7Rooc8++8w5XblyyZVkjNGxY8f0+++/l9g+Ub5Ur15dderU4To8AGBRPg8qlStXVp06dUpl3+dCSnh4uKpWrcofIzgZY3T69Gmlp6dLkiIiInxcEQDgfHweVL7//nvVrVtXAQEBio2N1ZQpUxQVFXXedXNzc5Wbm+uczsjIuOB+8/PznSElLCysxOtG2RcYGChJSk9PV3h4OKeBAMCCfDqYtmPHjlqwYIFWrVql2bNn69ChQ+rSpYtOnTp13vWnTJmikJAQ5+NiV6U9NyalatWqpVI7yodznw/GMAGANdmMMcbXRZzz+++/q0GDBnr++ec1YsSIQsvPd0QlMjJSJ0+eLHTBt5ycHB06dEiNGjVSQEBAqdeOsonPCQB4X0ZGhkJCQs779/uvfH7q58+qV6+upk2b6uDBg+ddbrfbZbfbvVwVAADwFUtdRyUzM1OpqakMbAQAAJJ8fERl4sSJ6tu3rxo0aKCff/5ZSUlJ8vPz06233lqq7c5Yc6BU9/9n9/Vs6rW2KpLDhw+rUaNG2r59u2JiYnxdDgCglPj0iMp//vMf3XrrrWrWrJkGDRqksLAwbdq0SbVq1fJlWT43bNgw2Ww22Ww2ValSRbVr11bPnj01b948FRQU+Lo8AAC8xqdHVN555x1fNm9pcXFxmj9/vvLz8/XLL79o1apVmjBhgpYuXaoVK1aU6IXxAACwKkuNUcH/2O121alTR/Xq1VPbtm31z3/+Ux9++KFWrlypBQsWSPrjW1IjR45UrVq15HA4dM0112jnzp0u+/nwww/Vtm1bBQQE6LLLLlNycrLOnj3rXG6z2TR79mz16dNHgYGBuuyyy7R06VLn8jNnzmjs2LGKiIhQQECAGjRooClTpri9vSQdPXpUgwYNUvXq1RUaGqr+/fvr8OHDLuu88cYbio6OVkBAgK644gq98sorLss3b96sNm3aKCAgQO3bt9f27duL070AKqi0tDRt27bNa4+0tDRfv+Qyj3/Ly5BrrrlGrVu31vLlyzVy5EjdfPPNCgwM1MqVKxUSEqI5c+aoe/fuOnDggEJDQ7V+/XoNHTpUL730krp06aLU1FTdddddkqSkpCTnfh999FFNnTpVL774ov71r3/plltu0e7duxUdHa2XXnpJK1as0LvvvquoqCgdPXpUR48edanrYtvn5eWpd+/eio2N1fr161W5cmU9+eSTiouL065du+Tv76+FCxfqscce08yZM9WmTRtt375do0aNUlBQkOLj45WZmanrr79ePXv21Ntvv61Dhw5pwoQJXu17AOXDnDlzCt1jrjQlJSVp8uTJXmuvPCKolDFXXHGFdu3apQ0bNmjz5s1KT093fmX72Wef1QcffKClS5fqrrvuUnJysh566CHFx8dLki677DI98cQTevDBB12Cys0336yRI0dKkp544gmtWbNGL7/8sl555RX9+OOPatKkiTp37iybzaYGDRoUquli2y9ZskQFBQV64403nLcwmD9/vqpXr66UlBT16tVLSUlJeu655zRgwABJUqNGjbR3717NmTNH8fHxWrRokQoKCjR37lwFBASoRYsW+s9//qN77rmn9DoaQLk0evRo9evXz+31s7Oz1blzZ0nShg0bnFe0dhffYi0+gkoZY4yRzWbTzp07lZmZWej2ANnZ2UpNTZUk7dy5Uxs3btRTTz3lXJ6fn6+cnBydPn3aeVXW2NhYl33ExsZqx44dkv4Y2NuzZ081a9ZMcXFxuv7669WrV69C619o+507d+rgwYMKDg52WScnJ0epqanKyspSamqqRowYoVGjRjmXnz17ViEhIZKkffv2qVWrVi4XZPtrmwDgjoiIiCKFh6ysLOfzmJgYBQUFlUZZuAiCShmzb98+NWrUSJmZmYqIiFBKSkqhdapXry7pj+vSJCcnO49U/Jm7V2Ft27atDh06pJUrV+qzzz7ToEGD1KNHj0LjUC4kMzNT7dq108KFCwstq1WrljIzMyVJr7/+ujp27OiynHvvAAAIKmXIF198od27d+u+++5T/fr1dezYMVWuXFkNGzY87/pt27bV/v371bhx44vud9OmTRo6dKjLdJs2bZzTDodDgwcP1uDBgzVw4EDFxcXpxIkTCg0NveT2bdu21ZIlSxQeHn7eyySHhISobt26+uGHHzRkyJDz1hcdHa1//etfysnJcQasTZs2XfQ1AQDKB4KKReXm5urYsWMuX0+eMmWKrr/+eg0dOlSVKlVSbGysbrjhBk2bNk1NmzbVzz//rI8//lg33nij2rdvr8cee0zXX3+9oqKiNHDgQFWqVEk7d+7Unj179OSTTzrbeu+999S+fXt17txZCxcu1ObNmzV37lxJ0vPPP6+IiAi1adNGlSpV0nvvvac6deo4j9pcavshQ4Zo+vTp6t+/vx5//HHVr19fR44c0fLly/Xggw+qfv36Sk5O1vjx4xUSEqK4uDjl5ubqm2++0W+//aaEhATddtttevjhhzVq1CglJibq8OHDevbZZ736fgAAfMSUYSdPnjSSzMmTJwsty87ONnv37jXZ2dk+qKx44uPjjSQjyVSuXNnUqlXL9OjRw8ybN8/k5+c718vIyDDjxo0zdevWNVWqVDGRkZFmyJAh5scff3Sus2rVKtOpUycTGBhoHA6H6dChg3nttdecyyWZWbNmmZ49exq73W4aNmxolixZ4lz+2muvmZiYGBMUFGQcDofp3r272bZtm9vbG2NMWlqaGTp0qKlZs6ax2+3msssuM6NGjXJ53xYuXGhiYmKMv7+/qVGjhvn73/9uli9f7lz+73//27Ru3dr4+/ubmJgYs2zZMiPJbN++vVh9XZY/JwBKX2ZmpvP3cWZmpq/LKTcu9vf7ryx19+SiutjdF7krrntsNpvef/993XDDDT7Z3tf4nAC4mKysLF
WrVk3SH2PuGExbMopy92Qu+AYAACyLMSoAnNLS0rx6Jc2iflUUQMVDUKnginvmrwyfOcR5cNVOAFZDUAHgxFU7AVgNQQWAE1ftBGA1DKYFAACWRVABAACWRVABAACWRVDBJTVs2FAvvPBCie/3gw8+UOPGjeXn56d77723xPcPACj7KuZg2rVTvNdWt8QibzJs2DD9/vvv+uCDD1zmp6SkqFu3bvrtt99c7rVT2rZs2VIqgyRHjx6tO++8U+PHj1dwcHCJ7x8AUPZVzKCCIqlVq1aJ7zMzM1Pp6enq3bu36tatW+L7B4AZaw4Uex+52aedz1/+/HvZA6sWa3/39Wxa3JIqHE79lFGTJ09WTEyMy7wXXnhBDRs2dE6fPXtW48ePV/Xq1RUWFqZJkyYpPj7e5b48p06d0pAhQxQUFKSIiAjNmDFDXbt2dTkV89dTPzabTW+88YZuvPFGVa1aVU2aNNGKFStcalmxYoWaNGmigIAAdevWTW+++aZsNpt+//13paSkOI+gXHPNNbLZbEpJSdHx48d16623ql69eqpataquvPJKLV682GW/BQUFmjZtmho3biy73a6oqCg99dRTzuVHjx7VoEGDVL16dYWGhqp///46fPiwR30MAPA9gko59swzz2jhwoWaP3++Nm7cqIyMjEKnkxISErRx40atWLFCa9as0fr167Vt27ZL7js5OVmDBg3Srl27dO2112rIkCE6ceKEJOnQoUMaOHCgbrjhBu3cuVOjR4/Www8/7Ny2U6dO2r9/vyRp2bJlSktLU6dOnZSTk6N27drp448/1p49e3TXXXfpjjvu0ObNm53bJiYmaurUqXr00Ue1d+9eLVq0SLVr15Yk5eXlqXfv3goODtb69eu1ceNGVatWTXFxcTpz5kxxuxMA4AOc+rGojz76yHnHznPy8/OLtI+XX35ZiYmJuvHGGyVJM2fO1CeffOJcfurUKb355ptatGiRunfvLkmaP3++W6dihg0bpltvvVWS9PTTT+ull17S5s2bFRcXpzlz5qhZs2aaPn26JKlZs2bas2eP88iHv7+/wsPDJUmhoaGqU6eOJKlevXqaOHGis41x48Zp9erVevfdd9WhQwedOnVKL774ombOnKn4+HhJ0uWXX+68MuqSJUtUUFCgN954Qzabzfl6qlevrpSUFPXq1atI/QcA8D2CikV169ZNs2fPdpn39ddf6/bbb3dr+5MnT+qXX35Rhw4dnPP8/PzUrl07FRQUSJJ++OEH5eXluawTEhKiZs2aXXL/rVq1cj4PCgqSw+FQenq6JGn//v3629/+5rL+n9u4kPz8fD399NN699139dNPP+nMmTPKzc1V1ap/nBPet2+fcnNznaHqr3bu3KmDBw8WGpibk5Oj1NTUS7YPALAegopFBQUFqXHjxi7z/vOf/zifV6pUqdANAfPy8rxSmyRVqVLFZdpmszkDkKemT5+uF198US+88IKuvPJKBQUF6d5773WetrnUfWQyMzPVrl07LVy4sNCy0hgQDAAofYxRKaNq1aqlY8eOuYSVHTt2OJ+HhISodu3a2rJli3Nefn6+y/iTyy67TFWqVHFZ5+TJkzpwoHgj5Zs1a6ZvvvnGZd6f27iQjRs3qn///rr99tvVunVrXXbZZS61NGnSRIGBgfr888/Pu33btm31/fffKzw8XI0bN3Z5hISEFOs1AQB8g6BSRnXt2lW//vqrpk2bptTUVM2aNUsrV650WWfcuHGaMmWKPvzwQ+3fv18TJkzQb7/95hy/ERwcrPj4eD3wwANau3atvv32W40YMUKVKlVyruOJ0aNH67vvvtOkSZN04MABvfvuu1qwYIEkXXS/TZo00Zo1a/TVV19p3759Gj16tH755Rfn8oCAAE2aNEkPPvig3nrrLaWmpmrTpk2aO3euJGnIkCGqWbOm+vfvr/Xr1+vQoUNKSUnR+PHjXY5GAQDKDoJKGRUdHa1XXnlFs2bNUuvWrbV582aXgaiSNGnSJN16660aOnSoYmNjVa1aNfXu3VsBAQHOdZ5//nnFxsbq+uuvV48ePXTVVVcpOjraZZ2iatSokZYuXarly5erVatWmj17tvNbP3a7/YLbPfLII2rbtq169+6trl27qk6dOi5fpZakRx99VPfff78ee+wxRUdHa/Dgwc6xMVWrVtWXX36pqKgoDRgwQNHR0RoxYoRycnLkcDg8fj0AAN+xmb8OdChDMjIyFBISopMnTxb6Q5STk6NDhw6pUaNGxfqjW54UFBQoOjpagwYN0hNPPHHedbKyslSvXj0999xzGjFiRIm1/dRTT+nVV1/V0aNHS2yfJYHPSfFkZWU5v52WmZlZKlcwBjxVUhd8S+zfRpI05cPtXPCthFzs7/dfMZi2HDty5Ig+/fRTXX311crNzdXMmTN16NAh3Xbbbc51tm/fru+++04dOnTQyZMn9fjjj0uS+vfvX6y2X3nlFf3tb39TWFiYNm7cqOnTp2vs2LHF2icAoOIhqJRjlSpV0oIFCzRx4kQZY9SyZUt99tlnio6Odlnv2Wef1f79++Xv76927dpp/fr1qlmzZrHa/v777/Xkk0/qxIkTioqK0v3336/ExKLf9wgAULERVMqxyMhIbdy48aLrtGnTRlu3bi3xtmfMmKEZM2aU+H4BABULQQWooLhhG4CyoNx/66cMjxWGF/D5AABrK7dB5dyVU0+fPn2JNVGRnft8/PVKuwAAayi3p378/PxUvXp1l2tsFOciZihfjDE6ffq00tPTVb16dfn5+fm6JADAeZTboCLJeVfec2EF+Kvq1as7PycAyr+M4+nKOPGr2+ufyc1xPv8pdZ/87UW73pIjtJYcYeFF2gauynVQsdlsioiIUHh4uFdv2IeyoUqVKhxJASqYrz5eok/fnunRtjMTbrv0Sn/R6/axihs6zqP28IdyHVTO8fPz4w8SAECdrhuslrHXeK09Ryh3bi+uChFUAACQJEdYOKdiyphy+60fAABQ9hFUAACAZRFUAACAZRFUAACAZTGYFgAAi0pLS1NaWprX2ouIiFBERITX2nMHQQUAAIuaM2eOkpOTvdZeUlKSJk+e7LX23EFQAQDAokaPHq1+/fq5vX52drY6d+4sSdqwYYMCAwOL1J7VjqZIBBUAACyrqKdisrKynM9jYmIUFBRUGmV5FYNpAQCAZXFEBQAAq1g7pXjbZ5/53/N1z0qB/sXbnyR1Syz+PoqBIyoAAMCyCCoAAMCyOPUDwCnjeLoyTvzq9vpncnOcz39K3Sd/e0CR2nOE1uIGccBFpB3PUNrxU26vn52b53y+4+DPCrRXKVJ7EWHBighzFGmb0kZQAeD01cdL9OnbMz3admbCbUXeptftYxU3dJxH7QEVwZz/26zkt77waNvOE14r8jZJQ6/R5GE9PGqvtBBUADh1um6wWsZe47X2HKG1vNYWUBaN7ttB/TpFe629iLBgr7XlLoIKACdHWDinYgALiQhzWO5UjLcxmBYAAFgWQQUAAFgWp34AlFncWRYo/wgqAMos7iwLlH8EFQBlFneWBco/y
wSVqVOnKjExURMmTNALL7zg63IAlAHcWRYo/ywRVLZs2aI5c+aoVatWvi4FgJVxwzagwvH5t34yMzM1ZMgQvf7666pRo4avywEAABbi8yMqY8aM0XXXXacePXroySefvOi6ubm5ys3NdU5nZGSUdnkALIz7oADln0+DyjvvvKNt27Zpy5Ytbq0/ZcoUr47wB2Bt3AcFKP98FlSOHj2qCRMmaM2aNQoIcO+Oq4mJiUpISHBOZ2RkKDIysrRKBGBx3AcFKP98FlS2bt2q9PR0tW3b1jkvPz9fX375pWbOnKnc3Fz5+fm5bGO322W3271dKgCL4j4oQPnns6DSvXt37d6922XenXfeqSuuuEKTJk0qFFIAAEDF47OgEhwcrJYtW7rMCwoKUlhYWKH5AACgYvL515MBAAAuxOdfT/6zlJQUX5cAAAAshCMqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsir7ugAAqKjS0tKUlpbmtfYiIiIUERHhtfaAkkBQAQAfmTNnjpKTk73WXlJSkiZPnuy19oCSQFABAB8ZPXq0+vXr5/b62dnZ6ty5syRpw4YNCgwMLFJ7JXE0haNA8DaCCgD4SFH/CGdlZTmfx8TEKCgoqDTKuiiOAsHbCCoA4CUz1hwo1va52aedz1/+/HvZA6sWtyTd17NpkdYvi0eBULYRVAAAbiuLR4FQtvH1ZAAAYFkcUQEAH8k4nq6ME7+6vf6Z3Bzn859S98nfHlCk9hyhteQICy/SNoCvEVQAwEe++niJPn17pkfbzky4rcjb9Lp9rOKGjvOoPcBXCCoA4COdrhuslrHXeK09R2gtr7UFlBSCCgD4iCMsnFMxwCUwmBYAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWF3wDAJzf2inF30f2mf89X/esFOhfvP11Syze9ihzOKICAAAsi6ACAAAsi6ACAAAsi6ACAAAsi6ACAAAsi6ACAAAsy6dBZfbs2WrVqpUcDoccDodiY2O1cuVKX5YEAAAsxKdBpX79+po6daq2bt2qb775Rtdcc4369++vb7/91pdlAQAAi/DpBd/69u3rMv3UU09p9uzZ2rRpk1q0aOGjqgAAgFUUOagUFBRo3bp1Wr9+vY4cOaLTp0+rVq1aatOmjXr06KHIyEiPCsnPz9d7772nrKwsxcbGnned3Nxc5ebmOqczMjI8agsAAJQNbp/6yc7O1pNPPqnIyEhde+21WrlypX7//Xf5+fnp4MGDSkpKUqNGjXTttddq06ZNbhewe/duVatWTXa7XXfffbfef/99NW/e/LzrTpkyRSEhIc6Hp6EIAACUDW4fUWnatKliY2P1+uuvq2fPnqpSpUqhdY4cOaJFixbplltu0cMPP6xRo0Zdcr/NmjXTjh07dPLkSS1dulTx8fFat27decNKYmKiEhISnNMZGRmEFQAAyjG3g8qnn36q6Ojoi67ToEEDJSYmauLEifrxxx/d2q+/v78aN24sSWrXrp22bNmiF198UXPmzCm0rt1ul91ud7dkAABQxrl96udSIeXPqlSpossvv9yjggoKClzGoQAAgIrLo2/9rFq1StWqVVPnzp0lSbNmzdLrr7+u5s2ba9asWapRo4Zb+0lMTFSfPn0UFRWlU6dOadGiRUpJSdHq1as9KQsAUMrSjmco7fgpt9fPzs1zPt9x8GcF2gsPG7iYiLBgRYQ5irQNyhePgsoDDzygZ555RtIfg2Hvv/9+JSQkaO3atUpISND8+fPd2k96erqGDh2qtLQ0hYSEqFWrVlq9erV69uzpSVkAgFI25/82K/mtLzzatvOE14q8TdLQazR5WA+P2kP54FFQOXTokHOw67Jly3T99dfr6aef1rZt23Tttde6vZ+5c+d60jwAwEdG9+2gfp3cHwpQXBFhwV5rC9bkUVDx9/fX6dOnJUmfffaZhg4dKkkKDQ3l2iYAUI5FhDk4FQOv8iiodO7cWQkJCbrqqqu0efNmLVmyRJJ04MAB1a9fv0QLBAAAFZdH9/qZOXOmKleurKVLl2r27NmqV6+eJGnlypWKi4sr0QIBAEDF5dERlaioKH300UeF5s+YMaPYBQEAAJxTrJsSpqenKz09XQUFBS7zW7VqVayiAAAAJA+DytatWxUfH699+/bJGCNJstlsMsbIZrMpPz+/RIsEAAAVk0dBZfjw4WratKnmzp2r2rVry2azlXRdAAAAngWVH374QcuWLXPeowcAAKA0ePStn+7du2vnzp0lXQsAAIALj46ovPHGG4qPj9eePXvUsmVLVanieu+Gfv36lUhxAACgYvMoqPz73//Wxo0btXLlykLLGEwLAABKikenfsaNG6fbb79daWlpKigocHkQUgAAQEnxKKgcP35c9913n2rXrl3S9QAAADh5FFQGDBigtWvXlnQtAAAALjwao9K0aVMlJiZqw4YNuvLKKwsNph0/fnyJFAcAACo2j7/1U61aNa1bt07r1q1zWWaz2QgqAACgRHgUVA4dOlTSdQAAABTi0RgVAAAAb3A7qEydOlXZ2dlurfv111/r448/9rgoAAAAqQhBZe/evYqKitI//vEPrVy5Ur/++qtz2dmzZ7Vr1y698sor6tSpkwYPHqzg4OBSKRgAAFQcbo9Reeutt7Rz507NnDlTt912mzIyMuTn5ye73a7Tp09Lktq0aaORI0dq2LBhCggIKLWiAQBAxVCkwbStW7fW66+/rjlz5mjXrl06cuSIsrOzVbNmTcXExKhmzZqlVScAAKiAPPrWT6VKlRQTE6OYmJgSLgcAAOB/+NYPAACwLIIKAACwLIIKAACwLIIKAACwrGIFlYMHD2r16tXOC8EZY0qkKAAAAMnDoHL8+HH16NFDTZs21bXXXqu0tDRJ0ogRI3T//feXaIEAAKDi8iio3HfffapcubJ+/PFHVa1a1Tl/8ODBWrVqVYkVBwAAKjaPrqPy6aefavXq1apfv77L/CZNmujIkSMlUhgAAIBHR1SysrJcjqScc+LECdnt9mIXBQAAIHkYVLp06aK33nrLOW2z2VRQUKBp06apW7duJVYcAACo2Dw69TNt2jR1795d33zzjc6cOaMHH3xQ3377rU6cOKGNGzeWdI0AAKCC8uiISsuWLXXgwAF17txZ/fv3V1ZWlgYMGKDt27fr8ssvL+kaAQBABeXRERVJCgkJ0cMPP1yStQAAALjwOKjk5ORo165dSk9PV0FBgcuyfv36FbswAAAAj4LKqlWrNHToUP33v/8ttMxmsyk/P7/YhQEAAHg0RmXc
uHG6+eablZaWpoKCApcHIQUAAJQUj4LKL7/8ooSEBNWuXbuk6wEAAHDyKKgMHDhQKSkpJVwKAACAK4/GqMycOVM333yz1q9fryuvvFJVqlRxWT5+/PgSKQ4AAFRsHgWVxYsX69NPP1VAQIBSUlJks9mcy2w2G0EFAACUCI+CysMPP6zk5GQ99NBDqlTJo7NHAAAAl+RRUDlz5owGDx5MSAH+Ii0tTWlpaV5rLyIiQhEREV5rDwC8zaOgEh8fryVLluif//xnSdcDlGlz5sxRcnKy19pLSkrS5MmTvdYeAHibR0ElPz9f06ZN0+rVq9WqVatCg2mff/75EikOKGtGjx5dpCszZ2dnq3PnzpKkDRs2KDAwsEjtcTQFQHnnUVDZvXu32rRpI0nas2ePy7I/D6wFiqusnUop6vZZWVnO5zExMQoKCvK4bQAojzwKKmvXri3pOoDz4lQKAFRsHt+UEPAGTqUAQMXmdlAZMGCAFixYIIfDoQEDBlx03eXLlxe7MFQ8M9YcuMCSam7vIzf7f99EW388SPbAqkUr4vgpac8p5+R9PZsWbXsAQIlyO6iEhIQ4x5+EhISUWkHAn2UcT1fGiV/dXv9Mbo7z+U+p++RvDyhSe47QWnKEhRdpGwBA6XE7qMyfP1+PP/64Jk6cqPnz55dmTYDTVx8v0advz/Ro25kJtxV5m163j1Xc0HEetQcAKHlFGqOSnJysu+++W1WrFvFwOuChTtcNVsvYa7zWniO0ltfaAgBcWpGCijGmtOoAzssRFs6pGACowIp8DXyukwIAALylyF9Pbtq06SXDyokTJzwuCAAA4JwiB5Xk5GS+9QNI0topxd9H9pn/PV/3rBToX7z9dUss3vYAYDFFDiq33HKLwsMZMwAAAEpfkcaoMD4FAAB4U5GCCt/6AQAA3lSkoFJQUFCip32mTJmiv/3tbwoODlZ4eLhuuOEG7d+/v8T2DwAAyrYifz25JK1bt05jxozRpk2btGbNGuXl5alXr17KysryZVkAAMAifHr35FWrVrlML1iwQOHh4dq6dav+/ve/+6gqAABgFT4NKn918uRJSVJoaOh5l+fm5io3N9c5nZGR4ZW6AACAb/j01M+fFRQU6N5779VVV12lli1bnnedKVOmKCQkxPmIjIz0cpUAAMCbLBNUxowZoz179uidd9654DqJiYk6efKk83H06FEvVggAALzNEqd+xo4dq48++khffvml6tevf8H17Ha77Ha7FysDAAC+5NOgYozRuHHj9P777yslJUWNGjXyZTkAAMBifBpUxowZo0WLFunDDz9UcHCwjh07JkkKCQlRYGCgL0sDPJJ2PENpx0+5vX52bp7z+Y6DPyvQXqVI7UWEBSsizFGkbQCgLPFpUJk9e7YkqWvXri7z58+fr2HDhnm/IKCY5vzfZiW/9YVH23ae8FqRt0kaeo0mD+vhUXsAUBb4/NQPUJ6M7ttB/TpFe629iLBgr7UFAL5gicG0QHkREebgVAwAlCDLfD0ZAADgrwgqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsggqAADAsir7uoCyJC0tTWlpaV5rLyIiQhEREV5rDwAAqyGoFMGcOXOUnJzstfaSkpI0efJkr7UHAIDVEFSKYPTo0erXr5/b62dnZ6tz586SpA0bNigwMLBI7XE0BQBQ0RFUiqCop2KysrKcz2NiYhQUFFQaZQEAUG4xmBYAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgWQQUAAFgW11G5iBlrDhRr+9zs087nL3/+veyBVYtbku7r2bTY+wAAoKzgiAoAALAsggoAALAsggoAALAsggoAALAsggoAALAsggoAALAsggoAALAsggoAALAsLvhWBBnH05Vx4le31z+Tm+N8/lPqPvnbA4rUniO0lhxh4UXaBgCA8oSgUgRffbxEn74906NtZybcVuRtet0+VnFDx3nUHgAA5QFBpQg6XTdYLWOv8Vp7jtBaXmsLAAArIqgUgSMsnFMxAAB4EYNpAQCAZRFUAACAZRFUAACAZRFUAACAZTGYtpxLS0tTWlqa19qLiIhQRESE19oDAJRvBJVybs6cOUpOTvZae0lJSZo8ebLX2gMAlG8ElfJk7ZRCs26od1xN/jnI7V3knjmrEc8ulyTNnThAdv+ifURa1DvuWke3xCJtDwDAn/k0qHz55ZeaPn26tm7dqrS0NL3//vu64YYbfFlSufPBhr1KfusLj7Y9F1iKImnoNYppXNej9gAA+CufBpWsrCy1bt1aw4cP14ABA3xZSrk1um8H9esU7bX2IsKCvdYWAKD882lQ6dOnj/r06ePLEsq9iDCHIsIcvi4DAACPlKkxKrm5ucrNzXVOZ2Rk+LAaAABQ2srUdVSmTJmikJAQ5yMyMtLXJQEAgFJUpoJKYmKiTp486XwcPXrU1yUBAIBSVKZO/djtdtntdl+XAQAAvKRMHVEBAAAVi0+PqGRmZurgwYPO6UOHDmnHjh0KDQ1VVFSUDysDAABW4NOg8s0336hbt27O6YSEBElSfHy8FixY4KOqAACAVfg0qHTt2lXGGF+WAAAALIwxKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIIKgAAwLIsEVRmzZqlhg0bKiAgQB07dtTmzZt9XRIAALAAnweVJUuWKCEhQUlJSdq2bZtat26t3r17Kz093delAQAAH/N5UHn++ec1atQo3XnnnWrevLleffVVVa1aVfPmzfN1aQAAwMcq+7LxM2fOaOvWrUpMTHTOq1Spknr06KF///vfhdbPzc1Vbm6uc/rkyZOSpIyMjFKpLycrs1T2WxwXfa1ZOd4rxF1FeG/o7xJQnvtbos9LAZ9x7yrvn3H3d/nHPo0xl17Z+NBPP/1kJJmvvvrKZf4DDzxgOnToUGj
9pKQkI4kHDx48ePDgUQ4eR48evWRW8OkRlaJKTExUQkKCc7qgoEAnTpxQWFiYbDabDyvzvoyMDEVGRuro0aNyOBy+LqdCoM+9i/72PvrcuypyfxtjdOrUKdWtW/eS6/o0qNSsWVN+fn765ZdfXOb/8ssvqlOnTqH17Xa77Ha7y7zq1auXZomW53A4KtwH3Nfoc++iv72PPveuitrfISEhbq3n08G0/v7+ateunT7//HPnvIKCAn3++eeKjY31YWUAAMAKfH7qJyEhQfHx8Wrfvr06dOigF154QVlZWbrzzjt9XRoAAPAxnweVwYMH69dff9Vjjz2mY8eOKSYmRqtWrVLt2rV9XZql2e12JSUlFToVhtJDn3sX/e199Ll30d/usRnjzneDAAAAvM/nF3wDAAC4EIIKAACwLIIKAACwLIKKjxw+fFg2m007duy44DoLFiyo8NeJ8RT96130t7VZ6f1xp5aywEp96imbzaYPPvjA12VcEkHFC4YNG6YbbrihyNsNHjxYBw4cKFbb536Yzvd47733nOv9+OOPuu6661S1alWFh4frgQce0NmzZ4vVtrf4sn8l6bXXXlPXrl3lcDhks9n0+++/n3e9jz/+WB07dlRgYKBq1KhRqOay8h74ur/PMcaoT58+5/1lO378eLVr1052u10xMTHn3X7Xrl3q0qWLAgICFBkZqWnTppVYbb7k6/fH3Z+HsqSs9+nUqVNls9l07733FrsWXyCoWFhgYKDCw8OLtY/IyEilpaW5PJKTk1WtWjX16dNHkpSfn6/rrrtOZ86c0VdffaU333xTCxYs0GOPPVYSL8OySqJ/Jen06dOKi4vTP//5zwuus2zZMt1xxx268847tXPnTm3cuFG33Xabc3lFeA9Kqr/PeeGFFy5664zhw4dr8ODB512WkZGhXr16qUGDBtq6daumT5+uyZMn67XXXiux+soab/48VBRW6NMtW7Zozpw5atWqVbHr8JkSubtgOXf11VebMWPGmDFjxhiHw2HCwsLMI488YgoKCkxycrJp0aJFoW1at25tHnnkkfPeSHHt2rXm0KFDRpJZtmyZ6dq1qwkMDDStWrVyuUHj/PnzTUhIiHP64MGDpl+/fiY8PNwEBQWZ9u3bmzVr1hT59cTExJjhw4c7pz/55BNTqVIlc+zYMee82bNnG4fDYXJzc40xf9wQsnXr1mbu3LkmMjLSBAUFmXvuucecPXvWPPPMM6Z27dqmVq1a5sknnyxyPeWlf9euXWskmd9++81lfl5enqlXr5554403LritN9+D8tDf27dvN/Xq1TNpaWlGknn//ffPu965PvurV155xdSoUcPZt8YYM2nSJNOsWTPndHx8vOnfv7956qmnTHh4uAkJCTHJyckmLy/PTJw40dSoUcPUq1fPzJs3z62a3VUe3h9jLvzzcK6WxYsXm9jYWGO3202LFi1MSkpKkfvKXeW9Ty/k1KlTpkmTJmbNmjXm6quvNhMmTHBZLsm88sorJi4uzgQEBJhGjRqZ9957z+16vIWg4oarr77aVKtWzUyYMMF899135u233zZVq1Y1r732mjl69KipVKmS2bx5s3P9bdu2GZvNZlJTU82pU6fMoEGDTFxcnElLSzNpaWkmNzfX+SG/4oorzEcffWT2799vBg4caBo0aGDy8vKMMYU/5Dt27DCvvvqq2b17tzlw4IB55JFHTEBAgDly5Ijbr+Wbb74xkszGjRud8x599NFCv8x/+OEHI8ls27bNGPPHL/xq1aqZgQMHmm+//dasWLHC+Pv7m969e5tx48aZ7777zsybN89IMps2baqQ/XuhXyJff/21kWTmzZtnYmJiTJ06dUxcXJzZvXu3cx1vvgdlvb+zsrJMdHS0+eCDD4wxxqOgcscdd5j+/fu7zPviiy+MJHPixAljzB9BJTg42IwZM8Z89913Zu7cuUaS6d27t3nqqafMgQMHzBNPPGGqVKni1h1g3VXW359zLhVU6tevb5YuXWr27t1rRo4caYKDg81///vfYvff+ZT3Pr2QoUOHmnvvvdfZB+cLKmFhYeb11183+/fvN4888ojx8/Mze/fudWv/3kJQccPVV19toqOjTUFBgXPepEmTTHR0tDHGmD59+ph77rnHuWzcuHGma9euzulz/5n92bkP+Z//y/7222+NJLNv3z5jTOEP+fm0aNHCvPzyy26/lnvuucdZ9zmjRo0yvXr1cpmXlZVlJJlPPvnEGPPHL/yqVauajIwM5zq9e/c2DRs2NPn5+c55zZo1M1OmTHG7HmPKT/9e6JfI4sWLjSQTFRVlli5dar755htz6623mrCwMHP8+HFjjHffg7Le33fddZcZMWKEc9qToNKzZ09z1113ucw7V++5X9Lx8fGmQYMGhfq2S5cuzumzZ8+aoKAgs3jx4ovWXBRl/f0551JBZerUqc55eXl5pn79+uaZZ55xa99FVd779HwWL15sWrZsabKzs40xFw4qd999t8u8jh07uvSFFTBGxU3/7//9P5fz4bGxsfr++++Vn5+vUaNGafHixcrJydGZM2e0aNEiDR8+3K39/vm8YUREhCQpPT39vOtmZmZq4sSJio6OVvXq1VWtWjXt27dPP/74oyTp6aefVrVq1ZyPc/PPyc7O1qJFizRixIgivfZzGjZsqODgYOd07dq11bx5c1WqVMll3oXqv5jy0L8XUlBQIEl6+OGHddNNN6ldu3aaP39+oQHN7iip96Cs9veKFSv0xRdf6IUXXnCrnuJq0aJFob698sorndN+fn4KCwvz6DN/MWX1/SmKP994tnLlymrfvr327dtXpH0URUXo03OOHj2qCRMmaOHChQoICLjoun+9AXBsbGypvg+e8Pm9fsqDvn37ym636/3335e/v7/y8vI0cOBAt7atUqWK8/m5H6Jzf9j+auLEiVqzZo2effZZNW7cWIGBgRo4cKDOnDkjSbr77rs1aNAg5/p169Z12X7p0qU6ffq0hg4d6jK/Tp062rx5s8u8X375xbnsfLWeq/d88y5Uv6fKSv9eyLlfXs2bN3fOs9vtuuyyy5y/iKz0Hli5v59//nmlpqYW+srnTTfdpC5duiglJcWtOuvUqePs33Os9Jm/GCu/P2VVeevTrVu3Kj09XW3btnXOy8/P15dffqmZM2cqNzdXfn5+Hu3bFwgqbvr6669dpjdt2qQmTZo43+z4+HjNnz9f/v7+uuWWWxQYGOhc19/fX/n5+cWuYePGjRo2bJhuvPFGSX+k88OHDzuXh4aGKjQ09ILbz507V/369VOtWrVc5sfGxuqpp55Senq6c4T6mjVr5HA4XP64lqby0L8Xcu5rsvv371fnzp0lSXl5eTp8+LAaNGggyfvvQVnt74ceekgjR450mXfllVdqxowZ6tu3r9ttx8bG6uGHH1ZeXp7zD82aNWvUrFkz1ahRw8NXVHLK6vtTFJs2bdLf//53SdLZs2e1detWjR07tlg1X0xF6NNzunfvrt27d7vMu/POO3XFFVdo0qRJLiFl06ZNLv+8btq0SW3atCl2DSWJoO
KmH3/8UQkJCRo9erS2bduml19+Wc8995xz+ciRIxUdHS3pjw/jnzVs2FCrV6/W/v37FRYWppCQEI9qaNKkiZYvX66+ffvKZrPp0Ucfdfs/uYMHD+rLL7/UJ598UmhZr1691Lx5c91xxx2aNm2ajh07pkceeURjxozx2l09y3L/Hjt2TMeOHdPBgwclSbt371ZwcLCioqIUGhoqh8Ohu+++W0lJSYqMjFSDBg00ffp0SdLNN98syfvvQVnt7zp16rgc8TgnKipKjRo1ck4fPHhQmZmZOnbsmLKzs50X5WrevLn8/f112223KTk5WSNGjNCkSZO0Z88evfjii5oxY4ZHr6WkldX3R7r0z8M5s2bNUpMmTRQdHa0ZM2bot99+c/t0iyfKe592795dN954o8aOHavg4GC1bNnSZR9BQUEKCwsrNP+9995T+/bt1blzZy1cuFCbN2/W3LlzPXp9pYWg4qahQ4cqOztbHTp0kJ+fnyZMmKC77rrLubxJkybq1KmTTpw4oY4dO7psO2rUKKWkpKh9+/bKzMzU2rVr1bBhwyLX8Pzzz2v48OHq1KmTatasqUmTJikjI8OtbefNm6f69eurV69ehZb5+fnpo48+0j333KPY2FgFBQUpPj5ejz/+eJFr9FRZ7t9XX31VycnJzulz/yXOnz9fw4YNkyRNnz5dlStX1h133KHs7Gx17NhRX3zxhfO/d2+/B2W5v90xcuRIrVu3zjl97j/EQ4cOqWHDhgoJCdGnn36qMWPGqF27dqpZs6Yee+wxlz7wpbL8/rjz8yD9cRGyqVOnaseOHWrcuLFWrFihmjVrFrlOd5X3Pk1NTdV///vfIteUnJysd955R//4xz8UERGhxYsXe+1Iutt8PZq3LDjfaOm/KigoMJdffrl57rnnvFNUOUL/ehf9bW28PyWPPi3bOKJSAn799Ve98847OnbsmO68805fl1Pu0L/eRX9bG+9PyaNPrY2gUgLCw8NVs2ZNvfbaa5YYiFfe0L/eRX9bG+9PyaNPrc1mjDG+LgIAAOB8uOAbAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKgDIlJSVFNptNv//+u9vbNGzY0Gt3XAZQsggqAErUsGHDZLPZdPfddxdaNmbMGNlsNpdLqQPAxRBUAJS4yMhIvfPOO8rOznbOy8nJ0aJFixQVFeXDygCUNQQVACWubdu2ioyM1PLly53zli9frqioKJdbyOfm5mr8+PEKDw9XQECAOnfurC1btrjs65NPPlHTpk0VGBiobt266fDhw4Xa27Bhg7p06aLAwEBFRkZq/PjxysrKKrXXB8B7CCoASsXw4cM1f/585/S8efMK3UflwQcf1LJly/Tmm29q27Ztaty4sXr37q0TJ05Iko4ePaoBAwaob9++2rFjh0aOHKmHHnrIZR+pqamKi4vTTTfdpF27dmnJkiXasGGDxo4dW/ovEkCpI6gAKBW33367NmzYoCNHjujIkSPauHGjbr/9dufyrKwszZ49W9OnT1efPn3UvHlzvf766woMDNTcuXMlSbNnz9bll1+u5557Ts2aNdOQIUMKjW+ZMmWKhgwZonvvvVdNmjRRp06d9NJLL+mtt95STk6ON18ygFLATQkBlIpatWrpuuuu04IFC2SM0XXXXaeaNWs6l6empiovL09XXXWVc16VKlXUoUMH7du3T5K0b98+dezY0WW/sbGxLtM7d+7Url27tHDhQuc8Y4wKCgp06NAhRUdHl8bLA+AlBBUApWb48OHOUzCzZs0qlTYyMzM1evRojR8/vtAyBu4CZR9BBUCpiYuL05kzZ2Sz2dS7d2+XZZdffrn8/f21ceNGNWjQQJKUl5enLVu26N5775UkRUdHa8WKFS7bbdq0yWW6bdu22rt3rxo3blx6LwSAzzBGBUCp8fPz0759+7R37175+fm5LAsKCtI999yjBx54QKtWrdLevXs1atQonT59WiNGjJAk3X333fr+++/1wAMPaP/+/Vq0aJEWLFjgsp9Jkybpq6++0tixY7Vjxw59//33+vDDDxlMC5QTBBUApcrhcMjhcJx32dSpU3XTTTfpjjvuUNu2bXXw4EGtXr1aNWrUkPTHqZtly5bpgw8+UOvWrfXqq6/q6aefdtlHq1attG7dOh04cEBdunRRmzZt9Nhjj6lu3bql/toAlD6bMcb4uggAAIDz4YgKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwLIIKAACwrP8POdGutrnzE2MAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create the figure and axes objects\n", + "fig, ax = plt.subplots()\n", + "# Create the bar plot with error bars\n", + "ax.bar(\n", + " np.arange(len(ds_means)) - 0.24,\n", + " ds_means, yerr=ds_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Deepspeed')\n", + "ax.bar(\n", + " np.arange(len(hf_means)) + 0.24,\n", + " hf_means, yerr=hf_std, align='center', alpha=0.5, ecolor='black', capsize=10, width=0.4, label='Huggingface')\n", + "\n", + "# Set the x-axis tick labels to be the index of the values list\n", + "ax.set_xticks(np.arange(len(models)))\n", + "ax.set_xticklabels(models)\n", + "\n", + "# Set the labels and title\n", + "ax.set_xlabel('Model')\n", + "ax.set_ylabel('Time (ms)')\n", + "\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b45dcc00-a07d-435a-b424-ce5abc3ff23a", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'data/gpt2-vocab.json'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 14\u001b[0m\n\u001b[1;32m 9\u001b[0m neox_args \u001b[39m=\u001b[39m NeoXArgs\u001b[39m.\u001b[39mfrom_ymls(\n\u001b[1;32m 10\u001b[0m [\u001b[39m\"\u001b[39m\u001b[39m/home/mchorse/gpt-neox/configs/19M.yml\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 11\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m/home/mchorse/gpt-neox/configs/local_setup.yml\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 12\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m/home/mchorse/gpt-neox/configs/benchmarking.yml\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 13\u001b[0m neox_args\u001b[39m.\u001b[39mconfigure_distributed_args()\n\u001b[0;32m---> 14\u001b[0m neox_args\u001b[39m.\u001b[39;49mbuild_tokenizer()\n", + "File \u001b[0;32m~/gpt-neox/megatron/neox_arguments/arguments.py:147\u001b[0m, in \u001b[0;36mNeoXArgs.build_tokenizer\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mbuild_tokenizer\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m--> 147\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenizer \u001b[39m=\u001b[39m build_tokenizer(\u001b[39mself\u001b[39;49m)\n", + "File \u001b[0;32m~/gpt-neox/megatron/tokenizer/tokenizer.py:40\u001b[0m, in \u001b[0;36mbuild_tokenizer\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[39massert\u001b[39;00m args\u001b[39m.\u001b[39mvocab_file \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[39massert\u001b[39;00m args\u001b[39m.\u001b[39mmerge_file \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m---> 40\u001b[0m tokenizer \u001b[39m=\u001b[39m _GPT2BPETokenizer(args\u001b[39m.\u001b[39;49mvocab_file, args\u001b[39m.\u001b[39;49mmerge_file)\n\u001b[1;32m 41\u001b[0m \u001b[39melif\u001b[39;00m args\u001b[39m.\u001b[39mtokenizer_type\u001b[39m.\u001b[39mlower() \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mSPMTokenizer\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mlower():\n\u001b[1;32m 42\u001b[0m \u001b[39massert\u001b[39;00m args\u001b[39m.\u001b[39mvocab_file \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n", + "File 
\u001b[0;32m~/gpt-neox/megatron/tokenizer/tokenizer.py:157\u001b[0m, in \u001b[0;36m_GPT2BPETokenizer.__init__\u001b[0;34m(self, vocab_file, merge_file)\u001b[0m\n\u001b[1;32m 154\u001b[0m name \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mGPT2 BPE\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 155\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__init__\u001b[39m(name)\n\u001b[0;32m--> 157\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenizer \u001b[39m=\u001b[39m GPT2Tokenizer(\n\u001b[1;32m 158\u001b[0m vocab_file, merge_file, errors\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mreplace\u001b[39;49m\u001b[39m\"\u001b[39;49m, special_tokens\u001b[39m=\u001b[39;49m[], max_len\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m\n\u001b[1;32m 159\u001b[0m )\n\u001b[1;32m 160\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39meod_id \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtokenizer\u001b[39m.\u001b[39mencoder[\u001b[39m\"\u001b[39m\u001b[39m<|endoftext|>\u001b[39m\u001b[39m\"\u001b[39m]\n", + "File \u001b[0;32m~/gpt-neox/megatron/tokenizer/gpt2_tokenization.py:188\u001b[0m, in \u001b[0;36mGPT2Tokenizer.__init__\u001b[0;34m(self, vocab_file, merges_file, errors, special_tokens, max_len)\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[1;32m 180\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 181\u001b[0m vocab_file,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 185\u001b[0m max_len\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m,\n\u001b[1;32m 186\u001b[0m ):\n\u001b[1;32m 187\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmax_len \u001b[39m=\u001b[39m max_len \u001b[39mif\u001b[39;00m max_len \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m \u001b[39mint\u001b[39m(\u001b[39m1e12\u001b[39m)\n\u001b[0;32m--> 188\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mencoder \u001b[39m=\u001b[39m json\u001b[39m.\u001b[39mload(\u001b[39mopen\u001b[39;49m(vocab_file))\n\u001b[1;32m 189\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdecoder \u001b[39m=\u001b[39m {v: k \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mencoder\u001b[39m.\u001b[39mitems()}\n\u001b[1;32m 190\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39merrors \u001b[39m=\u001b[39m errors \u001b[39m# how to handle errors in decoding\u001b[39;00m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/gpt2-vocab.json'" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.insert(0, '/home/mchorse/gpt-neox')\n", + "from megatron.text_generation_utils import generate_samples_from_prompt\n", + "from megatron.utils import print_rank_0, setup_for_inference_or_eval\n", + "from megatron.neox_arguments import NeoXArgs\n", + "from megatron.training import setup_model_and_optimizer\n", + "from megatron.initialize import initialize_megatron\n", + "\n", + "neox_args = NeoXArgs.from_ymls(\n", + " [\"/home/mchorse/gpt-neox/configs/19M.yml\",\n", + " \"/home/mchorse/gpt-neox/configs/local_setup.yml\",\n", + " \"/home/mchorse/gpt-neox/configs/benchmarking.yml\"])\n", + "neox_args.configure_distributed_args()\n", + "neox_args.build_tokenizer()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f1833a4c", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "unsupported operand type(s) for %: 'NoneType' and 'int'", + 
"output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[22], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m use_cache \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 2\u001b[0m initialize_megatron(neox_args)\n\u001b[0;32m----> 3\u001b[0m model, _, _ \u001b[39m=\u001b[39m setup_model_and_optimizer(\n\u001b[1;32m 4\u001b[0m neox_args\u001b[39m=\u001b[39;49mneox_args,\n\u001b[1;32m 5\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 6\u001b[0m iteration\u001b[39m=\u001b[39;49mneox_args\u001b[39m.\u001b[39;49miteration,\n\u001b[1;32m 7\u001b[0m ) \u001b[39m# we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed\u001b[39;00m\n\u001b[1;32m 8\u001b[0m print_rank_0(\u001b[39m\"\u001b[39m\u001b[39mFinished loading model\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 10\u001b[0m model\u001b[39m.\u001b[39mmodule\u001b[39m.\u001b[39minference_mode(use_cache\u001b[39m=\u001b[39muse_cache)\n", + "File \u001b[0;32m~/gpt-neox/megatron/training.py:600\u001b[0m, in \u001b[0;36msetup_model_and_optimizer\u001b[0;34m(neox_args, use_cache, iteration)\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39msetup_model_and_optimizer\u001b[39m(neox_args, use_cache\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, iteration\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[1;32m 599\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Setup model and optimizer.\"\"\"\u001b[39;00m\n\u001b[0;32m--> 600\u001b[0m model \u001b[39m=\u001b[39m get_model(neox_args\u001b[39m=\u001b[39;49mneox_args, use_cache\u001b[39m=\u001b[39;49muse_cache)\n\u001b[1;32m 601\u001b[0m optimizer, param_groups \u001b[39m=\u001b[39m get_optimizer(model\u001b[39m=\u001b[39mmodel, neox_args\u001b[39m=\u001b[39mneox_args)\n\u001b[1;32m 602\u001b[0m lr_scheduler \u001b[39m=\u001b[39m get_learning_rate_scheduler(optimizer\u001b[39m=\u001b[39moptimizer, neox_args\u001b[39m=\u001b[39mneox_args)\n", + "File \u001b[0;32m~/gpt-neox/megatron/training.py:388\u001b[0m, in \u001b[0;36mget_model\u001b[0;34m(neox_args, use_cache)\u001b[0m\n\u001b[1;32m 386\u001b[0m old_use_mup \u001b[39m=\u001b[39m neox_args\u001b[39m.\u001b[39muse_mup\n\u001b[1;32m 387\u001b[0m neox_args\u001b[39m.\u001b[39muse_mup \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m\n\u001b[0;32m--> 388\u001b[0m model \u001b[39m=\u001b[39m GPT2ModelPipe(\n\u001b[1;32m 389\u001b[0m neox_args\u001b[39m=\u001b[39;49mneox_args,\n\u001b[1;32m 390\u001b[0m num_tokentypes\u001b[39m=\u001b[39;49m\u001b[39m0\u001b[39;49m,\n\u001b[1;32m 391\u001b[0m parallel_output\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m,\n\u001b[1;32m 392\u001b[0m topology\u001b[39m=\u001b[39;49mmpu\u001b[39m.\u001b[39;49mget_topology(),\n\u001b[1;32m 393\u001b[0m use_cache\u001b[39m=\u001b[39;49muse_cache,\n\u001b[1;32m 394\u001b[0m )\n\u001b[1;32m 396\u001b[0m \u001b[39m### soft prompt tuning stuff ###\u001b[39;00m\n\u001b[1;32m 397\u001b[0m \u001b[39mif\u001b[39;00m neox_args\u001b[39m.\u001b[39msoft_prompt_tuning \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m neox_args\u001b[39m.\u001b[39msoft_prompt_tuning\u001b[39m.\u001b[39mget(\n\u001b[1;32m 398\u001b[0m \u001b[39m\"\u001b[39m\u001b[39menabled\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mFalse\u001b[39;00m\n\u001b[1;32m 399\u001b[0m ):\n", + "File 
\u001b[0;32m~/gpt-neox/megatron/model/gpt2_model.py:123\u001b[0m, in \u001b[0;36mGPT2ModelPipe.__init__\u001b[0;34m(self, neox_args, num_tokentypes, parallel_output, topology, use_cache)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mspecs \u001b[39m=\u001b[39m []\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minit_specs() \u001b[39m# initializes the layer specs (basically a fancy nn.Sequential)\u001b[39;00m\n\u001b[0;32m--> 123\u001b[0m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__init__\u001b[39;49m(\n\u001b[1;32m 124\u001b[0m layers\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mspecs,\n\u001b[1;32m 125\u001b[0m loss_fn\u001b[39m=\u001b[39;49mpartial(cross_entropy, _fp16\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mfp16_lm_cross_entropy),\n\u001b[1;32m 126\u001b[0m topology\u001b[39m=\u001b[39;49mtopology,\n\u001b[1;32m 127\u001b[0m activation_checkpoint_interval\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mcheckpoint_num_layers\n\u001b[1;32m 128\u001b[0m \u001b[39mif\u001b[39;49;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mcheckpoint_activations\n\u001b[1;32m 129\u001b[0m \u001b[39melse\u001b[39;49;00m \u001b[39m0\u001b[39;49m,\n\u001b[1;32m 130\u001b[0m partition_method\u001b[39m=\u001b[39;49mneox_args\u001b[39m.\u001b[39;49mpipe_partition_method,\n\u001b[1;32m 131\u001b[0m checkpointable_layers\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mGMLPBlock\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mParallelTransformerLayerPipe\u001b[39;49m\u001b[39m\"\u001b[39;49m],\n\u001b[1;32m 132\u001b[0m )\n", + "File \u001b[0;32m~/DeepSpeed/deepspeed/runtime/pipe/module.py:196\u001b[0m, in \u001b[0;36mPipelineModule.__init__\u001b[0;34m(self, layers, num_stages, topology, loss_fn, seed_layers, seed_fn, base_seed, partition_method, activation_checkpoint_interval, activation_checkpoint_func, checkpointable_layers)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtied_weight_attrs \u001b[39m=\u001b[39m {}\n\u001b[1;32m 191\u001b[0m \u001b[39m# Offset the random seed by the stage ID.\u001b[39;00m\n\u001b[1;32m 192\u001b[0m \u001b[39m#newseed = get_accelerator().initial_seed() + self._grid.get_stage_id()\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[39m#ds_utils.set_random_seed(newseed)\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \n\u001b[1;32m 195\u001b[0m \u001b[39m#with torch.random.fork_rng(devices=[get_accelerator().current_device_name()]):\u001b[39;00m\n\u001b[0;32m--> 196\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_build()\n\u001b[1;32m 197\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mto(get_accelerator()\u001b[39m.\u001b[39mdevice_name(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlocal_rank))\n\u001b[1;32m 199\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtied_comms \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_index_tied_modules()\n", + "File \u001b[0;32m~/DeepSpeed/deepspeed/runtime/pipe/module.py:243\u001b[0m, in \u001b[0;36mPipelineModule._build\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 241\u001b[0m \u001b[39m# LayerSpec objects contain an nn.Module that should be allocated now.\u001b[39;00m\n\u001b[1;32m 242\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(layer, 
LayerSpec):\n\u001b[0;32m--> 243\u001b[0m module \u001b[39m=\u001b[39m layer\u001b[39m.\u001b[39;49mbuild()\n\u001b[1;32m 244\u001b[0m name \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(layer_idx)\n\u001b[1;32m 245\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mforward_funcs\u001b[39m.\u001b[39mappend(module)\n", + "File \u001b[0;32m~/DeepSpeed/deepspeed/runtime/pipe/module.py:70\u001b[0m, in \u001b[0;36mLayerSpec.build\u001b[0;34m(self, log)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[39mif\u001b[39;00m log:\n\u001b[1;32m 68\u001b[0m logger\u001b[39m.\u001b[39minfo(\u001b[39mf\u001b[39m\u001b[39m'\u001b[39m\u001b[39mRANK=\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mglobal_rank\u001b[39m}\u001b[39;00m\u001b[39m building \u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mrepr\u001b[39m(\u001b[39mself\u001b[39m)\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 70\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtypename(\u001b[39m*\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodule_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodule_kwargs)\n", + "File \u001b[0;32m~/gpt-neox/megatron/model/word_embeddings.py:58\u001b[0m, in \u001b[0;36mEmbedding.__init__\u001b[0;34m(self, neox_args, hidden_size, vocab_size, max_sequence_length, embedding_dropout_prob, init_method, num_tokentypes, use_pos_emb)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmup_rp_embedding_mult \u001b[39m=\u001b[39m neox_args\u001b[39m.\u001b[39mmup_rp_embedding_mult\n\u001b[1;32m 57\u001b[0m \u001b[39m# Word embeddings (parallel).\u001b[39;00m\n\u001b[0;32m---> 58\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mword_embeddings \u001b[39m=\u001b[39m mpu\u001b[39m.\u001b[39;49mVocabParallelEmbedding(\n\u001b[1;32m 59\u001b[0m neox_args\u001b[39m=\u001b[39;49mneox_args,\n\u001b[1;32m 60\u001b[0m num_embeddings\u001b[39m=\u001b[39;49mvocab_size,\n\u001b[1;32m 61\u001b[0m embedding_dim\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhidden_size,\n\u001b[1;32m 62\u001b[0m init_method\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49minit_method,\n\u001b[1;32m 63\u001b[0m )\n\u001b[1;32m 64\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_word_embeddings_key \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mword_embeddings\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 66\u001b[0m \u001b[39mif\u001b[39;00m neox_args\u001b[39m.\u001b[39muse_bnb_optimizer:\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/layers.py:126\u001b[0m, in \u001b[0;36mVocabParallelEmbedding.__init__\u001b[0;34m(self, neox_args, num_embeddings, embedding_dim, init_method)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmodel_parallel_size \u001b[39m=\u001b[39m get_model_parallel_world_size()\n\u001b[1;32m 122\u001b[0m \u001b[39m# Divide the weight matrix along the vocabulary dimension.\u001b[39;00m\n\u001b[1;32m 123\u001b[0m (\n\u001b[1;32m 124\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_start_index,\n\u001b[1;32m 125\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_end_index,\n\u001b[0;32m--> 126\u001b[0m ) \u001b[39m=\u001b[39m VocabUtility\u001b[39m.\u001b[39;49mvocab_range_from_global_vocab_size(\n\u001b[1;32m 127\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mnum_embeddings, get_model_parallel_rank(), 
\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel_parallel_size\n\u001b[1;32m 128\u001b[0m )\n\u001b[1;32m 129\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnum_embeddings_per_partition \u001b[39m=\u001b[39m (\n\u001b[1;32m 130\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_end_index \u001b[39m-\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mvocab_start_index\n\u001b[1;32m 131\u001b[0m )\n\u001b[1;32m 132\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minit_method \u001b[39m=\u001b[39m init_method\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/utils.py:71\u001b[0m, in \u001b[0;36mVocabUtility.vocab_range_from_global_vocab_size\u001b[0;34m(global_vocab_size, rank, world_size)\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[39m@staticmethod\u001b[39m\n\u001b[1;32m 70\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mvocab_range_from_global_vocab_size\u001b[39m(global_vocab_size, rank, world_size):\n\u001b[0;32m---> 71\u001b[0m per_partition_vocab_size \u001b[39m=\u001b[39m divide(global_vocab_size, world_size)\n\u001b[1;32m 72\u001b[0m \u001b[39mreturn\u001b[39;00m VocabUtility\u001b[39m.\u001b[39mvocab_range_from_per_partition_vocab_size(\n\u001b[1;32m 73\u001b[0m per_partition_vocab_size, rank, world_size\n\u001b[1;32m 74\u001b[0m )\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/utils.py:32\u001b[0m, in \u001b[0;36mdivide\u001b[0;34m(numerator, denominator)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdivide\u001b[39m(numerator, denominator):\n\u001b[1;32m 30\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Ensure that numerator is divisible by the denominator and return\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[39m the division value.\"\"\"\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m ensure_divisibility(numerator, denominator)\n\u001b[1;32m 33\u001b[0m \u001b[39mreturn\u001b[39;00m numerator \u001b[39m/\u001b[39m\u001b[39m/\u001b[39m denominator\n", + "File \u001b[0;32m~/gpt-neox/megatron/mpu/utils.py:24\u001b[0m, in \u001b[0;36mensure_divisibility\u001b[0;34m(numerator, denominator)\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mensure_divisibility\u001b[39m(numerator, denominator):\n\u001b[1;32m 23\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Ensure that numerator is divisible by the denominator.\"\"\"\u001b[39;00m\n\u001b[0;32m---> 24\u001b[0m \u001b[39massert\u001b[39;00m numerator \u001b[39m%\u001b[39;49m denominator \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m is not divisible by \u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(\n\u001b[1;32m 25\u001b[0m numerator, denominator\n\u001b[1;32m 26\u001b[0m )\n", + "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for %: 'NoneType' and 'int'" + ] + } + ], + "source": [ + "use_cache = True\n", + "initialize_megatron(neox_args)\n", + "model, _, _ = setup_model_and_optimizer(\n", + " neox_args=neox_args,\n", + " use_cache=use_cache,\n", + " iteration=neox_args.iteration,\n", + " ) # we use setup_model_and_optimizer instead of get_model in order to initialize deepspeed\n", + "print_rank_0(\"Finished loading model\")\n", + "\n", + "model.module.inference_mode(use_cache=use_cache)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "59662052", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NeoXArgs(distributed_backend='nccl', local_rank=None, rank=None, lazy_mpu_init=False, short_seq_prob=0.1, eod_mask_loss=False, 
adlr_autoresume=False, adlr_autoresume_interval=1000, seed=1234, onnx_safe=False, deepscale=False, deepscale_config=None, deepspeed_mpi=False, deepspeed_slurm=False, user_script=None, iteration=None, do_train=None, do_valid=None, do_test=None, save_iters=[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], global_num_gpus=8, text_gen_type='unconditional', temperature=0.0, top_p=0.0, top_k=0, return_logits=False, maximum_tokens=64, prompt_end='\\n', sample_input_file=None, sample_output_file='samples.txt', num_samples=1, recompute=False, eval_results_prefix='', eval_tasks=None, use_wandb=True, wandb_group=None, wandb_team=None, wandb_project='neox', wandb_host='https://api.wandb.ai', wandb_init_all_ranks=False, git_hash='b0e9745', log_dir='logs', tensorboard_dir='tensorboard', log_interval=10, log_grad_pct_zeros=False, log_param_norm=False, log_grad_norm=False, log_optimizer_states=False, log_gradient_noise_scale=False, gradient_noise_scale_n_batches=5, gradient_noise_scale_cpu_offload=False, pipe_parallel_size=1, model_parallel_size=1, pipe_partition_method='type:transformer|mlp', world_size=None, is_pipe_parallel=False, data_path='data/enwik8/enwik8_text_document', use_shared_fs=True, train_data_paths=None, test_data_paths=None, valid_data_paths=None, train_data_weights=None, valid_data_weights=None, test_data_weights=None, weight_by_num_documents=False, weighted_sampler_alpha=0.3, data_impl='mmap', mmap_warmup=False, save='checkpoints', config_files={'19M.yml': '{\\n \"pipe-parallel-size\": 1,\\n \"model-parallel-size\": 1,\\n\\n # model settings\\n \"num-layers\": 6,\\n \"hidden-size\": 512,\\n \"num-attention-heads\": 8,\\n \"seq-length\": 2048,\\n \"max-position-embeddings\": 2048,\\n \"pos-emb\": \"rotary\",\\n \"no-weight-tying\": true,\\n \"gpt-j-residual\": false,\\n \"output-layer-parallelism\": \"column\",\\n\\n \"scaled-upper-triang-masked-softmax-fusion\": false,\\n \"bias-gelu-fusion\": false,\\n\\n # init methods\\n \"init_method\": \"small_init\",\\n \"output_layer_init_method\": \"wang_init\",\\n\\n \"optimizer\": {\\n \"type\": \"Adam\",\\n \"params\": {\\n \"lr\": 0.001,\\n \"betas\": [0.9, 0.95],\\n \"eps\": 1.0e-8,\\n }\\n },\\n \"min_lr\": 0.0001,\\n\\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\\n \"zero_optimization\": {\\n \"stage\": 1,\\n \"allgather_partitions\": True,\\n \"allgather_bucket_size\": 500000000,\\n \"overlap_comm\": True,\\n \"reduce_scatter\": True,\\n \"reduce_bucket_size\": 500000000,\\n 
\"contiguous_gradients\": True,\\n },\\n\\n \"train_micro_batch_size_per_gpu\": 4, #32,\\n \"gas\": 1,\\n \"data-impl\": \"mmap\",\\n \"num_workers\": 1,\\n\\n # activation checkpointing\\n \"checkpoint-activations\": true,\\n \"checkpoint-num-layers\": 1,\\n \"partition-activations\": true,\\n \"synchronize-each-layer\": true,\\n\\n # regularization\\n \"gradient_clipping\": 1.0,\\n \"weight-decay\": 0.1,\\n \"hidden-dropout\": 0,\\n \"attention-dropout\": 0,\\n\\n # precision settings\\n \"fp16\": {\\n \"fp16\": true,\\n \"enabled\": true,\\n \"loss_scale\": 0,\\n \"loss_scale_window\": 1000,\\n \"initial_scale_power\": 12,\\n \"hysteresis\": 2,\\n \"min_loss_scale\": 1,\\n },\\n\\n \"train-iters\": 143000,\\n \"lr-decay-iters\": 143000,\\n \"distributed-backend\": \"nccl\",\\n \"lr-decay-style\": \"cosine\",\\n \"warmup\": 0.01,\\n \"checkpoint-factor\": 1000,\\n \"eval-interval\": 100000,\\n \"eval-iters\": 10,\\n\\n \"log-interval\": 10,\\n \"steps_per_print\": 10,\\n \"wall_clock_breakdown\": true,\\n\\n # additional deepspeed args not specified above\\n \"deepspeed_extra_args\": {\\n \"comms_logger\": {\\n \"enabled\": true,\\n \"verbose\": true,\\n \"prof_all\": true,\\n \"debug\": false\\n },\\n }\\n\\n}\\n', 'local_setup.yml': '# Suggested data paths when using GPT-NeoX locally\\n{\\n \"data-path\": \"data/enwik8/enwik8_text_document\",\\n\\n # or for weighted datasets:\\n # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\\n # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\\n # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\\n # \"train-data-weights\": [1., 2.],\\n # \"test-data-weights\": [2., 1.],\\n # \"valid-data-weights\": [0.5, 0.4],\\n\\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\\n # WARNING: setting this to True will override any user provided weights\\n # \"weight_by_num_documents\": false,\\n # \"weighted_sampler_alpha\": 0.3,\\n\\n \"vocab-file\": \"data/gpt2-vocab.json\",\\n \"merge-file\": \"data/gpt2-merges.txt\",\\n\\n \"save\": \"checkpoints\",\\n \"load\": \"checkpoints\",\\n \"checkpoint_validation_with_forward_pass\": False,\\n\\n \"tensorboard-dir\": \"tensorboard\",\\n \"log-dir\": \"logs\",\\n \"use_wandb\": True,\\n \"wandb_host\": \"https://api.wandb.ai\",\\n \"wandb_project\": \"neox\"\\n}\\n'}, load='checkpoints', checkpoint_validation_with_forward_pass=False, checkpoint_scale='linear', checkpoint_factor=1000, extra_save_iters=None, no_save_optim=False, no_save_rng=False, no_load_optim=False, no_load_rng=False, finetune=False, batch_size=4, train_iters=143000, eval_iters=10, keep_last_n_checkpoints=None, eval_interval=100000, split='969, 30, 1', vocab_file='data/gpt2-vocab.json', merge_file='data/gpt2-merges.txt', num_workers=1, exit_interval=None, attention_dropout=0, hidden_dropout=0, weight_decay=0.1, checkpoint_activations=True, checkpoint_num_layers=1, deepspeed_activation_checkpointing=True, contiguous_checkpointing=False, checkpoint_in_cpu=False, synchronize_each_layer=True, profile_backward=False, partition_activations=True, gas=1, clip_grad=1.0, hysteresis=2, dynamic_loss_scale=True, loss_scale=None, loss_scale_window=1000.0, min_scale=1.0, char_level_ppl=False, use_mup=False, coord_check=False, save_base_shapes=False, base_shapes_file=None, mup_init_scale=1.0, 
mup_attn_temp=1.0, mup_output_temp=1.0, mup_embedding_mult=1.0, mup_rp_embedding_mult=1.0, mup_width_scale=2, tokenizer_type='GPT2BPETokenizer', padded_vocab_size=None, optimizer_type='Adam', use_bnb_optimizer=False, zero_stage=1, zero_reduce_scatter=True, zero_contiguous_gradients=True, zero_reduce_bucket_size=500000000, zero_allgather_bucket_size=500000000, lr=0.001, lr_decay_style='cosine', lr_decay_iters=143000, min_lr=0.0001, warmup=0.01, override_lr_scheduler=False, use_checkpoint_lr_scheduler=False, precision='fp16', num_layers=6, hidden_size=512, num_attention_heads=8, seq_length=2048, max_position_embeddings=2048, norm='layernorm', layernorm_epsilon=1e-05, rms_norm_epsilon=1e-08, scalenorm_epsilon=1e-08, pos_emb='rotary', rpe_num_buckets=32, rpe_max_distance=128, opt_pos_emb_offset=0, no_weight_tying=True, attention_config=['global', 'global', 'global', 'global', 'global', 'global'], sparsity_config={}, num_unique_layers=None, param_sharing_style='grouped', make_vocab_size_divisible_by=128, activation='gelu', scaled_upper_triang_masked_softmax_fusion=False, scaled_masked_softmax_fusion=False, bias_gelu_fusion=False, bias_dropout_fusion=False, fp16_lm_cross_entropy=False, init_method_std=0.02, apply_query_key_layer_scaling=False, use_cpu_initialization=False, attention_softmax_in_fp32=False, rotary_pct=1.0, rotary_emb_base=10000, init_method='small_init', output_layer_init_method='wang_init', gmlp_attn_dim=64, gpt_j_residual=False, gpt_j_tied=False, soft_prompt_tuning=None, output_layer_parallelism='column', deepspeed=True, train_batch_size=32, train_micro_batch_size_per_gpu=4, gradient_accumulation_steps=1, optimizer={'type': 'Adam', 'params': {'lr': 0.001, 'betas': [0.9, 0.95], 'eps': 1e-08}}, scheduler=None, fp32_allreduce=False, prescale_gradients=False, gradient_predivide_factor=1.0, sparse_gradients=False, fp16={'fp16': True, 'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 12, 'hysteresis': 2, 'min_loss_scale': 1}, amp=None, gradient_clipping=1.0, zero_optimization={'stage': 1, 'allgather_partitions': True, 'allgather_bucket_size': 500000000, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 500000000, 'contiguous_gradients': True}, curriculum_learning=None, curriculum_seqlen=0, steps_per_print=10, wall_clock_breakdown=True, dump_state=False, flops_profiler=None, communication_data_type=None, bf16=None, autotuning=None, activation_checkpointing=None, sparse_attention=None, data_efficiency=None, tensorboard=None, wandb=None, csv_monitor=None, elasticity=None, comms_logger=None, compression_training=None, checkpoint=None, data_types=None, deepspeed_extra_args={'comms_logger': {'enabled': True, 'verbose': True, 'prof_all': True, 'debug': False}}, hostfile=None, include=None, exclude=None, num_nodes=-1, num_gpus=None, master_port=29500, master_addr=None, launcher='pdsh', force_multi=False, detect_nvlink_pairs=False, autotuning_run=None, no_ssh_check=False, comment=None)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate_samples_from_prompt(neox_args, )" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "49ff863b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'checkpoints'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "neox_args.load" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72bf3192", + "metadata": {}, + "outputs": [], 
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 8de085895..72cf411e6 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = ce9bee3
+    Default = f4a9106

     current git hash of repository

@@ -926,7 +926,7 @@ Text Generation arguments

 - **prompt_end**: str

-    Default = 
+    Default = 

     a single prompt's end. Defaults to newline

@@ -968,7 +968,7 @@ Text Generation arguments

 - **eval_results_prefix**: str

-    Default = 
+    Default = 

     prefix to which to save evaluation results - final fp will be {eval_results_prefix}_eval_results_yy-mm-dd-HH-MM.json

@@ -1686,7 +1686,7 @@ Args for deepspeed config

     Default = None

-    
+

@@ -1988,4 +1988,3 @@ Args for deepspeed runner (deepspeed.launcher.runner).
     Default = None

     Adds a `--comment` to the DeepSpeed launch command. In DeeperSpeed this is passed on to the SlurmLauncher as well. Sometime necessary for cluster rules, or so I've heard.
-
diff --git a/inference/HFvsDS_comparision.png b/inference/HFvsDS_comparision.png
new file mode 100644
index 000000000..5683ae483
Binary files /dev/null and b/inference/HFvsDS_comparision.png differ
diff --git a/inference/README.md b/inference/README.md
new file mode 100644
index 000000000..20fba80b0
--- /dev/null
+++ b/inference/README.md
@@ -0,0 +1,53 @@
+# NeoX Inference with DeepSpeed
+
+For inference of NeoX models we use the DeepSpeed-MII library. The installation and usage instructions are the same as in [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII#getting-started-with-mii).
+
+# Installation
+`pip install deepspeed-mii`
+
+# Inference Usage
+DeepSpeed MII incorporates both DS Inference and ZeRO Inference into one framework. The two serve different purposes and cannot be used together.
+
+## 1. DS Inference:
+This fits the entire model into GPU memory and is better suited to inference applications that are latency-sensitive or have small batch sizes.
+
+```
+# Deployment
+import mii
+mii_configs = {"tensor_parallel": 2, "dtype": "fp16", "load_with_sys_mem": True}
+mii.deploy(task="text-generation",
+           model="EleutherAI/gpt-neox-20b",
+           deployment_name="gpt-neox-20b-deploy",
+           mii_config=mii_configs)
+
+# Generation
+generator = mii.mii_query_handle("gpt-neox-20b-deploy")
+
+# Terminate (if you no longer want to infer)
+mii.terminate("gpt-neox-20b-deploy")
+```
+
+The NeoX-20B fp16 model requires more than 40GB of memory and cannot fit on a single A100 40GB GPU, so we set `tensor_parallel:2` to use two GPUs. If you have an 80GB GPU, you can set `tensor_parallel:1` to run NeoX-20B on a single GPU.
+
+## 2. Zero Inference:
+ZeRO Inference adapts and optimizes ZeRO-Infinity techniques for model inference on GPUs by hosting the model weights in CPU or NVMe memory, so that no weights (zero) are held on the GPU. It is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model, and it therefore has higher latency than DS Inference.
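+
+The key piece is a ZeRO stage 3 config with `offload_param` pointing at CPU (or NVMe), passed to `mii.deploy` with `enable_zero=True` and `enable_deepspeed=False`. Below is a minimal sketch of that deployment, adapted from `zero_inference_example.py` in this directory (the full script also sets the AIO and bucket-size knobs):
+
+```
+import mii
+
+# Minimal config sketch; zero_inference_example.py additionally sets aio and
+# bucket-size options. Weights are offloaded to CPU via ZeRO stage 3, so only
+# activations and a small working set of parameters occupy GPU memory.
+ds_config = {
+    "fp16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {"device": "cpu"},
+    },
+    "train_micro_batch_size_per_gpu": 1,
+}
+
+mii.deploy(task="text-generation",
+           model="EleutherAI/pythia-160m",
+           deployment_name="EleutherAI/pythia-160m_deploy",
+           mii_config={"dtype": "fp16"},
+           enable_deepspeed=False,
+           enable_zero=True,
+           ds_config=ds_config)
+```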
+
+Example usage:
+```
+# Deployment
+python zero_inference.py
+
+# Generation
+generator = mii.mii_query_handle("EleutherAI/pythia-160m_deploy")
+
+# Terminate (if you no longer want to infer)
+mii.terminate("EleutherAI/pythia-160m_deploy")
+```
+
+# Batch size
+Batch size at inference is not directly supported by DeepSpeed-MII. However, you can run with a few changes and caveats; note that a higher batch size does not necessarily decrease inference time. See the [issue](https://github.com/microsoft/DeepSpeed-MII/issues/133#issuecomment-1509534568) for more details.
+
+# HF vs DS Inference Comparison
+![HF vs DS Comparison plot](HFvsDS_comparision.png)
+
+Using benchmark.py, we benchmark several Pythia models along with the NeoX-20B model to compare HF and DeepSpeed inference. All runs use fp16 models on a single A100 40GB GPU for the Pythia models and two A100 40GB GPUs for NeoX-20B. The relative comparison between HF and DeepSpeed matters more than the absolute latency values in the plot.
diff --git a/inference/benchmark.py b/inference/benchmark.py
new file mode 100644
index 000000000..cd604e716
--- /dev/null
+++ b/inference/benchmark.py
@@ -0,0 +1,78 @@
+import torch
+import mii
+from transformers import pipeline
+import time
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--model', '-m', type=str, default='EleutherAI/pythia-160m', help='hf model name')
+parser.add_argument('--trials', type=int, default=50, help='number of trials')
+parser.add_argument('--dtype', type=str, default='fp16', help='Data type for model')
+parser.add_argument('--tensor_parallel', type=int, default=1, help='Tensor parallelism degree')
+parser.add_argument('--load_with_sys_mem', action='store_true', help='Load model with system memory')
+args = parser.parse_args()
+
+def hf_infer(model, torch_dtype, query=['Deepspeed is', 'Seattle is'], trials=1):
+
+    generator = pipeline('text-generation', model=model, device=0, torch_dtype=torch_dtype)
+    eos_token = generator.tokenizer.eos_token_id
+
+    start_time = time.time()
+    for i in range(trials):
+        hf_result = generator(query, max_new_tokens=100, pad_token_id=eos_token)
+    end_time = time.time()
+
+    hf_time = (end_time - start_time) / trials
+
+    generator = None
+    torch.cuda.empty_cache()
+
+    return eos_token, hf_result, hf_time
+
+def mii_infer(model, eos_token, query=['Deepspeed is', 'Seattle is'], trials=1):
+    generator = mii.mii_query_handle(model + '_deploy')
+    start_time = time.time()
+    for i in range(trials):
+        mii_result = generator.query({'query': query}, pad_token_id=eos_token, max_new_tokens=100)
+    end_time = time.time()
+    mii_time = (end_time - start_time) / trials
+
+    return mii_result, mii_time
+
+def main():
+
+    dtype_mapping = {
+        'fp16': torch.float16,
+        'fp32': torch.float32,
+        'fp64': torch.float64,
+        'int8': torch.int8,
+        'int16': torch.int16,
+        'int32': torch.int32,
+        'int64': torch.int64
+    }
+
+    torch_dtype = dtype_mapping[args.dtype]
+    load_with_sys_mem = args.load_with_sys_mem
+    tensor_parallel = args.tensor_parallel
+    trials = args.trials
+    model = args.model
+
+    eos_token, hf_result, hf_time = hf_infer(model, torch_dtype, trials=trials)
+
+    mii_configs = {'tensor_parallel': tensor_parallel, 'dtype': torch_dtype, 'load_with_sys_mem': load_with_sys_mem}
+    mii.deploy(task='text-generation',
+               model=model,
+               deployment_name=model + '_deploy',
+               mii_config=mii_configs)
+    mii_result, mii_time = mii_infer(model, eos_token, trials=trials)
+
+    print('HF sample output', hf_result)
+    print('HF Average Inference 
time: ', hf_time) + + print('MII sample output', mii_result) + print('MII Average Inference time: ', mii_time) + + mii.terminate(model + '_deploy') + +if __name__ == '__main__': + main() diff --git a/inference/zero_inference_example.py b/inference/zero_inference_example.py new file mode 100644 index 000000000..2502b5708 --- /dev/null +++ b/inference/zero_inference_example.py @@ -0,0 +1,47 @@ +import mii +from transformers import AutoConfig + +mii_config = {"dtype": "fp16"} + +name = "EleutherAI/pythia-160m" + +config = AutoConfig.from_pretrained(name) +model_hidden_size = config.hidden_size + +ds_config = { + "fp16": { + "enabled": True + }, + "bf16": { + "enabled": False + }, + "aio": { + "block_size": 262144, + "queue_depth": 32, + "thread_count": 1, + "single_submit": False, + "overlap_events": True + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.1 * model_hidden_size * model_hidden_size, + "stage3_max_live_parameters": 1e8, + "stage3_max_reuse_distance": 1e8, + "stage3_param_persistence_threshold": 10 * model_hidden_size + }, + "train_micro_batch_size_per_gpu": 1, +} + +mii.deploy(task='text-generation', + model=name, + deployment_name=name + "_deploy", + mii_config=mii_config, + enable_deepspeed=False, + enable_zero=True, + ds_config=ds_config) \ No newline at end of file diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 90488fa61..830326c3f 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -173,7 +173,6 @@ py::array build_sample_idx_int32(const py::array_t& sizes_, free_when_done); // numpy array references } - py::array build_sample_idx_int64(const py::array_t& sizes_, const py::array_t& doc_idx_, const int32_t seq_length, diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index 386b5ed1e..80caa2d97 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -38,7 +38,7 @@ def load_fused_kernels(): print(e) print("=" * 100) print( - f'ERROR: Fused kernels configured but not properly installed. Please run `pip install {str(srcpath)}` to install them' + f"ERROR: Fused kernels configured but not properly installed. 
Please run `pip install {str(srcpath)}` to install them" ) print("=" * 100) exit() diff --git a/megatron/model/flash_attention.py b/megatron/model/flash_attention.py index f9889b4c0..be3ebb14e 100644 --- a/megatron/model/flash_attention.py +++ b/megatron/model/flash_attention.py @@ -8,6 +8,12 @@ import flash_attn_cuda +def flash_attn_unpadded_unpacked_func_triton( + q, k, v, bias=None, causal=False, softmax_scale=None +): + return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) + + def _flash_attn_forward_cuda( q, k, @@ -186,7 +192,273 @@ def flash_attn_unpadded_qkvpacked_func_cuda( ) -def flash_attn_unpadded_qkvpacked_func_triton( - q, k, v, bias=None, causal=False, softmax_scale=None +class FlashAttnKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + q, + kv[:, 0], + kv[:, 1], + torch.empty_like(q), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward( + q, kv, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + kv, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dq = torch.empty_like(q) + dkv = torch.empty_like(kv) + _flash_attn_backward_cuda( + dout, + q, + kv[:, 0], + kv[:, 1], + out, + softmax_lse, + dq, + dkv[:, 0], + dkv[:, 1], + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dq, dkv, None, None, None, None, None, None, None, None + + +def flash_attn_unpadded_kvpacked_func_cuda( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, ): - return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) + """dropout_p should be set to 0.0 during evaluation + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + kv: (total_k, 2, nheads, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. 
Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnKVPackedFunc.apply( + q, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_softmax, + ): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, softmax_lse, S_dmask = _flash_attn_forward_cuda( + q, + k, + v, + torch.empty_like(q), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal=causal, + return_softmax=return_softmax, + ) + ctx.save_for_backward( + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state + ) + ctx.dropout_p = dropout_p + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return out if not return_softmax else (out, softmax_lse, S_dmask) + + @staticmethod + def backward(ctx, dout, *args): + ( + q, + k, + v, + out, + softmax_lse, + cu_seqlens_q, + cu_seqlens_k, + rng_state, + ) = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_backward_cuda( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.dropout_p, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dq, dk, dv, None, None, None, None, None, None, None, None + + +def flash_attn_unpadded_func_cuda( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale=None, + causal=False, + return_attn_probs=False, +): + """dropout_p should be set to 0.0 during evaluation + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + k: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. + v: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. 
Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + softmax_scale, + causal, + return_attn_probs, + ) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 4d57b063c..cc7b8be4a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -278,17 +278,23 @@ def __init__( if self.use_flash_attention: from megatron.model.flash_attention import ( flash_attn_unpadded_qkvpacked_func_cuda, - flash_attn_unpadded_qkvpacked_func_triton, + flash_attn_unpadded_kvpacked_func_cuda, + flash_attn_unpadded_unpacked_func_triton, ) if self.pos_emb == "alibi": self.flash_attention_function = ( - flash_attn_unpadded_qkvpacked_func_triton + flash_attn_unpadded_unpacked_func_triton ) else: - self.flash_attention_function = ( - flash_attn_unpadded_qkvpacked_func_cuda - ) + if self.training: + self.flash_attention_function = ( + flash_attn_unpadded_qkvpacked_func_cuda + ) + else: + self.flash_attention_function = ( + flash_attn_unpadded_kvpacked_func_cuda + ) else: self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.fp16, @@ -429,59 +435,101 @@ def flash_attention(self, query_layer, key_layer, value_layer): ) if self.pos_emb != "alibi": - # [s, b, np, hn] -> [b, s, np, hn] -> [b * s, 1, np, hn] - query_layer = query_layer.transpose(0, 1).reshape( - output_size[0] * output_size[2], 1, output_size[1], -1 - ) + + # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] key_layer = key_layer.transpose(0, 1).reshape( output_size[0] * output_size[3], 1, output_size[1], -1 ) value_layer = value_layer.transpose(0, 1).reshape( output_size[0] * output_size[3], 1, output_size[1], -1 ) - # Combined q/k/v into [b * s, 3, np, hn]. 
- qkv = torch.concat([query_layer, key_layer, value_layer], dim=1) batch_size = output_size[0] - seqlen = output_size[2] - max_s = seqlen + max_seqlen_q = output_size[2] + max_seqlen_k = output_size[3] - cu_seqlens = torch.arange( + cu_seqlens_q = torch.arange( 0, - (batch_size + 1) * seqlen, - step=seqlen, + (batch_size + 1) * max_seqlen_q, + step=max_seqlen_q, dtype=torch.int32, - device=qkv.device, + device=query_layer.device, ) - output = self.flash_attention_function( - qkv, - cu_seqlens, - max_s, - self.dropout_p if self.training else 0.0, - softmax_scale=None, - causal=True, + cu_seqlens_k = torch.arange( + 0, + (batch_size + 1) * max_seqlen_k, + step=max_seqlen_k, + dtype=torch.int32, + device=key_layer.device, ) + + if self.training: + + # [sq, b, np, hn] -> [b * sq, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], output_size[1], -1 + ) + + # Combined k/v into [b * sk, 2, np, hn]. + kv = torch.concat([key_layer, value_layer], dim=1) + + output = self.flash_attn_unpadded_kvpacked_func( + query_layer, + kv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + + else: + + # [sq, b, np, hn] -> [b * sq, 1, np, hn] + query_layer.transpose(0, 1).reshape( + output_size[0] * output_size[2], 1, output_size[1], -1 + ) + + # Combined q/k/v into [b * s, 3, np, hn]. + qkv = torch.concat([query_layer, key_layer, value_layer], dim=1) + + output = self.flash_attn_unpadded_qkvpacked_func( + qkv, + cu_seqlens_q, + max_seqlen_q, + self.dropout_p if self.training else 0.0, + softmax_scale=None, + causal=True, + ) + # [b * sq, np, hn] -> [b, sq, np, hn] matmul_result = output.view( output_size[0], output_size[2], output.shape[1], output.shape[2] ) # [b, sq, np, hn] -> [b, np, sq, hn] matmul_result = matmul_result.transpose(1, 2) + else: # [sq, b, np, hn] -> [b, sq, np, hn] sq = query_layer.size(0) b = query_layer.size(1) sk = key_layer.size(0) + query_layer = query_layer.transpose(0, 1) key_layer = key_layer.transpose(0, 1) value_layer = value_layer.transpose(0, 1) + bias = self.alibi_embed.bias(sq, sk, query_layer.device, query_layer.dtype) bias = bias.unsqueeze(0).tile((b, 1, 1, 1)) + matmul_result = self.flash_attention_function( query_layer, key_layer, value_layer, bias=bias, causal=True ) matmul_result = matmul_result.transpose(1, 2) + return matmul_result def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 73bcbdc75..6ac476fc1 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -989,7 +989,7 @@ def calculate_derived(self): # Update 'is pipe parallel' flag # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs - self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 2) + self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1) # Attention config if self.attention_config is None: diff --git a/tools/convert_sequential_to_hf.py b/tools/convert_sequential_to_hf.py index 5a66219bf..f2299fccf 100644 --- a/tools/convert_sequential_to_hf.py +++ b/tools/convert_sequential_to_hf.py @@ -155,9 +155,7 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_config = create_config(loaded_config) - hf_model = 
GPTNeoXForCausalLM( - hf_config - ) + hf_model = GPTNeoXForCausalLM(hf_config) # save model in FP16 if Deepspeed fp16 was used in config, else 32 bit fp16 = get_key(loaded_config, "fp16") @@ -177,7 +175,9 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): hf_model.to(dtype=torch.bfloat16) print("Saving weights in bf16 precision...") except: - print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") + print( + "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." + ) mp_partitions = get_key(loaded_config, "model-parallel-size") diff --git a/tools/convert_v1.0_to_hf.py b/tools/convert_v1.0_to_hf.py index 905bdfa16..8f3537cd4 100644 --- a/tools/convert_v1.0_to_hf.py +++ b/tools/convert_v1.0_to_hf.py @@ -153,15 +153,19 @@ def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): try: # this conditional is quite messy because there were a number of ways to specify bf16 or fp16 training # in DeeperSpeed v1.0 . - if (fp16.get("fp16", None) or fp16["enabled"]) and not (fp16.get("type", None) == "bfloat16"): + if (fp16.get("fp16", None) or fp16["enabled"]) and not ( + fp16.get("type", None) == "bfloat16" + ): hf_model.half() print("Saving weights in fp16 precision...") elif fp16.get("type", None) == "bfloat16": hf_model.to(dtype=torch.bfloat16) print("Saving weights in bf16 precision...") except: - print("Model not trained in fp16 / bf16 mixed precision, saving weights in fp32...") - + print( + "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." + ) + mp_partitions = get_key(loaded_config, "model-parallel-size") ### Embedding layer ### diff --git a/tools/merge_datasets.py b/tools/merge_datasets.py index e6e290016..c5d1e6255 100644 --- a/tools/merge_datasets.py +++ b/tools/merge_datasets.py @@ -2,8 +2,10 @@ import sys import json import argparse -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), - os.path.pardir))) + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) from megatron.data import indexed_dataset @@ -20,47 +22,63 @@ def main(args): if not os.path.isfile(os.path.join(args.input, basename)): continue - ext_pair = '.bin' if ext == '.idx' else '.idx' - assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \ - f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}' + ext_pair = ".bin" if ext == ".idx" else ".idx" + assert os.path.isfile( + os.path.join(args.input, prefix) + ext_pair + ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" prefixes.add(prefix) builder = None for prefix in sorted(prefixes): if builder is None: - dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer') + dataset = indexed_dataset.make_dataset( + os.path.join(args.input, prefix), "infer" + ) if isinstance(dataset, indexed_dataset.MMapIndexedDataset): - builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype) + builder = indexed_dataset.MMapIndexedDatasetBuilder( + args.output_prefix + ".bin", dtype=dataset._index.dtype + ) else: - builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin') + builder = indexed_dataset.IndexedDatasetBuilder( + args.output_prefix + ".bin" + ) del dataset builder.merge_file_(os.path.join(args.input, prefix)) - builder.finalize(args.output_prefix + '.idx') + builder.finalize(args.output_prefix + ".idx") -if __name__ == '__main__': +if __name__ == 
"__main__": parser = argparse.ArgumentParser() - group = parser.add_argument_group(title='input data') - group.add_argument('--input', type=str, required=True, - help='Path to directory containing all document files to merge') - - group = parser.add_argument_group(title='output data') - group.add_argument('--output-prefix', type=str, required=True, - help='Path to binary output file without suffix') + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", + type=str, + required=True, + help="Path to directory containing all document files to merge", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument( + "--output-prefix", + type=str, + required=True, + help="Path to binary output file without suffix", + ) args = parser.parse_args() - assert os.path.isdir(args.input), \ - f'ERROR: {args.input} is not a directory or does not exist' + assert os.path.isdir( + args.input + ), f"ERROR: {args.input} is not a directory or does not exist" - assert os.path.isdir(os.path.dirname(args.output_prefix)), \ - f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist' + assert os.path.isdir( + os.path.dirname(args.output_prefix) + ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" main(args) -