mit-han-lab · AniZpZ · Sep 21, 2023 · Sep 21, 2023 · Sep 21, 2023 · Sep 26, 2023
diff --git a/examples/export_int8_llama.py b/examples/export_int8_llama.py
@@ -0,0 +1,57 @@
+import torch
+import argparse
+import os
+
+from pathlib import Path
+
+from transformers import AutoTokenizer
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
+
+from smoothquant.llama import Int8LlamaForCausalLM
+from smoothquant.smooth import smooth_lm
+
+from smoothquant.calibration import get_static_llama_decoder_layer_scales
+from torch.nn.functional import pad
+
+
+if __name__ == '__main__':
+    # Smooth an FP16 LLaMA checkpoint, calibrate static activation scales, then
+    # save an INT8 model or (--export-FT) the smoothed model plus raw scales.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-name", type=str, default='fp16_models/llama-13b')
+    parser.add_argument("--num-samples", type=int, default=512)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--act-scales", type=str, default='act_scales/llama-13b.pt')
+    parser.add_argument("--output-path", type=str, default='int8_models')
+    parser.add_argument('--dataset-path', type=str, default='dataset/val.jsonl.zst',
+                        help='location of the calibration dataset, we use the validation set of the Pile dataset')
+    parser.add_argument('--export-FT', default=False, action="store_true")
+    args = parser.parse_args()
+    model = LlamaForCausalLM.from_pretrained(
+        args.model_name, device_map="auto", torch_dtype=torch.float16)
+    # NOTE: torch.load unpickles the file; only pass trusted --act-scales paths.
+    act_scales = torch.load(args.act_scales)
+    smooth_lm(model, act_scales, 0.5)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+
+    if not os.path.exists(args.dataset_path):
+        print(f'Cannot find the dataset at {args.dataset_path}')
+        print('Please download the Pile dataset and put the validation set at the path')
+        print('You can download the validation dataset of the Pile at https://mystic.the-eye.eu/public/AI/pile/val.jsonl.zst')
+        raise FileNotFoundError(args.dataset_path)
+
+    decoder_layer_scales, raw_scales = get_static_llama_decoder_layer_scales(
+        model, tokenizer, args.dataset_path,
+        num_samples=args.num_samples, seq_len=args.seq_len)
+    output_path = Path(args.output_path) / ("llama-" + Path(args.model_name).name + "-smoothquant-per-token-opt")
+    if args.export_FT:
+        model.save_pretrained(output_path)
+        print(f"Saved smoothed model at {output_path}")
+
+        output_path = Path(args.output_path) / (Path(args.model_name).name + "-smoothquant-scales.pt")
+        torch.save(raw_scales, output_path)
+        print(f"Saved scaling factors at {output_path}")
+    else:
+        int8_model = Int8LlamaForCausalLM.from_float(model, decoder_layer_scales)
+        int8_model.save_pretrained(output_path)
+        print(f"Saved int8 model at {output_path}")
diff --git a/smoothquant/calibration.py b/smoothquant/calibration.py
@@ -118,3 +118,82 @@ def stat_io_hook(m, x, y, name):
         decoder_layer_scales.append(scale_dict)
 
     return decoder_layer_scales, act_dict
+
+# TODO: merge into the get_static_decoder_layer_scales function
+@torch.no_grad()
+def get_static_llama_decoder_layer_scales(model, tokenizer, dataset_path,
+                                          num_samples=512, seq_len=512):
+    """Collect static activation scales for every LLaMA decoder layer.
+
+    Forward hooks on all ``torch.nn.Linear`` modules track the running maximum
+    absolute value of each module's input and output while calibration text
+    is fed through ``model``; each reported scale is such a maximum / 127.
+
+    Args:
+        model: model exposing ``config.num_hidden_layers`` whose linear
+            modules are named ``model.layers.{idx}.self_attn.*`` / ``.mlp.*``.
+        tokenizer: callable whose result has an ``input_ids`` tensor.
+        dataset_path: JSON file whose records carry a ``"text"`` field.
+        num_samples: shuffled samples to run (capped at the dataset size).
+        seq_len: samples are truncated to at most this many tokens.
+
+    Returns:
+        ``(decoder_layer_scales, act_dict)``: a list holding one dict of
+        scales per layer, and the raw per-module input/output maxima.
+
+    Raises:
+        KeyError: if an expected projection module was never observed.
+    """
+    model.eval()
+    device = next(model.parameters()).device
+    act_dict = defaultdict(dict)
+
+    def stat_io_hook(m, x, y, name):
+        # Hook arguments may be tuples; only their first element is tracked.
+        for key, tensor in (("input", x), ("output", y)):
+            if isinstance(tensor, tuple):
+                tensor = tensor[0]
+            peak = tensor.detach().abs().max().item()
+            act_dict[name][key] = max(act_dict[name].get(key, peak), peak)
+
+    hooks = [m.register_forward_hook(partial(stat_io_hook, name=name))
+             for name, m in model.named_modules()
+             if isinstance(m, torch.nn.Linear)]
+
+    print("Collecting activation scales...")
+    dataset = load_dataset('json', data_files=dataset_path, split="train")
+    dataset = dataset.shuffle(seed=42)
+    # Cap the sample count so a small dataset cannot raise IndexError.
+    pbar = tqdm(range(min(num_samples, len(dataset))))
+    try:
+        for i in pbar:
+            input_ids = tokenizer(dataset[i]["text"], return_tensors="pt",
+                                  max_length=seq_len, truncation=True).input_ids.to(device)
+            model(input_ids)
+            mean_scale = np.mean([v["input"] for v in act_dict.values()])
+            pbar.set_description(f"Mean input scale: {mean_scale:.2f}")
+    finally:
+        # Always detach the hooks, even when calibration fails midway.
+        for hook in hooks:
+            hook.remove()
+
+    # (scale name, module path inside the decoder layer, tracked statistic)
+    scale_sources = (
+        ("attn_input_scale", "self_attn.q_proj", "input"),
+        ("q_output_scale", "self_attn.q_proj", "output"),
+        ("k_output_scale", "self_attn.k_proj", "output"),
+        ("v_output_scale", "self_attn.v_proj", "output"),
+        ("out_input_scale", "self_attn.o_proj", "input"),
+        ("gate_input_scale", "mlp.gate_proj", "input"),
+        ("up_input_scale", "mlp.up_proj", "input"),
+        ("down_input_scale", "mlp.down_proj", "input"),
+        ("gate_output_scale", "mlp.gate_proj", "output"),
+        ("up_output_scale", "mlp.up_proj", "output"),
+        ("down_output_scale", "mlp.down_proj", "output"),
+    )
+    decoder_layer_scales = [
+        {scale: act_dict[f"model.layers.{idx}.{module}"][stat] / 127
+         for scale, module, stat in scale_sources}
+        for idx in range(model.config.num_hidden_layers)]
+
+    return decoder_layer_scales, act_dict