Removed fallback for lm_head op #1482

Open · wants to merge 5 commits into base: main
@@ -5,7 +5,7 @@ protobuf
 sentencepiece != 0.1.92
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 torch==2.1.0a0
-transformers
+transformers==4.35
 optimum-intel
 bitsandbytes #baichuan
 transformers_stream_generator
@@ -139,11 +139,7 @@
 user_model = None

 # tokenizer
-if config.model_type == "llama":
-    from transformers import LlamaTokenizer
-    tokenizer = LlamaTokenizer.from_pretrained(args.model)
-else:
-    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)

 quantization_config = None
 if args.woq:
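The removed branch special-cased Llama checkpoints, but AutoTokenizer already dispatches to the model-specific tokenizer class from the checkpoint metadata, so the single call covers Llama as well. A minimal sketch of that behavior (the model id below is only illustrative, not taken from this repo):

```python
# Minimal sketch: AutoTokenizer resolves the concrete tokenizer class from the
# checkpoint config, so Llama models do not need an explicit LlamaTokenizer path.
# "meta-llama/Llama-2-7b-hf" is an illustrative model id, not from this PR.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
print(type(tokenizer).__name__)  # typically "LlamaTokenizerFast" (or "LlamaTokenizer")
```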
@@ -253,7 +249,9 @@
     dtype=amp_dtype if amp_enabled else None,
 ):
     for i in range(num_iter + num_warmup):
-        with torch.autograd.profiler_legacy.profile(enabled=args.do_profiling, use_xpu=(args.device=="xpu"), record_shapes=False) as prof:
+        # workaround for Windows
+        # with torch.autograd.profiler_legacy.profile(enabled=args.do_profiling, use_xpu=(args.device=="xpu"), record_shapes=False) as prof:
+        if True:
             input_ids = tokenizer(
                 prompt, return_tensors="pt").input_ids.to(args.device)
             tic = time.time()
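The `if True:` line keeps the original indentation while disabling the legacy profiler context that reportedly breaks on Windows. A sketch of an alternative (not what this PR does) that avoids the dummy condition by falling back to `contextlib.nullcontext()` when profiling is off; the `args.*` names mirror the script above, and the legacy profiler entry point with `use_xpu` may only exist on IPEX-enabled torch builds:

```python
# Sketch of an alternative to the "if True:" workaround, assuming the args.*
# flags used in the script above. The legacy profiler API shown here may not
# be available on every platform/torch build, hence the nullcontext fallback.
import contextlib
import torch

def profiling_context(args):
    if args.do_profiling:
        return torch.autograd.profiler_legacy.profile(
            enabled=True,
            use_xpu=(args.device == "xpu"),
            record_shapes=False,
        )
    return contextlib.nullcontext()

# Usage inside the benchmark loop:
# with profiling_context(args) as prof:
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(args.device)
#     ...
```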
@@ -109,9 +109,7 @@ def replace_linear(
     empty_weights=False,
 ):
     if modules_to_not_convert is None:
-        # output_layer is chatglm last layer name
-        # embed_out is dolly_v2 last layer name
-        modules_to_not_convert = ["lm_head", "output_layer", "embed_out"]
+        modules_to_not_convert = []
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert = modules_to_not_convert.extend(
             quantization_config.llm_int8_skip_modules
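Unrelated to the lines changed here, the surrounding context assigns the result of `modules_to_not_convert.extend(...)`; `list.extend()` mutates in place and returns None, so that assignment leaves the variable set to None. A small illustration of the difference:

```python
# list.extend() returns None, so reassigning its result discards the list.
modules_to_not_convert = []
skip_modules = ["lm_head"]  # illustrative stand-in for quantization_config.llm_int8_skip_modules

modules_to_not_convert.extend(skip_modules)  # correct: mutate in place, no reassignment
assert modules_to_not_convert == ["lm_head"]

broken = [].extend(skip_modules)             # the pattern shown in the context above
assert broken is None
```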
@@ -517,17 +515,6 @@ def default_calib_func(model):
                 },
             },
         },
-        op_name_dict={
-            ".*lm_head": {  # re.match
-                "weight": {"dtype": "fp32"},
-            },
-            ".*output_layer": {  # re.match
-                "weight": {"dtype": "fp32"},
-            },
-            ".*embed_out": {  # re.match
-                "weight": {"dtype": "fp32"},
-            },
-        },
         recipes=recipes,
     )
     # TEQ: set calib_func=None, use default training func as calib_func
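With the op_name_dict fallback removed, lm_head, output_layer, and embed_out are quantized like every other matched op instead of being pinned to fp32. A hedged sketch of how a caller could still opt those layers out via the llm_int8_skip_modules field used by replace_linear() above; the WeightOnlyQuantConfig class name and its constructor arguments are assumptions and may differ in this repo:

```python
# Hedged sketch: keep the former fp32 fallback layers out of weight-only
# quantization by listing them explicitly. The config class name and kwargs
# are assumptions; only llm_int8_skip_modules is taken from the diff above.
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig

quantization_config = WeightOnlyQuantConfig(
    weight_dtype="int4_fullrange",
    llm_int8_skip_modules=["lm_head", "output_layer", "embed_out"],
)
```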