diff --git a/conda_meta/meta.yaml b/conda_meta/meta.yaml index a9ca9591e77..0a5574eb56b 100644 --- a/conda_meta/meta.yaml +++ b/conda_meta/meta.yaml @@ -7,7 +7,6 @@ build: script_env: - IMEX_WHL number: {{buildnumber}} - noarch: python script: pip install --no-deps {{IMEX_WHL}} requirements: build: @@ -19,6 +18,8 @@ requirements: - numpy - transformers - packaging + - neural_compressor + - protobuf test: imports: - intel_extension_for_transformers diff --git a/docs/tutorials/pytorch/language-modeling/benchmark.py b/docs/tutorials/pytorch/language-modeling/benchmark.py new file mode 100644 index 00000000000..f4d237f14fe --- /dev/null +++ b/docs/tutorials/pytorch/language-modeling/benchmark.py @@ -0,0 +1,188 @@ +import logging +import os +from datasets import load_dataset, load_metric +from itertools import chain +from intel_extension_for_transformers import metrics, OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + MODEL_FOR_MASKED_LM_MAPPING, + AutoConfig, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoTokenizer, + DataCollatorForLanguageModeling, + TrainingArguments, + is_torch_tpu_available, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "bert-base-uncased", help = 'input model for benchmark') +args = arg_parser.parse_args() + +dataset_name="wikitext" +dataset_config_name="wikitext-2-raw-v1" +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + per_device_eval_batch_size=1, + overwrite_output_dir=True +) + +raw_datasets = load_dataset(dataset_name, dataset_config_name) +config = AutoConfig.from_pretrained(args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) +# Set seed before initializing model. +set_seed(training_args.seed) + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForMaskedLM.from_pretrained( + args.model_name_or_path, + config=config, + revision="main", + use_auth_token=None, + ) + model.resize_token_embeddings(len(tokenizer)) + +# First we tokenize all the texts.
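+# The tokenized wikitext lines are then concatenated and split into fixed-length blocks of max_seq_length by group_texts below, the usual preprocessing for masked-language-modeling evaluation.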
+if training_args.do_train: + column_names = raw_datasets["train"].column_names +else: + column_names = raw_datasets["validation"].column_names +text_column_name = "text" if "text" in column_names else column_names[0] + +max_seq_length = tokenizer.model_max_length + + +def tokenize_function(examples): + return tokenizer(examples[text_column_name], return_special_tokens_mask=True) + + +column_names = raw_datasets["train"].column_names +text_column_name = "text" if "text" in column_names else column_names[0] + +with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on every text in dataset", + ) + + +# Main data processing function that will concatenate all texts from our dataset and generate chunks of max_seq_length. +def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length + # Split by chunks of max_len. + result = { + k: [t[i: i + max_seq_length] for i in range(0, total_length, max_seq_length)] + for k, t in concatenated_examples.items() + } + return result + + +# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a +# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value +# might be slower to preprocess. + +with training_args.main_process_first(desc="grouping texts together"): + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + load_from_cache_file=True, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) + +if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = tokenized_datasets["train"] + +if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = tokenized_datasets["validation"] + + + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(dim=-1) + + + metric = load_metric("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics + labels = labels.reshape(-1) + preds = preds.reshape(-1) + mask = labels != -100 + labels = labels[mask] + preds = preds[mask] + return metric.compute(predictions=preds, references=labels) + +# Data collator will take care of randomly masking the tokens. 
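+# mlm_probability=0.15 matches the 15% masking rate used for BERT pre-training; the collator builds the masked inputs and their labels on the fly during evaluation.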
+data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm_probability=0.15, + pad_to_multiple_of=None, +) + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval and not is_torch_tpu_available() + else None, +) + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb b/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb index 4070a1a4d65..2c154dbb7bf 100644 --- a/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb +++ b/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb @@ -43,7 +43,7 @@ "id": "c1816be1", "metadata": {}, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -167,7 +167,7 @@ " dataset_config_name=\"wikitext-2-raw-v1\",\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./saved_results\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -367,8 +367,6 @@ " else None,\n", ")\n", "\n", - "trainer_ptq_static.save_model(\"./saved_results_ptq_static\") # quantized model\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_loss\", # Metric used for the tuning strategy.\n", " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", @@ -381,7 +379,11 @@ ")\n", "\n", "# run quantization\n", - "trainer_ptq_static.quantize(quant_config=quantization_config)" + "trainer_ptq_static.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_ptq_static.save_model(\"./saved_results_static\")\n", + "model.config.save_pretrained(\"./saved_results_static\")" ] }, { @@ -414,6 +416,26 @@ "print(\"Throughput: {} samples/sec\".format(throughput_ptq_static))" ] }, + { + "cell_type": "markdown", + "id": "5a7e93de", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a6795aa", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -431,6 +453,7 @@ "source": [ "# Initialize the Trainer\n", "set_seed(training_args.seed)\n", + "training_args.output_dir = \"./saved_results_dynamic\"\n", "trainer_ptq_dynamic = NLPTrainer(\n", " model=model,\n", " args=training_args,\n", @@ -444,8 +467,6 @@ " else None,\n", ")\n", "\n", - "trainer_ptq_dynamic.save_model(\"./saved_results_ptq_dynamic\")\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_loss\", \n", " is_relative=True,\n", @@ -458,7 +479,10 @@ ")\n", "\n", "# run quantization\n", - "trainer_ptq_dynamic.quantize(quant_config=quantization_config)" + "trainer_ptq_dynamic.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_ptq_dynamic.save_model(\"./saved_results_dynamic\")" ] }, { @@ -487,6 +511,25 @@ "print(\"Throughput: {} samples/sec\".format(throughput_ptq_dynamic))" ] }, + { + "cell_type": "markdown", + "id": "5a7e93de", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea631f92", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -527,6 +570,25 @@ "print(\"Latency: {:.3f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "5a7e93de", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "571317cf", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=bert-base-uncased --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git 
a/docs/tutorials/pytorch/multi_instance.sh b/docs/tutorials/pytorch/multi_instance.sh new file mode 100644 index 00000000000..6f1441c278f --- /dev/null +++ b/docs/tutorials/pytorch/multi_instance.sh @@ -0,0 +1,62 @@ +set -eo pipefail +set -x +PATTERN='[-a-zA-Z0-9_]*=' +for i in "$@" do + case $i in + --model=*) + model=`echo $i | sed "s/${PATTERN}//"`;; + --core_per_instance=*) + core_per_instance=`echo $i | sed "s/${PATTERN}//"`;; + --data_type=*) + data_type=`echo $i | sed "s/${PATTERN}//"`;; + *) + echo "Parameter $i not recognized."; exit 1;; + esac done +ncores_per_socket=${ncores_per_socket:=$( lscpu | grep 'Core(s) per socket' | cut -d: -f2 | xargs echo -n)} +log_name="${model}.log" +cmd="python benchmark.py --data_type=${data_type} --model_name_or_path=${model}" +echo "Executing multi instance benchmark" +echo -e ">>> Executing multi instance benchmark $core_per_instance $cmd" >>"$log_name" +for ((j = 0; $(($j + $core_per_instance)) <= $ncores_per_socket; j = $(($j + ${core_per_instance})))); do + numa_prefix="numactl -m 0 -C $j-$((j + core_per_instance - 1)) " + # Make it work on machines with no NUMA support + if [[ -n $(numactl -s | grep "No NUMA support available") ]]; then + echo "No NUMA support available" + echo "Multi-instance benchmarking requires a machine with NUMA support." + exit 1 + fi + echo "${numa_prefix}${cmd}" >>$log_name + ${numa_prefix}${cmd} | + tee -a $log_name & + benchmark_pids+=($!) +done +echo -e "<<< Executing multi instance benchmark $core_per_instance $cmd" >>"$log_name" + +status="SUCCESS" + +for pid in "${benchmark_pids[@]}"; do + exit_code=0 + wait $pid || exit_code=$? + echo "Detected exit code: ${exit_code}" + if [ ${exit_code} == 0 ]; then + echo "Process ${pid} succeeded" + else + echo "Process ${pid} failed" + status="FAILURE" + fi done +echo "Benchmark process status: ${status}" +if [ ${status} == "FAILURE" ]; then + echo "Benchmark process returned non-zero exit code."
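+  # Any failed instance aborts the run here, before the aggregated throughput and latency below are computed from the per-instance logs.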
+ exit 1 +fi +Total_Throughput=$(cat $log_name | grep -Po "Throughput:\s+(\d+(\.\d+)?)" | cut -f 2 -d ' ' | awk '{ SUM += $1} END { print SUM }') +echo "Throughput : $Total_Throughput" +Batch_size=$(cat $log_name | grep -Po "Batch\s+size\s+=\s+\d+" | tail -1) +echo $Batch_size +Accuray=$(cat $log_name | grep -Po "Finally Eval .* Accuracy.*\d+" | tail -1) +echo $Accuray +Total_Latency=$(cat $log_name | grep -Po "Latency:\s+(\d+(\.\d+)?)" | cut -f 2 -d ' ' | awk '{ SUM += $1} END { print SUM }') +echo "Latency : $Total_Latency" \ No newline at end of file diff --git a/docs/tutorials/pytorch/multiple-choice/benchmark.py b/docs/tutorials/pytorch/multiple-choice/benchmark.py new file mode 100644 index 00000000000..6fde6834d0f --- /dev/null +++ b/docs/tutorials/pytorch/multiple-choice/benchmark.py @@ -0,0 +1,145 @@ +import logging +import os +import numpy as np +from datasets import load_dataset, load_metric +from itertools import chain +from intel_extension_for_transformers import metrics, OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + MODEL_FOR_MASKED_LM_MAPPING, + AutoConfig, + AutoModelForMultipleChoice, + AutoTokenizer, + TrainingArguments, + set_seed, + default_data_collator, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "ehdwns1516/bert-base-uncased_SWAG", help = 'input model for benchmark') +args = arg_parser.parse_args() + +dataset_name="swag" +dataset_config_name="regular" + +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_eval_batch_size=8, + per_device_train_batch_size=8 +) + +raw_datasets = load_dataset(dataset_name, dataset_config_name) +config = AutoConfig.from_pretrained(args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForMultipleChoice.from_pretrained( + args.model_name_or_path, + config=config, + revision="main", + use_auth_token=None, + ) + +ending_names = [f"ending{i}" for i in range(4)] +context_name = "sent1" +question_header_name = "sent2" + +# First we tokenize all the texts. 
+max_seq_length = tokenizer.model_max_length +if max_seq_length >1024: + max_seq_length = 1024 + +# preprocessing the datasets +def preprocess_function(examples): + first_sentences = [[context] * 4 for context in examples[context_name]] + question_headers = examples[question_header_name] + second_sentences = [ + [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) + ] + + # Flatten out + first_sentences = list(chain(*first_sentences)) + second_sentences = list(chain(*second_sentences)) + + # Tokenize + tokenized_examples = tokenizer( + first_sentences, + second_sentences, + truncation=True, + max_length=max_seq_length, + padding="max_length" + ) + # Un-flatten + return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map(preprocess_function, batched=True) +if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + eval_dataset = eval_dataset.select(range(1000)) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + load_from_cache_file=False + ) + +# Data collator +data_collator = default_data_collator + +# Metric +def compute_metrics(eval_predictions): + predictions, label_ids = eval_predictions + preds = np.argmax(predictions, axis=1) + return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()} + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb b/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb index fc808148301..eebf4f3412c 100644 --- a/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb +++ b/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -225,7 +225,7 @@ " overwrite_cache=True\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./tmp/swag_output\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -433,8 +433,6 @@ ")\n", "\n", "# quantized model\n", - "trainer_static.save_model(\"./tmp/swag_output/saved_results_static\")\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_accuracy\", # Metric used for the tuning strategy.\n", " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", @@ -446,7 +444,11 @@ ")\n", "\n", "# run quantization\n", - "trainer_static.quantize(quant_config=quantization_config)" + "trainer_static.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_static.save_model(\"./saved_results_static\")\n", + "model.config.save_pretrained(\"./saved_results_static\")" ] }, { @@ -483,6 +485,26 @@ "print(\"Throughput: {} samples/sec\".format(throughput_static))\n" ] }, + { + "cell_type": "markdown", + "id": "363eea55", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54bd1a23", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -507,6 +529,7 @@ "outputs": [], "source": [ "set_seed(training_args.seed)\n", + "training_args.output_dir = \"saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = NLPTrainer(\n", " model=model,\n", @@ -519,8 +542,6 @@ ")\n", "\n", "# quantized model\n", - "trainer_dynamic.save_model(\"./tmp/swag_output/saved_results_dynamic\")\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_accuracy\", # Metric used for the tuning strategy.\n", " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", @@ -532,7 +553,10 @@ ")\n", "\n", "# run quantization\n", - "trainer_dynamic.quantize(quant_config=quantization_config)" + "trainer_dynamic.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_dynamic.save_model(\"./saved_results_dynamic\")" ] }, { @@ -568,6 +592,25 @@ "print(\"Throughput: {} samples/sec\".format(throughput_dynamic))" ] }, + { + "cell_type": "markdown", + "id": "14ed0b81", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33df0db9", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -612,6 +655,25 @@ "print(\"Latency: {:.3f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "6390e6a3", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "675c5036", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=ehdwns1516/bert-base-uncased_SWAG 
--core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb b/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb index b0830c5243e..db34c6f3ef6 100644 --- a/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb +++ b/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb @@ -37,7 +37,7 @@ "id": "c1816be1", "metadata": {}, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { diff --git a/docs/tutorials/pytorch/question-answering/benchmark.py b/docs/tutorials/pytorch/question-answering/benchmark.py new file mode 100644 index 00000000000..be1a498e02c --- /dev/null +++ b/docs/tutorials/pytorch/question-answering/benchmark.py @@ -0,0 +1,589 @@ +import logging +import os +import numpy as np +import random +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +import timeit +import collections +import json +from typing import Optional, Tuple +from tqdm.auto import tqdm +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + TrainingArguments, + set_seed, + is_torch_tpu_available, +) +from transformers.trainer_utils import PredictionOutput + +os.environ["WANDB_DISABLED"] = "true" +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "distilbert-base-uncased-distilled-squad", help = 'input model for benchmark') +args = arg_parser.parse_args() + + +if is_torch_tpu_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + +class QuestionAnsweringTrainer(NLPTrainer): + def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): + super().__init__(*args, **kwargs) + self.eval_examples = eval_examples + self.post_process_function = post_process_function + + def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): + eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset + eval_dataloader = self.get_eval_dataloader(eval_dataset) + eval_examples = self.eval_examples if eval_examples is None else eval_examples + + # Temporarily disable metric computation, we will do it in the loop here. 
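+        # The raw start/end logits must first be post-processed into text answers (see post_process_function) before the SQuAD metric can be computed, so metric computation is deferred until after the prediction loop.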
+ compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is not None and self.compute_metrics is not None: + eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) + metrics = self.compute_metrics(eval_preds) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + self.log(metrics) + else: + metrics = {} + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) + return metrics + + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): + predict_dataloader = self.get_test_dataloader(predict_dataset) + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + predict_dataloader, + description="Prediction", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is None or self.compute_metrics is None: + return output + + predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") + metrics = self.compute_metrics(predictions) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. 
Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. 
+ token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. 
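+            # Scan the score-sorted n-best list for the first prediction with non-empty text.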
+ i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + + +raw_datasets = load_dataset("squad") +# download the dataset. +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, +) +config = AutoConfig.from_pretrained(args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True) +log_level = training_args.get_process_log_level() +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForQuestionAnswering.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + use_auth_token=None + ) + +# Preprocessing is slighlty different for training and evaluation. +column_names = raw_datasets["train"].column_names +question_column_name = "question" if "question" in column_names else column_names[0] +context_column_name = "context" if "context" in column_names else column_names[1] +answer_column_name = "answers" if "answers" in column_names else column_names[2] + +# Padding side determines if we do (question|context) or (context|question). 
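+# BERT-style tokenizers pad on the right, so the question is placed first and only the context ("only_second") is truncated, with a stride so long contexts are split into overlapping features.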
+pad_on_right = tokenizer.padding_side == "right" + +max_seq_length = min(384, tokenizer.model_max_length) + +# Training preprocessing +def prepare_train_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=128, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 
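+                # The two while loops below walk the token indices inward until they exactly cover the [start_char, end_char) span of the answer.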
+ # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset" + ) + +# Validation preprocessing +def prepare_validation_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=128, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
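+        # Offsets are kept only where sequence_ids marks the context side; question tokens are mapped to None so postprocessing can skip them.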
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + +if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on validation dataset", + ) + max_eval_samples = min(len(eval_dataset), 5000) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + +# Data collator +data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + +# Post-processing: +def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=False, + n_best_size=20, + max_answer_length=30, + null_score_diff_threshold=0.0, + output_dir=training_args.output_dir, + log_level=log_level, + prefix=stage, + ) + # Format the result to the format the metric expects. + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + +metric = load_metric("squad") + +def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = QuestionAnsweringTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, +) + + +set_seed(training_args.seed) +start_time = timeit.default_timer() +results = trainer.evaluate() +evalTime = timeit.default_timer() - start_time +max_eval_samples = 5000 +samples = min(max_eval_samples, len(eval_dataset)) + +eval_f1_static = results.get("eval_f1") +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_f1 Accuracy: {}".format(eval_f1_static)) +print("Latency: {:.3f} ms".format(evalTime / samples * 1000)) +print("Throughput: {} samples/sec".format(samples/evalTime)) \ No newline at end of file diff --git a/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb b/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb index aa4310d2ae6..f0253a9f642 100644 --- a/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb +++ b/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb @@ -69,7 +69,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 
" + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { @@ -636,7 +636,7 @@ " max_eval_samples=5000\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./tmp/squad_output\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -991,8 +991,8 @@ " compute_metrics=compute_metrics,\n", ")\n", "\n", - "trainer_static.save_model(training_args.output_dir+'/saved_results_static')\n", - "model.config.save_pretrained(training_args.output_dir+'/saved_results_static')\n", + "trainer_static.save_model('./saved_results_static')\n", + "model.config.save_pretrained('./saved_results_static')\n", "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_f1\", # Metric used for the tuning strategy.\n", @@ -1047,6 +1047,26 @@ "print(\"Throughput: {} samples/sec\".format(samples/evalTime))" ] }, + { + "cell_type": "markdown", + "id": "0a0ec3a8", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2754e847", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -1071,6 +1091,7 @@ "outputs": [], "source": [ "set_seed(training_args.seed)\n", + "training_args.output_dir = \"./saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = QuestionAnsweringTrainer(\n", " model=model,\n", @@ -1084,8 +1105,8 @@ " compute_metrics=compute_metrics,\n", ")\n", "\n", - "trainer_dynamic.save_model(training_args.output_dir+'/saved_results_dynamic')\n", - "model.config.save_pretrained(training_args.output_dir+'/saved_results_dynamic')\n", + "trainer_dynamic.save_model('./saved_results_dynamic')\n", + "model.config.save_pretrained('./saved_results_dynamic')\n", "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_f1\", # Metric used for the tuning strategy.\n", @@ -1139,6 +1160,25 @@ "print(\"Throughput: {} samples/sec\".format(samples/evalTime))" ] }, + { + "cell_type": "markdown", + "id": "4c8e1c6f", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "101d0b51", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1189,6 +1229,25 @@ "print(\"Latency: {:.3f} ms\".format(evalTime / samples * 1000))\n", "print(\"Throughput: {} samples/sec\".format(samples/evalTime))" ] + }, + { + "cell_type": "markdown", + "id": "17fb0e7f", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9710d14", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=distilbert-base-uncased-distilled-squad --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/question-answering/distillation.ipynb b/docs/tutorials/pytorch/question-answering/distillation.ipynb new file mode 
100644 index 00000000000..4e905c1c3c5 --- /dev/null +++ b/docs/tutorials/pytorch/question-answering/distillation.ipynb @@ -0,0 +1,773 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the distillation approach based on [IntelĀ® Neural Compressor](https://github.com/intel/neural-compressor) for question-answering." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisite" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install model dependency\n", + "! pip install accelerate datasets >= 1.1.3 sentencepiece != 0.1.92 protobuf torch >= 1.10 transformers >= 4.12.0 wandb\n", + "! pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "import sys\n", + "from dataclasses import dataclass, field\n", + "from typing import Optional\n", + "\n", + "import datasets\n", + "from datasets import load_dataset, load_metric\n", + "\n", + "import functools\n", + "import numpy as np\n", + "import time\n", + "import torch\n", + "import transformers\n", + "from intel_extension_for_transformers import metrics, OptimizedModel, DistillationConfig\n", + "from torch.utils.data import DataLoader\n", + "from tqdm import tqdm\n", + "from trainer_qa import QuestionAnsweringTrainer\n", + "from transformers import (\n", + " AutoConfig,\n", + " AutoModelForQuestionAnswering,\n", + " AutoTokenizer,\n", + " DataCollatorWithPadding,\n", + " EvalPrediction,\n", + " HfArgumentParser,\n", + " PreTrainedTokenizerFast,\n", + " TrainingArguments,\n", + " default_data_collator,\n", + " set_seed,\n", + ")\n", + "from transformers.trainer_utils import get_last_checkpoint\n", + "from transformers.utils import check_min_version\n", + "from transformers.utils.versions import require_version\n", + "from typing import Optional\n", + "from utils_qa import postprocess_qa_predictions\n", + "\n", + "\n", + "# Will error if the minimal version of Transformers is not installed. 
Remove at your own risks.\n", + "check_min_version(\"4.12.0\")\n", + "\n", + "require_version(\"datasets>=1.8.0\", \"To fix: pip install -r examples/pytorch/question-answering/requirements.txt\")\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "os.environ[\"WANDB_DISABLED\"] = \"true\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ========== Define arguments =========\n", + "@dataclass\n", + "class ModelArguments:\n", + " \"\"\"\n", + " Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.\n", + " \"\"\"\n", + " model_name_or_path: str = field(\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"}\n", + " )\n", + "\n", + "\n", + "@dataclass\n", + "class DataTrainingArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what data we are going to input our model for training and eval.\n", + " \"\"\"\n", + " dataset_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"The name of the dataset to use (via the datasets library).\"}\n", + " )\n", + " max_seq_length: int = field(\n", + " default=384,\n", + " metadata={\n", + " \"help\": \"The maximum total input sequence length after tokenization. Sequences longer \"\n", + " \"than this will be truncated, sequences shorter will be padded.\"\n", + " },\n", + " )\n", + " max_train_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of training examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " max_eval_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of evaluation examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " overwrite_cache: bool = field(\n", + " default=False, metadata={\"help\": \"Overwrite the cached training and evaluation sets\"}\n", + " )\n", + " doc_stride: int = field(\n", + " default=128,\n", + " metadata={\"help\": \"When splitting up a long document into chunks, how much stride to take between chunks.\"},\n", + " )\n", + " pad_to_max_length: bool = field(\n", + " default=True,\n", + " metadata={\n", + " \"help\": \"Whether to pad all samples to `max_seq_length`. \"\n", + " \"If False, will pad the samples dynamically when batching to the maximum length in the batch (which can \"\n", + " \"be faster on GPU but will be slower on TPU).\"\n", + " },\n", + " )\n", + " version_2_with_negative: bool = field(\n", + " default=False, metadata={\"help\": \"If true, some of the examples do not have an answer.\"}\n", + " )\n", + " null_score_diff_threshold: float = field(\n", + " default=0.0,\n", + " metadata={\n", + " \"help\": \"The threshold used to select the null answer: if the best answer has a score that is less than \"\n", + " \"the score of the null answer minus this threshold, the null answer is selected for this example. 
\"\n", + " \"Only useful when `version_2_with_negative=True`.\"\n", + " },\n", + " )\n", + " n_best_size: int = field(\n", + " default=20,\n", + " metadata={\"help\": \"The total number of n-best predictions to generate when looking for an answer.\"},\n", + " )\n", + " max_answer_length: int = field(\n", + " default=30,\n", + " metadata={\n", + " \"help\": \"The maximum length of an answer that can be generated. This is needed because the start \"\n", + " \"and end predictions are not conditioned on one another.\"\n", + " },\n", + " )\n", + "\n", + "@dataclass\n", + "class OptimizationArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what type of optimization we are going to apply on the model.\n", + " \"\"\"\n", + "\n", + " distillation: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"Whether or not to apply distillation.\"},\n", + " )\n", + " teacher_model_name_or_path: str = field(\n", + " default=False,\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"},\n", + " )\n", + " run_teacher_logits: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"Whether or not to obtain teacher model's logits on train dataset before training.\"},\n", + " )\n", + " metric_name: Optional[str] = field(\n", + " default=\"eval_f1\",\n", + " metadata={\"help\": \"Metric used for the tuning strategy.\"},\n", + " )\n", + " tolerance_mode: Optional[str] = field(\n", + " default=\"absolute\",\n", + " metadata={\"help\": \"Metric tolerance model, expected to be relative or absolute.\"},\n", + " )\n", + " perf_tol: Optional[float] = field(\n", + " default=0.02,\n", + " metadata={\"help\": \"Performance tolerance when optimizing the model.\"},\n", + " )\n", + " benchmark: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"run benchmark.\"}\n", + " )\n", + " accuracy_only: bool = field(\n", + " default=False,\n", + " metadata={\"help\":\"Whether to only test accuracy for model tuned by Neural Compressor.\"}\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We now keep distinct sets of args, for a cleaner separation of concerns.\n", + "parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, OptimizationArguments))\n", + "if len(sys.argv) == 2 and sys.argv[1].endswith(\".json\"):\n", + " # If we pass only one argument to the script and it's the path to a json file,\n", + " # let's parse it to get our arguments.\n", + " model_args, data_args, training_args, optim_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))\n", + "else:\n", + " model_args, data_args, training_args, optim_args = parser.parse_args_into_dataclasses()\n", + "\n", + "# Setup logging\n", + "logging.basicConfig(\n", + " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", + " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", + " handlers=[logging.StreamHandler(sys.stdout)],\n", + ")\n", + "\n", + "log_level = training_args.get_process_log_level()\n", + "logger.setLevel(log_level)\n", + "datasets.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.enable_default_handler()\n", + "transformers.utils.logging.enable_explicit_format()\n", + "\n", + "# Log on each process the small summary:\n", + "logger.warning(\n", + " f\"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}\"\n", + " + 
f\"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}\"\n", + ")\n", + "logger.info(f\"Training/evaluation parameters {training_args}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download dataset from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_datasets = load_dataset(\n", + " data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download fp32 model from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set seed before initializing model.\n", + "set_seed(training_args.seed)\n", + "\n", + "# get fp32 model\n", + "config = AutoConfig.from_pretrained(model_args.model_name_or_path)\n", + "tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, use_fast=True)\n", + "model = AutoModelForQuestionAnswering.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n", + " config=config,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing the datasets.\n", + "# Preprocessing is slighlty different for training and evaluation.\n", + "column_names = raw_datasets[\"train\"].column_names\n", + "question_column_name = \"question\" if \"question\" in column_names else column_names[0]\n", + "context_column_name = \"context\" if \"context\" in column_names else column_names[1]\n", + "answer_column_name = \"answers\" if \"answers\" in column_names else column_names[2]\n", + "\n", + "# Padding side determines if we do (question|context) or (context|question).\n", + "pad_on_right = tokenizer.padding_side == \"right\"\n", + "\n", + "max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)\n", + "\n", + "# Training preprocessing\n", + "def prepare_train_features(examples, tokenizer=tokenizer):\n", + " # Some of the questions have lots of whitespace on the left, which is not useful and will make the\n", + " # truncation of the context fail (the tokenized question will take a lots of space). So we remove that\n", + " # left whitespace\n", + " examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]\n", + "\n", + " # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. 
This results\n", + " # in one example possible giving several features when a context is long, each of those features having a\n", + " # context that overlaps a bit the context of the previous feature.\n", + " tokenized_examples = tokenizer(\n", + " examples[question_column_name if pad_on_right else context_column_name],\n", + " examples[context_column_name if pad_on_right else question_column_name],\n", + " truncation=\"only_second\" if pad_on_right else \"only_first\",\n", + " max_length=max_seq_length,\n", + " stride=data_args.doc_stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\" if data_args.pad_to_max_length else False,\n", + " )\n", + "\n", + " # Since one example might give us several features if it has a long context, we need a map from a feature to\n", + " # its corresponding example. This key gives us just that.\n", + " sample_mapping = tokenized_examples.pop(\"overflow_to_sample_mapping\")\n", + " # The offset mappings will give us a map from token to character position in the original context. This will\n", + " # help us compute the start_positions and end_positions.\n", + " offset_mapping = tokenized_examples.pop(\"offset_mapping\")\n", + "\n", + " # Let's label those examples!\n", + " tokenized_examples[\"start_positions\"] = []\n", + " tokenized_examples[\"end_positions\"] = []\n", + "\n", + " for i, offsets in enumerate(offset_mapping):\n", + " # We will label impossible answers with the index of the CLS token.\n", + " input_ids = tokenized_examples[\"input_ids\"][i]\n", + " cls_index = input_ids.index(tokenizer.cls_token_id)\n", + "\n", + " # Grab the sequence corresponding to that example (to know what is the context and what is the question).\n", + " sequence_ids = tokenized_examples.sequence_ids(i)\n", + "\n", + " # One example can give several spans, this is the index of the example containing this span of text.\n", + " sample_index = sample_mapping[i]\n", + " answers = examples[answer_column_name][sample_index]\n", + " # If no answers are given, set the cls_index as answer.\n", + " if len(answers[\"answer_start\"]) == 0:\n", + " tokenized_examples[\"start_positions\"].append(cls_index)\n", + " tokenized_examples[\"end_positions\"].append(cls_index)\n", + " else:\n", + " # Start/end character index of the answer in the text.\n", + " start_char = answers[\"answer_start\"][0]\n", + " end_char = start_char + len(answers[\"text\"][0])\n", + "\n", + " # Start token index of the current span in the text.\n", + " token_start_index = 0\n", + " while sequence_ids[token_start_index] != (1 if pad_on_right else 0):\n", + " token_start_index += 1\n", + "\n", + " # End token index of the current span in the text.\n", + " token_end_index = len(input_ids) - 1\n", + " while sequence_ids[token_end_index] != (1 if pad_on_right else 0):\n", + " token_end_index -= 1\n", + "\n", + " # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).\n", + " if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):\n", + " tokenized_examples[\"start_positions\"].append(cls_index)\n", + " tokenized_examples[\"end_positions\"].append(cls_index)\n", + " else:\n", + " # Otherwise move the token_start_index and token_end_index to the two ends of the answer.\n", + " # Note: we could go after the last offset if the answer is the last word (edge case).\n", + " while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:\n", + " 
token_start_index += 1\n", + " tokenized_examples[\"start_positions\"].append(token_start_index - 1)\n", + " while offsets[token_end_index][1] >= end_char:\n", + " token_end_index -= 1\n", + " tokenized_examples[\"end_positions\"].append(token_end_index + 1)\n", + "\n", + " return tokenized_examples\n", + "\n", + "if training_args.do_train:\n", + " if \"train\" not in raw_datasets:\n", + " raise ValueError(\"--do_train requires a train dataset\")\n", + " train_dataset = raw_datasets[\"train\"]\n", + " with training_args.main_process_first(desc=\"train dataset map pre-processing\"):\n", + " train_dataset = train_dataset.map(\n", + " prepare_train_features,\n", + " batched=True,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on train dataset\"\n", + " )\n", + " if data_args.max_train_samples is not None:\n", + " # Number of samples might increase during Feature Creation, We select only specified max samples\n", + " max_train_samples = min(len(train_dataset), data_args.max_train_samples)\n", + " train_dataset = train_dataset.select(range(max_train_samples))\n", + "\n", + "# Validation preprocessing\n", + "def prepare_validation_features(examples):\n", + " # Some of the questions have lots of whitespace on the left, which is not useful and will make the\n", + " # truncation of the context fail (the tokenized question will take a lots of space). So we remove that\n", + " # left whitespace\n", + " examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]\n", + "\n", + " # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results\n", + " # in one example possible giving several features when a context is long, each of those features having a\n", + " # context that overlaps a bit the context of the previous feature.\n", + " tokenized_examples = tokenizer(\n", + " examples[question_column_name if pad_on_right else context_column_name],\n", + " examples[context_column_name if pad_on_right else question_column_name],\n", + " truncation=\"only_second\" if pad_on_right else \"only_first\",\n", + " max_length=max_seq_length,\n", + " stride=data_args.doc_stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\" if data_args.pad_to_max_length else False,\n", + " )\n", + "\n", + " # Since one example might give us several features if it has a long context, we need a map from a feature to\n", + " # its corresponding example. 
This key gives us just that.\n", + " sample_mapping = tokenized_examples.pop(\"overflow_to_sample_mapping\")\n", + "\n", + " # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the\n", + " # corresponding example_id and we will store the offset mappings.\n", + " tokenized_examples[\"example_id\"] = []\n", + "\n", + " for i in range(len(tokenized_examples[\"input_ids\"])):\n", + " # Grab the sequence corresponding to that example (to know what is the context and what is the question).\n", + " sequence_ids = tokenized_examples.sequence_ids(i)\n", + " context_index = 1 if pad_on_right else 0\n", + "\n", + " # One example can give several spans, this is the index of the example containing this span of text.\n", + " sample_index = sample_mapping[i]\n", + " tokenized_examples[\"example_id\"].append(examples[\"id\"][sample_index])\n", + "\n", + " # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token\n", + " # position is part of the context or not.\n", + " tokenized_examples[\"offset_mapping\"][i] = [\n", + " (o if sequence_ids[k] == context_index else None)\n", + " for k, o in enumerate(tokenized_examples[\"offset_mapping\"][i])\n", + " ]\n", + "\n", + " return tokenized_examples\n", + "\n", + "if training_args.do_eval:\n", + " if \"validation\" not in raw_datasets:\n", + " raise ValueError(\"--do_eval requires a validation dataset\")\n", + " eval_examples = raw_datasets[\"validation\"]\n", + " if data_args.max_eval_samples is not None:\n", + " # We will select sample from whole data\n", + " eval_examples = eval_examples.select(range(data_args.max_eval_samples))\n", + " # Validation Feature Creation\n", + " with training_args.main_process_first(desc=\"validation dataset map pre-processing\"):\n", + " eval_dataset = eval_examples.map(\n", + " prepare_validation_features,\n", + " batched=True,\n", + " num_proc=data_args.preprocessing_num_workers,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on validation dataset\",\n", + " )\n", + " if data_args.max_eval_samples is not None:\n", + " # During Feature creation dataset samples might increase, we will select required samples again\n", + " eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))\n", + "\n", + "# Data collator\n", + "data_collator = (\n", + " DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)\n", + ")\n", + "\n", + "def post_processing_function(examples, features, predictions, stage=\"eval\"):\n", + " # Post-processing: we match the start logits and end logits to answers in the original context.\n", + " predictions = postprocess_qa_predictions(\n", + " examples=examples,\n", + " features=features,\n", + " predictions=predictions,\n", + " version_2_with_negative=data_args.version_2_with_negative,\n", + " n_best_size=data_args.n_best_size,\n", + " max_answer_length=data_args.max_answer_length,\n", + " null_score_diff_threshold=data_args.null_score_diff_threshold,\n", + " output_dir=training_args.output_dir,\n", + " log_level=log_level,\n", + " prefix=stage,\n", + " )\n", + " # Format the result to the format the metric expects.\n", + " if data_args.version_2_with_negative:\n", + " formatted_predictions = [\n", + " {\"id\": k, \"prediction_text\": v, \"no_answer_probability\": 0.0} for k, v in predictions.items()\n", + " ]\n", + " else:\n", + " formatted_predictions = [{\"id\": k, \"prediction_text\": v} for k, 
v in predictions.items()]\n", + "\n", + " references = [{\"id\": ex[\"id\"], \"answers\": ex[answer_column_name]} for ex in examples]\n", + " return EvalPrediction(predictions=formatted_predictions, label_ids=references)\n", + "\n", + "metric = load_metric(\"squad_v2\" if data_args.version_2_with_negative else \"squad\")\n", + "\n", + "def compute_metrics(p: EvalPrediction):\n", + " return metric.compute(predictions=p.predictions, references=p.label_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distillation & Benchmark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class QAModel_output_reshaped(torch.nn.Module):\n", + " def __init__(self, model):\n", + " super(QAModel_output_reshaped, self).__init__()\n", + " self.model = model\n", + "\n", + " def forward(self, *args, **kwargs):\n", + " outputs = self.model(*args, **kwargs)\n", + " outputs_reshaped = torch.vstack([torch.vstack([sx, ex]) \\\n", + " for sx, ex in zip(outputs['start_logits'], outputs['end_logits'])])\n", + " return outputs_reshaped\n", + "\n", + "teacher_config = AutoConfig.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")\n", + "teacher_tokenizer = AutoTokenizer.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " use_fast=True,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")\n", + "teacher_model = AutoModelForQuestionAnswering.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n", + " config=teacher_config,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")\n", + "teacher_model.to(training_args.device)\n", + "\n", + "# Prepare datasets for teacher model\n", + "# Create train feature from dataset\n", + "with training_args.main_process_first(desc=\"train dataset map pre-processing\"):\n", + " teacher_train_dataset = train_examples.map(\n", + " functools.partial(prepare_train_features, tokenizer=teacher_tokenizer),\n", + " batched=True,\n", + " num_proc=data_args.preprocessing_num_workers,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on train dataset\",\n", + " )\n", + "if data_args.max_train_samples is not None:\n", + " # Number of samples might increase during Feature Creation, We select only specified max samples\n", + " teacher_train_dataset = teacher_train_dataset.select(range(data_args.max_train_samples))\n", + "\n", + "# Validation Feature Creation\n", + "with training_args.main_process_first(desc=\"validation dataset map pre-processing\"):\n", + " teacher_eval_dataset = eval_examples.map(\n", + " functools.partial(prepare_validation_features, tokenizer=teacher_tokenizer),\n", + " batched=True,\n", + " num_proc=data_args.preprocessing_num_workers,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on validation dataset\",\n", + " )\n", + "if data_args.max_eval_samples is not None:\n", + " # During Feature creation dataset samples might increase, we will select required samples again\n", + " teacher_eval_dataset = teacher_eval_dataset.select(range(data_args.max_eval_samples))\n", + " \n", + "# get logits of teacher 
model\n", + "if optim_args.run_teacher_logits:\n", + " def dict_tensor_to_model_device(batch, model):\n", + " device = next(model.parameters()).device\n", + " for k in batch:\n", + " batch[k] = batch[k].to(device)\n", + "\n", + " def get_logits(teacher_model, train_dataset, teacher_train_dataset):\n", + " logger.info(\"***** Getting logits of teacher model *****\")\n", + " logger.info(f\" Num examples = {len(train_dataset) }\")\n", + " teacher_model.eval()\n", + " npy_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),\n", + " '{}.{}.npy'.format(data_args.dataset_name, \n", + " optim_args.teacher_model_name_or_path.replace('/', '.')))\n", + " if os.path.exists(npy_file):\n", + " teacher_logits = [list(x) for x in np.load(npy_file, allow_pickle=True)]\n", + " else:\n", + " sampler = None\n", + " if training_args.world_size > 1:\n", + " from transformers.trainer_pt_utils import ShardSampler\n", + " sampler = ShardSampler(\n", + " teacher_train_dataset,\n", + " batch_size=training_args.per_device_eval_batch_size,\n", + " num_processes=training_args.world_size,\n", + " process_index=training_args.process_index,\n", + " )\n", + " teacher_model = torch.nn.parallel.DistributedDataParallel(\n", + " teacher_model,\n", + " device_ids=[training_args.local_rank] \\\n", + " if training_args._n_gpu != 0 else None,\n", + " output_device=training_args.local_rank \\\n", + " if training_args._n_gpu != 0 else None,\n", + " )\n", + " train_dataloader = DataLoader(teacher_train_dataset, \n", + " collate_fn=data_collator, \n", + " sampler=sampler,\n", + " batch_size=training_args.per_device_eval_batch_size)\n", + " train_dataloader = tqdm(train_dataloader, desc=\"Evaluating\")\n", + " teacher_logits = []\n", + " for step, batch in enumerate(train_dataloader):\n", + " dict_tensor_to_model_device(batch, teacher_model)\n", + " outputs = teacher_model(**batch).cpu().numpy()\n", + " if training_args.world_size > 1:\n", + " outputs_list = [None for i in range(training_args.world_size)]\n", + " torch.distributed.all_gather_object(outputs_list, outputs)\n", + " outputs = np.concatenate(outputs_list, axis=0)\n", + " teacher_logits += [[s,e] for s,e in zip(outputs[0::2], outputs[1::2])]\n", + " if training_args.world_size > 1:\n", + " teacher_logits = teacher_logits[:len(teacher_train_dataset)]\n", + " if training_args.local_rank in [-1, 0]:\n", + " np.save(npy_file, teacher_logits, allow_pickle=True)\n", + " return train_dataset.add_column('teacher_logits', teacher_logits[:data_args.max_train_samples])\n", + " with torch.no_grad():\n", + " train_dataset = get_logits(QAModel_output_reshaped(teacher_model), train_dataset, teacher_train_dataset)\n", + " \n", + "para_counter = lambda model:sum(p.numel() for p in model.parameters())\n", + "logger.info(\"***** Number of teacher model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(teacher_model)/10**6))\n", + "logger.info(\"***** Number of student model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(model)/10**6))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set_seed(training_args.seed)\n", + "# Initialize our Trainer\n", + "trainer = QuestionAnsweringTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset if training_args.do_train else None,\n", + " eval_dataset=eval_dataset if training_args.do_eval else None,\n", + " eval_examples=eval_examples if training_args.do_eval else None,\n", + " tokenizer=tokenizer,\n", + " 
data_collator=data_collator,\n", + " post_process_function=post_processing_function,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "tune_metric = metrics.Metric(name=optim_args.metric_name)\n", + "distillation_conf = DistillationConfig(metrics=tune_metric)\n", + "model = trainer.distill(\n", + " distillation_config=distillation_conf, teacher_model=teacher_model\n", + ")\n", + "trainer.save_model(training_args.output_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Benchmark after Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import timeit\n", + "\n", + "# Load the model obtained after distillation with Intel Neural Compressor (INC)\n", + "model = OptimizedModel.from_pretrained(\n", + " training_args.output_dir,\n", + ")\n", + "model.eval()\n", + "trainer.model = model\n", + "start_time = timeit.default_timer()\n", + "results = trainer.evaluate()\n", + "evalTime = timeit.default_timer() - start_time\n", + "max_eval_samples = data_args.max_eval_samples \\\n", + " if data_args.max_eval_samples is not None else len(eval_dataset)\n", + "eval_samples = min(max_eval_samples, len(eval_dataset))\n", + "samples = eval_samples - (eval_samples % training_args.per_device_eval_batch_size) \\\n", + " if training_args.dataloader_drop_last else eval_samples\n", + "logger.info(\"metrics keys: {}\".format(results.keys()))\n", + "bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',\n", + " 'eval_pearson', 'eval_mcc', 'eval_spearmanr']\n", + "ret = False\n", + "for key in bert_task_acc_keys:\n", + " if key in results.keys():\n", + " ret = True\n", + " print('Batch size = ', training_args.per_device_eval_batch_size)\n", + " print(\"Finally Eval {} Accuracy: {}\".format(key, results[key]))\n", + " print(\"Latency: {:.5f} ms\".format(evalTime / samples * 1000))\n", + " print(\"Throughput: {:.5f} samples/sec\".format(samples/evalTime))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a3ed54c68abdb79eabea0140062ffa976ea4d8132b937aa83ca919a8d862edf2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/pytorch/summarization/benchmark.py b/docs/tutorials/pytorch/summarization/benchmark.py new file mode 100644 index 00000000000..e070db9c719 --- /dev/null +++ b/docs/tutorials/pytorch/summarization/benchmark.py @@ -0,0 +1,239 @@ +import logging +import os +import numpy as np +import nltk +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import metrics, OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPSeq2SeqTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + Seq2SeqTrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "lvwerra/pegasus-samsum", help = 'input model for benchmark') +args = arg_parser.parse_args() + +dataset_name="samsum" +summarization_name_mapping = { + "amazon_reviews_multi": ("review_body", "review_title"), + "big_patent": 
("description", "abstract"), + "cnn_dailymail": ("article", "highlights"), + "orange_sum": ("text", "summary"), + "pn_summary": ("article", "summary"), + "psc": ("extract_text", "summary_text"), + "samsum": ("dialogue", "summary"), + "thaisum": ("body", "summary"), + "xglue": ("news_body", "news_title"), + "xsum": ("document", "summary"), + "wiki_summary": ("article", "highlights"), +} +training_args = Seq2SeqTrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + predict_with_generate=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, +) + +raw_datasets = load_dataset(dataset_name) +config = AutoConfig.from_pretrained(args.model_name_or_path, revision="main") +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True, revision="main") + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForSeq2SeqLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) + model.resize_token_embeddings(len(tokenizer)) + +if model.config.decoder_start_token_id is None: + raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + +if ( + hasattr(model.config, "max_position_embeddings") + and model.config.max_position_embeddings < 1024 +): + model.resize_position_embeddings(1024) + +prefix = "" +# preprocessing dataset + +# Preprocessing the datasets. +# We need to tokenize inputs and targets. +if training_args.do_train: + column_names = raw_datasets["train"].column_names +elif training_args.do_eval: + column_names = raw_datasets["validation"].column_names +elif training_args.do_predict: + column_names = raw_datasets["test"].column_names +else: + logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + + +# Get the column names for input/target. +dataset_columns = summarization_name_mapping.get(dataset_name, None) +text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] +summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + +# Temporarily set max_target_length for training. +max_target_length = 128 +padding = False + +def preprocess_function(examples): + # remove pairs where at least one record is None + + inputs, targets = [], [] + for i in range(len(examples[text_column])): + if examples[text_column][i] is not None and examples[summary_column][i] is not None: + inputs.append(examples[text_column][i]) + targets.append(examples[summary_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, max_length=1024, padding=padding, truncation=True) + + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if padding == "max_length": + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + max_train_samples = min(len(train_dataset), 10000) + train_dataset = train_dataset.select(range(max_train_samples)) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on train dataset", + ) + +if training_args.do_eval: + max_target_length = 128 + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + max_eval_samples = min(len(eval_dataset), 500) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on validation dataset", + ) + +# Data collator +label_pad_token_id = -100 +data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, +) + +# Metric +metric = load_metric("rouge") + +def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] + + return preds, labels + +def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + # Replace -100 in the labels as we can't decode them. 
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + # Extract a few results from ROUGE + result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + + prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] + result["gen_len"] = np.mean(prediction_lens) + result = {k: round(v, 4) for k, v in result.items()} + return result + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPSeq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, +) +max_length = ( + training_args.generation_max_length + if training_args.generation_max_length is not None + else 128 +) +num_beams = training_args.generation_num_beams +trainer.max_length = max_length +trainer.num_beams = num_beams + +results = trainer.evaluate(max_length=max_length, num_beams=num_beams) +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb b/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb index 101186af5fd..013559fe459 100644 --- a/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb +++ b/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -304,7 +304,7 @@ " max_eval_samples=500\n", ")\n", "training_args = Seq2SeqTrainingArguments(\n", - " output_dir=\"/tmp/tst-summarization\",\n", + " output_dir=\"./saved_results_dynamic\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -625,8 +625,8 @@ "metric_name = \"eval_rougeLsum\"\n", "\n", "# tuning\n", - "model.config.save_pretrained(\"/tmp/tst-summarization/saved_pretrained_static\")\n", - "trainer.save_model(\"/tmp/tst-summarization/saved_model_static\")\n", + "model.config.save_pretrained(\"./saved_results_dynamic\")\n", + "trainer.save_model(\"./saved_results_dynamic\")\n", "\n", "tune_metric = nlp_metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", @@ -674,6 +674,25 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "f35818b8", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bac69198", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -716,6 +735,25 @@ "print(\"Latency: {:.5f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {:.5f} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "05df025f", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71e4e7da", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=textattack/bert-base-uncased-MRPC --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/text-classification/benchmark.py b/docs/tutorials/pytorch/text-classification/benchmark.py new file mode 100644 index 00000000000..eac4d67ec7d --- /dev/null +++ b/docs/tutorials/pytorch/text-classification/benchmark.py @@ -0,0 +1,177 @@ +import logging +import os +import numpy as np +import random +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + EvalPrediction, + PretrainedConfig, + TrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "textattack/bert-base-uncased-MRPC", help = 'input model for benchmark') +args = arg_parser.parse_args() + +# download the dataset. 
+raw_datasets = load_dataset("glue", "mrpc") +# Labels +label_list = raw_datasets["train"].features["label"].names +num_labels = len(label_list) + +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, +) +config = AutoConfig.from_pretrained( + args.model_name_or_path, + num_labels=num_labels, + finetuning_task="mrpc", + revision="main" +) +tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, + use_fast=True, + revision="main" +) + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForSequenceClassification.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) + +# Preprocessing the raw_datasets +sentence1_key, sentence2_key = task_to_keys["mrpc"] +# Padding strategy +padding = False +# Some models have set the order of the labels to use, so let's make sure we do use it. +label_to_id = None +if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: + # Some have all caps in their config, some don't. + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + f"Your model seems to have been trained with labels, but they don't match the dataset: " + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}.\n" + f"Ignoring the model labels as a result." 
+ ) +if label_to_id is not None: + model.config.label2id = label_to_id + model.config.id2label = {id: label for label, id in config.label2id.items()} +max_seq_length = min(128, tokenizer.model_max_length) + +def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + +with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_function, batched=True, load_from_cache_file=False + ) + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + +if training_args.do_eval: + if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + +# Log a few random samples from the training set: +if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + +# Get the metric function +metric = load_metric("glue", "mrpc") + +metric_name = "eval_accuracy" + +# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a +# predictions and label_ids field) and has to return a dictionary string to float. +def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.argmax(preds, axis=1) + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result + +# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 
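+# (Since `padding` is False above, samples are not padded during preprocessing; leaving the
+# collator as None lets the Trainer fall back to DataCollatorWithPadding, which pads each
+# batch dynamically because a tokenizer is passed to it below.)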
+data_collator = None + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, +) + + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb b/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb index 15fda15cf64..926ed3e63e9 100644 --- a/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb +++ b/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb @@ -36,6 +36,14 @@ "# Prerequisite" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2172da4c", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "86b20e2b", @@ -57,7 +65,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { @@ -279,7 +287,7 @@ " overwrite_cache=True\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./saved_result\",\n", + " output_dir=\"./saved_result_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -517,8 +525,8 @@ "if not training_args.do_eval:\n", " raise ValueError(\"do_eval must be set to True for quantization.\")\n", "\n", - "model.config.save_pretrained(\"./saved_result/saved_pretrained_static\")\n", - "trainer_static.save_model(\"./saved_result/saved_model_static\")\n", + "model.config.save_pretrained(\"./saved_results_static\")\n", + "trainer_static.save_model(\"./saved_results_static\")\n", "\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", @@ -566,6 +574,26 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "bc69524d", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "860e0503", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -591,6 +619,7 @@ "source": [ "# Set seed before initializing model.\n", "set_seed(training_args.seed)\n", + "training_args.output_dir = \"saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = NLPTrainer(\n", " model=model,\n", @@ -606,8 +635,8 @@ "if not training_args.do_eval:\n", " raise ValueError(\"do_eval must be set to True for quantization.\")\n", "\n", - 
"model.config.save_pretrained(\"./saved_result/saved_pretrained_dynamic\")\n", - "trainer_dynamic.save_model(\"./saved_result/saved_model_dynamic\")\n", + "model.config.save_pretrained(\"./saved_results_dynamic\")\n", + "trainer_dynamic.save_model(\"./saved_results_dynamic\")\n", "\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", @@ -654,6 +683,25 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "5b98f5dc", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36575ce4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -697,6 +745,25 @@ "print(\"Latency: {:.5f} ms\".format(1000 / throughput))\n", "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] + }, + { + "cell_type": "markdown", + "id": "7f7c85bc", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36dec093", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=lvwerra/pegasus-samsum --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/text-classification/distillation.ipynb b/docs/tutorials/pytorch/text-classification/distillation.ipynb new file mode 100644 index 00000000000..0f9d70a7744 --- /dev/null +++ b/docs/tutorials/pytorch/text-classification/distillation.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the distillation approach based on [IntelĀ® Neural Compressor](https://github.com/intel/neural-compressor) for text-classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisite" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install model dependency\n", + "! pip install accelerate datasets >= 1.1.3 sentencepiece != 0.1.92 protobuf torch >= 1.10 transformers >= 4.12.0 wandb\n", + "! 
pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datasets\n", + "import functools\n", + "import logging\n", + "import os\n", + "import numpy as np\n", + "import random\n", + "import sys\n", + "import torch\n", + "import transformers\n", + "from dataclasses import dataclass, field\n", + "from datasets import load_dataset, load_metric\n", + "from intel_extension_for_transformers import (\n", + " metrics,\n", + " DistillationConfig,\n", + " OptimizedModel,\n", + ")\n", + "from intel_extension_for_transformers.optimization.trainer import NLPTrainer\n", + "from torch.utils.data import DataLoader\n", + "from tqdm.auto import tqdm\n", + "from transformers import (\n", + " AutoConfig,\n", + " AutoModelForSequenceClassification,\n", + " AutoTokenizer,\n", + " DataCollatorWithPadding,\n", + " EvalPrediction,\n", + " HfArgumentParser,\n", + " PretrainedConfig,\n", + " TrainingArguments,\n", + " default_data_collator,\n", + " set_seed,\n", + ")\n", + "from transformers.trainer_utils import get_last_checkpoint\n", + "from transformers.utils import check_min_version\n", + "from typing import Optional\n", + "\n", + "\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n", + "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", + "\n", + "\n", + "# Will error if the minimal version of Transformers is not installed. Remove at your own risks.\n", + "check_min_version(\"4.12.0\")\n", + "\n", + "\n", + "task_to_keys = {\n", + " \"cola\": (\"sentence\", None),\n", + " \"mnli\": (\"premise\", \"hypothesis\"),\n", + " \"mrpc\": (\"sentence1\", \"sentence2\"),\n", + " \"qnli\": (\"question\", \"sentence\"),\n", + " \"qqp\": (\"question1\", \"question2\"),\n", + " \"rte\": (\"sentence1\", \"sentence2\"),\n", + " \"sst2\": (\"sentence\", None),\n", + " \"stsb\": (\"sentence1\", \"sentence2\"),\n", + " \"wnli\": (\"sentence1\", \"sentence2\"),\n", + "}\n", + "\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ========== Define arguments =========\n", + "@dataclass\n", + "class DataTrainingArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what data we are going to input our model for training and eval.\n", + " Using `HfArgumentParser` we can turn this class\n", + " into argparse arguments to be able to specify them on\n", + " the command line.\n", + " \"\"\"\n", + "\n", + " task_name: Optional[str] = field(\n", + " default=None,\n", + " metadata={\"help\": \"The name of the task to train on: \" + \", \".join(task_to_keys.keys())},\n", + " )\n", + " dataset_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"The name of the dataset to use (via the datasets library).\"}\n", + " )\n", + " dataset_config_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"The configuration name of the dataset to use (via the datasets library).\"}\n", + " )\n", + " max_seq_length: int = field(\n", + " default=128,\n", + " metadata={\n", + " \"help\": \"The maximum total input sequence length after tokenization. 
Sequences longer \"\n", + " \"than this will be truncated, sequences shorter will be padded.\"\n", + " },\n", + " )\n", + " overwrite_cache: bool = field(\n", + " default=False, metadata={\"help\": \"Overwrite the cached preprocessed datasets or not.\"}\n", + " )\n", + " pad_to_max_length: bool = field(\n", + " default=True,\n", + " metadata={\n", + " \"help\": \"Whether to pad all samples to `max_seq_length`. \"\n", + " \"If False, will pad the samples dynamically when batching to the maximum length in the batch.\"\n", + " },\n", + " )\n", + " max_train_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of training examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " max_eval_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of evaluation examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " max_predict_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of prediction examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " train_file: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"A csv or a json file containing the training data.\"}\n", + " )\n", + " validation_file: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"A csv or a json file containing the validation data.\"}\n", + " )\n", + "\n", + " def __post_init__(self):\n", + " if self.task_name is not None:\n", + " self.task_name = self.task_name.lower()\n", + " if self.task_name not in task_to_keys.keys():\n", + " raise ValueError(\"Unknown task, you should pick one in \" + \",\".join(task_to_keys.keys()))\n", + " elif self.dataset_name is not None:\n", + " pass\n", + " elif self.train_file is None or self.validation_file is None:\n", + " raise ValueError(\"Need either a GLUE task, a training/validation file or a dataset name.\")\n", + " else:\n", + " train_extension = self.train_file.split(\".\")[-1]\n", + " assert train_extension in [\"csv\", \"json\"], \"`train_file` should be a csv or a json file.\"\n", + " validation_extension = self.validation_file.split(\".\")[-1]\n", + " assert (\n", + " validation_extension == train_extension\n", + " ), \"`validation_file` should have the same extension (csv or json) as `train_file`.\"\n", + "\n", + "\n", + "@dataclass\n", + "class ModelArguments:\n", + " \"\"\"\n", + " Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.\n", + " \"\"\"\n", + "\n", + " model_name_or_path: str = field(\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"}\n", + " )\n", + " config_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"Pretrained config name or path if not the same as model_name\"}\n", + " )\n", + " tokenizer_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"Pretrained tokenizer name or path if not the same as model_name\"}\n", + " )\n", + " cache_dir: Optional[str] = field(\n", + " default=None,\n", + " metadata={\"help\": \"Where do you want to store the pretrained models downloaded from huggingface.co\"},\n", + " )\n", + " use_fast_tokenizer: bool = field(\n", + " default=True,\n", + " metadata={\"help\": \"Whether to use one of the fast tokenizer (backed by the 
tokenizers library) or not.\"},\n", + " )\n", + " model_revision: str = field(\n", + " default=\"main\",\n", + " metadata={\"help\": \"The specific model version to use (can be a branch name, tag name or commit id).\"},\n", + " )\n", + " use_auth_token: bool = field(\n", + " default=False,\n", + " metadata={\n", + " \"help\": \"Will use the token generated when running `transformers-cli login` (necessary to use this script \"\n", + " \"with private models).\"\n", + " },\n", + " )\n", + "\n", + "\n", + "@dataclass\n", + "class OptimizationArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what type of optimization we are going to apply on the model.\n", + " \"\"\"\n", + "\n", + " distillation: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"Whether or not to apply distillation.\"},\n", + " )\n", + " teacher_model_name_or_path: str = field(\n", + " default=False,\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"}\n", + " )\n", + " metric_name: Optional[str] = field(\n", + " default=None,\n", + " metadata={\"help\": \"Metric used for the tuning strategy.\"},\n", + " )\n", + " tolerance_mode: Optional[str] = field(\n", + " default=\"absolute\",\n", + " metadata={\"help\": \"Metric tolerance model, expected to be relative or absolute.\"},\n", + " )\n", + " perf_tol: Optional[float] = field(\n", + " default=0.02,\n", + " metadata={\"help\": \"Performance tolerance when optimizing the model.\"},\n", + " )\n", + " benchmark: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"run benchmark.\"})\n", + " accuracy_only: bool = field(\n", + " default=False,\n", + " metadata={\"help\":\"Whether to only test accuracy for model tuned by Neural Compressor.\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We now keep distinct sets of args, for a cleaner separation of concerns.\n", + "parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, OptimizationArguments))\n", + "if len(sys.argv) == 2 and sys.argv[1].endswith(\".json\"):\n", + " # If we pass only one argument to the script and it's the path to a json file,\n", + " # let's parse it to get our arguments.\n", + " model_args, data_args, training_args, optim_args = parser.parse_json_file(\n", + " json_file=os.path.abspath(sys.argv[1])\n", + " )\n", + "else:\n", + " model_args, data_args, training_args, optim_args = parser.parse_args_into_dataclasses()\n", + "\n", + "# Setup logging\n", + "logging.basicConfig(\n", + " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", + " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", + " handlers=[logging.StreamHandler(sys.stdout)],\n", + ")\n", + "\n", + "log_level = training_args.get_process_log_level()\n", + "logger.setLevel(log_level)\n", + "datasets.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.enable_default_handler()\n", + "transformers.utils.logging.enable_explicit_format()\n", + "\n", + "# Log on each process the small summary:\n", + "logger.warning(\n", + " f\"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}\"\n", + " + f\"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}\"\n", + ")\n", + "logger.info(f\"Training/evaluation parameters {training_args}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Download dataset from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# download the dataset.\n", + "raw_datasets = load_dataset(\"glue\", data_args.task_name)\n", + "# Labels\n", + "label_list = raw_datasets[\"train\"].features[\"label\"].names\n", + "num_labels = len(label_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download fp32 model from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load pretrained model and tokenizer\n", + "config = AutoConfig.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " num_labels=num_labels,\n", + " finetuning_task=data_args.task_name,\n", + " revision=\"main\"\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " use_fast=True,\n", + " revision=\"main\"\n", + ")\n", + "model = AutoModelForSequenceClassification.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n", + " config=config,\n", + " revision=\"main\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing the raw_datasets\n", + "sentence1_key, sentence2_key = task_to_keys[data_args.task_name]\n", + "# Padding strategy\n", + "padding = False\n", + "# Some models have set the order of the labels to use, so let's make sure we do use it.\n", + "label_to_id = None\n", + "if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:\n", + " # Some have all caps in their config, some don't.\n", + " label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}\n", + " if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):\n", + " label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}\n", + " else:\n", + " logger.warning(\n", + " f\"Your model seems to have been trained with labels, but they don't match the dataset: \"\n", + " f\"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}.\\n\"\n", + " f\"Ignoring the model labels as a result.\"\n", + " )\n", + "if label_to_id is not None:\n", + " model.config.label2id = label_to_id\n", + " model.config.id2label = {id: label for label, id in config.label2id.items()}\n", + "max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)\n", + "\n", + "def preprocess_function(examples, tokenizer=tokenizer):\n", + " # Tokenize the texts\n", + " args = (\n", + " (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])\n", + " )\n", + " result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)\n", + "\n", + " # Map labels to IDs (not necessary for GLUE tasks)\n", + " if label_to_id is not None and \"label\" in examples:\n", + " result[\"label\"] = [(label_to_id[l] if l != -1 else -1) for l in examples[\"label\"]]\n", + " return result\n", + "\n", + "with training_args.main_process_first(desc=\"dataset map pre-processing\"):\n", + " raw_datasets = raw_datasets.map(\n", + " preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache\n", + " )\n", + "\n", + "if training_args.do_train:\n", + " if \"train\" not in 
raw_datasets:\n", + " raise ValueError(\"--do_train requires a train dataset\")\n", + " train_dataset = raw_datasets[\"train\"]\n", + "\n", + "if training_args.do_eval:\n", + " if \"validation\" not in raw_datasets and \"validation_matched\" not in raw_datasets:\n", + " raise ValueError(\"--do_eval requires a validation dataset\")\n", + " eval_dataset = raw_datasets[\"validation_matched\" if data_args.task_name == \"mnli\" else \"validation\"]\n", + "\n", + "# Log a few random samples from the training set:\n", + "if training_args.do_train:\n", + " for index in random.sample(range(len(train_dataset)), 3):\n", + " logger.info(f\"Sample {index} of the training set: {train_dataset[index]}.\")\n", + "\n", + "# Get the metric function\n", + "metric = load_metric(\"glue\", data_args.task_name)\n", + "\n", + "metric_name = \"eval_accuracy\"\n", + "\n", + "# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with\n", + "# predictions and label_ids field) and has to return a dictionary string to float.\n", + "def compute_metrics(p: EvalPrediction):\n", + " preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions\n", + " preds = np.argmax(preds, axis=1)\n", + " if data_args.task_name is not None:\n", + " result = metric.compute(predictions=preds, references=p.label_ids)\n", + " if len(result) > 1:\n", + " result[\"combined_score\"] = np.mean(list(result.values())).item()\n", + " return result\n", + " else:\n", + " return {\"accuracy\": (preds == p.label_ids).astype(np.float32).mean().item()}\n", + "\n", + "# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.\n", + "data_collator = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distillation & Benchmark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class BertModelforLogitsOutputOnly(torch.nn.Module):\n", + " def __init__(self, model):\n", + " super(BertModelforLogitsOutputOnly, self).__init__()\n", + " self.model = model\n", + " def forward(self, *args, **kwargs):\n", + " output = self.model(*args, **kwargs)\n", + " return output['logits']\n", + "\n", + "teacher_config = AutoConfig.from_pretrained(optim_args.teacher_model_name_or_path, \\\n", + " num_labels=num_labels, finetuning_task=data_args.task_name)\n", + "teacher_tokenizer = AutoTokenizer.from_pretrained(optim_args.teacher_model_name_or_path, \\\n", + " use_fast=model_args.use_fast_tokenizer)\n", + "teacher_model = AutoModelForSequenceClassification.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in optim_args.teacher_model_name_or_path),\n", + " config=teacher_config,\n", + ")\n", + "teacher_model.to(training_args.device)\n", + "\n", + "# prepare datasets for teacher model\n", + "teacher_processed_datasets = raw_datasets.map(\n", + " functools.partial(preprocess_function, tokenizer=teacher_tokenizer), \n", + " batched=True, remove_columns=raw_datasets[\"train\"].column_names\n", + ")\n", + "teacher_train_dataset = teacher_processed_datasets[\"train\"]\n", + "teacher_train_dataset = teacher_train_dataset.select(range(data_args.max_train_samples))\n", + "teacher_eval_dataset = teacher_processed_datasets[\"validation_matched\" \\\n", + " if data_args.task_name == \"mnli\" else \"validation\"]\n", + "teacher_eval_dataset = 
teacher_eval_dataset.select(range(data_args.max_eval_samples))\n", + " \n", + "# get logits of teacher model\n", + "def dict_tensor_to_model_device(batch, model):\n", + " device = next(model.parameters()).device\n", + " for k in batch:\n", + " batch[k] = batch[k].to(device)\n", + "\n", + "def get_logits(teacher_model, train_dataset, teacher_train_dataset):\n", + " logger.info(\"***** Getting logits of teacher model *****\")\n", + " logger.info(f\" Num examples = {len(train_dataset) }\")\n", + " teacher_model.eval()\n", + " npy_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),\n", + " '{}.{}.npy'.format(data_args.task_name, \n", + " optim_args.teacher_model_name_or_path.replace('/', '.')))\n", + " if os.path.exists(npy_file):\n", + " teacher_logits = [x for x in np.load(npy_file)]\n", + " else:\n", + " sampler = None\n", + " if training_args.world_size > 1:\n", + " from transformers.trainer_pt_utils import ShardSampler\n", + " sampler = ShardSampler(\n", + " teacher_train_dataset,\n", + " batch_size=training_args.per_device_eval_batch_size,\n", + " num_processes=training_args.world_size,\n", + " process_index=training_args.process_index,\n", + " )\n", + " teacher_model = torch.nn.parallel.DistributedDataParallel(\n", + " teacher_model,\n", + " device_ids=[training_args.local_rank] \\\n", + " if training_args._n_gpu != 0 else None,\n", + " output_device=training_args.local_rank \\\n", + " if training_args._n_gpu != 0 else None,\n", + " )\n", + " train_dataloader = DataLoader(teacher_train_dataset, \n", + " collate_fn=data_collator,\n", + " sampler=sampler,\n", + " batch_size=training_args.per_device_eval_batch_size)\n", + " train_dataloader = tqdm(train_dataloader, desc=\"Evaluating\")\n", + " teacher_logits = []\n", + " for step, batch in enumerate(train_dataloader):\n", + " dict_tensor_to_model_device(batch, teacher_model)\n", + " outputs = teacher_model(**batch)\n", + " if training_args.world_size > 1:\n", + " outputs_list = [None for i in range(training_args.world_size)]\n", + " torch.distributed.all_gather_object(outputs_list, outputs)\n", + " outputs = torch.concat(outputs_list, dim=0)\n", + " teacher_logits += [x for x in outputs.cpu().numpy()]\n", + " if training_args.world_size > 1:\n", + " teacher_logits = teacher_logits[:len(teacher_train_dataset)]\n", + " if training_args.local_rank in [-1, 0]:\n", + " np.save(npy_file, np.array(teacher_logits))\n", + " return train_dataset.add_column('teacher_logits', teacher_logits)\n", + "\n", + "with torch.no_grad():\n", + " train_dataset = get_logits(BertModelforLogitsOutputOnly(teacher_model), train_dataset, teacher_train_dataset)\n", + " \n", + "para_counter = lambda model:sum(p.numel() for p in model.parameters())\n", + "logger.info(\"***** Number of teacher model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(teacher_model)/10**6))\n", + "logger.info(\"***** Number of student model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(model)/10**6))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set seed before initializing model.\n", + "set_seed(training_args.seed)\n", + "# Initialize our Trainer\n", + "trainer = NLPTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset if training_args.do_train else None,\n", + " eval_dataset=eval_dataset if training_args.do_eval else None,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + ")\n", + "# 
distillation\n", + "if not training_args.do_eval:\n", + " raise ValueError(\"do_eval must be set to True for distillation.\")\n", + "\n", + "tune_metric = metrics.Metric(name=metric_name)\n", + "distillation_conf = DistillationConfig(metrics=tune_metric)\n", + "model = trainer.distill(\n", + " distillation_config=distillation_conf, teacher_model=teacher_model\n", + ")\n", + "trainer.save_model(training_args.output_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Benchmark after Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model obtained after Intel Neural Compressor (INC) quantization\n", + "model = OptimizedModel.from_pretrained(\n", + " training_args.output_dir,\n", + ")\n", + "model.eval()\n", + "trainer.model = model\n", + "results = trainer.evaluate()\n", + "logger.info(\"metrics keys: {}\".format(results.keys()))\n", + "bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',\n", + " 'eval_pearson', 'eval_mcc', 'eval_spearmanr']\n", + "\n", + "for key in bert_task_acc_keys:\n", + " if key in results.keys():\n", + " ret = True\n", + " throughput = results.get(\"eval_samples_per_second\")\n", + " print('Batch size = ', training_args.per_device_eval_batch_size)\n", + " print(\"Finally Eval {} Accuracy: {}\".format(key, results[key]))\n", + " print(\"Latency: {:.5f} ms\".format(1000 / throughput))\n", + " print(\"Throughput: {:.5f} samples/sec\".format(throughput))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a3ed54c68abdb79eabea0140062ffa976ea4d8132b937aa83ca919a8d862edf2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/pytorch/token-classification/benchmark.py b/docs/tutorials/pytorch/token-classification/benchmark.py new file mode 100644 index 00000000000..9c2c083c672 --- /dev/null +++ b/docs/tutorials/pytorch/token-classification/benchmark.py @@ -0,0 +1,242 @@ +import logging +import os +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + PretrainedConfig, + TrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "elastic/distilbert-base-uncased-finetuned-conll03-english", help = 'input model for benchmark') +args = arg_parser.parse_args() + +# download the dataset. 
+raw_datasets = load_dataset("conll2003") +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, +) +column_names = raw_datasets["train"].column_names +features = raw_datasets["train"].features +text_column_name = "tokens" +label_column_name = "ner_tags" + +# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the +# unique labels. +def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + +# If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere. +# Otherwise, we have to get the list of labels manually. +labels_are_int = isinstance(features[label_column_name].feature, ClassLabel) +if labels_are_int: + label_list = features[label_column_name].feature.names + label_to_id = {i: i for i in range(len(label_list))} +else: + label_list = get_label_list(raw_datasets["train"][label_column_name]) + label_to_id = {l: i for i, l in enumerate(label_list)} + +num_labels = len(label_list) + +# download model & vocab. +config = AutoConfig.from_pretrained( + args.model_name_or_path, + num_labels=num_labels, + finetuning_task="ner", + revision="main", +) + +tokenizer_name_or_path = args.model_name_or_path +if config.model_type in {"gpt2", "roberta"}: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + use_fast=True, + revision="main", + add_prefix_space=True, + ) +else: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + use_fast=True, + revision="main", + ) + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = AutoModelForTokenClassification.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) +# Model has labels -> use them. +if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: + if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)): + # Reorganize `label_list` to match the ordering of the model. + if labels_are_int: + label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)} + label_list = [model.config.id2label[i] for i in range(num_labels)] + else: + label_list = [model.config.id2label[i] for i in range(num_labels)] + label_to_id = {l: i for i, l in enumerate(label_list)} + else: + logger.warning( + "Your model seems to have been trained with labels, but they don't match the dataset: ", + f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels: {list(sorted(label_list))}." 
+ "\nIgnoring the model labels as a result.", + ) + +# Set the correspondences label/ID inside the model config +model.config.label2id = {l: i for i, l in enumerate(label_list)} +model.config.id2label = {i: l for i, l in enumerate(label_list)} + +# Map that sends B-Xxx label to its I-Xxx counterpart +b_to_i_label = [] +for idx, label in enumerate(label_list): + if label.startswith("B-") and label.replace("B-", "I-") in label_list: + b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) + else: + b_to_i_label.append(idx) + +# Padding strategy +padding = "max_length" + +# Tokenize all texts and align the labels with them. +def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer( + examples[text_column_name], + padding=padding, + truncation=True, + # We use this argument because the texts in our dataset are lists of words (with a label for each word). + is_split_into_words=True, + ) + labels = [] + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. + if word_idx is None: + label_ids.append(-100) + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append(-100) + previous_word_idx = word_idx + + labels.append(label_ids) + tokenized_inputs["labels"] = labels + return tokenized_inputs + +# train dataset +train_dataset = raw_datasets["train"] +with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + tokenize_and_align_labels, + batched=True, + load_from_cache_file=False, + desc="Running tokenizer on train dataset", + ) + +# evaluation dataset +eval_dataset = raw_datasets["validation"] +eval_dataset = eval_dataset.select(range(1000)) +with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + tokenize_and_align_labels, + batched=True, + load_from_cache_file=False, + desc="Running tokenizer on validation dataset", + ) + +# Data collator +data_collator = DataCollatorForTokenClassification(tokenizer) + +# Metrics +metric = load_metric("seqeval") +metric_name = "eval_f1" + +def compute_metrics(p): + predictions, labels = p + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + results = metric.compute(predictions=true_predictions, references=true_labels) + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + 
compute_metrics=compute_metrics, +) + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb b/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb index b7bb2c2ba78..25638c19078 100644 --- a/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb +++ b/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { @@ -242,7 +242,7 @@ " pad_to_max_length=True\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"/tmp/conll03_output\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -554,8 +554,8 @@ ")\n", "\n", "# tuning\n", - "model.config.save_pretrained(training_args.output_dir)\n", - "trainer_static.save_model(training_args.output_dir)\n", + "model.config.save_pretrained(\"./saved_results_static\")\n", + "trainer_static.save_model(\"./saved_results_static\")\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", @@ -598,6 +598,26 @@ "print(\"Throughput: {} samples/sec\".format(throughput_static))" ] }, + { + "cell_type": "markdown", + "id": "765c996e", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "103c648d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -623,6 +643,7 @@ "source": [ "# Set seed before initializing model.\n", "set_seed(training_args.seed)\n", + "training_args.output_dir = \"saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = NLPTrainer(\n", " model=model,\n", @@ -635,8 +656,8 @@ ")\n", "\n", "# tuning\n", - "model.config.save_pretrained(training_args.output_dir)\n", - "trainer_dynamic.save_model(training_args.output_dir)\n", + "model.config.save_pretrained(\"./saved_results_dynamic\")\n", + "trainer_dynamic.save_model(\"./saved_results_dynamic\")\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", @@ -678,6 +699,25 @@ "print(\"Throughput: {} samples/sec\".format(throughput_dynamic))" ] }, + { + "cell_type": "markdown", + "id": "cb86bdae", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d92088a2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + 
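+    "# Note: ../multi_instance.sh is assumed to launch one benchmark process per core group;\n",
+    "# --core_per_instance=4 is an example value, tune it to the core count reported by\n",
+    "# `numactl --hardware` on your machine.\n",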
"results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -722,6 +762,25 @@ "print(\"Latency: {:.3f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "784fe8d5", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0239337e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=elastic/distilbert-base-uncased-finetuned-conll03-english --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/translation/benchmark.py b/docs/tutorials/pytorch/translation/benchmark.py new file mode 100644 index 00000000000..30fbc5b2e59 --- /dev/null +++ b/docs/tutorials/pytorch/translation/benchmark.py @@ -0,0 +1,171 @@ +import logging +import os +import numpy as np +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPSeq2SeqTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + Seq2SeqTrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "t5-small", help = 'input model for benchmark') +args = arg_parser.parse_args() + +raw_datasets = load_dataset("wmt16", "ro-en") +training_args = Seq2SeqTrainingArguments( + output_dir="./saved_results_dynamic", + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_eval_batch_size=8, + predict_with_generate=True +) +config = AutoConfig.from_pretrained(args.model_name_or_path, revision="main") +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, revision="main", use_fast=True) +prefix = "" + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = AutoModelForSeq2SeqLM.from_pretrained( + "t5-small", + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) + model.resize_token_embeddings(len(tokenizer)) + +# We need to tokenize inputs and targets. +column_names = raw_datasets["train"].column_names + +# Get the language codes for input/target. +source_lang = "en" +target_lang = "ro" + +# Temporarily set max_target_length for training. 
+max_target_length = 128 +padding = False + +def preprocess_function(examples): + inputs = [ex[source_lang] for ex in examples["translation"]] + targets = [ex[target_lang] for ex in examples["translation"]] + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, max_length=1024, padding=padding, truncation=True) + + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + +# define train dataset +train_dataset = raw_datasets["train"] +with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on train dataset", + ) + +# define eval dataset +eval_dataset = raw_datasets["validation"] +max_eval_samples = min(len(eval_dataset), 400) +eval_dataset = eval_dataset.select(range(max_eval_samples)) +with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on validation dataset", + ) + +# Data collator +label_pad_token_id = -100 +data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, +) + +# Metric +metric = load_metric("sacrebleu") + +def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [[label.strip()] for label in labels] + + return preds, labels + +def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + # Replace -100 in the labels as we can't decode them. 
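+    # -100 is the label_pad_token_id inserted by DataCollatorForSeq2Seq above; the tokenizer
+    # cannot decode it, so each -100 is swapped for the pad token before batch_decode, e.g.
+    # (illustrative) [[42, 7, -100]] -> [[42, 7, tokenizer.pad_token_id]].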
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + result = {"bleu": result["score"]} + + prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] + result["gen_len"] = np.mean(prediction_lens) + result = {k: round(v, 4) for k, v in result.items()} + return result + +metric_name = "eval_bleu" +max_length = 128 +num_beams = None +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPSeq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, +) + +results = trainer.evaluate(max_length=max_length, num_beams=num_beams) +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/translation/t5-small.ipynb b/docs/tutorials/pytorch/translation/t5-small.ipynb index cff4566c044..90e41e9f12e 100644 --- a/docs/tutorials/pytorch/translation/t5-small.ipynb +++ b/docs/tutorials/pytorch/translation/t5-small.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -249,7 +249,7 @@ " source_prefix=\"translate English to Romanian: \"\n", ")\n", "training_args = Seq2SeqTrainingArguments(\n", - " output_dir=\"/tmp/tst-translation\",\n", + " output_dir=\"./saved_results_dynamic\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -551,6 +551,25 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "4f477616", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40db91b", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -594,6 +613,25 @@ "print(\"Latency: {:.5f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {:.5f} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "96e88ce1", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ade4c930", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=t5-small --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": {