diff --git a/conda_meta/meta.yaml b/conda_meta/meta.yaml index a9ca9591e77..0a5574eb56b 100644 --- a/conda_meta/meta.yaml +++ b/conda_meta/meta.yaml @@ -7,7 +7,6 @@ build: script_env: - IMEX_WHL number: {{buildnumber}} - noarch: python script: pip install --no-deps {{IMEX_WHL}} requirements: build: @@ -19,6 +18,8 @@ requirements: - numpy - transformers - packaging + - neural_compressor + - protobuf test: imports: - intel_extension_for_transformers diff --git a/docs/tutorials/pytorch/language-modeling/benchmark.py b/docs/tutorials/pytorch/language-modeling/benchmark.py new file mode 100644 index 00000000000..f4d237f14fe --- /dev/null +++ b/docs/tutorials/pytorch/language-modeling/benchmark.py @@ -0,0 +1,188 @@ +import logging +import os +from datasets import load_dataset, load_metric +from itertools import chain +from intel_extension_for_transformers import metrics, OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + MODEL_FOR_MASKED_LM_MAPPING, + AutoConfig, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoTokenizer, + DataCollatorForLanguageModeling, + TrainingArguments, + is_torch_tpu_available, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "bert-base-uncased", help = 'input model for benchmark') +args = arg_parser.parse_args() + +dataset_name="wikitext" +dataset_config_name="wikitext-2-raw-v1" +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + per_device_eval_batch_size=1, + overwrite_output_dir=True +) + +raw_datasets = load_dataset(dataset_name, dataset_config_name) +config = AutoConfig.from_pretrained(args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) +# Set seed before initializing model. +set_seed(training_args.seed) + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForMaskedLM.from_pretrained( + args.model_name_or_path, + config=config, + revision="main", + use_auth_token=None, + ) + model.resize_token_embeddings(len(tokenizer)) + +# First we tokenize all the texts.
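+# The tokenized wikitext lines are then concatenated and split into fixed-length blocks of max_seq_length by group_texts below, the usual preprocessing for masked-language-modeling evaluation.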
+if training_args.do_train: + column_names = raw_datasets["train"].column_names +else: + column_names = raw_datasets["validation"].column_names +text_column_name = "text" if "text" in column_names else column_names[0] + +max_seq_length = tokenizer.model_max_length + + +def tokenize_function(examples): + return tokenizer(examples[text_column_name], return_special_tokens_mask=True) + + +column_names = raw_datasets["train"].column_names +text_column_name = "text" if "text" in column_names else column_names[0] + +with training_args.main_process_first(desc="dataset map tokenization"): + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on every text in dataset", + ) + + +# Main data processing function that will concatenate all texts from our dataset and generate chunks of max_seq_length. +def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_seq_length: + total_length = (total_length // max_seq_length) * max_seq_length + # Split by chunks of max_len. + result = { + k: [t[i: i + max_seq_length] for i in range(0, total_length, max_seq_length)] + for k, t in concatenated_examples.items() + } + return result + + +# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a +# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value +# might be slower to preprocess. + +with training_args.main_process_first(desc="grouping texts together"): + tokenized_datasets = tokenized_datasets.map( + group_texts, + batched=True, + load_from_cache_file=True, + desc=f"Grouping texts in chunks of {max_seq_length}", + ) + +if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = tokenized_datasets["train"] + +if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = tokenized_datasets["validation"] + + + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(dim=-1) + + + metric = load_metric("accuracy") + + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics + labels = labels.reshape(-1) + preds = preds.reshape(-1) + mask = labels != -100 + labels = labels[mask] + preds = preds[mask] + return metric.compute(predictions=preds, references=labels) + +# Data collator will take care of randomly masking the tokens. 
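+# mlm_probability=0.15 matches the 15% masking rate used for BERT pre-training; the collator builds the masked inputs and their labels on the fly during evaluation.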
+data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm_probability=0.15, + pad_to_multiple_of=None, +) + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval and not is_torch_tpu_available() + else None, +) + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb b/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb index 4070a1a4d65..2c154dbb7bf 100644 --- a/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb +++ b/docs/tutorials/pytorch/language-modeling/bert-base-uncased.ipynb @@ -43,7 +43,7 @@ "id": "c1816be1", "metadata": {}, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -167,7 +167,7 @@ " dataset_config_name=\"wikitext-2-raw-v1\",\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./saved_results\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -367,8 +367,6 @@ " else None,\n", ")\n", "\n", - "trainer_ptq_static.save_model(\"./saved_results_ptq_static\") # quantized model\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_loss\", # Metric used for the tuning strategy.\n", " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", @@ -381,7 +379,11 @@ ")\n", "\n", "# run quantization\n", - "trainer_ptq_static.quantize(quant_config=quantization_config)" + "trainer_ptq_static.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_ptq_static.save_model(\"./saved_results_static\")\n", + "model.config.save_pretrained(\"./saved_results_static\")" ] }, { @@ -414,6 +416,26 @@ "print(\"Throughput: {} samples/sec\".format(throughput_ptq_static))" ] }, + { + "cell_type": "markdown", + "id": "5a7e93de", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a6795aa", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -431,6 +453,7 @@ "source": [ "# Initialize the Trainer\n", "set_seed(training_args.seed)\n", + "training_args.output_dir = \"./saved_results_dynamic\"\n", "trainer_ptq_dynamic = NLPTrainer(\n", " model=model,\n", " args=training_args,\n", @@ -444,8 +467,6 @@ " else None,\n", ")\n", "\n", - "trainer_ptq_dynamic.save_model(\"./saved_results_ptq_dynamic\")\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_loss\", \n", " is_relative=True,\n", @@ -458,7 +479,10 @@ ")\n", "\n", "# run quantization\n", - "trainer_ptq_dynamic.quantize(quant_config=quantization_config)" + "trainer_ptq_dynamic.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_ptq_dynamic.save_model(\"./saved_results_dynamic\")" ] }, { @@ -487,6 +511,25 @@ "print(\"Throughput: {} samples/sec\".format(throughput_ptq_dynamic))" ] }, + { + "cell_type": "markdown", + "id": "5a7e93de", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea631f92", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -527,6 +570,25 @@ "print(\"Latency: {:.3f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "5a7e93de", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "571317cf", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=bert-base-uncased --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git 
a/docs/tutorials/pytorch/multi_instance.sh b/docs/tutorials/pytorch/multi_instance.sh new file mode 100644 index 00000000000..6f1441c278f --- /dev/null +++ b/docs/tutorials/pytorch/multi_instance.sh @@ -0,0 +1,62 @@ +set -eo pipefail +set -x +PATTERN='[-a-zA-Z0-9_]*=' +for i in "$@" do + case $i in + --model=*) + model=`echo $i | sed "s/${PATTERN}//"`;; + --core_per_instance=*) + core_per_instance=`echo $i | sed "s/${PATTERN}//"`;; + --data_type=*) + data_type=`echo $i | sed "s/${PATTERN}//"`;; + *) + echo "Parameter $i not recognized."; exit 1;; + esac done +ncores_per_socket=${ncores_per_socket:=$( lscpu | grep 'Core(s) per socket' | cut -d: -f2 | xargs echo -n)} +log_name="${model}.log" +cmd="python benchmark.py --data_type=${data_type} --model_name_or_path=${model}" +echo "Executing multi instance benchmark" +echo -e ">>> Executing multi instance benchmark $core_per_instance $cmd" >>"$log_name" +for ((j = 0; $(($j + $core_per_instance)) <= $ncores_per_socket; j = $(($j + ${core_per_instance})))); do + numa_prefix="numactl -m 0 -C $j-$((j + core_per_instance - 1)) " + # Make it work on machines with no NUMA support + if [[ -n $(numactl -s | grep "No NUMA support available") ]]; then + echo "No NUMA support available" + echo "Multi-instance benchmarking requires a machine with NUMA support." + exit 1 + fi + echo "${numa_prefix}${cmd}" >>$log_name + ${numa_prefix}${cmd} | + tee -a $log_name & + benchmark_pids+=($!) +done +echo -e "<<< Executing multi instance benchmark $core_per_instance $cmd" >>"$log_name" + +status="SUCCESS" + +for pid in "${benchmark_pids[@]}"; do + exit_code=0 + wait $pid || exit_code=$? + echo "Detected exit code: ${exit_code}" + if [ ${exit_code} == 0 ]; then + echo "Process ${pid} succeeded" + else + echo "Process ${pid} failed" + status="FAILURE" + fi done +echo "Benchmark process status: ${status}" +if [ ${status} == "FAILURE" ]; then + echo "Benchmark process returned non-zero exit code."
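+  # Any failed instance aborts the run here, before the aggregated throughput and latency below are computed from the per-instance logs.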
+ exit 1 +fi +Total_Throughput=$(cat $log_name | grep -Po "Throughput:\s+(\d+(\.\d+)?)" | cut -f 2 -d ' ' | awk '{ SUM += $1} END { print SUM }') +echo "Throughput : $Total_Throughput" +Batch_size=$(cat $log_name | grep -Po "Batch\s+size\s+=\s+\d+" | tail -1) +echo $Batch_size +Accuray=$(cat $log_name | grep -Po "Finally Eval .* Accuracy.*\d+" | tail -1) +echo $Accuray +Total_Latency=$(cat $log_name | grep -Po "Latency:\s+(\d+(\.\d+)?)" | cut -f 2 -d ' ' | awk '{ SUM += $1} END { print SUM }') +echo "Latency : $Total_Latency" \ No newline at end of file diff --git a/docs/tutorials/pytorch/multiple-choice/benchmark.py b/docs/tutorials/pytorch/multiple-choice/benchmark.py new file mode 100644 index 00000000000..6fde6834d0f --- /dev/null +++ b/docs/tutorials/pytorch/multiple-choice/benchmark.py @@ -0,0 +1,145 @@ +import logging +import os +import numpy as np +from datasets import load_dataset, load_metric +from itertools import chain +from intel_extension_for_transformers import metrics, OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + MODEL_FOR_MASKED_LM_MAPPING, + AutoConfig, + AutoModelForMultipleChoice, + AutoTokenizer, + TrainingArguments, + set_seed, + default_data_collator, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "ehdwns1516/bert-base-uncased_SWAG", help = 'input model for benchmark') +args = arg_parser.parse_args() + +dataset_name="swag" +dataset_config_name="regular" + +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_eval_batch_size=8, + per_device_train_batch_size=8 +) + +raw_datasets = load_dataset(dataset_name, dataset_config_name) +config = AutoConfig.from_pretrained(args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForMultipleChoice.from_pretrained( + args.model_name_or_path, + config=config, + revision="main", + use_auth_token=None, + ) + +ending_names = [f"ending{i}" for i in range(4)] +context_name = "sent1" +question_header_name = "sent2" + +# First we tokenize all the texts. 
+max_seq_length = tokenizer.model_max_length +if max_seq_length >1024: + max_seq_length = 1024 + +# preprocessing the datasets +def preprocess_function(examples): + first_sentences = [[context] * 4 for context in examples[context_name]] + question_headers = examples[question_header_name] + second_sentences = [ + [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) + ] + + # Flatten out + first_sentences = list(chain(*first_sentences)) + second_sentences = list(chain(*second_sentences)) + + # Tokenize + tokenized_examples = tokenizer( + first_sentences, + second_sentences, + truncation=True, + max_length=max_seq_length, + padding="max_length" + ) + # Un-flatten + return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map(preprocess_function, batched=True) +if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + eval_dataset = eval_dataset.select(range(1000)) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + load_from_cache_file=False + ) + +# Data collator +data_collator = default_data_collator + +# Metric +def compute_metrics(eval_predictions): + predictions, label_ids = eval_predictions + preds = np.argmax(predictions, axis=1) + return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()} + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb b/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb index fc808148301..eebf4f3412c 100644 --- a/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb +++ b/docs/tutorials/pytorch/multiple-choice/bert-base-uncased_SWAG.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -225,7 +225,7 @@ " overwrite_cache=True\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./tmp/swag_output\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -433,8 +433,6 @@ ")\n", "\n", "# quantized model\n", - "trainer_static.save_model(\"./tmp/swag_output/saved_results_static\")\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_accuracy\", # Metric used for the tuning strategy.\n", " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", @@ -446,7 +444,11 @@ ")\n", "\n", "# run quantization\n", - "trainer_static.quantize(quant_config=quantization_config)" + "trainer_static.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_static.save_model(\"./saved_results_static\")\n", + "model.config.save_pretrained(\"./saved_results_static\")" ] }, { @@ -483,6 +485,26 @@ "print(\"Throughput: {} samples/sec\".format(throughput_static))\n" ] }, + { + "cell_type": "markdown", + "id": "363eea55", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54bd1a23", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -507,6 +529,7 @@ "outputs": [], "source": [ "set_seed(training_args.seed)\n", + "training_args.output_dir = \"saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = NLPTrainer(\n", " model=model,\n", @@ -519,8 +542,6 @@ ")\n", "\n", "# quantized model\n", - "trainer_dynamic.save_model(\"./tmp/swag_output/saved_results_dynamic\")\n", - "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_accuracy\", # Metric used for the tuning strategy.\n", " is_relative=True, # Metric tolerance mode, True is for relative, otherwise for absolute.\n", @@ -532,7 +553,10 @@ ")\n", "\n", "# run quantization\n", - "trainer_dynamic.quantize(quant_config=quantization_config)" + "trainer_dynamic.quantize(quant_config=quantization_config)\n", + "\n", + "# save quantized model\n", + "trainer_dynamic.save_model(\"./saved_results_dynamic\")" ] }, { @@ -568,6 +592,25 @@ "print(\"Throughput: {} samples/sec\".format(throughput_dynamic))" ] }, + { + "cell_type": "markdown", + "id": "14ed0b81", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33df0db9", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -612,6 +655,25 @@ "print(\"Latency: {:.3f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "6390e6a3", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "675c5036", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=ehdwns1516/bert-base-uncased_SWAG 
--core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb b/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb index b0830c5243e..db34c6f3ef6 100644 --- a/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb +++ b/docs/tutorials/pytorch/question-answering/Dynamic_MiniLM_SQuAD.ipynb @@ -37,7 +37,7 @@ "id": "c1816be1", "metadata": {}, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { diff --git a/docs/tutorials/pytorch/question-answering/benchmark.py b/docs/tutorials/pytorch/question-answering/benchmark.py new file mode 100644 index 00000000000..be1a498e02c --- /dev/null +++ b/docs/tutorials/pytorch/question-answering/benchmark.py @@ -0,0 +1,589 @@ +import logging +import os +import numpy as np +import random +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +import timeit +import collections +import json +from typing import Optional, Tuple +from tqdm.auto import tqdm +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + TrainingArguments, + set_seed, + is_torch_tpu_available, +) +from transformers.trainer_utils import PredictionOutput + +os.environ["WANDB_DISABLED"] = "true" +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "distilbert-base-uncased-distilled-squad", help = 'input model for benchmark') +args = arg_parser.parse_args() + + +if is_torch_tpu_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + +class QuestionAnsweringTrainer(NLPTrainer): + def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs): + super().__init__(*args, **kwargs) + self.eval_examples = eval_examples + self.post_process_function = post_process_function + + def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"): + eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset + eval_dataloader = self.get_eval_dataloader(eval_dataset) + eval_examples = self.eval_examples if eval_examples is None else eval_examples + + # Temporarily disable metric computation, we will do it in the loop here. 
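+        # The raw start/end logits must first be post-processed into text answers (see post_process_function) before the SQuAD metric can be computed, so metric computation is deferred until after the prediction loop.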
+ compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is not None and self.compute_metrics is not None: + eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions) + metrics = self.compute_metrics(eval_preds) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + self.log(metrics) + else: + metrics = {} + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics) + return metrics + + def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"): + predict_dataloader = self.get_test_dataloader(predict_dataset) + + # Temporarily disable metric computation, we will do it in the loop here. + compute_metrics = self.compute_metrics + self.compute_metrics = None + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + try: + output = eval_loop( + predict_dataloader, + description="Prediction", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if compute_metrics is None else None, + ignore_keys=ignore_keys, + ) + finally: + self.compute_metrics = compute_metrics + + if self.post_process_function is None or self.compute_metrics is None: + return output + + predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict") + metrics = self.compute_metrics(predictions) + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics) + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. 
Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. 
+ token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. 
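+            # Scan the score-sorted n-best list for the first prediction with non-empty text.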
+ i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + + +raw_datasets = load_dataset("squad") +# download the dataset. +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, +) +config = AutoConfig.from_pretrained(args.model_name_or_path) +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True) +log_level = training_args.get_process_log_level() +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForQuestionAnswering.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + use_auth_token=None + ) + +# Preprocessing is slighlty different for training and evaluation. +column_names = raw_datasets["train"].column_names +question_column_name = "question" if "question" in column_names else column_names[0] +context_column_name = "context" if "context" in column_names else column_names[1] +answer_column_name = "answers" if "answers" in column_names else column_names[2] + +# Padding side determines if we do (question|context) or (context|question). 
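+# BERT-style tokenizers pad on the right, so the question is placed first and only the context ("only_second") is truncated, with a stride so long contexts are split into overlapping features.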
+pad_on_right = tokenizer.padding_side == "right" + +max_seq_length = min(384, tokenizer.model_max_length) + +# Training preprocessing +def prepare_train_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=128, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[answer_column_name][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. 
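+                # The two while loops below walk the token indices inward until they exactly cover the [start_char, end_char) span of the answer.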
+ # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + prepare_train_features, + batched=True, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset" + ) + +# Validation preprocessing +def prepare_validation_features(examples): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + examples[question_column_name if pad_on_right else context_column_name], + examples[context_column_name if pad_on_right else question_column_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=128, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
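+        # Offsets are kept only where sequence_ids marks the context side; question tokens are mapped to None so postprocessing can skip them.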
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + +if training_args.do_eval: + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_examples = raw_datasets["validation"] + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + remove_columns=column_names, + load_from_cache_file=True, + desc="Running tokenizer on validation dataset", + ) + max_eval_samples = min(len(eval_dataset), 5000) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + +# Data collator +data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) + +# Post-processing: +def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. + predictions = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=False, + n_best_size=20, + max_answer_length=30, + null_score_diff_threshold=0.0, + output_dir=training_args.output_dir, + log_level=log_level, + prefix=stage, + ) + # Format the result to the format the metric expects. + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + +metric = load_metric("squad") + +def compute_metrics(p: EvalPrediction): + return metric.compute(predictions=p.predictions, references=p.label_ids) + + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = QuestionAnsweringTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + eval_examples=eval_examples if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + post_process_function=post_processing_function, + compute_metrics=compute_metrics, +) + + +set_seed(training_args.seed) +start_time = timeit.default_timer() +results = trainer.evaluate() +evalTime = timeit.default_timer() - start_time +max_eval_samples = 5000 +samples = min(max_eval_samples, len(eval_dataset)) + +eval_f1_static = results.get("eval_f1") +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_f1 Accuracy: {}".format(eval_f1_static)) +print("Latency: {:.3f} ms".format(evalTime / samples * 1000)) +print("Throughput: {} samples/sec".format(samples/evalTime)) \ No newline at end of file diff --git a/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb b/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb index aa4310d2ae6..f0253a9f642 100644 --- a/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb +++ b/docs/tutorials/pytorch/question-answering/bert-base-uncased_distilled-squad.ipynb @@ -69,7 +69,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. 
" + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { @@ -636,7 +636,7 @@ " max_eval_samples=5000\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./tmp/squad_output\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -991,8 +991,8 @@ " compute_metrics=compute_metrics,\n", ")\n", "\n", - "trainer_static.save_model(training_args.output_dir+'/saved_results_static')\n", - "model.config.save_pretrained(training_args.output_dir+'/saved_results_static')\n", + "trainer_static.save_model('./saved_results_static')\n", + "model.config.save_pretrained('./saved_results_static')\n", "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_f1\", # Metric used for the tuning strategy.\n", @@ -1047,6 +1047,26 @@ "print(\"Throughput: {} samples/sec\".format(samples/evalTime))" ] }, + { + "cell_type": "markdown", + "id": "0a0ec3a8", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2754e847", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -1071,6 +1091,7 @@ "outputs": [], "source": [ "set_seed(training_args.seed)\n", + "training_args.output_dir = \"./saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = QuestionAnsweringTrainer(\n", " model=model,\n", @@ -1084,8 +1105,8 @@ " compute_metrics=compute_metrics,\n", ")\n", "\n", - "trainer_dynamic.save_model(training_args.output_dir+'/saved_results_dynamic')\n", - "model.config.save_pretrained(training_args.output_dir+'/saved_results_dynamic')\n", + "trainer_dynamic.save_model('./saved_results_dynamic')\n", + "model.config.save_pretrained('./saved_results_dynamic')\n", "\n", "tune_metric = metrics.Metric(\n", " name=\"eval_f1\", # Metric used for the tuning strategy.\n", @@ -1139,6 +1160,25 @@ "print(\"Throughput: {} samples/sec\".format(samples/evalTime))" ] }, + { + "cell_type": "markdown", + "id": "4c8e1c6f", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "101d0b51", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1189,6 +1229,25 @@ "print(\"Latency: {:.3f} ms\".format(evalTime / samples * 1000))\n", "print(\"Throughput: {} samples/sec\".format(samples/evalTime))" ] + }, + { + "cell_type": "markdown", + "id": "17fb0e7f", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9710d14", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=distilbert-base-uncased-distilled-squad --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/question-answering/distillation.ipynb b/docs/tutorials/pytorch/question-answering/distillation.ipynb new file mode 
100644 index 00000000000..4e905c1c3c5 --- /dev/null +++ b/docs/tutorials/pytorch/question-answering/distillation.ipynb @@ -0,0 +1,773 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the distillation approach based on [IntelĀ® Neural Compressor](https://github.com/intel/neural-compressor) for question-answering." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisite" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install model dependency\n", + "! pip install accelerate datasets >= 1.1.3 sentencepiece != 0.1.92 protobuf torch >= 1.10 transformers >= 4.12.0 wandb\n", + "! pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "import sys\n", + "from dataclasses import dataclass, field\n", + "from typing import Optional\n", + "\n", + "import datasets\n", + "from datasets import load_dataset, load_metric\n", + "\n", + "import functools\n", + "import numpy as np\n", + "import time\n", + "import torch\n", + "import transformers\n", + "from intel_extension_for_transformers import metrics, OptimizedModel, DistillationConfig\n", + "from torch.utils.data import DataLoader\n", + "from tqdm import tqdm\n", + "from trainer_qa import QuestionAnsweringTrainer\n", + "from transformers import (\n", + " AutoConfig,\n", + " AutoModelForQuestionAnswering,\n", + " AutoTokenizer,\n", + " DataCollatorWithPadding,\n", + " EvalPrediction,\n", + " HfArgumentParser,\n", + " PreTrainedTokenizerFast,\n", + " TrainingArguments,\n", + " default_data_collator,\n", + " set_seed,\n", + ")\n", + "from transformers.trainer_utils import get_last_checkpoint\n", + "from transformers.utils import check_min_version\n", + "from transformers.utils.versions import require_version\n", + "from typing import Optional\n", + "from utils_qa import postprocess_qa_predictions\n", + "\n", + "\n", + "# Will error if the minimal version of Transformers is not installed. 
Remove at your own risks.\n", + "check_min_version(\"4.12.0\")\n", + "\n", + "require_version(\"datasets>=1.8.0\", \"To fix: pip install -r examples/pytorch/question-answering/requirements.txt\")\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "os.environ[\"WANDB_DISABLED\"] = \"true\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ========== Define arguments =========\n", + "@dataclass\n", + "class ModelArguments:\n", + " \"\"\"\n", + " Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.\n", + " \"\"\"\n", + " model_name_or_path: str = field(\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"}\n", + " )\n", + "\n", + "\n", + "@dataclass\n", + "class DataTrainingArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what data we are going to input our model for training and eval.\n", + " \"\"\"\n", + " dataset_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"The name of the dataset to use (via the datasets library).\"}\n", + " )\n", + " max_seq_length: int = field(\n", + " default=384,\n", + " metadata={\n", + " \"help\": \"The maximum total input sequence length after tokenization. Sequences longer \"\n", + " \"than this will be truncated, sequences shorter will be padded.\"\n", + " },\n", + " )\n", + " max_train_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of training examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " max_eval_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of evaluation examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " overwrite_cache: bool = field(\n", + " default=False, metadata={\"help\": \"Overwrite the cached training and evaluation sets\"}\n", + " )\n", + " doc_stride: int = field(\n", + " default=128,\n", + " metadata={\"help\": \"When splitting up a long document into chunks, how much stride to take between chunks.\"},\n", + " )\n", + " pad_to_max_length: bool = field(\n", + " default=True,\n", + " metadata={\n", + " \"help\": \"Whether to pad all samples to `max_seq_length`. \"\n", + " \"If False, will pad the samples dynamically when batching to the maximum length in the batch (which can \"\n", + " \"be faster on GPU but will be slower on TPU).\"\n", + " },\n", + " )\n", + " version_2_with_negative: bool = field(\n", + " default=False, metadata={\"help\": \"If true, some of the examples do not have an answer.\"}\n", + " )\n", + " null_score_diff_threshold: float = field(\n", + " default=0.0,\n", + " metadata={\n", + " \"help\": \"The threshold used to select the null answer: if the best answer has a score that is less than \"\n", + " \"the score of the null answer minus this threshold, the null answer is selected for this example. 
\"\n", + " \"Only useful when `version_2_with_negative=True`.\"\n", + " },\n", + " )\n", + " n_best_size: int = field(\n", + " default=20,\n", + " metadata={\"help\": \"The total number of n-best predictions to generate when looking for an answer.\"},\n", + " )\n", + " max_answer_length: int = field(\n", + " default=30,\n", + " metadata={\n", + " \"help\": \"The maximum length of an answer that can be generated. This is needed because the start \"\n", + " \"and end predictions are not conditioned on one another.\"\n", + " },\n", + " )\n", + "\n", + "@dataclass\n", + "class OptimizationArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what type of optimization we are going to apply on the model.\n", + " \"\"\"\n", + "\n", + " distillation: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"Whether or not to apply distillation.\"},\n", + " )\n", + " teacher_model_name_or_path: str = field(\n", + " default=False,\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"},\n", + " )\n", + " run_teacher_logits: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"Whether or not to obtain teacher model's logits on train dataset before training.\"},\n", + " )\n", + " metric_name: Optional[str] = field(\n", + " default=\"eval_f1\",\n", + " metadata={\"help\": \"Metric used for the tuning strategy.\"},\n", + " )\n", + " tolerance_mode: Optional[str] = field(\n", + " default=\"absolute\",\n", + " metadata={\"help\": \"Metric tolerance model, expected to be relative or absolute.\"},\n", + " )\n", + " perf_tol: Optional[float] = field(\n", + " default=0.02,\n", + " metadata={\"help\": \"Performance tolerance when optimizing the model.\"},\n", + " )\n", + " benchmark: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"run benchmark.\"}\n", + " )\n", + " accuracy_only: bool = field(\n", + " default=False,\n", + " metadata={\"help\":\"Whether to only test accuracy for model tuned by Neural Compressor.\"}\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We now keep distinct sets of args, for a cleaner separation of concerns.\n", + "parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, OptimizationArguments))\n", + "if len(sys.argv) == 2 and sys.argv[1].endswith(\".json\"):\n", + " # If we pass only one argument to the script and it's the path to a json file,\n", + " # let's parse it to get our arguments.\n", + " model_args, data_args, training_args, optim_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))\n", + "else:\n", + " model_args, data_args, training_args, optim_args = parser.parse_args_into_dataclasses()\n", + "\n", + "# Setup logging\n", + "logging.basicConfig(\n", + " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", + " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", + " handlers=[logging.StreamHandler(sys.stdout)],\n", + ")\n", + "\n", + "log_level = training_args.get_process_log_level()\n", + "logger.setLevel(log_level)\n", + "datasets.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.enable_default_handler()\n", + "transformers.utils.logging.enable_explicit_format()\n", + "\n", + "# Log on each process the small summary:\n", + "logger.warning(\n", + " f\"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}\"\n", + " + 
f\"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}\"\n", + ")\n", + "logger.info(f\"Training/evaluation parameters {training_args}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download dataset from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "raw_datasets = load_dataset(\n", + " data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download fp32 model from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set seed before initializing model.\n", + "set_seed(training_args.seed)\n", + "\n", + "# get fp32 model\n", + "config = AutoConfig.from_pretrained(model_args.model_name_or_path)\n", + "tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, use_fast=True)\n", + "model = AutoModelForQuestionAnswering.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n", + " config=config,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing the datasets.\n", + "# Preprocessing is slighlty different for training and evaluation.\n", + "column_names = raw_datasets[\"train\"].column_names\n", + "question_column_name = \"question\" if \"question\" in column_names else column_names[0]\n", + "context_column_name = \"context\" if \"context\" in column_names else column_names[1]\n", + "answer_column_name = \"answers\" if \"answers\" in column_names else column_names[2]\n", + "\n", + "# Padding side determines if we do (question|context) or (context|question).\n", + "pad_on_right = tokenizer.padding_side == \"right\"\n", + "\n", + "max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)\n", + "\n", + "# Training preprocessing\n", + "def prepare_train_features(examples, tokenizer=tokenizer):\n", + " # Some of the questions have lots of whitespace on the left, which is not useful and will make the\n", + " # truncation of the context fail (the tokenized question will take a lots of space). So we remove that\n", + " # left whitespace\n", + " examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]\n", + "\n", + " # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. 
This results\n", + " # in one example possible giving several features when a context is long, each of those features having a\n", + " # context that overlaps a bit the context of the previous feature.\n", + " tokenized_examples = tokenizer(\n", + " examples[question_column_name if pad_on_right else context_column_name],\n", + " examples[context_column_name if pad_on_right else question_column_name],\n", + " truncation=\"only_second\" if pad_on_right else \"only_first\",\n", + " max_length=max_seq_length,\n", + " stride=data_args.doc_stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\" if data_args.pad_to_max_length else False,\n", + " )\n", + "\n", + " # Since one example might give us several features if it has a long context, we need a map from a feature to\n", + " # its corresponding example. This key gives us just that.\n", + " sample_mapping = tokenized_examples.pop(\"overflow_to_sample_mapping\")\n", + " # The offset mappings will give us a map from token to character position in the original context. This will\n", + " # help us compute the start_positions and end_positions.\n", + " offset_mapping = tokenized_examples.pop(\"offset_mapping\")\n", + "\n", + " # Let's label those examples!\n", + " tokenized_examples[\"start_positions\"] = []\n", + " tokenized_examples[\"end_positions\"] = []\n", + "\n", + " for i, offsets in enumerate(offset_mapping):\n", + " # We will label impossible answers with the index of the CLS token.\n", + " input_ids = tokenized_examples[\"input_ids\"][i]\n", + " cls_index = input_ids.index(tokenizer.cls_token_id)\n", + "\n", + " # Grab the sequence corresponding to that example (to know what is the context and what is the question).\n", + " sequence_ids = tokenized_examples.sequence_ids(i)\n", + "\n", + " # One example can give several spans, this is the index of the example containing this span of text.\n", + " sample_index = sample_mapping[i]\n", + " answers = examples[answer_column_name][sample_index]\n", + " # If no answers are given, set the cls_index as answer.\n", + " if len(answers[\"answer_start\"]) == 0:\n", + " tokenized_examples[\"start_positions\"].append(cls_index)\n", + " tokenized_examples[\"end_positions\"].append(cls_index)\n", + " else:\n", + " # Start/end character index of the answer in the text.\n", + " start_char = answers[\"answer_start\"][0]\n", + " end_char = start_char + len(answers[\"text\"][0])\n", + "\n", + " # Start token index of the current span in the text.\n", + " token_start_index = 0\n", + " while sequence_ids[token_start_index] != (1 if pad_on_right else 0):\n", + " token_start_index += 1\n", + "\n", + " # End token index of the current span in the text.\n", + " token_end_index = len(input_ids) - 1\n", + " while sequence_ids[token_end_index] != (1 if pad_on_right else 0):\n", + " token_end_index -= 1\n", + "\n", + " # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).\n", + " if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):\n", + " tokenized_examples[\"start_positions\"].append(cls_index)\n", + " tokenized_examples[\"end_positions\"].append(cls_index)\n", + " else:\n", + " # Otherwise move the token_start_index and token_end_index to the two ends of the answer.\n", + " # Note: we could go after the last offset if the answer is the last word (edge case).\n", + " while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:\n", + " 
token_start_index += 1\n", + " tokenized_examples[\"start_positions\"].append(token_start_index - 1)\n", + " while offsets[token_end_index][1] >= end_char:\n", + " token_end_index -= 1\n", + " tokenized_examples[\"end_positions\"].append(token_end_index + 1)\n", + "\n", + " return tokenized_examples\n", + "\n", + "if training_args.do_train:\n", + " if \"train\" not in raw_datasets:\n", + " raise ValueError(\"--do_train requires a train dataset\")\n", + " train_dataset = raw_datasets[\"train\"]\n", + " with training_args.main_process_first(desc=\"train dataset map pre-processing\"):\n", + " train_dataset = train_dataset.map(\n", + " prepare_train_features,\n", + " batched=True,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on train dataset\"\n", + " )\n", + " if data_args.max_train_samples is not None:\n", + " # Number of samples might increase during Feature Creation, We select only specified max samples\n", + " max_train_samples = min(len(train_dataset), data_args.max_train_samples)\n", + " train_dataset = train_dataset.select(range(max_train_samples))\n", + "\n", + "# Validation preprocessing\n", + "def prepare_validation_features(examples):\n", + " # Some of the questions have lots of whitespace on the left, which is not useful and will make the\n", + " # truncation of the context fail (the tokenized question will take a lots of space). So we remove that\n", + " # left whitespace\n", + " examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]\n", + "\n", + " # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results\n", + " # in one example possible giving several features when a context is long, each of those features having a\n", + " # context that overlaps a bit the context of the previous feature.\n", + " tokenized_examples = tokenizer(\n", + " examples[question_column_name if pad_on_right else context_column_name],\n", + " examples[context_column_name if pad_on_right else question_column_name],\n", + " truncation=\"only_second\" if pad_on_right else \"only_first\",\n", + " max_length=max_seq_length,\n", + " stride=data_args.doc_stride,\n", + " return_overflowing_tokens=True,\n", + " return_offsets_mapping=True,\n", + " padding=\"max_length\" if data_args.pad_to_max_length else False,\n", + " )\n", + "\n", + " # Since one example might give us several features if it has a long context, we need a map from a feature to\n", + " # its corresponding example. 
This key gives us just that.\n", + " sample_mapping = tokenized_examples.pop(\"overflow_to_sample_mapping\")\n", + "\n", + " # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the\n", + " # corresponding example_id and we will store the offset mappings.\n", + " tokenized_examples[\"example_id\"] = []\n", + "\n", + " for i in range(len(tokenized_examples[\"input_ids\"])):\n", + " # Grab the sequence corresponding to that example (to know what is the context and what is the question).\n", + " sequence_ids = tokenized_examples.sequence_ids(i)\n", + " context_index = 1 if pad_on_right else 0\n", + "\n", + " # One example can give several spans, this is the index of the example containing this span of text.\n", + " sample_index = sample_mapping[i]\n", + " tokenized_examples[\"example_id\"].append(examples[\"id\"][sample_index])\n", + "\n", + " # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token\n", + " # position is part of the context or not.\n", + " tokenized_examples[\"offset_mapping\"][i] = [\n", + " (o if sequence_ids[k] == context_index else None)\n", + " for k, o in enumerate(tokenized_examples[\"offset_mapping\"][i])\n", + " ]\n", + "\n", + " return tokenized_examples\n", + "\n", + "if training_args.do_eval:\n", + " if \"validation\" not in raw_datasets:\n", + " raise ValueError(\"--do_eval requires a validation dataset\")\n", + " eval_examples = raw_datasets[\"validation\"]\n", + " if data_args.max_eval_samples is not None:\n", + " # We will select sample from whole data\n", + " eval_examples = eval_examples.select(range(data_args.max_eval_samples))\n", + " # Validation Feature Creation\n", + " with training_args.main_process_first(desc=\"validation dataset map pre-processing\"):\n", + " eval_dataset = eval_examples.map(\n", + " prepare_validation_features,\n", + " batched=True,\n", + " num_proc=data_args.preprocessing_num_workers,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on validation dataset\",\n", + " )\n", + " if data_args.max_eval_samples is not None:\n", + " # During Feature creation dataset samples might increase, we will select required samples again\n", + " eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))\n", + "\n", + "# Data collator\n", + "data_collator = (\n", + " DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)\n", + ")\n", + "\n", + "def post_processing_function(examples, features, predictions, stage=\"eval\"):\n", + " # Post-processing: we match the start logits and end logits to answers in the original context.\n", + " predictions = postprocess_qa_predictions(\n", + " examples=examples,\n", + " features=features,\n", + " predictions=predictions,\n", + " version_2_with_negative=data_args.version_2_with_negative,\n", + " n_best_size=data_args.n_best_size,\n", + " max_answer_length=data_args.max_answer_length,\n", + " null_score_diff_threshold=data_args.null_score_diff_threshold,\n", + " output_dir=training_args.output_dir,\n", + " log_level=log_level,\n", + " prefix=stage,\n", + " )\n", + " # Format the result to the format the metric expects.\n", + " if data_args.version_2_with_negative:\n", + " formatted_predictions = [\n", + " {\"id\": k, \"prediction_text\": v, \"no_answer_probability\": 0.0} for k, v in predictions.items()\n", + " ]\n", + " else:\n", + " formatted_predictions = [{\"id\": k, \"prediction_text\": v} for k, 
v in predictions.items()]\n", + "\n", + " references = [{\"id\": ex[\"id\"], \"answers\": ex[answer_column_name]} for ex in examples]\n", + " return EvalPrediction(predictions=formatted_predictions, label_ids=references)\n", + "\n", + "metric = load_metric(\"squad_v2\" if data_args.version_2_with_negative else \"squad\")\n", + "\n", + "def compute_metrics(p: EvalPrediction):\n", + " return metric.compute(predictions=p.predictions, references=p.label_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distillation & Benchmark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class QAModel_output_reshaped(torch.nn.Module):\n", + " def __init__(self, model):\n", + " super(QAModel_output_reshaped, self).__init__()\n", + " self.model = model\n", + "\n", + " def forward(self, *args, **kwargs):\n", + " outputs = self.model(*args, **kwargs)\n", + " outputs_reshaped = torch.vstack([torch.vstack([sx, ex]) \\\n", + " for sx, ex in zip(outputs['start_logits'], outputs['end_logits'])])\n", + " return outputs_reshaped\n", + "\n", + "teacher_config = AutoConfig.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")\n", + "teacher_tokenizer = AutoTokenizer.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " use_fast=True,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")\n", + "teacher_model = AutoModelForQuestionAnswering.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n", + " config=teacher_config,\n", + " use_auth_token=True if model_args.use_auth_token else None,\n", + ")\n", + "teacher_model.to(training_args.device)\n", + "\n", + "# Prepare datasets for teacher model\n", + "# Create train feature from dataset\n", + "with training_args.main_process_first(desc=\"train dataset map pre-processing\"):\n", + " teacher_train_dataset = train_examples.map(\n", + " functools.partial(prepare_train_features, tokenizer=teacher_tokenizer),\n", + " batched=True,\n", + " num_proc=data_args.preprocessing_num_workers,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on train dataset\",\n", + " )\n", + "if data_args.max_train_samples is not None:\n", + " # Number of samples might increase during Feature Creation, We select only specified max samples\n", + " teacher_train_dataset = teacher_train_dataset.select(range(data_args.max_train_samples))\n", + "\n", + "# Validation Feature Creation\n", + "with training_args.main_process_first(desc=\"validation dataset map pre-processing\"):\n", + " teacher_eval_dataset = eval_examples.map(\n", + " functools.partial(prepare_validation_features, tokenizer=teacher_tokenizer),\n", + " batched=True,\n", + " num_proc=data_args.preprocessing_num_workers,\n", + " remove_columns=column_names,\n", + " load_from_cache_file=not data_args.overwrite_cache,\n", + " desc=\"Running tokenizer on validation dataset\",\n", + " )\n", + "if data_args.max_eval_samples is not None:\n", + " # During Feature creation dataset samples might increase, we will select required samples again\n", + " teacher_eval_dataset = teacher_eval_dataset.select(range(data_args.max_eval_samples))\n", + " \n", + "# get logits of teacher 
model\n", + "if optim_args.run_teacher_logits:\n", + " def dict_tensor_to_model_device(batch, model):\n", + " device = next(model.parameters()).device\n", + " for k in batch:\n", + " batch[k] = batch[k].to(device)\n", + "\n", + " def get_logits(teacher_model, train_dataset, teacher_train_dataset):\n", + " logger.info(\"***** Getting logits of teacher model *****\")\n", + " logger.info(f\" Num examples = {len(train_dataset) }\")\n", + " teacher_model.eval()\n", + " npy_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),\n", + " '{}.{}.npy'.format(data_args.dataset_name, \n", + " optim_args.teacher_model_name_or_path.replace('/', '.')))\n", + " if os.path.exists(npy_file):\n", + " teacher_logits = [list(x) for x in np.load(npy_file, allow_pickle=True)]\n", + " else:\n", + " sampler = None\n", + " if training_args.world_size > 1:\n", + " from transformers.trainer_pt_utils import ShardSampler\n", + " sampler = ShardSampler(\n", + " teacher_train_dataset,\n", + " batch_size=training_args.per_device_eval_batch_size,\n", + " num_processes=training_args.world_size,\n", + " process_index=training_args.process_index,\n", + " )\n", + " teacher_model = torch.nn.parallel.DistributedDataParallel(\n", + " teacher_model,\n", + " device_ids=[training_args.local_rank] \\\n", + " if training_args._n_gpu != 0 else None,\n", + " output_device=training_args.local_rank \\\n", + " if training_args._n_gpu != 0 else None,\n", + " )\n", + " train_dataloader = DataLoader(teacher_train_dataset, \n", + " collate_fn=data_collator, \n", + " sampler=sampler,\n", + " batch_size=training_args.per_device_eval_batch_size)\n", + " train_dataloader = tqdm(train_dataloader, desc=\"Evaluating\")\n", + " teacher_logits = []\n", + " for step, batch in enumerate(train_dataloader):\n", + " dict_tensor_to_model_device(batch, teacher_model)\n", + " outputs = teacher_model(**batch).cpu().numpy()\n", + " if training_args.world_size > 1:\n", + " outputs_list = [None for i in range(training_args.world_size)]\n", + " torch.distributed.all_gather_object(outputs_list, outputs)\n", + " outputs = np.concatenate(outputs_list, axis=0)\n", + " teacher_logits += [[s,e] for s,e in zip(outputs[0::2], outputs[1::2])]\n", + " if training_args.world_size > 1:\n", + " teacher_logits = teacher_logits[:len(teacher_train_dataset)]\n", + " if training_args.local_rank in [-1, 0]:\n", + " np.save(npy_file, teacher_logits, allow_pickle=True)\n", + " return train_dataset.add_column('teacher_logits', teacher_logits[:data_args.max_train_samples])\n", + " with torch.no_grad():\n", + " train_dataset = get_logits(QAModel_output_reshaped(teacher_model), train_dataset, teacher_train_dataset)\n", + " \n", + "para_counter = lambda model:sum(p.numel() for p in model.parameters())\n", + "logger.info(\"***** Number of teacher model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(teacher_model)/10**6))\n", + "logger.info(\"***** Number of student model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(model)/10**6))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "set_seed(training_args.seed)\n", + "# Initialize our Trainer\n", + "trainer = QuestionAnsweringTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset if training_args.do_train else None,\n", + " eval_dataset=eval_dataset if training_args.do_eval else None,\n", + " eval_examples=eval_examples if training_args.do_eval else None,\n", + " tokenizer=tokenizer,\n", + " 
data_collator=data_collator,\n", + " post_process_function=post_processing_function,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "tune_metric = metrics.Metric(name=optim_args.metric_name)\n", + "distillation_conf = DistillationConfig(metrics=tune_metric)\n", + "model = trainer.distill(\n", + " distillation_config=distillation_conf, teacher_model=teacher_model\n", + ")\n", + "trainer.save_model(training_args.output_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Benchmark after Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import timeit\n", + "\n", + "# Load the model obtained after distillation with Intel Neural Compressor (INC)\n", + "model = OptimizedModel.from_pretrained(\n", + " training_args.output_dir,\n", + ")\n", + "model.eval()\n", + "trainer.model = model\n", + "start_time = timeit.default_timer()\n", + "results = trainer.evaluate()\n", + "evalTime = timeit.default_timer() - start_time\n", + "max_eval_samples = data_args.max_eval_samples \\\n", + " if data_args.max_eval_samples is not None else len(eval_dataset)\n", + "eval_samples = min(max_eval_samples, len(eval_dataset))\n", + "samples = eval_samples - (eval_samples % training_args.per_device_eval_batch_size) \\\n", + " if training_args.dataloader_drop_last else eval_samples\n", + "logger.info(\"metrics keys: {}\".format(results.keys()))\n", + "bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',\n", + " 'eval_pearson', 'eval_mcc', 'eval_spearmanr']\n", + "ret = False\n", + "for key in bert_task_acc_keys:\n", + " if key in results.keys():\n", + " ret = True\n", + " print('Batch size = ', training_args.per_device_eval_batch_size)\n", + " print(\"Finally Eval {} Accuracy: {}\".format(key, results[key]))\n", + " print(\"Latency: {:.5f} ms\".format(evalTime / samples * 1000))\n", + " print(\"Throughput: {:.5f} samples/sec\".format(samples/evalTime))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a3ed54c68abdb79eabea0140062ffa976ea4d8132b937aa83ca919a8d862edf2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/pytorch/summarization/benchmark.py b/docs/tutorials/pytorch/summarization/benchmark.py new file mode 100644 index 00000000000..e070db9c719 --- /dev/null +++ b/docs/tutorials/pytorch/summarization/benchmark.py @@ -0,0 +1,239 @@ +import logging +import os +import numpy as np +import nltk +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import metrics, OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPSeq2SeqTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + Seq2SeqTrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "lvwerra/pegasus-samsum", help = 'input model for benchmark') +args = arg_parser.parse_args() + +dataset_name="samsum" +summarization_name_mapping = { + "amazon_reviews_multi": ("review_body", "review_title"), + "big_patent": 
("description", "abstract"), + "cnn_dailymail": ("article", "highlights"), + "orange_sum": ("text", "summary"), + "pn_summary": ("article", "summary"), + "psc": ("extract_text", "summary_text"), + "samsum": ("dialogue", "summary"), + "thaisum": ("body", "summary"), + "xglue": ("news_body", "news_title"), + "xsum": ("document", "summary"), + "wiki_summary": ("article", "highlights"), +} +training_args = Seq2SeqTrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + predict_with_generate=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, +) + +raw_datasets = load_dataset(dataset_name) +config = AutoConfig.from_pretrained(args.model_name_or_path, revision="main") +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True, revision="main") + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForSeq2SeqLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) + model.resize_token_embeddings(len(tokenizer)) + +if model.config.decoder_start_token_id is None: + raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") + +if ( + hasattr(model.config, "max_position_embeddings") + and model.config.max_position_embeddings < 1024 +): + model.resize_position_embeddings(1024) + +prefix = "" +# preprocessing dataset + +# Preprocessing the datasets. +# We need to tokenize inputs and targets. +if training_args.do_train: + column_names = raw_datasets["train"].column_names +elif training_args.do_eval: + column_names = raw_datasets["validation"].column_names +elif training_args.do_predict: + column_names = raw_datasets["test"].column_names +else: + logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") + + +# Get the column names for input/target. +dataset_columns = summarization_name_mapping.get(dataset_name, None) +text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] +summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + +# Temporarily set max_target_length for training. +max_target_length = 128 +padding = False + +def preprocess_function(examples): + # remove pairs where at least one record is None + + inputs, targets = [], [] + for i in range(len(examples[text_column])): + if examples[text_column][i] is not None and examples[summary_column][i] is not None: + inputs.append(examples[text_column][i]) + targets.append(examples[summary_column][i]) + + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, max_length=1024, padding=padding, truncation=True) + + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if padding == "max_length": + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + max_train_samples = min(len(train_dataset), 10000) + train_dataset = train_dataset.select(range(max_train_samples)) + with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on train dataset", + ) + +if training_args.do_eval: + max_target_length = 128 + if "validation" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + max_eval_samples = min(len(eval_dataset), 500) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on validation dataset", + ) + +# Data collator +label_pad_token_id = -100 +data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, +) + +# Metric +metric = load_metric("rouge") + +def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [label.strip() for label in labels] + + # rougeLSum expects newline after each sentence + preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] + labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] + + return preds, labels + +def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + # Replace -100 in the labels as we can't decode them. 
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + # Extract a few results from ROUGE + result = {key: value.mid.fmeasure * 100 for key, value in result.items()} + + prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] + result["gen_len"] = np.mean(prediction_lens) + result = {k: round(v, 4) for k, v in result.items()} + return result + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPSeq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, +) +max_length = ( + training_args.generation_max_length + if training_args.generation_max_length is not None + else 128 +) +num_beams = training_args.generation_num_beams +trainer.max_length = max_length +trainer.num_beams = num_beams + +results = trainer.evaluate(max_length=max_length, num_beams=num_beams) +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb b/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb index 101186af5fd..013559fe459 100644 --- a/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb +++ b/docs/tutorials/pytorch/summarization/pegasus-samsum.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -304,7 +304,7 @@ " max_eval_samples=500\n", ")\n", "training_args = Seq2SeqTrainingArguments(\n", - " output_dir=\"/tmp/tst-summarization\",\n", + " output_dir=\"./saved_results_dynamic\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -625,8 +625,8 @@ "metric_name = \"eval_rougeLsum\"\n", "\n", "# tuning\n", - "model.config.save_pretrained(\"/tmp/tst-summarization/saved_pretrained_static\")\n", - "trainer.save_model(\"/tmp/tst-summarization/saved_model_static\")\n", + "model.config.save_pretrained(\"./saved_results_dynamic\")\n", + "trainer.save_model(\"./saved_results_dynamic\")\n", "\n", "tune_metric = nlp_metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", @@ -674,6 +674,25 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "f35818b8", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bac69198", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -716,6 +735,25 @@ "print(\"Latency: {:.5f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {:.5f} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "05df025f", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71e4e7da", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=textattack/bert-base-uncased-MRPC --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/text-classification/benchmark.py b/docs/tutorials/pytorch/text-classification/benchmark.py new file mode 100644 index 00000000000..eac4d67ec7d --- /dev/null +++ b/docs/tutorials/pytorch/text-classification/benchmark.py @@ -0,0 +1,177 @@ +import logging +import os +import numpy as np +import random +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + EvalPrediction, + PretrainedConfig, + TrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "textattack/bert-base-uncased-MRPC", help = 'input model for benchmark') +args = arg_parser.parse_args() + +# download the dataset. 
+raw_datasets = load_dataset("glue", "mrpc") +# Labels +label_list = raw_datasets["train"].features["label"].names +num_labels = len(label_list) + +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, +) +config = AutoConfig.from_pretrained( + args.model_name_or_path, + num_labels=num_labels, + finetuning_task="mrpc", + revision="main" +) +tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, + use_fast=True, + revision="main" +) + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + model = AutoModelForSequenceClassification.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) + +# Preprocessing the raw_datasets +sentence1_key, sentence2_key = task_to_keys["mrpc"] +# Padding strategy +padding = False +# Some models have set the order of the labels to use, so let's make sure we do use it. +label_to_id = None +if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: + # Some have all caps in their config, some don't. + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + f"Your model seems to have been trained with labels, but they don't match the dataset: " + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}.\n" + f"Ignoring the model labels as a result." 
+ ) +if label_to_id is not None: + model.config.label2id = label_to_id + model.config.id2label = {id: label for label, id in config.label2id.items()} +max_seq_length = min(128, tokenizer.model_max_length) + +def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + +with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_function, batched=True, load_from_cache_file=False + ) + +if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + +if training_args.do_eval: + if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation"] + +# Log a few random samples from the training set: +if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + +# Get the metric function +metric = load_metric("glue", "mrpc") + +metric_name = "eval_accuracy" + +# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a +# predictions and label_ids field) and has to return a dictionary string to float. +def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.argmax(preds, axis=1) + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result + +# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 
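+# (Since `padding` is False above, samples are not padded during preprocessing; leaving the
+# collator as None lets the Trainer fall back to DataCollatorWithPadding, which pads each
+# batch dynamically because a tokenizer is passed to it below.)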
+data_collator = None + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, +) + + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb b/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb index 15fda15cf64..926ed3e63e9 100644 --- a/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb +++ b/docs/tutorials/pytorch/text-classification/bert-base-uncased-MRPC.ipynb @@ -36,6 +36,14 @@ "# Prerequisite" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2172da4c", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "86b20e2b", @@ -57,7 +65,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { @@ -279,7 +287,7 @@ " overwrite_cache=True\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"./saved_result\",\n", + " output_dir=\"./saved_result_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -517,8 +525,8 @@ "if not training_args.do_eval:\n", " raise ValueError(\"do_eval must be set to True for quantization.\")\n", "\n", - "model.config.save_pretrained(\"./saved_result/saved_pretrained_static\")\n", - "trainer_static.save_model(\"./saved_result/saved_model_static\")\n", + "model.config.save_pretrained(\"./saved_results_static\")\n", + "trainer_static.save_model(\"./saved_results_static\")\n", "\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", @@ -566,6 +574,26 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "bc69524d", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "860e0503", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -591,6 +619,7 @@ "source": [ "# Set seed before initializing model.\n", "set_seed(training_args.seed)\n", + "training_args.output_dir = \"saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = NLPTrainer(\n", " model=model,\n", @@ -606,8 +635,8 @@ "if not training_args.do_eval:\n", " raise ValueError(\"do_eval must be set to True for quantization.\")\n", "\n", - 
"model.config.save_pretrained(\"./saved_result/saved_pretrained_dynamic\")\n", - "trainer_dynamic.save_model(\"./saved_result/saved_model_dynamic\")\n", + "model.config.save_pretrained(\"./saved_results_dynamic\")\n", + "trainer_dynamic.save_model(\"./saved_results_dynamic\")\n", "\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", @@ -654,6 +683,25 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "5b98f5dc", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36575ce4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -697,6 +745,25 @@ "print(\"Latency: {:.5f} ms\".format(1000 / throughput))\n", "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] + }, + { + "cell_type": "markdown", + "id": "7f7c85bc", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36dec093", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=lvwerra/pegasus-samsum --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/text-classification/distillation.ipynb b/docs/tutorials/pytorch/text-classification/distillation.ipynb new file mode 100644 index 00000000000..0f9d70a7744 --- /dev/null +++ b/docs/tutorials/pytorch/text-classification/distillation.ipynb @@ -0,0 +1,665 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial demonstrates how to use the distillation approach based on [IntelĀ® Neural Compressor](https://github.com/intel/neural-compressor) for text-classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prerequisite" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install model dependency\n", + "! pip install accelerate datasets >= 1.1.3 sentencepiece != 0.1.92 protobuf torch >= 1.10 transformers >= 4.12.0 wandb\n", + "! 
pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datasets\n", + "import functools\n", + "import logging\n", + "import os\n", + "import numpy as np\n", + "import random\n", + "import sys\n", + "import torch\n", + "import transformers\n", + "from dataclasses import dataclass, field\n", + "from datasets import load_dataset, load_metric\n", + "from intel_extension_for_transformers import (\n", + " metrics,\n", + " DistillationConfig,\n", + " OptimizedModel,\n", + ")\n", + "from intel_extension_for_transformers.optimization.trainer import NLPTrainer\n", + "from torch.utils.data import DataLoader\n", + "from tqdm.auto import tqdm\n", + "from transformers import (\n", + " AutoConfig,\n", + " AutoModelForSequenceClassification,\n", + " AutoTokenizer,\n", + " DataCollatorWithPadding,\n", + " EvalPrediction,\n", + " HfArgumentParser,\n", + " PretrainedConfig,\n", + " TrainingArguments,\n", + " default_data_collator,\n", + " set_seed,\n", + ")\n", + "from transformers.trainer_utils import get_last_checkpoint\n", + "from transformers.utils import check_min_version\n", + "from typing import Optional\n", + "\n", + "\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n", + "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", + "\n", + "\n", + "# Will error if the minimal version of Transformers is not installed. Remove at your own risks.\n", + "check_min_version(\"4.12.0\")\n", + "\n", + "\n", + "task_to_keys = {\n", + " \"cola\": (\"sentence\", None),\n", + " \"mnli\": (\"premise\", \"hypothesis\"),\n", + " \"mrpc\": (\"sentence1\", \"sentence2\"),\n", + " \"qnli\": (\"question\", \"sentence\"),\n", + " \"qqp\": (\"question1\", \"question2\"),\n", + " \"rte\": (\"sentence1\", \"sentence2\"),\n", + " \"sst2\": (\"sentence\", None),\n", + " \"stsb\": (\"sentence1\", \"sentence2\"),\n", + " \"wnli\": (\"sentence1\", \"sentence2\"),\n", + "}\n", + "\n", + "logger = logging.getLogger(__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ========== Define arguments =========\n", + "@dataclass\n", + "class DataTrainingArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what data we are going to input our model for training and eval.\n", + " Using `HfArgumentParser` we can turn this class\n", + " into argparse arguments to be able to specify them on\n", + " the command line.\n", + " \"\"\"\n", + "\n", + " task_name: Optional[str] = field(\n", + " default=None,\n", + " metadata={\"help\": \"The name of the task to train on: \" + \", \".join(task_to_keys.keys())},\n", + " )\n", + " dataset_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"The name of the dataset to use (via the datasets library).\"}\n", + " )\n", + " dataset_config_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"The configuration name of the dataset to use (via the datasets library).\"}\n", + " )\n", + " max_seq_length: int = field(\n", + " default=128,\n", + " metadata={\n", + " \"help\": \"The maximum total input sequence length after tokenization. 
Sequences longer \"\n", + " \"than this will be truncated, sequences shorter will be padded.\"\n", + " },\n", + " )\n", + " overwrite_cache: bool = field(\n", + " default=False, metadata={\"help\": \"Overwrite the cached preprocessed datasets or not.\"}\n", + " )\n", + " pad_to_max_length: bool = field(\n", + " default=True,\n", + " metadata={\n", + " \"help\": \"Whether to pad all samples to `max_seq_length`. \"\n", + " \"If False, will pad the samples dynamically when batching to the maximum length in the batch.\"\n", + " },\n", + " )\n", + " max_train_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of training examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " max_eval_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of evaluation examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " max_predict_samples: Optional[int] = field(\n", + " default=None,\n", + " metadata={\n", + " \"help\": \"For debugging purposes or quicker training, truncate the number of prediction examples to this \"\n", + " \"value if set.\"\n", + " },\n", + " )\n", + " train_file: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"A csv or a json file containing the training data.\"}\n", + " )\n", + " validation_file: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"A csv or a json file containing the validation data.\"}\n", + " )\n", + "\n", + " def __post_init__(self):\n", + " if self.task_name is not None:\n", + " self.task_name = self.task_name.lower()\n", + " if self.task_name not in task_to_keys.keys():\n", + " raise ValueError(\"Unknown task, you should pick one in \" + \",\".join(task_to_keys.keys()))\n", + " elif self.dataset_name is not None:\n", + " pass\n", + " elif self.train_file is None or self.validation_file is None:\n", + " raise ValueError(\"Need either a GLUE task, a training/validation file or a dataset name.\")\n", + " else:\n", + " train_extension = self.train_file.split(\".\")[-1]\n", + " assert train_extension in [\"csv\", \"json\"], \"`train_file` should be a csv or a json file.\"\n", + " validation_extension = self.validation_file.split(\".\")[-1]\n", + " assert (\n", + " validation_extension == train_extension\n", + " ), \"`validation_file` should have the same extension (csv or json) as `train_file`.\"\n", + "\n", + "\n", + "@dataclass\n", + "class ModelArguments:\n", + " \"\"\"\n", + " Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.\n", + " \"\"\"\n", + "\n", + " model_name_or_path: str = field(\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"}\n", + " )\n", + " config_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"Pretrained config name or path if not the same as model_name\"}\n", + " )\n", + " tokenizer_name: Optional[str] = field(\n", + " default=None, metadata={\"help\": \"Pretrained tokenizer name or path if not the same as model_name\"}\n", + " )\n", + " cache_dir: Optional[str] = field(\n", + " default=None,\n", + " metadata={\"help\": \"Where do you want to store the pretrained models downloaded from huggingface.co\"},\n", + " )\n", + " use_fast_tokenizer: bool = field(\n", + " default=True,\n", + " metadata={\"help\": \"Whether to use one of the fast tokenizer (backed by the 
tokenizers library) or not.\"},\n", + " )\n", + " model_revision: str = field(\n", + " default=\"main\",\n", + " metadata={\"help\": \"The specific model version to use (can be a branch name, tag name or commit id).\"},\n", + " )\n", + " use_auth_token: bool = field(\n", + " default=False,\n", + " metadata={\n", + " \"help\": \"Will use the token generated when running `transformers-cli login` (necessary to use this script \"\n", + " \"with private models).\"\n", + " },\n", + " )\n", + "\n", + "\n", + "@dataclass\n", + "class OptimizationArguments:\n", + " \"\"\"\n", + " Arguments pertaining to what type of optimization we are going to apply on the model.\n", + " \"\"\"\n", + "\n", + " distillation: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"Whether or not to apply distillation.\"},\n", + " )\n", + " teacher_model_name_or_path: str = field(\n", + " default=False,\n", + " metadata={\"help\": \"Path to pretrained model or model identifier from huggingface.co/models\"}\n", + " )\n", + " metric_name: Optional[str] = field(\n", + " default=None,\n", + " metadata={\"help\": \"Metric used for the tuning strategy.\"},\n", + " )\n", + " tolerance_mode: Optional[str] = field(\n", + " default=\"absolute\",\n", + " metadata={\"help\": \"Metric tolerance model, expected to be relative or absolute.\"},\n", + " )\n", + " perf_tol: Optional[float] = field(\n", + " default=0.02,\n", + " metadata={\"help\": \"Performance tolerance when optimizing the model.\"},\n", + " )\n", + " benchmark: bool = field(\n", + " default=False,\n", + " metadata={\"help\": \"run benchmark.\"})\n", + " accuracy_only: bool = field(\n", + " default=False,\n", + " metadata={\"help\":\"Whether to only test accuracy for model tuned by Neural Compressor.\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We now keep distinct sets of args, for a cleaner separation of concerns.\n", + "parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, OptimizationArguments))\n", + "if len(sys.argv) == 2 and sys.argv[1].endswith(\".json\"):\n", + " # If we pass only one argument to the script and it's the path to a json file,\n", + " # let's parse it to get our arguments.\n", + " model_args, data_args, training_args, optim_args = parser.parse_json_file(\n", + " json_file=os.path.abspath(sys.argv[1])\n", + " )\n", + "else:\n", + " model_args, data_args, training_args, optim_args = parser.parse_args_into_dataclasses()\n", + "\n", + "# Setup logging\n", + "logging.basicConfig(\n", + " format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n", + " datefmt=\"%m/%d/%Y %H:%M:%S\",\n", + " handlers=[logging.StreamHandler(sys.stdout)],\n", + ")\n", + "\n", + "log_level = training_args.get_process_log_level()\n", + "logger.setLevel(log_level)\n", + "datasets.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.set_verbosity(log_level)\n", + "transformers.utils.logging.enable_default_handler()\n", + "transformers.utils.logging.enable_explicit_format()\n", + "\n", + "# Log on each process the small summary:\n", + "logger.warning(\n", + " f\"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}\"\n", + " + f\"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}\"\n", + ")\n", + "logger.info(f\"Training/evaluation parameters {training_args}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Download dataset from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# download the dataset.\n", + "raw_datasets = load_dataset(\"glue\", data_args.task_name)\n", + "# Labels\n", + "label_list = raw_datasets[\"train\"].features[\"label\"].names\n", + "num_labels = len(label_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download fp32 model from the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load pretrained model and tokenizer\n", + "config = AutoConfig.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " num_labels=num_labels,\n", + " finetuning_task=data_args.task_name,\n", + " revision=\"main\"\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " use_fast=True,\n", + " revision=\"main\"\n", + ")\n", + "model = AutoModelForSequenceClassification.from_pretrained(\n", + " model_args.model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in model_args.model_name_or_path),\n", + " config=config,\n", + " revision=\"main\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocessing the raw_datasets\n", + "sentence1_key, sentence2_key = task_to_keys[data_args.task_name]\n", + "# Padding strategy\n", + "padding = False\n", + "# Some models have set the order of the labels to use, so let's make sure we do use it.\n", + "label_to_id = None\n", + "if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:\n", + " # Some have all caps in their config, some don't.\n", + " label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}\n", + " if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):\n", + " label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}\n", + " else:\n", + " logger.warning(\n", + " f\"Your model seems to have been trained with labels, but they don't match the dataset: \"\n", + " f\"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}.\\n\"\n", + " f\"Ignoring the model labels as a result.\"\n", + " )\n", + "if label_to_id is not None:\n", + " model.config.label2id = label_to_id\n", + " model.config.id2label = {id: label for label, id in config.label2id.items()}\n", + "max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)\n", + "\n", + "def preprocess_function(examples, tokenizer=tokenizer):\n", + " # Tokenize the texts\n", + " args = (\n", + " (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])\n", + " )\n", + " result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)\n", + "\n", + " # Map labels to IDs (not necessary for GLUE tasks)\n", + " if label_to_id is not None and \"label\" in examples:\n", + " result[\"label\"] = [(label_to_id[l] if l != -1 else -1) for l in examples[\"label\"]]\n", + " return result\n", + "\n", + "with training_args.main_process_first(desc=\"dataset map pre-processing\"):\n", + " raw_datasets = raw_datasets.map(\n", + " preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache\n", + " )\n", + "\n", + "if training_args.do_train:\n", + " if \"train\" not in 
raw_datasets:\n", + " raise ValueError(\"--do_train requires a train dataset\")\n", + " train_dataset = raw_datasets[\"train\"]\n", + "\n", + "if training_args.do_eval:\n", + " if \"validation\" not in raw_datasets and \"validation_matched\" not in raw_datasets:\n", + " raise ValueError(\"--do_eval requires a validation dataset\")\n", + " eval_dataset = raw_datasets[\"validation_matched\" if data_args.task_name == \"mnli\" else \"validation\"]\n", + "\n", + "# Log a few random samples from the training set:\n", + "if training_args.do_train:\n", + " for index in random.sample(range(len(train_dataset)), 3):\n", + " logger.info(f\"Sample {index} of the training set: {train_dataset[index]}.\")\n", + "\n", + "# Get the metric function\n", + "metric = load_metric(\"glue\", data_args.task_name)\n", + "\n", + "metric_name = \"eval_accuracy\"\n", + "\n", + "# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with\n", + "# predictions and label_ids field) and has to return a dictionary string to float.\n", + "def compute_metrics(p: EvalPrediction):\n", + " preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions\n", + " preds = np.argmax(preds, axis=1)\n", + " if data_args.task_name is not None:\n", + " result = metric.compute(predictions=preds, references=p.label_ids)\n", + " if len(result) > 1:\n", + " result[\"combined_score\"] = np.mean(list(result.values())).item()\n", + " return result\n", + " else:\n", + " return {\"accuracy\": (preds == p.label_ids).astype(np.float32).mean().item()}\n", + "\n", + "# Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.\n", + "data_collator = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distillation & Benchmark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class BertModelforLogitsOutputOnly(torch.nn.Module):\n", + " def __init__(self, model):\n", + " super(BertModelforLogitsOutputOnly, self).__init__()\n", + " self.model = model\n", + " def forward(self, *args, **kwargs):\n", + " output = self.model(*args, **kwargs)\n", + " return output['logits']\n", + "\n", + "teacher_config = AutoConfig.from_pretrained(optim_args.teacher_model_name_or_path, \\\n", + " num_labels=num_labels, finetuning_task=data_args.task_name)\n", + "teacher_tokenizer = AutoTokenizer.from_pretrained(optim_args.teacher_model_name_or_path, \\\n", + " use_fast=model_args.use_fast_tokenizer)\n", + "teacher_model = AutoModelForSequenceClassification.from_pretrained(\n", + " optim_args.teacher_model_name_or_path,\n", + " from_tf=bool(\".ckpt\" in optim_args.teacher_model_name_or_path),\n", + " config=teacher_config,\n", + ")\n", + "teacher_model.to(training_args.device)\n", + "\n", + "# prepare datasets for teacher model\n", + "teacher_processed_datasets = raw_datasets.map(\n", + " functools.partial(preprocess_function, tokenizer=teacher_tokenizer), \n", + " batched=True, remove_columns=raw_datasets[\"train\"].column_names\n", + ")\n", + "teacher_train_dataset = teacher_processed_datasets[\"train\"]\n", + "teacher_train_dataset = teacher_train_dataset.select(range(data_args.max_train_samples))\n", + "teacher_eval_dataset = teacher_processed_datasets[\"validation_matched\" \\\n", + " if data_args.task_name == \"mnli\" else \"validation\"]\n", + "teacher_eval_dataset = 
teacher_eval_dataset.select(range(data_args.max_eval_samples))\n", + " \n", + "# get logits of teacher model\n", + "def dict_tensor_to_model_device(batch, model):\n", + " device = next(model.parameters()).device\n", + " for k in batch:\n", + " batch[k] = batch[k].to(device)\n", + "\n", + "def get_logits(teacher_model, train_dataset, teacher_train_dataset):\n", + " logger.info(\"***** Getting logits of teacher model *****\")\n", + " logger.info(f\" Num examples = {len(train_dataset) }\")\n", + " teacher_model.eval()\n", + " npy_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),\n", + " '{}.{}.npy'.format(data_args.task_name, \n", + " optim_args.teacher_model_name_or_path.replace('/', '.')))\n", + " if os.path.exists(npy_file):\n", + " teacher_logits = [x for x in np.load(npy_file)]\n", + " else:\n", + " sampler = None\n", + " if training_args.world_size > 1:\n", + " from transformers.trainer_pt_utils import ShardSampler\n", + " sampler = ShardSampler(\n", + " teacher_train_dataset,\n", + " batch_size=training_args.per_device_eval_batch_size,\n", + " num_processes=training_args.world_size,\n", + " process_index=training_args.process_index,\n", + " )\n", + " teacher_model = torch.nn.parallel.DistributedDataParallel(\n", + " teacher_model,\n", + " device_ids=[training_args.local_rank] \\\n", + " if training_args._n_gpu != 0 else None,\n", + " output_device=training_args.local_rank \\\n", + " if training_args._n_gpu != 0 else None,\n", + " )\n", + " train_dataloader = DataLoader(teacher_train_dataset, \n", + " collate_fn=data_collator,\n", + " sampler=sampler,\n", + " batch_size=training_args.per_device_eval_batch_size)\n", + " train_dataloader = tqdm(train_dataloader, desc=\"Evaluating\")\n", + " teacher_logits = []\n", + " for step, batch in enumerate(train_dataloader):\n", + " dict_tensor_to_model_device(batch, teacher_model)\n", + " outputs = teacher_model(**batch)\n", + " if training_args.world_size > 1:\n", + " outputs_list = [None for i in range(training_args.world_size)]\n", + " torch.distributed.all_gather_object(outputs_list, outputs)\n", + " outputs = torch.concat(outputs_list, dim=0)\n", + " teacher_logits += [x for x in outputs.cpu().numpy()]\n", + " if training_args.world_size > 1:\n", + " teacher_logits = teacher_logits[:len(teacher_train_dataset)]\n", + " if training_args.local_rank in [-1, 0]:\n", + " np.save(npy_file, np.array(teacher_logits))\n", + " return train_dataset.add_column('teacher_logits', teacher_logits)\n", + "\n", + "with torch.no_grad():\n", + " train_dataset = get_logits(BertModelforLogitsOutputOnly(teacher_model), train_dataset, teacher_train_dataset)\n", + " \n", + "para_counter = lambda model:sum(p.numel() for p in model.parameters())\n", + "logger.info(\"***** Number of teacher model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(teacher_model)/10**6))\n", + "logger.info(\"***** Number of student model parameters: {:.2f}M *****\".format(\\\n", + " para_counter(model)/10**6))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set seed before initializing model.\n", + "set_seed(training_args.seed)\n", + "# Initialize our Trainer\n", + "trainer = NLPTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset if training_args.do_train else None,\n", + " eval_dataset=eval_dataset if training_args.do_eval else None,\n", + " compute_metrics=compute_metrics,\n", + " tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + ")\n", + "# 
distillation\n", + "if not training_args.do_eval:\n", + " raise ValueError(\"do_eval must be set to True for distillation.\")\n", + "\n", + "tune_metric = metrics.Metric(name=metric_name)\n", + "distillation_conf = DistillationConfig(metrics=tune_metric)\n", + "model = trainer.distill(\n", + " distillation_config=distillation_conf, teacher_model=teacher_model\n", + ")\n", + "trainer.save_model(training_args.output_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Benchmark after Distillation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the model obtained after Intel Neural Compressor (INC) quantization\n", + "model = OptimizedModel.from_pretrained(\n", + " training_args.output_dir,\n", + ")\n", + "model.eval()\n", + "trainer.model = model\n", + "results = trainer.evaluate()\n", + "logger.info(\"metrics keys: {}\".format(results.keys()))\n", + "bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',\n", + " 'eval_pearson', 'eval_mcc', 'eval_spearmanr']\n", + "\n", + "for key in bert_task_acc_keys:\n", + " if key in results.keys():\n", + " ret = True\n", + " throughput = results.get(\"eval_samples_per_second\")\n", + " print('Batch size = ', training_args.per_device_eval_batch_size)\n", + " print(\"Finally Eval {} Accuracy: {}\".format(key, results[key]))\n", + " print(\"Latency: {:.5f} ms\".format(1000 / throughput))\n", + " print(\"Throughput: {:.5f} samples/sec\".format(throughput))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.6 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "a3ed54c68abdb79eabea0140062ffa976ea4d8132b937aa83ca919a8d862edf2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/tutorials/pytorch/token-classification/benchmark.py b/docs/tutorials/pytorch/token-classification/benchmark.py new file mode 100644 index 00000000000..9c2c083c672 --- /dev/null +++ b/docs/tutorials/pytorch/token-classification/benchmark.py @@ -0,0 +1,242 @@ +import logging +import os +import numpy as np +from datasets import ClassLabel, load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + PretrainedConfig, + TrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "elastic/distilbert-base-uncased-finetuned-conll03-english", help = 'input model for benchmark') +args = arg_parser.parse_args() + +# download the dataset. 
+raw_datasets = load_dataset("conll2003") +training_args = TrainingArguments( + output_dir=args.model_name_or_path, + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, +) +column_names = raw_datasets["train"].column_names +features = raw_datasets["train"].features +text_column_name = "tokens" +label_column_name = "ner_tags" + +# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the +# unique labels. +def get_label_list(labels): + unique_labels = set() + for label in labels: + unique_labels = unique_labels | set(label) + label_list = list(unique_labels) + label_list.sort() + return label_list + +# If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere. +# Otherwise, we have to get the list of labels manually. +labels_are_int = isinstance(features[label_column_name].feature, ClassLabel) +if labels_are_int: + label_list = features[label_column_name].feature.names + label_to_id = {i: i for i in range(len(label_list))} +else: + label_list = get_label_list(raw_datasets["train"][label_column_name]) + label_to_id = {l: i for i, l in enumerate(label_list)} + +num_labels = len(label_list) + +# download model & vocab. +config = AutoConfig.from_pretrained( + args.model_name_or_path, + num_labels=num_labels, + finetuning_task="ner", + revision="main", +) + +tokenizer_name_or_path = args.model_name_or_path +if config.model_type in {"gpt2", "roberta"}: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + use_fast=True, + revision="main", + add_prefix_space=True, + ) +else: + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name_or_path, + use_fast=True, + revision="main", + ) + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = AutoModelForTokenClassification.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) +# Model has labels -> use them. +if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: + if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)): + # Reorganize `label_list` to match the ordering of the model. + if labels_are_int: + label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)} + label_list = [model.config.id2label[i] for i in range(num_labels)] + else: + label_list = [model.config.id2label[i] for i in range(num_labels)] + label_to_id = {l: i for i, l in enumerate(label_list)} + else: + logger.warning( + "Your model seems to have been trained with labels, but they don't match the dataset: ", + f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels: {list(sorted(label_list))}." 
+ "\nIgnoring the model labels as a result.", + ) + +# Set the correspondences label/ID inside the model config +model.config.label2id = {l: i for i, l in enumerate(label_list)} +model.config.id2label = {i: l for i, l in enumerate(label_list)} + +# Map that sends B-Xxx label to its I-Xxx counterpart +b_to_i_label = [] +for idx, label in enumerate(label_list): + if label.startswith("B-") and label.replace("B-", "I-") in label_list: + b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) + else: + b_to_i_label.append(idx) + +# Padding strategy +padding = "max_length" + +# Tokenize all texts and align the labels with them. +def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer( + examples[text_column_name], + padding=padding, + truncation=True, + # We use this argument because the texts in our dataset are lists of words (with a label for each word). + is_split_into_words=True, + ) + labels = [] + for i, label in enumerate(examples[label_column_name]): + word_ids = tokenized_inputs.word_ids(batch_index=i) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + # Special tokens have a word id that is None. We set the label to -100 so they are automatically + # ignored in the loss function. + if word_idx is None: + label_ids.append(-100) + # We set the label for the first token of each word. + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[label[word_idx]]) + # For the other tokens in a word, we set the label to either the current label or -100, depending on + # the label_all_tokens flag. + else: + label_ids.append(-100) + previous_word_idx = word_idx + + labels.append(label_ids) + tokenized_inputs["labels"] = labels + return tokenized_inputs + +# train dataset +train_dataset = raw_datasets["train"] +with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + tokenize_and_align_labels, + batched=True, + load_from_cache_file=False, + desc="Running tokenizer on train dataset", + ) + +# evaluation dataset +eval_dataset = raw_datasets["validation"] +eval_dataset = eval_dataset.select(range(1000)) +with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + tokenize_and_align_labels, + batched=True, + load_from_cache_file=False, + desc="Running tokenizer on validation dataset", + ) + +# Data collator +data_collator = DataCollatorForTokenClassification(tokenizer) + +# Metrics +metric = load_metric("seqeval") +metric_name = "eval_f1" + +def compute_metrics(p): + predictions, labels = p + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + results = metric.compute(predictions=true_predictions, references=true_labels) + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + 
compute_metrics=compute_metrics, +) + +results = trainer.evaluate() +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb b/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb index b7bb2c2ba78..25638c19078 100644 --- a/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb +++ b/docs/tutorials/pytorch/token-classification/distilbert_base_ner.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. " ] }, { @@ -242,7 +242,7 @@ " pad_to_max_length=True\n", ")\n", "training_args = TrainingArguments(\n", - " output_dir=\"/tmp/conll03_output\",\n", + " output_dir=\"./saved_results_static\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -554,8 +554,8 @@ ")\n", "\n", "# tuning\n", - "model.config.save_pretrained(training_args.output_dir)\n", - "trainer_static.save_model(training_args.output_dir)\n", + "model.config.save_pretrained(\"./saved_results_static\")\n", + "trainer_static.save_model(\"./saved_results_static\")\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", @@ -598,6 +598,26 @@ "print(\"Throughput: {} samples/sec\".format(throughput_static))" ] }, + { + "cell_type": "markdown", + "id": "765c996e", + "metadata": {}, + "source": [ + "## Run Benchmark after Static Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "103c648d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.system('numactl --hardware')\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_static --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "7a51f6ca", @@ -623,6 +643,7 @@ "source": [ "# Set seed before initializing model.\n", "set_seed(training_args.seed)\n", + "training_args.output_dir = \"saved_results_dynamic\"\n", "# Initialize our Trainer\n", "trainer_dynamic = NLPTrainer(\n", " model=model,\n", @@ -635,8 +656,8 @@ ")\n", "\n", "# tuning\n", - "model.config.save_pretrained(training_args.output_dir)\n", - "trainer_dynamic.save_model(training_args.output_dir)\n", + "model.config.save_pretrained(\"./saved_results_dynamic\")\n", + "trainer_dynamic.save_model(\"./saved_results_dynamic\")\n", "tune_metric = metrics.Metric(\n", " name=metric_name, is_relative=True, criterion=0.25\n", ")\n", @@ -678,6 +699,25 @@ "print(\"Throughput: {} samples/sec\".format(throughput_dynamic))" ] }, + { + "cell_type": "markdown", + "id": "cb86bdae", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d92088a2", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + 
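+    "# Note: ../multi_instance.sh is assumed to launch one benchmark process per core group;\n",
+    "# --core_per_instance=4 is an example value, tune it to the core count reported by\n",
+    "# `numactl --hardware` on your machine.\n",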
"results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -722,6 +762,25 @@ "print(\"Latency: {:.3f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "784fe8d5", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0239337e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=elastic/distilbert-base-uncased-finetuned-conll03-english --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": { diff --git a/docs/tutorials/pytorch/translation/benchmark.py b/docs/tutorials/pytorch/translation/benchmark.py new file mode 100644 index 00000000000..30fbc5b2e59 --- /dev/null +++ b/docs/tutorials/pytorch/translation/benchmark.py @@ -0,0 +1,171 @@ +import logging +import os +import numpy as np +from datasets import load_dataset, load_metric +from intel_extension_for_transformers import OptimizedModel +from intel_extension_for_transformers.optimization.trainer import NLPSeq2SeqTrainer +from argparse import ArgumentParser +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + Seq2SeqTrainingArguments, + set_seed, +) + +os.environ["WANDB_DISABLED"] = "true" + +logger = logging.getLogger(__name__) + +arg_parser = ArgumentParser(description='Parse args') +arg_parser.add_argument('--data_type', default = "int8", help='data type of model') +arg_parser.add_argument('--model_name_or_path', default = "t5-small", help = 'input model for benchmark') +args = arg_parser.parse_args() + +raw_datasets = load_dataset("wmt16", "ro-en") +training_args = Seq2SeqTrainingArguments( + output_dir="./saved_results_dynamic", + do_eval=True, + do_train=True, + no_cuda=True, + overwrite_output_dir=True, + per_device_eval_batch_size=8, + predict_with_generate=True +) +config = AutoConfig.from_pretrained(args.model_name_or_path, revision="main") +tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, revision="main", use_fast=True) +prefix = "" + +## start with int8 benchmarking +if args.data_type == "int8": + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main", + use_auth_token=None, + ) +else: + ## original fp32 model benchmarking + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = AutoModelForSeq2SeqLM.from_pretrained( + "t5-small", + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + revision="main" + ) + model.resize_token_embeddings(len(tokenizer)) + +# We need to tokenize inputs and targets. +column_names = raw_datasets["train"].column_names + +# Get the language codes for input/target. +source_lang = "en" +target_lang = "ro" + +# Temporarily set max_target_length for training. 
+max_target_length = 128 +padding = False + +def preprocess_function(examples): + inputs = [ex[source_lang] for ex in examples["translation"]] + targets = [ex[target_lang] for ex in examples["translation"]] + inputs = [prefix + inp for inp in inputs] + model_inputs = tokenizer(inputs, max_length=1024, padding=padding, truncation=True) + + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + +# define train dataset +train_dataset = raw_datasets["train"] +with training_args.main_process_first(desc="train dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on train dataset", + ) + +# define eval dataset +eval_dataset = raw_datasets["validation"] +max_eval_samples = min(len(eval_dataset), 400) +eval_dataset = eval_dataset.select(range(max_eval_samples)) +with training_args.main_process_first(desc="validation dataset map pre-processing"): + eval_dataset = eval_dataset.map( + preprocess_function, + batched=True, + remove_columns=column_names, + load_from_cache_file=False, + desc="Running tokenizer on validation dataset", + ) + +# Data collator +label_pad_token_id = -100 +data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, +) + +# Metric +metric = load_metric("sacrebleu") + +def postprocess_text(preds, labels): + preds = [pred.strip() for pred in preds] + labels = [[label.strip()] for label in labels] + + return preds, labels + +def compute_metrics(eval_preds): + preds, labels = eval_preds + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + # Replace -100 in the labels as we can't decode them. 
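+    # -100 is the label_pad_token_id inserted by DataCollatorForSeq2Seq above; the tokenizer
+    # cannot decode it, so each -100 is swapped for the pad token before batch_decode, e.g.
+    # (illustrative) [[42, 7, -100]] -> [[42, 7, tokenizer.pad_token_id]].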
+ labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + # Some simple post-processing + decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + + result = metric.compute(predictions=decoded_preds, references=decoded_labels) + result = {"bleu": result["score"]} + + prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] + result["gen_len"] = np.mean(prediction_lens) + result = {k: round(v, 4) for k, v in result.items()} + return result + +metric_name = "eval_bleu" +max_length = 128 +num_beams = None +# Initialize the Trainer +set_seed(training_args.seed) +trainer = NLPSeq2SeqTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, +) + +results = trainer.evaluate(max_length=max_length, num_beams=num_beams) +bert_task_acc_keys = ['eval_loss', 'eval_f1', 'eval_accuracy', 'eval_matthews_correlation', + 'eval_pearson', 'eval_mcc', 'eval_spearmanr'] + +throughput = results.get("eval_samples_per_second") +eval_loss = results["eval_loss"] +print('Batch size = {}'.format(training_args.per_device_eval_batch_size)) +print("Finally Eval eval_loss Accuracy: {}".format(eval_loss)) +print("Latency: {:.3f} ms".format(1000 / throughput)) +print("Throughput: {} samples/sec".format(throughput)) diff --git a/docs/tutorials/pytorch/translation/t5-small.ipynb b/docs/tutorials/pytorch/translation/t5-small.ipynb index cff4566c044..90e41e9f12e 100644 --- a/docs/tutorials/pytorch/translation/t5-small.ipynb +++ b/docs/tutorials/pytorch/translation/t5-small.ipynb @@ -57,7 +57,7 @@ } }, "source": [ - "* Follow [installation](https://github.com/intel/intel-extension-for-transformers#installation) to install **intel-extension-for-transformers**. " + "* Follow [installation](https://github.com/intel/intel_extension_for_transformers#installation) to install **intel-extension-for-transformers**. 
" ] }, { @@ -249,7 +249,7 @@ " source_prefix=\"translate English to Romanian: \"\n", ")\n", "training_args = Seq2SeqTrainingArguments(\n", - " output_dir=\"/tmp/tst-translation\",\n", + " output_dir=\"./saved_results_dynamic\",\n", " do_eval=True,\n", " do_train=True,\n", " no_cuda=True,\n", @@ -551,6 +551,25 @@ "print(\"Throughput: {:.5f} samples/sec\".format(throughput))" ] }, + { + "cell_type": "markdown", + "id": "4f477616", + "metadata": {}, + "source": [ + "## Run Benchmark after Dynamic Post Training Quantization with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40db91b", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=saved_results_dynamic --core_per_instance=4 --data_type=int8')" + ] + }, { "cell_type": "markdown", "id": "44cca2a1", @@ -594,6 +613,25 @@ "print(\"Latency: {:.5f} ms\".format(1000 / throughput_fp32))\n", "print(\"Throughput: {:.5f} samples/sec\".format(throughput_fp32))" ] + }, + { + "cell_type": "markdown", + "id": "96e88ce1", + "metadata": {}, + "source": [ + "## Run Benchmark for FP32 Model with Multi-Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ade4c930", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "results = os.system('bash ../multi_instance.sh --model=t5-small --core_per_instance=4 --data_type=fp32')" + ] } ], "metadata": {