LLMEasyQuant is a quantization toolkit for Large Language Models (LLMs): it wraps multiple compression algorithms (absmax, zero-point, SmoothQuant, SimQuant, SymQuant/ZeroQuant, AWQ, BiLLM, QLoRA) behind a single Quantizer interface, and ships utilities for comparing weight distributions, generations, and perplexity, plus knowledge-distillation helpers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Set device to CPU for now
device = 'cpu'
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load model and tokenizer
model_id = 'gpt2'  # 137M FP32 params
# model_id = 'facebook/opt-1.3b'  # 1.3B FP16 params
# model_id = 'mistralai/Mistral-7B-v0.1'  # 7.24B BF16 params, auth required
# model_id = 'meta-llama/Llama-2-7b-hf' # auth required
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_int8 = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
model_int8.name_or_path += "_int8"
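To confirm the 8-bit load actually shrinks the model, compare memory footprints; get_memory_footprint() is a standard transformers method:

print(f"FP32 footprint: {model.get_memory_footprint() / 1e6:,.0f} MB")
print(f"INT8 footprint: {model_int8.get_memory_footprint() / 1e6:,.0f} MB")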
# The Quantizer wrapper and the per-algorithm quantize functions ship with
# LLMEasyQuant (the exact import path is assumed here and may vary by release)
from llmeasyquant import (
    Quantizer, absmax_quantize, zeropoint_quantize, SmoothQuantMatrix,
    sim_quantize, sym_quantize_8bit, zeroquant_func, awq_quantize,
    billm_quantize, qlora_quantize, dist_plot, compare_generation, compare_ppl,
)

quantizers = []

# absmax
absq = Quantizer(model, tokenizer, absmax_quantize)
quantizers.append(absq)
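For intuition, absmax quantization scales a tensor symmetrically so its largest absolute value maps to 127; a minimal sketch (illustrative only, not necessarily LLMEasyQuant's exact implementation):

def absmax_quantize_sketch(x):
    # Symmetric int8: the largest-magnitude value maps to +/-127
    scale = 127 / torch.max(torch.abs(x))
    q = (scale * x).round().to(torch.int8)
    x_dq = q.to(torch.float32) / scale  # dequantize to inspect rounding error
    return q, x_dq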
# zeropoint
zpq = Quantizer(model, tokenizer, zeropoint_quantize)
quantizers.append(zpq)
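Zero-point quantization instead uses an asymmetric affine mapping, which wastes less of the int8 range when the value distribution is skewed; same caveat as above:

def zeropoint_quantize_sketch(x):
    # Affine int8: map [min, max] onto the full [-128, 127] range
    x_range = torch.max(x) - torch.min(x)
    x_range = 1 if x_range == 0 else x_range
    scale = 255 / x_range
    zeropoint = (-scale * torch.min(x) - 128).round()
    q = torch.clip((scale * x + zeropoint).round(), -128, 127).to(torch.int8)
    x_dq = (q.to(torch.float32) - zeropoint) / scale
    return q, x_dq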
# smoothquant
smooth_quant = SmoothQuantMatrix(alpha=0.5)
smoothq = Quantizer(model, tokenizer, smooth_quant.smooth_quant_apply)
quantizers.append(smoothq)
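The alpha=0.5 above is SmoothQuant's migration strength: per input channel j it computes s_j = max|X_j|^alpha / max|W_j|^(1-alpha), then divides activations and multiplies weights by s, moving outlier difficulty from activations into weights. A rough sketch of the scale computation (tensor names are illustrative):

def smooth_scales_sketch(act_absmax, w_absmax, alpha=0.5):
    # act_absmax / w_absmax: per-channel max |value| of activations / weights
    s = act_absmax.pow(alpha) / w_absmax.pow(1 - alpha)
    return s  # use X / s and W * s before quantizing both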
# simquant
simq = Quantizer(model, tokenizer, sim_quantize)
quantizers.append(simq)
# symquant and zeroquant, with knowledge distillation for each
symq = Quantizer(model, tokenizer, sym_quantize_8bit)
zeroq = Quantizer(model, tokenizer, sym_quantize_8bit, zeroquant_func)
quantizers.extend([symq, zeroq])
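The extra zeroquant_func argument pairs quantization with ZeroQuant-style layer-wise knowledge distillation, where each quantized layer is trained to reproduce the output of its full-precision counterpart. A loose sketch of that per-layer objective (assuming layer outputs are plain tensors):

def layerwise_kd_loss_sketch(fp_layer_out, quant_layer_out):
    # Match the quantized layer's output to the full-precision layer's
    return torch.nn.functional.mse_loss(quant_layer_out, fp_layer_out)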
# AWQ
awq = Quantizer(model, tokenizer, awq_quantize)
quantizers.append(awq)
# BiLLM
billmq = Quantizer(model, tokenizer, billm_quantize)
quantizers.append(billmq)
# QLoRA
qloraq = Quantizer(model, tokenizer, qlora_quantize)
quantizers.append(qloraq)
for q in quantizers:
    q.quantize()
# Compare weight distributions, sample generations, and perplexity across
# the FP32 baseline, the bitsandbytes INT8 model, and every Quantizer output
models = [model, model_int8] + [q.quant for q in quantizers]
dist_plot(models)
generated = compare_generation(models, tokenizer, max_length=200, temperature=0.8)
ppls = compare_ppl(models, tokenizer, list(generated.values()))
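compare_ppl reports perplexity, i.e. exp of the mean per-token negative log-likelihood; a minimal stand-alone version for a single model and string (names here are illustrative):

@torch.no_grad()
def perplexity_sketch(model, tokenizer, text):
    ids = tokenizer(text, return_tensors='pt').input_ids.to(model.device)
    # HF causal LMs return the mean cross-entropy when labels are supplied
    loss = model(ids, labels=ids).loss
    return torch.exp(loss).item()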
from llmeasyquant.algorithm import TeacherStudentDistillation, LayerWiseDistillation
# Initialize distillation methods
teacher_student = TeacherStudentDistillation(temperature=2.0)
layer_wise = LayerWiseDistillation(layer_mapping={'teacher.layer1': 'student.layer1'})
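For reference, temperature-scaled teacher-student distillation softens both logit distributions before matching them with a KL term; a generic sketch (the actual TeacherStudentDistillation API may differ):

import torch.nn.functional as F

def kd_loss_sketch(student_logits, teacher_logits, T=2.0):
    # Soften both distributions with temperature T, then minimize their KL
    log_p_s = F.log_softmax(student_logits / T, dim=-1)
    p_t = F.softmax(teacher_logits / T, dim=-1)
    # The T*T factor keeps gradient magnitudes comparable across temperatures
    return F.kl_div(log_p_s, p_t, reduction='batchmean') * T * T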
If you find LLMEasyQuant useful in your research, please cite our papers:
@inproceedings{liu2025llmeasyquant,
  title={LLMEasyQuant: Scalable Quantization for Parallel and Distributed LLM Inference},
  author={Liu, Dong and Yu, Yanxuan},
  booktitle={AI4X 2025 International Conference},
  year={2025}
}
@article{liu2024llmeasyquant,
  title={LLMEasyQuant--An Easy to Use Toolkit for LLM Quantization},
  author={Liu, Dong and Jiang, Meng and Pister, Kaiser},
  journal={arXiv preprint arXiv:2406.19657},
  year={2024}
}