!pip install -q transformers torch gptqmodel autoawq gguf bitsandbytes

Quantization Methods for LLMs
A hands-on guide to GPTQ, AWQ, GGUF, and bitsandbytes — reducing LLM memory footprint while preserving quality
Table of Contents
1. Setup & Installation
Install the required packages for LLM quantization.
2. Why Quantize?
Quantization reduces the numerical precision of model weights, dramatically cutting memory usage and often improving inference speed.
Precision vs Memory
| Precision | Bytes/Param | 7B Model | 13B Model | 70B Model |
|---|---|---|---|---|
| FP32 | 4 | 28 GB | 52 GB | 280 GB |
| FP16 / BF16 | 2 | 14 GB | 26 GB | 140 GB |
| INT8 | 1 | 7 GB | 13 GB | 70 GB |
| INT4 | 0.5 | 3.5 GB | 6.5 GB | 35 GB |
Key Trade-offs
- Lower precision = less memory + faster inference
- Higher precision = better accuracy + more memory
- Modern quantization methods (GPTQ, AWQ) minimize quality loss
3. GPTQ Quantization
GPTQ is a post-training quantization method that uses a calibration dataset to find optimal quantized weights layer by layer, minimizing the output error.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# A small model keeps the quantization demo fast.
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Configure GPTQ quantization: weights are quantized layer by layer,
# minimizing output error measured on the calibration dataset.
gptq_config = GPTQConfig(
    bits=4,               # Quantize weights to 4 bits
    dataset="c4",         # Calibration dataset (fetched from the Hub)
    tokenizer=tokenizer,  # Used to tokenize the calibration samples
)

# Load the full-precision checkpoint and quantize it on the fly.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=gptq_config,
    device_map="auto",
)

# Plain string (no placeholders), so no f-prefix needed.
print("Model loaded with GPTQ 4-bit quantization")
print(f"Model dtype: {model.dtype}")

# Save the quantized model for reuse:
# model.save_pretrained("./opt-125m-gptq-4bit")
# tokenizer.save_pretrained("./opt-125m-gptq-4bit")

4. Load Pre-quantized GPTQ
Many GPTQ-quantized models are available on the Hugging Face Hub, ready to use without running quantization yourself.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load a pre-quantized GPTQ model from the Hub.
# Fix: the original pointed at "facebook/opt-125m", which is a full-precision
# checkpoint, not a GPTQ-quantized one — use a repo that actually ships
# GPTQ weights (the quantization config is read from the repo itself).
quantized_model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(quantized_model_id)
model = AutoModelForCausalLM.from_pretrained(
    quantized_model_id,
    device_map="auto",
)

# Generate text
inputs = tokenizer("Quantization reduces model size by", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

5. AWQ Quantization
AWQ (Activation-aware Weight Quantization) identifies the most important weights by analyzing activations and protects them during quantization. It supports kernel fusion for faster inference.
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

# Pre-quantized AWQ checkpoint from the Hub.
awq_model_id = "TheBloke/Mistral-7B-OpenOrca-AWQ"

# Kernel fusion merges attention/MLP ops into fused kernels for faster
# decoding, at the cost of a fixed maximum sequence length.
awq_config = AwqConfig(
    do_fuse=True,
    fuse_max_seq_len=512,
)

tokenizer = AutoTokenizer.from_pretrained(awq_model_id)
model = AutoModelForCausalLM.from_pretrained(
    awq_model_id,
    quantization_config=awq_config,
    device_map="auto",
)
# Plain string (no placeholders), so no f-prefix needed.
print("AWQ model loaded with kernel fusion enabled")

# Generate text
inputs = tokenizer("The benefits of quantization include", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

6. GGUF in Transformers
GGUF is the file format used by llama.cpp. Transformers can now load GGUF files directly, making it easy to use community-quantized models.
from transformers import AutoModelForCausalLM, AutoTokenizer

# GGUF (llama.cpp's file format) can be loaded directly by Transformers:
# point both the tokenizer and the model at one specific .gguf file in a repo.
repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
weight_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

tokenizer = AutoTokenizer.from_pretrained(repo_id, gguf_file=weight_file)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    gguf_file=weight_file,
    device_map="auto",
)
print(f"GGUF model loaded: {weight_file}")

# Quick generation to sanity-check the loaded weights.
prompt_text = "Explain quantization in simple terms:"
inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

7. bitsandbytes 8-bit
bitsandbytes provides on-the-fly quantization when loading models. 8-bit mode uses LLM.int8() which keeps outlier features in FP16 for quality.
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 8-bit quantization (LLM.int8()): weights are stored in INT8 while
# outlier activation channels are computed in FP16 to preserve quality.
bnb_config_8bit = BitsAndBytesConfig(
    load_in_8bit=True,
)

model_id = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config_8bit,
    device_map="auto",
)
# Plain string (no placeholders), so no f-prefix needed.
print("Model loaded in 8-bit precision")

# Generate text
inputs = tokenizer("Machine learning models can be compressed by", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

8. bitsandbytes 4-bit NF4 (QLoRA)
NF4 (NormalFloat 4-bit) is a quantization type optimized for normally distributed weights. Combined with double quantization, it powers QLoRA — enabling fine-tuning of quantized models.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 with double quantization — the standard QLoRA configuration.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NormalFloat 4-bit, suited to normally distributed weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # Matmuls run in BF16
    bnb_4bit_use_double_quant=True,         # Also quantize the quantization constants (~0.4 bits/param saved)
)

model_id = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config_4bit,
    device_map="auto",
)
# Plain string (no placeholders), so no f-prefix needed.
print("Model loaded in 4-bit NF4 precision (QLoRA-ready)")

# Generate text
inputs = tokenizer("Neural network quantization", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

9. Memory Usage Comparison
A practical comparison of memory footprints across different quantization methods using the same base model.
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Shared settings for the comparison: one small base model, one prompt,
# reused by every configuration below so results are directly comparable.
model_id = "facebook/opt-125m"
prompt = "Artificial intelligence is transforming"
def get_model_memory_mb(model):
    """Estimate model memory usage in MB.

    Fix: also count registered buffers (e.g. batch-norm running stats,
    quantization state tensors), which ``parameters()`` alone misses —
    the original estimate undercounted for models that carry buffers.
    """
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())
    return (param_bytes + buffer_bytes) / (1024 * 1024)
def load_and_test(model_id, prompt, config_name, quantization_config=None, dtype=None):
    """Load model, measure memory, generate text."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Assemble loading options: quantization config and/or explicit dtype
    # are only passed through when the caller supplied them.
    kwargs = {"device_map": "auto"}
    if quantization_config:
        kwargs["quantization_config"] = quantization_config
    if dtype:
        kwargs["torch_dtype"] = dtype

    model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
    footprint_mb = get_model_memory_mb(model)

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_new_tokens=30)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    print(f"\n{'='*60}")
    print(f"Config: {config_name}")
    print(f"Memory: {footprint_mb:.1f} MB")
    print(f"Output: {decoded}")

    # Release the model before the next configuration is loaded.
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return {"config": config_name, "memory_mb": footprint_mb, "output": decoded}
# Run the same prompt through each precision configuration and collect
# one result dict per run for the summary table.
results = []

# 1. FP16 — the unquantized baseline.
results.append(load_and_test(
    model_id, prompt, "FP16",
    dtype=torch.float16,
))

# 2. 8-bit — bitsandbytes LLM.int8() on-the-fly quantization.
results.append(load_and_test(
    model_id, prompt, "INT8 (bitsandbytes)",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
))

# 3. 4-bit NF4 with double quantization — the QLoRA-style config.
results.append(load_and_test(
    model_id, prompt, "NF4 4-bit (bitsandbytes)",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    ),
))

# Summary table header
print(f"\n\n{'='*60}")
print(f"{'Config':<30} {'Memory (MB)':>12}")
print(f"{'-'*30} {'-'*12}")
for r in results:
    print(f"{r['config']:<30} {r['memory_mb']:>10.1f} MB")

10. When to Use What
Decision Rules
- Quick experimentation → bitsandbytes (zero-setup, just add a config)
- Production inference serving → GPTQ or AWQ (optimized kernels, best throughput)
- CPU / edge deployment → GGUF via llama.cpp (ubiquitous, cross-platform)
- Fine-tuning a large model → bitsandbytes 4-bit NF4 + LoRA (QLoRA)
- Maximum quality preservation → AWQ (activation-aware, best quality/size ratio)
Comparison Table
| Method | Bits | Calibration | Speed | Quality | Fine-tuning | Best For |
|---|---|---|---|---|---|---|
| GPTQ | 2-8 | Required | Fast (GPU) | Good | Limited | GPU inference serving |
| AWQ | 4 | Required | Fastest (fused) | Best | Limited | High-throughput inference |
| GGUF | 2-8 | Pre-computed | Moderate | Good | No | CPU / edge / llama.cpp |
| bnb 8-bit | 8 | None | Good | Very Good | Yes | Quick experiments |
| bnb 4-bit NF4 | 4 | None | Good | Good | Yes (QLoRA) | Fine-tuning large models |