Loading...
Loading...
Memory-efficient fine-tuning with 4-bit quantization and LoRA adapters. Use when fine-tuning large models (7B+) on consumer GPUs, when VRAM is limited, or when standard LoRA still exceeds memory. Builds on the lora skill.
npx skill4agent add itsmostafa/llm-engineering-skills qlora

Prerequisites: This skill assumes familiarity with LoRA. See the `lora` skill for LoRA fundamentals (LoraConfig, target_modules, training patterns).
Storage: 4-bit NF4 (quantized weights)
Compute: 16-bit BF16 (dequantized for forward/backward pass)

| Quantization | Description | Use Case |
|---|---|---|
| `nf4` | Normalized Float 4-bit, optimal for normal distributions | Default, recommended |
| `fp4` | Standard 4-bit float | Legacy, rarely needed |
First quantization: weights → 4-bit + fp32 scaling constants
Double quantization: scaling constants → 8-bit + fp32 second-level constants

Normal training: OOM on memory spike
Paged optimizers: GPU ↔ CPU transfer handles spikes gracefully

from transformers import BitsAndBytesConfig
import torch
bnb_config = BitsAndBytesConfig(
# Core 4-bit settings
load_in_4bit=True, # Enable 4-bit quantization
bnb_4bit_quant_type="nf4", # "nf4" (recommended) or "fp4"
# Double quantization
bnb_4bit_use_double_quant=True, # Quantize the quantization constants
# Compute precision
bnb_4bit_compute_dtype=torch.bfloat16, # Dequantize to this dtype for compute
# Optional: specific storage type (usually auto-detected)
bnb_4bit_quant_storage=torch.uint8, # Storage dtype for quantized weights
)

| Dtype | Hardware | Notes |
|---|---|---|
| `torch.bfloat16` | Ampere+ (RTX 30xx, A100) | Recommended, faster |
| `torch.float16` | Older GPUs (V100, RTX 20xx) | Use if bf16 not supported |
| `torch.float32` | Any | Slower, only for debugging |
import torch
print(torch.cuda.is_bf16_supported()) # True on Ampere+

# Recommended: NF4 + double quant + bf16
optimal_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
)
# fp16 compute variant, for GPUs without bf16 support. Weights are stored in
# 4-bit NF4 either way and fp16/bf16 are both 16-bit, so the memory footprint
# is the same as optimal_config; only the compute dtype differs.
max_savings_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.float16, # fp16 and bf16 are both 16-bit: no memory saving over bf16
)
# 8-bit alternative (less compression, sometimes more stable)
eight_bit_config = BitsAndBytesConfig(
load_in_8bit=True,
)

| Model Size | Full Fine-tuning | LoRA (16-bit) | QLoRA (4-bit) |
|---|---|---|---|
| 7B | ~60 GB | ~16 GB | ~6 GB |
| 13B | ~104 GB | ~28 GB | ~10 GB |
| 34B | ~272 GB | ~75 GB | ~20 GB |
| 70B | ~560 GB | ~160 GB | ~48 GB |
| GPU VRAM | Max Model Size (QLoRA) |
|---|---|
| 8 GB | 7B (tight) |
| 16 GB | 7-13B |
| 24 GB | 13-34B |
| 48 GB | 34-70B |
| 80 GB | 70B+ comfortably |
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch
# 1. Quantization config
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=torch.bfloat16,
)
# 2. Load quantized model
model_name = "meta-llama/Llama-3.1-8B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # Keep the non-quantized modules in the same dtype as the 4-bit compute
    # dtype and the bf16 training run. FlashAttention-2 only supports
    # fp16/bf16 and warns when no dtype is specified.
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # Optional: faster attention. Requires the separate `flash-attn` package;
    # drop this argument if it is not installed.
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
# 3. Prepare for k-bit training (critical step!)
model = prepare_model_for_kbit_training(model)
# 4. LoRA config (see lora skill for parameter details)
lora_config = LoraConfig(
r=16,
lora_alpha=32,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# 5. Dataset
dataset = load_dataset("tatsu-lab/alpaca", split="train[:1000]")
def format_example(example):
    """Render one Alpaca-style record as a single prompt/response string.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key holding extra context.

    Returns:
        A dict with a single ``text`` key. The ``### Input:`` section is
        included only when ``input`` is present and non-empty.
    """
    sections = [f"### Instruction:\n{example['instruction']}"]
    # .get() lets instruction/output-only records (no "input" field at all)
    # take the same path as records whose input is an empty string.
    context = example.get("input")
    if context:
        sections.append(f"### Input:\n{context}")
    sections.append(f"### Response:\n{example['output']}")
    return {"text": "\n\n".join(sections)}
dataset = dataset.map(format_example)
# 6. Training
# SFTConfig carries both the generic training arguments and the SFT-specific
# data options; SFTTrainer only receives the model, config, data and tokenizer.
sft_config = SFTConfig(
    output_dir="./qlora-output",
    # Column produced by format_example. Set on the config: passing it to
    # SFTTrainer directly is deprecated and rejected by current TRL releases.
    dataset_text_field="text",
    max_seq_length=512,  # NOTE: newer TRL releases name this `max_length`
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size of 16 per device
    num_train_epochs=1,
    learning_rate=2e-4,
    bf16=True,  # matches bnb_4bit_compute_dtype=torch.bfloat16
    logging_steps=10,
    save_steps=100,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_8bit",  # Paged optimizer for memory efficiency
)
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
# 7. Save adapter
model.save_pretrained("./qlora-adapter")
tokenizer.save_pretrained("./qlora-adapter")

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
model_name = "meta-llama/Llama-3.1-8B"
# Load quantized base model
# Use the same quantization settings as the training run so the adapter is
# applied to the same dequantized base weights it was fitted against.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,  # matches the training config
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base_model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load adapter
model = PeftModel.from_pretrained(base_model, "./qlora-adapter")
model.eval()
# Generate
inputs = tokenizer("### Instruction:\nExplain quantum computing.\n\n### Response:\n", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
# Load the base model in bf16 (on CPU to avoid OOM). No quantization config is
# passed here, so the adapter is merged into regular 16-bit weights.
base_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B",
torch_dtype=torch.bfloat16,
device_map="cpu",
)
# Load adapter
model = PeftModel.from_pretrained(base_model, "./qlora-adapter")
# Merge and unload
merged_model = model.merge_and_unload()
# Save merged model
merged_model.save_pretrained("./merged-model")

# Check CUDA version
nvcc --version
python -c "import torch; print(torch.version.cuda)"
# bitsandbytes requires CUDA 11.7+
# If version mismatch, reinstall:
pip uninstall bitsandbytes
pip install bitsandbytes --upgrade

# Find CUDA installation
find /usr -name "libcudart*" 2>/dev/null
# Set environment variable
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Or for conda:
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH

# Check if model is using expected dtype
# Collect the dtype of every trainable parameter rather than stopping at the
# first one, so a stray mismatched adapter weight is actually visible.
trainable_dtypes = {
    param.dtype for _, param in model.named_parameters() if param.requires_grad
}
print(f"Trainable parameter dtypes: {trainable_dtypes}")  # expect a single dtype
# Ensure bf16 is used in training args if BitsAndBytesConfig uses bf16
# Mismatch causes constant dtype conversions

# 1. Enable gradient checkpointing
model.gradient_checkpointing_enable()
# 2. Reduce batch size, increase accumulation
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
# 3. Use paged optimizer
optim = "paged_adamw_8bit"
# 4. Reduce sequence length
max_seq_length = 256
# 5. Target fewer modules
target_modules = ["q_proj", "v_proj"] # Minimal set

# Ensure prepare_model_for_kbit_training is called
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model) # Don't skip this!
# Enable input gradients if needed
model.enable_input_require_grads()

Key items: `prepare_model_for_kbit_training`, `bnb_4bit_compute_dtype=torch.bfloat16`, `bf16=True`, `optim="paged_adamw_8bit"`, `"paged_adamw_32bit"`, `nvidia-smi`, `torch.cuda.memory_summary()`, `load_in_8bit=True`