Loading...
Loading...
Fine-tune LLMs with Unsloth using GRPO or SFT. Supports FP8, vision models, mobile deployment, Docker, packing, GGUF export. Use when: train with GRPO, fine-tune, reward functions, SFT training, FP8 training, vision fine-tuning, phone deployment, docker training, packing, export to GGUF.
npx skill4agent add scientiacapital/skills unsloth-training

unsloth/unsloth

import os
# Must be set before importing unsloth so vLLM standby (shared memory) takes effect.
os.environ['UNSLOTH_VLLM_STANDBY'] = "1" # Shared memory
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
# Load the base model; fast_inference=True enables vLLM-backed generation for GRPO rollouts.
# NOTE(review): load_in_fp8 presumably requires FP8-capable hardware — confirm in Unsloth docs.
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Qwen3-8B",
max_seq_length=2048, load_in_fp8=True, fast_inference=True,
)
# Attach LoRA adapters (rank 64) to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
model, r=64,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
use_gradient_checkpointing="unsloth",
)
def correctness_reward(completions, answer, **kwargs):
    """Binary reward: +2.0 when the extracted answer equals the reference, else 0.0."""
    scores = []
    for completion, truth in zip(completions, answer):
        scores.append(2.0 if extract_answer(completion) == truth else 0.0)
    return scores
# GRPO: generate num_generations completions per prompt and score them with reward_funcs.
# Note the small learning rate relative to typical SFT rates.
trainer = GRPOTrainer(
model=model,
args=GRPOConfig(num_generations=4, beta=0.04, learning_rate=5e-6),
train_dataset=dataset, reward_funcs=[correctness_reward],
)
trainer.train()

from trl import SFTTrainer, SFTConfig
# SFT baseline: packing=True concatenates short examples into full-length
# sequences for higher throughput (the "2-5x speedup" noted inline).
trainer = SFTTrainer(
model=model, train_dataset=dataset, processing_class=tokenizer,
args=SFTConfig(
per_device_train_batch_size=2, num_train_epochs=3,
learning_rate=2e-4, packing=True, # 2-5x speedup
),
)
trainer.train()

Commands: /unsloth grpo, /unsloth sft, /unsloth fp8, /unsloth vision, /unsloth mobile, /unsloth docker, /unsloth troubleshoot

References: reference/reward-design.md, reference/domain-examples.md, reference/hyperparameters.md, reference/troubleshooting.md, reference/fp8-training.md, reference/deployment.md, reference/export-formats.md, reference/advanced-training.md, reference/vision-training.md, reference/mobile-deployment.md, reference/grpo/, reference/sft/

| Method | Use When | Data Needed |
|---|---|---|
| GRPO | Improving reasoning quality | Prompts + verifiable answers |
| GRPO | Aligning behavior with preferences | Reward functions |
| GRPO | When you can verify correctness | Verifiable outputs |
| SFT | Teaching specific output format | Input/output pairs |
| SFT | Following new instructions | Conversation examples |
| SFT | Learning domain knowledge | Labeled examples |
| Size | VRAM | Use Case |
|---|---|---|
| 0.5B | 5GB | Mobile deployment (~200MB GGUF) |
| 1.5B | 5GB | Learning/prototyping |
| 3B | 8GB | Good balance (recommended start) |
| 7B | 16GB | Production quality |
| 14B | 20GB | Strong reasoning |
# GRPO hyperparameters (illustrative snippet).
GRPOConfig(
    num_generations=4,          # Completions per prompt (2-8)
    beta=0.04,                  # KL penalty (0.01-0.1)
    learning_rate=5e-6,         # 10x smaller than SFT!
    max_completion_length=512,
    max_steps=300,              # Minimum for results
)

# SFT hyperparameters (illustrative snippet).
TrainingArguments(
    learning_rate=2e-4,         # Standard SFT rate
    num_train_epochs=3,         # 2-4 typical
    per_device_train_batch_size=2,
)

def correctness_reward(completions, answer, **kwargs):
    """
    +2.0 for correct answer, 0.0 otherwise.
    This should be your highest-weighted reward.

    Numeric answers are compared with a 0.01 tolerance after stripping
    commas; non-numeric answers fall back to exact string comparison.
    """
    rewards = []
    for completion, true_answer in zip(completions, answer):
        extracted = extract_answer(completion)
        try:
            pred = float(extracted.replace(",", "").strip())
            true = float(true_answer.replace(",", "").strip())
            reward = 2.0 if abs(pred - true) < 0.01 else 0.0
        except ValueError:
            # Not parseable as a number: compare as trimmed strings.
            reward = 2.0 if extracted.strip() == str(true_answer).strip() else 0.0
        rewards.append(reward)
    return rewards

def format_reward(completions, **kwargs):
    """
    +0.5 for proper XML structure with reasoning and answer tags.
    +0.2 partial credit when only the answer tag is present.
    """
    rewards = []
    for completion in completions:
        has_reasoning = bool(re.search(r"<reasoning>.*?</reasoning>", completion, re.DOTALL))
        has_answer = bool(re.search(r"<answer>.*?</answer>", completion, re.DOTALL))
        if has_reasoning and has_answer:
            rewards.append(0.5)
        elif has_answer:
            rewards.append(0.2)
        else:
            rewards.append(0.0)
    return rewards

def reasoning_length_reward(completions, **kwargs):
    """
    +0.3 for substantive reasoning (30-200 words).
    +0.1 for short-but-present reasoning (15-29 words).
    """
    rewards = []
    for completion in completions:
        reasoning = extract_reasoning(completion)
        word_count = len(reasoning.split()) if reasoning else 0
        if 30 <= word_count <= 200:
            rewards.append(0.3)
        elif 15 <= word_count < 30:
            rewards.append(0.1)
        else:
            rewards.append(0.0)
    return rewards

def no_hedging_reward(completions, **kwargs):
    """
    -0.3 penalty for uncertainty language (case-insensitive substring match).
    """
    hedging = ["i think", "maybe", "perhaps", "possibly", "i'm not sure"]
    rewards = []
    for completion in completions:
        has_hedging = any(phrase in completion.lower() for phrase in hedging)
        rewards.append(-0.3 if has_hedging else 0.0)
    return rewards

reward_funcs = [
    correctness_reward,       # +2.0 max (primary signal)
    format_reward,            # +0.5 max (structure)
    reasoning_length_reward,  # +0.3 max (quality)
    no_hedging_reward,        # -0.3 max (constraint)
]
# Total range: -0.3 to +2.8

For domain-specific rewards: See reference/domain-examples.md for Voice AI, Sales Agent, and Support patterns.
SYSTEM_PROMPT = """You are a helpful assistant that thinks step-by-step.
Always respond in this exact format:
<reasoning>
[Your step-by-step thinking process]
</reasoning>
<answer>
[Your final answer - just the number or short response]
</answer>
"""

import re
def extract_answer(text: str) -> str:
    """Return the text inside the first <answer>…</answer> pair ("" if absent)."""
    found = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if found is None:
        return ""
    return found.group(1).strip()
def extract_reasoning(text: str) -> str:
    """Extract reasoning from XML tags (first <reasoning>…</reasoning> pair, "" if absent)."""
    match = re.search(r"<reasoning>(.*?)</reasoning>", text, re.DOTALL)
    return match.group(1).strip() if match else ""

# GRPO data format: chat-style prompt plus a verifiable ground-truth answer.
dataset = dataset.map(lambda ex: {
    "prompt": [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": ex["question"]}
    ],
    "answer": ex["answer"]  # Ground truth for verification
})

# SFT data format: full conversation including the assistant target turn.
dataset = dataset.map(lambda ex: {
    "conversations": [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": ex["input"]},
        {"role": "assistant", "content": ex["output"]}
    ]
})

# Export options: LoRA adapter only, merged 16-bit weights, or quantized GGUF.
model.save_lora("grpo_lora")

model.save_pretrained_merged(
    "grpo_merged", tokenizer,
    save_method="merged_16bit",
)

model.save_pretrained_gguf(
    "grpo_gguf", tokenizer,
    quantization_method="q4_k_m",  # Options: q4_k_m, q8_0, q5_k_m
)

# Create Modelfile
cat > Modelfile << EOF
FROM ./grpo_gguf/unsloth.Q4_K_M.gguf
TEMPLATE """{{ .System }}
User: {{ .Prompt }}
Assistant: """
PARAMETER temperature 0.7
EOF
ollama create my-model -f Modelfile
ollama run my-model "Solve: 15 + 27 = ?"

References: reference/grpo/basic_grpo.py, reference/sft/sales_extractor_training.py, reference/reward-design.md, reference/domain-examples.md, reference/fp8-training.md, reference/deployment.md, reference/vision-training.md, reference/mobile-deployment.md, reference/advanced-training.md, reference/export-formats.md, reference/troubleshooting.md

| Symptom | Fix |
|---|---|
| Reward not increasing | Wait 300+ steps, then increase learning_rate 2x |
| Reward spiky/unstable | Decrease learning_rate 0.5x, increase beta |
| Model outputs garbage | Increase beta 2-4x, check prompt format |
| Out of memory | Reduce max_completion_length, num_generations=2 |
| No reasoning appearing | Train 500+ steps, use model >= 1.5B |
For detailed troubleshooting: See reference/troubleshooting.md

Key settings: use_gradient_checkpointing="unsloth", prompt, answer, num_generations >= 2, beta, learning_rate

model, tokenizer = FastLanguageModel.from_pretrained(
model_name="Qwen/Qwen2.5-3B-Instruct",
max_seq_length=1024, load_in_4bit=True,
)

def brevity_reward(completions, **kwargs):
"""Voice responses under 50 words get +0.5"""
return [0.5 if len(c.split()) <= 50 else -0.3 for c in completions]
def speakable_reward(completions, **kwargs):
    """Penalize markdown that sounds bad spoken"""
    bad = ["**", "- ", "```", "http"]
    scores = []
    for text in completions:
        hits = 0
        for marker in bad:
            if marker in text:
                hits += 1
        scores.append(-0.2 * hits)
    return scores
reward_funcs = [correctness_reward, brevity_reward, speakable_reward]

reference/domain-examples.md