Loading...
Loading...
Train and fine-tune transformer language models using TRL (Transformers Reinforcement Learning). Supports SFT, DPO, GRPO, KTO, RLOO and Reward Model training via CLI commands.
npx skill4agent add huggingface/skills trl-training

trl sft \
--model_name_or_path Qwen/Qwen2-0.5B \
--dataset_name trl-lib/Capybara \
--learning_rate 2.0e-5 \
--num_train_epochs 1 \
--packing \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--eos_token '<|im_end|>' \
--eval_strategy steps \
--eval_steps 100 \
--output_dir Qwen2-0.5B-SFT \
--push_to_hub

trl sft \
--model_name_or_path Qwen/Qwen2-0.5B \
--dataset_name trl-lib/Capybara \
--learning_rate 2.0e-4 \
--num_train_epochs 1 \
--packing \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--eos_token '<|im_end|>' \
--eval_strategy steps \
--eval_steps 100 \
--use_peft \
--lora_r 32 \
--lora_alpha 16 \
--output_dir Qwen2-0.5B-SFT \
--push_to_hub

trl dpo \
--dataset_name trl-lib/ultrafeedback_binarized \
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
--learning_rate 5.0e-7 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--max_steps 1000 \
--gradient_accumulation_steps 8 \
--eval_strategy steps \
--eval_steps 50 \
--output_dir Qwen2-0.5B-DPO \
--no_remove_unused_columns

trl dpo \
--dataset_name trl-lib/ultrafeedback_binarized \
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
--learning_rate 5.0e-6 \
--num_train_epochs 1 \
--per_device_train_batch_size 2 \
--max_steps 1000 \
--gradient_accumulation_steps 8 \
--eval_strategy steps \
--eval_steps 50 \
--output_dir Qwen2-0.5B-DPO \
--no_remove_unused_columns \
--use_peft \
--lora_r 32 \
--lora_alpha 16

trl grpo \
--model_name_or_path Qwen/Qwen2.5-0.5B \
--dataset_name trl-lib/gsm8k \
--reward_funcs accuracy_reward \
--output_dir Qwen2-0.5B-GRPO \
--push_to_hub

trl rloo \
--model_name_or_path Qwen/Qwen2.5-0.5B \
--dataset_name trl-lib/tldr \
--reward_model_name_or_path sentiment-analysis:nlptown/bert-base-multilingual-uncased-sentiment \
--output_dir Qwen2-0.5B-RLOO \
--push_to_hub

trl reward \
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
--dataset_name trl-lib/ultrafeedback_binarized \
--output_dir Qwen2-0.5B-Reward \
--per_device_train_batch_size 8 \
--num_train_epochs 1 \
--learning_rate 1.0e-5 \
--eval_strategy steps \
--eval_steps 50 \
--max_length 2048

trl reward \
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
--dataset_name trl-lib/ultrafeedback_binarized \
--output_dir Qwen2-0.5B-Reward-LoRA \
--per_device_train_batch_size 8 \
--num_train_epochs 1 \
--learning_rate 1.0e-4 \
--eval_strategy steps \
--eval_steps 50 \
--max_length 2048 \
--use_peft \
--lora_task_type SEQ_CLS \
--lora_r 32 \
--lora_alpha 16

model_name_or_path: Qwen/Qwen2.5-0.5B
dataset_name: trl-lib/Capybara
learning_rate: 2.0e-5
num_train_epochs: 1
per_device_train_batch_size: 8
gradient_accumulation_steps: 2
output_dir: ./sft_output
use_peft: true
lora_r: 16
lora_alpha: 16
report_to: trackio

trl sft --config sft_config.yaml

trl sft --config sft_config.yaml --learning_rate 1.0e-5

trl sft \
--config sft_config.yaml \
--num_processes 4

single_gpu, multi_gpu, fsdp1, fsdp2, zero1, zero2, zero3

trl sft \
--config sft_config.yaml \
--accelerate_config zero2

# Generate custom config
accelerate config
# Use custom config
trl sft --config sft_config.yaml --config_file ~/.cache/huggingface/accelerate/default_config.yaml

trl sft --config sft_config.yaml --accelerate_config fsdp2

trl sft --config sft_config.yaml --accelerate_config zero3

--per_device_train_batch_size
--gradient_accumulation_steps
--use_peft
--gradient_checkpointing

--dataset_config
from datasets import load_dataset; ds = load_dataset(name)

hf auth login

--packing
--per_device_train_batch_size
--tf32
--bf16
--num_processes

--temperature
--top_p
--use_peft

--report_to trackio
--report_to wandb
--report_to tensorboard
--output_dir