# stable-baselines3
Use this skill for reinforcement learning tasks: training RL agents (PPO, SAC, DQN, TD3, DDPG, A2C, etc.), creating custom Gymnasium environments, implementing callbacks for monitoring and control, using vectorized environments for parallel training, and integrating with deep RL workflows. Use it whenever a user requests RL algorithm implementation, agent training, environment design, or RL experimentation.
Install the skill:

```bash
npx skill4agent add ovachiever/droid-tings stable-baselines3
```

## Quick Start

```python
import gymnasium as gym
from stable_baselines3 import PPO
# Create environment
env = gym.make("CartPole-v1")
# Initialize agent
model = PPO("MlpPolicy", env, verbose=1)
# Train the agent
model.learn(total_timesteps=10000)
# Save the model
model.save("ppo_cartpole")
# Load the model (without prior instantiation)
model = PPO.load("ppo_cartpole", env=env)
```

Increase `total_timesteps` for harder tasks. `model.load()` restores a saved agent without needing to re-instantiate it first. See `references/algorithms.md` for guidance on choosing an algorithm and `scripts/train_rl_agent.py` for a complete training script. A minimal inference loop is sketched below.
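To sanity-check a trained agent, step it through the environment manually. A minimal sketch (the loop itself is illustrative, not taken from the bundled scripts):

```python
import gymnasium as gym
from stable_baselines3 import PPO

model = PPO.load("ppo_cartpole")
env = gym.make("CartPole-v1")

obs, info = env.reset()
for _ in range(1000):
    # deterministic=True disables exploration noise at inference time
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.close()
```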
## Custom Environments

Custom environments must subclass `gymnasium.Env` and implement `__init__()`, `reset(seed, options)`, `step(action)`, `render()`, and `close()`. Image observations should be `np.uint8` in `[0, 255]` (pass `normalize_images=False` if they are already normalized). `Discrete` and `MultiDiscrete` spaces with `start != 0` are not supported. Validate the environment before training:

```python
from stable_baselines3.common.env_checker import check_env

check_env(env, warn=True)
```

Start from `scripts/custom_env_template.py` and see `references/custom_environments.md` for details; a minimal sketch follows.
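A minimal environment sketch, assuming a toy one-dimensional grid task (the task itself is illustrative and not part of the bundled template):

```python
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3.common.env_checker import check_env

class GoLeftEnv(gym.Env):
    """Toy grid: the agent starts at the right edge and must reach cell 0."""

    def __init__(self, grid_size=10):
        super().__init__()
        self.grid_size = grid_size
        self.agent_pos = grid_size - 1
        self.action_space = spaces.Discrete(2)  # 0 = left, 1 = right
        self.observation_space = spaces.Box(
            low=0, high=grid_size, shape=(1,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.agent_pos = self.grid_size - 1
        return np.array([self.agent_pos], dtype=np.float32), {}

    def step(self, action):
        self.agent_pos += -1 if action == 0 else 1
        self.agent_pos = int(np.clip(self.agent_pos, 0, self.grid_size - 1))
        terminated = self.agent_pos == 0
        reward = 1.0 if terminated else 0.0
        truncated = False  # no time limit in this toy task
        obs = np.array([self.agent_pos], dtype=np.float32)
        return obs, reward, terminated, truncated, {}

check_env(GoLeftEnv(), warn=True)
```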
## Vectorized Environments

```python
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

# Create 4 parallel environments
env = make_vec_env("CartPole-v1", n_envs=4, vec_env_cls=SubprocVecEnv)
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=25000)
```

For off-policy algorithms, set `gradient_steps=-1` to perform as many gradient steps as environment transitions collected. Note that the VecEnv API differs from Gymnasium: `reset()` returns only the observations (reset infos live in `vec_env.reset_infos`), `step()` returns a 4-tuple `(obs, rewards, dones, infos)`, episodes auto-reset, and the final observation of a finished episode is stored in `infos[env_idx]["terminal_observation"]`. See `references/vectorized_envs.md`; a short sketch of this API follows.
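A sketch of stepping a VecEnv by hand to illustrate the 4-tuple API (the random-action loop is illustrative):

```python
import numpy as np
from stable_baselines3.common.env_util import make_vec_env

vec_env = make_vec_env("CartPole-v1", n_envs=4)

obs = vec_env.reset()       # observations only, shape (4, obs_dim)
print(vec_env.reset_infos)  # reset infos are stored on the VecEnv itself

for _ in range(100):
    actions = np.array([vec_env.action_space.sample() for _ in range(4)])
    obs, rewards, dones, infos = vec_env.step(actions)  # 4-tuple, not 5
    for env_idx, done in enumerate(dones):
        if done:
            # The env has already auto-reset; the episode's final
            # observation is preserved in the info dict.
            terminal_obs = infos[env_idx]["terminal_observation"]
vec_env.close()
```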
## Callbacks

```python
from stable_baselines3.common.callbacks import BaseCallback

class CustomCallback(BaseCallback):
    def _on_training_start(self):
        # Called before the first rollout
        pass

    def _on_step(self):
        # Called after each environment step
        # Return False to stop training
        return True

    def _on_rollout_end(self):
        # Called at the end of each rollout
        pass
```

Inside a callback you can access `self.model` (the agent being trained), `self.num_timesteps` (environment steps taken so far), and `self.training_env` (the training VecEnv). The sketch below combines a custom callback with the built-in evaluation and checkpoint callbacks used in the snippet that follows it.
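A sketch constructing the callbacks referenced below; the `ProgressCallback`, paths, and frequencies are illustrative:

```python
import gymnasium as gym
from stable_baselines3.common.callbacks import (
    BaseCallback,
    CheckpointCallback,
    EvalCallback,
)

class ProgressCallback(BaseCallback):
    def _on_step(self):
        if self.num_timesteps % 1000 == 0:
            print(f"{self.num_timesteps} steps across "
                  f"{self.training_env.num_envs} env(s)")
        return True

custom_callback = ProgressCallback()

# Periodically evaluate on a separate env and keep the best model
eval_callback = EvalCallback(
    gym.make("CartPole-v1"),
    best_model_save_path="./logs/best_model/",
    eval_freq=5000,
    deterministic=True,
)

# Save a model checkpoint every 10,000 steps
checkpoint_callback = CheckpointCallback(
    save_freq=10_000,
    save_path="./logs/checkpoints/",
    name_prefix="rl_model",
)
```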
Chain multiple callbacks with `CallbackList`:

```python
from stable_baselines3.common.callbacks import CallbackList

callback = CallbackList([eval_callback, checkpoint_callback, custom_callback])
model.learn(total_timesteps=10000, callback=callback)
```

See `references/callbacks.md` for the full callback reference.

## Saving and Loading

```python
from stable_baselines3.common.vec_env import VecNormalize

# Save model
model.save("model_name")
# Save normalization statistics (if using VecNormalize)
vec_env.save("vec_normalize.pkl")
# Load model
model = PPO.load("model_name", env=env)
# Load normalization statistics
vec_env = VecNormalize.load("vec_normalize.pkl", vec_env)
```
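For context, a sketch of how `VecNormalize` is typically set up before training (the environment and hyperparameters are illustrative):

```python
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

vec_env = make_vec_env("Pendulum-v1", n_envs=4)
# Normalize observations and rewards with running statistics
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=10_000)

# At evaluation time, freeze the statistics and report raw rewards
vec_env.training = False
vec_env.norm_reward = False
```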
Model parameters can also be read and written directly:

```python
# Get parameters
params = model.get_parameters()
# Set parameters
model.set_parameters(params)
# Access PyTorch state dict
state_dict = model.policy.state_dict()
```

## Evaluation

```python
from stable_baselines3.common.evaluation import evaluate_policy
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    deterministic=True,
)
```

Record evaluation videos with `VecVideoRecorder` (the wrapped VecEnv must support `rgb_array` rendering):

```python
from stable_baselines3.common.vec_env import VecVideoRecorder
# Wrap environment with video recorder
env = VecVideoRecorder(
    env,
    "videos/",
    record_video_trigger=lambda x: x % 2000 == 0,
    video_length=200
)
```

See `scripts/evaluate_agent.py` for a complete evaluation script.

## Learning Rate Schedules

Pass a callable instead of a constant to anneal a hyperparameter over training:

```python
def linear_schedule(initial_value):
    def func(progress_remaining):
        # progress_remaining goes from 1 to 0
        return progress_remaining * initial_value
    return func

model = PPO("MlpPolicy", env, learning_rate=linear_schedule(0.001))
```

## Multi-Input Policies

For `Dict` observation spaces (e.g., an image plus a state vector), use `MultiInputPolicy`:

```python
model = PPO("MultiInputPolicy", env, verbose=1)
```

A sketch of such a space follows.
## Hindsight Experience Replay (HER)

```python
from stable_baselines3 import SAC, HerReplayBuffer

model = SAC(
"MultiInputPolicy",
env,
replay_buffer_class=HerReplayBuffer,
replay_buffer_kwargs=dict(
n_sampled_goal=4,
goal_selection_strategy="future",
),
)model = PPO("MlpPolicy", env, tensorboard_log="./tensorboard/")
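A sketch of the expected observation layout (shapes are illustrative):

```python
import numpy as np
from gymnasium import spaces

observation_space = spaces.Dict(
    {
        "observation": spaces.Box(-np.inf, np.inf, shape=(10,), dtype=np.float32),
        "achieved_goal": spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float32),
        "desired_goal": spaces.Box(-np.inf, np.inf, shape=(3,), dtype=np.float32),
    }
)

# The environment must also implement, vectorized over batches of goals:
# def compute_reward(self, achieved_goal, desired_goal, info) -> np.ndarray: ...
```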
## TensorBoard Logging

```python
model = PPO("MlpPolicy", env, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=10000)
```

View the logs with `tensorboard --logdir ./tensorboard/`.

## Tips

- Consult `references/algorithms.md` when choosing an algorithm.
- Validate custom environments with `check_env()`; start from `scripts/custom_env_template.py`.
- Use `scripts/train_rl_agent.py` and `scripts/evaluate_agent.py` as templates for training and evaluation runs.
- Off-policy replay buffers can use a lot of memory; lower `buffer_size` if needed.
- TensorBoard support requires installing `stable_baselines3` with the `[extra]` option: `uv pip install stable-baselines3[extra]`.

## Bundled Resources

- Scripts: `train_rl_agent.py`, `evaluate_agent.py`, `custom_env_template.py`
- References: `algorithms.md`, `custom_environments.md`, `callbacks.md`, `vectorized_envs.md`

## Installation

```bash
# Basic installation
uv pip install stable-baselines3
# With extra dependencies (Tensorboard, etc.)
uv pip install "stable-baselines3[extra]"
```