⚡️ Makes training parallelized
This commit is contained in:
@@ -95,6 +95,8 @@ class QuadrupedEnv(gym.Env):
|
|||||||
self.max_steps = max_steps
|
self.max_steps = max_steps
|
||||||
self.current_step = 0
|
self.current_step = 0
|
||||||
|
|
||||||
|
self.prev_velocity = None
|
||||||
|
|
||||||
self._setup_world()
|
self._setup_world()
|
||||||
if render_mode == "human":
|
if render_mode == "human":
|
||||||
self.env_start_state = p.saveState()
|
self.env_start_state = p.saveState()
|
||||||
@@ -143,6 +145,7 @@ class QuadrupedEnv(gym.Env):
|
|||||||
p.resetSimulation()
|
p.resetSimulation()
|
||||||
self._setup_world()
|
self._setup_world()
|
||||||
self.current_step = 0
|
self.current_step = 0
|
||||||
|
self.prev_velocity = None
|
||||||
return self.robot.get_observation(), {}
|
return self.robot.get_observation(), {}
|
||||||
|
|
||||||
def step(self, action):
|
def step(self, action):
|
||||||
@@ -165,17 +168,42 @@ class QuadrupedEnv(gym.Env):
|
|||||||
|
|
||||||
def calculate_reward(self, obs):
|
def calculate_reward(self, obs):
|
||||||
position = obs[:3]
|
position = obs[:3]
|
||||||
|
orientation = obs[3:6]
|
||||||
velocity = obs[6:9]
|
velocity = obs[6:9]
|
||||||
angular_velocity = obs[9:12]
|
angular_velocity = obs[9:12]
|
||||||
|
|
||||||
forward_velocity = velocity[0]
|
forward_velocity = velocity[0]
|
||||||
velocity_reward = -abs(forward_velocity - self.target_velocity)
|
velocity_reward = -abs(forward_velocity - self.target_velocity)
|
||||||
|
|
||||||
height_penalty = -abs(position[2] - 0.3)
|
height_penalty = -abs(position[2] - 0.3) * 0.5
|
||||||
|
|
||||||
angular_penalty = -np.sum(np.square(angular_velocity))
|
roll, pitch, yaw = orientation
|
||||||
|
orientation_penalty = -(abs(roll) + abs(pitch)) * 1.0
|
||||||
|
|
||||||
total_reward = velocity_reward + 0.1 * height_penalty + 0.01 * angular_penalty
|
angular_penalty = -np.sum(np.square(angular_velocity)) * 0.05
|
||||||
|
|
||||||
|
sideways_velocity_penalty = -abs(velocity[1]) * 0.3
|
||||||
|
|
||||||
|
if self.prev_velocity is not None:
|
||||||
|
dt = 1.0 / 240.0
|
||||||
|
acceleration = (velocity - self.prev_velocity) / dt
|
||||||
|
lateral_acc_penalty = -abs(acceleration[1]) * 0.01
|
||||||
|
vertical_acc_penalty = -abs(acceleration[2]) * 0.01
|
||||||
|
else:
|
||||||
|
lateral_acc_penalty = 0
|
||||||
|
vertical_acc_penalty = 0
|
||||||
|
|
||||||
|
self.prev_velocity = velocity.copy()
|
||||||
|
|
||||||
|
total_reward = (
|
||||||
|
velocity_reward
|
||||||
|
+ height_penalty
|
||||||
|
+ orientation_penalty
|
||||||
|
+ angular_penalty
|
||||||
|
+ sideways_velocity_penalty
|
||||||
|
+ lateral_acc_penalty
|
||||||
|
+ vertical_acc_penalty
|
||||||
|
)
|
||||||
return total_reward
|
return total_reward
|
||||||
|
|
||||||
def is_done(self, obs):
|
def is_done(self, obs):
|
||||||
|
|||||||
+35
-8
@@ -3,7 +3,7 @@ import os
|
|||||||
import gymnasium as gym
|
import gymnasium as gym
|
||||||
from stable_baselines3 import PPO, SAC
|
from stable_baselines3 import PPO, SAC
|
||||||
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
|
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
|
||||||
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
|
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
|
||||||
from stable_baselines3.common.monitor import Monitor
|
from stable_baselines3.common.monitor import Monitor
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@@ -37,13 +37,15 @@ def train_ppo(
|
|||||||
eval_freq=10000,
|
eval_freq=10000,
|
||||||
save_freq=50000,
|
save_freq=50000,
|
||||||
terrain_type=TerrainType.FLAT,
|
terrain_type=TerrainType.FLAT,
|
||||||
|
n_envs=8,
|
||||||
|
use_gpu=True,
|
||||||
):
|
):
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
os.makedirs(save_dir, exist_ok=True)
|
||||||
os.makedirs(log_dir, exist_ok=True)
|
os.makedirs(log_dir, exist_ok=True)
|
||||||
os.makedirs(f"{log_dir}/eval", exist_ok=True)
|
os.makedirs(f"{log_dir}/eval", exist_ok=True)
|
||||||
|
|
||||||
print("Creating training environment...")
|
print(f"Creating {n_envs} parallel training environments...")
|
||||||
env = DummyVecEnv([make_env(terrain_type=terrain_type)])
|
env = SubprocVecEnv([make_env(terrain_type=terrain_type) for _ in range(n_envs)])
|
||||||
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
|
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
|
||||||
|
|
||||||
print("Creating evaluation environment...")
|
print("Creating evaluation environment...")
|
||||||
@@ -66,7 +68,8 @@ def train_ppo(
|
|||||||
render=False,
|
render=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Creating PPO model...")
|
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
|
||||||
|
print(f"Creating PPO model on device: {device}")
|
||||||
model = PPO(
|
model = PPO(
|
||||||
"MlpPolicy",
|
"MlpPolicy",
|
||||||
env,
|
env,
|
||||||
@@ -82,6 +85,7 @@ def train_ppo(
|
|||||||
max_grad_norm=max_grad_norm,
|
max_grad_norm=max_grad_norm,
|
||||||
verbose=1,
|
verbose=1,
|
||||||
tensorboard_log=log_dir,
|
tensorboard_log=log_dir,
|
||||||
|
device=device,
|
||||||
policy_kwargs=dict(
|
policy_kwargs=dict(
|
||||||
net_arch=[dict(pi=[256, 256], vf=[256, 256])],
|
net_arch=[dict(pi=[256, 256], vf=[256, 256])],
|
||||||
activation_fn=torch.nn.ReLU,
|
activation_fn=torch.nn.ReLU,
|
||||||
@@ -119,13 +123,15 @@ def train_sac(
|
|||||||
eval_freq=10000,
|
eval_freq=10000,
|
||||||
save_freq=50000,
|
save_freq=50000,
|
||||||
terrain_type=TerrainType.FLAT,
|
terrain_type=TerrainType.FLAT,
|
||||||
|
n_envs=8,
|
||||||
|
use_gpu=True,
|
||||||
):
|
):
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
os.makedirs(save_dir, exist_ok=True)
|
||||||
os.makedirs(log_dir, exist_ok=True)
|
os.makedirs(log_dir, exist_ok=True)
|
||||||
os.makedirs(f"{log_dir}/eval", exist_ok=True)
|
os.makedirs(f"{log_dir}/eval", exist_ok=True)
|
||||||
|
|
||||||
print("Creating training environment...")
|
print(f"Creating {n_envs} parallel training environments...")
|
||||||
env = DummyVecEnv([make_env(terrain_type=terrain_type)])
|
env = SubprocVecEnv([make_env(terrain_type=terrain_type) for _ in range(n_envs)])
|
||||||
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
|
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
|
||||||
|
|
||||||
print("Creating evaluation environment...")
|
print("Creating evaluation environment...")
|
||||||
@@ -148,7 +154,8 @@ def train_sac(
|
|||||||
render=False,
|
render=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Creating SAC model...")
|
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
|
||||||
|
print(f"Creating SAC model on device: {device}")
|
||||||
model = SAC(
|
model = SAC(
|
||||||
"MlpPolicy",
|
"MlpPolicy",
|
||||||
env,
|
env,
|
||||||
@@ -163,6 +170,7 @@ def train_sac(
|
|||||||
ent_coef=ent_coef,
|
ent_coef=ent_coef,
|
||||||
verbose=1,
|
verbose=1,
|
||||||
tensorboard_log=log_dir,
|
tensorboard_log=log_dir,
|
||||||
|
device=device,
|
||||||
policy_kwargs=dict(
|
policy_kwargs=dict(
|
||||||
net_arch=dict(pi=[256, 256], qf=[256, 256]),
|
net_arch=dict(pi=[256, 256], qf=[256, 256]),
|
||||||
activation_fn=torch.nn.ReLU,
|
activation_fn=torch.nn.ReLU,
|
||||||
@@ -224,6 +232,17 @@ def main():
|
|||||||
default="logs",
|
default="logs",
|
||||||
help="Directory to save logs",
|
help="Directory to save logs",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--n-envs",
|
||||||
|
type=int,
|
||||||
|
default=8,
|
||||||
|
help="Number of parallel environments (default: 8, max: 16)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--cpu-only",
|
||||||
|
action="store_true",
|
||||||
|
help="Force CPU training even if GPU is available",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -235,13 +254,17 @@ def main():
|
|||||||
}
|
}
|
||||||
terrain_type = terrain_map[args.terrain]
|
terrain_type = terrain_map[args.terrain]
|
||||||
|
|
||||||
|
use_gpu = not args.cpu_only and torch.cuda.is_available()
|
||||||
|
|
||||||
print(f"\n{'='*50}")
|
print(f"\n{'='*50}")
|
||||||
print(f"Training Configuration:")
|
print(f"Training Configuration:")
|
||||||
print(f" Algorithm: {args.algo}")
|
print(f" Algorithm: {args.algo}")
|
||||||
print(f" Total timesteps: {args.timesteps:,}")
|
print(f" Total timesteps: {args.timesteps:,}")
|
||||||
print(f" Learning rate: {args.learning_rate}")
|
print(f" Learning rate: {args.learning_rate}")
|
||||||
print(f" Terrain: {args.terrain}")
|
print(f" Terrain: {args.terrain}")
|
||||||
print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
|
print(f" Parallel environments: {args.n_envs}")
|
||||||
|
print(f" Device: {'CUDA (GPU)' if use_gpu else 'CPU'}")
|
||||||
|
print(f" CPU cores available: {os.cpu_count()}")
|
||||||
print(f"{'='*50}\n")
|
print(f"{'='*50}\n")
|
||||||
|
|
||||||
if args.algo == "ppo" or args.algo == "both":
|
if args.algo == "ppo" or args.algo == "both":
|
||||||
@@ -252,6 +275,8 @@ def main():
|
|||||||
save_dir=f"{args.save_dir}/ppo",
|
save_dir=f"{args.save_dir}/ppo",
|
||||||
log_dir=f"{args.log_dir}/ppo",
|
log_dir=f"{args.log_dir}/ppo",
|
||||||
terrain_type=terrain_type,
|
terrain_type=terrain_type,
|
||||||
|
n_envs=args.n_envs,
|
||||||
|
use_gpu=use_gpu,
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.algo == "sac" or args.algo == "both":
|
if args.algo == "sac" or args.algo == "both":
|
||||||
@@ -262,6 +287,8 @@ def main():
|
|||||||
save_dir=f"{args.save_dir}/sac",
|
save_dir=f"{args.save_dir}/sac",
|
||||||
log_dir=f"{args.log_dir}/sac",
|
log_dir=f"{args.log_dir}/sac",
|
||||||
terrain_type=terrain_type,
|
terrain_type=terrain_type,
|
||||||
|
n_envs=args.n_envs,
|
||||||
|
use_gpu=use_gpu,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user