Makes training parallelized

This commit is contained in:
Rune Harlyk
2025-10-10 20:22:39 +02:00
committed by Rune Harlyk
parent 01c4a80c8f
commit d47ce02cc6
2 changed files with 66 additions and 11 deletions
+31 -3
View File
@@ -95,6 +95,8 @@ class QuadrupedEnv(gym.Env):
self.max_steps = max_steps self.max_steps = max_steps
self.current_step = 0 self.current_step = 0
self.prev_velocity = None
self._setup_world() self._setup_world()
if render_mode == "human": if render_mode == "human":
self.env_start_state = p.saveState() self.env_start_state = p.saveState()
@@ -143,6 +145,7 @@ class QuadrupedEnv(gym.Env):
p.resetSimulation() p.resetSimulation()
self._setup_world() self._setup_world()
self.current_step = 0 self.current_step = 0
self.prev_velocity = None
return self.robot.get_observation(), {} return self.robot.get_observation(), {}
def step(self, action): def step(self, action):
@@ -165,17 +168,42 @@ class QuadrupedEnv(gym.Env):
def calculate_reward(self, obs):
    """Compute the scalar locomotion reward for one observation.

    The observation is read as four consecutive 3-vectors:
    ``obs[0:3]`` position, ``obs[3:6]`` orientation (unpacked as roll,
    pitch, yaw), ``obs[6:9]`` linear velocity and ``obs[9:12]`` angular
    velocity.

    The reward is the sum of:

    * forward-velocity tracking error against ``self.target_velocity``
    * deviation of the body height from 0.3
    * absolute roll and pitch (yaw is not penalized)
    * squared angular velocity
    * absolute sideways velocity
    * absolute lateral and vertical acceleration, estimated by a finite
      difference against ``self.prev_velocity`` (zero on the first call
      after ``prev_velocity`` was reset to ``None``)

    Side effect: ``self.prev_velocity`` is overwritten with a copy of the
    current linear velocity.

    Args:
        obs: Sequence or array with at least 12 numeric entries.

    Returns:
        The total reward as a numeric scalar (every term is <= 0 except
        that a perfect state yields 0).
    """
    # Term weights, kept local so the block stays self-contained.
    height_weight = 0.5
    angular_weight = 0.05
    sideways_weight = 0.3
    acceleration_weight = 0.01
    target_height = 0.3
    # NOTE(review): assumes one reward evaluation per 1/240 s simulation
    # step -- confirm against the stepping code if substeps are used.
    dt = 1.0 / 240.0

    # Coerce once so list/tuple observations support the slicing,
    # subtraction and ``.copy()`` used below (plain lists would raise on
    # the second call when the velocities are subtracted).
    obs = np.asarray(obs)

    position = obs[:3]
    roll, pitch = obs[3], obs[4]
    velocity = obs[6:9]
    angular_velocity = obs[9:12]

    velocity_reward = -abs(velocity[0] - self.target_velocity)
    height_penalty = -abs(position[2] - target_height) * height_weight
    orientation_penalty = -(abs(roll) + abs(pitch))
    angular_penalty = -np.sum(np.square(angular_velocity)) * angular_weight
    sideways_velocity_penalty = -abs(velocity[1]) * sideways_weight

    if self.prev_velocity is not None:
        acceleration = (velocity - self.prev_velocity) / dt
        lateral_acc_penalty = -abs(acceleration[1]) * acceleration_weight
        vertical_acc_penalty = -abs(acceleration[2]) * acceleration_weight
    else:
        # No previous sample yet: nothing to differentiate against.
        lateral_acc_penalty = 0
        vertical_acc_penalty = 0

    # Copy so later mutation of the observation buffer cannot alias the
    # stored velocity.
    self.prev_velocity = velocity.copy()

    total_reward = (
        velocity_reward
        + height_penalty
        + orientation_penalty
        + angular_penalty
        + sideways_velocity_penalty
        + lateral_acc_penalty
        + vertical_acc_penalty
    )
    return total_reward
def is_done(self, obs): def is_done(self, obs):
+35 -8
View File
@@ -3,7 +3,7 @@ import os
import gymnasium as gym import gymnasium as gym
from stable_baselines3 import PPO, SAC from stable_baselines3 import PPO, SAC
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor from stable_baselines3.common.monitor import Monitor
import numpy as np import numpy as np
import torch import torch
@@ -37,13 +37,15 @@ def train_ppo(
eval_freq=10000, eval_freq=10000,
save_freq=50000, save_freq=50000,
terrain_type=TerrainType.FLAT, terrain_type=TerrainType.FLAT,
n_envs=8,
use_gpu=True,
): ):
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True)
os.makedirs(f"{log_dir}/eval", exist_ok=True) os.makedirs(f"{log_dir}/eval", exist_ok=True)
print("Creating training environment...") print(f"Creating {n_envs} parallel training environments...")
env = DummyVecEnv([make_env(terrain_type=terrain_type)]) env = SubprocVecEnv([make_env(terrain_type=terrain_type) for _ in range(n_envs)])
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0) env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
print("Creating evaluation environment...") print("Creating evaluation environment...")
@@ -66,7 +68,8 @@ def train_ppo(
render=False, render=False,
) )
print("Creating PPO model...") device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
print(f"Creating PPO model on device: {device}")
model = PPO( model = PPO(
"MlpPolicy", "MlpPolicy",
env, env,
@@ -82,6 +85,7 @@ def train_ppo(
max_grad_norm=max_grad_norm, max_grad_norm=max_grad_norm,
verbose=1, verbose=1,
tensorboard_log=log_dir, tensorboard_log=log_dir,
device=device,
policy_kwargs=dict( policy_kwargs=dict(
net_arch=[dict(pi=[256, 256], vf=[256, 256])], net_arch=[dict(pi=[256, 256], vf=[256, 256])],
activation_fn=torch.nn.ReLU, activation_fn=torch.nn.ReLU,
@@ -119,13 +123,15 @@ def train_sac(
eval_freq=10000, eval_freq=10000,
save_freq=50000, save_freq=50000,
terrain_type=TerrainType.FLAT, terrain_type=TerrainType.FLAT,
n_envs=8,
use_gpu=True,
): ):
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True)
os.makedirs(f"{log_dir}/eval", exist_ok=True) os.makedirs(f"{log_dir}/eval", exist_ok=True)
print("Creating training environment...") print(f"Creating {n_envs} parallel training environments...")
env = DummyVecEnv([make_env(terrain_type=terrain_type)]) env = SubprocVecEnv([make_env(terrain_type=terrain_type) for _ in range(n_envs)])
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0) env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)
print("Creating evaluation environment...") print("Creating evaluation environment...")
@@ -148,7 +154,8 @@ def train_sac(
render=False, render=False,
) )
print("Creating SAC model...") device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
print(f"Creating SAC model on device: {device}")
model = SAC( model = SAC(
"MlpPolicy", "MlpPolicy",
env, env,
@@ -163,6 +170,7 @@ def train_sac(
ent_coef=ent_coef, ent_coef=ent_coef,
verbose=1, verbose=1,
tensorboard_log=log_dir, tensorboard_log=log_dir,
device=device,
policy_kwargs=dict( policy_kwargs=dict(
net_arch=dict(pi=[256, 256], qf=[256, 256]), net_arch=dict(pi=[256, 256], qf=[256, 256]),
activation_fn=torch.nn.ReLU, activation_fn=torch.nn.ReLU,
@@ -224,6 +232,17 @@ def main():
default="logs", default="logs",
help="Directory to save logs", help="Directory to save logs",
) )
parser.add_argument(
"--n-envs",
type=int,
default=8,
help="Number of parallel environments (default: 8, max: 16)",
)
parser.add_argument(
"--cpu-only",
action="store_true",
help="Force CPU training even if GPU is available",
)
args = parser.parse_args() args = parser.parse_args()
@@ -235,13 +254,17 @@ def main():
} }
terrain_type = terrain_map[args.terrain] terrain_type = terrain_map[args.terrain]
use_gpu = not args.cpu_only and torch.cuda.is_available()
print(f"\n{'='*50}") print(f"\n{'='*50}")
print(f"Training Configuration:") print(f"Training Configuration:")
print(f" Algorithm: {args.algo}") print(f" Algorithm: {args.algo}")
print(f" Total timesteps: {args.timesteps:,}") print(f" Total timesteps: {args.timesteps:,}")
print(f" Learning rate: {args.learning_rate}") print(f" Learning rate: {args.learning_rate}")
print(f" Terrain: {args.terrain}") print(f" Terrain: {args.terrain}")
print(f" Device: {'cuda' if torch.cuda.is_available() else 'cpu'}") print(f" Parallel environments: {args.n_envs}")
print(f" Device: {'CUDA (GPU)' if use_gpu else 'CPU'}")
print(f" CPU cores available: {os.cpu_count()}")
print(f"{'='*50}\n") print(f"{'='*50}\n")
if args.algo == "ppo" or args.algo == "both": if args.algo == "ppo" or args.algo == "both":
@@ -252,6 +275,8 @@ def main():
save_dir=f"{args.save_dir}/ppo", save_dir=f"{args.save_dir}/ppo",
log_dir=f"{args.log_dir}/ppo", log_dir=f"{args.log_dir}/ppo",
terrain_type=terrain_type, terrain_type=terrain_type,
n_envs=args.n_envs,
use_gpu=use_gpu,
) )
if args.algo == "sac" or args.algo == "both": if args.algo == "sac" or args.algo == "both":
@@ -262,6 +287,8 @@ def main():
save_dir=f"{args.save_dir}/sac", save_dir=f"{args.save_dir}/sac",
log_dir=f"{args.log_dir}/sac", log_dir=f"{args.log_dir}/sac",
terrain_type=terrain_type, terrain_type=terrain_type,
n_envs=args.n_envs,
use_gpu=use_gpu,
) )