Add DDPG and fix Gym wrapper (#1207)

* Fix Gym wrapper
  - It was returning things in the wrong order
  - Gym now differentiates between terminated and truncated
* Add DDPG
* Apply fixes
* Remove Result annotations
* Also remove Vec annotation
* rustfmt
* Various small improvements (avoid cloning, mutability, get clippy to pass, ...)

---------

Co-authored-by: Travis Hammond <travis.hammond@alexanderthamm.com>
Co-authored-by: Laurent <laurent.mazare@gmail.com>
2025-06-16 18:48:51 +00:00 · 2023-10-28 20:53:34 +02:00
parent 012ae0090e
commit 498c50348c
3 changed files with 549 additions and 25 deletions
--- a/candle-examples/examples/reinforcement-learning/main.rs
+++ b/candle-examples/examples/reinforcement-learning/main.rs
@@ -9,14 +9,34 @@ extern crate accelerate_src;
 mod gym_env;
 mod vec_gym_env;

-use candle::Result;
+mod ddpg;
+
+use candle::{Device, Result, Tensor};
 use clap::Parser;
 use rand::Rng;

+// Discount factor: how strongly the next state's Q-value influences the current state's Q-value.
+const GAMMA: f64 = 0.99;
+// The weight for updating the target networks.
+const TAU: f64 = 0.005;
+// The capacity of the replay buffer used for sampling training data.
+const REPLAY_BUFFER_CAPACITY: usize = 100_000;
+// The training batch size for each training iteration.
+const TRAINING_BATCH_SIZE: usize = 100;
 // The total number of episodes.
 const MAX_EPISODES: usize = 100;
 // The maximum length of an episode.
 const EPISODE_LENGTH: usize = 200;
+// The number of training iterations run after each episode finishes.
+const TRAINING_ITERATIONS: usize = 200;
+
+// Ornstein-Uhlenbeck process parameters, passed to `ddpg::OuNoise::new`.
+const MU: f64 = 0.0;
+const THETA: f64 = 0.15;
+const SIGMA: f64 = 0.1;
+
+const ACTOR_LEARNING_RATE: f64 = 1e-4;
+const CRITIC_LEARNING_RATE: f64 = 1e-3;

 #[derive(Parser, Debug, Clone)]
 #[command(author, version, about, long_about = None)]
@@ -48,28 +68,77 @@ fn main() -> Result<()> {
    println!("action space: {}", env.action_space());
    println!("observation space: {:?}", env.observation_space());

-    let _num_obs = env.observation_space().iter().product::<usize>();
-    let _num_actions = env.action_space();
+    let size_state = env.observation_space().iter().product::<usize>();
+    let size_action = env.action_space();
+
+    let mut agent = ddpg::DDPG::new(
+        &Device::Cpu,
+        size_state,
+        size_action,
+        true,
+        ACTOR_LEARNING_RATE,
+        CRITIC_LEARNING_RATE,
+        GAMMA,
+        TAU,
+        REPLAY_BUFFER_CAPACITY,
+        ddpg::OuNoise::new(MU, THETA, SIGMA, size_action)?,
+    )?;

    let mut rng = rand::thread_rng();

    for episode in 0..MAX_EPISODES {
-        let mut obs = env.reset(episode as u64)?;
+        // Alternative: `let mut state = env.reset(episode as u64)?;` passes the episode index instead of a random value.
+        let mut state = env.reset(rng.gen::<u64>())?;

        let mut total_reward = 0.0;
        for _ in 0..EPISODE_LENGTH {
-            let actions = rng.gen_range(-2.0..2.0);
+            let mut action = 2.0 * agent.actions(&state)?;
+            action = action.clamp(-2.0, 2.0);

-            let step = env.step(vec![actions])?;
+            let step = env.step(vec![action])?;
            total_reward += step.reward;

-            if step.is_done {
+            agent.remember(
+                &state,
+                &Tensor::new(vec![action], &Device::Cpu)?,
+                &Tensor::new(vec![step.reward as f32], &Device::Cpu)?,
+                &step.state,
+                step.terminated,
+                step.truncated,
+            );
+
+            if step.terminated || step.truncated {
                break;
            }
-            obs = step.obs;
+            state = step.state;
        }

        println!("episode {episode} with total reward of {total_reward}");
+
+        for _ in 0..TRAINING_ITERATIONS {
+            agent.train(TRAINING_BATCH_SIZE)?;
+        }
+    }
+
+    println!("Testing...");
+    agent.train = false;
+    for episode in 0..10 {
+        // Alternative: `let mut state = env.reset(episode as u64)?;` passes the episode index instead of a random value.
+        let mut state = env.reset(rng.gen::<u64>())?;
+        let mut total_reward = 0.0;
+        for _ in 0..EPISODE_LENGTH {
+            let mut action = 2.0 * agent.actions(&state)?;
+            action = action.clamp(-2.0, 2.0);
+
+            let step = env.step(vec![action])?;
+            total_reward += step.reward;
+
+            if step.terminated || step.truncated {
+                break;
+            }
+            state = step.state;
+        }
+        println!("episode {episode} with total reward of {total_reward}");
    }
    Ok(())
 }