Commit 5131518 (parent: 104703c)

Reinforcement Learning: Greedy Policy & Epsilon Greedy Policy
File tree: 6 files changed (+215, -2 lines)
Includes two binary image files (53.2 KB and 50.1 KB).
New file: epsilon-greedy driver script (28 additions & 0 deletions)
# Learner: Nguyen Truong Thinh
# Contact me: [email protected] || +84393280504
#
# Topic: Reinforcement Learning (RL): Multi-Armed Bandits RL problem

import numpy as np
import random as rad

from fundamentals.custom_functions import make_the_graph
from multiarmed_bandit_rl_problem import MultiArmedBanditEnv
from ml.rl_in_robotics.utility import run_epsilon_greedy_policy

# Hyperparameters we can adjust
BANDITS = [.45, .45, .4, .6, .4]  # winning probability of each bandit
SEED = 1  # use 0 when running the greedy policy
POLICY = 2  # 1 = greedy policy, 2 = epsilon-greedy policy
BALANCES = 1000  # number of episodes (arm pulls)
EXPLORATION = 10  # forced exploratory pulls per bandit
EPSILON = .1  # probability of a random pull after exploration

MODE = "ascii"  # or "human"

rad.seed(SEED)
env, rewards = run_epsilon_greedy_policy(MultiArmedBanditEnv(BANDITS), MODE, POLICY, BALANCES, EXPLORATION, EPSILON)
cum_rewards = np.cumsum(rewards)

make_the_graph(cum_rewards, "Casino RL Problem with Epsilon Greedy Policy", "Trials", "Reward")
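For intuition about these hyperparameters: each pull pays +$1 with probability p and -$1 otherwise, so a bandit's expected reward per pull is 2p - 1. A quick sketch using the BANDITS list above makes the numbers explicit:

# Expected reward per pull: (+1) * p + (-1) * (1 - p) = 2p - 1
BANDITS = [.45, .45, .4, .6, .4]
print([round(2 * p - 1, 2) for p in BANDITS])  # [-0.1, -0.1, -0.2, 0.2, -0.2]
# Only bandit 3 (p = .6) is profitable in expectation, so a good policy
# should end up pulling it almost exclusively.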
New file: multiarmed_bandit_rl_problem.py (87 additions & 0 deletions)
# Learner: Nguyen Truong Thinh
# Contact me: [email protected] || +84393280504
#
# Topic: Reinforcement Learning (RL): Multi-Armed Bandits RL problem

import random
import gym


class MultiArmedBanditEnv(gym.Env):
    """
    A custom Multi-Armed Bandit gym environment that simulates our casino problem.
    """
    metadata = {"render.modes": ["human", "ascii"]}

    def __init__(self, bandits):
        """
        Environment initialization.
        :param bandits: the winning probability of each bandit
        """
        self.bandits = bandits
        self.state = {}
        self.reset()

    def step(self, action):
        """
        Takes the agent's action and calculates the reward:
        on each step the environment returns +$1 (win) or -$1 (loss).
        :param action: the index of the bandit the agent pulls
        :return: (state, reward, done, debug) following the classic gym step API
        """
        p = self.bandits[action]
        r = random.random()
        reward = 1 if r <= p else -1
        self.state[action].append(reward)
        done = False
        debug = None

        return self.state, reward, done, debug

    def _render_human(self):
        """
        Graphical environment rendering (not implemented yet).
        """

    def _render_ascii(self):
        """
        ASCII environment rendering: per-bandit returns and trial counts.
        """
        returns = {}
        trials = {}

        for e in range(len(self.bandits)):
            returns[e] = sum(self.state[e])
            trials[e] = len(self.state[e])

        print(f'***** Total Trials: {sum(trials.values())} *****')

        for b, r in returns.items():
            t = trials[b]
            print(f'"Bandit {b}"| returns: {r}, trials: {t}')

        print(f'***** Total Returns: {sum(returns.values())} *****')

    def render(self, mode="human"):
        """
        Render the current state of the environment:
        shows the overall statistics of all rounds.
        """
        if mode == "human":
            self._render_human()
        elif mode == "ascii":
            self._render_ascii()
        else:
            raise NotImplementedError(f"Render mode '{mode}' is not supported.")

    def reset(self):
        """
        Reset the environment to its original state.
        :return: the empty per-bandit reward history
        """
        self.state = {}

        for e in range(len(self.bandits)):
            self.state[e] = []

        return self.state
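A minimal usage sketch for the class above, showing the reset/step/render cycle; the arm choice and trial count here are purely illustrative:

import random

from multiarmed_bandit_rl_problem import MultiArmedBanditEnv

env = MultiArmedBanditEnv([.45, .45, .4, .6, .4])
state = env.reset()

for _ in range(100):
    arm = random.randint(0, 4)
    # state maps each bandit index to its list of +1/-1 rewards so far
    state, reward, done, debug = env.step(arm)

env.render("ascii")  # prints per-bandit returns and trial counts
env.close()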
New file: random-action baseline script (26 additions & 0 deletions)
# Learner: Nguyen Truong Thinh
# Contact me: [email protected] || +84393280504
#
# Topic: Reinforcement Learning (RL): Multi-Armed Bandits RL problem

import numpy as np
import random as rad

from multiarmed_bandit_rl_problem import MultiArmedBanditEnv
from ml.rl_in_robotics.utility import gym_rl_custom_tasks

# Hyperparameters we can adjust
BANDITS = [.45, .45, .4, .6, .4]
SEED = 1

BALANCES = 1000  # number of episodes (arm pulls)
MODE = "ascii"  # or "human"
SLEEP = .0  # pause (seconds) between renders

rad.seed(SEED)
np.random.seed(SEED)
env = MultiArmedBanditEnv(BANDITS)

# 5 one-armed bandits (indices 0 -> 4)
action = rad.randint(0, 4)

gym_rl_custom_tasks(env, BALANCES, action, MODE, SLEEP)
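The helpers init_reset_environment and gym_customize_tasks are not part of this diff, so their exact behavior is unknown. Assuming gym_rl_custom_tasks simply repeats the one fixed action for every episode, an equivalent standalone loop would look roughly like this:

import random
from time import sleep

from multiarmed_bandit_rl_problem import MultiArmedBanditEnv

SLEEP = .0  # matches the script's SLEEP hyperparameter

env = MultiArmedBanditEnv([.45, .45, .4, .6, .4])
env.reset()
fixed_action = random.randint(0, 4)  # one arm chosen once, pulled every episode

for _ in range(1000):
    env.step(fixed_action)
    if SLEEP > 0:
        sleep(SLEEP)  # optional pause between steps

env.render("ascii")  # baseline statistics to compare against the learned policies
env.close()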

Changed file: ml/rl_in_robotics/utility.py (74 additions & 2 deletions)

@@ -8,14 +8,86 @@
import random
from time import sleep

import numpy as np
import gym
import pygame


def run_epsilon_greedy_policy(env, mode, policy, episodes=1000, exploration=10, epsilon=.1):
    """
    Run the selected bandit policy for a number of episodes.
    :param policy: 1 = greedy policy, 2 = epsilon-greedy policy
    """
    state = env.reset()
    rewards = []

    for e in range(episodes):
        match policy:
            case 1:
                action = greedy_policy(state, exploration)
            case 2:
                action = epsilon_greedy_policy(state, exploration, epsilon)
            case _:
                raise ValueError(f"Unknown policy: {policy}")

        state, reward, done, debug = env.step(action)
        rewards.append(reward)

    # Render the final statistics once; rendering inside the episode loop
    # would print the same summary over and over.
    env.render(mode)

    env.close()

    return env, rewards


def epsilon_greedy_policy(state, explore=10, epsilon=.1):
    """
    Implementation of the epsilon-greedy policy.
    """
    machines = len(state)
    trials = sum(len(state[m]) for m in range(machines))
    total_explore_trials = machines * explore

    # Exploration: cycle through the machines until each has `explore` trials
    if trials <= total_explore_trials:
        return trials % machines
    # Random machine with probability epsilon
    if random.random() < epsilon:
        return random.randint(0, machines - 1)
    # Exploitation: the machine with the best average reward so far
    avg_rewards = [sum(state[m]) / len(state[m]) for m in range(machines)]

    best_machine = np.argmax(avg_rewards)
    return best_machine


def greedy_policy(state, explore=10):
    """
    Implementation of the greedy policy.
    """
    machines = len(state)
    trials = sum(len(state[m]) for m in range(machines))
    total_explore_trials = machines * explore

    # Exploration: cycle through the machines until each has `explore` trials
    if trials <= total_explore_trials:
        return trials % machines
    # Exploitation: always the machine with the best average reward so far
    avg_rewards = [sum(state[m]) / len(state[m]) for m in range(machines)]

    best_machine = np.argmax(avg_rewards)
    return best_machine


def gym_rl_custom_tasks(env, episodes, action, mode, duration):
    """
    Unifying all RL custom tasks by Gym toolkit
    """
    init_reset_environment(env)

    gym_customize_tasks(env, episodes, action, mode, duration)