Commit 5131518 (parent: 104703c)

Reinforcement Learning: Greedy Policy & Epsilon Greedy Policy
File tree: 6 files changed (+215, -2 lines)
Includes two binary image files (53.2 KB and 50.1 KB).
New file: epsilon-greedy driver script (28 additions & 0 deletions)
# Learner: Nguyen Truong Thinh
# Contact me: [email protected] || +84393280504
#
# Topic: Reinforcement Learning (RL): Multi-Armed Bandits RL problem

import numpy as np
import random as rad

from fundamentals.custom_functions import make_the_graph
from multiarmed_bandit_rl_problem import MultiArmedBanditEnv
from ml.rl_in_robotics.utility import run_epsilon_greedy_policy

# Hyperparameters we can adjust
BANDITS = [.45, .45, .4, .6, .4]  # winning probability of each bandit
SEED = 1  # use 0 when running the greedy policy
POLICY = 2  # 1 = greedy policy, 2 = epsilon-greedy policy
BALANCES = 1000  # number of episodes (arm pulls)
EXPLORATION = 10  # forced exploratory pulls per bandit
EPSILON = .1  # probability of a random pull after exploration

MODE = "ascii"  # or "human"

rad.seed(SEED)
env, rewards = run_epsilon_greedy_policy(MultiArmedBanditEnv(BANDITS), MODE, POLICY, BALANCES, EXPLORATION, EPSILON)
cum_rewards = np.cumsum(rewards)

make_the_graph(cum_rewards, "Casino RL Problem with Epsilon Greedy Policy", "Trials", "Reward")
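For intuition about these hyperparameters: each pull pays +$1 with probability p and -$1 otherwise, so a bandit's expected reward per pull is 2p - 1. A quick sketch using the BANDITS list above makes the numbers explicit:

# Expected reward per pull: (+1) * p + (-1) * (1 - p) = 2p - 1
BANDITS = [.45, .45, .4, .6, .4]
print([round(2 * p - 1, 2) for p in BANDITS])  # [-0.1, -0.1, -0.2, 0.2, -0.2]
# Only bandit 3 (p = .6) is profitable in expectation, so a good policy
# should end up pulling it almost exclusively.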
New file: multiarmed_bandit_rl_problem.py (87 additions & 0 deletions)
# Learner: Nguyen Truong Thinh
# Contact me: [email protected] || +84393280504
#
# Topic: Reinforcement Learning (RL): Multi-Armed Bandits RL problem

import random
import gym


class MultiArmedBanditEnv(gym.Env):
    """
    A custom Multi-Armed Bandit gym environment that simulates our casino problem.
    """
    metadata = {"render.modes": ["human", "ascii"]}

    def __init__(self, bandits):
        """
        Environment initialization.
        :param bandits: the winning probability of each bandit
        """
        self.bandits = bandits
        self.state = {}
        self.reset()

    def step(self, action):
        """
        Takes the agent's action and calculates the reward:
        on each step the environment returns +$1 (win) or -$1 (loss).
        :param action: the index of the bandit the agent pulls
        :return: (state, reward, done, debug) following the classic gym step API
        """
        p = self.bandits[action]
        r = random.random()
        reward = 1 if r <= p else -1
        self.state[action].append(reward)
        done = False
        debug = None

        return self.state, reward, done, debug

    def _render_human(self):
        """
        Graphical environment rendering (not implemented yet).
        """

    def _render_ascii(self):
        """
        ASCII environment rendering: per-bandit returns and trial counts.
        """
        returns = {}
        trials = {}

        for e in range(len(self.bandits)):
            returns[e] = sum(self.state[e])
            trials[e] = len(self.state[e])

        print(f'***** Total Trials: {sum(trials.values())} *****')

        for b, r in returns.items():
            t = trials[b]
            print(f'"Bandit {b}"| returns: {r}, trials: {t}')

        print(f'***** Total Returns: {sum(returns.values())} *****')

    def render(self, mode="human"):
        """
        Render the current state of the environment:
        shows the overall statistics of all rounds.
        """
        if mode == "human":
            self._render_human()
        elif mode == "ascii":
            self._render_ascii()
        else:
            raise NotImplementedError(f"Render mode '{mode}' is not supported.")

    def reset(self):
        """
        Reset the environment to its original state.
        :return: the empty per-bandit reward history
        """
        self.state = {}

        for e in range(len(self.bandits)):
            self.state[e] = []

        return self.state
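A minimal usage sketch for the class above, showing the reset/step/render cycle; the arm choice and trial count here are purely illustrative:

import random

from multiarmed_bandit_rl_problem import MultiArmedBanditEnv

env = MultiArmedBanditEnv([.45, .45, .4, .6, .4])
state = env.reset()

for _ in range(100):
    arm = random.randint(0, 4)
    # state maps each bandit index to its list of +1/-1 rewards so far
    state, reward, done, debug = env.step(arm)

env.render("ascii")  # prints per-bandit returns and trial counts
env.close()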
New file: random-action baseline script (26 additions & 0 deletions)
# Learner: Nguyen Truong Thinh
# Contact me: [email protected] || +84393280504
#
# Topic: Reinforcement Learning (RL): Multi-Armed Bandits RL problem

import numpy as np
import random as rad

from multiarmed_bandit_rl_problem import MultiArmedBanditEnv
from ml.rl_in_robotics.utility import gym_rl_custom_tasks

# Hyperparameters we can adjust
BANDITS = [.45, .45, .4, .6, .4]
SEED = 1

BALANCES = 1000  # number of episodes (arm pulls)
MODE = "ascii"  # or "human"
SLEEP = .0  # pause (seconds) between renders

rad.seed(SEED)
np.random.seed(SEED)
env = MultiArmedBanditEnv(BANDITS)

# 5 one-armed bandits (indices 0 -> 4)
action = rad.randint(0, 4)

gym_rl_custom_tasks(env, BALANCES, action, MODE, SLEEP)
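The helpers init_reset_environment and gym_customize_tasks are not part of this diff, so their exact behavior is unknown. Assuming gym_rl_custom_tasks simply repeats the one fixed action for every episode, an equivalent standalone loop would look roughly like this:

import random
from time import sleep

from multiarmed_bandit_rl_problem import MultiArmedBanditEnv

SLEEP = .0  # matches the script's SLEEP hyperparameter

env = MultiArmedBanditEnv([.45, .45, .4, .6, .4])
env.reset()
fixed_action = random.randint(0, 4)  # one arm chosen once, pulled every episode

for _ in range(1000):
    env.step(fixed_action)
    if SLEEP > 0:
        sleep(SLEEP)  # optional pause between steps

env.render("ascii")  # baseline statistics to compare against the learned policies
env.close()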

Changed file: ml/rl_in_robotics/utility.py (74 additions & 2 deletions)

@@ -8,14 +8,86 @@
import random
from time import sleep

import numpy as np
import gym
import pygame


def run_epsilon_greedy_policy(env, mode, policy, episodes=1000, exploration=10, epsilon=.1):
    """
    Run the selected bandit policy for a number of episodes.
    :param policy: 1 = greedy policy, 2 = epsilon-greedy policy
    """
    state = env.reset()
    rewards = []

    for e in range(episodes):
        match policy:
            case 1:
                action = greedy_policy(state, exploration)
            case 2:
                action = epsilon_greedy_policy(state, exploration, epsilon)
            case _:
                raise ValueError(f"Unknown policy: {policy}")

        state, reward, done, debug = env.step(action)
        rewards.append(reward)

    # Render the final statistics once; rendering inside the episode loop
    # would print the same summary over and over.
    env.render(mode)

    env.close()

    return env, rewards


def epsilon_greedy_policy(state, explore=10, epsilon=.1):
    """
    Implementation of the epsilon-greedy policy.
    """
    machines = len(state)
    trials = sum(len(state[m]) for m in range(machines))
    total_explore_trials = machines * explore

    # Exploration: cycle through the machines until each has `explore` trials
    if trials <= total_explore_trials:
        return trials % machines
    # Random machine with probability epsilon
    if random.random() < epsilon:
        return random.randint(0, machines - 1)
    # Exploitation: the machine with the best average reward so far
    avg_rewards = [sum(state[m]) / len(state[m]) for m in range(machines)]

    best_machine = np.argmax(avg_rewards)
    return best_machine


def greedy_policy(state, explore=10):
    """
    Implementation of the greedy policy.
    """
    machines = len(state)
    trials = sum(len(state[m]) for m in range(machines))
    total_explore_trials = machines * explore

    # Exploration: cycle through the machines until each has `explore` trials
    if trials <= total_explore_trials:
        return trials % machines
    # Exploitation: always the machine with the best average reward so far
    avg_rewards = [sum(state[m]) / len(state[m]) for m in range(machines)]

    best_machine = np.argmax(avg_rewards)
    return best_machine


def gym_rl_custom_tasks(env, episodes, action, mode, duration):
    """
    Unifying all RL custom tasks by Gym toolkit
    """
    init_reset_environment(env)

    gym_customize_tasks(env, episodes, action, mode, duration)