On-Policy Optimal.kt
@file:Suppress("NAME_SHADOWING")

package lab.mars.rl.algo.mc

import lab.mars.rl.algo.V_from_Q
import lab.mars.rl.model.impl.mdp.IndexedMDP
import lab.mars.rl.model.impl.mdp.IndexedState
import lab.mars.rl.model.impl.mdp.OptimalSolution
import lab.mars.rl.model.isNotTerminal
import lab.mars.rl.model.log
import lab.mars.rl.util.buf.newBuf
import lab.mars.rl.util.collection.fork
import lab.mars.rl.util.log.debug
import lab.mars.rl.util.math.argmax
import lab.mars.rl.util.tuples.tuple3
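
/**
 * On-policy first-visit Monte Carlo control for ε-soft policies
 * (cf. Sutton & Barto, 2nd ed., §5.4, "Monte Carlo Control without
 * Exploring Starts"). Returns are undiscounted (γ = 1): the return from
 * the first visit of a pair (s, a) is recovered at the end of an episode
 * as the total episode return minus the reward accumulated before that
 * visit. After each episode, π is made ε-greedy with respect to the
 * current estimate of Q over the states visited in that episode.
 */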
fun IndexedMDP.`On-policy first-visit MC control`(episodes: Int): OptimalSolution {
  val ε = 0.1 // exploration rate of the ε-soft policy
  val π = equiprobablePolicy() // start from an equiprobable (hence ε-soft) policy
  val Q = QFunc { 0.0 } // running sum of first-visit returns for each (s, a)
  val tmpQ = QFunc { Double.NaN } // reward accumulated before the first visit of (s, a) in the current episode; NaN marks "not yet visited"
  val count = QFunc { 0 } // number of first visits recorded for each (s, a)
  val tmpS = newBuf<IndexedState>(states.size) // states visited in the current episode
  for (episode in 1..episodes) {
    log.debug { "$episode/$episodes" }
    var s = started()
    var accumulate = 0.0
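    // Generate one episode by following π; for each first-visited (s, a),
    // record the reward accumulated before that visit.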
    while (s.isNotTerminal) {
      val a = π(s)
      val (s_next, reward) = a.sample()
      if (tmpQ[s, a].isNaN())
        tmpQ[s, a] = accumulate
      accumulate += reward
      s = s_next
    }
    tmpS.clear()
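    // `accumulate` now holds the total episode return G (γ = 1). The return
    // following the first visit of (s, a) is G minus the reward accumulated
    // before that visit; add it to the running sums.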
    for ((s, a) in states.fork { it.actions }) {
      val value = tmpQ[s, a]
      if (!value.isNaN()) {
        Q[s, a] += accumulate - value
        count[s, a] += 1
        tmpS.append(s)
        tmpQ[s, a] = Double.NaN // reset the first-visit marker for the next episode
      }
    }
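    // ε-greedy policy improvement over the states visited in this episode:
    // the greedy action gets probability 1 - ε + ε/|A(s)|, all others ε/|A(s)|.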
    for (s in tmpS) {
      val a_opt = argmax(s.actions) {
        val n = count[s, it]
        if (n > 0)
          Q[s, it] / n // average first-visit return observed so far
        else
          Q[s, it]
      }
      val size = s.actions.size
      for (a in s.actions) {
        π[s, a] = when {
          a === a_opt -> 1 - ε + ε / size
          else -> ε / size
        }
      }
    }
  }
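  // Convert the accumulated return sums into averages to obtain the final Q.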
  Q.set { idx, value ->
    val n = count[idx]
    if (n > 0)
      value / n
    else
      value
  }
  val V = VFunc { 0.0 }
  val result = tuple3(π, V, Q)
  V_from_Q(states, result) // derive the state-value function V from Q
  return result
}
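
// ---------------------------------------------------------------------------
// Self-contained sketch (hypothetical, not part of the library API): the same
// first-visit bookkeeping as above, on a deterministic 3-state chain
// 0 → 1 → 2 with reward -1 per step and state 2 terminal. With γ = 1 the
// exact values are V(0) = -2 and V(1) = -1, which the averages below match.
// The function name and all local names are illustrative only.
// ---------------------------------------------------------------------------
private fun firstVisitChainDemo(episodes: Int = 100): DoubleArray {
  val nStates = 3
  val terminal = 2
  val returnSum = DoubleArray(nStates) // summed first-visit returns per state
  val visits = IntArray(nStates) // number of episodes visiting each state
  repeat(episodes) {
    val before = DoubleArray(nStates) { Double.NaN } // reward before first visit (cf. tmpQ)
    var s = 0
    var g = 0.0
    while (s != terminal) {
      if (before[s].isNaN()) before[s] = g // mark the first visit
      g += -1.0 // reward of stepping right
      s++
    }
    for (i in 0 until nStates)
      if (!before[i].isNaN()) {
        returnSum[i] += g - before[i] // first-visit return = total return minus prefix
        visits[i]++
      }
  }
  return DoubleArray(nStates) { if (visits[it] > 0) returnSum[it] / visits[it] else 0.0 }
}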