Skip to content

Commit

Permalink
Add fpu to uct_score()
Browse files Browse the repository at this point in the history
  • Loading branch information
Whojo committed May 4, 2022
1 parent 8a747f2 commit d5b051d
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 10 deletions.
31 changes: 23 additions & 8 deletions src/mcts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ Ntot(b::StateInfo) = sum(s.N for s in b.stats)
##### MCTS Environment
#####

# Strategy used to score actions that have not been visited yet ("First Play
# Urgency"). In `uct_scores`, `reduction` subtracts `fpu_value` from the
# parent's mean value, while any other setting uses `fpu_value` directly.
@enum FpuStrategy reduction absolute

"""
MCTS.Env(game_spec::AbstractGameSpec, oracle; <keyword args>)
Expand All @@ -105,6 +107,10 @@ Create and initialize an MCTS environment with a given `oracle`.
(see below)
- `prior_temperature=1.`: temperature to apply to the oracle's output
to get the prior probability vector used by MCTS.
- `fpu_strategy=reduction`: "First Play Urgency" strategy
- `fpu_value=0.44`: "First Play Urgency" value used to evaluate
unvisited nodes according to `fpu_strategy`. It is
set to 0.44 by default, following the value used by Leela-Chess.
## Dirichlet Noise
Expand Down Expand Up @@ -132,21 +138,24 @@ mutable struct Env{State, Oracle}
noise_ϵ :: Float64
noise_α :: Float64
prior_temperature :: Float64
fpu_strategy :: FpuStrategy
fpu_value :: Float64
# Performance statistics
total_simulations :: Int64
total_nodes_traversed :: Int64
# Game specification
gspec :: GI.AbstractGameSpec

function Env(gspec, oracle;
gamma=1., cpuct=1., noise_ϵ=0., noise_α=1., prior_temperature=1.)
gamma=1., cpuct=1., noise_ϵ=0., noise_α=1., prior_temperature=1.,
fpu_strategy=reduction, fpu_value=0.44) # Magic value inspired from Leela-Chess
S = GI.state_type(gspec)
tree = Dict{S, StateInfo}()
total_simulations = 0
total_nodes_traversed = 0
new{S, typeof(oracle)}(
tree, oracle, gamma, cpuct, noise_ϵ, noise_α, prior_temperature,
total_simulations, total_nodes_traversed, gspec)
fpu_strategy, fpu_value, total_simulations, total_nodes_traversed, gspec)
end
end

Expand Down Expand Up @@ -177,13 +186,18 @@ end
##### Main algorithm
#####

"""
    uct_scores(info::StateInfo, env, ϵ, η, parent_stats)

Return the vector of UCT scores of the actions recorded in `info.stats`.

- `env` provides `cpuct`, `fpu_strategy` and `fpu_value`.
- `ϵ`, `η`: when `ϵ` is nonzero, the prior of action `i` is replaced by
  `(1-ϵ) * P + ϵ * η[i]`; `η` is not read when `ϵ` is zero.
- `parent_stats`: statistics (fields `W` and `N`) of the action that led to
  this state, or `nothing` when they are not available.

Actions that were never visited (`N == 0`) are given a "First Play Urgency"
value instead of an empirical mean: `env.fpu_value` itself or, with the
`reduction` strategy and usable parent statistics, the parent's mean value
minus `env.fpu_value`.
"""
function uct_scores(info::StateInfo, env, ϵ, η, parent_stats)
  @assert iszero(ϵ) || length(η) == length(info.stats)
  sqrtNtot = sqrt(Ntot(info))
  fpu_Q = env.fpu_value
  # The parent edge can still have no recorded visit here (the caller passes
  # its statistics down before updating them), in which case W / N would be
  # NaN or infinite and corrupt every score. Fall back to `env.fpu_value`,
  # exactly as when no parent statistics are given.
  if !isnothing(parent_stats) && env.fpu_strategy == reduction && parent_stats.N > 0
    fpu_Q = parent_stats.W / parent_stats.N - env.fpu_value
  end
  return map(enumerate(info.stats)) do (i, a)
    # Empirical mean for visited actions, FPU placeholder otherwise.
    Q = iszero(a.N) ? fpu_Q : a.W / a.N
    P = iszero(ϵ) ? a.P : (1-ϵ) * a.P + ϵ * η[i]
    Q + env.cpuct * P * sqrtNtot / (a.N + 1)
  end
end

Expand All @@ -196,7 +210,7 @@ end
# Run a single MCTS simulation, updating the statistics of all traversed states.
# Return the estimated Q-value for the current player.
# Modifies the state of the game environment.
function run_simulation!(env::Env, game; η, root=true)
function run_simulation!(env::Env, game; η, root=true, parent_stats=nothing)
if GI.game_terminated(game)
return 0.
else
Expand All @@ -207,15 +221,16 @@ function run_simulation!(env::Env, game; η, root=true)
return info.Vest
else
ϵ = root ? env.noise_ϵ : 0.
scores = uct_scores(info, env.cpuct, ϵ, η)
scores = uct_scores(info, env, ϵ, η, parent_stats)
action_id = argmax(scores)
action = actions[action_id]
next_parent_stats = info.stats[action_id]
wp = GI.white_playing(game)
GI.play!(game, action)
wr = GI.white_reward(game)
r = wp ? wr : -wr
pswitch = wp != GI.white_playing(game)
qnext = run_simulation!(env, game, η=η, root=false)
qnext = run_simulation!(env, game, η=η, root=false, parent_stats=next_parent_stats)
qnext = pswitch ? -qnext : qnext
q = r + env.gamma * qnext
update_state_info!(env, state, action_id, q)
Expand Down
12 changes: 12 additions & 0 deletions src/params.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Parameters of an MCTS player.
| `dirichlet_noise_ϵ` | `Float64` | - |
| `dirichlet_noise_α` | `Float64` | - |
| `prior_temperature` | `Float64` | `1.` |
| `fpu_strategy` | `MCTS.FpuStrategy` | `MCTS.reduction` |
| `fpu_value` | `Float64` | `0.44` |
# Explanation
Expand All @@ -29,6 +31,14 @@ It is typical to use a high value of the temperature parameter ``τ``
during the first moves of a game to increase exploration and then switch to
a small value. Therefore, `temperature` is an [`AbstractSchedule`](@ref).
The "First Play Urgency" strategy (`fpu_strategy`) specifies how
unvisited nodes are evaluated. It changes search behavior to
visit unvisited nodes earlier or later by using a placeholder evaluation
before querying the network. With `MCTS.reduction`, `fpu_value` is
subtracted from the parent's evaluation; with `MCTS.absolute`, `fpu_value`
is used directly as the placeholder. `fpu_value` is set to 0.44 by default,
following the value used by Leela-Chess.
For information on parameters `cpuct`, `dirichlet_noise_ϵ`,
`dirichlet_noise_α` and `prior_temperature`, see [`MCTS.Env`](@ref).
Expand All @@ -54,6 +64,8 @@ In the original AlphaGo Zero paper:
dirichlet_noise_ϵ :: Float64
dirichlet_noise_α :: Float64
prior_temperature :: Float64 = 1.
fpu_strategy :: MCTS.FpuStrategy = MCTS.reduction
fpu_value :: Float64 = 0.44
end

"""
Expand Down
4 changes: 2 additions & 2 deletions src/ui/explorer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function state_statistics(game, player, turn, memory=nothing)
if isa(player, MctsPlayer) && haskey(player.mcts.tree, state)
mcts = player.mcts
info = mcts.tree[state]
ucts = MCTS.uct_scores(info, mcts.cpuct, 0., nothing)
ucts = MCTS.uct_scores(info, mcts, 0., nothing, nothing)
report.Nmcts = MCTS.Ntot(info)
for (i, a) in enumerate(actions)
astats = info.stats[i]
Expand Down Expand Up @@ -305,4 +305,4 @@ end

# Call `explore` on the game returned by `GI.init(gspec)`, forwarding every
# keyword argument received.
function explore(player::AbstractPlayer, gspec::AbstractGameSpec; args...)
  # `args` collects keyword arguments, so it must be splatted after `;`.
  # A positional splat would pass each of them as a positional `Pair`.
  return explore(player, GI.init(gspec); args...)
end
end

0 comments on commit d5b051d

Please sign in to comment.