Add fpu to uct_score()

jonathan-laurent · May 4, 2022 · d5b051d · d5b051d
1 parent 8a747f2
commit d5b051d
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 10 deletions.
diff --git a/src/mcts.jl b/src/mcts.jl
@@ -92,6 +92,8 @@ Ntot(b::StateInfo) = sum(s.N for s in b.stats)
 ##### MCTS Environment
 #####
 
+@enum FpuStrategy reduction absolute
+
 """
     MCTS.Env(game_spec::AbstractGameSpec, oracle; <keyword args>)
 
@@ -105,6 +107,10 @@ Create and initialize an MCTS environment with a given `oracle`.
      (see below)
   - `prior_temperature=1.`: temperature to apply to the oracle's output
      to get the prior probability vector used by MCTS.
+  - `fpu_strategy=reduction`: "First Play Urgency" strategy (`reduction` or `absolute`)
+  - `fpu_value=0.44`: "First Play Urgency" value used to evaluate
+     unvisited nodes according to `fpu_strategy`. The default
+     of 0.44 follows the value used by Leela-Chess.
 
 ## Dirichlet Noise
 
@@ -132,21 +138,24 @@ mutable struct Env{State, Oracle}
   noise_ϵ :: Float64
   noise_α :: Float64
   prior_temperature :: Float64
+  fpu_strategy :: FpuStrategy
+  fpu_value :: Float64
   # Performance statistics
   total_simulations :: Int64
   total_nodes_traversed :: Int64
   # Game specification
   gspec :: GI.AbstractGameSpec
 
   function Env(gspec, oracle;
-      gamma=1., cpuct=1., noise_ϵ=0., noise_α=1., prior_temperature=1.)
+      gamma=1., cpuct=1., noise_ϵ=0., noise_α=1., prior_temperature=1.,
+      fpu_strategy=reduction, fpu_value=0.44) # Magic value inspired from Leela-Chess
     S = GI.state_type(gspec)
     tree = Dict{S, StateInfo}()
     total_simulations = 0
     total_nodes_traversed = 0
     new{S, typeof(oracle)}(
       tree, oracle, gamma, cpuct, noise_ϵ, noise_α, prior_temperature,
-      total_simulations, total_nodes_traversed, gspec)
+      fpu_strategy, fpu_value, total_simulations, total_nodes_traversed, gspec)
   end
 end
 
@@ -177,13 +186,18 @@ end
 ##### Main algorithm
 #####
 
-function uct_scores(info::StateInfo, cpuct, ϵ, η)
+function uct_scores(info::StateInfo, env, ϵ, η, parent_stats)
   @assert iszero(ϵ) || length(η) == length(info.stats)
   sqrtNtot = sqrt(Ntot(info))
+  fpu_Q = env.fpu_value
+  if (!isnothing(parent_stats) && env.fpu_strategy == reduction)
+    parent_Q = parent_stats.W / parent_stats.N
+    fpu_Q = parent_Q - env.fpu_value
+  end
   return map(enumerate(info.stats)) do (i, a)
-    Q = a.W / max(a.N, 1)
+    Q = (a.N != 0) ? a.W / max(a.N, 1) : fpu_Q
     P = iszero(ϵ) ? a.P : (1-ϵ) * a.P + ϵ * η[i]
-    Q + cpuct * P * sqrtNtot / (a.N + 1)
+    Q + env.cpuct * P * sqrtNtot / (a.N + 1)
   end
 end
 
@@ -196,7 +210,7 @@ end
 # Run a single MCTS simulation, updating the statistics of all traversed states.
 # Return the estimated Q-value for the current player.
 # Modifies the state of the game environment.
-function run_simulation!(env::Env, game; η, root=true)
+function run_simulation!(env::Env, game; η, root=true, parent_stats=nothing)
   if GI.game_terminated(game)
     return 0.
   else
@@ -207,15 +221,16 @@ function run_simulation!(env::Env, game; η, root=true)
       return info.Vest
     else
       ϵ = root ? env.noise_ϵ : 0.
-      scores = uct_scores(info, env.cpuct, ϵ, η)
+      scores = uct_scores(info, env, ϵ, η, parent_stats)
       action_id = argmax(scores)
       action = actions[action_id]
+      next_parent_stats = info.stats[action_id]
       wp = GI.white_playing(game)
       GI.play!(game, action)
       wr = GI.white_reward(game)
       r = wp ? wr : -wr
       pswitch = wp != GI.white_playing(game)
-      qnext = run_simulation!(env, game, η=η, root=false)
+      qnext = run_simulation!(env, game, η=η, root=false, parent_stats=next_parent_stats)
       qnext = pswitch ? -qnext : qnext
       q = r + env.gamma * qnext
       update_state_info!(env, state, action_id, q)

diff --git a/src/params.jl b/src/params.jl
@@ -14,6 +14,8 @@ Parameters of an MCTS player.
 | `dirichlet_noise_ϵ`    | `Float64`                    |  -                  |
 | `dirichlet_noise_α`    | `Float64`                    |  -                  |
 | `prior_temperature`    | `Float64`                    | `1.`                |
+| `fpu_strategy`         | `MCTS.FpuStrategy`           | `MCTS.reduction`    |
+| `fpu_value`            | `Float64`                    | `0.44`              |
 
 # Explanation
 
@@ -29,6 +31,14 @@ It is typical to use a high value of the temperature parameter ``τ``
 during the first moves of a game to increase exploration and then switch to
 a small value. Therefore, `temperature` is am [`AbstractSchedule`](@ref).
 
+The "First Play Urgency" strategy (`fpu_strategy`) specifies how
+unvisited nodes are evaluated. It changes search behavior to visit
+unvisited nodes earlier or later by using a placeholder evaluation before
+querying the network. With `MCTS.reduction`, `fpu_value` is subtracted
+from the parent's evaluation; with `MCTS.absolute`, `fpu_value` is used
+directly as the placeholder. `fpu_value` defaults to 0.44, following
+the value used by Leela-Chess.
+
 For information on parameters `cpuct`, `dirichlet_noise_ϵ`,
 `dirichlet_noise_α` and `prior_temperature`, see [`MCTS.Env`](@ref).
 
@@ -54,6 +64,8 @@ In the original AlphaGo Zero paper:
   dirichlet_noise_ϵ :: Float64
   dirichlet_noise_α :: Float64
   prior_temperature :: Float64 = 1.
+  fpu_strategy :: MCTS.FpuStrategy = MCTS.reduction
+  fpu_value :: Float64 = 0.44
 end
 
 """

diff --git a/src/ui/explorer.jl b/src/ui/explorer.jl
@@ -74,7 +74,7 @@ function state_statistics(game, player, turn, memory=nothing)
   if isa(player, MctsPlayer) && haskey(player.mcts.tree, state)
     mcts = player.mcts
     info = mcts.tree[state]
-    ucts = MCTS.uct_scores(info, mcts.cpuct, 0., nothing)
+    ucts = MCTS.uct_scores(info, mcts, 0., nothing, nothing)
     report.Nmcts = MCTS.Ntot(info)
     for (i, a) in enumerate(actions)
       astats = info.stats[i]
@@ -305,4 +305,4 @@ end
 
 function explore(player::AbstractPlayer, gspec::AbstractGameSpec; args...)
   return explore(player, GI.init(gspec), args...)
-end
+end