Skip to content

Commit f8d5eb7

Browse files
Fix abstract_learner for multiplayer games (#1054)
* Fix abstract_learner for multiplayer games
* Fix test
* Drop excess mutability
* Fix type instability (several passes)
* Fix type instability in stop condition
* Add missing default method
* Drop excess code

---------

Co-authored-by: Jeremiah Lewis <!-- email garbled during page extraction; recover from commit metadata -->
1 parent 93a13d3 commit f8d5eb7

6 files changed

Lines changed: 37 additions & 13 deletions

File tree

src/ReinforcementLearningCore/src/core/stop_conditions.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,16 @@ abstract type AbstractStopCondition end
1515
1616
The result of `stop_conditions` is reduced by `reducer`. The default `reducer` is the `any` function, which means that the condition is true when any one of the `stop_conditions...` is true. Can be replaced by any function returning a boolean. For example `reducer = x->sum(x) >= 2` will require at least two of the conditions to be true.
1717
"""
18-
struct ComposedStopCondition{S,T} <: AbstractStopCondition
18+
struct ComposedStopCondition{S,reducer} <: AbstractStopCondition
1919
stop_conditions::S
20-
reducer::T
20+
reducer
2121
function ComposedStopCondition(stop_conditions...; reducer = any)
22-
new{typeof(stop_conditions),typeof(reducer)}(stop_conditions, reducer)
22+
new{typeof(stop_conditions),reducer}(stop_conditions, reducer)
2323
end
2424
end
2525

26-
function check!(s::ComposedStopCondition, args...)
27-
s.reducer(check!(sc, args...) for sc in s.stop_conditions)
26+
function check!(s::ComposedStopCondition{S,R}, policy::P, env::E) where {S,R,P<:AbstractPolicy,E<:AbstractEnv}
27+
s.reducer(check!(sc, policy, env) for sc in s.stop_conditions)
2828
end
2929

3030
#####
@@ -58,12 +58,12 @@ function _stop_after_step(s::StopAfterNSteps)
5858
res
5959
end
6060

61-
function check!(s::StopAfterNSteps, args...)
61+
function check!(s::StopAfterNSteps, agent, env)
6262
ProgressMeter.next!(s.progress)
6363
_stop_after_step(s)
6464
end
6565

66-
check!(s::StopAfterNSteps{Nothing}, args...) = _stop_after_step(s)
66+
check!(s::StopAfterNSteps{Nothing}, agent, env) = _stop_after_step(s)
6767

6868
#####
6969
# StopAfterNEpisodes

src/ReinforcementLearningCore/src/policies/learners/abstract_learner.jl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,21 @@ function forward(learner::L, env::E) where {L <: AbstractLearner, E <: AbstractE
1111
env |> state |> (x -> forward(learner, x))
1212
end
1313

14+
# Take Learner and Environment, get state, send to RLCore.forward(Learner, State)
15+
function forward(learner::L, env::E, player::Symbol) where {L <: AbstractLearner, E <: AbstractEnv}
16+
env |> (x -> state(x, player)) |> (x -> forward(learner, x))
17+
end
18+
1419
function RLBase.optimise!(::AbstractLearner, ::AbstractStage, ::Trajectory) end
1520

21+
function RLBase.optimise!(::AbstractLearner, ::AbstractStage, ::NamedTuple) end
22+
1623
function RLBase.plan!(explorer::AbstractExplorer, learner::AbstractLearner, env::AbstractEnv)
1724
legal_action_space_ = RLBase.legal_action_space_mask(env)
1825
RLBase.plan!(explorer, forward(learner, env), legal_action_space_)
1926
end
2027

2128
function RLBase.plan!(explorer::AbstractExplorer, learner::AbstractLearner, env::AbstractEnv, player::Symbol)
2229
legal_action_space_ = RLBase.legal_action_space_mask(env, player)
23-
return RLBase.plan!(explorer, forward(learner, env), legal_action_space_)
30+
return RLBase.plan!(explorer, forward(learner, env, player), legal_action_space_)
2431
end

src/ReinforcementLearningCore/src/policies/learners/td_learner.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,3 @@ end
9090

9191
# TDLearner{:SARS} is optimized at the PostActStage
9292
RLBase.optimise!(learner::TDLearner{:SARS}, stage::PostActStage, trace::NamedTuple) = RLBase.optimise!(learner, trace)
93-

src/ReinforcementLearningCore/src/policies/q_based_policy.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,19 @@ action of an environment at its current state. It is typically a table or a neur
1010
QBasedPolicy can be queried for an action with `RLBase.plan!`, the explorer will affect the action selection
1111
accordingly.
1212
"""
13-
Base.@kwdef mutable struct QBasedPolicy{L<:TDLearner,E<:AbstractExplorer} <: AbstractPolicy
13+
struct QBasedPolicy{L<:TDLearner,E<:AbstractExplorer} <: AbstractPolicy
1414
"estimate the Q value"
1515
learner::L
1616
"select the action based on Q values calculated by the learner"
1717
explorer::E
18+
19+
function QBasedPolicy(; learner::L, explorer::E) where {L<:TDLearner, E<:AbstractExplorer}
20+
new{L,E}(learner, explorer)
21+
end
22+
23+
function QBasedPolicy(learner::L, explorer::E) where {L<:TDLearner, E<:AbstractExplorer}
24+
new{L,E}(learner, explorer)
25+
end
1826
end
1927

2028
Flux.@layer QBasedPolicy trainable=(learner,)

src/ReinforcementLearningCore/test/core/stop_conditions.jl

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,24 @@ import ReinforcementLearningCore.check!
22

33
@testset "StopAfterNSteps" begin
44
stop_condition = StopAfterNSteps(10)
5-
@test sum([check!(stop_condition) for i in 1:20]) == 11
5+
env = RandomWalk1D()
6+
policy = RandomPolicy(legal_action_space(env))
7+
8+
@test sum([check!(stop_condition, policy, env) for i in 1:20]) == 11
69

710
stop_condition = StopAfterNSteps(10; is_show_progress=false)
8-
@test sum([check!(stop_condition) for i in 1:20]) == 11
11+
@test sum([check!(stop_condition, policy, env) for i in 1:20]) == 11
912
end
1013

1114
@testset "ComposedStopCondition" begin
1215
stop_10 = StopAfterNSteps(10)
1316
stop_3 = StopAfterNSteps(3)
1417

18+
env = RandomWalk1D()
19+
policy = RandomPolicy(legal_action_space(env))
20+
1521
composed_stop = ComposedStopCondition(stop_10, stop_3)
16-
@test sum([check!(composed_stop) for i in 1:20]) == 18
22+
@test sum([check!(composed_stop, policy, env) for i in 1:20]) == 18
1723
end
1824

1925
@testset "StopAfterNEpisodes" begin

src/ReinforcementLearningCore/test/policies/learners/abstract_learner.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,16 @@ struct MockLearner <: AbstractLearner end
1515
end
1616

1717
RLBase.state(::MockEnv, ::Observation{Any}, ::DefaultPlayer) = 1
18+
RLBase.state(::MockEnv, ::Observation{Any}, ::Symbol) = 1
1819

1920
env = MockEnv()
2021
learner = MockLearner()
2122

2223
output = RLCore.forward(learner, env)
2324
@test output == Float64[1.0, 2.0]
25+
26+
output = RLCore.forward(learner, env, Symbol(1))
27+
@test output == Float64[1.0, 2.0]
2428
end
2529

2630
@testset "Plan" begin

0 commit comments

Comments (0)