
Commit 6692b45

Env interface overhaul (#6)
1. Updates LyceumAI to the new env interface (Lyceum/LyceumBase.jl#7)
2. Adds an NPG test on PointMass
1 parent c0a3ae1 commit 6692b45
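For orientation, the new interface used throughout this commit separates stepping from reward and evaluation queries: the environment is advanced with setaction! and step!, and getreward, geteval, and isdone are then called with explicit (state, action, observation) buffers. The sketch below is illustrative only; rollout_sketch and controller! are hypothetical names, while the accessor functions are the ones that appear in the diffs that follow.

using LyceumBase

# Minimal rollout sketch under the new env interface (not part of this commit).
function rollout_sketch(env, controller!, T)
    s = allocate(statespace(env))    # state buffer
    o = allocate(obsspace(env))      # observation buffer
    a = allocate(actionspace(env))   # action buffer
    total = 0.0
    for t = 1:T
        getstate!(s, env)
        getobs!(o, env)
        controller!(a, s, o)         # fill the action buffer in place
        setaction!(env, a)
        step!(env)
        total += getreward(s, a, o, env)
        isdone(s, a, o, env) && break
    end
    return total
end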

File tree

10 files changed, +265 −229 lines changed


Manifest.toml

+122 −85
Large diffs are not rendered by default.

Project.toml

+2 −1

@@ -14,6 +14,7 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
 LearnBase = "7f8f8fb0-2700-5f03-b4bd-41f8cfc144b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LyceumBase = "db31fed1-ca1e-4084-8a49-12fae1996a55"
+LyceumMuJoCo = "48b9757e-04b8-4dbf-b6ed-75c13d9e4026"
 MLDataPattern = "9920b226-0b2a-5f5f-9153-9aa70a013f8b"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 MuJoCo = "93189219-7048-461c-94ec-443a161ed927"
@@ -28,8 +29,8 @@ UnsafeArrays = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

 [compat]
-julia = "1.3"
 Flux = "0.10"
+julia = "1.3"

 [extras]
 LyceumMuJoCo = "48b9757e-04b8-4dbf-b6ed-75c13d9e4026"

src/algorithms/MPPI.jl

+24 −13

@@ -1,5 +1,5 @@
 # Algorithm 2 from https://www.cc.gatech.edu/~bboots3/files/InformationTheoreticMPC.pdf
-struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O}
+struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O,S}
     # MPPI parameters
     K::Int
     H::Int
@@ -15,10 +15,11 @@ struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O}
     covar_ul::UpperTriangular{DT,C}
     meantrajectory::Matrix{DT}
     trajectorycosts::Vector{DT}
-    observationbuffers::Vector{O}
+    obsbuffers::Vector{O}
+    statebuffers::Vector{S}

     function MPPI{DT}(
-        sharedmemory_envctor,
+        env_tconstructor,
         K::Integer,
         H::Integer,
         covar0::AbstractMatrix{<:Real},
@@ -27,10 +28,11 @@ struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O}
         valuefn,
         initfn!,
     ) where {DT<:AbstractFloat}
-        envs = [e for e in sharedmemory_envctor(Threads.nthreads())]
+        envs = [e for e in env_tconstructor(Threads.nthreads())]

+        ssp = statespace(first(envs))
         asp = actionspace(first(envs))
-        osp = observationspace(first(envs))
+        osp = obsspace(first(envs))

         nd, elt = ndims(asp), eltype(asp)
         if nd != 1 || !(elt <: AbstractFloat)
@@ -55,7 +57,8 @@ struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O}
         meantrajectory = zeros(DT, asp, H)
         trajectorycosts = zeros(DT, K)
         noise = zeros(DT, asp, H, K)
-        observationbuffers = [allocate(osp) for _ = 1:Threads.nthreads()]
+        obsbuffers = [allocate(osp) for _ = 1:Threads.nthreads()]
+        statebuffers = [allocate(ssp) for _ = 1:Threads.nthreads()]

         new{
             DT,
@@ -64,7 +67,8 @@ struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O}
             typeof(valuefn),
             eltype(envs),
             typeof(initfn!),
-            eltype(observationbuffers),
+            eltype(obsbuffers),
+            eltype(statebuffers),
         }(
             K,
             H,
@@ -78,14 +82,15 @@ struct MPPI{DT,nu,C<:AbstractMatrix{DT},V,E,F,O}
             covar_ul,
             meantrajectory,
             trajectorycosts,
-            observationbuffers,
+            obsbuffers,
+            statebuffers
         )
     end
 end

 function MPPI(;
     dtype = Float64,
-    sharedmemory_envctor,
+    env_tconstructor,
     covar0,
     lambda,
     K,
@@ -94,7 +99,7 @@ function MPPI(;
     valuefn = zerofn,
     initfn! = default_initfn!,
 )
-    MPPI{dtype}(sharedmemory_envctor, K, H, covar0, lambda, gamma, valuefn, initfn!)
+    MPPI{dtype}(env_tconstructor, K, H, covar0, lambda, gamma, valuefn, initfn!)
 end

 LyceumBase.reset!(m::MPPI) = (fill!(m.meantrajectory, 0); m)
@@ -142,20 +147,26 @@ end

 function perturbedrollout!(m::MPPI{DT,nu}, state, k, tid) where {DT,nu}
     env = m.envs[tid]
-    obsbuf = m.observationbuffers[tid]
+    obsbuf = m.obsbuffers[tid]
+    statebuf = m.statebuffers[tid]
     mean = m.meantrajectory
     noise = m.noise

-    reset!(env, state)
+    setstate!(env, state)
     discountedreward = zero(DT)
     discountfactor = one(DT)
     @uviews mean noise @inbounds for t = 1:m.H
         mean_t = SVector{nu,DT}(view(mean, :, t))
         noise_tk = SVector{nu,DT}(view(noise, :, t, k))
         action_t = mean_t + noise_tk
         setaction!(env, action_t)
+
         step!(env)
-        reward = getreward(env)
+
+        getobs!(obsbuf, env)
+        getstate!(statebuf, env)
+        reward = getreward(statebuf, action_t, obsbuf, env)
+
         discountedreward += reward * discountfactor
         discountfactor *= m.gamma
     end # env at t=H+1
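The struct changes above replace the single observationbuffers field with per-thread obsbuffers and statebuffers, because the new getreward call needs the current state as well as the observation. A brief sketch of that per-thread allocation pattern, assuming LyceumMuJoCo is loaded and that tconstruct(etype, n) returns n independently steppable environments, as in the tests added below:

using LyceumBase, LyceumMuJoCo

# One env and one pair of scratch buffers per thread, so perturbed rollouts can
# run in parallel without allocating inside the hot loop.
envs = [e for e in tconstruct(LyceumMuJoCo.PointMass, Threads.nthreads())]
ssp, osp = statespace(first(envs)), obsspace(first(envs))
statebuffers = [allocate(ssp) for _ = 1:Threads.nthreads()]
obsbuffers = [allocate(osp) for _ = 1:Threads.nthreads()]

# After step!(env) on thread tid, the reward is computed from explicit buffers:
#   getstate!(statebuffers[tid], env)
#   getobs!(obsbuffers[tid], env)
#   reward = getreward(statebuffers[tid], action, obsbuffers[tid], env)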

src/algorithms/NPG.jl

+2 −2

@@ -27,7 +27,7 @@ struct NaturalPolicyGradient{DT,S,P,V,VF,CB}
     returns_vec::Vector{DT} # N

     function NaturalPolicyGradient(
-        env_ctor,
+        env_tconstructor,
         policy,
         value,
         valuefit!;
@@ -70,7 +70,7 @@ struct NaturalPolicyGradient{DT,S,P,V,VF,CB}
             DT = DTnew
         end

-        envsampler = EnvSampler(env_ctor, dtype=DT)
+        envsampler = EnvSampler(env_tconstructor, dtype=DT)

         z(d...) = zeros(DT, d...)
         new{
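The only change here is the rename of env_ctor to env_tconstructor, matching the convention used in MPPI above: the argument is a thread-aware constructor that, given an integer n, yields n environment instances, one per sampling thread. Any callable of that shape should work; a minimal hypothetical example (MyEnv is a stand-in for a real type such as LyceumMuJoCo.PointMass):

# Sketch only; not the actual LyceumBase API surface.
struct MyEnv end
my_tconstructor(n) = [MyEnv() for _ = 1:n]

envs = my_tconstructor(Threads.nthreads())
@assert length(envs) == Threads.nthreads()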

src/controller.jl

+9 −4

@@ -8,16 +8,16 @@ struct ControllerIterator{C,E,B}

     function ControllerIterator(
         controller,
-        env::AbstractEnv;
+        env::AbstractEnvironment;
         T = 1000,
         plotiter = 1,
     )
         trajectory = (
             states = Array(undef, statespace(env), T),
-            observations = Array(undef, observationspace(env), T),
+            observations = Array(undef, obsspace(env), T),
             actions = Array(undef, actionspace(env), T),
             rewards = Array(undef, rewardspace(env), T),
-            evaluations = Array(undef, evaluationspace(env), T),
+            evaluations = Array(undef, evalspace(env), T),
         )
         new{typeof(controller),typeof(env),typeof(trajectory)}(
             controller,
@@ -59,8 +59,13 @@ function rolloutstep!(controller, traj, env, t)
     getstate!(st, env)
     getobs!(ot, env)
     getaction!(at, st, ot, controller)
+    setaction!(env, at)
+
+    step!(env)
+    r = getreward(st, at, ot, env)
+    e = geteval(st, at, ot, env)
+    done = isdone(st, at, ot, env)

-    r, e, done = step!(env, at)
     traj.rewards[t] = r
     traj.evaluations[t] = e

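The rolloutstep! change above is the heart of the interface overhaul: the old step!(env, at), which returned (reward, eval, done) in one call, is decomposed into setaction! plus step!, followed by explicit getreward, geteval, and isdone queries taking (state, action, observation, env). A hedged sketch of what an environment might implement under this convention, assuming AbstractEnvironment and these generic functions live in LyceumBase as the diff suggests (PointEnv and its reward/eval definitions are hypothetical):

using LyceumBase

# Hypothetical environment; a real one would also implement statespace, obsspace,
# actionspace, getstate!, getobs!, setaction!, step!, and related accessors.
struct PointEnv <: AbstractEnvironment end

LyceumBase.getreward(state, action, obs, env::PointEnv) = -sum(abs2, obs)     # e.g. squared distance penalty
LyceumBase.geteval(state, action, obs, env::PointEnv) = sqrt(sum(abs2, obs))  # e.g. distance-to-goal metric
LyceumBase.isdone(state, action, obs, env::PointEnv) = false                  # fixed-horizon task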
test/NPG.jl

−92
This file was deleted.

test/algorithms/MPPI.jl

+22

@@ -0,0 +1,22 @@
+@testset "MPPI (PointMass)" begin
+    seed_threadrngs!(1)
+    etype = LyceumMuJoCo.PointMass
+    env = etype()
+    T = 300
+    K = 8
+    H = 10
+
+    mppi = MPPI(
+        env_tconstructor = n -> tconstruct(etype, n),
+        covar0 = Diagonal(0.1^2*I, size(actionspace(env), 1)),
+        lambda = 0.01,
+        K = K,
+        H = H,
+        gamma = 0.99
+    )
+    env = testrollout(env, T) do a, s, o
+        getaction!(a, s, o, mppi)
+    end
+    @test abs(geteval(env)) < 0.001
+end
+

test/algorithms/NPG.jl

+48

@@ -0,0 +1,48 @@
+@testset "NPG (PointMass)" begin
+    seed_threadrngs!(1)
+    etype = LyceumMuJoCo.PointMass
+
+    e = etype()
+    dobs, dact = length(obsspace(e)), length(actionspace(e))
+
+    DT = Float32
+    Hmax, K = 300, 16
+    N = Hmax * K
+
+    policy = DiagGaussianPolicy(
+        multilayer_perceptron(dobs, 32, 32, dact, σ=tanh),
+        zeros(dact)
+    )
+    policy = Flux.paramtype(DT, policy)
+
+    value = multilayer_perceptron(dobs, 32, 32, 1, σ=Flux.relu)
+    valueloss(bl, X, Y) = mse(vec(bl(X)), vec(Y))
+
+    valuetrainer = FluxTrainer(
+        optimiser = ADAM(1e-2),
+        szbatch = 32,
+        lossfn = valueloss,
+        stopcb = s->s.nepochs > 4
+    )
+    value = Flux.paramtype(DT, value)
+
+    npg = NaturalPolicyGradient(
+        n -> tconstruct(etype, n),
+        policy,
+        value,
+        gamma = 0.95,
+        gaelambda = 0.99,
+        valuetrainer,
+        Hmax = Hmax,
+        norm_step_size = 0.05,
+        N = N,
+    )
+
+    meanterminal_eval = nothing
+    for (i, state) in enumerate(npg)
+        i > 30 && break
+        meanterminal_eval = state.meanterminal_eval
+    end
+
+    @test meanterminal_eval < 0.1
+end
