@@ -603,16 +603,16 @@ \section{TD(\texorpdfstring{$\lambda$}{Lambda})}
Based on the eligibility trace definition from \eqref{eq:Elig_trace} we can modify our value estimates:
\begin{block}{TD($\lambda$) state-value update}
The TD($\lambda$) state-value update is:
- \begin{equation}
+ \begin{equation}
	\hat{v}(x_k) \leftarrow \hat{v}(x_k) + \alpha\left[r_{k+1} + \gamma\hat{v}(x_{k+1}) - \hat{v}(x_k)\right]z_k(x_k).
- \end{equation}
+ \end{equation}
\end{block}
\pause
- \begin{block}{TD($\lambda$) action-value update}
- The TD($\lambda$) action-value update is:
- \begin{equation}
- \hat{q}(x_k, u_k) \leftarrow \hat{q}(x_k, u_k) + \alpha\left[r_{k+1} + \gamma\hat{q}(x_{k+1}, u_{k+1}) - \hat{q}(x_k, u_k)\right]z_k(x_k).
- \end{equation}
+ \begin{block}{SARSA($\lambda$) action-value update}
+ The SARSA($\lambda$) action-value update is:
+ \begin{equation}
+ \hat{q}(x_k, u_k) \leftarrow \hat{q}(x_k, u_k) + \alpha\left[r_{k+1} + \gamma\hat{q}(x_{k+1}, u_{k+1}) - \hat{q}(x_k, u_k)\right]z_k(x_k, u_k).
+ \end{equation}
\end{block}
\pause
The prediction and control methods already introduced can be modified accordingly. In contrast to $n$-step forward updates, one can conclude:
@@ -622,6 +622,45 @@ \section{TD(\texorpdfstring{$\lambda$}{Lambda})}
\end{itemize}
}

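+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ %% Numerical Illustration: Eligibility Trace Decay %%
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ % Short worked example of how the trace weights the TD error; the values
+ % gamma = 0.9 and lambda = 0.8 are chosen purely for illustration.
+ \frame{\frametitle{Illustration: trace-weighted updates}
+ Suppose $\gamma = 0.9$ and $\lambda = 0.8$, and a state-action pair was visited once, three steps before the current step $k$ (and not since). Its accumulating trace has decayed to
+ \begin{equation*}
+ 	z_k = (\gamma\lambda)^3 = 0.72^3 \approx 0.37,
+ \end{equation*}
+ so the SARSA($\lambda$) update still assigns it roughly $37\,\%$ of the current TD error (scaled by $\alpha$), whereas a pair last visited ten steps ago receives only $(\gamma\lambda)^{10} \approx 0.04$ of it.
+ }
+ 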
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ %% Algorithmic Implementation: Tabular SARSA($\lambda$) %%
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ \frame{\frametitle{Algorithmic implementation: SARSA($\lambda$)}
+ \setlength{\algomargin}{0.5em}
+ \begin{algorithm}[H]
+ \footnotesize
+ \SetKwInput{Input}{input}
+ \SetKwInput{Output}{output}
+ \SetKwInput{Init}{init}
+ \SetKwInput{Param}{parameter}
+ % \Output{estimate $\hat{q}_\pi$ or $\hat{q}^*$}
+ \Param{$\alpha\in(0,1]$, $\lambda\in(0,1]$, $\varepsilon\in\left\{\mathbb{R}|0<\varepsilon\ll 1\right\}$}
+ \Init{$\hat{q}(x,u)$ arbitrarily (except terminal states) $\forall\,\left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$}
+ \Init{$\pi$ to be $\varepsilon$-greedy with respect to $\hat{q}$ or to a given, fixed policy}
+ \For{$j=1,\ldots,J$ episodes}{
+ 	initialize $x_{0}$ and action $u_0 \sim \pi(\cdot | x_0)$\;
+ 	initialize $z_0(x, u) = 0$ $\forall\,\left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$
+ 
+ 	\Repeat{$x_k$ is terminal}{
+ 
+ 		take action $u_k$, observe $x_{k+1}$ and $r_{k+1}$\;
+ 		choose $u_{k+1} \sim \pi(\cdot | x_{k+1})$
+ 
+ 		$z_k(x, u) \leftarrow \gamma\lambda z_{k-1}(x, u) + \begin{cases}0, \quad\mbox{if } x_k \neq x \mbox{ or } u_k \neq u, \\ 1, \quad\mbox{if } x_k = x \mbox{ and } u_k = u.\end{cases}$
+ 		$\forall\,\left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$
+ 
+ 		$\delta \leftarrow r_{k+1} + \gamma\hat{q}(x_{k+1}, u_{k+1}) - \hat{q}(x_k, u_k)$
+ 
+ 		$\hat{q}(x, u) \leftarrow \hat{q}(x, u) + \alpha\delta z_k(x, u)$ $\forall\,\left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$
+ 
+ 		$k \leftarrow k+1$\;
+ 	}
+ }
+ \caption{SARSA($\lambda$) (output is an estimate $\hat{q}_\pi$ or $\hat{q}^*$)}
+ \label{algo:Sarsa_lambda}
+ \end{algorithm}
+ }
+ 
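+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ %% Implementation Sketch: Tabular SARSA($\lambda$) in Python %%
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ % Minimal Python sketch mirroring Algorithm \ref{algo:Sarsa_lambda}. It assumes
+ % that the listings package is loaded in the preamble and that an environment
+ % object exposes n_states, n_actions, reset() and step(u) -> (x_next, r, done);
+ % both the package and this interface are assumptions made for illustration only.
+ \begin{frame}[fragile]\frametitle{Implementation sketch: SARSA($\lambda$)}
+ \begin{lstlisting}[language=Python, basicstyle=\tiny\ttfamily]
+ import numpy as np
+ 
+ def eps_greedy(q, x, eps):
+     # epsilon-greedy action selection w.r.t. the current action-value table
+     if np.random.rand() < eps:
+         return np.random.randint(q.shape[1])
+     return int(np.argmax(q[x]))
+ 
+ def sarsa_lambda(env, episodes, alpha=0.1, gamma=0.9, lam=0.8, eps=0.1):
+     # tabular SARSA(lambda) with accumulating eligibility traces;
+     # env is an assumed interface: n_states, n_actions, reset(), step(u)
+     q = np.zeros((env.n_states, env.n_actions))
+     for _ in range(episodes):
+         z = np.zeros_like(q)            # reset traces at episode start
+         x = env.reset()
+         u = eps_greedy(q, x, eps)
+         done = False
+         while not done:
+             x_next, r, done = env.step(u)
+             u_next = eps_greedy(q, x_next, eps)
+             # TD error; the bootstrap term vanishes at terminal states
+             delta = r + gamma * q[x_next, u_next] * (not done) - q[x, u]
+             z *= gamma * lam            # decay all traces
+             z[x, u] += 1.0              # accumulate trace of the visited pair
+             q += alpha * delta * z      # update all state-action pairs at once
+             x, u = x_next, u_next
+     return q
+ \end{lstlisting}
+ \end{frame}
+ 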
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Sarsa Learning Comparison in Gridworld Example %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%