Skip to content

Commit d878ccb

Browse files
authored
Fix Eligibility traces for SARSA(lambda) (#20)
1 parent 3d71a59 commit d878ccb

File tree

2 files changed

+47
-8
lines changed

2 files changed

+47
-8
lines changed

lecture_slides/main.tex

+1-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@
184184

185185
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
186186
%%%Lecture Include Onlys%%%
187-
\includeonly{tex/Lecture14}
187+
\includeonly{tex/Lecture06}
188188
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
189189

190190
\begin{document}

lecture_slides/tex/Lecture06.tex

+46-7
Original file line numberDiff line numberDiff line change
@@ -603,16 +603,16 @@ \section{TD(\texorpdfstring{$\lambda$}{Lambda})}
603603
Based on the eligibility trace definition from \eqref{eq:Elig_trace} we can modify our value estimates:
604604
\begin{block}{TD($\lambda$) state-value update}
605605
The TD($\lambda$) state-value update is:
606-
\begin{equation}
606+
\begin{equation}
607607
\hat{v}(x) \leftarrow \hat{v}(x) + \alpha \left[r_{k+1} + \gamma \hat{v}(x_{k+1})- \hat{v}(x_k)\right]z_k(x) \quad \forall \, x\in\mathcal{X}.
608-
\end{equation}
608+
\end{equation}
609609
\end{block}
610610
\pause
611-
\begin{block}{TD($\lambda$) action-value update}
612-
The TD($\lambda$) action-value update is:
613-
\begin{equation}
614-
\hat{q}(x_k, u_k) \leftarrow \hat{q}(x_k, u_k) + \alpha\left[r_{k+1}+\gamma\hat{q}(x_{k+1}, u_{k+1}) - \hat{q}(x_k, u_k)\right]z_k(x_k) .
615-
\end{equation}
611+
\begin{block}{SARSA($\lambda$) action-value update}
612+
The SARSA($\lambda$) action-value update is:
613+
\begin{equation}
614+
\hat{q}(x, u) \leftarrow \hat{q}(x, u) + \alpha \left[r_{k+1}+ \gamma \hat{q}(x_{k+1}, u_{k+1}) - \hat{q}(x_k, u_k)\right]z_k(x, u) \quad \forall \, x\in\mathcal{X}, u\in\mathcal{U}.
615+
\end{equation}
616616
\end{block}
617617
\pause
618618
Already known prediction and control methods can be modified accordingly. In contrast to $n$-step forward updates, one can conclude:
@@ -622,6 +622,45 @@ \section{TD(\texorpdfstring{$\lambda$}{Lambda})}
622622
\end{itemize}
623623
}
624624

625+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
626+
%% Algorithmic Implementation: Tabular SARSA($\lambda$) %%
627+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
628+
\frame{\frametitle{Algorithmic implementation: SARSA($\lambda$)}
629+
\setlength{\algomargin}{0.5em}
630+
\begin{algorithm}[H]
631+
\footnotesize
632+
\SetKwInput{Input}{input}
633+
\SetKwInput{Output}{output}
634+
\SetKwInput{Init}{init}
635+
\SetKwInput{Param}{parameter}
636+
%\Output{estimate $\hat{q}_\pi$ or $\hat{q}^*$}
637+
\Param{$\alpha\in(0,1]$, $\lambda\in(0,1]$, $\varepsilon\in\left\{\mathbb{R}|0<\varepsilon\ll 1\right\}$}
638+
\Init{$\hat{q}(x,u)$ arbitrarily (except terminal states) $\forall \, \left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$}
639+
\Init{$\pi$ to be $\varepsilon$-greedy with respect to $\hat{q}$ or to a given, fixed policy}
640+
\For{$j=1,\ldots,J$ episodes}{
641+
initialize $x_{0}$ and action $u_0 \sim \pi(\cdot | x_0)$\;
642+
initialize $z_{-1}(x, u) = 0$ $\forall \, \left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$\;
643+
644+
\Repeat{$x_k$ is terminal}{
645+
646+
take action $u_k$, observe $x_{k+1}$ and $r_{k+1}$\;
647+
choose $u_{k+1} \sim \pi(\cdot | x_{k+1})$\;
648+
649+
$z_k(x, u) \leftarrow \gamma\lambda z_{k-1}(x, u)+\begin{cases}0, & \text{if } x_k \neq x \text{ or } u_k \neq u, \\ 1, & \text{if } x_k = x \text{ and } u_k = u.\end{cases}$
650+
$\forall \, \left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$\;
651+
652+
$\delta \leftarrow r_{k+1}+\gamma\hat{q}(x_{k+1}, u_{k+1}) - \hat{q}(x_k, u_k)$\;
653+
654+
$\hat{q}(x, u) \leftarrow \hat{q}(x, u) + \alpha \delta z_k(x, u)$ $\forall \, \left\{x\in\mathcal{X}, u\in\mathcal{U}\right\}$\;
655+
656+
$k \leftarrow k+1$\;
657+
}
658+
}
659+
\caption{SARSA($\lambda$) (output is an estimate $\hat{q}_\pi$ or $\hat{q}^*$)}
660+
\label{algo:Sarsa_lambda}
661+
\end{algorithm}
662+
}
663+
625664
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
626665
%% Sarsa Learning Comparison in Gridworld Example %%
627666
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

0 commit comments

Comments
 (0)