diff --git a/.gitignore b/.gitignore index 1e25f17126a5900b78b27d3822d62ca6a634a872..1ef69d936b5521202b3fd5df634913c18ac88286 100644 --- a/.gitignore +++ b/.gitignore @@ -4,11 +4,11 @@ exam_tabular_examples #solutions/ex01 #solutions/ex02 #solutions/ex03 -solutions/ex04 -solutions/ex05 -solutions/ex06 -solutions/ex07 -solutions/ex08 +#solutions/ex04 +#solutions/ex05 +#solutions/ex06 +#solutions/ex07 +#solutions/ex08 solutions/ex09 solutions/ex10 solutions/ex11 @@ -31,10 +31,10 @@ solutions/ex13 #irlc/tests/tests_week02.py #irlc/tests/tests_week03.py #irlc/tests/tests_week04.py -irlc/tests/tests_week05.py -irlc/tests/tests_week06.py -irlc/tests/tests_week07.py -irlc/tests/tests_week08.py +#irlc/tests/tests_week05.py +#irlc/tests/tests_week06.py +#irlc/tests/tests_week07.py +#irlc/tests/tests_week08.py irlc/tests/tests_week09.py irlc/tests/tests_week10.py irlc/tests/tests_week11.py @@ -68,10 +68,10 @@ irlc/exam/exam20*/solution # irlc/lectures/lec02 #irlc/lectures/lec03 #irlc/lectures/lec04 -irlc/lectures/lec05 -irlc/lectures/lec06 -irlc/lectures/lec07 -irlc/lectures/lec08 +#irlc/lectures/lec05 +#irlc/lectures/lec06 +#irlc/lectures/lec07 +#irlc/lectures/lec08 irlc/lectures/lec09 irlc/lectures/lec10 irlc/lectures/lec11 diff --git a/irlc/ex08/__init__.py b/irlc/ex08/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28514114cf38978975fea28d6e6670715223cfb8 --- /dev/null +++ b/irlc/ex08/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 8.""" diff --git a/irlc/ex08/bandit_example.py b/irlc/ex08/bandit_example.py new file mode 100644 index 0000000000000000000000000000000000000000..2eb8cccf7a9c703d2de0cbc66ec6e6d00b60d946 --- /dev/null +++ b/irlc/ex08/bandit_example.py @@ -0,0 +1,27 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +import matplotlib.pyplot as plt + +if __name__ == "__main__": + from irlc import Agent, train, savepdf + from irlc.ex08.bandits import StationaryBandit + bandit = StationaryBandit(k=10) # A 10-armed bandit + agent = Agent(bandit) # Recall the agent takes random actions + _, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500) + plt.plot(trajectories[0].reward) + plt.xlabel("Time step") + plt.ylabel("Reward per time step") + savepdf("dumbitA") + plt.show() + + agent = Agent(bandit) # Recall the agent takes random actions + for i in range(10): + _, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500) + regret = np.asarray([r['gab'] for r in trajectories[0].env_info[1:]]) + cum_regret = np.cumsum(regret) + plt.plot(cum_regret, label=f"Episode {i}") + plt.legend() + plt.xlabel("Time step") + plt.ylabel("Accumulated Regret") + savepdf("dumbitB") + plt.show() diff --git a/irlc/ex08/bandits.py b/irlc/ex08/bandits.py new file mode 100644 index 0000000000000000000000000000000000000000..5df7412724989bf621be957e6b40687283a8d044 --- /dev/null +++ b/irlc/ex08/bandits.py @@ -0,0 +1,213 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. 
Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). """ +import numpy as np +import matplotlib.pyplot as plt +from gymnasium import Env +from gymnasium.spaces import Discrete +from irlc import train +from tqdm import tqdm +import sys +from irlc import cache_read, cache_write, cache_exists + +class BanditEnvironment(Env): + r""" + A helper class for defining bandit problems similar to e.g. the 10-armed testbed discussed in (SB18). + We are going to implement the bandit problems as greatly simplified gym environments, as this will allow us to + implement the bandit agents using the familiar ``Agent`` class. I hope this way of doing it will make it clearer that bandits + are in fact a sort of reinforcement learning method. + + The following code shows an example of how to use a bandit environment: + + .. runblock:: pycon + + >>> from irlc.ex08.bandits import StationaryBandit + >>> env = StationaryBandit(k=10) # 10-armed testbed. + >>> env.reset() # Reset env.q_star + >>> s, r, _, _, info = env.step(3) + >>> print(f"The reward we got from taking arm a=3 was {r=}") + + """ + def __init__(self, k : int): + r""" + Initialize a bandit problem. The observation space is given a dummy value since bandit problems of the sort + (SB18) discuss don't have observations. + + :param k: The number of arms. + """ + super().__init__() + self.observation_space = Discrete(1) # Dummy observation space with a single observation. + self.action_space = Discrete(k) # The arms labelled 0,1,...,k-1. + self.k = k # Number of arms + + def reset(self): + r""" + Use this function to reset all internal parameters of the environment and get ready for a new episode. + In the (SB18) 10-armed bandit testbed, this would involve resetting the expected return + + .. math:: + q^*_a + + The function must return a dummy state and info dictionary to agree with the gym ``Env`` class, but their values are + irrelevant. + + :return: + - s - a state, for instance 0 + - info - the info dictionary, for instance {} + """ + raise NotImplementedError("Implement the reset method") + + def bandit_step(self, a): + r"""This helper function simplifies the definition of the environment's ``step``-function. + + Given an action :math:`a`, this function computes the reward :math:`r_t` obtained by taking that action + and the gab. The gab is the expected reward we miss out on by taking the potentially suboptimal action :math:`a` + and is defined as: + + .. math:: + \Delta = \max_{a'} q^*_{a'} - q^*_a + + Once implemented, the reward and regret enter into the ``step`` function as follows: + + .. runblock:: pycon + + >>> from irlc.ex08.bandits import StationaryBandit + >>> env = StationaryBandit(k=4) # 4-armed testbed. + >>> env.reset() # Reset all parameters. + >>> _, r, _, _, info = env.step(2) # Take action a=2 + >>> print(f"Reward from a=2 was {r=}, the gab was {info['gab']=}") + + :param a: The current action we take + :return: + - r - The reward we thereby incur + - gab - The regret incurred by taking this action (0 for an optimal action) + """ + reward = 0 # Compute the reward associated with arm a + gab = 0 # Compute the gab by comparing to the optimal arm's reward. + return reward, gab + + def step(self, action): + r"""You do not have to edit this function. + In a bandit environment, the step function is simplified greatly since there are no + states to keep track of.
It should simply return the reward incurred by the action ``a`` + and (for convenience) also return the gab in the ``info``-dictionary. + + :param action: The current action we take :math:`a_t` + :return: + - next_state - This is always ``None`` + - reward - The reward obtained by taking the given action. In (SB18) this is defined as :math:`r_t` + - terminated - Always ``False``. Bandit problems don't terminate. + - truncated - Always ``False`` + - info - For convenience, this includes the gab (used by the plotting methods) + + """ + reward, gab = self.bandit_step(action) + info = {'gab': gab} + return None, reward, False, False, info + +class StationaryBandit(BanditEnvironment): + r"""Implement the 'stationary bandit environment' which is described in (SB18, Section 2.3) + and used as a running example throughout the chapter. + + We will implement a version with a constant mean offset (q_star_mean), so that + + q* = x + q_star_mean, x ~ Normal(0,1) + + q_star_mean can just be considered to be zero at first. + """ + def __init__(self, k, q_star_mean=0): + super().__init__(k) + self.q_star_mean = q_star_mean + + def reset(self): + """ Set q^*_k = N(0,1) + mean_value. The mean_value is 0 in most examples. I.e., implement the 10-armed testbed environment. """ + self.q_star = np.random.randn(self.k) + self.q_star_mean + self.optimal_action = np.argmax(self.q_star) # Optimal action is the one with the largest q^*-value. + return 0, {} # The reset method in a gym Env must return a (dummy) state and a dictionary. + + def bandit_step(self, a): + """ Return the reward/gab for action a for the simple bandit. Use self.q_star (see reset-function above). + To implement it, compute the reward (see the description of the 10-armed testbed for more information. + How is it computed from q^*_k?) and also compute the gab. + + As a small hint, since we are computing the gab, it will in fact be the difference between the + q^* value for the optimal arm and the q^* value corresponding to the current arm. + Remember it is 0 if the optimal action is selected. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # Actual logic goes here. Use self.q_star[a] to get mean reward and np.random.randn() to generate random numbers. + return reward, gab + + def __str__(self): + return f"{type(self).__name__}_{self.q_star_mean}" + +""" +Helper function for running a bunch of bandit experiments and plotting the results. + +The function will run the agents in 'agents' (a list of bandit agents) +on the bandit environment 'bandit' and plot the result. + +Each agent will be evaluated for num_episodes episodes, and one episode consists of 'steps' steps. +However, to speed things up you can use the cache, and the bandit will not be evaluated for more than +'max_episodes' episodes in total over all cache runs.
+ +""" +def eval_and_plot(bandit, agents, num_episodes=2000, max_episodes=2000, steps=1000, labels=None, use_cache=True): + if labels is None: + labels = [str(agent) for agent in agents] + + f, axs = plt.subplots(nrows=3, ncols=1) + f.set_size_inches(10,7) + (ax1, ax2, ax3) = axs + for i,agent in enumerate(agents): + rw, oa, regret, num_episodes = run_agent(bandit, agent, episodes=num_episodes, max_episodes=max_episodes, steps=steps, use_cache=use_cache) + ax1.plot(rw, label=labels[i]) + ax2.plot(oa, label=labels[i]) + ax3.plot(regret, label=labels[i]) + + for ax in axs: + ax.grid() + ax.set_xlabel("Steps") + + ax1.set_ylabel("Average Reward") + ax2.set_ylabel("% optimal action") + ax3.set_ylabel("Regret $L_t$") + ax3.legend() + f.suptitle(f"Evaluated on {str(bandit)} for {num_episodes} episodes") + +def run_agent(env, agent, episodes=2000, max_episodes=2000, steps=1000, use_cache=False, verbose=True): + """ + Helper function. Most of the work involves the cache; the actual training is done by 'train'. + """ + C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = 0, 0, 0, 0 + if use_cache: + cache = f"cache/{str(env)}_{str(agent)}_{steps}.pkl" + if cache_exists(cache): + print("> Reading from cache", cache) + C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = cache_read(cache) + + regrets = [] + rewards = [] + cruns = max(0, min(episodes, max_episodes - C_n_episodes)) # Missing runs. + for _ in tqdm(range(cruns), file=sys.stdout, desc=str(agent),disable=not verbose): + stats, traj = train(env, agent, max_steps=steps, verbose=False, return_trajectory=True) + regret = np.asarray([r['gab'] for r in traj[0].env_info[1:]]) + regrets.append(regret) + rewards.append(traj[0].reward) + + regrets_cum_sum = C_regrets_cum_sum + oas_sum = C_oas_sum + rewards_sum = C_rewards_sum + episodes = C_n_episodes + if len(regrets) > 0: + regrets_cum_sum += np.cumsum(np.sum(np.stack(regrets), axis=0)) + oas_sum += np.sum(np.stack(regrets) == 0, axis=0) + rewards_sum += np.sum(np.stack(rewards), axis=0) + episodes += cruns + if use_cache and cruns > 0: + cache_write((regrets_cum_sum, oas_sum, rewards_sum, episodes), cache, protocol=4) + return rewards_sum/episodes, oas_sum/episodes, regrets_cum_sum/episodes, episodes diff --git a/irlc/ex08/gradient_agent.py b/irlc/ex08/gradient_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..34b296b0cd1dbcea66b194b63d16422b423df98e --- /dev/null +++ b/irlc/ex08/gradient_agent.py @@ -0,0 +1,48 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import savepdf +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.bandits import eval_and_plot, StationaryBandit +from irlc import Agent + +class GradientAgent(Agent): + def __init__(self, env, alpha=None, use_baseline=True): + self.k = env.action_space.n + self.alpha = alpha + self.baseline=use_baseline + self.H = np.zeros((self.k,)) + super().__init__(env) + + def Pa(self): + """ This helper method returns the probability distribution P(A=a) of choosing the + arm a as a vector + """ + pi_a = np.exp(self.H) + return pi_a / np.sum(pi_a) + + def pi(self, s, t, info_s=None): + if t == 0: + self.R_bar = 0 # average reward baseline + self.H *= 0 # Reset H to all-zeros. + self.t = t # Store the current time step.
+ return np.random.choice( self.k, p=self.Pa() ) + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 9 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"{type(self).__name__}_{self.alpha}_{'baseline' if self.baseline else 'no_baseline'}" + +if __name__ == "__main__": + baseline_bandit = StationaryBandit(k=10, q_star_mean=4) + alphas = [0.1, 0.4] + agents = [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=False) for alpha in alphas] + agents += [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=True) for alpha in alphas] + + labels = [f'Gradient Bandit alpha={alpha}' for alpha in alphas ] + labels += [f'With baseline: Gradient Bandit alpha={alpha}' for alpha in alphas ] + use_cache = False + eval_and_plot(baseline_bandit, agents, max_episodes=2000, num_episodes=100, labels=labels, use_cache=use_cache) + savepdf("gradient_baseline") + plt.show() diff --git a/irlc/ex08/grand_bandit_race.py b/irlc/ex08/grand_bandit_race.py new file mode 100644 index 0000000000000000000000000000000000000000..ad466aaaffc88b0b4aa43375b55640aa17dc096a --- /dev/null +++ b/irlc/ex08/grand_bandit_race.py @@ -0,0 +1,78 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc.ex08.nonstationary import MovingAverageAgent, NonstationaryBandit +from irlc.ex08.gradient_agent import GradientAgent +from irlc.ex08.ucb_agent import UCBAgent +from irlc import savepdf +import time + +if __name__ == "__main__": + print("Ladies and gentlemen. It is time for the graaand bandit race") + def intro(bandit, agents): + print("We are live from the beautiful surroundings where they will compete in:") + print(bandit) + print("Who will win? who will have the most regret? we are about to find out") + print("in a minute after a brief word from our sponsors") + time.sleep(1) + print("And we are back. Let us introduce todays contestants:") + for a in agents: + print(a) + print("And they are off!") + epsilon = 0.1 + alpha = 0.1 + c = 2 + # TODO: 1 lines missing. + raise NotImplementedError("Define the bandit here: bandit1 = ...") + # TODO: 5 lines missing. + raise NotImplementedError("define agents list here") + labels = ["Basic", "Moving avg.", "gradient", "Gradient+baseline", "UCB"] + ''' + Stationary, no offset. Vanilla setting. + ''' + intro(bandit1, agents) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (no offset)") + savepdf("grand_race_1") + plt.show() + ''' + Stationary, but with offset + ''' + print("Whew what a race. Let's get ready to next round:") + # TODO: 1 lines missing. + raise NotImplementedError("Define bandit2 = ... here") + intro(bandit2, agents) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (with offset)") + savepdf("grand_race_2") + plt.show() + ''' + Long (nonstationary) simulations + ''' + print("Whew what a race. Let's get ready to next round which will be a long one.") + # TODO: 1 lines missing. + raise NotImplementedError("define bandit3 here") + intro(bandit3, agents) + # TODO: 1 lines missing. 
+ raise NotImplementedError("call eval_and_plot here") + plt.suptitle("Non-stationary bandit (no offset)") + savepdf("grand_race_3") + plt.show() + + ''' + Stationary, no offset, long run. Exclude stupid bandits. + ''' + agents2 = [] + agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=False)] + agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=True)] + agents2 += [UCBAgent(bandit1, c=2)] + labels = ["Gradient", "Gradient+baseline", "UCB"] + intro(bandit1, agents2) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (no offset)") + savepdf("grand_race_4") + plt.show() diff --git a/irlc/ex08/nonstationary.py b/irlc/ex08/nonstationary.py new file mode 100644 index 0000000000000000000000000000000000000000..546c5ec8b7fd10f4a93a2869a2373d70756dc84c --- /dev/null +++ b/irlc/ex08/nonstationary.py @@ -0,0 +1,62 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc import savepdf + +class NonstationaryBandit(StationaryBandit): + def __init__(self, k, q_star_mean=0, reward_change_std=0.01): + self.reward_change_std = reward_change_std + super().__init__(k, q_star_mean) + + def bandit_step(self, a): + r""" Implement the non-stationary bandit environment (as described in (SB18)). + Hint: use reward_change_std * np.random.randn() to generate a single random number with the given std. + then add one to each coordinate. Remember you have to compute the regret as well, see StationaryBandit for ideas. + (remember the optimal arm will change when you add noise to q_star) """ + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + return super().bandit_step(a) + + def __str__(self): + return f"{type(self).__name__}_{self.q_star_mean}_{self.reward_change_std}" + + +class MovingAverageAgent(BasicAgent): + r""" + The simple bandit from (SB18, Section 2.4), but with moving average alpha + as described in (SB18, Eqn. (2.3)) + """ + def __init__(self, env, epsilon, alpha): + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"{type(self).__name__}_{self.epsilon}_{self.alpha}" + + +if __name__ == "__main__": + plt.figure(figsize=(10, 10)) + epsilon = 0.1 + alphas = [0.15, 0.1, 0.05] + + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + labels = [f"Basic agent, epsilon={epsilon}"] + # TODO: 1 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + use_cache = False # Set this to True to use cache (after code works!) 
+ eval_and_plot(bandit, agents, steps=10000, num_episodes=200, labels=labels, use_cache=use_cache) + savepdf("nonstationary_bandits") + plt.show() diff --git a/irlc/ex08/simple_agents.py b/irlc/ex08/simple_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..18bdca138c81829568366b66e538f8de928bc27f --- /dev/null +++ b/irlc/ex08/simple_agents.py @@ -0,0 +1,57 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc import Agent +from irlc import savepdf + +class BasicAgent(Agent): + r""" + Simple bandit as described on (SB18, Section 2.4). + """ + def __init__(self, env, epsilon): + super().__init__(env) + self.k = env.action_space.n + self.epsilon = epsilon + + def pi(self, s, t, info=None): + """ Since this is a bandit, s=None and can be ignored, while t refers to the time step in the current episode """ + if t == 0: + # At step 0 of episode. Re-initialize data structure. + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # compute action here + # TODO: 1 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + """ Since this is a bandit, done, s, sp, info_s, info_sp can all be ignored. + From the input arguments you should only use a + """ + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"BasicAgent_{self.epsilon}" + +if __name__ == "__main__": + N = 100000 + S = [np.max( np.random.randn(10) ) for _ in range(100000) ] + print( np.mean(S), np.std(S)/np.sqrt(N) ) + + use_cache = False # Set this to True to use cache (after code works!) + from irlc.utils.timer import Timer + timer = Timer(start=True) + R = 100 + steps = 1000 + env = StationaryBandit(k=10) + agents = [BasicAgent(env, epsilon=.1), BasicAgent(env, epsilon=.01), BasicAgent(env, epsilon=0) ] + eval_and_plot(env, agents, num_episodes=100, steps=1000, max_episodes=150, use_cache=use_cache) + savepdf("bandit_epsilon") + plt.show() + print(timer.display()) diff --git a/irlc/ex08/ucb_agent.py b/irlc/ex08/ucb_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..5c805ea4437b597f48f8cf5f0dc8608f95b96182 --- /dev/null +++ b/irlc/ex08/ucb_agent.py @@ -0,0 +1,45 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc import savepdf +from irlc import Agent + +class UCBAgent(Agent): + def __init__(self, env, c=2): + self.c = c + super().__init__(env) + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 2 lines missing. 
+ raise NotImplementedError("Train agent here") + + def pi(self, s, k, info=None): + if k == 0: + """ Initialize the agent""" + # TODO: 3 lines missing. + raise NotImplementedError("Reset agent (i.e., make it ready to learn in a new episode with a new optimal action)") + # TODO: 1 lines missing. + raise NotImplementedError("Compute (and return) optimal action") + + def __str__(self): + return f"{type(self).__name__}_{self.c}" + +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +if __name__ == "__main__": + r"""Reproduce (SB18, Fig. 2.4) comparing UCB agent to epsilon greedy """ + runs, use_cache = 100, False + c = 2 + eps = 0.1 + + steps = 1000 + env = StationaryBandit(k=10) + agents = [UCBAgent(env,c=c), BasicAgent(env, epsilon=eps)] + eval_and_plot(bandit=env, agents=agents, num_episodes=runs, steps=steps, max_episodes=2000, use_cache=use_cache) + savepdf("UCB_agent") + plt.show() diff --git a/irlc/lectures/lec08/__init__.py b/irlc/lectures/lec08/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec08/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec08/demo_bandit.py b/irlc/lectures/lec08/demo_bandit.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b47621d5cd20893a8af401abed59db0af9b63b --- /dev/null +++ b/irlc/lectures/lec08/demo_bandit.py @@ -0,0 +1,23 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.utils.bandit_graphics_environment import GraphicalBandit +import time +from irlc import train +from irlc.ex08.simple_agents import BasicAgent +from irlc import interactive + +def bandit_eps(autoplay=False): + env = GraphicalBandit(10, render_mode='human',frames_per_second=30) + env.reset() + agent = BasicAgent(env, epsilon=0.1) + agent.method = 'Epsilon-greedy' + env, agent = interactive(env, agent, autoplay=autoplay) + + t0 = time.time() + n = 3000 + stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False) + tpf = (time.time()-t0)/ n + print("tpf", tpf, 'fps', 1/tpf) + env.close() + +if __name__ == "__main__": + bandit_eps() diff --git a/irlc/lectures/lec08/demo_bandit_ucb.py b/irlc/lectures/lec08/demo_bandit_ucb.py new file mode 100644 index 0000000000000000000000000000000000000000..440c9760cc0dfcf90a0414525aa8521df768047a --- /dev/null +++ b/irlc/lectures/lec08/demo_bandit_ucb.py @@ -0,0 +1,26 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.utils.bandit_graphics_environment import GraphicalBandit +from irlc import interactive, train +# import numpy as np +import time + +def bandit_ucb(autoplay=False): + env = GraphicalBandit(10, render_mode='human', frames_per_second=30) + env.reset() + #env.viewer.show_q_star = True + #env.viewer.show_q_ucb = True + from irlc.ex08.ucb_agent import UCBAgent + agent = UCBAgent(env, c=1) + agent.method = 'UCB' + + env, agent = interactive(env, agent, autoplay=autoplay) + t0 = time.time() + n = 500 + stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False) + tpf = (time.time() - t0) / n + print("tpf", tpf, 'fps', 1 / tpf) + env.close() + + +if __name__ == "__main__": + bandit_ucb() diff --git a/irlc/tests/tests_week05.py b/irlc/tests/tests_week05.py index 4a7f813840b6670d6caa99c16576d2b90ff7572c..e863e8f73cfd4652465e33a8dacaeb15e661a0fe 100644 --- a/irlc/tests/tests_week05.py +++ b/irlc/tests/tests_week05.py @@ -79,20 +79,20 @@ class CartpoleCostQuestion(DirectSolverQuestion): from irlc.ex05.direct_cartpole_kelly import compute_solutions return compute_solutions()[1] -class BrachistochroneQuestion(DirectSolverQuestion): - """ Brachistochrone (unconstrained) """ - - @classmethod - def compute_solution(cls): - from irlc.ex05.direct_brachistochrone import compute_constrained_solutions - return compute_constrained_solutions()[1] - -class BrachistochroneConstrainedQuestion(DirectSolverQuestion): - """ Brachistochrone (constrained) """ - @classmethod - def compute_solution(cls): - from irlc.ex05.direct_brachistochrone import compute_constrained_solutions - return compute_constrained_solutions()[1] +# class BrachistochroneQuestion(DirectSolverQuestion): +# """ Brachistochrone (unconstrained) """ +# +# @classmethod +# def compute_solution(cls): +# from irlc.ex05.direct_brachistochrone import compute_constrained_solutions +# return compute_constrained_solutions()[1] +# +# class BrachistochroneConstrainedQuestion(DirectSolverQuestion): +# """ Brachistochrone (constrained) """ +# @classmethod +# def compute_solution(cls): +# from irlc.ex05.direct_brachistochrone import compute_constrained_solutions +# return compute_constrained_solutions()[1] class Week05Tests(Report): title = "Tests for week 05" @@ -105,8 +105,8 @@ class Week05Tests(Report): (DirectAgentPendulum, 10), # ok (CartpoleTimeQuestion, 5), # ok (CartpoleCostQuestion, 5), # ok - (BrachistochroneQuestion, 5), # ok - (BrachistochroneConstrainedQuestion, 10), # ok + # (BrachistochroneQuestion, 5), # ok + # (BrachistochroneConstrainedQuestion, 10), # ok ] if __name__ == '__main__': diff --git a/irlc/tests/tests_week08.py b/irlc/tests/tests_week08.py new file mode 100644 index 0000000000000000000000000000000000000000..340d69c01c3ef2cae94901444ba52b9887a47bef --- /dev/null +++ b/irlc/tests/tests_week08.py @@ -0,0 +1,278 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from unitgrade import UTestCase, Report, cache +import numpy as np +from irlc import train + + +def train_recording(env, agent, trajectories): + for t in trajectories: + env.reset() + for k in range(len(t.action)): + s = t.state[k] + r = t.reward[k] + a = t.action[k] + sp = t.state[k+1] + agent.pi(s,k) + agent.train(s, a, r, sp, done=k == len(t.action)-1) + + +class BanditQuestion(UTestCase): + """ Value (Q) function estimate """ + tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined. 
+ # testfun = QPrintItem.assertL2 + + # def setUpClass(cls) -> None: + # from irlc.ex08.simple_agents import BasicAgent + # from irlc.ex08.bandits import StationaryBandit + # env = StationaryBandit(k=10, ) + # agent = BasicAgent(env, epsilon=0.1) + # _, cls.trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # cls.Q = agent.Q + # cls.env = env + # cls.agent = agent + + def get_env_agent(self): + from irlc.ex08.simple_agents import BasicAgent + from irlc.ex08.bandits import StationaryBandit + env = StationaryBandit(k=10) + agent = BasicAgent(env, epsilon=0.1) + return env, agent + + @cache + def get_trajectories(self): + env, agent = self.get_env_agent() + _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + return trajectories + + # def precompute_payload(self): + # env, agent = self.get_env_agent() + # _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # return trajectories, agent.Q + + + def test_agent(self): + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + self.assertL2(agent.Q, tol=1e-5) + # return agent.Q + # self.Q = Q + # self.question.agent = agent + # return agent.Q + + # testfun = QPrintItem.assertL2 + + def test_action_distributin(self): + T = 10000 + tol = 1 / np.sqrt(T) * 5 + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + # for k in self._cache.keys(): print(k) + + from collections import Counter + counts = Counter([agent.pi(None, k) for k in range(T)]) + distrib = [counts[k] / T for k in range(env.k)] + self.assertL2(np.asarray(distrib), tol=tol) + + + # def process_output(self, res, txt, numbers): + # return res + + # def process_output(self, res, txt, numbers): + # return res + # + # def test(self, computed, expected): + # super().test(computed, self.Q) + +# class BanditQuestion(QPrintItem): +# # tol = 1e-6 +# tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined. +# title = "Value (Q) function estimate" +# testfun = QPrintItem.assertL2 +# +# def get_env_agent(self): +# from irlc.ex08.simple_agents import BasicAgent +# from irlc.ex08.bandits import StationaryBandit +# env = StationaryBandit(k=10, ) +# agent = BasicAgent(env, epsilon=0.1) +# return env, agent +# +# def precompute_payload(self): +# env, agent = self.get_env_agent() +# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) +# return trajectories, agent.Q +# +# def compute_answer_print(self): +# trajectories, Q = self.precomputed_payload() +# env, agent = self.get_env_agent() +# train_recording(env, agent, trajectories) +# self.Q = Q +# self.question.agent = agent +# return agent.Q +# +# def process_output(self, res, txt, numbers): +# return res +# +# def test(self, computed, expected): +# super().test(computed, self.Q) +# +# class BanditItemActionDistribution(QPrintItem): +# # Assumes setup has already been done. 
+# title = "Action distribution test" +# T = 10000 +# tol = 1/np.sqrt(T)*5 +# testfun = QPrintItem.assertL2 +# +# def compute_answer_print(self): +# # print("In agent print code") +# from collections import Counter +# counts = Counter( [self.question.agent.pi(None, k) for k in range(self.T)] ) +# distrib = [counts[k] / self.T for k in range(self.question.agent.env.k)] +# return np.asarray(distrib) +# +# def process_output(self, res, txt, numbers): +# return res +# +# class BanditQuestion(QuestionGroup): +# title = "Simple bandits" +# class SimpleBanditItem(BanditItem): +# #title = "Value function estimate" +# def get_env_agent(self): +# from irlc.ex08.simple_agents import BasicAgent +# from irlc.ex08.bandits import StationaryBandit +# env = StationaryBandit(k=10, ) +# agent = BasicAgent(env, epsilon=0.1) +# return env, agent +# class SimpleBanditActionDistribution(BanditItemActionDistribution): +# pass + + + +class GradientBanditQuestion(BanditQuestion): + """ Gradient agent """ + # class SimpleBanditItem(BanditItem): + # title = "Simple agent question" + def get_env_agent(self): + from irlc.ex08.bandits import StationaryBandit + from irlc.ex08.gradient_agent import GradientAgent + env = StationaryBandit(k=10) + agent = GradientAgent(env, alpha=0.05) + return env, agent + + # def precompute_payload(self): + # env, agent = self.get_env_agent() + # _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # return trajectories + + def test_agent(self): + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + self.assertL2(agent.H, tol=1e-5) + + + # def test(self, computed, expected): + # self.testfun(computed, self.H) + # + # class SimpleBanditActionDistribution(BanditItemActionDistribution): + # pass + + +# class GradientBanditQuestion(QuestionGroup): +# title = "Gradient agent" +# class SimpleBanditItem(BanditItem): +# # title = "Simple agent question" +# def get_env_agent(self): +# from irlc.ex08.bandits import StationaryBandit +# from irlc.ex08.gradient_agent import GradientAgent +# env = StationaryBandit(k=10) +# agent = GradientAgent(env, alpha=0.05) +# return env, agent +# +# def precompute_payload(self): +# env, agent = self.get_env_agent() +# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) +# return trajectories, agent.H +# +# def compute_answer_print(self): +# trajectories, H = self.precomputed_payload() +# env, agent = self.get_env_agent() +# train_recording(env, agent, trajectories) +# self.H = H +# self.question.agent = agent +# return agent.H +# +# def test(self, computed, expected): +# self.testfun(computed, self.H) +# +# class SimpleBanditActionDistribution(BanditItemActionDistribution): +# pass + + + +class UCBAgentQuestion(BanditQuestion): + """ UCB agent """ + # class UCBAgentItem(BanditItem): + def get_env_agent(self): + from irlc.ex08.bandits import StationaryBandit + from irlc.ex08.ucb_agent import UCBAgent + env = StationaryBandit(k=10) + agent = UCBAgent(env) + return env, agent + + # class UCBAgentActionDistribution(BanditItemActionDistribution): + # pass + + +# class UCBAgentQuestion(QuestionGroup): +# title = "UCB agent" +# class UCBAgentItem(BanditItem): +# def get_env_agent(self): +# from irlc.ex08.bandits import StationaryBandit +# from irlc.ex08.ucb_agent import UCBAgent +# env = StationaryBandit(k=10) +# agent = UCBAgent(env) +# return env, agent +# +# class UCBAgentActionDistribution(BanditItemActionDistribution): +# 
pass + +# class NonstatiotnaryAgentQuestion(QuestionGroup): +# title = "Nonstationary bandit environment" +# class NonstationaryItem(BanditItem): +# def get_env_agent(self): +# epsilon = 0.1 +# from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent +# bandit = NonstationaryBandit(k=10) +# agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15) +# return bandit, agent +# +# class NonstationaryActionDistribution(BanditItemActionDistribution): +# pass + +class NonstatiotnaryAgentQuestion(BanditQuestion): + """ UCB agent """ + # class UCBAgentItem(BanditItem): + def get_env_agent(self): + epsilon = 0.1 + from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent + bandit = NonstationaryBandit(k=10) + agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15) + return bandit, agent + +import irlc +class Week08Tests(Report): + title = "Tests for week 08" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (BanditQuestion, 10), + (GradientBanditQuestion, 10), + (UCBAgentQuestion, 5), + (NonstatiotnaryAgentQuestion, 5) + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week08Tests()) diff --git a/irlc/tests/unitgrade_data/BanditQuestion.pkl b/irlc/tests/unitgrade_data/BanditQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/BanditQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl b/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl deleted file mode 100644 index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..0000000000000000000000000000000000000000 Binary files a/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl and /dev/null differ diff --git a/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl b/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl deleted file mode 100644 index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..0000000000000000000000000000000000000000 Binary files a/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl and /dev/null differ diff --git a/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl b/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl and b/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl b/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl and b/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl b/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl index e9d2ed475214b22bc52f1cc0dfc8c04c71d9a2b9..8bcfd04385b49acb537aa90a6c1906443c00c348 100644 Binary files a/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl and b/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectMethods.pkl b/irlc/tests/unitgrade_data/DirectMethods.pkl index f81ab2560bf4752a237712f1df94fc8ae01ac0ce..1872c37be157b1d23e330e90fb98df324bc707a7 100644 Binary files a/irlc/tests/unitgrade_data/DirectMethods.pkl and b/irlc/tests/unitgrade_data/DirectMethods.pkl differ diff --git 
a/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl b/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl and b/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl b/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl index a511917ab3f43109f1d3fe37b025f2fde713339f..288459bca52e824a5d9dabdcb4cf10e164f64114 100644 Binary files a/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl and b/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl differ diff --git a/irlc/tests/unitgrade_data/Exam6Toy2d.pkl b/irlc/tests/unitgrade_data/Exam6Toy2d.pkl index f927b5ae7578002c5a8840c91705f5d4c7d806f0..06341fef90fd2beed50cccac023bdd729b480a91 100644 Binary files a/irlc/tests/unitgrade_data/Exam6Toy2d.pkl and b/irlc/tests/unitgrade_data/Exam6Toy2d.pkl differ diff --git a/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl b/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl index 6555641d17ccd50bdc906dab13481cdad59254cc..7de7875d690be1fc4143070c2139bd34f61288ae 100644 Binary files a/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl and b/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl differ diff --git a/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl b/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl b/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl index 4af21ecb688c99771e1897bc53ecbae1bc667b8f..94b38667b6a59b2bdd827e9569ad5bce677cc91e 100644 Binary files a/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl and b/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl b/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl index 1019b5db1c0a0ad9b3e14545aaf80055652fcb66..af4efa1cc7fc8336bfab2d97317419f4573a58da 100644 Binary files a/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl and b/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl b/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/PendulumQuestion.pkl b/irlc/tests/unitgrade_data/PendulumQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/PendulumQuestion.pkl and b/irlc/tests/unitgrade_data/PendulumQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl b/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl index cb3e05e507f46a771194aa47e9d478a38ea6dc4a..0a911216fa96ee726261d5fd6122f47c63b7becd 100644 Binary files a/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl and b/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl b/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl index 
5884cbbaec323589db6f77d93e93aab42b835c82..6174c0b3159b23350a66f8510986566388f9a9e9 100644 Binary files a/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl and b/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl b/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl index ef1b528ea3cb8460738583d5cf526ff19f05dc78..5da65912b9c77917947555ed5b62336969918a99 100644 Binary files a/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl and b/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl b/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl index 25f36de2f85c245c47e3f76ccfd13411d7dbb190..5d994baa391da54fd3a6e1c1a369b72a9df5f17a 100644 Binary files a/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl and b/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl b/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl index c6a6d86c444884d85398ab5d68b4a8f2d731b17c..4029b85e80a9ebbf315924351ada7ba445fcb24a 100644 Binary files a/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl and b/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl b/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl index d7b664c9873a3f229a3ada6b5c0794429402f080..547769c9bb40f7e2f9e061a3d24943b7bf016ea1 100644 Binary files a/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl and b/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl b/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl index ed6643cb8af26e6368d19072746fd41ae4c60ab0..f8b966396874d03b37f527e8166a7431bd63ce66 100644 Binary files a/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl and b/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3LQR.pkl b/irlc/tests/unitgrade_data/Problem3LQR.pkl index 604d5c20678f95431aae0b2c24d839ff98a641a0..cd8f6f6cd8072c224d9de2763d5585bdba4a6d80 100644 Binary files a/irlc/tests/unitgrade_data/Problem3LQR.pkl and b/irlc/tests/unitgrade_data/Problem3LQR.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3PID.pkl b/irlc/tests/unitgrade_data/Problem3PID.pkl index 535051b828a3939a46f439310b29198a5e080a0a..252cfd024c97e5da728820dacd87ab9910607247 100644 Binary files a/irlc/tests/unitgrade_data/Problem3PID.pkl and b/irlc/tests/unitgrade_data/Problem3PID.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl b/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl index 04713f394a37f31128be4aead3269f8c3c2e4695..0e1fc83741cb9bd0877d29de2b3828b78bdd5b01 100644 Binary files a/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl and b/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4DPAgent.pkl b/irlc/tests/unitgrade_data/Problem4DPAgent.pkl index 54dc9c584545f20c53ce6eab91442e358d24233e..178368d13873f75c43be9a31cb3dbdb10d5fef36 100644 Binary files a/irlc/tests/unitgrade_data/Problem4DPAgent.pkl and b/irlc/tests/unitgrade_data/Problem4DPAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl b/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl index 2e0efe223d9500337a95c76732a6fcfc3cbb2872..22065591b65be79d935c05472a7603be0e00bcdb 100644 Binary files 
a/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl and b/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl b/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl index b7f0a6a6d9def299660158f65d0b85af24d92699..42b50d8f321a365c574de2e27cc5dead749dbee4 100644 Binary files a/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl and b/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl b/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl index 70a85f4bcf6fc78c263690ba8b0491e7799bf59c..14b3e4b4c95270f0c2953a2cc41a66833ba99d7f 100644 Binary files a/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl and b/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl b/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl index 3968cbf7284d8d716e19ed9972ad53d4d3172bf7..33dfa81f677fd061a0a39b2c51757d929785cd80 100644 Binary files a/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl and b/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl b/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl index 4650e0a6d3fa65b094ffb89356fae69744d7793a..b61782009434e3024f670821a02eff567ea7220c 100644 Binary files a/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl and b/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl b/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl index e7da887928def8fb94b9e7e3d034e373928ce32e..354e3485c6913c4ed2b0e90c1416d05becf63c1c 100644 Binary files a/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl and b/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem7PIDCar.pkl b/irlc/tests/unitgrade_data/Problem7PIDCar.pkl index e22d2e24d83e87606e137a33987ca83d3c67a210..2ff576403f28ebc1f96c87a40defa18f2263737b 100644 Binary files a/irlc/tests/unitgrade_data/Problem7PIDCar.pkl and b/irlc/tests/unitgrade_data/Problem7PIDCar.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl b/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl index d455290c459b3de4a4a31f3905f4f131de9a2b78..c0103b3e977fa2b98a34cf16e69b4168cf7d8d53 100644 Binary files a/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl and b/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl differ diff --git a/irlc/tests/unitgrade_data/RendevouzItem.pkl b/irlc/tests/unitgrade_data/RendevouzItem.pkl index 91c3ae562ba9fce0668664f1fa58979cd070b2a7..2ea308be8ae3ae254027640d548e0f9972c8cfe6 100644 Binary files a/irlc/tests/unitgrade_data/RendevouzItem.pkl and b/irlc/tests/unitgrade_data/RendevouzItem.pkl differ diff --git a/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl b/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl differ diff --git a/irlc/utils/bandit_graphics_environment.py b/irlc/utils/bandit_graphics_environment.py index 391f0ea5f31d49a4bc70edddc624eb8d27fd018c..a050ed32a36636d75c61fb146fe25f8a5ad9afc3 100644 --- a/irlc/utils/bandit_graphics_environment.py +++ b/irlc/utils/bandit_graphics_environment.py @@ -50,7 +50,12 @@ class GraphicalBandit(BinaryBandit): def reset(self): s, info = super().reset() + if hasattr(self, 'agent'): + if hasattr(self.agent, 'Q'): del self.agent.Q + if 
hasattr(self.agent, 'N'): del self.agent.N
+        self.render()
+        return s, info
 
     def step(self, action):
@@ -217,7 +222,7 @@ class BanditViewer:
         reward = self.last_reward
         action = self.last_action
         self.ghost.set_direction(self.ghost.rand_eyes()) # Random eyes.
-        if reward is not None:
+        if reward is not None and action is not None:
             if reward <= 0:
                 self.ghost.kill()
             else:
diff --git a/solutions/ex07/ilqr_TODO_1.py b/solutions/ex07/ilqr_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ada2c3a7adcd0a1d3995bd723716a749f4c08e
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_1.py
@@ -0,0 +1,2 @@
+ l, L = [np.zeros((m,))]*N, [np.zeros((m,n))]*N
+ x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_10.py b/solutions/ex07/ilqr_TODO_10.py
new file mode 100644
index 0000000000000000000000000000000000000000..97423181e2f17ec2e986788e79b4fd978a7c6838
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_10.py
@@ -0,0 +1 @@
+ Delta, mu = max(1.0, Delta) * Delta_0, max(mu_min, mu * Delta) # Increase
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_11.py b/solutions/ex07/ilqr_TODO_11.py
new file mode 100644
index 0000000000000000000000000000000000000000..dafc65dda3f1d6f36c21e3a0a612e97e4606bafb
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_11.py
@@ -0,0 +1,4 @@
+ R = c_uu
+ H = c_ux
+ q, qN = c_x[:-1], c_x[-1]
+ r = c_u
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_12.py b/solutions/ex07/ilqr_TODO_12.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5823c6a0680218628af756b10761f3211c04501
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_12.py
@@ -0,0 +1,4 @@
+ # fs = [(v[1],v[2]) for v in [model.f(x, u, k, compute_jacobian=True) for k, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]]
+ fs = [model.f_jacobian(x, u, k) for k, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]
+
+ A, B = zip(*fs)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_13.py b/solutions/ex07/ilqr_TODO_13.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ceba56deb0495616a22805f7a38bb9733ec181
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_13.py
@@ -0,0 +1,2 @@
+ gs = [model.cost.c(x, u, i, compute_gradients=True) for i, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]
+ c, c_x, c_u, c_xx, c_ux, c_uu = zip(*gs)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_14.py b/solutions/ex07/ilqr_TODO_14.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c925083e59871befab170fb5ec809e6311e5452
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_14.py
@@ -0,0 +1,3 @@
+ c = c + (cN,)
+ c_x = c_x + (c_xN,)
+ c_xx = c_xx + (c_xxN,)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_15.py b/solutions/ex07/ilqr_TODO_15.py
new file mode 100644
index 0000000000000000000000000000000000000000..73a0fa4ac59c0fedc7f4d7e87645ab4372a80399
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_15.py
@@ -0,0 +1 @@
+ u_star[i] = u_bar[i] + alpha * l[i] + L[i] @ (x[i] - x_bar[i])
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_16.py b/solutions/ex07/ilqr_TODO_16.py
new file mode 100644
index 0000000000000000000000000000000000000000..20904f4a15cb7b9db30da28ccb88bb21a1bb040d
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_16.py
@@ -0,0 +1 @@
+ x[i + 1] = model.f(x[i], u_star[i], i)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_2.py b/solutions/ex07/ilqr_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ff3ca1047f1886bf4e94a5705d33b3eb88f1fc
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_2.py
@@ -0,0 +1,2 @@
+ A, B, c, c_x, c_u, c_xx, c_ux, c_uu = get_derivatives(model, x_bar, u_bar)
+ J = sum(c)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_3.py b/solutions/ex07/ilqr_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..417755d3e4956fbbf5bc0bb596377781412cb47b
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_3.py
@@ -0,0 +1 @@
+ L, l = backward_pass(A, B, c_x, c_u, c_xx, c_ux, c_uu, mu)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_4.py b/solutions/ex07/ilqr_TODO_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..6db866f5b27f7921e409d179462e51c8b8ea1420
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_4.py
@@ -0,0 +1 @@
+ x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l, alpha=alpha)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_5.py b/solutions/ex07/ilqr_TODO_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ead664be5795faaebda22e5321ae4673eefdebf
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_5.py
@@ -0,0 +1,2 @@
+ l, L = [np.zeros((m,))] * N, [np.zeros((m, n))] * N
+ x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_6.py b/solutions/ex07/ilqr_TODO_6.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e7b84c3c79530d3e24b420e11f4d15140e6fcd9
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_6.py
@@ -0,0 +1,2 @@
+ A, B, c, c_x, c_u, c_xx, c_ux, c_uu = get_derivatives(model, x_bar, u_bar)
+ J_prime = sum(c)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_7.py b/solutions/ex07/ilqr_TODO_7.py
new file mode 100644
index 0000000000000000000000000000000000000000..f23d1c9bd15a76d13938844705782aa675cc9e7e
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_7.py
@@ -0,0 +1 @@
+ L, l = backward_pass(A, B, c_x, c_u, c_xx, c_ux, c_uu, mu)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_8.py b/solutions/ex07/ilqr_TODO_8.py
new file mode 100644
index 0000000000000000000000000000000000000000..123b9bb988a13d8f828a0af3b37c6a9f213495ff
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_8.py
@@ -0,0 +1 @@
+ J_new = cost_of_trajectory(model, x_hat, u_hat)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_9.py b/solutions/ex07/ilqr_TODO_9.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fe4de670d25185a976a1fb36ae4eb04e539a456
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_9.py
@@ -0,0 +1 @@
+ Delta, mu = min(1.0, Delta) / Delta_0, max(0, mu*Delta)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_agent_TODO_1.py b/solutions/ex07/ilqr_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..5632d146890b0bcadefd5b04fccf7393338debcf
--- /dev/null
+++ b/solutions/ex07/ilqr_agent_TODO_1.py
@@ -0,0 +1 @@
+ u = self.ubar[k] + self.L[k]@ (x-self.xbar[k]) + self.l[k]
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_pendulum_TODO_1.py b/solutions/ex07/ilqr_pendulum_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee07f700ec7c23c71a96ddb29e0e4e1eb9ba7e5
--- /dev/null
+++ b/solutions/ex07/ilqr_pendulum_TODO_1.py
@@ -0,0 +1 @@
+ xs, us, J_hist, L, l = ilqr(model, N, x0, n_iter=n_iter, use_linesearch=use_linesearch)
\ No newline at end of file
diff --git a/solutions/ex07/linearization_agent_TODO_1.py b/solutions/ex07/linearization_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a9c162e64d62675433a1f5174da17654e528e42
--- /dev/null
+++ b/solutions/ex07/linearization_agent_TODO_1.py
@@ -0,0 +1,4 @@
+ xp = model.f(xbar, ubar, k=0)
+ A, B = model.f_jacobian(xbar, ubar, k=0)
+
+ d = xp - A @ xbar - B @ ubar
\ No newline at end of file
diff --git a/solutions/ex07/linearization_agent_TODO_2.py b/solutions/ex07/linearization_agent_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f84fac4dd7bf0800e2734cee2d913762b410355e
--- /dev/null
+++ b/solutions/ex07/linearization_agent_TODO_2.py
@@ -0,0 +1 @@
+ (self.L, self.l), (V, v, vc) = LQR(A=[A]*N, B=[B]*N, d=[d]*N, Q=[Q]*N, q=[q]*N, R=[self.model.cost.R]*N)
\ No newline at end of file
diff --git a/solutions/ex07/linearization_agent_TODO_3.py b/solutions/ex07/linearization_agent_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..797bd934a4c3c08c245bbdbe956a9008510367a8
--- /dev/null
+++ b/solutions/ex07/linearization_agent_TODO_3.py
@@ -0,0 +1 @@
+ u = self.L[0] @ x + self.l[0]
\ No newline at end of file
diff --git a/solutions/ex08/bandits_TODO_1.py b/solutions/ex08/bandits_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ee400597c9534a859b929037ae3c6483ec44ee
--- /dev/null
+++ b/solutions/ex08/bandits_TODO_1.py
@@ -0,0 +1,2 @@
+ reward = self.q_star[a] + np.random.randn()
+ gab = self.q_star[self.optimal_action] - self.q_star[a]
\ No newline at end of file
diff --git a/solutions/ex08/gradient_agent_TODO_1.py b/solutions/ex08/gradient_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c166f9787f05f6acd9b977fe814558616388c3d
--- /dev/null
+++ b/solutions/ex08/gradient_agent_TODO_1.py
@@ -0,0 +1,9 @@
+ pi_a = self.Pa()
+ for b in range(self.k):
+     if b == a:
+         self.H[b] += self.alpha * (r - self.R_bar) * (1 - pi_a[b])
+     else:
+         self.H[b] -= self.alpha * (r - self.R_bar) * pi_a[b]
+
+ if self.baseline:
+     self.R_bar = self.R_bar + (self.alpha if self.alpha is not None else 1/(self.t+1)) * (r - self.R_bar)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_1.py b/solutions/ex08/grand_bandit_race_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e8845cc54405e1d5cac07b51eb414b9170e1ef
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_1.py
@@ -0,0 +1 @@
+ bandit1 = StationaryBandit(k=10)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_2.py b/solutions/ex08/grand_bandit_race_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ffb8db36b8c1313eab01a4668a45f30d3cc243
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_2.py
@@ -0,0 +1,5 @@
+ agents = [BasicAgent(bandit1, epsilon=epsilon)]
+ agents += [MovingAverageAgent(bandit1, epsilon=epsilon, alpha=alpha)]
+ agents += [GradientAgent(bandit1, alpha=alpha,use_baseline=False) ]
+ agents += [GradientAgent(bandit1, alpha=alpha,use_baseline=True) ]
+ agents += [UCBAgent(bandit1, c=2)]
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_3.py b/solutions/ex08/grand_bandit_race_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..807579c172456281f047d47aa499d87869460af6
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_3.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit1, agents, max_episodes=2000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_4.py b/solutions/ex08/grand_bandit_race_TODO_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a9cb82fb4ef4d9adbd1f7d726d53189b2827710
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_4.py
@@ -0,0 +1 @@
+ bandit2 = StationaryBandit(k=10, q_star_mean=4)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_5.py b/solutions/ex08/grand_bandit_race_TODO_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a6cfc7bd723679fac58db343018033760fcd8f9
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_5.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit2, agents, max_episodes=2000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_6.py b/solutions/ex08/grand_bandit_race_TODO_6.py
new file mode 100644
index 0000000000000000000000000000000000000000..20c9ba027fd3dc0a70ea72bf97622c69031199f3
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_6.py
@@ -0,0 +1 @@
+ bandit3 = NonstationaryBandit(k=10)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_7.py b/solutions/ex08/grand_bandit_race_TODO_7.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2a5676b54581b39119cddc327bdf79459f97f57
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_7.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit3, agents, max_episodes=2000, steps=10000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_8.py b/solutions/ex08/grand_bandit_race_TODO_8.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34bc62119f16531be2e9792bf8b442f365132d2
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_8.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit1, agents2, steps=10000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_1.py b/solutions/ex08/nonstationary_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e53da8bcceffb8f6b6c54f0cdfe39eaef44f9959
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_1.py
@@ -0,0 +1,2 @@
+ self.q_star += self.reward_change_std * np.random.randn(self.k)
+ self.optimal_action = np.argmax(self.q_star)
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_2.py b/solutions/ex08/nonstationary_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..26dc95b4931d7fab277c37fff8b5894f3315c332
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_2.py
@@ -0,0 +1,2 @@
+ self.alpha=alpha
+ super().__init__(env, epsilon=epsilon)
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_3.py b/solutions/ex08/nonstationary_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..d45b3a0f69beac0af8bfd8249534906f13bdd167
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_3.py
@@ -0,0 +1 @@
+ self.Q[a] = self.Q[a] + self.alpha * (r-self.Q[a])
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_4.py b/solutions/ex08/nonstationary_TODO_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ffd80476a720982fcd5655d5b58f00ee8e6d42
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_4.py
@@ -0,0 +1,4 @@
+ bandit = NonstationaryBandit(k=10)
+
+ agents = [BasicAgent(bandit, epsilon=epsilon)]
+ agents += [MovingAverageAgent(bandit, epsilon=epsilon, alpha=alpha) for alpha in alphas]
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_5.py b/solutions/ex08/nonstationary_TODO_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..9742313984f8906af30c34cac26785b1b2ec8791
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_5.py
@@ -0,0 +1 @@
+ labels += [f"Mov.avg. agent, epsilon={epsilon}, alpha={alpha}" for alpha in alphas]
\ No newline at end of file
diff --git a/solutions/ex08/simple_agents_TODO_1.py b/solutions/ex08/simple_agents_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..5416e07d0958fe82276fa186510f0402eafc7ece
--- /dev/null
+++ b/solutions/ex08/simple_agents_TODO_1.py
@@ -0,0 +1,2 @@
+ self.Q = np.zeros((self.k,))
+ self.N = np.zeros((self.k,))
\ No newline at end of file
diff --git a/solutions/ex08/simple_agents_TODO_2.py b/solutions/ex08/simple_agents_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d91b3938f2b78d14de98a867793d8b33f61cae55
--- /dev/null
+++ b/solutions/ex08/simple_agents_TODO_2.py
@@ -0,0 +1 @@
+ return np.random.randint(self.k) if np.random.rand() < self.epsilon else np.argmax(self.Q)
\ No newline at end of file
diff --git a/solutions/ex08/simple_agents_TODO_3.py b/solutions/ex08/simple_agents_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..df218f01d8927b72f1920bc2841a8f2ae0b0e616
--- /dev/null
+++ b/solutions/ex08/simple_agents_TODO_3.py
@@ -0,0 +1,2 @@
+ self.N[a] = self.N[a] + 1
+ self.Q[a] = self.Q[a] + 1/self.N[a] * (r-self.Q[a])
\ No newline at end of file
diff --git a/solutions/ex08/ucb_agent_TODO_1.py b/solutions/ex08/ucb_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..4812f63b0e4fc5378d676a4d2e0459c3d2b07ca8
--- /dev/null
+++ b/solutions/ex08/ucb_agent_TODO_1.py
@@ -0,0 +1,2 @@
+ self.N[a] += 1
+ self.Q[a] += 1/self.N[a] * (r - self.Q[a])
\ No newline at end of file
diff --git a/solutions/ex08/ucb_agent_TODO_2.py b/solutions/ex08/ucb_agent_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..437563cd1c647bce69dc8778e93c5bf9854f0a5c
--- /dev/null
+++ b/solutions/ex08/ucb_agent_TODO_2.py
@@ -0,0 +1,3 @@
+ k = self.env.action_space.n
+ self.Q = np.zeros((k,))
+ self.N = np.zeros((k,))
\ No newline at end of file
diff --git a/solutions/ex08/ucb_agent_TODO_3.py b/solutions/ex08/ucb_agent_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..59255040547725524492f30aa7b91b3267f04c27
--- /dev/null
+++ b/solutions/ex08/ucb_agent_TODO_3.py
@@ -0,0 +1 @@
+ return np.argmax( self.Q + self.c * np.sqrt( np.log(k+1)/(self.N+1e-8) ) )
\ No newline at end of file
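
For readers skimming the ex08 solution snippets above: they implement the standard incremental bandit updates from (SB18) — sample-average and constant-alpha value estimates, epsilon-greedy and UCB action selection. The following is a minimal, self-contained sketch of the same rules outside the irlc framework; every name in it (run_testbed, true_means, the default parameters) is illustrative only and not part of the course API or of this patch.

# Hypothetical standalone sketch -- not part of the patch or the irlc package.
import numpy as np

def run_testbed(k_arms=10, steps=500, epsilon=0.1, c=2.0, rule="eps-greedy", seed=0):
    """Average reward of a simple agent on a stationary k-armed Gaussian testbed."""
    rng = np.random.default_rng(seed)
    true_means = rng.normal(size=k_arms)   # q*(a); fixed because the bandit is stationary
    Q = np.zeros(k_arms)                   # value estimates
    N = np.zeros(k_arms)                   # pull counts
    total_reward = 0.0
    for t in range(steps):
        if rule == "eps-greedy":           # explore with probability epsilon, else act greedily
            a = int(rng.integers(k_arms)) if rng.random() < epsilon else int(np.argmax(Q))
        else:                              # UCB: optimism bonus that shrinks as N[a] grows
            a = int(np.argmax(Q + c * np.sqrt(np.log(t + 1) / (N + 1e-8))))
        r = true_means[a] + rng.normal()   # noisy reward centred on q*(a)
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]          # incremental sample-average update
        total_reward += r
    return total_reward / steps

if __name__ == "__main__":
    print("eps-greedy:", run_testbed(rule="eps-greedy"))
    print("UCB:       ", run_testbed(rule="ucb"))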