# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 11."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from math import floor
from gymnasium.spaces.box import Box
import numpy as np
from irlc.ex09.rl_agent import _masked_actions
from irlc.utils.common import defaultdict2
class FeatureEncoder:
r"""
The idea behind linear function approximation of :math:`Q`-values is that
- We initialize (and eventually learn) a :math:`d`-dimensional weight vector :math:`w \in \mathbb{R}^d`
- We assume there exists a function to compute a :math:`d`-dimensional feature vector :math:`x(s,a) \in \mathbb{R}^d`
- The :math:`Q`-values are then represented as
.. math::
Q(s,a) = x(s,a)^\top w
Learning is therefore entirely about updating :math:`w`.
The following example shows how you initialize the linear :math:`Q`-values and compute them in a given state:
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8)
>>> s, _ = env.reset()
>>> a = env.action_space.sample()
>>> Q(s,a) # Compute a Q-value.
>>> Q.d # Get the number of dimensions
>>> Q.x(s,a)[:4] # Get the first four coordinates of the x-vector
>>> Q.w[:4] # Get the first four coordinates of the w-vector
"""
def __init__(self, env):
"""
Initialize the feature encoder. It requires an environment to know the number of actions and dimension of the state space.
:param env: An openai Gym ``Env``.
"""
self.env = env
self.w = np.zeros((self.d, ))
self._known_masks = {}
def q_default(s):
from irlc.utils.common import DiscreteTextActionSpace
if s in self._known_masks:
return {a: 0 for a in range(self.env.action_space.n) if
self._known_masks[s][(a - self.env.action_space.start) if not isinstance(self.env.action_space, DiscreteTextActionSpace) else a] == 1}
else:
return {a: 0 for a in range(self.env.action_space.n)}
self.q_ = defaultdict2(lambda s: q_default(s))
@property
def d(self):
""" Get the number of dimensions of :math:`w`
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8) # Same encoding as Sutton & Barto
>>> Q.d
"""
raise NotImplementedError()
def x(self, s, a):
"""
Computes the :math:`d`-dimensional feature vector :math:`x(s,a)`
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8) # Same encoding as Sutton & Barto
>>> s, info = env.reset()
>>> x = Q.x(s, env.action_space.sample())
:param s: A state :math:`s`
:param a: An action :math:`a`
:return: Feature vector :math:`x(s,a)`
"""
raise NotImplementedError()
def get_Qs(self, state, info_s=None):
"""
This is a helper function; it is intended only for internal use.
:param state:
:param info_s:
:return:
"""
if info_s is not None and 'mask' in info_s and not isinstance(state, np.ndarray):
if state not in self._known_masks:
self._known_masks[state] = info_s['mask']
# Probably a good idea to check the Q-values are okay...
avail_actions = _masked_actions(self.env.action_space, info_s['mask'])
self.q_[state] = {a: self.q_[state][a] for a in avail_actions}
from irlc.pacman.pacman_environment import PacmanEnvironment
from irlc.pacman.pacman_utils import Actions
if isinstance(state, np.ndarray):
actions = tuple(range(self.env.action_space.n))
elif isinstance(self.env, PacmanEnvironment):
actions = _masked_actions(self.env.action_space, info_s['mask'])
actions = tuple([self.env.action_space.actions[n] for n in actions])
else:
actions = tuple(self.q_[state].keys())
Qs = tuple([self(state,a) for a in actions])
# TODO: Implement masking and masking-cache.
return actions, Qs
def get_optimal_action(self, state, info=None):
r"""
For a given state ``state``, this function returns the optimal action for that state.
.. math::
a^* = \arg\max_a Q(s,a)
An example:
.. runblock:: pycon
>>> from irlc.ex09.rl_agent import TabularAgent
>>> class MyAgent(TabularAgent):
... def pi(self, s, k, info=None):
... a_star = self.Q.get_optimal_action(s, info)
:param state: State to find the optimal action in :math:`s`
:param info: The ``info``-dictionary corresponding to this state
:return: The optimal action according to the Q-values :math:`a^*`
"""
actions, Qa = self.get_Qs(state, info)
if len(actions) == 0:
print("Bad actions list")
a_ = np.argmax(np.asarray(Qa) + np.random.rand(len(Qa)) * 1e-8)
return actions[a_]
def __call__(self, s, a):
"""
Evaluate the Q-values for the given state and action. An example:
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8) # Same encoding as Sutton & Barto
>>> s, info = env.reset()
>>> Q(s, env.action_space.sample()) # Compute Q(s,a)
:param s: A state :math:`s`
:param a: An action :math:`a`
:return: The Q-value :math:`Q(s,a) = x(s,a)^\top w`
"""
return self.x(s, a) @ self.w
def __getitem__(self, item):
raise Exception("Hi! You tried to access linear Q-values as Q[s,a]. You need to use Q(s,a). This choice signifies they are not represented as a table, but as a linear combination x(s,a)^T w")
def __setitem__(self, key, value):
raise Exception("Oy! You tried to set a linearly encoded Q-value as in Q[s, a] = new_q_value.\n This is not possible since they are represented as x(s,a)^T w. Rewrite the expression to update Q.w.")
class DirectEncoder(FeatureEncoder):
def __init__(self, env):
self.d_ = np.prod( env.observation_space.shape ) * env.action_space.n
# self.d_ = len(self.x(env.reset(), env.action_space.n))
super().__init__(env)
def x(self, s, a):
xx = np.zeros( (self.d,))
n = s.size
xx[n * a:n*(a+1) ] = s
return xx
@property
def d(self):
return self.d_
class GridworldXYEncoder(FeatureEncoder):
def __init__(self, env):
self.env = env
self.na = self.env.action_space.n
self.ns = 2
super().__init__(env)
@property
def d(self):
return self.na*self.ns
def x(self, s, a):
x,y = s
xx = [np.zeros(self.ns) for _ in range(self.na)]
xx[a][0] = x
xx[a][1] = y
# return xx[a]
xx = np.concatenate(xx)
return xx
class SimplePacmanExtractor(FeatureEncoder):
def __init__(self, env):
self.env = env
from irlc.pacman.feature_extractor import SimpleExtractor
# from reinforcement.featureExtractors import SimpleExtractor
self._extractor = SimpleExtractor()
self.fields = ["bias", "#-of-ghosts-1-step-away", "#-of-ghosts-1-step-away", "eats-food", "closest-food"]
super().__init__(env)
def x(self, s, a):
xx = np.zeros_like(self.w)
ap = a
for k, v in self._extractor.getFeatures(s, ap).items():
xx[self.fields.index(k)] = v
return xx
@property
def d(self):
return len(self.fields)
class LinearQEncoder(FeatureEncoder):
def __init__(self, env, tilings=8, max_size=2048):
r"""
Implements the tile-encoder described by (SB18)
:param env: The openai Gym environment we wish to solve.
:param tilings: Number of tilings (translations). Typically 8.
:param max_size: Maximum number of dimensions.
"""
if isinstance(env.observation_space, Box):
os = env.observation_space
low = os.low
high = os.high
scale = tilings / (high - low)
hash_table = IHT(max_size)
self.max_size = max_size
def tile_representation(s, action):
s_ = list( (s*scale).flat )
active_tiles = tiles(hash_table, tilings, s_, [action])
return active_tiles
self.get_active_tiles = tile_representation
else:
# raise Exception("Implement in new class")
#
# Use Fixed Sparse Representation. See:
# https://castlelab.princeton.edu/html/ORF544/Readings/Geramifard%20-%20Tutorial%20on%20linear%20function%20approximations%20for%20dynamic%20programming%20and%20RL.pdf
ospace = env.observation_space
simple = False
if not isinstance(ospace, tuple):
ospace = (ospace,)
simple = True
sz = []
for j,disc in enumerate(ospace):
sz.append( disc.n )
total_size = sum(sz)
csum = np.cumsum(sz,) - sz[0]
self.max_size = total_size * env.action_space.n
def fixed_sparse_representation(s, action):
if simple:
s = (s,)
s_encoded = [cs + ds + total_size * action for ds,cs in zip(s, csum)]
return s_encoded
self.get_active_tiles = fixed_sparse_representation
super().__init__(env)
def x(self, s, a):
x = np.zeros(self.d)
at = self.get_active_tiles(s, a)
x[at] = 1.0
return x
@property
def d(self):
return self.max_size
"""
The following code contains the tile-coding utilities copied from:
http://incompleteideas.net/tiles/tiles3.py-remove
"""
class IHT:
"""Structure to handle collisions"""
def __init__(self, size_val):
self.size = size_val
self.overfull_count = 0
self.dictionary = {}
def count(self):
return len(self.dictionary)
def full(self):
return len(self.dictionary) >= self.size
def get_index(self, obj, read_only=False):
d = self.dictionary
if obj in d:
return d[obj]
elif read_only:
return None
size = self.size
count = self.count()
if count >= size:
if self.overfull_count == 0:
print('IHT full, starting to allow collisions')
self.overfull_count += 1
return hash(obj) % self.size
else:
d[obj] = count
return count
def hash_coords(coordinates, m, read_only=False):
if isinstance(m, IHT): return m.get_index(tuple(coordinates), read_only)
if isinstance(m, int): return hash(tuple(coordinates)) % m
if m is None: return coordinates
def tiles(iht_or_size, num_tilings, floats, ints=None, read_only=False):
"""returns num-tilings tile indices corresponding to the floats and ints"""
if ints is None:
ints = []
qfloats = [floor(f * num_tilings) for f in floats]
tiles = []
for tiling in range(num_tilings):
tilingX2 = tiling * 2
coords = [tiling]
b = tiling
for q in qfloats:
coords.append((q + b) // num_tilings)
b += tilingX2
coords.extend(ints)
tiles.append(hash_coords(coords, iht_or_size, read_only))
return tiles
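# A minimal usage sketch (not part of the original file), assuming gymnasium and the
# 'MountainCar-v0' environment are available. It shows how the representation
# Q(s,a) = x(s,a)^T w defined above combines with a single semi-gradient step
# w <- w + alpha * (target - Q(s,a)) * x(s,a). The TD target below is a placeholder.
if __name__ == "__main__":
    import gymnasium as gym
    env = gym.make('MountainCar-v0')
    Q = LinearQEncoder(env, tilings=8)          # same tile coding as in (SB18)
    s, _ = env.reset()
    a = env.action_space.sample()
    alpha = 1 / 8                               # common choice: 1 / number of tilings
    target = -1.0                               # placeholder TD target, illustration only
    Q.w += alpha * (target - Q(s, a)) * Q.x(s, a)  # x(s,a) is the gradient of Q(s,a) wrt. w
    print("Q(s,a) after one update:", Q(s, a), "d =", Q.d)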
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from irlc.ex01.agent import train
import gymnasium as gym
from irlc import main_plot
import matplotlib.pyplot as plt
from irlc.ex11.q_agent import QAgent
class SarsaNAgent(QAgent):
r""" Implement the N-step semi-gradient sarsa agent from (SB18, Section 7.2)"""
def __init__(self, env, gamma=1, alpha=0.2, epsilon=0.1, n=1):
# Variables for TD-n
self.n = n # as in n-step Sarsa
# Buffer lists for previous (S_t, R_{t}, A_t) triplets
self.R, self.S, self.A = [None] * (self.n + 1), [None] * (self.n + 1), [None] * (self.n + 1)
super().__init__(env, gamma=gamma, alpha=alpha, epsilon=epsilon)
def pi(self, s, k, info=None):
self.t = k # Save current step in episode for use in train.
if self.t == 0: # First action is epsilon-greedy.
self.A[self.t] = self.pi_eps(s, info)
return self.A[self.t % (self.n+1)]
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# Recall we are given S_t, A_t, R_{t+1}, S_{t+1}, and done indicates whether S_{t+1} is terminal (i.e. whether t+1 = T).
n = self.n # n as in n-step sarsa.
t = self.t # Current time step t as in s_t.
if t == 0: # We are in the initial state. Reset buffer.
self.S[0], self.A[0] = s, a
# Store current observations in buffer.
self.S[(t+1)%(n+1)] = sp
self.R[(t+1)%(n+1)] = r
self.A[(t+1)%(n+1)] = self.pi_eps(sp, info_sp) if not done else -1
if done:
T = t+1
tau_steps_to_train = range(t - n + 1, T)
else:
T = 1e10
tau_steps_to_train = [t - n + 1]
# tau ranges over the time steps whose Q-values are updated in this call. The notation is compatible with that in (SB18).
for tau in tau_steps_to_train:
if tau >= 0:
"""
Compute the return for this tau-step and perform the relevant Q-update.
The first step is to compute the expected return G in the section below.
"""
# TODO: 4 lines missing.
raise NotImplementedError("Compute G= (expected return) here.")
S_tau, A_tau = self.S[tau%(n+1)], self.A[tau%(n+1)]
delta = (G - self._q(S_tau, A_tau))
if n == 1: # Check your implementation is correct when n=1 by comparing it with regular Sarsa learning.
delta_Sarsa = (r + (0 if done else self.gamma * self._q(sp,A_tau_n)) - self._q(S_tau,A_tau))
if abs(delta-delta_Sarsa) > 1e-10:
raise Exception("n=1 agreement with Sarsa learning failed. You have at least one bug!")
self._upd_q(S_tau, A_tau, delta)
def _q(self, s, a): return self.Q[s,a] # Using these helper methods will come in handy when we work with function approximators, but it is optional.
def _upd_q(self, s, a, delta): self.Q[s,a] += self.alpha * delta
def __str__(self):
return f"SarsaN_{self.gamma}_{self.epsilon}_{self.alpha}_{self.n}"
if __name__ == "__main__":
envn = 'CliffWalking-v0'
env = gym.make(envn)
from irlc.ex11.sarsa_agent import sarsa_exp
from irlc.ex11.q_agent import q_exp
agent = SarsaNAgent(env, n=5, epsilon=0.1,alpha=0.5)
exp = f"experiments/{envn}_{agent}"
for _ in range(10): # Train 10 times to get an idea about the average performance.
train(env, agent, exp, num_episodes=200, max_runs=10)
main_plot([q_exp, sarsa_exp, exp], smoothing_window=10) # plot with results from Q/Sarsa simulations.
plt.ylim([-100,0])
from irlc import savepdf
savepdf("n_step_sarsa_cliff")
plt.show()
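# Illustration only (not part of the original exercise file): a self-contained sketch of the
# n-step return from (SB18, Eq. 7.4),
#   G = R_{tau+1} + gamma*R_{tau+2} + ... + gamma^{n-1}*R_{tau+n} + gamma^n * Q(S_{tau+n}, A_{tau+n}),
# where the bootstrap term is dropped when tau + n >= T. It operates on plain lists indexed by t
# and a q-function callable; the function name and signature are made up for this sketch and are
# not used by the agent above.
def n_step_return_sketch(R, S, A, tau, n, T, gamma, q):
    """R[t] is the reward R_t (R[0] is unused); S[t], A[t] are the state/action at time t."""
    G = sum(gamma ** (i - tau - 1) * R[i] for i in range(tau + 1, min(tau + n, T) + 1))
    if tau + n < T:
        G += gamma ** n * q(S[tau + n], A[tau + n])
    return G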
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from irlc.ex09.mdp import GymEnv2MDP
from irlc.ex09.rl_agent import TabularAgent
from irlc import train
import gymnasium as gym
from irlc import main_plot
import matplotlib.pyplot as plt
from irlc import savepdf
from irlc.ex09.value_iteration_agent import ValueIterationAgent
class QAgent(TabularAgent):
r"""
Implement the Q-learning agent (SB18, Section 6.5)
Note that the Q-data structure already exists, as do helper functions useful for computing an epsilon-greedy policy.
You can access these as
> self.Q[s,a] = 31 # Set a Q-value.
See the TabularAgent class for more information.
"""
def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1):
self.alpha = alpha
super().__init__(env, gamma, epsilon)
def pi(self, s, k, info=None):
"""
Return current action using epsilon-greedy exploration. You should look at the TabularAgent class for ideas.
"""
# TODO: 1 lines missing.
raise NotImplementedError("Implement the epsilon-greedy policy here.")
return action
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
"""
Implement the Q-learning update rule, i.e. compute a* from the Q-values.
As a hint, note that self.Q[sp,a] corresponds to q(s_{t+1}, a) and
that what you need to update is self.Q[s, a] = ...
You may want to look at self.Q.get_optimal_action(state) to compute a = argmax_a Q[s,a].
"""
# TODO: 3 lines missing.
raise NotImplementedError("Update the Q[s,a]-values here.")
def __str__(self):
return f"QLearner_{self.gamma}_{self.epsilon}_{self.alpha}"
q_exp = f"experiments/cliffwalk_Q"
epsilon = 0.1
max_runs = 10
alpha = 0.5
def cliffwalk():
env = gym.make('CliffWalking-v0')
agent = QAgent(env, epsilon=epsilon, alpha=alpha)
train(env, agent, q_exp, num_episodes=200, max_runs=max_runs)
# As a baseline, we set up/evaluate a value-iteration agent to get an idea about the optimal performance.
# To do so, we need an MDP object. We create an MDP object out of the gym environment below.
# You can look at the code if you like, but it is simply a helper function converting one data structure to another:
# it produces the MDP object which our value-iteration implementation from the previous week requires.
mdp = GymEnv2MDP(env)
vi_exp = "experiments/cliffwalk_VI"
Vagent = ValueIterationAgent(env, mdp=mdp, epsilon=epsilon)
train(env, Vagent, vi_exp, num_episodes=200, max_runs=max_runs)
vi_exp_opt = "experiments/cliffwalk_VI_optimal"
Vagent_opt = ValueIterationAgent(env, mdp=mdp, epsilon=0) # Same, but with epsilon=0
train(env, Vagent_opt, vi_exp_opt, num_episodes=200, max_runs=max_runs)
exp_names = [q_exp, vi_exp, vi_exp_opt]
return env, exp_names
if __name__ == "__main__":
for _ in range(10):
env, exp_names = cliffwalk()
main_plot(exp_names, smoothing_window=10)
plt.ylim([-100, 0])
plt.title("Q-learning on " + env.spec.name)
savepdf("Q_learning_cliff")
plt.show()
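# Illustration only (not part of the original file): the textbook Q-learning update
# (SB18, Section 6.5) written against a plain dict-of-dicts, so the quantities referred to in the
# exercise above are unambiguous. The dict layout, name and signature are made up for this sketch.
def q_learning_update_sketch(Q, s, a, r, sp, done, alpha, gamma):
    """Q[s][a] holds a plain float; the update is performed in place."""
    max_q_sp = 0 if done else max(Q[sp].values())        # max_a' Q(s', a'); zero at terminal states
    Q[s][a] += alpha * (r + gamma * max_q_sp - Q[s][a])  # move Q(s,a) towards the TD target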
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import matplotlib.pyplot as plt
from irlc.ex11.q_agent import QAgent
from irlc import main_plot, savepdf
from irlc.ex01.agent import train
from irlc.ex11.q_agent import cliffwalk, alpha, epsilon
class SarsaAgent(QAgent):
r""" Implement the Sarsa control method from (SB18, Section 6.4). It is recommended you complete
the Q-agent first because the two methods are very similar and the Q-agent is easier to implement. """
def __init__(self, env, gamma=1, alpha=0.5, epsilon=0.1):
super().__init__(env, gamma=gamma, alpha=alpha, epsilon=epsilon)
def pi(self, s, k, info=None):
if k == 0:
""" we are at the beginning of the episode. Generate a by being epsilon-greedy"""
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
else:
""" Return the action self.a you generated during the train where you know s_{t+1} """
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
"""
Generate A' as self.a by being epsilon-greedy. Re-use code from the Agent class.
"""
# TODO: 1 lines missing.
raise NotImplementedError("self.a = ....")
""" now that you know A' = self.a, perform the update to self.Q[s,a] here """
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
def __str__(self):
return f"Sarsa{self.gamma}_{self.epsilon}_{self.alpha}"
sarsa_exp = f"experiments/cliffwalk_Sarsa"
if __name__ == "__main__":
env, q_experiments = cliffwalk() # get results from Q-learning
agent = SarsaAgent(env, epsilon=epsilon, alpha=alpha)
for _ in range(10):
train(env, agent, sarsa_exp, num_episodes=200, max_runs=10)
main_plot(q_experiments + [sarsa_exp], smoothing_window=10)
plt.ylim([-100, 0])
plt.title("Q and Sarsa learning on " + env.spec.name)
savepdf("QSarsa_learning_cliff")
plt.show()
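# Illustration only (not part of the original file): the Sarsa update (SB18, Section 6.4) on the
# same plain dict-of-dicts layout as the Q-learning sketch. The difference is that the target
# bootstraps on the action A' actually chosen in s', not on max_a' Q(s', a'). Name and signature
# are made up for this sketch.
def sarsa_update_sketch(Q, s, a, r, sp, ap, done, alpha, gamma):
    q_sp_ap = 0 if done else Q[sp][ap]                 # Q(S', A') for the action the policy will take
    Q[s][a] += alpha * (r + gamma * q_sp_ap - Q[s][a])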
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gymnasium as gym
from irlc.ex01.agent import train
from irlc import main_plot
import matplotlib.pyplot as plt
from irlc.ex11.q_agent import QAgent
from irlc.ex11.feature_encoder import LinearQEncoder
from irlc import savepdf
class LinearSemiGradQAgent(QAgent):
def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1, q_encoder=None):
""" The Q-values, as implemented using a function approximator, can now be accessed as follows:
>> self.Q(s,a) # Compute q-value
>> self.Q.x(s,a) # Compute gradient of the above expression wrt. w
>> self.Q.w # get weight-vector.
I would recommend inserting a breakpoint and investigating the above expressions yourself;
you can of course also check the LinearQEncoder class if you want to see how it is done in practice.
"""
super().__init__(env, gamma, epsilon=epsilon, alpha=alpha)
self.Q = LinearQEncoder(env, tilings=8) if q_encoder is None else q_encoder
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 4 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"LinearSemiGradQ{self.gamma}_{self.epsilon}_{self.alpha}"
num_of_tilings = 8
alpha = 1 / num_of_tilings
episodes = 300
x = "Episode"
experiment_q = "experiments/mountaincar_semigrad_q"
if __name__ == "__main__":
from irlc.ex10 import envs
env = gym.make("MountainCar500-v0")
for _ in range(10):
agent = LinearSemiGradQAgent(env, gamma=1, alpha=alpha, epsilon=0)
train(env, agent, experiment_q, num_episodes=episodes, max_runs=10)
main_plot(experiments=[experiment_q], x_key=x, y_key='Length', smoothing_window=30, resample_ticks=100)
savepdf("semigrad_q")
plt.show()
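# Illustration only (not part of the original file): one semi-gradient Q-learning step written
# against the documented FeatureEncoder interface (Q(s,a), Q.x(s,a), Q.w). The function name,
# signature and the explicit loop over env.action_space.n are made up for this sketch.
def semi_gradient_q_step_sketch(Q, env, s, a, r, sp, done, alpha, gamma):
    q_max = 0 if done else max(Q(sp, ap) for ap in range(env.action_space.n))  # max_a' Q(s', a')
    Q.w += alpha * (r + gamma * q_max - Q(s, a)) * Q.x(s, a)  # x(s,a) is the gradient of Q(s,a) wrt. w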
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import matplotlib.pyplot as plt
from irlc import main_plot, savepdf
from irlc.ex01.agent import train
import numpy as np
import gymnasium as gym
from irlc.ex11.semi_grad_q import LinearSemiGradQAgent
np.seterr(all='raise')
class LinearSemiGradSarsa(LinearSemiGradQAgent):
def __init__(self, env, gamma=0.99, epsilon=0.1, alpha=0.5, q_encoder=None):
r"""Implement the Linear semi-gradient Sarsa method from (SB18, Section 10.1)"""
super().__init__(env, gamma, epsilon=epsilon, alpha=alpha, q_encoder=q_encoder)
def pi(self, s, k, info=None):
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return action
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 4 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
if sum(np.abs(self.Q.w)) > 1e5: raise Exception("Weights diverged. Decrease alpha")
def __str__(self):
return f"LinSemiGradSarsa{self.gamma}_{self.epsilon}_{self.alpha}"
experiment_sarsa = "experiments/mountaincar_Sarsa"
if __name__ == "__main__":
from irlc.ex11.semi_grad_q import experiment_q, alpha, x
from irlc.ex10 import envs
env = gym.make("MountainCar500-v0")
for _ in range(10):
agent = LinearSemiGradSarsa(env, gamma=1, alpha=alpha, epsilon=0)
train(env, agent, experiment_sarsa, num_episodes=300, max_runs=10)
main_plot(experiments=[experiment_q, experiment_sarsa], x_key=x, y_key='Length', smoothing_window=30)
savepdf("semigrad_q_sarsa")
plt.show()
# Turn off averaging
main_plot(experiments=[experiment_q, experiment_sarsa], x_key=x, y_key='Length', smoothing_window=30, units="Unit", estimator=None)
savepdf("semigrad_q_sarsa_individual")
plt.show()
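# Illustration only (not part of the original file): the corresponding semi-gradient Sarsa step,
# which bootstraps on the action ap that the epsilon-greedy policy actually selects in s' instead
# of taking a maximum. Name and signature are made up for this sketch.
def semi_gradient_sarsa_step_sketch(Q, s, a, r, sp, ap, done, alpha, gamma):
    target = r + (0 if done else gamma * Q(sp, ap))
    Q.w += alpha * (target - Q(s, a)) * Q.x(s, a)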