# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 11."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from math import floor
from gymnasium.spaces.box import Box
import numpy as np
from irlc.ex09.rl_agent import _masked_actions
from irlc.utils.common import defaultdict2
class FeatureEncoder:
r"""
The idea behind linear function approximation of :math:`Q`-values is that
- We initialize (and eventually learn) a :math:`d`-dimensional weight vector :math:`w \in \mathbb{R}^d`
- We assume there exists a function to compute a :math:`d`-dimensional feature vector :math:`x(s,a) \in \mathbb{R}^d`
- The :math:`Q`-values are then represented as
.. math::
Q(s,a) = x(s,a)^\top w
Learning is therefore entirely about updating :math:`w`.
The following example shows how you initialize the linear :math:`Q`-values and compute them in a given state:
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8)
>>> s, _ = env.reset()
>>> a = env.action_space.sample()
>>> Q(s,a) # Compute a Q-value.
>>> Q.d # Get the number of dimensions
>>> Q.x(s,a)[:4] # Get the first four coordinates of the x-vector
>>> Q.w[:4] # Get the first four coordinates of the w-vector
"""
def __init__(self, env):
"""
Initialize the feature encoder. It requires an environment to know the number of actions and dimension of the state space.
:param env: An openai Gym ``Env``.
"""
self.env = env
self.w = np.zeros((self.d, ))
self._known_masks = {}
def q_default(s):
from irlc.utils.common import DiscreteTextActionSpace
if s in self._known_masks:
return {a: 0 for a in range(self.env.action_space.n) if
self._known_masks[s][(a - self.env.action_space.start) if not isinstance(self.env.action_space, DiscreteTextActionSpace) else a] == 1}
else:
return {a: 0 for a in range(self.env.action_space.n)}
self.q_ = defaultdict2(lambda s: q_default(s))
@property
def d(self):
""" Get the number of dimensions of :math:`w`
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8) # Same encoding as Sutton & Barto
>>> Q.d
"""
raise NotImplementedError()
def x(self, s, a):
"""
Computes the :math:`d`-dimensional feature vector :math:`x(s,a)`
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8) # Same encoding as Sutton & Barto
>>> s, info = env.reset()
>>> x = Q.x(s, env.action_space.sample())
:param s: A state :math:`s`
:param a: An action :math:`a`
:return: Feature vector :math:`x(s,a)`
"""
raise NotImplementedError()
def get_Qs(self, state, info_s=None):
"""
This is a helper function; it is intended only for internal use.
:param state:
:param info_s:
:return:
"""
if info_s is not None and 'mask' in info_s and not isinstance(state, np.ndarray):
if state not in self._known_masks:
self._known_masks[state] = info_s['mask']
# Probably a good idea to check the Q-values are okay...
avail_actions = _masked_actions(self.env.action_space, info_s['mask'])
self.q_[state] = {a: self.q_[state][a] for a in avail_actions}
from irlc.pacman.pacman_environment import PacmanEnvironment
from irlc.pacman.pacman_utils import Actions
if isinstance(state, np.ndarray):
actions = tuple(range(self.env.action_space.n))
elif isinstance(self.env, PacmanEnvironment):
actions = _masked_actions(self.env.action_space, info_s['mask'])
actions = tuple([self.env.action_space.actions[n] for n in actions])
else:
actions = tuple(self.q_[state].keys())
Qs = tuple([self(state,a) for a in actions])
# TODO: Implement masking and masking-cache.
return actions, Qs
def get_optimal_action(self, state, info=None):
r"""
For a given state ``state``, this function returns the optimal action for that state.
.. math::
a^* = \arg\max_a Q(s,a)
An example:
.. runblock:: pycon
>>> from irlc.ex09.rl_agent import TabularAgent
>>> class MyAgent(TabularAgent):
... def pi(self, s, k, info=None):
... a_star = self.Q.get_optimal_action(s, info)
:param state: State to find the optimal action in :math:`s`
:param info: The ``info``-dictionary corresponding to this state
:return: The optimal action according to the Q-values :math:`a^*`
"""
actions, Qa = self.get_Qs(state, info)
if len(actions) == 0:
print("Bad actions list")
a_ = np.argmax(np.asarray(Qa) + np.random.rand(len(Qa)) * 1e-8)
return actions[a_]
def __call__(self, s, a):
"""
Evaluate the Q-values for the given state and action. An example:
.. runblock:: pycon
>>> import gymnasium as gym
>>> from irlc.ex11.feature_encoder import LinearQEncoder
>>> env = gym.make('MountainCar-v0')
>>> Q = LinearQEncoder(env, tilings=8) # Same encoding as Sutton & Barto
>>> s, info = env.reset()
>>> Q(s, env.action_space.sample()) # Compute Q(s,a)
:param s: A state :math:`s`
:param a: An action :math:`a`
:return: The Q-value :math:`Q(s,a) = x(s,a)^\top w`
"""
return self.x(s, a) @ self.w
def __getitem__(self, item):
raise Exception("Hi! You tried to access linear Q-values as Q[s,a]. You need to use Q(s,a). This choice signifies they are not represented as a table, but as a linear combination x(s,a)^T w")
def __setitem__(self, key, value):
raise Exception("Oy! You tried to set a linearly encoded Q-value as in Q[s, a] = new_q_value.\n This is not possible since they are represented as x(s,a)^T w. Rewrite the expression to update Q.w.")
class DirectEncoder(FeatureEncoder):
def __init__(self, env):
self.d_ = np.prod( env.observation_space.shape ) * env.action_space.n
# self.d_ = len(self.x(env.reset(), env.action_space.n))
super().__init__(env)
def x(self, s, a):
xx = np.zeros( (self.d,))
n = s.size
xx[n * a:n*(a+1) ] = s
return xx
@property
def d(self):
return self.d_
class GridworldXYEncoder(FeatureEncoder):
def __init__(self, env):
self.env = env
self.na = self.env.action_space.n
self.ns = 2
super().__init__(env)
@property
def d(self):
return self.na*self.ns
def x(self, s, a):
x,y = s
xx = [np.zeros(self.ns) for _ in range(self.na)]
xx[a][0] = x
xx[a][1] = y
# return xx[a]
xx = np.concatenate(xx)
return xx
class SimplePacmanExtractor(FeatureEncoder):
def __init__(self, env):
self.env = env
from irlc.pacman.feature_extractor import SimpleExtractor
# from reinforcement.featureExtractors import SimpleExtractor
self._extractor = SimpleExtractor()
self.fields = ["bias", "#-of-ghosts-1-step-away", "#-of-ghosts-1-step-away", "eats-food", "closest-food"]
super().__init__(env)
def x(self, s, a):
xx = np.zeros_like(self.w)
ap = a
for k, v in self._extractor.getFeatures(s, ap).items():
xx[self.fields.index(k)] = v
return xx
@property
def d(self):
return len(self.fields)
class LinearQEncoder(FeatureEncoder):
def __init__(self, env, tilings=8, max_size=2048):
r"""
Implements the tile-encoder described by (SB18)
:param env: The openai Gym environment we wish to solve.
:param tilings: Number of tilings (translations). Typically 8.
:param max_size: Maximum number of dimensions.
"""
if isinstance(env.observation_space, Box):
os = env.observation_space
low = os.low
high = os.high
scale = tilings / (high - low)
hash_table = IHT(max_size)
self.max_size = max_size
def tile_representation(s, action):
s_ = list( (s*scale).flat )
active_tiles = tiles(hash_table, tilings, s_, [action])
return active_tiles
self.get_active_tiles = tile_representation
else:
# raise Exception("Implement in new class")
#
# Use Fixed Sparse Representation. See:
# https://castlelab.princeton.edu/html/ORF544/Readings/Geramifard%20-%20Tutorial%20on%20linear%20function%20approximations%20for%20dynamic%20programming%20and%20RL.pdf
ospace = env.observation_space
simple = False
if not isinstance(ospace, tuple):
ospace = (ospace,)
simple = True
sz = []
for j,disc in enumerate(ospace):
sz.append( disc.n )
total_size = sum(sz)
csum = np.cumsum(sz,) - sz[0]
self.max_size = total_size * env.action_space.n
def fixed_sparse_representation(s, action):
if simple:
s = (s,)
s_encoded = [cs + ds + total_size * action for ds,cs in zip(s, csum)]
return s_encoded
self.get_active_tiles = fixed_sparse_representation
super().__init__(env)
def x(self, s, a):
x = np.zeros(self.d)
at = self.get_active_tiles(s, a)
x[at] = 1.0
return x
@property
def d(self):
return self.max_size
"""
The following code contains the tile-coding utilities copied from:
http://incompleteideas.net/tiles/tiles3.py-remove
"""
class IHT:
"""Structure to handle collisions"""
def __init__(self, size_val):
self.size = size_val
self.overfull_count = 0
self.dictionary = {}
def count(self):
return len(self.dictionary)
def full(self):
return len(self.dictionary) >= self.size
def get_index(self, obj, read_only=False):
d = self.dictionary
if obj in d:
return d[obj]
elif read_only:
return None
size = self.size
count = self.count()
if count >= size:
if self.overfull_count == 0:
print('IHT full, starting to allow collisions')
self.overfull_count += 1
return hash(obj) % self.size
else:
d[obj] = count
return count
def hash_coords(coordinates, m, read_only=False):
if isinstance(m, IHT): return m.get_index(tuple(coordinates), read_only)
if isinstance(m, int): return hash(tuple(coordinates)) % m
if m is None: return coordinates
def tiles(iht_or_size, num_tilings, floats, ints=None, read_only=False):
"""returns num-tilings tile indices corresponding to the floats and ints"""
if ints is None:
ints = []
qfloats = [floor(f * num_tilings) for f in floats]
tiles = []
for tiling in range(num_tilings):
tilingX2 = tiling * 2
coords = [tiling]
b = tiling
for q in qfloats:
coords.append((q + b) // num_tilings)
b += tilingX2
coords.extend(ints)
tiles.append(hash_coords(coords, iht_or_size, read_only))
return tiles
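# A minimal usage sketch (not part of the original file), assuming gymnasium and the
# 'MountainCar-v0' environment are available. It shows how the representation
# Q(s,a) = x(s,a)^T w defined above combines with a single semi-gradient step
# w <- w + alpha * (target - Q(s,a)) * x(s,a). The TD target below is a placeholder.
if __name__ == "__main__":
    import gymnasium as gym
    env = gym.make('MountainCar-v0')
    Q = LinearQEncoder(env, tilings=8)          # same tile coding as in (SB18)
    s, _ = env.reset()
    a = env.action_space.sample()
    alpha = 1 / 8                               # common choice: 1 / number of tilings
    target = -1.0                               # placeholder TD target, illustration only
    Q.w += alpha * (target - Q(s, a)) * Q.x(s, a)  # x(s,a) is the gradient of Q(s,a) wrt. w
    print("Q(s,a) after one update:", Q(s, a), "d =", Q.d)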
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from irlc.ex01.agent import train
import gymnasium as gym
from irlc import main_plot
import matplotlib.pyplot as plt
from irlc.ex11.q_agent import QAgent
class SarsaNAgent(QAgent):
r""" Implement the N-step semi-gradient sarsa agent from (SB18, Section 7.2)"""
def __init__(self, env, gamma=1, alpha=0.2, epsilon=0.1, n=1):
# Variables for TD-n
self.n = n # as in n-step Sarsa
# Buffer lists for previous (S_t, R_{t}, A_t) triplets
self.R, self.S, self.A = [None] * (self.n + 1), [None] * (self.n + 1), [None] * (self.n + 1)
super().__init__(env, gamma=gamma, alpha=alpha, epsilon=epsilon)
def pi(self, s, k, info=None):
self.t = k # Save current step in episode for use in train.
if self.t == 0: # First action is epsilon-greedy.
self.A[self.t] = self.pi_eps(s, info)
return self.A[self.t % (self.n+1)]
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# Recall we are given S_t, A_t, R_{t+1}, S_{t+1}, and done indicates whether S_{t+1} is terminal (i.e. whether t+1 = T).
n = self.n # n as in n-step sarsa.
t = self.t # Current time step t as in s_t.
if t == 0: # We are in the initial state. Reset buffer.
self.S[0], self.A[0] = s, a
# Store current observations in buffer.
self.S[(t+1)%(n+1)] = sp
self.R[(t+1)%(n+1)] = r
self.A[(t+1)%(n+1)] = self.pi_eps(sp, info_sp) if not done else -1
if done:
T = t+1
tau_steps_to_train = range(t - n + 1, T)
else:
T = 1e10
tau_steps_to_train = [t - n + 1]
# tau ranges over the time steps whose Q-values are updated in this call. The notation is compatible with that in (SB18).
for tau in tau_steps_to_train:
if tau >= 0:
"""
Compute the return for this tau-step and perform the relevant Q-update.
The first step is to compute the expected return G in the section below.
"""
# TODO: 4 lines missing.
raise NotImplementedError("Compute G= (expected return) here.")
S_tau, A_tau = self.S[tau%(n+1)], self.A[tau%(n+1)]
delta = (G - self._q(S_tau, A_tau))
if n == 1: # Check your implementation is correct when n=1 by comparing it with regular Sarsa learning.
delta_Sarsa = (r + (0 if done else self.gamma * self._q(sp,A_tau_n)) - self._q(S_tau,A_tau))
if abs(delta-delta_Sarsa) > 1e-10:
raise Exception("n=1 agreement with Sarsa learning failed. You have at least one bug!")
self._upd_q(S_tau, A_tau, delta)
def _q(self, s, a): return self.Q[s,a] # Using these helper methods will come in handy when we work with function approximators, but it is optional.
def _upd_q(self, s, a, delta): self.Q[s,a] += self.alpha * delta
def __str__(self):
return f"SarsaN_{self.gamma}_{self.epsilon}_{self.alpha}_{self.n}"
if __name__ == "__main__":
envn = 'CliffWalking-v0'
env = gym.make(envn)
from irlc.ex11.sarsa_agent import sarsa_exp
from irlc.ex11.q_agent import q_exp
agent = SarsaNAgent(env, n=5, epsilon=0.1,alpha=0.5)
exp = f"experiments/{envn}_{agent}"
for _ in range(10): # Train 10 times to get an idea about the average performance.
train(env, agent, exp, num_episodes=200, max_runs=10)
main_plot([q_exp, sarsa_exp, exp], smoothing_window=10) # plot with results from Q/Sarsa simulations.
plt.ylim([-100,0])
from irlc import savepdf
savepdf("n_step_sarsa_cliff")
plt.show()
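# Illustration only (not part of the original exercise file): a self-contained sketch of the
# n-step return from (SB18, Eq. 7.4),
#   G = R_{tau+1} + gamma*R_{tau+2} + ... + gamma^{n-1}*R_{tau+n} + gamma^n * Q(S_{tau+n}, A_{tau+n}),
# where the bootstrap term is dropped when tau + n >= T. It operates on plain lists indexed by t
# and a q-function callable; the function name and signature are made up for this sketch and are
# not used by the agent above.
def n_step_return_sketch(R, S, A, tau, n, T, gamma, q):
    """R[t] is the reward R_t (R[0] is unused); S[t], A[t] are the state/action at time t."""
    G = sum(gamma ** (i - tau - 1) * R[i] for i in range(tau + 1, min(tau + n, T) + 1))
    if tau + n < T:
        G += gamma ** n * q(S[tau + n], A[tau + n])
    return G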
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from irlc.ex09.mdp import GymEnv2MDP
from irlc.ex09.rl_agent import TabularAgent
from irlc import train
import gymnasium as gym
from irlc import main_plot
import matplotlib.pyplot as plt
from irlc import savepdf
from irlc.ex09.value_iteration_agent import ValueIterationAgent
class QAgent(TabularAgent):
r"""
Implement the Q-learning agent (SB18, Section 6.5)
Note that the Q-data structure already exists, as do helper functions useful for computing an epsilon-greedy policy.
You can access these as
> self.Q[s,a] = 31 # Set a Q-value.
See the TabularAgent class for more information.
"""
def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1):
self.alpha = alpha
super().__init__(env, gamma, epsilon)
def pi(self, s, k, info=None):
"""
Return current action using epsilon-greedy exploration. You should look at the TabularAgent class for ideas.
"""
# TODO: 1 lines missing.
raise NotImplementedError("Implement the epsilon-greedy policy here.")
return action
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
"""
Implement the Q-learning update rule, i.e. compute a* from the Q-values.
As a hint, note that self.Q[sp,a] corresponds to q(s_{t+1}, a) and
that what you need to update is self.Q[s, a] = ...
You may want to look at self.Q.get_optimal_action(state) to compute a = argmax_a Q[s,a].
"""
# TODO: 3 lines missing.
raise NotImplementedError("Update the Q[s,a]-values here.")
def __str__(self):
return f"QLearner_{self.gamma}_{self.epsilon}_{self.alpha}"
q_exp = f"experiments/cliffwalk_Q"
epsilon = 0.1
max_runs = 10
alpha = 0.5
def cliffwalk():
env = gym.make('CliffWalking-v0')
agent = QAgent(env, epsilon=epsilon, alpha=alpha)
train(env, agent, q_exp, num_episodes=200, max_runs=max_runs)
# As a baseline, we set up/evaluate a value-iteration agent to get an idea about the optimal performance.
# To do so, we need an MDP object. We create an MDP object out of the gym environment below.
# You can look at the code if you like, but it is simply a helper function converting one data structure to another:
# it produces the MDP object which our value-iteration implementation from the previous week requires.
mdp = GymEnv2MDP(env)
vi_exp = "experiments/cliffwalk_VI"
Vagent = ValueIterationAgent(env, mdp=mdp, epsilon=epsilon)
train(env, Vagent, vi_exp, num_episodes=200, max_runs=max_runs)
vi_exp_opt = "experiments/cliffwalk_VI_optimal"
Vagent_opt = ValueIterationAgent(env, mdp=mdp, epsilon=0) # Same, but with epsilon=0
train(env, Vagent_opt, vi_exp_opt, num_episodes=200, max_runs=max_runs)
exp_names = [q_exp, vi_exp, vi_exp_opt]
return env, exp_names
if __name__ == "__main__":
for _ in range(10):
env, exp_names = cliffwalk()
main_plot(exp_names, smoothing_window=10)
plt.ylim([-100, 0])
plt.title("Q-learning on " + env.spec.name)
savepdf("Q_learning_cliff")
plt.show()
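# Illustration only (not part of the original file): the textbook Q-learning update
# (SB18, Section 6.5) written against a plain dict-of-dicts, so the quantities referred to in the
# exercise above are unambiguous. The dict layout, name and signature are made up for this sketch.
def q_learning_update_sketch(Q, s, a, r, sp, done, alpha, gamma):
    """Q[s][a] holds a plain float; the update is performed in place."""
    max_q_sp = 0 if done else max(Q[sp].values())        # max_a' Q(s', a'); zero at terminal states
    Q[s][a] += alpha * (r + gamma * max_q_sp - Q[s][a])  # move Q(s,a) towards the TD target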
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import matplotlib.pyplot as plt
from irlc.ex11.q_agent import QAgent
from irlc import main_plot, savepdf
from irlc.ex01.agent import train
from irlc.ex11.q_agent import cliffwalk, alpha, epsilon
class SarsaAgent(QAgent):
r""" Implement the Sarsa control method from (SB18, Section 6.4). It is recommended you complete
the Q-agent first because the two methods are very similar and the Q-agent is easier to implement. """
def __init__(self, env, gamma=1, alpha=0.5, epsilon=0.1):
super().__init__(env, gamma=gamma, alpha=alpha, epsilon=epsilon)
def pi(self, s, k, info=None):
if k == 0:
""" we are at the beginning of the episode. Generate a by being epsilon-greedy"""
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
else:
""" Return the action self.a you generated during the train where you know s_{t+1} """
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
"""
Generate A' as self.a by being epsilon-greedy. Re-use code from the Agent class.
"""
# TODO: 1 lines missing.
raise NotImplementedError("self.a = ....")
""" now that you know A' = self.a, perform the update to self.Q[s,a] here """
# TODO: 2 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
def __str__(self):
return f"Sarsa{self.gamma}_{self.epsilon}_{self.alpha}"
sarsa_exp = f"experiments/cliffwalk_Sarsa"
if __name__ == "__main__":
env, q_experiments = cliffwalk() # get results from Q-learning
agent = SarsaAgent(env, epsilon=epsilon, alpha=alpha)
for _ in range(10):
train(env, agent, sarsa_exp, num_episodes=200, max_runs=10)
main_plot(q_experiments + [sarsa_exp], smoothing_window=10)
plt.ylim([-100, 0])
plt.title("Q and Sarsa learning on " + env.spec.name)
savepdf("QSarsa_learning_cliff")
plt.show()
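# Illustration only (not part of the original file): the Sarsa update (SB18, Section 6.4) on the
# same plain dict-of-dicts layout as the Q-learning sketch. The difference is that the target
# bootstraps on the action A' actually chosen in s', not on max_a' Q(s', a'). Name and signature
# are made up for this sketch.
def sarsa_update_sketch(Q, s, a, r, sp, ap, done, alpha, gamma):
    q_sp_ap = 0 if done else Q[sp][ap]                 # Q(S', A') for the action the policy will take
    Q[s][a] += alpha * (r + gamma * q_sp_ap - Q[s][a])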
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gymnasium as gym
from irlc.ex01.agent import train
from irlc import main_plot
import matplotlib.pyplot as plt
from irlc.ex11.q_agent import QAgent
from irlc.ex11.feature_encoder import LinearQEncoder
from irlc import savepdf
class LinearSemiGradQAgent(QAgent):
def __init__(self, env, gamma=1.0, alpha=0.5, epsilon=0.1, q_encoder=None):
""" The Q-values, as implemented using a function approximator, can now be accessed as follows:
>> self.Q(s,a) # Compute q-value
>> self.Q.x(s,a) # Compute gradient of the above expression wrt. w
>> self.Q.w # get weight-vector.
I would recommend inserting a breakpoint and investigating the above expressions yourself;
you can of course also check the LinearQEncoder class if you want to see how it is done in practice.
"""
super().__init__(env, gamma, epsilon=epsilon, alpha=alpha)
self.Q = LinearQEncoder(env, tilings=8) if q_encoder is None else q_encoder
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 4 lines missing.
raise NotImplementedError("Implement function body")
def __str__(self):
return f"LinearSemiGradQ{self.gamma}_{self.epsilon}_{self.alpha}"
num_of_tilings = 8
alpha = 1 / num_of_tilings
episodes = 300
x = "Episode"
experiment_q = "experiments/mountaincar_semigrad_q"
if __name__ == "__main__":
from irlc.ex10 import envs
env = gym.make("MountainCar500-v0")
for _ in range(10):
agent = LinearSemiGradQAgent(env, gamma=1, alpha=alpha, epsilon=0)
train(env, agent, experiment_q, num_episodes=episodes, max_runs=10)
main_plot(experiments=[experiment_q], x_key=x, y_key='Length', smoothing_window=30, resample_ticks=100)
savepdf("semigrad_q")
plt.show()
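# Illustration only (not part of the original file): one semi-gradient Q-learning step written
# against the documented FeatureEncoder interface (Q(s,a), Q.x(s,a), Q.w). The function name,
# signature and the explicit loop over env.action_space.n are made up for this sketch.
def semi_gradient_q_step_sketch(Q, env, s, a, r, sp, done, alpha, gamma):
    q_max = 0 if done else max(Q(sp, ap) for ap in range(env.action_space.n))  # max_a' Q(s', a')
    Q.w += alpha * (r + gamma * q_max - Q(s, a)) * Q.x(s, a)  # x(s,a) is the gradient of Q(s,a) wrt. w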
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
import matplotlib.pyplot as plt
from irlc import main_plot, savepdf
from irlc.ex01.agent import train
import numpy as np
import gymnasium as gym
from irlc.ex11.semi_grad_q import LinearSemiGradQAgent
np.seterr(all='raise')
class LinearSemiGradSarsa(LinearSemiGradQAgent):
def __init__(self, env, gamma=0.99, epsilon=0.1, alpha=0.5, q_encoder=None):
r"""Implement the Linear semi-gradient Sarsa method from (SB18, Section 10.1)"""
super().__init__(env, gamma, epsilon=epsilon, alpha=alpha, q_encoder=q_encoder)
def pi(self, s, k, info=None):
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return action
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
# TODO: 4 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
if sum(np.abs(self.Q.w)) > 1e5: raise Exception("Weights diverged. Decrease alpha")
def __str__(self):
return f"LinSemiGradSarsa{self.gamma}_{self.epsilon}_{self.alpha}"
experiment_sarsa = "experiments/mountaincar_Sarsa"
if __name__ == "__main__":
from irlc.ex11.semi_grad_q import experiment_q, alpha, x
from irlc.ex10 import envs
env = gym.make("MountainCar500-v0")
for _ in range(10):
agent = LinearSemiGradSarsa(env, gamma=1, alpha=alpha, epsilon=0)
train(env, agent, experiment_sarsa, num_episodes=300, max_runs=10)
main_plot(experiments=[experiment_q, experiment_sarsa], x_key=x, y_key='Length', smoothing_window=30)
savepdf("semigrad_q_sarsa")
plt.show()
# Turn off averaging
main_plot(experiments=[experiment_q, experiment_sarsa], x_key=x, y_key='Length', smoothing_window=30, units="Unit", estimator=None)
savepdf("semigrad_q_sarsa_individual")
plt.show()
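# Illustration only (not part of the original file): the corresponding semi-gradient Sarsa step,
# which bootstraps on the action ap that the epsilon-greedy policy actually selects in s' instead
# of taking a maximum. Name and signature are made up for this sketch.
def semi_gradient_sarsa_step_sketch(Q, s, a, r, sp, ap, done, alpha, gamma):
    target = r + (0 if done else gamma * Q(sp, ap))
    Q.w += alpha * (target - Q(s, a)) * Q.x(s, a)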