diff --git a/.gitignore b/.gitignore index 1e25f17126a5900b78b27d3822d62ca6a634a872..1ef69d936b5521202b3fd5df634913c18ac88286 100644 --- a/.gitignore +++ b/.gitignore @@ -4,11 +4,11 @@ exam_tabular_examples #solutions/ex01 #solutions/ex02 #solutions/ex03 -solutions/ex04 -solutions/ex05 -solutions/ex06 -solutions/ex07 -solutions/ex08 +#solutions/ex04 +#solutions/ex05 +#solutions/ex06 +#solutions/ex07 +#solutions/ex08 solutions/ex09 solutions/ex10 solutions/ex11 @@ -31,10 +31,10 @@ solutions/ex13 #irlc/tests/tests_week02.py #irlc/tests/tests_week03.py #irlc/tests/tests_week04.py -irlc/tests/tests_week05.py -irlc/tests/tests_week06.py -irlc/tests/tests_week07.py -irlc/tests/tests_week08.py +#irlc/tests/tests_week05.py +#irlc/tests/tests_week06.py +#irlc/tests/tests_week07.py +#irlc/tests/tests_week08.py irlc/tests/tests_week09.py irlc/tests/tests_week10.py irlc/tests/tests_week11.py @@ -68,10 +68,10 @@ irlc/exam/exam20*/solution # irlc/lectures/lec02 #irlc/lectures/lec03 #irlc/lectures/lec04 -irlc/lectures/lec05 -irlc/lectures/lec06 -irlc/lectures/lec07 -irlc/lectures/lec08 +#irlc/lectures/lec05 +#irlc/lectures/lec06 +#irlc/lectures/lec07 +#irlc/lectures/lec08 irlc/lectures/lec09 irlc/lectures/lec10 irlc/lectures/lec11 diff --git a/irlc/ex08/__init__.py b/irlc/ex08/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28514114cf38978975fea28d6e6670715223cfb8 --- /dev/null +++ b/irlc/ex08/__init__.py @@ -0,0 +1,2 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +"""This directory contains the exercises for week 8.""" diff --git a/irlc/ex08/bandit_example.py b/irlc/ex08/bandit_example.py new file mode 100644 index 0000000000000000000000000000000000000000..2eb8cccf7a9c703d2de0cbc66ec6e6d00b60d946 --- /dev/null +++ b/irlc/ex08/bandit_example.py @@ -0,0 +1,27 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import numpy as np +import matplotlib.pyplot as plt + +if __name__ == "__main__": + from irlc import Agent, train, savepdf + from irlc.ex08.bandits import StationaryBandit + bandit = StationaryBandit(k=10) # A 10-armed bandit + agent = Agent(bandit) # Recall the agent takes random actions + _, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500) + plt.plot(trajectories[0].reward) + plt.xlabel("Time step") + plt.ylabel("Reward per time step") + savepdf("dumbitA") + plt.show() + + agent = Agent(bandit) # Recall the agent takes random actions + for i in range(10): + _, trajectories = train(bandit, agent, return_trajectory=True, num_episodes=1, max_steps=500) + regret = np.asarray([r['gab'] for r in trajectories[0].env_info[1:]]) + cum_regret = np.cumsum(regret) + plt.plot(cum_regret, label=f"Episode {i}") + plt.legend() + plt.xlabel("Time step") + plt.ylabel("Accumulated Regret") + savepdf("dumbitB") + plt.show() diff --git a/irlc/ex08/bandits.py b/irlc/ex08/bandits.py new file mode 100644 index 0000000000000000000000000000000000000000..5df7412724989bf621be957e6b40687283a8d044 --- /dev/null +++ b/irlc/ex08/bandits.py @@ -0,0 +1,213 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. 
Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). """ +import numpy as np +import matplotlib.pyplot as plt +from gymnasium import Env +from gymnasium.spaces import Discrete +from irlc import train +from tqdm import tqdm +import sys +from irlc import cache_read, cache_write, cache_exists + +class BanditEnvironment(Env): + r""" + A helper class for defining bandit problems similar to e.g. the 10-armed testbed discussed in (SB18). + We are going to implement the bandit problems as greatly simplified gym environments, as this will allow us to + implement the bandit agents using the familiar ``Agent`` class. I hope this way of doing it will make it clearer that bandits + are in fact a sort of reinforcement learning method. + + The following code shows an example of how to use a bandit environment: + + .. runblock:: pycon + + >>> from irlc.ex08.bandits import StationaryBandit + >>> env = StationaryBandit(k=10) # 10-armed testbed. + >>> env.reset() # Reset env.q_star + >>> s, r, _, _, info = env.step(3) + >>> print(f"The reward we got from taking arm a=3 was {r=}") + + """ + def __init__(self, k : int): + r""" + Initialize a bandit problem. The observation space is given a dummy value since bandit problems of the sort + (SB18) discuss don't have observations. + + :param k: The number of arms. + """ + super().__init__() + self.observation_space = Discrete(1) # Dummy observation space with a single observation. + self.action_space = Discrete(k) # The arms labelled 0,1,...,k-1. + self.k = k # Number of arms + + def reset(self): + r""" + Use this function to reset all internal parameters of the environment and get ready for a new episode. + In the (SB18) 10-armed bandit testbed, this would involve resetting the expected return + + .. math:: + q^*_a + + The function must return a dummy state and info dictionary to agree with the gym ``Env`` class, but their values are + irrelevant. + + :return: + - s - a state, for instance 0 + - info - the info dictionary, for instance {} + """ + raise NotImplementedError("Implement the reset method") + + def bandit_step(self, a): + r"""This helper function simplifies the definition of the environment's ``step``-function. + + Given an action :math:`a`, this function computes the reward :math:`r_t` obtained by taking that action + and the gab. The gab is the expected reward we miss out on by taking the potentially suboptimal action :math:`a` + and is defined as: + + .. math:: + \Delta = \max_{a'} q^*_{a'} - q^*_a + + Once implemented, the reward and regret enter into the ``step`` function as follows: + + .. runblock:: pycon + + >>> from irlc.ex08.bandits import StationaryBandit + >>> env = StationaryBandit(k=4) # 4-armed testbed. + >>> env.reset() # Reset all parameters. + >>> _, r, _, _, info = env.step(2) # Take action a=2 + >>> print(f"Reward from a=2 was {r=}, the gab was {info['gab']=}") + + :param a: The current action we take + :return: + - r - The reward we thereby incur + - gab - The regret incurred by taking this action (0 for an optimal action) + """ + reward = 0 # Compute the reward associated with arm a + gab = 0 # Compute the gab by comparing to the optimal arm's reward. + return reward, gab + + def step(self, action): + r"""You do not have to edit this function. + In a bandit environment, the step function is simplified greatly since there are no + states to keep track of.
It should simply return the reward incurred by the action ``a`` + and (for convenience) also return the gab in the ``info``-dictionary. + + :param action: The current action we take :math:`a_t` + :return: + - next_state - This is always ``None`` + - reward - The reward obtained by taking the given action. In (SB18) this is defined as :math:`r_t` + - terminated - Always ``False``. Bandit problems don't terminate. + - truncated - Always ``False`` + - info - For convenience, this includes the gab (used by the plotting methods) + + """ + reward, gab = self.bandit_step(action) + info = {'gab': gab} + return None, reward, False, False, info + +class StationaryBandit(BanditEnvironment): + r"""Implement the 'stationary bandit environment' which is described in (SB18, Section 2.3) + and used as a running example throughout the chapter. + + We will implement a version with a constant mean offset (q_star_mean), so that + + q* = x + q_star_mean, x ~ Normal(0,1) + + q_star_mean can just be considered to be zero at first. + """ + def __init__(self, k, q_star_mean=0): + super().__init__(k) + self.q_star_mean = q_star_mean + + def reset(self): + """ Set q^*_k = N(0,1) + mean_value. The mean_value is 0 in most examples. I.e., implement the 10-armed testbed environment. """ + self.q_star = np.random.randn(self.k) + self.q_star_mean + self.optimal_action = np.argmax(self.q_star) # Optimal action is the one with the largest q^*-value. + return 0, {} # The reset method in a gym Env must return a (dummy) state and a dictionary. + + def bandit_step(self, a): + """ Return the reward/gab for action a for the simple bandit. Use self.q_star (see reset-function above). + To implement it, compute the reward (see the description of the 10-armed testbed for more information. + How is it computed from q^*_k?) and also compute the gab. + + As a small hint, since we are computing the gab, it will in fact be the difference between the + q^* value for the optimal arm and the q^* value corresponding to the current arm. + Remember it is 0 if the optimal action is selected. + """ + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # Actual logic goes here. Use self.q_star[a] to get mean reward and np.random.randn() to generate random numbers. + return reward, gab + + def __str__(self): + return f"{type(self).__name__}_{self.q_star_mean}" + +""" +Helper function for running a bunch of bandit experiments and plotting the results. + +The function will run the agents in 'agents' (a list of bandit agents) +on the bandit environment 'bandit' and plot the result. + +Each agent will be evaluated for num_episodes episodes, and one episode consists of 'steps' steps. +However, to speed things up you can use the cache, and the bandit will not be evaluated for more than +'max_episodes' episodes in total over all cache runs.
+ +""" +def eval_and_plot(bandit, agents, num_episodes=2000, max_episodes=2000, steps=1000, labels=None, use_cache=True): + if labels is None: + labels = [str(agent) for agent in agents] + + f, axs = plt.subplots(nrows=3, ncols=1) + f.set_size_inches(10,7) + (ax1, ax2, ax3) = axs + for i,agent in enumerate(agents): + rw, oa, regret, num_episodes = run_agent(bandit, agent, episodes=num_episodes, max_episodes=max_episodes, steps=steps, use_cache=use_cache) + ax1.plot(rw, label=labels[i]) + ax2.plot(oa, label=labels[i]) + ax3.plot(regret, label=labels[i]) + + for ax in axs: + ax.grid() + ax.set_xlabel("Steps") + + ax1.set_ylabel("Average Reward") + ax2.set_ylabel("% optimal action") + ax3.set_ylabel("Regret $L_t$") + ax3.legend() + f.suptitle(f"Evaluated on {str(bandit)} for {num_episodes} episodes") + +def run_agent(env, agent, episodes=2000, max_episodes=2000, steps=1000, use_cache=False, verbose=True): + """ + Helper function. Most of the work involves the cache; the actual training is done by 'train'. + """ + C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = 0, 0, 0, 0 + if use_cache: + cache = f"cache/{str(env)}_{str(agent)}_{steps}.pkl" + if cache_exists(cache): + print("> Reading from cache", cache) + C_regrets_cum_sum, C_oas_sum, C_rewards_sum, C_n_episodes = cache_read(cache) + + regrets = [] + rewards = [] + cruns = max(0, min(episodes, max_episodes - C_n_episodes)) # Missing runs. + for _ in tqdm(range(cruns), file=sys.stdout, desc=str(agent),disable=not verbose): + stats, traj = train(env, agent, max_steps=steps, verbose=False, return_trajectory=True) + regret = np.asarray([r['gab'] for r in traj[0].env_info[1:]]) + regrets.append(regret) + rewards.append(traj[0].reward) + + regrets_cum_sum = C_regrets_cum_sum + oas_sum = C_oas_sum + rewards_sum = C_rewards_sum + episodes = C_n_episodes + if len(regrets) > 0: + regrets_cum_sum += np.cumsum(np.sum(np.stack(regrets), axis=0)) + oas_sum += np.sum(np.stack(regrets) == 0, axis=0) + rewards_sum += np.sum(np.stack(rewards), axis=0) + episodes += cruns + if use_cache and cruns > 0: + cache_write((regrets_cum_sum, oas_sum, rewards_sum, episodes), cache, protocol=4) + return rewards_sum/episodes, oas_sum/episodes, regrets_cum_sum/episodes, episodes diff --git a/irlc/ex08/gradient_agent.py b/irlc/ex08/gradient_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..34b296b0cd1dbcea66b194b63d16422b423df98e --- /dev/null +++ b/irlc/ex08/gradient_agent.py @@ -0,0 +1,48 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc import savepdf +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.bandits import eval_and_plot, StationaryBandit +from irlc import Agent + +class GradientAgent(Agent): + def __init__(self, env, alpha=None, use_baseline=True): + self.k = env.action_space.n + self.alpha = alpha + self.baseline=use_baseline + self.H = np.zeros((self.k,)) + super().__init__(env) + + def Pa(self): + """ This helper method returns the probability distribution P(A=a) of choosing the + arm a as a vector + """ + pi_a = np.exp(self.H) + return pi_a / np.sum(pi_a) + + def pi(self, s, t, info_s=None): + if t == 0: + self.R_bar = 0 # average reward baseline + self.H *= 0 # Reset H to all-zeros. + self.t = t # Store the current time step.
+ return np.random.choice( self.k, p=self.Pa() ) + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 9 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"{type(self).__name__}_{self.alpha}_{'baseline' if self.baseline else 'no_baseline'}" + +if __name__ == "__main__": + baseline_bandit = StationaryBandit(k=10, q_star_mean=4) + alphas = [0.1, 0.4] + agents = [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=False) for alpha in alphas] + agents += [GradientAgent(baseline_bandit, alpha=alpha, use_baseline=True) for alpha in alphas] + + labels = [f'Gradient Bandit alpha={alpha}' for alpha in alphas ] + labels += [f'With baseline: Gradient Bandit alpha={alpha}' for alpha in alphas ] + use_cache = False + eval_and_plot(baseline_bandit, agents, max_episodes=2000, num_episodes=100, labels=labels, use_cache=use_cache) + savepdf("gradient_baseline") + plt.show() diff --git a/irlc/ex08/grand_bandit_race.py b/irlc/ex08/grand_bandit_race.py new file mode 100644 index 0000000000000000000000000000000000000000..ad466aaaffc88b0b4aa43375b55640aa17dc096a --- /dev/null +++ b/irlc/ex08/grand_bandit_race.py @@ -0,0 +1,78 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc.ex08.nonstationary import MovingAverageAgent, NonstationaryBandit +from irlc.ex08.gradient_agent import GradientAgent +from irlc.ex08.ucb_agent import UCBAgent +from irlc import savepdf +import time + +if __name__ == "__main__": + print("Ladies and gentlemen. It is time for the graaand bandit race") + def intro(bandit, agents): + print("We are live from the beautiful surroundings where they will compete in:") + print(bandit) + print("Who will win? who will have the most regret? we are about to find out") + print("in a minute after a brief word from our sponsors") + time.sleep(1) + print("And we are back. Let us introduce todays contestants:") + for a in agents: + print(a) + print("And they are off!") + epsilon = 0.1 + alpha = 0.1 + c = 2 + # TODO: 1 lines missing. + raise NotImplementedError("Define the bandit here: bandit1 = ...") + # TODO: 5 lines missing. + raise NotImplementedError("define agents list here") + labels = ["Basic", "Moving avg.", "gradient", "Gradient+baseline", "UCB"] + ''' + Stationary, no offset. Vanilla setting. + ''' + intro(bandit1, agents) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (no offset)") + savepdf("grand_race_1") + plt.show() + ''' + Stationary, but with offset + ''' + print("Whew what a race. Let's get ready to next round:") + # TODO: 1 lines missing. + raise NotImplementedError("Define bandit2 = ... here") + intro(bandit2, agents) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (with offset)") + savepdf("grand_race_2") + plt.show() + ''' + Long (nonstationary) simulations + ''' + print("Whew what a race. Let's get ready to next round which will be a long one.") + # TODO: 1 lines missing. + raise NotImplementedError("define bandit3 here") + intro(bandit3, agents) + # TODO: 1 lines missing. 
+ raise NotImplementedError("call eval_and_plot here") + plt.suptitle("Non-stationary bandit (no offset)") + savepdf("grand_race_3") + plt.show() + + ''' + Stationary, no offset, long run. Exclude stupid bandits. + ''' + agents2 = [] + agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=False)] + agents2 += [GradientAgent(bandit1, alpha=alpha, use_baseline=True)] + agents2 += [UCBAgent(bandit1, c=2)] + labels = ["Gradient", "Gradient+baseline", "UCB"] + intro(bandit1, agents2) + # TODO: 1 lines missing. + raise NotImplementedError("Call eval_and_plot here") + plt.suptitle("Stationary bandit (no offset)") + savepdf("grand_race_4") + plt.show() diff --git a/irlc/ex08/nonstationary.py b/irlc/ex08/nonstationary.py new file mode 100644 index 0000000000000000000000000000000000000000..546c5ec8b7fd10f4a93a2869a2373d70756dc84c --- /dev/null +++ b/irlc/ex08/nonstationary.py @@ -0,0 +1,62 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc import savepdf + +class NonstationaryBandit(StationaryBandit): + def __init__(self, k, q_star_mean=0, reward_change_std=0.01): + self.reward_change_std = reward_change_std + super().__init__(k, q_star_mean) + + def bandit_step(self, a): + r""" Implement the non-stationary bandit environment (as described in (SB18)). + Hint: use reward_change_std * np.random.randn() to generate a single random number with the given std. + then add one to each coordinate. Remember you have to compute the regret as well, see StationaryBandit for ideas. + (remember the optimal arm will change when you add noise to q_star) """ + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + return super().bandit_step(a) + + def __str__(self): + return f"{type(self).__name__}_{self.q_star_mean}_{self.reward_change_std}" + + +class MovingAverageAgent(BasicAgent): + r""" + The simple bandit from (SB18, Section 2.4), but with moving average alpha + as described in (SB18, Eqn. (2.3)) + """ + def __init__(self, env, epsilon, alpha): + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 1 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"{type(self).__name__}_{self.epsilon}_{self.alpha}" + + +if __name__ == "__main__": + plt.figure(figsize=(10, 10)) + epsilon = 0.1 + alphas = [0.15, 0.1, 0.05] + + # TODO: 4 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + labels = [f"Basic agent, epsilon={epsilon}"] + # TODO: 1 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + use_cache = False # Set this to True to use cache (after code works!) 
+ eval_and_plot(bandit, agents, steps=10000, num_episodes=200, labels=labels, use_cache=use_cache) + savepdf("nonstationary_bandits") + plt.show() diff --git a/irlc/ex08/simple_agents.py b/irlc/ex08/simple_agents.py new file mode 100644 index 0000000000000000000000000000000000000000..18bdca138c81829568366b66e538f8de928bc27f --- /dev/null +++ b/irlc/ex08/simple_agents.py @@ -0,0 +1,57 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +from irlc import Agent +from irlc import savepdf + +class BasicAgent(Agent): + r""" + Simple bandit as described on (SB18, Section 2.4). + """ + def __init__(self, env, epsilon): + super().__init__(env) + self.k = env.action_space.n + self.epsilon = epsilon + + def pi(self, s, t, info=None): + """ Since this is a bandit, s=None and can be ignored, while t refers to the time step in the current episode """ + if t == 0: + # At step 0 of episode. Re-initialize data structure. + # TODO: 2 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + # compute action here + # TODO: 1 lines missing. + raise NotImplementedError("Insert your solution and remove this error.") + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + """ Since this is a bandit, done, s, sp, info_s, info_sp can all be ignored. + From the input arguments you should only use a + """ + # TODO: 2 lines missing. + raise NotImplementedError("Implement function body") + + def __str__(self): + return f"BasicAgent_{self.epsilon}" + +if __name__ == "__main__": + N = 100000 + S = [np.max( np.random.randn(10) ) for _ in range(100000) ] + print( np.mean(S), np.std(S)/np.sqrt(N) ) + + use_cache = False # Set this to True to use cache (after code works!) + from irlc.utils.timer import Timer + timer = Timer(start=True) + R = 100 + steps = 1000 + env = StationaryBandit(k=10) + agents = [BasicAgent(env, epsilon=.1), BasicAgent(env, epsilon=.01), BasicAgent(env, epsilon=0) ] + eval_and_plot(env, agents, num_episodes=100, steps=1000, max_episodes=150, use_cache=use_cache) + savepdf("bandit_epsilon") + plt.show() + print(timer.display()) diff --git a/irlc/ex08/ucb_agent.py b/irlc/ex08/ucb_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..5c805ea4437b597f48f8cf5f0dc8608f95b96182 --- /dev/null +++ b/irlc/ex08/ucb_agent.py @@ -0,0 +1,45 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +""" + +References: + [SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online). +""" +import numpy as np +import matplotlib.pyplot as plt +from irlc.ex08.simple_agents import BasicAgent +from irlc import savepdf +from irlc import Agent + +class UCBAgent(Agent): + def __init__(self, env, c=2): + self.c = c + super().__init__(env) + + def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None): + # TODO: 2 lines missing. 
+ raise NotImplementedError("Train agent here") + + def pi(self, s, k, info=None): + if k == 0: + """ Initialize the agent""" + # TODO: 3 lines missing. + raise NotImplementedError("Reset agent (i.e., make it ready to learn in a new episode with a new optimal action)") + # TODO: 1 lines missing. + raise NotImplementedError("Compute (and return) optimal action") + + def __str__(self): + return f"{type(self).__name__}_{self.c}" + +from irlc.ex08.bandits import StationaryBandit, eval_and_plot +if __name__ == "__main__": + r"""Reproduce (SB18, Fig. 2.4) comparing UCB agent to epsilon greedy """ + runs, use_cache = 100, False + c = 2 + eps = 0.1 + + steps = 1000 + env = StationaryBandit(k=10) + agents = [UCBAgent(env,c=c), BasicAgent(env, epsilon=eps)] + eval_and_plot(bandit=env, agents=agents, num_episodes=runs, steps=steps, max_episodes=2000, use_cache=use_cache) + savepdf("UCB_agent") + plt.show() diff --git a/irlc/lectures/lec08/__init__.py b/irlc/lectures/lec08/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a56057c84d0ceac54aab1d40ba0f370c77fe10be --- /dev/null +++ b/irlc/lectures/lec08/__init__.py @@ -0,0 +1 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. diff --git a/irlc/lectures/lec08/demo_bandit.py b/irlc/lectures/lec08/demo_bandit.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b47621d5cd20893a8af401abed59db0af9b63b --- /dev/null +++ b/irlc/lectures/lec08/demo_bandit.py @@ -0,0 +1,23 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from irlc.utils.bandit_graphics_environment import GraphicalBandit +import time +from irlc import train +from irlc.ex08.simple_agents import BasicAgent +from irlc import interactive + +def bandit_eps(autoplay=False): + env = GraphicalBandit(10, render_mode='human',frames_per_second=30) + env.reset() + agent = BasicAgent(env, epsilon=0.1) + agent.method = 'Epsilon-greedy' + env, agent = interactive(env, agent, autoplay=autoplay) + + t0 = time.time() + n = 3000 + stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False) + tpf = (time.time()-t0)/ n + print("tpf", tpf, 'fps', 1/tpf) + env.close() + +if __name__ == "__main__": + bandit_eps() diff --git a/irlc/lectures/lec08/demo_bandit_ucb.py b/irlc/lectures/lec08/demo_bandit_ucb.py new file mode 100644 index 0000000000000000000000000000000000000000..440c9760cc0dfcf90a0414525aa8521df768047a --- /dev/null +++ b/irlc/lectures/lec08/demo_bandit_ucb.py @@ -0,0 +1,26 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. 
+from irlc.utils.bandit_graphics_environment import GraphicalBandit +from irlc import interactive, train +# import numpy as np +import time + +def bandit_ucb(autoplay=False): + env = GraphicalBandit(10, render_mode='human', frames_per_second=30) + env.reset() + #env.viewer.show_q_star = True + #env.viewer.show_q_ucb = True + from irlc.ex08.ucb_agent import UCBAgent + agent = UCBAgent(env, c=1) + agent.method = 'UCB' + + env, agent = interactive(env, agent, autoplay=autoplay) + t0 = time.time() + n = 500 + stats, _ = train(env, agent, max_steps=n, num_episodes=10, return_trajectory=False, verbose=False) + tpf = (time.time() - t0) / n + print("tpf", tpf, 'fps', 1 / tpf) + env.close() + + +if __name__ == "__main__": + bandit_ucb() diff --git a/irlc/tests/tests_week05.py b/irlc/tests/tests_week05.py index 4a7f813840b6670d6caa99c16576d2b90ff7572c..e863e8f73cfd4652465e33a8dacaeb15e661a0fe 100644 --- a/irlc/tests/tests_week05.py +++ b/irlc/tests/tests_week05.py @@ -79,20 +79,20 @@ class CartpoleCostQuestion(DirectSolverQuestion): from irlc.ex05.direct_cartpole_kelly import compute_solutions return compute_solutions()[1] -class BrachistochroneQuestion(DirectSolverQuestion): - """ Brachistochrone (unconstrained) """ - - @classmethod - def compute_solution(cls): - from irlc.ex05.direct_brachistochrone import compute_constrained_solutions - return compute_constrained_solutions()[1] - -class BrachistochroneConstrainedQuestion(DirectSolverQuestion): - """ Brachistochrone (constrained) """ - @classmethod - def compute_solution(cls): - from irlc.ex05.direct_brachistochrone import compute_constrained_solutions - return compute_constrained_solutions()[1] +# class BrachistochroneQuestion(DirectSolverQuestion): +# """ Brachistochrone (unconstrained) """ +# +# @classmethod +# def compute_solution(cls): +# from irlc.ex05.direct_brachistochrone import compute_constrained_solutions +# return compute_constrained_solutions()[1] +# +# class BrachistochroneConstrainedQuestion(DirectSolverQuestion): +# """ Brachistochrone (constrained) """ +# @classmethod +# def compute_solution(cls): +# from irlc.ex05.direct_brachistochrone import compute_constrained_solutions +# return compute_constrained_solutions()[1] class Week05Tests(Report): title = "Tests for week 05" @@ -105,8 +105,8 @@ class Week05Tests(Report): (DirectAgentPendulum, 10), # ok (CartpoleTimeQuestion, 5), # ok (CartpoleCostQuestion, 5), # ok - (BrachistochroneQuestion, 5), # ok - (BrachistochroneConstrainedQuestion, 10), # ok + # (BrachistochroneQuestion, 5), # ok + # (BrachistochroneConstrainedQuestion, 10), # ok ] if __name__ == '__main__': diff --git a/irlc/tests/tests_week08.py b/irlc/tests/tests_week08.py new file mode 100644 index 0000000000000000000000000000000000000000..340d69c01c3ef2cae94901444ba52b9887a47bef --- /dev/null +++ b/irlc/tests/tests_week08.py @@ -0,0 +1,278 @@ +# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text. +from unitgrade import UTestCase, Report, cache +import numpy as np +from irlc import train + + +def train_recording(env, agent, trajectories): + for t in trajectories: + env.reset() + for k in range(len(t.action)): + s = t.state[k] + r = t.reward[k] + a = t.action[k] + sp = t.state[k+1] + agent.pi(s,k) + agent.train(s, a, r, sp, done=k == len(t.action)-1) + + +class BanditQuestion(UTestCase): + """ Value (Q) function estimate """ + tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined. 
+ # testfun = QPrintItem.assertL2 + + # def setUpClass(cls) -> None: + # from irlc.ex08.simple_agents import BasicAgent + # from irlc.ex08.bandits import StationaryBandit + # env = StationaryBandit(k=10, ) + # agent = BasicAgent(env, epsilon=0.1) + # _, cls.trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # cls.Q = agent.Q + # cls.env = env + # cls.agent = agent + + def get_env_agent(self): + from irlc.ex08.simple_agents import BasicAgent + from irlc.ex08.bandits import StationaryBandit + env = StationaryBandit(k=10) + agent = BasicAgent(env, epsilon=0.1) + return env, agent + + @cache + def get_trajectories(self): + env, agent = self.get_env_agent() + _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + return trajectories + + # def precompute_payload(self): + # env, agent = self.get_env_agent() + # _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # return trajectories, agent.Q + + + def test_agent(self): + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + self.assertL2(agent.Q, tol=1e-5) + # return agent.Q + # self.Q = Q + # self.question.agent = agent + # return agent.Q + + # testfun = QPrintItem.assertL2 + + def test_action_distributin(self): + T = 10000 + tol = 1 / np.sqrt(T) * 5 + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + # for k in self._cache.keys(): print(k) + + from collections import Counter + counts = Counter([agent.pi(None, k) for k in range(T)]) + distrib = [counts[k] / T for k in range(env.k)] + self.assertL2(np.asarray(distrib), tol=tol) + + + # def process_output(self, res, txt, numbers): + # return res + + # def process_output(self, res, txt, numbers): + # return res + # + # def test(self, computed, expected): + # super().test(computed, self.Q) + +# class BanditQuestion(QPrintItem): +# # tol = 1e-6 +# tol = 1e-2 # tie-breaking in the gradient bandit is ill-defined. +# title = "Value (Q) function estimate" +# testfun = QPrintItem.assertL2 +# +# def get_env_agent(self): +# from irlc.ex08.simple_agents import BasicAgent +# from irlc.ex08.bandits import StationaryBandit +# env = StationaryBandit(k=10, ) +# agent = BasicAgent(env, epsilon=0.1) +# return env, agent +# +# def precompute_payload(self): +# env, agent = self.get_env_agent() +# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) +# return trajectories, agent.Q +# +# def compute_answer_print(self): +# trajectories, Q = self.precomputed_payload() +# env, agent = self.get_env_agent() +# train_recording(env, agent, trajectories) +# self.Q = Q +# self.question.agent = agent +# return agent.Q +# +# def process_output(self, res, txt, numbers): +# return res +# +# def test(self, computed, expected): +# super().test(computed, self.Q) +# +# class BanditItemActionDistribution(QPrintItem): +# # Assumes setup has already been done. 
+# title = "Action distribution test" +# T = 10000 +# tol = 1/np.sqrt(T)*5 +# testfun = QPrintItem.assertL2 +# +# def compute_answer_print(self): +# # print("In agent print code") +# from collections import Counter +# counts = Counter( [self.question.agent.pi(None, k) for k in range(self.T)] ) +# distrib = [counts[k] / self.T for k in range(self.question.agent.env.k)] +# return np.asarray(distrib) +# +# def process_output(self, res, txt, numbers): +# return res +# +# class BanditQuestion(QuestionGroup): +# title = "Simple bandits" +# class SimpleBanditItem(BanditItem): +# #title = "Value function estimate" +# def get_env_agent(self): +# from irlc.ex08.simple_agents import BasicAgent +# from irlc.ex08.bandits import StationaryBandit +# env = StationaryBandit(k=10, ) +# agent = BasicAgent(env, epsilon=0.1) +# return env, agent +# class SimpleBanditActionDistribution(BanditItemActionDistribution): +# pass + + + +class GradientBanditQuestion(BanditQuestion): + """ Gradient agent """ + # class SimpleBanditItem(BanditItem): + # title = "Simple agent question" + def get_env_agent(self): + from irlc.ex08.bandits import StationaryBandit + from irlc.ex08.gradient_agent import GradientAgent + env = StationaryBandit(k=10) + agent = GradientAgent(env, alpha=0.05) + return env, agent + + # def precompute_payload(self): + # env, agent = self.get_env_agent() + # _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) + # return trajectories + + def test_agent(self): + trajectories = self.get_trajectories() + env, agent = self.get_env_agent() + train_recording(env, agent, trajectories) + self.assertL2(agent.H, tol=1e-5) + + + # def test(self, computed, expected): + # self.testfun(computed, self.H) + # + # class SimpleBanditActionDistribution(BanditItemActionDistribution): + # pass + + +# class GradientBanditQuestion(QuestionGroup): +# title = "Gradient agent" +# class SimpleBanditItem(BanditItem): +# # title = "Simple agent question" +# def get_env_agent(self): +# from irlc.ex08.bandits import StationaryBandit +# from irlc.ex08.gradient_agent import GradientAgent +# env = StationaryBandit(k=10) +# agent = GradientAgent(env, alpha=0.05) +# return env, agent +# +# def precompute_payload(self): +# env, agent = self.get_env_agent() +# _, trajectories = train(env, agent, return_trajectory=True, num_episodes=1, max_steps=100) +# return trajectories, agent.H +# +# def compute_answer_print(self): +# trajectories, H = self.precomputed_payload() +# env, agent = self.get_env_agent() +# train_recording(env, agent, trajectories) +# self.H = H +# self.question.agent = agent +# return agent.H +# +# def test(self, computed, expected): +# self.testfun(computed, self.H) +# +# class SimpleBanditActionDistribution(BanditItemActionDistribution): +# pass + + + +class UCBAgentQuestion(BanditQuestion): + """ UCB agent """ + # class UCBAgentItem(BanditItem): + def get_env_agent(self): + from irlc.ex08.bandits import StationaryBandit + from irlc.ex08.ucb_agent import UCBAgent + env = StationaryBandit(k=10) + agent = UCBAgent(env) + return env, agent + + # class UCBAgentActionDistribution(BanditItemActionDistribution): + # pass + + +# class UCBAgentQuestion(QuestionGroup): +# title = "UCB agent" +# class UCBAgentItem(BanditItem): +# def get_env_agent(self): +# from irlc.ex08.bandits import StationaryBandit +# from irlc.ex08.ucb_agent import UCBAgent +# env = StationaryBandit(k=10) +# agent = UCBAgent(env) +# return env, agent +# +# class UCBAgentActionDistribution(BanditItemActionDistribution): +# 
pass + +# class NonstatiotnaryAgentQuestion(QuestionGroup): +# title = "Nonstationary bandit environment" +# class NonstationaryItem(BanditItem): +# def get_env_agent(self): +# epsilon = 0.1 +# from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent +# bandit = NonstationaryBandit(k=10) +# agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15) +# return bandit, agent +# +# class NonstationaryActionDistribution(BanditItemActionDistribution): +# pass + +class NonstatiotnaryAgentQuestion(BanditQuestion): + """ UCB agent """ + # class UCBAgentItem(BanditItem): + def get_env_agent(self): + epsilon = 0.1 + from irlc.ex08.nonstationary import NonstationaryBandit, MovingAverageAgent + bandit = NonstationaryBandit(k=10) + agent = MovingAverageAgent(bandit, epsilon=epsilon, alpha=0.15) + return bandit, agent + +import irlc +class Week08Tests(Report): + title = "Tests for week 08" + pack_imports = [irlc] + individual_imports = [] + questions = [ + (BanditQuestion, 10), + (GradientBanditQuestion, 10), + (UCBAgentQuestion, 5), + (NonstatiotnaryAgentQuestion, 5) + ] + +if __name__ == '__main__': + from unitgrade import evaluate_report_student + evaluate_report_student(Week08Tests()) diff --git a/irlc/tests/unitgrade_data/BanditQuestion.pkl b/irlc/tests/unitgrade_data/BanditQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/BanditQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl b/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl deleted file mode 100644 index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..0000000000000000000000000000000000000000 Binary files a/irlc/tests/unitgrade_data/BrachistochroneConstrainedQuestion.pkl and /dev/null differ diff --git a/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl b/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl deleted file mode 100644 index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..0000000000000000000000000000000000000000 Binary files a/irlc/tests/unitgrade_data/BrachistochroneQuestion.pkl and /dev/null differ diff --git a/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl b/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl and b/irlc/tests/unitgrade_data/CartpoleCostQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl b/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl and b/irlc/tests/unitgrade_data/CartpoleTimeQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl b/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl index e9d2ed475214b22bc52f1cc0dfc8c04c71d9a2b9..8bcfd04385b49acb537aa90a6c1906443c00c348 100644 Binary files a/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl and b/irlc/tests/unitgrade_data/DirectAgentPendulum.pkl differ diff --git a/irlc/tests/unitgrade_data/DirectMethods.pkl b/irlc/tests/unitgrade_data/DirectMethods.pkl index f81ab2560bf4752a237712f1df94fc8ae01ac0ce..1872c37be157b1d23e330e90fb98df324bc707a7 100644 Binary files a/irlc/tests/unitgrade_data/DirectMethods.pkl and b/irlc/tests/unitgrade_data/DirectMethods.pkl differ diff --git 
a/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl b/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl and b/irlc/tests/unitgrade_data/DirectSolverQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl b/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl index a511917ab3f43109f1d3fe37b025f2fde713339f..288459bca52e824a5d9dabdcb4cf10e164f64114 100644 Binary files a/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl and b/irlc/tests/unitgrade_data/Exam5InventoryEvaluation.pkl differ diff --git a/irlc/tests/unitgrade_data/Exam6Toy2d.pkl b/irlc/tests/unitgrade_data/Exam6Toy2d.pkl index f927b5ae7578002c5a8840c91705f5d4c7d806f0..06341fef90fd2beed50cccac023bdd729b480a91 100644 Binary files a/irlc/tests/unitgrade_data/Exam6Toy2d.pkl and b/irlc/tests/unitgrade_data/Exam6Toy2d.pkl differ diff --git a/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl b/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl index 6555641d17ccd50bdc906dab13481cdad59254cc..7de7875d690be1fc4143070c2139bd34f61288ae 100644 Binary files a/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl and b/irlc/tests/unitgrade_data/ExamQuestion7FlowersStore.pkl differ diff --git a/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl b/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/GradientBanditQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl b/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl index 4af21ecb688c99771e1897bc53ecbae1bc667b8f..94b38667b6a59b2bdd827e9569ad5bce677cc91e 100644 Binary files a/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl and b/irlc/tests/unitgrade_data/ILQRAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl b/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl index 1019b5db1c0a0ad9b3e14545aaf80055652fcb66..af4efa1cc7fc8336bfab2d97317419f4573a58da 100644 Binary files a/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl and b/irlc/tests/unitgrade_data/ILQRPendulumQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl b/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/NonstatiotnaryAgentQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/PendulumQuestion.pkl b/irlc/tests/unitgrade_data/PendulumQuestion.pkl index 1d1b594d8ccdde8336eb3cd174e105beaf5eaf6f..21e4c24c13dd49d445c4efe18438fe4a0b360513 100644 Binary files a/irlc/tests/unitgrade_data/PendulumQuestion.pkl and b/irlc/tests/unitgrade_data/PendulumQuestion.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl b/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl index cb3e05e507f46a771194aa47e9d478a38ea6dc4a..0a911216fa96ee726261d5fd6122f47c63b7becd 100644 Binary files a/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl and b/irlc/tests/unitgrade_data/Problem1BobsFriend.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl b/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl index 
5884cbbaec323589db6f77d93e93aab42b835c82..6174c0b3159b23350a66f8510986566388f9a9e9 100644 Binary files a/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl and b/irlc/tests/unitgrade_data/Problem1DiscreteKuromoto.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl b/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl index ef1b528ea3cb8460738583d5cf526ff19f05dc78..5da65912b9c77917947555ed5b62336969918a99 100644 Binary files a/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl and b/irlc/tests/unitgrade_data/Problem1Kuramoto.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl b/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl index 25f36de2f85c245c47e3f76ccfd13411d7dbb190..5d994baa391da54fd3a6e1c1a369b72a9df5f17a 100644 Binary files a/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl and b/irlc/tests/unitgrade_data/Problem2BobsPolicy.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl b/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl index c6a6d86c444884d85398ab5d68b4a8f2d731b17c..4029b85e80a9ebbf315924351ada7ba445fcb24a 100644 Binary files a/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl and b/irlc/tests/unitgrade_data/Problem2DeterministicDP.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl b/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl index d7b664c9873a3f229a3ada6b5c0794429402f080..547769c9bb40f7e2f9e061a3d24943b7bf016ea1 100644 Binary files a/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl and b/irlc/tests/unitgrade_data/Problem2DeterministicInventory.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl b/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl index ed6643cb8af26e6368d19072746fd41ae4c60ab0..f8b966396874d03b37f527e8166a7431bd63ce66 100644 Binary files a/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl and b/irlc/tests/unitgrade_data/Problem3InventoryInventoryEnvironment.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3LQR.pkl b/irlc/tests/unitgrade_data/Problem3LQR.pkl index 604d5c20678f95431aae0b2c24d839ff98a641a0..cd8f6f6cd8072c224d9de2763d5585bdba4a6d80 100644 Binary files a/irlc/tests/unitgrade_data/Problem3LQR.pkl and b/irlc/tests/unitgrade_data/Problem3LQR.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3PID.pkl b/irlc/tests/unitgrade_data/Problem3PID.pkl index 535051b828a3939a46f439310b29198a5e080a0a..252cfd024c97e5da728820dacd87ab9910607247 100644 Binary files a/irlc/tests/unitgrade_data/Problem3PID.pkl and b/irlc/tests/unitgrade_data/Problem3PID.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl b/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl index 04713f394a37f31128be4aead3269f8c3c2e4695..0e1fc83741cb9bd0877d29de2b3828b78bdd5b01 100644 Binary files a/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl and b/irlc/tests/unitgrade_data/Problem3StochasticDP.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4DPAgent.pkl b/irlc/tests/unitgrade_data/Problem4DPAgent.pkl index 54dc9c584545f20c53ce6eab91442e358d24233e..178368d13873f75c43be9a31cb3dbdb10d5fef36 100644 Binary files a/irlc/tests/unitgrade_data/Problem4DPAgent.pkl and b/irlc/tests/unitgrade_data/Problem4DPAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl b/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl index 2e0efe223d9500337a95c76732a6fcfc3cbb2872..22065591b65be79d935c05472a7603be0e00bcdb 100644 Binary files 
a/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl and b/irlc/tests/unitgrade_data/Problem4InventoryTrain.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl b/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl index b7f0a6a6d9def299660158f65d0b85af24d92699..42b50d8f321a365c574de2e27cc5dead749dbee4 100644 Binary files a/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl and b/irlc/tests/unitgrade_data/Problem4LQRAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl b/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl index 70a85f4bcf6fc78c263690ba8b0491e7799bf59c..14b3e4b4c95270f0c2953a2cc41a66833ba99d7f 100644 Binary files a/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl and b/irlc/tests/unitgrade_data/Problem4PIDAgent.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl b/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl index 3968cbf7284d8d716e19ed9972ad53d4d3172bf7..33dfa81f677fd061a0a39b2c51757d929785cd80 100644 Binary files a/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl and b/irlc/tests/unitgrade_data/Problem5PacmanHardcoded.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl b/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl index 4650e0a6d3fa65b094ffb89356fae69744d7793a..b61782009434e3024f670821a02eff567ea7220c 100644 Binary files a/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl and b/irlc/tests/unitgrade_data/Problem5_6_Boeing.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl b/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl index e7da887928def8fb94b9e7e3d034e373928ce32e..354e3485c6913c4ed2b0e90c1416d05becf63c1c 100644 Binary files a/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl and b/irlc/tests/unitgrade_data/Problem6ChessTournament.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem7PIDCar.pkl b/irlc/tests/unitgrade_data/Problem7PIDCar.pkl index e22d2e24d83e87606e137a33987ca83d3c67a210..2ff576403f28ebc1f96c87a40defa18f2263737b 100644 Binary files a/irlc/tests/unitgrade_data/Problem7PIDCar.pkl and b/irlc/tests/unitgrade_data/Problem7PIDCar.pkl differ diff --git a/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl b/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl index d455290c459b3de4a4a31f3905f4f131de9a2b78..c0103b3e977fa2b98a34cf16e69b4168cf7d8d53 100644 Binary files a/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl and b/irlc/tests/unitgrade_data/Problem7_8_PidLQR.pkl differ diff --git a/irlc/tests/unitgrade_data/RendevouzItem.pkl b/irlc/tests/unitgrade_data/RendevouzItem.pkl index 91c3ae562ba9fce0668664f1fa58979cd070b2a7..2ea308be8ae3ae254027640d548e0f9972c8cfe6 100644 Binary files a/irlc/tests/unitgrade_data/RendevouzItem.pkl and b/irlc/tests/unitgrade_data/RendevouzItem.pkl differ diff --git a/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl b/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl new file mode 100644 index 0000000000000000000000000000000000000000..55e379fe474d7a967700bb9c83202905b8ebcbfa Binary files /dev/null and b/irlc/tests/unitgrade_data/UCBAgentQuestion.pkl differ diff --git a/irlc/utils/bandit_graphics_environment.py b/irlc/utils/bandit_graphics_environment.py index 391f0ea5f31d49a4bc70edddc624eb8d27fd018c..a050ed32a36636d75c61fb146fe25f8a5ad9afc3 100644 --- a/irlc/utils/bandit_graphics_environment.py +++ b/irlc/utils/bandit_graphics_environment.py @@ -50,7 +50,12 @@ class GraphicalBandit(BinaryBandit): def reset(self): s, info = super().reset() + if hasattr(self, 'agent'): + if hasattr(self.agent, 'Q'): del self.agent.Q + if 
hasattr(self.agent, 'N'): del self.agent.N
+        self.render()
+        return s, info
 
     def step(self, action):
@@ -217,7 +222,7 @@ class BanditViewer:
         reward = self.last_reward
         action = self.last_action
         self.ghost.set_direction(self.ghost.rand_eyes()) # Random eyes.
-        if reward is not None:
+        if reward is not None and action is not None:
             if reward <= 0:
                 self.ghost.kill()
             else:
diff --git a/solutions/ex07/ilqr_TODO_1.py b/solutions/ex07/ilqr_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ada2c3a7adcd0a1d3995bd723716a749f4c08e
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_1.py
@@ -0,0 +1,2 @@
+ l, L = [np.zeros((m,))]*N, [np.zeros((m,n))]*N
+ x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_10.py b/solutions/ex07/ilqr_TODO_10.py
new file mode 100644
index 0000000000000000000000000000000000000000..97423181e2f17ec2e986788e79b4fd978a7c6838
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_10.py
@@ -0,0 +1 @@
+ Delta, mu = max(1.0, Delta) * Delta_0, max(mu_min, mu * Delta) # Increase
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_11.py b/solutions/ex07/ilqr_TODO_11.py
new file mode 100644
index 0000000000000000000000000000000000000000..dafc65dda3f1d6f36c21e3a0a612e97e4606bafb
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_11.py
@@ -0,0 +1,4 @@
+ R = c_uu
+ H = c_ux
+ q, qN = c_x[:-1], c_x[-1]
+ r = c_u
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_12.py b/solutions/ex07/ilqr_TODO_12.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5823c6a0680218628af756b10761f3211c04501
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_12.py
@@ -0,0 +1,4 @@
+ # fs = [(v[1],v[2]) for v in [model.f(x, u, k, compute_jacobian=True) for k, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]]
+ fs = [model.f_jacobian(x, u, k) for k, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]
+
+ A, B = zip(*fs)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_13.py b/solutions/ex07/ilqr_TODO_13.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ceba56deb0495616a22805f7a38bb9733ec181
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_13.py
@@ -0,0 +1,2 @@
+ gs = [model.cost.c(x, u, i, compute_gradients=True) for i, (x, u) in enumerate(zip(x_bar[:-1], u_bar))]
+ c, c_x, c_u, c_xx, c_ux, c_uu = zip(*gs)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_14.py b/solutions/ex07/ilqr_TODO_14.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c925083e59871befab170fb5ec809e6311e5452
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_14.py
@@ -0,0 +1,3 @@
+ c = c + (cN,)
+ c_x = c_x + (c_xN,)
+ c_xx = c_xx + (c_xxN,)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_15.py b/solutions/ex07/ilqr_TODO_15.py
new file mode 100644
index 0000000000000000000000000000000000000000..73a0fa4ac59c0fedc7f4d7e87645ab4372a80399
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_15.py
@@ -0,0 +1 @@
+ u_star[i] = u_bar[i] + alpha * l[i] + L[i] @ (x[i] - x_bar[i])
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_16.py b/solutions/ex07/ilqr_TODO_16.py
new file mode 100644
index 0000000000000000000000000000000000000000..20904f4a15cb7b9db30da28ccb88bb21a1bb040d
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_16.py
@@ -0,0 +1 @@
+ x[i + 1] = model.f(x[i], u_star[i], i)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_2.py b/solutions/ex07/ilqr_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ff3ca1047f1886bf4e94a5705d33b3eb88f1fc
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_2.py
@@ -0,0 +1,2 @@
+ A, B, c, c_x, c_u, c_xx, c_ux, c_uu = get_derivatives(model, x_bar, u_bar)
+ J = sum(c)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_3.py b/solutions/ex07/ilqr_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..417755d3e4956fbbf5bc0bb596377781412cb47b
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_3.py
@@ -0,0 +1 @@
+ L, l = backward_pass(A, B, c_x, c_u, c_xx, c_ux, c_uu, mu)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_4.py b/solutions/ex07/ilqr_TODO_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..6db866f5b27f7921e409d179462e51c8b8ea1420
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_4.py
@@ -0,0 +1 @@
+ x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l, alpha=alpha)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_5.py b/solutions/ex07/ilqr_TODO_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ead664be5795faaebda22e5321ae4673eefdebf
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_5.py
@@ -0,0 +1,2 @@
+ l, L = [np.zeros((m,))] * N, [np.zeros((m, n))] * N
+ x_bar, u_bar = forward_pass(model, x_bar, u_bar, L=L, l=l)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_6.py b/solutions/ex07/ilqr_TODO_6.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e7b84c3c79530d3e24b420e11f4d15140e6fcd9
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_6.py
@@ -0,0 +1,2 @@
+ A, B, c, c_x, c_u, c_xx, c_ux, c_uu = get_derivatives(model, x_bar, u_bar)
+ J_prime = sum(c)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_7.py b/solutions/ex07/ilqr_TODO_7.py
new file mode 100644
index 0000000000000000000000000000000000000000..f23d1c9bd15a76d13938844705782aa675cc9e7e
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_7.py
@@ -0,0 +1 @@
+ L, l = backward_pass(A, B, c_x, c_u, c_xx, c_ux, c_uu, mu)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_8.py b/solutions/ex07/ilqr_TODO_8.py
new file mode 100644
index 0000000000000000000000000000000000000000..123b9bb988a13d8f828a0af3b37c6a9f213495ff
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_8.py
@@ -0,0 +1 @@
+ J_new = cost_of_trajectory(model, x_hat, u_hat)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_TODO_9.py b/solutions/ex07/ilqr_TODO_9.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fe4de670d25185a976a1fb36ae4eb04e539a456
--- /dev/null
+++ b/solutions/ex07/ilqr_TODO_9.py
@@ -0,0 +1 @@
+ Delta, mu = min(1.0, Delta) / Delta_0, max(0, mu*Delta)
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_agent_TODO_1.py b/solutions/ex07/ilqr_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..5632d146890b0bcadefd5b04fccf7393338debcf
--- /dev/null
+++ b/solutions/ex07/ilqr_agent_TODO_1.py
@@ -0,0 +1 @@
+ u = self.ubar[k] + self.L[k]@ (x-self.xbar[k]) + self.l[k]
\ No newline at end of file
diff --git a/solutions/ex07/ilqr_pendulum_TODO_1.py b/solutions/ex07/ilqr_pendulum_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee07f700ec7c23c71a96ddb29e0e4e1eb9ba7e5
--- /dev/null
+++ b/solutions/ex07/ilqr_pendulum_TODO_1.py
@@ -0,0 +1 @@
+ xs, us, J_hist, L, l = ilqr(model, N, x0, n_iter=n_iter, use_linesearch=use_linesearch)
\ No newline at end of file
diff --git a/solutions/ex07/linearization_agent_TODO_1.py b/solutions/ex07/linearization_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a9c162e64d62675433a1f5174da17654e528e42
--- /dev/null
+++ b/solutions/ex07/linearization_agent_TODO_1.py
@@ -0,0 +1,4 @@
+ xp = model.f(xbar, ubar, k=0)
+ A, B = model.f_jacobian(xbar, ubar, k=0)
+
+ d = xp - A @ xbar - B @ ubar
\ No newline at end of file
diff --git a/solutions/ex07/linearization_agent_TODO_2.py b/solutions/ex07/linearization_agent_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f84fac4dd7bf0800e2734cee2d913762b410355e
--- /dev/null
+++ b/solutions/ex07/linearization_agent_TODO_2.py
@@ -0,0 +1 @@
+ (self.L, self.l), (V, v, vc) = LQR(A=[A]*N, B=[B]*N, d=[d]*N, Q=[Q]*N, q=[q]*N, R=[self.model.cost.R]*N)
\ No newline at end of file
diff --git a/solutions/ex07/linearization_agent_TODO_3.py b/solutions/ex07/linearization_agent_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..797bd934a4c3c08c245bbdbe956a9008510367a8
--- /dev/null
+++ b/solutions/ex07/linearization_agent_TODO_3.py
@@ -0,0 +1 @@
+ u = self.L[0] @ x + self.l[0]
\ No newline at end of file
diff --git a/solutions/ex08/bandits_TODO_1.py b/solutions/ex08/bandits_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ee400597c9534a859b929037ae3c6483ec44ee
--- /dev/null
+++ b/solutions/ex08/bandits_TODO_1.py
@@ -0,0 +1,2 @@
+ reward = self.q_star[a] + np.random.randn()
+ gab = self.q_star[self.optimal_action] - self.q_star[a]
\ No newline at end of file
diff --git a/solutions/ex08/gradient_agent_TODO_1.py b/solutions/ex08/gradient_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c166f9787f05f6acd9b977fe814558616388c3d
--- /dev/null
+++ b/solutions/ex08/gradient_agent_TODO_1.py
@@ -0,0 +1,9 @@
+ pi_a = self.Pa()
+ for b in range(self.k):
+     if b == a:
+         self.H[b] += self.alpha * (r - self.R_bar) * (1 - pi_a[b])
+     else:
+         self.H[b] -= self.alpha * (r - self.R_bar) * pi_a[b]
+
+ if self.baseline:
+     self.R_bar = self.R_bar + (self.alpha if self.alpha is not None else 1/(self.t+1)) * (r - self.R_bar)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_1.py b/solutions/ex08/grand_bandit_race_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e8845cc54405e1d5cac07b51eb414b9170e1ef
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_1.py
@@ -0,0 +1 @@
+ bandit1 = StationaryBandit(k=10)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_2.py b/solutions/ex08/grand_bandit_race_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9ffb8db36b8c1313eab01a4668a45f30d3cc243
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_2.py
@@ -0,0 +1,5 @@
+ agents = [BasicAgent(bandit1, epsilon=epsilon)]
+ agents += [MovingAverageAgent(bandit1, epsilon=epsilon, alpha=alpha)]
+ agents += [GradientAgent(bandit1, alpha=alpha,use_baseline=False) ]
+ agents += [GradientAgent(bandit1, alpha=alpha,use_baseline=True) ]
+ agents += [UCBAgent(bandit1, c=2)]
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_3.py b/solutions/ex08/grand_bandit_race_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..807579c172456281f047d47aa499d87869460af6
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_3.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit1, agents, max_episodes=2000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_4.py b/solutions/ex08/grand_bandit_race_TODO_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a9cb82fb4ef4d9adbd1f7d726d53189b2827710
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_4.py
@@ -0,0 +1 @@
+ bandit2 = StationaryBandit(k=10, q_star_mean=4)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_5.py b/solutions/ex08/grand_bandit_race_TODO_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a6cfc7bd723679fac58db343018033760fcd8f9
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_5.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit2, agents, max_episodes=2000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_6.py b/solutions/ex08/grand_bandit_race_TODO_6.py
new file mode 100644
index 0000000000000000000000000000000000000000..20c9ba027fd3dc0a70ea72bf97622c69031199f3
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_6.py
@@ -0,0 +1 @@
+ bandit3 = NonstationaryBandit(k=10)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_7.py b/solutions/ex08/grand_bandit_race_TODO_7.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2a5676b54581b39119cddc327bdf79459f97f57
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_7.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit3, agents, max_episodes=2000, steps=10000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/grand_bandit_race_TODO_8.py b/solutions/ex08/grand_bandit_race_TODO_8.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34bc62119f16531be2e9792bf8b442f365132d2
--- /dev/null
+++ b/solutions/ex08/grand_bandit_race_TODO_8.py
@@ -0,0 +1 @@
+ eval_and_plot(bandit1, agents2, steps=10000, labels=labels)
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_1.py b/solutions/ex08/nonstationary_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e53da8bcceffb8f6b6c54f0cdfe39eaef44f9959
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_1.py
@@ -0,0 +1,2 @@
+ self.q_star += self.reward_change_std * np.random.randn(self.k)
+ self.optimal_action = np.argmax(self.q_star)
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_2.py b/solutions/ex08/nonstationary_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..26dc95b4931d7fab277c37fff8b5894f3315c332
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_2.py
@@ -0,0 +1,2 @@
+ self.alpha=alpha
+ super().__init__(env, epsilon=epsilon)
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_3.py b/solutions/ex08/nonstationary_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..d45b3a0f69beac0af8bfd8249534906f13bdd167
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_3.py
@@ -0,0 +1 @@
+ self.Q[a] = self.Q[a] + self.alpha * (r-self.Q[a])
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_4.py b/solutions/ex08/nonstationary_TODO_4.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ffd80476a720982fcd5655d5b58f00ee8e6d42
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_4.py
@@ -0,0 +1,4 @@
+ bandit = NonstationaryBandit(k=10)
+
+ agents = [BasicAgent(bandit, epsilon=epsilon)]
+ agents += [MovingAverageAgent(bandit, epsilon=epsilon, alpha=alpha) for alpha in alphas]
\ No newline at end of file
diff --git a/solutions/ex08/nonstationary_TODO_5.py b/solutions/ex08/nonstationary_TODO_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..9742313984f8906af30c34cac26785b1b2ec8791
--- /dev/null
+++ b/solutions/ex08/nonstationary_TODO_5.py
@@ -0,0 +1 @@
+ labels += [f"Mov.avg. agent, epsilon={epsilon}, alpha={alpha}" for alpha in alphas]
\ No newline at end of file
diff --git a/solutions/ex08/simple_agents_TODO_1.py b/solutions/ex08/simple_agents_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..5416e07d0958fe82276fa186510f0402eafc7ece
--- /dev/null
+++ b/solutions/ex08/simple_agents_TODO_1.py
@@ -0,0 +1,2 @@
+ self.Q = np.zeros((self.k,))
+ self.N = np.zeros((self.k,))
\ No newline at end of file
diff --git a/solutions/ex08/simple_agents_TODO_2.py b/solutions/ex08/simple_agents_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d91b3938f2b78d14de98a867793d8b33f61cae55
--- /dev/null
+++ b/solutions/ex08/simple_agents_TODO_2.py
@@ -0,0 +1 @@
+ return np.random.randint(self.k) if np.random.rand() < self.epsilon else np.argmax(self.Q)
\ No newline at end of file
diff --git a/solutions/ex08/simple_agents_TODO_3.py b/solutions/ex08/simple_agents_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..df218f01d8927b72f1920bc2841a8f2ae0b0e616
--- /dev/null
+++ b/solutions/ex08/simple_agents_TODO_3.py
@@ -0,0 +1,2 @@
+ self.N[a] = self.N[a] + 1
+ self.Q[a] = self.Q[a] + 1/self.N[a] * (r-self.Q[a])
\ No newline at end of file
diff --git a/solutions/ex08/ucb_agent_TODO_1.py b/solutions/ex08/ucb_agent_TODO_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..4812f63b0e4fc5378d676a4d2e0459c3d2b07ca8
--- /dev/null
+++ b/solutions/ex08/ucb_agent_TODO_1.py
@@ -0,0 +1,2 @@
+ self.N[a] += 1
+ self.Q[a] += 1/self.N[a] * (r - self.Q[a])
\ No newline at end of file
diff --git a/solutions/ex08/ucb_agent_TODO_2.py b/solutions/ex08/ucb_agent_TODO_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..437563cd1c647bce69dc8778e93c5bf9854f0a5c
--- /dev/null
+++ b/solutions/ex08/ucb_agent_TODO_2.py
@@ -0,0 +1,3 @@
+ k = self.env.action_space.n
+ self.Q = np.zeros((k,))
+ self.N = np.zeros((k,))
\ No newline at end of file
diff --git a/solutions/ex08/ucb_agent_TODO_3.py b/solutions/ex08/ucb_agent_TODO_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..59255040547725524492f30aa7b91b3267f04c27
--- /dev/null
+++ b/solutions/ex08/ucb_agent_TODO_3.py
@@ -0,0 +1 @@
+ return np.argmax( self.Q + self.c * np.sqrt( np.log(k+1)/(self.N+1e-8) ) )
\ No newline at end of file
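
For readers skimming the ex08 solution snippets above: they implement the standard incremental bandit updates from (SB18) — sample-average and constant-alpha value estimates, epsilon-greedy and UCB action selection. The following is a minimal, self-contained sketch of the same rules outside the irlc framework; every name in it (run_testbed, true_means, the default parameters) is illustrative only and not part of the course API or of this patch.

# Hypothetical standalone sketch -- not part of the patch or the irlc package.
import numpy as np

def run_testbed(k_arms=10, steps=500, epsilon=0.1, c=2.0, rule="eps-greedy", seed=0):
    """Average reward of a simple agent on a stationary k-armed Gaussian testbed."""
    rng = np.random.default_rng(seed)
    true_means = rng.normal(size=k_arms)   # q*(a); fixed because the bandit is stationary
    Q = np.zeros(k_arms)                   # value estimates
    N = np.zeros(k_arms)                   # pull counts
    total_reward = 0.0
    for t in range(steps):
        if rule == "eps-greedy":           # explore with probability epsilon, else act greedily
            a = int(rng.integers(k_arms)) if rng.random() < epsilon else int(np.argmax(Q))
        else:                              # UCB: optimism bonus that shrinks as N[a] grows
            a = int(np.argmax(Q + c * np.sqrt(np.log(t + 1) / (N + 1e-8))))
        r = true_means[a] + rng.normal()   # noisy reward centred on q*(a)
        N[a] += 1
        Q[a] += (r - Q[a]) / N[a]          # incremental sample-average update
        total_reward += r
    return total_reward / steps

if __name__ == "__main__":
    print("eps-greedy:", run_testbed(rule="eps-greedy"))
    print("UCB:       ", run_testbed(rule="ucb"))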