Commit 4ab076af authored by tuhe

Initializing first week

parent 74578260
Showing 690 additions and 4 deletions
@@ -60,8 +60,10 @@ irlc/exam/exam2024august/*.pdf
irlc/exam/exam2025*/*.zip
irlc/exam/exam2025*/*.pdf
#irlc/exam/midterm2023a
#irlc/exam/midterm2023b
irlc/exam/exam2*/solution
irlc/exam/midterm2023a
irlc/exam/midterm2023b
irlc/lectures/lec01
irlc/lectures/lec02
@@ -91,10 +91,10 @@ class SymbolicBicycleModel(ControlModel):
def x_bound(self) -> Box:
return Box(np.asarray([-np.inf, -np.inf, -np.inf, -np.inf, -np.inf, -self.map.width]),
-                  np.asarray([self.v_max, np.inf, np.inf, np.inf, np.inf, self.map.width]))
+                  np.asarray([self.v_max, np.inf, np.inf, np.inf, np.inf, self.map.width]), dtype=np.float64)
def u_bound(self) -> Box:
-       return Box(np.asarray([-0.5, -1]),np.asarray([0.5, 1]))
+       return Box(np.asarray([-0.5, -1]),np.asarray([0.5, 1]), dtype=np.float64)
def render(self, x, render_mode='human'):
if self.viewer == None:
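The change above adds an explicit ``dtype`` to the ``Box`` spaces. A minimal, self-contained sketch of the same pattern (the bounds below are illustrative, not taken from the model):
import numpy as np
from gymnasium.spaces import Box
# gymnasium's Box defaults to dtype=np.float32; passing dtype=np.float64 makes the space
# (and the samples it produces) double precision.
u_space = Box(np.asarray([-0.5, -1.0]), np.asarray([0.5, 1.0]), dtype=np.float64)
print(u_space.dtype)                              # float64
print(u_space.contains(np.array([0.2, -0.3])))    # True: the action lies inside the bounds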
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This directory contains the exercises for week 1."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""The Agent class.
References:
[Her25] Tue Herlau. Sequential decision making. (Freely available online), 2025.
"""
import typing
import itertools
import os
import sys
from collections import OrderedDict, namedtuple
import numpy as np
from tqdm import tqdm
from irlc.utils.common import load_time_series, log_time_series
from irlc.utils.irlc_plot import existing_runs
import shutil
from gymnasium import Env
from dataclasses import dataclass
class Agent:
r"""The main agent class. See (Her25, Subsection 4.4.3) for additional details.
To use the agent class, you should first create an environment. In this case we will just create an instance of the
``InventoryEnvironment`` (see (Her25, Subsection 4.2.3))
:Example:
.. runblock:: pycon
>>> from irlc import Agent # You can import directly from top-level package
>>> import numpy as np
>>> np.random.seed(42) # Fix the seed for reproducibility
>>> from irlc.ex01.inventory_environment import InventoryEnvironment
>>> env = InventoryEnvironment() # Create an instance of the environment
>>> agent = Agent(env) # Create an instance of the agent.
>>> s0, info0 = env.reset() # Always call reset to start the environment
>>> a0 = agent.pi(s0, k=0, info=info0) # Tell the agent to compute action $a_{k=0}$
>>> print(f"In state {s0=}, the agent took the action {a0=}")
"""
def __init__(self, env: Env):
"""Instantiate the Agent class.
The agent is given the gymnasium environment it must interact with. This allows the agent to know what the
action and observation spaces are.
:param env: The gymnasium ``Env`` instance the agent should interact with.
"""
self.env = env
def pi(self, s, k : int, info : typing.Optional[dict] =None):
r"""Evaluate the Agent's policy (i.e., compute the action the agent want to take) at time step ``k`` in state ``s``.
This correspond to the environment being in a state evaluating :math:`x_k`, and the function should compute the next
action the agent wish to take:
.. math::
u_k = \mu_k(x_k)
This means that ``s`` = :math:`x_k` and ``k`` = :math:`k =\{0, 1, ...\}`. The function should return an action that lies in the action-space
of the environment.
The info dictionary:
The ``info``-dictionary contains possible extra information returned from the environment, for instance when calling the ``s, info = env.reset()`` function.
The main use in this course is in control, where the dictionary contains a value ``info['time_seconds']`` (which corresponds to the simulation time :math:`t` in seconds).
We will also use the info dictionary to let the agent know certain actions are not available. This is done by setting the ``info['mask']``-key.
Note that this is only relevant for reinforcement learning, and you should see the documentation/exercises for reinforcement learning for additional details.
The default behavior of the agent is to return a random action. An example:
.. runblock:: pycon
>>> from irlc.pacman.pacman_environment import PacmanEnvironment
>>> from irlc import Agent
>>> env = PacmanEnvironment()
>>> s, info = env.reset()
>>> agent = Agent(env)
>>> agent.pi(s, k=0, info=info) # get a random action
>>> agent.pi(s, k=0) # If info is not specified, all actions are assumed permissible.
:param s: Current state the environment is in.
:param k: Current time step :math:`k = 0, 1, ...`
:param info: Optional dictionary of extra information returned by the environment (e.g. an action mask, see above).
:return: The action the agent wants to take in the given state at the given time. By default the agent returns a random action.
"""
if info is None or 'mask' not in info:
return self.env.action_space.sample()
else:
""" In the case where the actions available in each state differ, openAI deals with that by specifying a
``mask``-entry in the info-dictionary. The mask can then be passed on to the
env.action_space.sample-function to make sure we don't sample illegal actions. I consider this the most
difficult and annoying thing about openai gym."""
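# Example (a sketch with hypothetical values): for a Discrete(3) action space, a mask that
# permits only actions 0 and 2 would be info['mask'] = np.array([1, 0, 1], dtype=np.int8);
# env.action_space.sample(mask=info['mask']) will then never return action 1.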
if info['mask'].max() > 1:
raise Exception("Bad mask!")
return self.env.action_space.sample(mask=info['mask'])
def train(self, s, a, r, sp, done=False, info_s=None, info_sp=None):
r"""Implement this function if the agent has to learn (be trained).
Note that you only have to implement this function from week 7 onwards -- before that, we are not interested in control methods that learn.
The agent takes a number of input arguments. You should imagine that
* ``s`` is the current state :math:`x_k`
* ``a`` is the action the agent took in state ``s``, i.e. ``a`` :math:`= u_k = \mu_k(x_k)`
* ``r`` is the reward the agent got from that action
* ``sp`` (s-plus) is the state the environment then transitioned to, i.e. ``sp`` :math:`= x_{k+1}`
* ``done`` tells the agent if the environment has stopped
* ``info_s`` is the information-dictionary returned by the environment as it transitioned to ``s``
* ``info_sp`` is the information-dictionary returned by the environment as it transitioned to ``sp``.
The following example will hopefully clarify it by showing how you would manually call the train-function once:
:Example:
.. runblock:: pycon
>>> from irlc.ex01.inventory_environment import InventoryEnvironment # import environment
>>> from irlc import Agent
>>> env = InventoryEnvironment() # Create an instance of the environment
>>> agent = Agent(env) # Create an instance of the agent.
>>> s, info_s = env.reset() # s is the current state
>>> a = agent.pi(s, k=0, info=info_s) # The agent takes an action
>>> sp, r, done, _, info_sp = env.step(a) # Environment updates
>>> agent.train(s, a, r, sp, done, info_s, info_sp) # How the training function is called
In control and dynamic programming, please recall that the reward is equal to minus the cost.
:param s: Current state :math:`x_k`
:param a: Action taken :math:`u_k`
:param r: Reward obtained by taking action :math:`u_k` in state :math:`x_k`
:param sp: The state the environment transitioned to, :math:`x_{k+1}`
:param info_s: The information dictionary corresponding to ``s``, returned by ``env.reset`` (when :math:`k=0`) and otherwise by ``env.step``.
:param info_sp: The information dictionary corresponding to ``sp``, returned by ``env.step``
:param done: Whether the environment terminated when transitioning to ``sp``
:return: None
"""
pass
def __str__(self):
"""**Optional:** A unique name for this agent. Used for labels when plotting, but can be kept like this."""
return super().__str__()
def extra_stats(self) -> dict:
"""**Optional:** Implement this function if you wish to record extra information from the ``Agent`` while training.
You can safely ignore this method; it is only used in the control part of the course to create nicer plots. """
return {}
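# A minimal sketch of how the Agent class is meant to be subclassed: override pi() to implement a
# policy. The class below is purely illustrative (it is not part of the course code) and assumes a
# Discrete action space with at least two actions; it alternates actions based on the time step k.
class _AlternatingActionAgent(Agent):
    def pi(self, s, k, info=None):
        # Take action 0 on even time steps and action 1 on odd time steps.
        return k % 2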
fields = ('time', 'state', 'action', 'reward')
Trajectory = namedtuple('Trajectory', fields + ("env_info",))
# Experiment using a dataclass.
@dataclass
class Stats:
episode: int
episode_length: int
accumulated_reward: float
total_steps: int
trajectory : Trajectory = None
agent_stats : dict = None
@property
def average_reward(self):
return self.accumulated_reward / self.episode_length
# s = Stats(episode=0, episode_length=5, accumulated_reward=4, total_steps=2, trajectory=Trajectory())
def train(env,
agent=None,
experiment_name=None,
num_episodes=1,
verbose=True,
reset=True, # If True we will call env.reset() upon episode start.
max_steps=1e10,
max_runs=None,
return_trajectory=True, # Return the current trajectories as a list
resume_stats=None, # Resume stat collection from last save.
log_interval=1, # Only log every log_interval steps. Reduces size of log files.
delete_old_experiments=False, # Remove the old experiments folder. Useful while debugging a model (or to conserve disk space)
seed=None, # Attempt to set the seed of the random number generator to produce reproducible results.
):
"""This function implements the main training loop as described in (Her25, Subsection 4.4.4).
The loop will simulate the interaction between agent `agent` and the environment `env`.
The function has a lot of special functionality, so it is useful to consider the common cases. An example:
>>> stats, _ = train(env, agent, num_episodes=2)
Simulate interaction for two episodes (i.e. environment terminates two times and is reset).
`stats` will be a list of length two containing information from each run
>>> stats, trajectories = train(env, agent, num_episodes=2, return_trajectory=True)
`trajectories` will be a list of length two containing information from the two trajectories.
>>> stats, _ = train(env, agent, experiment_name='experiments/my_run', num_episodes=2)
Save `stats` and trajectories to a file which can easily be loaded/plotted (see the course software for examples of this).
The file will be time-stamped so using several calls you can repeat the same experiment (run) many times.
>>> stats, _ = train(env, agent, experiment_name='experiments/my_run', num_episodes=2, max_runs=10)
As above, but do not perform more than 10 runs. Useful for repeated experiments.
:param env: A gymnasium ``Env`` instance (the environment)
:param agent: An ``Agent`` instance
:param experiment_name: The outcome of this experiment will be saved in a folder with this name. This will allow you to run multiple (repeated) experiments and visualize the results in a single plot, which is very important in reinforcement learning.
:param num_episodes: Number of episodes to simulate
:param verbose: Display progress bar
:param reset: Call ``env.reset()`` before the simulation starts. Default is ``True``; setting it to ``False`` is only useful in very rare cases.
:param max_steps: Terminate if this many steps have elapsed (for non-terminating environments)
:param max_runs: Maximum number of repeated experiments (requires ``experiment_name``)
:param return_trajectory: Return the list of trajectories (this can consume a lot of memory for long experiments)
:param resume_stats: Resume stat collection from last run (this requires the ``experiment_name`` variable to be set)
:param log_interval: Log stats less frequently than each episode. Useful if you want to run really long experiments.
:param delete_old_experiments: If true, old saved experiments will be deleted. This is useful during debugging.
:param seed: An integer. The random number generator of the environment will be reset to this seed allowing for reproducible results.
:return: A tuple ``(stats, trajectories)``: ``stats`` is a list with one dictionary of statistics per (started) episode, and ``trajectories`` is the corresponding list of trajectories (populated when ``return_trajectory=True``).
"""
from irlc import cache_write
from irlc import cache_read
saveload_model = False
# temporal_policy = None
save_stats = True
if agent is None:
print("[train] No agent was specified. Using irlc.Agent(env) (this agent selects actions at random)")
agent = Agent(env)
if delete_old_experiments and experiment_name is not None and os.path.isdir(experiment_name):
shutil.rmtree(experiment_name)
if experiment_name is not None and max_runs is not None and existing_runs(experiment_name) >= max_runs:
stats, recent = load_time_series(experiment_name=experiment_name)
if return_trajectory:
trajectories = cache_read(recent+"/trajectories.pkl")
else:
trajectories = []
return stats, trajectories
stats = []
steps = 0
ep_start = 0
resume_stats = saveload_model if resume_stats is None else resume_stats
recent = None
if resume_stats:
stats, recent = load_time_series(experiment_name=experiment_name)
if recent is not None:
ep_start, steps = stats[-1]['Episode']+1, stats[-1]['Steps']
trajectories = []
# include_metadata = len(inspect.getfullargspec(agent.train).args) >= 7
break_outer = False
with tqdm(total=num_episodes, disable=not verbose, file=sys.stdout, mininterval=int(num_episodes/100) if num_episodes>100 else None) as tq:
for i_episode in range(num_episodes):
if break_outer:
break
info_s = {}
if reset or i_episode > 0:
if seed is not None:
s, info_s = env.reset(seed=seed)
seed = None
else:
s, info_s = env.reset()
elif hasattr(env, "s"): # This is doing what, exactly? Perhaps save/load of agent?
s = env.s
elif hasattr(env, 'state'):
s = env.state
else:
s = env.model.s
# time = 0
reward = []
trajectory = Trajectory(time=[], state=[], action=[], reward=[], env_info=[])
k = 0 # initial state k.
for _ in itertools.count():
# policy is always temporal
a = agent.pi(s, k, info_s) # if temporal_policy else agent.pi(s)
k = k + 1
sp, r, terminated, truncated, info_sp = env.step(a)
done = terminated or truncated
if info_sp is not None and 'mask' in info_sp and info_sp['mask'].max() > 1:
print("bad")
agent.train(s, a, r, sp, done, info_s, info_sp)
if return_trajectory:
trajectory.time.append(np.asarray(info_s['time_seconds'] if 'time_seconds' in info_s else steps)) #np.asarray(time))
trajectory.state.append(s)
trajectory.action.append(a)
trajectory.reward.append(np.asarray(r))
trajectory.env_info.append(info_s)
reward.append(r)
steps += 1
# time += info_sp['dt'] if 'dt' in info_sp else 1
# time += 1
if done or steps >= max_steps:
trajectory.state.append(sp)
trajectory.env_info.append(info_sp)
trajectory.time.append(np.asarray(info_sp['time_seconds'] if 'time_seconds' in info_sp else steps))
break_outer = steps >= max_steps
break
s = sp
info_s = info_sp
if return_trajectory:
try:
from irlc.ex04.control_environment import ControlEnvironment
if isinstance(env, ControlEnvironment): # TODO: this is too hacky. States/actions should be lists, and subsequent methods should stack.
trajectory = Trajectory(**{field: np.stack([np.asarray(x_) for x_ in getattr(trajectory, field)]) for field in fields}, env_info=trajectory.env_info)
# else:
# trajectory = Trajectory(**{field: np.stack([np.asarray(x_) for x_ in getattr(trajectory, field)]) for field in fields}, env_info=trajectory.env_info)
except Exception as e:
pass
trajectories.append(trajectory)
if (i_episode + 1) % log_interval == 0:
stats.append({"Episode": i_episode + ep_start,
"Accumulated Reward": sum(reward),
# "Average Reward": np.mean(reward), # Not sure we need this anymore.
"Length": len(reward),
"Steps": steps, # Useful for deep learning applications. This should be kept, or week 13 will have issues.
**agent.extra_stats()})
rate = int(num_episodes / 100)
if rate > 0 and i_episode % rate == 0:
tq.set_postfix(ordered_dict=OrderedDict(list(OrderedDict(stats[-1]).items())[:5])) if len(stats) > 0 else None
tq.update()
sys.stderr.flush()
if resume_stats and save_stats and recent is not None:
os.remove(recent+"/log.txt")
if experiment_name is not None and save_stats:
path = log_time_series(experiment=experiment_name, list_obs=stats)
if return_trajectory:
cache_write(trajectories, path+"/trajectories.pkl")
print(f"Training completed. Logging {experiment_name}: '{', '.join( stats[0].keys()) }'")
for i, t in enumerate(trajectories):
from collections import defaultdict
nt = defaultdict(lambda: [])
if t.env_info is not None and t.env_info[1] is not None and "supersample" in t.env_info[1]:
for f in fields:
for k, ei in enumerate(t.env_info):
if 'supersample' not in ei:
continue
z = ei['supersample'].__getattribute__(f).T
if k == 0:
pass
else:
z = z[1:]
nt[f].append(z)
for f in fields:
nt[f] = np.concatenate([z for z in nt[f]],axis=0)
traj2 = Trajectory(**nt, env_info=[])
trajectories[i] = traj2
# for k, t in enumerate(stats):
# if k < len(trajectories):
# stats[k]['trajectory'] = trajectories[k]
# Turn this into a single episodes-list (refactor later)
return stats, trajectories
if __name__ == "__main__":
# Use the trajectories here.
from irlc.ex01.inventory_environment import InventoryEnvironment
env = InventoryEnvironment(N=10)
stats, traj = train(env, Agent(env))
print(stats)
s = Stats(episode=1, episode_length=2, accumulated_reward=4, total_steps=4, trajectory=None, agent_stats={})
print(s)
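# A short sketch of how repeated experiments are stored and reloaded. The experiment name
# 'experiments/my_run' is just the illustrative name from the docstring above; train() writes a
# time-stamped log for each call, and load_time_series() (imported at the top of this file) reads
# back the statistics of the most recent run.
for _ in range(3):  # three repeated runs of the same experiment
    train(env, Agent(env), experiment_name="experiments/my_run", num_episodes=2, verbose=False)
loaded_stats, recent = load_time_series(experiment_name="experiments/my_run")
print(f"Loaded {len(loaded_stats)} logged episodes from the most recent run in '{recent}'")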
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import gymnasium
import numpy as np
from gymnasium.spaces.discrete import Discrete
from irlc.ex01.agent import Agent, train
class BobFriendEnvironment(gymnasium.Env):
def __init__(self, x0=20):
self.x0 = x0
self.action_space = Discrete(2) # Possible actions {0, 1}
def reset(self):
# TODO: 1 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
return self.s, {}
def step(self, a):
# TODO: 9 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
return s_next, reward, terminated, False, {}
class AlwaysAction_u0(Agent):
def pi(self, s, k, info=None):
"""This agent should always take action u=0."""
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
class AlwaysAction_u1(Agent):
def pi(self, s, k, info=None):
"""This agent should always take action u=1."""
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
if __name__ == "__main__":
# Part A:
env = BobFriendEnvironment()
x0, _ = env.reset()
print(f"Initial amount of money is x0 = {x0} (should be 20 kroner)")
print("Lets put it in the bank, we should end up in state x1=22 and get a reward of 2 kroner")
x1, reward, _, _, _ = env.step(0)
print("we got", x1, reward)
# Since we reset the environment, we should get the same result as before:
env.reset()
x1, reward, _, _, _ = env.step(0)
print("(once more) we got", x1, reward, "(should be the same as before)")
env.reset() # We must call reset -- the environment has possibly been changed!
print("Lets lend it to our friend -- what happens will now be random")
x1, reward, _, _, _ = env.step(1)
print("we got", x1, reward)
# Part B:
stats, _ = train(env, AlwaysAction_u0(env), num_episodes=1000)
average_u0 = np.mean([stat['Accumulated Reward'] for stat in stats])
stats, _ = train(env, AlwaysAction_u1(env), num_episodes=1000)
average_u1 = np.mean([stat['Accumulated Reward'] for stat in stats])
print(f"Average reward while taking action u=0 was {average_u0} (should be 2)")
print(f"Average reward while taking action u=1 was {average_u1} (should be 4)")
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This file contains code for the Chess Tournament problem."""
import numpy as np
from gymnasium.spaces.discrete import Discrete
from gymnasium import Env
class ChessTournament(Env):
"""The ChessTournament gymnasium-environment which simulate a chess tournament.
In the problem, a chess tournament ends when a player wins two games in a row. The results
of each game are -1, 0, 1 corresponding to a loss, draw and win for player 1. See:
https://www.youtube.com/watch?v=5UQU1oBpAic
To implement this, we define the step-function such that one episode of the environment corresponds to playing
a chess tournament to completion. Once the environment completes, it returns a reward of +1 if the player won
the tournament, and otherwise 0.
Each step therefore corresponds to playing a single game in the tournament.
To implement this, we use a state corresponding to the sequence of games in the tournament:
>>> self.s = [0, -1, 1, 0, 0, 1]
In the self.step(action)-function, we ignore the action, simulate the outcome of a single game,
and append the outcome to self.s. We then compute whether the tournament has completed and, if it has,
return a reward of 1 if we won and 0 otherwise.
"""
def __init__(self, p_draw=3 / 4, p_win=2 / 3):
self.action_space = Discrete(1)
self.p_draw = p_draw
self.p_win = p_win
self.s = [] # A chess tournament is a sequence of won/lost games s = [0, -1, 1, 0, ...]
def reset(self):
"""Reset the tournament environment to begin to simulate a new tournament.
After each episode is complete, this function will reset :python:`self.s` and return the current state s and an empty dictionary.
:return:
- s - The initial state (what is it?)
- info - An empty dictionary, ``{}``
"""
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
return self.s, {}
def step(self, action):
"""Play a single game in the current tournament
The variable action is required by gymnasium but it is not used since no (player) actions occur in this problem.
The step-method should update `self.state` to be the next (new) state, compute the reward, and determine whether
the environment has terminated (:python:`done`).
:param action: This input is required by gymnasium but it is not used in this case.
:return: A tuple of the form :python:`(new_state, reward, done, False, {})`
"""
game_outcome = None # should be -1, 0, or 1 depending on outcome of single game.
## TODO: Oy veh, the following 7 lines below have been permuted. Uncomment, rearrange to the correct order and remove the error.
#-------------------------------------------------------------------------------------------------------------------------------
# else:
# else:
# game_outcome = 1
# if np.random.rand() < self.p_win:
# game_outcome = -1
# game_outcome = 0
# if np.random.rand() < self.p_draw:
raise NotImplementedError("Compute game_outcome here")
self.s.append(game_outcome)
# done = True if the tournament has ended, otherwise False. Compute it using self.s.
# TODO: 1 lines missing.
raise NotImplementedError("Compute 'done', whether the tournament has ended.")
# r = ... Compute the reward: let r = 1 if we won the tournament, otherwise 0.
# TODO: 1 lines missing.
raise NotImplementedError("Compute the reward 'r' here.")
return self.s, r, done, False, {}
def main():
"""The main method of the chess-game problem.
This function will simulate T tournaments and estimate the win probability for player 1 as p_win (the answer to the riddle), as well as
the average tournament length. Note the latter is a one-liner to estimate here, but would require non-trivial computations to obtain
analytically. Please see the :class:`gymnasium.Env` class for additional details.
"""
T = 5000
from irlc import train, Agent
env = ChessTournament()
# Compute stats using the train function. Simulate the tournament for a total of T episodes.
# TODO: 1 lines missing.
raise NotImplementedError("Compute stats here using train(env, ...). Use num_episodes.")
p_win = np.mean([st['Accumulated Reward'] for st in stats])
avg_length = np.mean([st['Length'] for st in stats])
print("Agent: Estimated chance I won the tournament: ", p_win)
print("Agent: Average tournament length", avg_length)
if __name__ == "__main__":
main()
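# A minimal, generic sketch of the Monte Carlo estimator used in main() above (deliberately unrelated
# to the chess problem): a probability is approximated by averaging indicator variables over many
# simulated outcomes.
import numpy as np
samples = np.random.rand(10_000)      # 10,000 simulated outcomes U ~ Uniform(0, 1)
p_hat = np.mean(samples > 0.9)        # estimate of P(U > 0.9); should be close to 0.1
print("Monte Carlo estimate:", p_hat)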
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
from gymnasium.spaces.discrete import Discrete
from gymnasium import Env
from irlc.ex01.agent import Agent, train
class InventoryEnvironment(Env):
def __init__(self, N=2):
self.N = N # planning horizon
self.action_space = Discrete(3) # Possible actions {0, 1, 2}
self.observation_space = Discrete(3) # Possible observations {0, 1, 2}
def reset(self):
self.s = 0 # reset initial state x0=0
self.k = 0 # reset time step k=0
return self.s, {} # Return the state we reset to (and an empty dict)
def step(self, a):
w = np.random.choice(3, p=(.1, .7, .2)) # Generate random disturbance
# TODO: 5 lines missing.
raise NotImplementedError("Insert your solution and remove this error.")
return s_next, reward, terminated, False, {} # return transition information
class RandomAgent(Agent):
def pi(self, s, k, info=None):
""" Return action to take in state s at time step k """
# TODO: 1 lines missing.
raise NotImplementedError("Implement function body")
def simplified_train(env: Env, agent: Agent) -> float:
s, _ = env.reset()
J = 0 # Accumulated reward for this rollout
for k in range(1000):
## TODO: Oy veh, the following 7 lines below have been permuted. Uncomment, rearrange to the correct order and remove the error.
#-------------------------------------------------------------------------------------------------------------------------------
# if terminated or truncated:
# sp, r, terminated, truncated, metadata = env.step(a)
# a = agent.pi(s, k)
# s = sp
# J += r
# agent.train(s, a, sp, r, terminated)
# break
raise NotImplementedError("Remove this exception after the above lines have been uncommented and rearranged.")
return J
def run_inventory():
env = InventoryEnvironment()
agent = RandomAgent(env)
stats, _ = train(env,agent,num_episodes=1,verbose=False) # Perform one rollout.
print("Accumulated reward of first episode", stats[0]['Accumulated Reward'])
# I recommend inspecting 'stats' in a debugger; why do you think it is a list of length 1?
stats, _ = train(env, agent, num_episodes=1000,verbose=False) # do 1000 rollouts
avg_reward = np.mean([stat['Accumulated Reward'] for stat in stats])
print("[RandomAgent class] Average cost of random policy J_pi_random(0)=", -avg_reward)
# Try to inspect stats again in a debugger here. How long is the list now?
stats, _ = train(env, Agent(env), num_episodes=1000,verbose=False) # Perform 1000 rollouts using Agent class
avg_reward = np.mean([stat['Accumulated Reward'] for stat in stats])
print("[Agent class] Average cost of random policy J_pi_random(0)=", -avg_reward)
""" Second part: Using the simplified training method. I.e. do not use train() below.
You can find some pretty strong hints about what goes on in simplified_train in the lecture slides for today. """
avg_reward_simplified_train = np.mean( [simplified_train(env, agent) for i in range(1000)])
print("[simplified train] Average cost of random policy J_pi_random(0) =", -avg_reward_simplified_train)
if __name__ == "__main__":
run_inventory()
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from irlc.pacman.pacman_environment import PacmanEnvironment
from irlc import Agent, train, savepdf
# Maze layouts can be specified using a string.
layout = """
%%%%%%%%%%
%P.......%
%.%%%%%%.%
%.% %.%
%.% %.%
%.% %.%
%.% %.%
%.%%%%%%.%
%........%
%%%%%%%%%%
"""
# This is our first agent. Note it inherits from the Agent class. Use <ctrl>+click in PyCharm to navigate to code definitions --
# this is a very useful habit when you work with other people's code in general, and object-oriented code in particular.
class GoAroundAgent(Agent):
def pi(self, x, k, info=None):
""" Collect all dots in the maze in the smallest amount of time.
This function should return an action, check the output of the code below to see what actions you can potentially
return.
Remember Pacman only have to solve this single maze, so don't make the function general.
Hints:
- Insert a breakpoint in the function. Try to write self.env and self.env.action_space.actions in the interpreter. Where did self.env get set?
- Remember that k is the current step number.
- Ignore the info dictionary; you can probably also ignore the state x.
- The function should return a string (the actions are strings such as 'North')
"""
# TODO: 7 lines missing.
raise NotImplementedError("Implement function body")
return 'West'
if __name__ == "__main__":
# Create an environment with the given layout. render_mode='human' is just for a nicer visualization.
env = PacmanEnvironment(layout_str=layout, render_mode='human')
# This creates a visualization (Note this makes the environment slower) which can help us see what Pacman does
# This creates the GoAroundAgent-instance
agent = GoAroundAgent(env)
# Uncomment the following line to input actions instead of the agent using the keyboard:
# env, agent = interactive(env, agent)
s, info = env.reset() # Reset (and start) the environment
savepdf("pacman_roundabout.pdf", env=env) # Saves a snapshot of the start layout
# The next two lines display two ways to get the available actions. The 'canonical' way using the
# env.action_space, and a way particular to Pacman by using the s.A() function on the state.
# You can read more about the functions in the state in project 1.
# print("Available actions at start:", env.action_space.actions) # This will list the available actions.
print("Alternative way of getting actions:", s.A()) # See also project description
# Simulate the agent for one episode
stats, _ = train(env, agent, num_episodes=1)
# Print your obtained score.
print("Your obtained score was", stats[0]['Accumulated Reward'])
env.close() # When working with visualizations, call env.close() to close any windows it may have opened.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
File deleted
File deleted
This directory is purposefully left empty. During the exam, you will be given a `.zip` file with the content of this directory.
Replace this directory with the corresponding directory from the `.zip` file to begin working on the exam.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
This directory is purposefully left empty. During the exam, you will be given a `.zip` file with the content of this directory.
Replace this directory with the corresponding directory from the `.zip` file to begin working on the exam.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
File deleted
File deleted
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
File deleted
File deleted