Commit 23ff9694 authored by tuhe

Project 3

parent 3c712cf7
Showing 564 additions and 3 deletions
@@ -9,7 +9,7 @@ exam_tabular_examples
#solutions/ex06
#solutions/ex07
#solutions/ex08
-solutions/ex09
+# solutions/ex09
solutions/ex10
solutions/ex11
solutions/ex12
@@ -35,7 +35,7 @@ solutions/ex13
#irlc/tests/tests_week06.py
#irlc/tests/tests_week07.py
#irlc/tests/tests_week08.py
-irlc/tests/tests_week09.py
+# irlc/tests/tests_week09.py
irlc/tests/tests_week10.py
irlc/tests/tests_week11.py
irlc/tests/tests_week12.py
@@ -72,7 +72,7 @@ irlc/exam/exam20*/solution
#irlc/lectures/lec06
#irlc/lectures/lec07
#irlc/lectures/lec08
-irlc/lectures/lec09
+# irlc/lectures/lec09
irlc/lectures/lec10
irlc/lectures/lec11
irlc/lectures/lec12
......
\documentclass[12pt,twoside]{article}
%\usepackage[table]{xcolor} % important to avoid options clash.
%\input{02465shared_preamble}
%\usepackage{cleveref}
\usepackage{url}
\usepackage{graphics}
\usepackage{multicol}
\usepackage{rotate}
\usepackage{rotating}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{pifont}
\usepackage{latexsym}
\usepackage[english]{babel}
\usepackage{epstopdf}
\usepackage{etoolbox}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multirow,epstopdf}
\usepackage{fancyhdr}
\usepackage{booktabs}
\usepackage{xcolor}
\newcommand\redt[1]{ {\textcolor[rgb]{0.60, 0.00, 0.00}{\textbf{ #1} } } }
\newcommand{\m}[1]{\boldsymbol{ #1}}
\newcommand{\yoursolution}{ \redt{(your solution here) } }
\title{ Report 3 hand-in }
\date{ \today }
\author{Alice (\texttt{s000001})\and Bob (\texttt{s000002})\and Clara (\texttt{s000003}) }
\begin{document}
\maketitle
\begin{table}[ht!]
\caption{Attribution table. Feel free to add/remove rows and columns}
\begin{tabular}{llll}
\toprule
& Alice & Bob & Clara \\
\midrule
1: Optimal policy & 0-100\% & 0-100\% & 0-100\% \\
2: Simulating a finite approximation of the optimal action-value function & 0-100\% & 0-100\% & 0-100\% \\
3: Analytically computing the optimal action-value function & 0-100\% & 0-100\% & 0-100\% \\
4: Extend solution to all states and actions & 0-100\% & 0-100\% & 0-100\% \\
5: UCB-based exploration & 0-100\% & 0-100\% & 0-100\% \\
6: Sarlacc rules & 0-100\% & 0-100\% & 0-100\% \\
7: Escape the Sarlacc & 0-100\% & 0-100\% & 0-100\% \\
\bottomrule
\end{tabular}
\end{table}
%\paragraph{Statement about collaboration:}
%Please edit this section to reflect how you have used external resources. The following statement will in most cases suffice:
%\emph{The code in the irls/project1 directory is entirely}
%\paragraph{Main report:}
Headings have been inserted in the document for readability. You only have to edit the part which says \yoursolution.
\section{Jar-Jar at the battle of Naboo (\texttt{jarjar.py})}
\subsubsection*{{\color{red}Problem 3: Analytically computing the optimal action-value function}}
Using that ... we obtain
\begin{align}
Q^*(0,1) & = \cdots \\
Q^*(1,-1) & = \cdots
\end{align}
therefore...
\section{Finding the rebels using UCB-exploration (\texttt{rebels.py})}
\section{Individual contribution: The great sarlacc (\texttt{sarlacc.py})}
\end{document}
\ No newline at end of file
File added
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This file is required for the test system but should otherwise be empty."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import matplotlib.pyplot as plt
import numpy as np
def pi_optimal(s : int) -> int:
""" Compute the optimal policy for Jar-Jar binks. Don't overthink this one! """
# TODO: 1 lines missing.
raise NotImplementedError("Return the optimal action in state s.")
return action
def Q0_approximate(gamma : float, N : int) -> float:
""" Return the (estimate) of the optimal action-value function Q^*(0,1) based on
the first N rewards using a discount factor of gamma. Note the similarity to the n-step estimator. """
# TODO: 1 lines missing.
raise NotImplementedError("Return N-term approximation of the optimal action-value function Q^*(0,1)")
return return_estimate
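# Editor's note (illustrative sketch, not the course's reference solution): the docstring
# above relates Q0_approximate to the n-step estimator, i.e. a truncated discounted return
#     G_N = r_1 + gamma*r_2 + ... + gamma^(N-1)*r_N = sum_{k=0}^{N-1} gamma^k * r_{k+1}.
# The hypothetical helper below only shows how such a truncated sum is formed for a generic
# list of rewards; which rewards enter Q^*(0,1) depends on the Jar-Jar problem description.
def _discounted_sum_sketch(rewards: list, gamma: float) -> float:
    """Hypothetical helper: discounted sum of a finite list of rewards."""
    return sum(gamma ** k * r for k, r in enumerate(rewards))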
def Q_exact(s : int,a : int, gamma : float) -> float:
"""
Return the exact optimal action-value function Q^*(s,a) in the Jar-Jar problem.
I recommend focusing on simple cases first, such as the two cases in the problem.
Then try to look at larger values of s (for instance, s=2), first using actions that 'point in the right direction' (a = -1)
and then actions that point in the 'wrong' direction a=1.
There are several ways to solve the problem, but the simplest is probably to use recursions.
    *Don't* use your solution to Q0_approximate; it is only a finite-horizon approximation.
"""
# TODO: 6 lines missing.
raise NotImplementedError("return optimal action-value function Q^*(s,a) as a float.")
if __name__ == "__main__":
gamma = 0.8
ss = np.asarray(range(-10, 10))
# Make a plot of your (exact) action-value function Q(s,-1) and Q(s,1).
plt.plot(ss, [Q_exact(s, -1, gamma) for s in ss], 'k-', label='Exact, a=-1')
plt.plot(ss, [Q_exact(s, 1, gamma) for s in ss], 'r-', label='Exact, a=1')
plt.legend()
plt.grid()
plt.show()
print("All done")
This diff is collapsed.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from unitgrade import UTestCase, Report
import irlc
class JarJarPiOptimal(UTestCase):
""" Problem 1: Compute optimal policy. """
def test_pi_1(self):
from irlc.project3.jarjar import pi_optimal
self.assertLinf(pi_optimal(1), -1)
def test_pi_all(self):
from irlc.project3.jarjar import pi_optimal
for s in range(-10, 10):
if s != 0:
self.assertLinf(pi_optimal(s))
class JarJarQ0Estimated(UTestCase):
""" Problem 2: Implement Q0_approximate to (approximate) the Q-function for the optimal policy. """
def test_Q0_N1(self):
from irlc.project3.jarjar import Q0_approximate
import numpy as np
self.assertLinf(np.abs(Q0_approximate(gamma=0.8, N=1))) # TODO: Remove abs. This was added due to typo.
def test_Q0_N2(self):
from irlc.project3.jarjar import Q0_approximate
import numpy as np
self.assertLinf(np.abs(Q0_approximate(gamma=0.7, N=20))) # TODO: Remove abs. This was added due to typo.
def test_Q0_N100(self):
from irlc.project3.jarjar import Q0_approximate
import numpy as np
self.assertLinf(np.abs(Q0_approximate(gamma=0.9, N=20))) # TODO: Remove abs. This was added due to typo.
class JarJarQExact(UTestCase):
""" Problem 4: Compute Q^*(s,a) exactly by extending analytical solution. """
def test_Q_s0(self):
from irlc.project3.jarjar import Q_exact
self.assertLinf(Q_exact(0, gamma=0.8, a=1))
self.assertLinf(Q_exact(0, gamma=0.8, a=-1))
def test_Q_s1(self):
from irlc.project3.jarjar import Q_exact
self.assertLinf(Q_exact(1, gamma=0.8, a=-1))
self.assertLinf(Q_exact(1, gamma=0.95, a=-1))
self.assertLinf(Q_exact(1, gamma=0.7, a=-1))
def test_Q_s_positive(self):
from irlc.project3.jarjar import Q_exact
for s in range(20):
self.assertLinf(Q_exact(s, gamma=0.75, a=-1))
def test_Q_all(self):
from irlc.project3.jarjar import Q_exact
for s in range(-20, 20):
self.assertLinf(Q_exact(s, gamma=0.75, a=-1))
self.assertLinf(Q_exact(s, gamma=0.75, a=1))
class RebelsSimple(UTestCase):
""" Problem 5: Test the UCB-algorithm in the basic-environment with a single state """
def test_simple_four_episodes(self):
""" Test the first four episodes in the simple grid problem. """
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=4, c=5, plot=False)
# Make sure we only have 4 actions (remember to truncate the action-sequences!)
        self.assertEqual(len(actions), 4)  # Check that the number of actions is correct
self.assertEqual(actions[0], 0) # Check the first action is correct
self.assertEqualC(actions) # Check all actions.
def test_simple_nine_episodes(self):
""" Test the first nine episodes in the simple grid problem. """
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=9, c=5, plot=False)
        self.assertEqual(len(actions), 9)  # Check that the number of actions is correct
self.assertEqual(actions[0], 0) # Check the first action is correct
self.assertEqualC(actions) # Check all actions.
def test_simple_environment(self):
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=100, c=5, plot=False)
        # Check that the number of actions is correct
self.assertEqualC(len(actions))
# Check the first action is correct
self.assertEqualC(actions[0])
# Check all actions.
self.assertEqualC(actions)
def test_bridge_environment(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1000, c=2, plot=False)
self.assertEqualC(len(actions))
# Check all actions.
self.assertEqualC(actions)
class RebelsBridge(UTestCase):
""" Problem 5: Test the UCB-algorithm in the bridge-environment """
def test_bridge_environment_one(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
def test_bridge_environment_two(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=2, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
def test_bridge_environment_short(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=30, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
def test_bridge_environment_long(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1000, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
class Project3(Report):
title = "Project part 3: Reinforcement Learning"
pack_imports = [irlc]
jarjar1 = [(JarJarPiOptimal, 10),
(JarJarQ0Estimated, 10),
(JarJarQExact, 10) ]
rebels = [(RebelsSimple, 20),
(RebelsBridge, 20) ]
questions = []
questions += jarjar1
questions += rebels
if __name__ == '__main__':
from unitgrade import evaluate_report_student
evaluate_report_student(Project3())
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
from irlc.ex11.q_agent import QAgent
from irlc.gridworld.gridworld_environments import GridworldEnvironment, grid_bridge_grid
from irlc import train
from irlc.ex09.rl_agent import TabularQ
# A simple UCB action-selection problem (basic problem)
very_basic_grid = [['#',1, '#'],
[1, 'S', 2],
['#',1, '#']]
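# Editor's note (assumption about the layout format, based on the gridworld layouts used
# elsewhere in the course material): '#' appears to denote a wall, 'S' the start square,
# and a number a terminal square yielding that reward. See rebels_demo.py for a rendering.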
# TODO: 21 lines missing.
raise NotImplementedError("I wrote an agent that inherited from the Q-agent, and updated the self.pi and self.train-functions to do UCB-based exploration.")
def get_ucb_actions(layout : list, alpha : float, c : float, episodes : int, plot=False) -> list:
""" Return the sequence of actions the agent tries in the environment with the given layout-string when trained over 'episodes' episodes.
To create an environment, you can use the line:
> env = GridworldEnvironment(layout)
See also the demo-file.
The 'plot'-parameter is optional; you can use it to add visualization using a line such as:
if plot:
env = GridworldEnvironment(layout, render_mode='human')
Or you can just ignore it. Make sure to return the truncated action list (see the rebels_demo.py-file or project description).
In other words, the return value should be a long list of integers corresponding to actions:
actions = [0, 1, 2, ..., 1, 3, 2, 1, 0, ...]
"""
# TODO: 6 lines missing.
raise NotImplementedError("Implement function body")
return actions
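# Editor's note (illustrative sketch, not the reference solution): combining the docstring
# above with rebels_demo.py, the wrapper is presumably expected to look roughly like
#     env = GridworldEnvironment(layout)
#     stats, trajectories = train(env, <your UCB agent>, num_episodes=episodes, return_trajectory=True)
#     actions = [a for t in trajectories for a in t.action[:-1]]   # drop each episode's dummy action
# with the UCB agent from the TODO above supplying the exploration rule.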
if __name__ == "__main__":
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=4, plot=False)
print("Number of actions taken", len(actions))
print("List of actions taken over 4 episodes", actions)
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=8, plot=False)
print("Number of actions taken", len(actions))
print("Actions taken over 8 episodes", actions)
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=9, plot=False)
print("Number of actions taken", len(actions))
print("Actions taken over 9 episodes", actions) # In this particular case, you can also predict the 9th action. Why?
# Simulate 100 episodes. This should solve the problem.
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=100, plot=False)
print("Basic: Actions taken over 100 episodes", actions)
    # Simulate 300 episodes for the bridge-environment. The UCB-based method should solve the environment without being overly sensitive to c.
# You can compare your result with the Q-learning agent in the demo, which performs horribly.
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, c=5, episodes=300, plot=False)
print("Bridge: Actions taken over 300 episodes. The agent should solve the environment:", actions)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
from irlc import train, Agent, interactive, savepdf
from irlc.gridworld.gridworld_environments import GridworldEnvironment, grid_bridge_grid
from irlc.project3.rebels import very_basic_grid
from irlc.ex11.q_agent import QAgent
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('qtagg')
if __name__ == "__main__":
    np.random.seed(42)  # Fix the seed for reproducibility
env = GridworldEnvironment(very_basic_grid, render_mode='human') # Create an environment
env.reset() # Reset (to set up the visualization)
savepdf("rebels_basic", env=env) # Save a snapshot of the starting state
env.close()
# Create an interactive version.
env = GridworldEnvironment(very_basic_grid, render_mode='human') # Create an environment
agent = QAgent(env) # This agent will display the Q-values.
# agent = Agent(env) # A random agent.
# env, agent = interactive(env, agent) # Uncomment this line to play in 'env' environment. Use space to let the agent move.
stats, trajectories = train(env, agent, num_episodes=16, return_trajectory=True)
env.close()
print("Trajectory 0: States traversed", trajectories[0].state, "actions taken", trajectories[0].action)
print("Trajectory 1: States traversed", trajectories[1].state, "actions taken", trajectories[1].action)
    all_actions = [t.action[:-1] for t in trajectories]  # Collect the action sequence of each episode, excluding its last (dummy) action.
print("All actions taken in 16 episodes, excluding the terminal (dummy) action", all_actions)
# Note the last list is of length 20 -- this is because the environment will always terminate after two actions,
# and since we discard the last (dummy) action we get 20 actions.
# In general, the list of actions will be longer, as only the last action should be discarded (as in the code above).
# A more minimalistic example to plot the bridge-grid environment
bridge_env = GridworldEnvironment(grid_bridge_grid, render_mode='human')
bridge_env.reset()
savepdf("rebels_bridge", env=bridge_env)
bridge_env.close()
# The following code will simulate a Q-learning agent for 3000 (!) episodes and plot the Q-functions.
    np.random.seed(42)  # Fix the seed for reproducibility
env = GridworldEnvironment(grid_bridge_grid)
agent = QAgent(env, alpha=0.1, epsilon=0.2, gamma=1)
""" Uncomment the next line to play in the environment.
Use the space-bar to let the agent take an action, p to unpause, and otherwise use the keyboard arrows """
train(env, agent, num_episodes=3000) # Train for 3000 episodes. Surely the rebels must be found by now!
bridge_env, agent = interactive(env, agent)
bridge_env.reset()
bridge_env.savepdf("rebels_bridge_Q")
bridge_env.close()
File added
File added
File added
File added
File added
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This file is required for the test system but should otherwise be empty."""
This diff is collapsed.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from unitgrade import UTestCase, Report
import irlc
import numpy as np
class SarlaccGameRules(UTestCase):
def check_rules(self, rules):
from irlc.project3i.sarlacc import game_rules
# Test what happens at the starting square s=0 for roll 1
self.assertEqualC(game_rules(rules, state=0, roll=1))
# Test what happens at the starting square s=0 for other rolls
for roll in [2, 3, 4, 5, 6]:
self.assertEqualC(game_rules(rules, state=0, roll=roll))
# Test all states:
for s in range(max(rules.keys())):
            if s not in rules:  # Only test legal states; squares in 'rules' are skipped since the player can never rest on them.
for roll in [1, 2, 3, 4, 5, 6]:
self.assertEqualC(game_rules(rules, s, roll))
def test_empty_board_rules(self):
rules = {55: -1}
self.check_rules(rules)
def test_rules(self):
from irlc.project3i.sarlacc import rules
self.check_rules(rules)
class SarlacReturn(UTestCase):
def check_return(self, rules, gamma):
from irlc.project3i.sarlacc import sarlacc_return
v = sarlacc_return(rules, gamma)
# Check that the keys (states) that are included in v are correct. I.e., that the return is computed for the right states.
states = list(sorted(v.keys()))
self.assertEqualC(states)
for s in states:
self.assertL2(v[s], tol=1e-2)
def test_sarlacc_return_empty_gamma1(self):
self.check_return({55: -1}, gamma=1)
def test_sarlacc_return(self):
from irlc.project3i.sarlacc import rules
self.check_return(rules, gamma=.8)
class Project3Individual(Report):
title = "Project part 3: Reinforcement Learning (individual)"
pack_imports = [irlc]
sarlacc = [(SarlaccGameRules, 20),
(SarlacReturn, 20)]
questions = []
questions += sarlacc
if __name__ == '__main__':
from unitgrade import evaluate_report_student
evaluate_report_student(Project3Individual())
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from irlc import savepdf
from irlc.ex09.mdp import MDP
from irlc.ex09.value_iteration import value_iteration
import matplotlib.pyplot as plt
import numpy as np
# These are the game rules of the sarlacc: if you land on a state s in the dictionary, you are teleported to rules[s].
rules = {
2: 16,
4: 8,
7: 21,
10: 3,
12: 25,
14: 1,
17: 27,
19: 5,
22: 3,
23: 32,
24: 44,
26: 44,
28: 38,
30: 18,
33: 48,
35: 11,
36: 34,
40: 53,
41: 29,
42: 9,
45: 51,
47: 31,
50: 25,
52: 38,
55: -1,
}
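# For example, landing on square 2 teleports the player up to square 16 (a ladder), landing
# on square 14 sends them back to square 1 (a snake), and rules[55] = -1 marks the goal
# square at which the game terminates.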
def game_rules(rules : dict, state : int, roll : int) -> int:
""" Compute the next state given the game rules in 'rules', the current state 'state', and the roll
which can be roll = 1, 2, 3, 4, 5, 6.
The output should be -1 in case the game terminates, and otherwise the function should return the next state
    as an integer. Read the description of the project for examples of the rules. """
# TODO: 4 lines missing.
raise NotImplementedError("Return the next state")
return state_next
# TODO: 19 lines missing.
raise NotImplementedError("Put your code here.")
def sarlacc_return(rules : dict, gamma : float) -> dict:
""" Compute the value-function using a discount of gamma and the game rules 'rules'.
    The result should be reasonably accurate.
The value you return should be a dictionary v, so that v[state] is the value function in that state.
(i.e., the standard output format of the value_iteration function).
Hints:
    * One way to solve this problem is to create an MDP class (see for instance the Gambler problem in week 9)
      and use the value_iteration function from week 9 to solve the problem. However, it is not much harder
      to solve it by writing your own value-iteration method as in (SB18).
"""
# TODO: 2 lines missing.
raise NotImplementedError("Return the value function")
return v
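# Editor's note (background, hedged): the value-iteration update referred to above is, in
# the notation of (SB18),
#     V(s) <- max_a sum_{s', r} p(s', r | s, a) * [ r + gamma * V(s') ],
# swept over all states until the values stop changing. For this dice game the die roll is
# random, so the expectation runs over the six equally likely rolls; the exact reward
# structure is specified in the project description.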
if __name__ == "__main__":
"""
Rules for the snakes and ladder game:
The player starts in square s=0, and the game terminates when the player is in square s = 55.
    When a player lands at the base of a ladder they climb it, and when they land on the mouth of a snake they slide down to its tail.
    When a player overshoots the goal square, they move backwards from the goal by the number of moves they overshot by.
A few examples (using the rules in the 'rules' dictionary in this file):
If the player is in position s=0 (start)
> roll 2: Go to state s=16 (using the ladder)
> roll 3: Go to state s=3.
Or if the player is in state s=54
> Roll 1: Win the game
> Roll 2: stay in 54
> Roll 3: Go to 53
> Roll 4: Go to 38
"""
# Test the game rules:
for roll in [1, 2, 3, 4, 5, 6]:
print(f"In state s=0 (start), using roll {roll}, I ended up in ", game_rules(rules, 0, roll))
# Test the game rules again:
for roll in [1, 2, 3, 4, 5, 6]:
print(f"In state s=54, using roll {roll}, I ended up in ", game_rules(rules, 54, roll))
# Compute value function with the ordinary rules.
V_rules = sarlacc_return(rules, gamma=1)
# Compute value function with no rules, i.e. with an empty dictionary except for the winning state:
V_norule = sarlacc_return({55: -1}, gamma=1)
print("Time to victory when there are no snakes/ladders", V_norule[0])
print("Time to victory when there are snakes/ladders", V_rules[0])
# Make a plot of the value-functions (optional).
width = .4
def v2bar(V):
k, x = zip(*V.items())
return np.asarray(k), np.asarray(x)
plt.figure(figsize=(10,5))
plt.grid()
k,x = v2bar(V_norule)
plt.bar(k-width/2, x, width=width, label="No rules")
k, x = v2bar(V_rules)
plt.bar(k + width / 2, x, width=width, label="Rules")
plt.legend()
plt.xlabel("Current tile")
plt.ylabel("Moves remaining")
savepdf('sarlacc_value_function')
plt.show()
File added
File added