Commit 23ff9694 authored by tuhe

Project 3

parent 3c712cf7
Showing 564 additions and 3 deletions
@@ -9,7 +9,7 @@ exam_tabular_examples
#solutions/ex06
#solutions/ex07
#solutions/ex08
-solutions/ex09
+# solutions/ex09
solutions/ex10
solutions/ex11
solutions/ex12
@@ -35,7 +35,7 @@ solutions/ex13
#irlc/tests/tests_week06.py
#irlc/tests/tests_week07.py
#irlc/tests/tests_week08.py
-irlc/tests/tests_week09.py
+# irlc/tests/tests_week09.py
irlc/tests/tests_week10.py
irlc/tests/tests_week11.py
irlc/tests/tests_week12.py
@@ -72,7 +72,7 @@ irlc/exam/exam20*/solution
#irlc/lectures/lec06
#irlc/lectures/lec07
#irlc/lectures/lec08
-irlc/lectures/lec09
+# irlc/lectures/lec09
irlc/lectures/lec10
irlc/lectures/lec11
irlc/lectures/lec12
......
\documentclass[12pt,twoside]{article}
%\usepackage[table]{xcolor} % important to avoid options clash.
%\input{02465shared_preamble}
%\usepackage{cleveref}
\usepackage{url}
\usepackage{graphics}
\usepackage{multicol}
\usepackage{rotate}
\usepackage{rotating}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{pifont}
\usepackage{latexsym}
\usepackage[english]{babel}
\usepackage{epstopdf}
\usepackage{etoolbox}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{multirow,epstopdf}
\usepackage{fancyhdr}
\usepackage{booktabs}
\usepackage{xcolor}
\newcommand\redt[1]{ {\textcolor[rgb]{0.60, 0.00, 0.00}{\textbf{ #1} } } }
\newcommand{\m}[1]{\boldsymbol{ #1}}
\newcommand{\yoursolution}{ \redt{(your solution here) } }
\title{ Report 3 hand-in }
\date{ \today }
\author{Alice (\texttt{s000001})\and Bob (\texttt{s000002})\and Clara (\texttt{s000003}) }
\begin{document}
\maketitle
\begin{table}[ht!]
\caption{Attribution table. Feel free to add/remove rows and columns}
\begin{tabular}{llll}
\toprule
& Alice & Bob & Clara \\
\midrule
1: Optimal policy & 0-100\% & 0-100\% & 0-100\% \\
2: Simulating a finite approximation of the optimal action-value function & 0-100\% & 0-100\% & 0-100\% \\
3: Analytically computing the optimal action-value function & 0-100\% & 0-100\% & 0-100\% \\
4: Extend solution to all states and actions & 0-100\% & 0-100\% & 0-100\% \\
5: UCB-based exploration & 0-100\% & 0-100\% & 0-100\% \\
6: Sarlacc rules & 0-100\% & 0-100\% & 0-100\% \\
7: Escape the Sarlacc & 0-100\% & 0-100\% & 0-100\% \\
\bottomrule
\end{tabular}
\end{table}
%\paragraph{Statement about collaboration:}
%Please edit this section to reflect how you have used external resources. The following statement will in most cases suffice:
%\emph{The code in the irls/project1 directory is entirely}
%\paragraph{Main report:}
Headings have been inserted in the document for readability. You only have to edit the part which says \yoursolution.
\section{Jar-Jar at the battle of Naboo (\texttt{jarjar.py})}
\subsubsection*{{\color{red}Problem 3: Analytically computing the optimal action-value function}}
Using that ... we obtain
\begin{align}
Q^*(0,1) & = \cdots \\
Q^*(1,-1) & = \cdots
\end{align}
therefore...
\section{Finding the rebels using UCB-exploration (\texttt{rebels.py})}
\section{Individual contribution: The great sarlacc (\texttt{sarlacc.py})}
\end{document}
\ No newline at end of file
File added
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This file is required for the test system but should otherwise be empty."""
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import matplotlib.pyplot as plt
import numpy as np
def pi_optimal(s : int) -> int:
""" Compute the optimal policy for Jar-Jar binks. Don't overthink this one! """
# TODO: 1 lines missing.
raise NotImplementedError("Return the optimal action in state s.")
return action
def Q0_approximate(gamma : float, N : int) -> float:
""" Return the (estimate) of the optimal action-value function Q^*(0,1) based on
the first N rewards using a discount factor of gamma. Note the similarity to the n-step estimator. """
# TODO: 1 lines missing.
raise NotImplementedError("Return N-term approximation of the optimal action-value function Q^*(0,1)")
return return_estimate
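# Editor's note (illustrative sketch, not the course's reference solution): the docstring
# above relates Q0_approximate to the n-step estimator, i.e. a truncated discounted return
#     G_N = r_1 + gamma*r_2 + ... + gamma^(N-1)*r_N = sum_{k=0}^{N-1} gamma^k * r_{k+1}.
# The hypothetical helper below only shows how such a truncated sum is formed for a generic
# list of rewards; which rewards enter Q^*(0,1) depends on the Jar-Jar problem description.
def _discounted_sum_sketch(rewards: list, gamma: float) -> float:
    """Hypothetical helper: discounted sum of a finite list of rewards."""
    return sum(gamma ** k * r for k, r in enumerate(rewards))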
def Q_exact(s : int,a : int, gamma : float) -> float:
"""
Return the exact optimal action-value function Q^*(s,a) in the Jar-Jar problem.
I recommend focusing on simple cases first, such as the two cases in the problem.
Then try to look at larger values of s (for instance, s=2), first using actions that 'point in the right direction' (a = -1)
and then actions that point in the 'wrong' direction a=1.
There are several ways to solve the problem, but the simplest is probably to use recursions.
    *Don't* use your solution to Q0_approximate; it is only a finite-horizon approximation.
"""
# TODO: 6 lines missing.
raise NotImplementedError("return optimal action-value function Q^*(s,a) as a float.")
if __name__ == "__main__":
gamma = 0.8
ss = np.asarray(range(-10, 10))
# Make a plot of your (exact) action-value function Q(s,-1) and Q(s,1).
plt.plot(ss, [Q_exact(s, -1, gamma) for s in ss], 'k-', label='Exact, a=-1')
plt.plot(ss, [Q_exact(s, 1, gamma) for s in ss], 'r-', label='Exact, a=1')
plt.legend()
plt.grid()
plt.show()
print("All done")
This diff is collapsed.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from unitgrade import UTestCase, Report
import irlc
class JarJarPiOptimal(UTestCase):
""" Problem 1: Compute optimal policy. """
def test_pi_1(self):
from irlc.project3.jarjar import pi_optimal
self.assertLinf(pi_optimal(1), -1)
def test_pi_all(self):
from irlc.project3.jarjar import pi_optimal
for s in range(-10, 10):
if s != 0:
self.assertLinf(pi_optimal(s))
class JarJarQ0Estimated(UTestCase):
""" Problem 2: Implement Q0_approximate to (approximate) the Q-function for the optimal policy. """
def test_Q0_N1(self):
from irlc.project3.jarjar import Q0_approximate
import numpy as np
self.assertLinf(np.abs(Q0_approximate(gamma=0.8, N=1))) # TODO: Remove abs. This was added due to typo.
def test_Q0_N2(self):
from irlc.project3.jarjar import Q0_approximate
import numpy as np
self.assertLinf(np.abs(Q0_approximate(gamma=0.7, N=20))) # TODO: Remove abs. This was added due to typo.
def test_Q0_N100(self):
from irlc.project3.jarjar import Q0_approximate
import numpy as np
self.assertLinf(np.abs(Q0_approximate(gamma=0.9, N=20))) # TODO: Remove abs. This was added due to typo.
class JarJarQExact(UTestCase):
""" Problem 4: Compute Q^*(s,a) exactly by extending analytical solution. """
def test_Q_s0(self):
from irlc.project3.jarjar import Q_exact
self.assertLinf(Q_exact(0, gamma=0.8, a=1))
self.assertLinf(Q_exact(0, gamma=0.8, a=-1))
def test_Q_s1(self):
from irlc.project3.jarjar import Q_exact
self.assertLinf(Q_exact(1, gamma=0.8, a=-1))
self.assertLinf(Q_exact(1, gamma=0.95, a=-1))
self.assertLinf(Q_exact(1, gamma=0.7, a=-1))
def test_Q_s_positive(self):
from irlc.project3.jarjar import Q_exact
for s in range(20):
self.assertLinf(Q_exact(s, gamma=0.75, a=-1))
def test_Q_all(self):
from irlc.project3.jarjar import Q_exact
for s in range(-20, 20):
self.assertLinf(Q_exact(s, gamma=0.75, a=-1))
self.assertLinf(Q_exact(s, gamma=0.75, a=1))
class RebelsSimple(UTestCase):
""" Problem 5: Test the UCB-algorithm in the basic-environment with a single state """
def test_simple_four_episodes(self):
""" Test the first four episodes in the simple grid problem. """
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=4, c=5, plot=False)
# Make sure we only have 4 actions (remember to truncate the action-sequences!)
        self.assertEqual(len(actions), 4)  # Check that the number of actions is correct
self.assertEqual(actions[0], 0) # Check the first action is correct
self.assertEqualC(actions) # Check all actions.
def test_simple_nine_episodes(self):
""" Test the first nine episodes in the simple grid problem. """
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=9, c=5, plot=False)
        self.assertEqual(len(actions), 9)  # Check that the number of actions is correct
self.assertEqual(actions[0], 0) # Check the first action is correct
self.assertEqualC(actions) # Check all actions.
def test_simple_environment(self):
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(very_basic_grid, alpha=0.1, episodes=100, c=5, plot=False)
        # Check that the number of actions is correct
self.assertEqualC(len(actions))
# Check the first action is correct
self.assertEqualC(actions[0])
# Check all actions.
self.assertEqualC(actions)
def test_bridge_environment(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions, very_basic_grid
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1000, c=2, plot=False)
self.assertEqualC(len(actions))
# Check all actions.
self.assertEqualC(actions)
class RebelsBridge(UTestCase):
""" Problem 5: Test the UCB-algorithm in the bridge-environment """
def test_bridge_environment_one(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
def test_bridge_environment_two(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=2, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
def test_bridge_environment_short(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=30, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
def test_bridge_environment_long(self):
from irlc.gridworld.gridworld_environments import grid_bridge_grid
from irlc.project3.rebels import get_ucb_actions
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, episodes=1000, c=2, plot=False)
self.assertEqualC(len(actions))
self.assertEqualC(actions)
class Project3(Report):
title = "Project part 3: Reinforcement Learning"
pack_imports = [irlc]
jarjar1 = [(JarJarPiOptimal, 10),
(JarJarQ0Estimated, 10),
(JarJarQExact, 10) ]
rebels = [(RebelsSimple, 20),
(RebelsBridge, 20) ]
questions = []
questions += jarjar1
questions += rebels
if __name__ == '__main__':
from unitgrade import evaluate_report_student
evaluate_report_student(Project3())
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
from irlc.ex11.q_agent import QAgent
from irlc.gridworld.gridworld_environments import GridworldEnvironment, grid_bridge_grid
from irlc import train
from irlc.ex09.rl_agent import TabularQ
# A simple UCB action-selection problem (basic problem)
very_basic_grid = [['#',1, '#'],
[1, 'S', 2],
['#',1, '#']]
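# Editor's note (assumption about the layout format, based on the gridworld layouts used
# elsewhere in the course material): '#' appears to denote a wall, 'S' the start square,
# and a number a terminal square yielding that reward. See rebels_demo.py for a rendering.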
# TODO: 21 lines missing.
raise NotImplementedError("I wrote an agent that inherited from the Q-agent, and updated the self.pi and self.train-functions to do UCB-based exploration.")
def get_ucb_actions(layout : list, alpha : float, c : float, episodes : int, plot=False) -> list:
""" Return the sequence of actions the agent tries in the environment with the given layout-string when trained over 'episodes' episodes.
To create an environment, you can use the line:
> env = GridworldEnvironment(layout)
See also the demo-file.
The 'plot'-parameter is optional; you can use it to add visualization using a line such as:
if plot:
env = GridworldEnvironment(layout, render_mode='human')
Or you can just ignore it. Make sure to return the truncated action list (see the rebels_demo.py-file or project description).
In other words, the return value should be a long list of integers corresponding to actions:
actions = [0, 1, 2, ..., 1, 3, 2, 1, 0, ...]
"""
# TODO: 6 lines missing.
raise NotImplementedError("Implement function body")
return actions
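# Editor's note (illustrative sketch, not the reference solution): combining the docstring
# above with rebels_demo.py, the wrapper is presumably expected to look roughly like
#     env = GridworldEnvironment(layout)
#     stats, trajectories = train(env, <your UCB agent>, num_episodes=episodes, return_trajectory=True)
#     actions = [a for t in trajectories for a in t.action[:-1]]   # drop each episode's dummy action
# with the UCB agent from the TODO above supplying the exploration rule.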
if __name__ == "__main__":
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=4, plot=False)
print("Number of actions taken", len(actions))
print("List of actions taken over 4 episodes", actions)
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=8, plot=False)
print("Number of actions taken", len(actions))
print("Actions taken over 8 episodes", actions)
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=9, plot=False)
print("Number of actions taken", len(actions))
print("Actions taken over 9 episodes", actions) # In this particular case, you can also predict the 9th action. Why?
# Simulate 100 episodes. This should solve the problem.
actions = get_ucb_actions(very_basic_grid, alpha=0.1, c=5, episodes=100, plot=False)
print("Basic: Actions taken over 100 episodes", actions)
    # Simulate 300 episodes for the bridge-environment. The UCB-based method should solve the environment without being overly sensitive to c.
# You can compare your result with the Q-learning agent in the demo, which performs horribly.
actions = get_ucb_actions(grid_bridge_grid, alpha=0.1, c=5, episodes=300, plot=False)
print("Bridge: Actions taken over 300 episodes. The agent should solve the environment:", actions)
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
import numpy as np
from irlc import train, Agent, interactive, savepdf
from irlc.gridworld.gridworld_environments import GridworldEnvironment, grid_bridge_grid
from irlc.project3.rebels import very_basic_grid
from irlc.ex11.q_agent import QAgent
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('qtagg')
if __name__ == "__main__":
    np.random.seed(42)  # Fix the seed for reproducibility
env = GridworldEnvironment(very_basic_grid, render_mode='human') # Create an environment
env.reset() # Reset (to set up the visualization)
savepdf("rebels_basic", env=env) # Save a snapshot of the starting state
env.close()
# Create an interactive version.
env = GridworldEnvironment(very_basic_grid, render_mode='human') # Create an environment
agent = QAgent(env) # This agent will display the Q-values.
# agent = Agent(env) # A random agent.
# env, agent = interactive(env, agent) # Uncomment this line to play in 'env' environment. Use space to let the agent move.
stats, trajectories = train(env, agent, num_episodes=16, return_trajectory=True)
env.close()
print("Trajectory 0: States traversed", trajectories[0].state, "actions taken", trajectories[0].action)
print("Trajectory 1: States traversed", trajectories[1].state, "actions taken", trajectories[1].action)
    all_actions = [t.action[:-1] for t in trajectories]  # Collect the action sequence of each episode, excluding its last (dummy) action.
print("All actions taken in 16 episodes, excluding the terminal (dummy) action", all_actions)
# Note the last list is of length 20 -- this is because the environment will always terminate after two actions,
# and since we discard the last (dummy) action we get 20 actions.
# In general, the list of actions will be longer, as only the last action should be discarded (as in the code above).
# A more minimalistic example to plot the bridge-grid environment
bridge_env = GridworldEnvironment(grid_bridge_grid, render_mode='human')
bridge_env.reset()
savepdf("rebels_bridge", env=bridge_env)
bridge_env.close()
# The following code will simulate a Q-learning agent for 3000 (!) episodes and plot the Q-functions.
    np.random.seed(42)  # Fix the seed for reproducibility
env = GridworldEnvironment(grid_bridge_grid)
agent = QAgent(env, alpha=0.1, epsilon=0.2, gamma=1)
""" Uncomment the next line to play in the environment.
Use the space-bar to let the agent take an action, p to unpause, and otherwise use the keyboard arrows """
train(env, agent, num_episodes=3000) # Train for 3000 episodes. Surely the rebels must be found by now!
bridge_env, agent = interactive(env, agent)
bridge_env.reset()
bridge_env.savepdf("rebels_bridge_Q")
bridge_env.close()
File added
File added
File added
File added
File added
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""This file is required for the test system but should otherwise be empty."""
This diff is collapsed.
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
from unitgrade import UTestCase, Report
import irlc
import numpy as np
class SarlaccGameRules(UTestCase):
def check_rules(self, rules):
from irlc.project3i.sarlacc import game_rules
# Test what happens at the starting square s=0 for roll 1
self.assertEqualC(game_rules(rules, state=0, roll=1))
# Test what happens at the starting square s=0 for other rolls
for roll in [2, 3, 4, 5, 6]:
self.assertEqualC(game_rules(rules, state=0, roll=roll))
# Test all states:
for s in range(max(rules.keys())):
            if s not in rules:  # Only test legal states; squares in 'rules' are skipped since the player can never rest on them.
for roll in [1, 2, 3, 4, 5, 6]:
self.assertEqualC(game_rules(rules, s, roll))
def test_empty_board_rules(self):
rules = {55: -1}
self.check_rules(rules)
def test_rules(self):
from irlc.project3i.sarlacc import rules
self.check_rules(rules)
class SarlacReturn(UTestCase):
def check_return(self, rules, gamma):
from irlc.project3i.sarlacc import sarlacc_return
v = sarlacc_return(rules, gamma)
# Check that the keys (states) that are included in v are correct. I.e., that the return is computed for the right states.
states = list(sorted(v.keys()))
self.assertEqualC(states)
for s in states:
self.assertL2(v[s], tol=1e-2)
def test_sarlacc_return_empty_gamma1(self):
self.check_return({55: -1}, gamma=1)
def test_sarlacc_return(self):
from irlc.project3i.sarlacc import rules
self.check_return(rules, gamma=.8)
class Project3Individual(Report):
title = "Project part 3: Reinforcement Learning (individual)"
pack_imports = [irlc]
sarlacc = [(SarlaccGameRules, 20),
(SarlacReturn, 20)]
questions = []
questions += sarlacc
if __name__ == '__main__':
from unitgrade import evaluate_report_student
evaluate_report_student(Project3Individual())
# This file may not be shared/redistributed without permission. Please read copyright notice in the git repo. If this file contains other copyright notices disregard this text.
"""
References:
[SB18] Richard S. Sutton and Andrew G. Barto. Reinforcement Learning: An Introduction. The MIT Press, second edition, 2018. (Freely available online).
"""
from irlc import savepdf
from irlc.ex09.mdp import MDP
from irlc.ex09.value_iteration import value_iteration
import matplotlib.pyplot as plt
import numpy as np
# These are the game rules of the sarlacc: if you land on a state s in the dictionary, you are teleported to rules[s].
rules = {
2: 16,
4: 8,
7: 21,
10: 3,
12: 25,
14: 1,
17: 27,
19: 5,
22: 3,
23: 32,
24: 44,
26: 44,
28: 38,
30: 18,
33: 48,
35: 11,
36: 34,
40: 53,
41: 29,
42: 9,
45: 51,
47: 31,
50: 25,
52: 38,
55: -1,
}
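# For example, landing on square 2 teleports the player up to square 16 (a ladder), landing
# on square 14 sends them back to square 1 (a snake), and rules[55] = -1 marks the goal
# square at which the game terminates.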
def game_rules(rules : dict, state : int, roll : int) -> int:
""" Compute the next state given the game rules in 'rules', the current state 'state', and the roll
which can be roll = 1, 2, 3, 4, 5, 6.
The output should be -1 in case the game terminates, and otherwise the function should return the next state
    as an integer. Read the description of the project for examples of the rules. """
# TODO: 4 lines missing.
raise NotImplementedError("Return the next state")
return state_next
# TODO: 19 lines missing.
raise NotImplementedError("Put your code here.")
def sarlacc_return(rules : dict, gamma : float) -> dict:
""" Compute the value-function using a discount of gamma and the game rules 'rules'.
    The result should be reasonably accurate.
The value you return should be a dictionary v, so that v[state] is the value function in that state.
(i.e., the standard output format of the value_iteration function).
Hints:
    * One way to solve this problem is to create an MDP class (see for instance the Gambler problem in week 9)
      and use the value_iteration function from week 9 to solve the problem. However, it is not much harder
      to solve it by writing your own value-iteration method as in (SB18).
"""
# TODO: 2 lines missing.
raise NotImplementedError("Return the value function")
return v
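# Editor's note (background, hedged): the value-iteration update referred to above is, in
# the notation of (SB18),
#     V(s) <- max_a sum_{s', r} p(s', r | s, a) * [ r + gamma * V(s') ],
# swept over all states until the values stop changing. For this dice game the die roll is
# random, so the expectation runs over the six equally likely rolls; the exact reward
# structure is specified in the project description.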
if __name__ == "__main__":
"""
Rules for the snakes and ladder game:
The player starts in square s=0, and the game terminates when the player is in square s = 55.
    When a player lands at the base of a ladder they climb it, and when they land on the mouth of a snake they slide down to its tail.
    When a player overshoots the goal square, they move backwards from the goal by the number of moves they overshot by.
A few examples (using the rules in the 'rules' dictionary in this file):
If the player is in position s=0 (start)
> roll 2: Go to state s=16 (using the ladder)
> roll 3: Go to state s=3.
Or if the player is in state s=54
> Roll 1: Win the game
> Roll 2: stay in 54
> Roll 3: Go to 53
> Roll 4: Go to 38
"""
# Test the game rules:
for roll in [1, 2, 3, 4, 5, 6]:
print(f"In state s=0 (start), using roll {roll}, I ended up in ", game_rules(rules, 0, roll))
# Test the game rules again:
for roll in [1, 2, 3, 4, 5, 6]:
print(f"In state s=54, using roll {roll}, I ended up in ", game_rules(rules, 54, roll))
# Compute value function with the ordinary rules.
V_rules = sarlacc_return(rules, gamma=1)
# Compute value function with no rules, i.e. with an empty dictionary except for the winning state:
V_norule = sarlacc_return({55: -1}, gamma=1)
print("Time to victory when there are no snakes/ladders", V_norule[0])
print("Time to victory when there are snakes/ladders", V_rules[0])
# Make a plot of the value-functions (optional).
width = .4
def v2bar(V):
k, x = zip(*V.items())
return np.asarray(k), np.asarray(x)
plt.figure(figsize=(10,5))
plt.grid()
k,x = v2bar(V_norule)
plt.bar(k-width/2, x, width=width, label="No rules")
k, x = v2bar(V_rules)
plt.bar(k + width / 2, x, width=width, label="Rules")
plt.legend()
plt.xlabel("Current tile")
plt.ylabel("Moves remaining")
savepdf('sarlacc_value_function')
plt.show()
File added
File added