Commit 0b9db9c5 authored by tuhe

Solution to exam problems

parent 2b44091b
import numpy as np
def a_select_next_action_epsilon0(k : int, actions : list, rewards : list) -> int:
a = b_select_next_action(k, actions, rewards, epsilon=0)
return a
def b_select_next_action(k : int, actions : list, rewards : list, epsilon : float) -> int:
N = {a: 0 for a in range(k)}
S = {a: 0 for a in range(k)}
for (a, r) in zip(actions, rewards):
S[a] += r
N[a] += 1
Q = {a: S[a] / N[a] if N[a] > 0 else 0 for a in range(k)}
if np.random.rand() < epsilon:
a = np.random.randint(k)
else:
a = max(Q, key=Q.get)
return a
def c_nonstationary_Qs(k : int, actions : list, rewards : list, alpha : float) -> dict:
Q = {a: 0 for a in range(k)}
for (a, r) in zip(actions, rewards):
Q[a] = Q[a] + alpha * (r - Q[a])
return Q
if __name__ == "__main__":
actions = [1, 0, 2, 1, 2, 4, 5, 4, 3, 2, 1, 1]
rewards = [1, 1, 1, 0, 1, 3, 2, 0, 4, 1, 1, 2]
k = 10
a_t = a_select_next_action_epsilon0(k, actions, rewards)
print(f"a) The next action is suppoed to be 3, you computed {a_t}")
print(f"b) The action you computed was", b_select_next_action(k, actions, rewards, epsilon=0.3))
Q = c_nonstationary_Qs(k, actions, rewards, alpha=0.1)
print(f"c) The Q-value associated with arm a=2 is supposed to be Q(2) = 0.271, you got", Q[2])
from irlc.exam.exam2023spring.inventory import InventoryDPModel
from irlc.exam.exam2023spring.dp import DP_stochastic
import numpy as np
class InventoryDPModelB(InventoryDPModel):
def __init__(self, N=3, c=0., prob_empty=False):
self.c = c
self.prob_empty = prob_empty
super().__init__(N=N)
def g(self, x, u, w, k): # Cost function g_k(x,u,w)
if self.prob_empty:
return 0
return u * self.c + np.abs(x + u - w)
def f(self, x, u, w, k): # Dynamics f_k(x,u,w)
return max(0, min(max(self.S(k)), x + u - w))
def Pw(self, x, u, k): # Distribution over random disturbances
pw = {0: .1, 1: .3, 2: .6}
return pw
def gN(self, x):
if self.prob_empty:
return -1 if x == 1 else 0
else:
return 0
def a_get_policy(N: int, c: float, x0 : int) -> int:
model = InventoryDPModelB(N=N, c=c, prob_empty=False)
J, pi = DP_stochastic(model)
u = pi[0][x0]
return u
def b_prob_one(N : int, x0 : int) -> float:
model = InventoryDPModelB(N=N, prob_empty=True)
J, pi = DP_stochastic(model)
pr_empty = -J[0][x0]
return pr_empty
if __name__ == "__main__":
model = InventoryDPModel()
pi = [{s: 0 for s in model.S(k)} for k in range(model.N)]
x0 = 0
c = 0.5
N = 3
print(f"a) The policy choice for {c=} is {a_get_policy(N, c,x0)} should be 1")
print(f"b) The probability of ending up with a single element in the inventory is {b_prob_one(N, x0)} and should be 0.492")
from irlc.ex04.model_pendulum import PendulumModel
from irlc.ex04.discrete_control_model import DiscreteControlModel
from irlc.exam.exam2023spring.dlqr import LQR
import numpy as np
def getAB(a : float):
return np.asarray([[1,a], [0, 1]]), np.asarray([0, 1])[:,np.newaxis], np.asarray([1, 0])
def a_LQR_solve(a : float, x0 : np.ndarray) -> float:
A,B,d = getAB(a)
Q = np.eye(2)
R = np.eye(1)
N = 100
(L, l), _ = LQR(A=[A]*N, B=[B]*N, d=[d] * N, Q=[Q]*N, R=[R]*N)
u = float( L[0] @ x0 + l[0])
return u
def b_linearize(theta : float):
model = PendulumModel()
dmodel = DiscreteControlModel(model=model, dt=0.5)
xbar = np.asarray([theta, 0])
ubar = np.asarray([0])
xp = dmodel.f(xbar, ubar, k=0)
A, B = dmodel.f_jacobian(xbar, ubar, k=0)
d = xp - A @ xbar - B @ ubar
return A, B, d
def c_get_optimal_linear_policy(x0 : np.ndarray) -> float:
x0 = np.asarray(x0)
# xstar = np.asarray([np.pi/2, 0])
Q = np.eye(2)
R = np.eye(1)
# q = -Q @ xstar
# q0 = 0.5 * q@Q @q
A, B, d = b_linearize(theta=0)
N = 100
(L, l), _ = LQR([A] * N, [B]*N, [d]*N, Q=[Q]*N, R=[R]*N)
u = float(L[0] @ x0 + l[0])
return u
if __name__ == "__main__":
theta = np.pi/2 # An example: linearize around theta = pi/2.
a = 1
x0 = np.asarray([1, 0])
print(f"a) LQR action should be approximately -1.666, you got: {a_LQR_solve(a, x0)=}")
A, B, d = b_linearize(theta) # Get the three matrices.
print(f"b) Entry d[1] should be approx. 4.91, you got: {d[1]=}")
theta = 0.1 # Try a small initial angle.
print(f"c) Optimal policy for linearized problem should be approximately -1.07, you got: {c_get_optimal_linear_policy(x0=np.asarray([theta, 0]))=}")
from irlc.exam.exam2024spring.mdp import MDP
from irlc.exam.exam2024spring.policy_evaluation import policy_evaluation
from irlc.exam.exam2024spring.value_iteration import value_iteration
class BigSpender(MDP):
def __init__(self, r_airbnb=0.01):
self.p_win = 0.45
self.r_airbnb = r_airbnb
        super().__init__(initial_state=1)  # s0 = 1 means we have an apartment.
def is_terminal(self, state):
return False
def A(self, s):
        if s == 0:  # if there is no apartment, there is nothing we can do
return [0]
        if s == 1:  # If we have an apartment, we can airbnb, a=0, or gamble, a=1.
return [0, 1]
def Psr(self, s, a):
if s == 0:
return {(0, 0): 1} # No appartment means p(s=0, r=0 | s,a) = 1.
if s == 1 and a == 1: # with appartment and gambling
return {(0, 0): 1-self.p_win, # p(s=0, r=0 | s,a=1) = 1-p_win
(1, 2): self.p_win} # p(s=1, r=2 | s,a=1) = p_win
        if s == 1 and a == 0:  # with apartment and no gambling, p(s=1, r=r_airbnb | s,a) = 1.
return {(1, self.r_airbnb): 1}
def a_always_airbnb(r_airbnb : float, gamma : float) -> float:
mdp = BigSpender(r_airbnb=r_airbnb)
pi = {0: {0: 1},
1: {0: 1, 1:0}}
J = policy_evaluation(pi=pi, mdp=mdp, gamma=gamma)
    r1 = mdp.r_airbnb * 1/(1-gamma)  # n.b. this solution, which simply computes the return explicitly, is also valid.
r2 = J[1]
assert abs(r1 - r2) < 1e-3
v = r1
return v
def b_random_decisions(r_airbnb : float, gamma : float) -> float:
mdp = BigSpender(r_airbnb=r_airbnb)
pi = {0: {0: 1}, 1: {0: 0.5, 1: 0.5}}
J = policy_evaluation(pi=pi, mdp=mdp, gamma=gamma)
v = J[1]
return v
def c_is_it_better_to_gamble(r_airbnb : float, gamma : float) -> bool:
mdp = BigSpender(r_airbnb=r_airbnb)
pi, V = value_iteration(mdp, gamma)
better_to_gamble = pi[1] == 1
return better_to_gamble
if __name__ == "__main__":
print("a) The expected return is approximately 1, your result:", a_always_airbnb(r_airbnb=0.01, gamma=0.99))
print("b) The expected return is approximately 1.612, your result:", b_random_decisions(r_airbnb=0.01, gamma=0.99))
print("c1) In this case, you should return False as it is better to AirBnB, your result:", c_is_it_better_to_gamble(r_airbnb=0.02, gamma=0.99))
print("c2) In this case, you should return True as it is better to gamble, your result:", c_is_it_better_to_gamble(r_airbnb=0.01, gamma=0.99))
import numpy as np
import sympy as sym
from irlc.ex03.control_model import ControlModel
from irlc.ex03.control_cost import SymbolicQRCost
class Simulation(ControlModel):
def sym_f(self, x, u, t=None):
return [-sym.exp( u[0] -x[0]**2 )]
def get_cost(self): # The cost is only required to specify dimensions of x and u.
return SymbolicQRCost(Q=np.eye(1), R=np.eye(1))
def a_xdot(x : float, a : float) -> float:
m = Simulation()
u = a * x**2 # This approach validates our implementation of the system. A manual implementation is just as good.
xd_ = -np.exp( u - x**2 )
xdot = m.f((x,), (u,), 0)[0]
assert xd_ == xdot
return xdot
def b_rk4_simulate(u0 : float, tF : float):
x = 0
m = Simulation()
xs, us, ts, J_ = m.simulate((x,), u_fun=(u0,), t0=0, tF=tF)
    xF = xs[-1][0]
return xF
if __name__ == "__main__":
print(f"a): dx/dt should be -1, you got {a_xdot(x=2, a=1)=}")
print(f"b): Final position x(tF) should be approximately -2.09, you got {b_rk4_simulate(u0=2, tF=3)=}")
import math
from irlc.exam.exam2024spring.inventory import InventoryDPModel
from irlc.exam.exam2024spring.dp import DP_stochastic
class InventoryDPModelGowns(InventoryDPModel):
action_sale = "sale"
def __init__(self, N=3, m=3, allow_sale=False):
self.m = m
self.allow_sale = allow_sale
super().__init__(N=N)
    def A(self, x, k):  # Action space A_k(x)
        space = list(range(self.m))
        if self.allow_sale:
            space = space + [self.action_sale]
        return space
def g(self, x, u, w, k): # Cost function g_k(x,u,w)
if u == self.action_sale:
return 3/4 * (self.m - w)
else:
return InventoryDPModel.g(self, x, u, w, k)
def f(self, x, u, w, k): # Dynamics f_k(x,u,w)
if u == self.action_sale:
return 0
else:
return InventoryDPModel.f(self, x, u, w, k) # max(0, min(self.m, x + u - w))
def Pw(self, x, u, k): # Distribution over random disturbances
pw = {w: 1/self.m for w in range(self.m)}
assert math.fabs(sum(pw.values()) - 1) < 1e-6
return pw
def a_get_cost(N: int, m: int, x0 : int) -> float:
model = InventoryDPModelGowns(N=N, m=m, allow_sale=False)
J, pi = DP_stochastic(model)
expected_cost = J[0][x0]
return expected_cost
def b_sale(N : int, m : int, x0 : int) -> float:
model = InventoryDPModelGowns(N=N, m=m, allow_sale=True)
J, pi = DP_stochastic(model)
expected_cost = J[0][x0]
return expected_cost
if __name__ == "__main__":
x0 = 0
N = 6
m = 4
print(f"a) The expected cost should be 13.75, and you got {a_get_cost(N, m=m, x0=x0)=}")
print(f"b) Expected cost when the sales-option is available should be approximately 11.25, and you got {b_sale(N, m=m, x0=x0)=}")
from irlc.exam.midterm2023a.inventory import InventoryDPModel
def a_expected_items_next_day(x : int, u : int) -> float:
model = InventoryDPModel()
    k = 0
    expected_number_of_items = sum(p * model.f(x, u, w, k) for w, p in model.Pw(x, u, k).items())
return expected_number_of_items
def b_evaluate_policy(pi : list, x0 : int) -> float:
model = InventoryDPModel()
N = model.N
J = [{} for _ in range(N + 1)]
J[N] = {x: model.gN(x) for x in model.S(model.N)}
for k in range(N - 1, -1, -1):
for x in model.S(k):
            Qu = {u: sum(pw * (model.g(x, u, w, k) + J[k + 1][model.f(x, u, w, k)])
                         for w, pw in model.Pw(x, u, k).items())
                  for u in model.A(x, k)}
umin = pi[k][x] # min(Qu, key=Qu.get)
J[k][x] = Qu[umin] # Compute the expected cost function
J_pi_x0 = J[0][x0]
return J_pi_x0
if __name__ == "__main__":
model = InventoryDPModel()
    # Create a policy that always buys an item if the inventory is empty.
pi = [{s: 1 if s == 0 else 0 for s in model.S(k)} for k in range(model.N)]
x = 0
u = 1
x0 = 1
a_expected_items_next_day(x=0, u=1)
print(f"Given inventory is {x=} and we buy {u=}, the expected items on day k=1 is {a_expected_items_next_day(x, u)} and should be 0.1")
print(f"Evaluation of policy is {b_evaluate_policy(pi, x0)} and should be 2.7")
def pid(xs : list, xstar :float , Kp=0., Ki=0., Kd=0., stable=False):
us = []
e_prev = 0
es = []
I = 0
Delta = 1
for k, x in enumerate(xs):
e = xstar - x
es.append(e)
I = I + Delta * e
if k > 2 and stable:
d1 = (es[-1] - es[-2])/Delta
d2 = (es[-2] - es[-3]) / Delta
dterm = (d1+d2)/2
else:
dterm = (e-e_prev)/ Delta
u = Kp * e + Ki * I + Kd * dterm
e_prev = e
us.append(u)
return us[-1]
def a_pid_Kp(xs : list, xstar : float, Kp : float) -> float:
u = pid(xs, xstar, Kp=Kp)
return u
def b_pid_full(xs : list, xstar : float, Kp : float, Ki : float, Kd : float) -> float:
u = pid(xs, xstar, Kp=Kp, Ki=Ki, Kd=Kd)
return u
def c_pid_stable(xs : list, xstar : float, Kp : float, Ki : float, Kd : float) -> float:
u = pid(xs, xstar, Kp=Kp, Ki=Ki, Kd=Kd, stable=True)
return u
if __name__ == "__main__":
xs = [10, 8, 7, 5, 3, 1, 0, -2, -1, 0, 2] # Sequence of inputs x_k
Kp = 0.5
Ki = 0.05
Kd = 0.25
xstar = -1
u_a = a_pid_Kp(xs, xstar=0, Kp=Kp)
print(f"Testing part a. Got {u_a}, expected -1.")
u_b = b_pid_full(xs, xstar=-1, Kp=Kp, Ki=Ki, Kd=Kd)
print(f"Testing part b. Got {u_b}, expected -4.2")
u_c = c_pid_stable(xs, xstar=-1, Kp=Kp, Ki=Ki, Kd=Kd)
print(f"Testing part c. Got {u_c}, expected -4.075")
import numpy as np
from irlc.exam.midterm2023b.mdp import MDP
class SmallGambler(MDP):
"""
Implements a variant of the gambler problem. Please refer to the problem text for a description. You can consider this
implementation of the environment to be authoritative, and I do not recommend changing it.
"""
def __init__(self):
goal = 40
super().__init__(initial_state=goal // 2)
self.goal = 40
self.p_heads = .4 # Chance of winning.
def is_terminal(self, state):
""" Environment has been modified to never terminate. """
return False
def A(self, s):
""" Action is the amount you choose to gamble.
        You can gamble any amount from 0 up to min(s, self.goal - s).
        If you are in either s = 0 or s = self.goal, you cannot gamble anything (A(s) = {0}). """
return range(0, min(s, self.goal - s) + 1)
def Psr(self, s, a):
""" Implement transition probabilities here.
the reward is 1 if s < self.goal and s + a == self.goal and otherwise 0. Remember the format should
return a dictionary with entries:
> { (sp, r) : probability }
"""
r = 1 if s + a == self.goal and s < self.goal else -a/100
if a == 0:
d = {(s + a, r): 1}
else:
d = {(s + a, r): self.p_heads, (s - a, 0): 1 - self.p_heads}
assert sum(d.values()) == 1 # Sanity check: the probabilities must sum to 1.
return d
def a_get_reward(s : int, a : int) -> float:
mdp = SmallGambler()
avg_reward = 0
for (sp, r), p in mdp.Psr(s, a).items():
avg_reward += r * p
return avg_reward
def b_get_best_immediate_action(s : int) -> int:
mdp = SmallGambler()
if s not in mdp.nonterminal_states:
return 0
d = {a: a_get_reward(s, a) for a in mdp.A(s)}
astar = max(d, key=d.get)
    # Sanity check: the maximising action should be unique, otherwise the answer is ambiguous.
    vs = [v for v in d.values() if np.abs(v - d[astar]) < 1e-6]
    assert len(vs) == 1, vs
return astar
def c_get_best_action_twosteps(s : int) -> int:
mdp = SmallGambler()
d = {}
for a in mdp.A(s):
d[a] = 0
for (sp, r), p in mdp.Psr(s,a).items():
d[a] += p * (r + a_get_reward(sp, b_get_best_immediate_action(sp)))
astar = max(d, key=d.get)
    # Sanity check: the maximising action should be unique, otherwise the answer is ambiguous.
    vs = [v for v in d.values() if np.abs(v - d[astar]) < 1e-6]
    assert len(vs) == 1, vs
return astar
if __name__ == "__main__":
mdp = SmallGambler()
s = 16
a = 26
print(f"When {s=} and {a=} the average reward is -0.104; your value is {a_get_reward(s,a)=}")
print(f"When {s=} the best immediate action is 0, your value is {b_get_best_immediate_action(s)=}")
print(f"When {s=} the best action over two steps is 4, your value is {c_get_best_action_twosteps(s)=}")
def a_compute_deltas(v: dict, states: list, rewards: list, gamma: float) -> list:
    deltas = []
for t, (s, r) in enumerate(zip(states[:-1], rewards)):
sp = states[t + 1]
delta = (r + gamma * v[sp]) - v[s]
        deltas.append(delta)
return deltas
def b_perform_td0(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
    for t in range(len(rewards)):
s = states[t]
sp = states[t + 1]
r = rewards[t]
delta = r + gamma * v[sp] - v[s]
        v[s] = v[s] + alpha * delta
return v
def c_perform_td0_batched(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
    deltas = a_compute_deltas(v, states, rewards, gamma)
for t in range(len(rewards)):
s = states[t]
        v[s] = v[s] + alpha * deltas[t]
return v
if __name__ == "__main__":
states = [1, 0, 2, -1, 2, 4, 5, 4, 3, 2, 1, -1]
rewards = [1, 0.5, -1, 0, 1, 2, 2, 0, 0, -1, 0.5]
# In the notation of the problem: T = len(rewards).
v = {s: 0 for s in states} # Initialize the value function v.
gamma = 0.9
alpha = 0.2
deltas = a_compute_deltas(v, states, rewards, gamma)
print(f"The first value of delta should be 1, your value is {deltas[0]=}")
v = b_perform_td0(v, states, rewards, gamma, alpha)
print(f"The value function v(s=1) should be 0.25352, your value is {v[1]=}")
v_batched = {s: 0 for s in states} # Initialize the value function anew
v_batched = c_perform_td0_batched(v_batched, states, rewards, gamma, alpha)
print(f"The batched value function in v(s=1) should be 0.3, your value is {v_batched[1]=}")