Commit 0b9db9c5 authored by tuhe

Solution to exam problems

parent 2b44091b
import numpy as np
def a_select_next_action_epsilon0(k : int, actions : list, rewards : list) -> int:
a = b_select_next_action(k, actions, rewards, epsilon=0)
return a
def b_select_next_action(k : int, actions : list, rewards : list, epsilon : float) -> int:
N = {a: 0 for a in range(k)}
S = {a: 0 for a in range(k)}
for (a, r) in zip(actions, rewards):
S[a] += r
N[a] += 1
Q = {a: S[a] / N[a] if N[a] > 0 else 0 for a in range(k)}
if np.random.rand() < epsilon:
a = np.random.randint(k)
else:
a = max(Q, key=Q.get)
return a
def c_nonstationary_Qs(k : int, actions : list, rewards : list, alpha : float) -> dict:
Q = {a: 0 for a in range(k)}
for (a, r) in zip(actions, rewards):
Q[a] = Q[a] + alpha * (r - Q[a])
return Q
if __name__ == "__main__":
actions = [1, 0, 2, 1, 2, 4, 5, 4, 3, 2, 1, 1]
rewards = [1, 1, 1, 0, 1, 3, 2, 0, 4, 1, 1, 2]
k = 10
a_t = a_select_next_action_epsilon0(k, actions, rewards)
print(f"a) The next action is suppoed to be 3, you computed {a_t}")
print(f"b) The action you computed was", b_select_next_action(k, actions, rewards, epsilon=0.3))
Q = c_nonstationary_Qs(k, actions, rewards, alpha=0.1)
print(f"c) The Q-value associated with arm a=2 is supposed to be Q(2) = 0.271, you got", Q[2])
from irlc.exam.exam2023spring.inventory import InventoryDPModel
from irlc.exam.exam2023spring.dp import DP_stochastic
import numpy as np
class InventoryDPModelB(InventoryDPModel):
def __init__(self, N=3, c=0., prob_empty=False):
self.c = c
self.prob_empty = prob_empty
super().__init__(N=N)
def g(self, x, u, w, k): # Cost function g_k(x,u,w)
if self.prob_empty:
return 0
return u * self.c + np.abs(x + u - w)
def f(self, x, u, w, k): # Dynamics f_k(x,u,w)
return max(0, min(max(self.S(k)), x + u - w))
def Pw(self, x, u, k): # Distribution over random disturbances
pw = {0: .1, 1: .3, 2: .6}
return pw
def gN(self, x):
if self.prob_empty:
return -1 if x == 1 else 0
else:
return 0
def a_get_policy(N: int, c: float, x0 : int) -> int:
model = InventoryDPModelB(N=N, c=c, prob_empty=False)
J, pi = DP_stochastic(model)
u = pi[0][x0]
return u
def b_prob_one(N : int, x0 : int) -> float:
model = InventoryDPModelB(N=N, prob_empty=True)
J, pi = DP_stochastic(model)
pr_empty = -J[0][x0]
return pr_empty
if __name__ == "__main__":
model = InventoryDPModel()
pi = [{s: 0 for s in model.S(k)} for k in range(model.N)]
x0 = 0
c = 0.5
N = 3
print(f"a) The policy choice for {c=} is {a_get_policy(N, c,x0)} should be 1")
print(f"b) The probability of ending up with a single element in the inventory is {b_prob_one(N, x0)} and should be 0.492")
from irlc.ex04.model_pendulum import PendulumModel
from irlc.ex04.discrete_control_model import DiscreteControlModel
from irlc.exam.exam2023spring.dlqr import LQR
import numpy as np
def getAB(a : float):
return np.asarray([[1,a], [0, 1]]), np.asarray([0, 1])[:,np.newaxis], np.asarray([1, 0])
def a_LQR_solve(a : float, x0 : np.ndarray) -> float:
A,B,d = getAB(a)
Q = np.eye(2)
R = np.eye(1)
N = 100
(L, l), _ = LQR(A=[A]*N, B=[B]*N, d=[d] * N, Q=[Q]*N, R=[R]*N)
u = float( L[0] @ x0 + l[0])
return u
def b_linearize(theta : float):
model = PendulumModel()
dmodel = DiscreteControlModel(model=model, dt=0.5)
xbar = np.asarray([theta, 0])
ubar = np.asarray([0])
xp = dmodel.f(xbar, ubar, k=0)
A, B = dmodel.f_jacobian(xbar, ubar, k=0)
d = xp - A @ xbar - B @ ubar
return A, B, d
def c_get_optimal_linear_policy(x0 : np.ndarray) -> float:
x0 = np.asarray(x0)
# xstar = np.asarray([np.pi/2, 0])
Q = np.eye(2)
R = np.eye(1)
# q = -Q @ xstar
# q0 = 0.5 * q@Q @q
A, B, d = b_linearize(theta=0)
N = 100
(L, l), _ = LQR([A] * N, [B]*N, [d]*N, Q=[Q]*N, R=[R]*N)
u = float(L[0] @ x0 + l[0])
return u
if __name__ == "__main__":
theta = np.pi/2 # An example: linearize around theta = pi/2.
a = 1
x0 = np.asarray([1, 0])
print(f"a) LQR action should be approximately -1.666, you got: {a_LQR_solve(a, x0)=}")
A, B, d = b_linearize(theta) # Get the three matrices.
print(f"b) Entry d[1] should be approx. 4.91, you got: {d[1]=}")
theta = 0.1 # Try a small initial angle.
print(f"c) Optimal policy for linearized problem should be approximately -1.07, you got: {c_get_optimal_linear_policy(x0=np.asarray([theta, 0]))=}")
from irlc.exam.exam2024spring.mdp import MDP
from irlc.exam.exam2024spring.policy_evaluation import policy_evaluation
from irlc.exam.exam2024spring.value_iteration import value_iteration
class BigSpender(MDP):
def __init__(self, r_airbnb=0.01):
self.p_win = 0.45
self.r_airbnb = r_airbnb
        super().__init__(initial_state=1)  # s0 = 1 means we have an apartment.
def is_terminal(self, state):
return False
def A(self, s):
        if s == 0:  # if there is no apartment, there is nothing we can do
return [0]
        if s == 1:  # If we have an apartment, we can airbnb, a=0, or gamble, a=1.
return [0, 1]
def Psr(self, s, a):
if s == 0:
return {(0, 0): 1} # No appartment means p(s=0, r=0 | s,a) = 1.
if s == 1 and a == 1: # with appartment and gambling
return {(0, 0): 1-self.p_win, # p(s=0, r=0 | s,a=1) = 1-p_win
(1, 2): self.p_win} # p(s=1, r=2 | s,a=1) = p_win
        if s == 1 and a == 0:  # with apartment and no gambling, p(s=1, r=r_airbnb | s,a) = 1.
return {(1, self.r_airbnb): 1}
def a_always_airbnb(r_airbnb : float, gamma : float) -> float:
mdp = BigSpender(r_airbnb=r_airbnb)
pi = {0: {0: 1},
1: {0: 1, 1:0}}
J = policy_evaluation(pi=pi, mdp=mdp, gamma=gamma)
    r1 = mdp.r_airbnb * 1/(1-gamma)  # n.b. this solution, which simply computes the return explicitly, is also valid.
r2 = J[1]
assert abs(r1 - r2) < 1e-3
v = r1
return v
def b_random_decisions(r_airbnb : float, gamma : float) -> float:
mdp = BigSpender(r_airbnb=r_airbnb)
pi = {0: {0: 1}, 1: {0: 0.5, 1: 0.5}}
J = policy_evaluation(pi=pi, mdp=mdp, gamma=gamma)
v = J[1]
return v
def c_is_it_better_to_gamble(r_airbnb : float, gamma : float) -> bool:
mdp = BigSpender(r_airbnb=r_airbnb)
pi, V = value_iteration(mdp, gamma)
better_to_gamble = pi[1] == 1
return better_to_gamble
if __name__ == "__main__":
print("a) The expected return is approximately 1, your result:", a_always_airbnb(r_airbnb=0.01, gamma=0.99))
print("b) The expected return is approximately 1.612, your result:", b_random_decisions(r_airbnb=0.01, gamma=0.99))
print("c1) In this case, you should return False as it is better to AirBnB, your result:", c_is_it_better_to_gamble(r_airbnb=0.02, gamma=0.99))
print("c2) In this case, you should return True as it is better to gamble, your result:", c_is_it_better_to_gamble(r_airbnb=0.01, gamma=0.99))
import numpy as np
import sympy as sym
from irlc.ex03.control_model import ControlModel
from irlc.ex03.control_cost import SymbolicQRCost
class Simulation(ControlModel):
def sym_f(self, x, u, t=None):
return [-sym.exp( u[0] -x[0]**2 )]
def get_cost(self): # The cost is only required to specify dimensions of x and u.
return SymbolicQRCost(Q=np.eye(1), R=np.eye(1))
def a_xdot(x : float, a : float) -> float:
m = Simulation()
u = a * x**2 # This approach validates our implementation of the system. A manual implementation is just as good.
xd_ = -np.exp( u - x**2 )
xdot = m.f((x,), (u,), 0)[0]
assert xd_ == xdot
return xdot
def b_rk4_simulate(u0 : float, tF : float):
x = 0
m = Simulation()
xs, us, ts, J_ = m.simulate((x,), u_fun=(u0,), t0=0, tF=tF)
    xF = xs[-1][0]
return xF
if __name__ == "__main__":
print(f"a): dx/dt should be -1, you got {a_xdot(x=2, a=1)=}")
print(f"b): Final position x(tF) should be approximately -2.09, you got {b_rk4_simulate(u0=2, tF=3)=}")
import math
from irlc.exam.exam2024spring.inventory import InventoryDPModel
from irlc.exam.exam2024spring.dp import DP_stochastic
class InventoryDPModelGowns(InventoryDPModel):
action_sale = "sale"
def __init__(self, N=3, m=3, allow_sale=False):
self.m = m
self.allow_sale = allow_sale
super().__init__(N=N)
    def A(self, x, k):  # Action space A_k(x)
        space = list(range(self.m))
        if self.allow_sale:
            space = space + [self.action_sale]
        return space
def g(self, x, u, w, k): # Cost function g_k(x,u,w)
if u == self.action_sale:
return 3/4 * (self.m - w)
else:
return InventoryDPModel.g(self, x, u, w, k)
def f(self, x, u, w, k): # Dynamics f_k(x,u,w)
if u == self.action_sale:
return 0
else:
return InventoryDPModel.f(self, x, u, w, k) # max(0, min(self.m, x + u - w))
def Pw(self, x, u, k): # Distribution over random disturbances
pw = {w: 1/self.m for w in range(self.m)}
assert math.fabs(sum(pw.values()) - 1) < 1e-6
return pw
def a_get_cost(N: int, m: int, x0 : int) -> float:
model = InventoryDPModelGowns(N=N, m=m, allow_sale=False)
J, pi = DP_stochastic(model)
expected_cost = J[0][x0]
return expected_cost
def b_sale(N : int, m : int, x0 : int) -> float:
model = InventoryDPModelGowns(N=N, m=m, allow_sale=True)
J, pi = DP_stochastic(model)
expected_cost = J[0][x0]
return expected_cost
if __name__ == "__main__":
x0 = 0
N = 6
m = 4
print(f"a) The expected cost should be 13.75, and you got {a_get_cost(N, m=m, x0=x0)=}")
print(f"b) Expected cost when the sales-option is available should be approximately 11.25, and you got {b_sale(N, m=m, x0=x0)=}")
from irlc.exam.midterm2023a.inventory import InventoryDPModel
def a_expected_items_next_day(x : int, u : int) -> float:
model = InventoryDPModel()
    k = 0
    expected_number_of_items = sum(p * model.f(x, u, w, k) for w, p in model.Pw(x, u, k).items())
return expected_number_of_items
def b_evaluate_policy(pi : list, x0 : int) -> float:
model = InventoryDPModel()
N = model.N
J = [{} for _ in range(N + 1)]
J[N] = {x: model.gN(x) for x in model.S(model.N)}
for k in range(N - 1, -1, -1):
for x in model.S(k):
            Qu = {u: sum(pw * (model.g(x, u, w, k) + J[k + 1][model.f(x, u, w, k)])
                         for w, pw in model.Pw(x, u, k).items())
                  for u in model.A(x, k)}
umin = pi[k][x] # min(Qu, key=Qu.get)
J[k][x] = Qu[umin] # Compute the expected cost function
J_pi_x0 = J[0][x0]
return J_pi_x0
if __name__ == "__main__":
model = InventoryDPModel()
    # Create a policy that always buys an item if the inventory is empty.
pi = [{s: 1 if s == 0 else 0 for s in model.S(k)} for k in range(model.N)]
x = 0
u = 1
x0 = 1
a_expected_items_next_day(x=0, u=1)
print(f"Given inventory is {x=} and we buy {u=}, the expected items on day k=1 is {a_expected_items_next_day(x, u)} and should be 0.1")
print(f"Evaluation of policy is {b_evaluate_policy(pi, x0)} and should be 2.7")
def pid(xs : list, xstar :float , Kp=0., Ki=0., Kd=0., stable=False):
us = []
e_prev = 0
es = []
I = 0
Delta = 1
for k, x in enumerate(xs):
e = xstar - x
es.append(e)
I = I + Delta * e
if k > 2 and stable:
d1 = (es[-1] - es[-2])/Delta
d2 = (es[-2] - es[-3]) / Delta
dterm = (d1+d2)/2
else:
dterm = (e-e_prev)/ Delta
u = Kp * e + Ki * I + Kd * dterm
e_prev = e
us.append(u)
return us[-1]
def a_pid_Kp(xs : list, xstar : float, Kp : float) -> float:
u = pid(xs, xstar, Kp=Kp)
return u
def b_pid_full(xs : list, xstar : float, Kp : float, Ki : float, Kd : float) -> float:
u = pid(xs, xstar, Kp=Kp, Ki=Ki, Kd=Kd)
return u
def c_pid_stable(xs : list, xstar : float, Kp : float, Ki : float, Kd : float) -> float:
u = pid(xs, xstar, Kp=Kp, Ki=Ki, Kd=Kd, stable=True)
return u
if __name__ == "__main__":
xs = [10, 8, 7, 5, 3, 1, 0, -2, -1, 0, 2] # Sequence of inputs x_k
Kp = 0.5
Ki = 0.05
Kd = 0.25
xstar = -1
u_a = a_pid_Kp(xs, xstar=0, Kp=Kp)
print(f"Testing part a. Got {u_a}, expected -1.")
u_b = b_pid_full(xs, xstar=-1, Kp=Kp, Ki=Ki, Kd=Kd)
print(f"Testing part b. Got {u_b}, expected -4.2")
u_c = c_pid_stable(xs, xstar=-1, Kp=Kp, Ki=Ki, Kd=Kd)
print(f"Testing part c. Got {u_c}, expected -4.075")
import numpy as np
from irlc.exam.midterm2023b.mdp import MDP
class SmallGambler(MDP):
"""
Implements a variant of the gambler problem. Please refer to the problem text for a description. You can consider this
implementation of the environment to be authoritative, and I do not recommend changing it.
"""
def __init__(self):
goal = 40
super().__init__(initial_state=goal // 2)
self.goal = 40
self.p_heads = .4 # Chance of winning.
def is_terminal(self, state):
""" Environment has been modified to never terminate. """
return False
def A(self, s):
""" Action is the amount you choose to gamble.
        You can gamble any amount from 0 up to min(s, self.goal - s).
        If you are in either s = 0 or s = self.goal, you cannot gamble anything (A(s) = {0}). """
return range(0, min(s, self.goal - s) + 1)
def Psr(self, s, a):
""" Implement transition probabilities here.
the reward is 1 if s < self.goal and s + a == self.goal and otherwise 0. Remember the format should
return a dictionary with entries:
> { (sp, r) : probability }
"""
r = 1 if s + a == self.goal and s < self.goal else -a/100
if a == 0:
d = {(s + a, r): 1}
else:
d = {(s + a, r): self.p_heads, (s - a, 0): 1 - self.p_heads}
assert sum(d.values()) == 1 # Sanity check: the probabilities must sum to 1.
return d
def a_get_reward(s : int, a : int) -> float:
mdp = SmallGambler()
avg_reward = 0
for (sp, r), p in mdp.Psr(s, a).items():
avg_reward += r * p
return avg_reward
def b_get_best_immediate_action(s : int) -> int:
mdp = SmallGambler()
if s not in mdp.nonterminal_states:
return 0
d = {a: a_get_reward(s, a) for a in mdp.A(s)}
astar = max(d, key=d.get)
    # Sanity check: the maximising action should be unique, otherwise the answer is ambiguous.
    vs = [v for v in d.values() if np.abs(v - d[astar]) < 1e-6]
    assert len(vs) == 1, vs
return astar
def c_get_best_action_twosteps(s : int) -> int:
mdp = SmallGambler()
d = {}
for a in mdp.A(s):
d[a] = 0
for (sp, r), p in mdp.Psr(s,a).items():
d[a] += p * (r + a_get_reward(sp, b_get_best_immediate_action(sp)))
astar = max(d, key=d.get)
    # Sanity check: the maximising action should be unique, otherwise the answer is ambiguous.
    vs = [v for v in d.values() if np.abs(v - d[astar]) < 1e-6]
    assert len(vs) == 1, vs
return astar
if __name__ == "__main__":
mdp = SmallGambler()
s = 16
a = 26
print(f"When {s=} and {a=} the average reward is -0.104; your value is {a_get_reward(s,a)=}")
print(f"When {s=} the best immediate action is 0, your value is {b_get_best_immediate_action(s)=}")
print(f"When {s=} the best action over two steps is 4, your value is {c_get_best_action_twosteps(s)=}")
def a_compute_deltas(v: dict, states: list, rewards: list, gamma: float) -> list:
    deltas = []
for t, (s, r) in enumerate(zip(states[:-1], rewards)):
sp = states[t + 1]
delta = (r + gamma * v[sp]) - v[s]
        deltas.append(delta)
return deltas
def b_perform_td0(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
    for t in range(len(rewards)):
s = states[t]
sp = states[t + 1]
r = rewards[t]
delta = r + gamma * v[sp] - v[s]
        v[s] = v[s] + alpha * delta
return v
def c_perform_td0_batched(v: dict, states: list, rewards: list, gamma: float, alpha: float) -> dict:
    deltas = a_compute_deltas(v, states, rewards, gamma)
for t in range(len(rewards)):
s = states[t]
        v[s] = v[s] + alpha * deltas[t]
return v
if __name__ == "__main__":
states = [1, 0, 2, -1, 2, 4, 5, 4, 3, 2, 1, -1]
rewards = [1, 0.5, -1, 0, 1, 2, 2, 0, 0, -1, 0.5]
# In the notation of the problem: T = len(rewards).
v = {s: 0 for s in states} # Initialize the value function v.
gamma = 0.9
alpha = 0.2
deltas = a_compute_deltas(v, states, rewards, gamma)
print(f"The first value of delta should be 1, your value is {deltas[0]=}")
v = b_perform_td0(v, states, rewards, gamma, alpha)
print(f"The value function v(s=1) should be 0.25352, your value is {v[1]=}")
v_batched = {s: 0 for s in states} # Initialize the value function anew
v_batched = c_perform_td0_batched(v_batched, states, rewards, gamma, alpha)
print(f"The batched value function in v(s=1) should be 0.3, your value is {v_batched[1]=}")