From d9906643ae9ecd47ac59c66fed51806e568ff19f Mon Sep 17 00:00:00 2001
From: Tue Herlau <tuhe@dtu.dk>
Date: Thu, 3 Apr 2025 16:39:42 +0200
Subject: [PATCH] Solutions for week 9

---
 solutions/ex09/gambler_TODO_1.py               | 1 +
 solutions/ex09/gambler_TODO_2.py               | 1 +
 solutions/ex09/gambler_TODO_3.py               | 4 ++++
 solutions/ex09/mdp_warmup_TODO_1.py            | 1 +
 solutions/ex09/mdp_warmup_TODO_2.py            | 1 +
 solutions/ex09/mdp_warmup_TODO_3.py            | 1 +
 solutions/ex09/mdp_warmup_TODO_4.py            | 1 +
 solutions/ex09/policy_evaluation_TODO_1.py     | 2 ++
 solutions/ex09/policy_iteration_TODO_1.py      | 6 ++++++
 solutions/ex09/value_iteration_TODO_1.py       | 2 ++
 solutions/ex09/value_iteration_TODO_2.py       | 2 ++
 solutions/ex09/value_iteration_agent_TODO_1.py | 1 +
 solutions/ex09/value_iteration_agent_TODO_2.py | 1 +
 13 files changed, 24 insertions(+)
 create mode 100644 solutions/ex09/gambler_TODO_1.py
 create mode 100644 solutions/ex09/gambler_TODO_2.py
 create mode 100644 solutions/ex09/gambler_TODO_3.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_1.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_2.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_3.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_4.py
 create mode 100644 solutions/ex09/policy_evaluation_TODO_1.py
 create mode 100644 solutions/ex09/policy_iteration_TODO_1.py
 create mode 100644 solutions/ex09/value_iteration_TODO_1.py
 create mode 100644 solutions/ex09/value_iteration_TODO_2.py
 create mode 100644 solutions/ex09/value_iteration_agent_TODO_1.py
 create mode 100644 solutions/ex09/value_iteration_agent_TODO_2.py

diff --git a/solutions/ex09/gambler_TODO_1.py b/solutions/ex09/gambler_TODO_1.py
new file mode 100644
index 0000000..5edd917
--- /dev/null
+++ b/solutions/ex09/gambler_TODO_1.py
@@ -0,0 +1 @@
+    return state in [0, self.goal]
\ No newline at end of file
diff --git a/solutions/ex09/gambler_TODO_2.py b/solutions/ex09/gambler_TODO_2.py
new file mode 100644
index 0000000..63c4cf7
--- /dev/null
+++ b/solutions/ex09/gambler_TODO_2.py
@@ -0,0 +1 @@
+    return list(range(1, min(s, self.goal - s) + 1))
\ No newline at end of file
diff --git a/solutions/ex09/gambler_TODO_3.py b/solutions/ex09/gambler_TODO_3.py
new file mode 100644
index 0000000..b4e0a66
--- /dev/null
+++ b/solutions/ex09/gambler_TODO_3.py
@@ -0,0 +1,4 @@
+    r = 1 if s + a == 100 else 0
+    WIN = (s + a, r)
+    LOSS = (s - a, 0)
+    outcome_dict = {WIN: self.p_heads, LOSS: 1 - self.p_heads} if WIN != LOSS else {WIN: 1.}
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_1.py b/solutions/ex09/mdp_warmup_TODO_1.py
new file mode 100644
index 0000000..b8ee7db
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_1.py
@@ -0,0 +1 @@
+    q_dict = {a: sum(p * (r + (gamma * v[sp] if not mdp.is_terminal(sp) else 0)) for (sp, r), p in mdp.Psr(s, a).items()) for a in mdp.A(s)}
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_2.py b/solutions/ex09/mdp_warmup_TODO_2.py
new file mode 100644
index 0000000..f605ec3
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_2.py
@@ -0,0 +1 @@
+    raise NotImplementedError("Insert your solution and remove this error.")
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_3.py b/solutions/ex09/mdp_warmup_TODO_3.py
new file mode 100644
index 0000000..c8f9a46
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_3.py
@@ -0,0 +1 @@
+    expected_reward = sum(r * p for (sp, r), p in mdp.Psr(s, a).items())
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_4.py b/solutions/ex09/mdp_warmup_TODO_4.py
new file mode 100644
index 0000000..bb8d281
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_4.py
@@ -0,0 +1 @@
+    V_s = sum(Q[s, a] * p for a, p in policy.items())
\ No newline at end of file
diff --git a/solutions/ex09/policy_evaluation_TODO_1.py b/solutions/ex09/policy_evaluation_TODO_1.py
new file mode 100644
index 0000000..290d5ab
--- /dev/null
+++ b/solutions/ex09/policy_evaluation_TODO_1.py
@@ -0,0 +1,2 @@
+    q = value_function2q_function(mdp, s, gamma, v)
+    v_, v[s] = v[s], sum(q[a] * pi_a for a, pi_a in pi[s].items())
\ No newline at end of file
diff --git a/solutions/ex09/policy_iteration_TODO_1.py b/solutions/ex09/policy_iteration_TODO_1.py
new file mode 100644
index 0000000..00c8a95
--- /dev/null
+++ b/solutions/ex09/policy_iteration_TODO_1.py
@@ -0,0 +1,6 @@
+    for s in [mdp.nonterminal_states[i] for i in np.random.permutation(len(mdp.nonterminal_states))]:
+        old_a = pi[s]  # The best action we would take under the current policy
+        Qs = value_function2q_function(mdp, s, gamma, V)
+        pi[s] = max(Qs, key=Qs.get)
+        if old_a != pi[s]:
+            policy_stable = False
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_TODO_1.py b/solutions/ex09/value_iteration_TODO_1.py
new file mode 100644
index 0000000..d07abe4
--- /dev/null
+++ b/solutions/ex09/value_iteration_TODO_1.py
@@ -0,0 +1,2 @@
+    v, V[s] = V[s], max(value_function2q_function(mdp, s, gamma, V).values()) if len(mdp.A(s)) > 0 else 0
+    Delta = max(Delta, np.abs(v - V[s]))
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_TODO_2.py b/solutions/ex09/value_iteration_TODO_2.py
new file mode 100644
index 0000000..89339fe
--- /dev/null
+++ b/solutions/ex09/value_iteration_TODO_2.py
@@ -0,0 +1,2 @@
+    Q = {a: v - (1e-8 * a if isinstance(a, int) else 0) for a, v in value_function2q_function(mdp, s, gamma, V).items()}
+    pi[s] = max(Q, key=Q.get)
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_agent_TODO_1.py b/solutions/ex09/value_iteration_agent_TODO_1.py
new file mode 100644
index 0000000..4909072
--- /dev/null
+++ b/solutions/ex09/value_iteration_agent_TODO_1.py
@@ -0,0 +1 @@
+    self.policy, self.v = value_iteration(mdp, gamma=gamma, **kwargs)
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_agent_TODO_2.py b/solutions/ex09/value_iteration_agent_TODO_2.py
new file mode 100644
index 0000000..5a41f14
--- /dev/null
+++ b/solutions/ex09/value_iteration_agent_TODO_2.py
@@ -0,0 +1 @@
+    action = self.policy[s]
\ No newline at end of file
--
GitLab
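
For context, here is a minimal, self-contained sketch (not part of the patch) of how the value-iteration snippets above fit together. Only value_function2q_function and the sweep body mirror the patch; the SmallMDP toy class, the theta convergence threshold, and the surrounding driver loop are illustrative assumptions standing in for the course toolbox, which supplies the MDP interface (A, Psr, is_terminal, nonterminal_states) and the real scaffolding.

# Hypothetical toy MDP and driver loop; only value_function2q_function and the
# Bellman backup line mirror the snippets in the patch above.
import numpy as np

class SmallMDP:
    # Two states: 0 (nonterminal) and 1 (terminal). The single action 'go'
    # reaches the terminal state with probability 0.8 (reward 1), otherwise
    # stays in state 0 (reward 0).
    nonterminal_states = [0]

    def A(self, s):
        return ['go']

    def is_terminal(self, s):
        return s == 1

    def Psr(self, s, a):
        # Returns {(next_state, reward): probability}, as in the patch.
        return {(1, 1.0): 0.8, (0, 0.0): 0.2}

def value_function2q_function(mdp, s, gamma, v):
    # Q(s,a) = sum over (s',r) of p(s',r|s,a) * (r + gamma * v[s']),
    # treating terminal successor states as having value 0
    # (mdp_warmup_TODO_1.py).
    return {a: sum(p * (r + (gamma * v[sp] if not mdp.is_terminal(sp) else 0))
                   for (sp, r), p in mdp.Psr(s, a).items())
            for a in mdp.A(s)}

def value_iteration(mdp, gamma=0.99, theta=1e-6):
    V = {s: 0.0 for s in mdp.nonterminal_states}
    while True:
        Delta = 0.0
        for s in mdp.nonterminal_states:
            # Bellman optimality backup, as in value_iteration_TODO_1.py.
            v, V[s] = V[s], max(value_function2q_function(mdp, s, gamma, V).values()) if len(mdp.A(s)) > 0 else 0
            Delta = max(Delta, np.abs(v - V[s]))
        if Delta < theta:
            break
    # Greedy policy extraction, as in value_iteration_TODO_2.py (without the
    # 1e-8 tie-breaking term, which only matters for integer actions).
    pi = {}
    for s in mdp.nonterminal_states:
        Q = value_function2q_function(mdp, s, gamma, V)
        pi[s] = max(Q, key=Q.get)
    return pi, V

if __name__ == "__main__":
    pi, V = value_iteration(SmallMDP(), gamma=0.9)
    print(pi, V)  # Expect V[0] = 0.8 / (1 - 0.9 * 0.2) ≈ 0.9756

The closed-form check in the last comment follows from the fixed point V(0) = 0.8 * 1 + 0.2 * gamma * V(0), which is what the backup converges to on this toy chain.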