From d9906643ae9ecd47ac59c66fed51806e568ff19f Mon Sep 17 00:00:00 2001
From: Tue Herlau <tuhe@dtu.dk>
Date: Thu, 3 Apr 2025 16:39:42 +0200
Subject: [PATCH] Solutions for week 9

---
 solutions/ex09/gambler_TODO_1.py               | 1 +
 solutions/ex09/gambler_TODO_2.py               | 1 +
 solutions/ex09/gambler_TODO_3.py               | 4 ++++
 solutions/ex09/mdp_warmup_TODO_1.py            | 1 +
 solutions/ex09/mdp_warmup_TODO_2.py            | 1 +
 solutions/ex09/mdp_warmup_TODO_3.py            | 1 +
 solutions/ex09/mdp_warmup_TODO_4.py            | 1 +
 solutions/ex09/policy_evaluation_TODO_1.py     | 2 ++
 solutions/ex09/policy_iteration_TODO_1.py      | 6 ++++++
 solutions/ex09/value_iteration_TODO_1.py       | 2 ++
 solutions/ex09/value_iteration_TODO_2.py       | 2 ++
 solutions/ex09/value_iteration_agent_TODO_1.py | 1 +
 solutions/ex09/value_iteration_agent_TODO_2.py | 1 +
 13 files changed, 24 insertions(+)
 create mode 100644 solutions/ex09/gambler_TODO_1.py
 create mode 100644 solutions/ex09/gambler_TODO_2.py
 create mode 100644 solutions/ex09/gambler_TODO_3.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_1.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_2.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_3.py
 create mode 100644 solutions/ex09/mdp_warmup_TODO_4.py
 create mode 100644 solutions/ex09/policy_evaluation_TODO_1.py
 create mode 100644 solutions/ex09/policy_iteration_TODO_1.py
 create mode 100644 solutions/ex09/value_iteration_TODO_1.py
 create mode 100644 solutions/ex09/value_iteration_TODO_2.py
 create mode 100644 solutions/ex09/value_iteration_agent_TODO_1.py
 create mode 100644 solutions/ex09/value_iteration_agent_TODO_2.py

diff --git a/solutions/ex09/gambler_TODO_1.py b/solutions/ex09/gambler_TODO_1.py
new file mode 100644
index 0000000..5edd917
--- /dev/null
+++ b/solutions/ex09/gambler_TODO_1.py
@@ -0,0 +1 @@
+    return state in [0, self.goal]
\ No newline at end of file
diff --git a/solutions/ex09/gambler_TODO_2.py b/solutions/ex09/gambler_TODO_2.py
new file mode 100644
index 0000000..63c4cf7
--- /dev/null
+++ b/solutions/ex09/gambler_TODO_2.py
@@ -0,0 +1 @@
+    return list(range(1, min(s, self.goal - s) + 1))
\ No newline at end of file
diff --git a/solutions/ex09/gambler_TODO_3.py b/solutions/ex09/gambler_TODO_3.py
new file mode 100644
index 0000000..b4e0a66
--- /dev/null
+++ b/solutions/ex09/gambler_TODO_3.py
@@ -0,0 +1,4 @@
+    r = 1 if s + a == 100 else 0
+    WIN = (s + a, r)
+    LOSS = (s - a, 0)
+    outcome_dict = {WIN: self.p_heads, LOSS: 1 - self.p_heads} if WIN != LOSS else {WIN: 1.}
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_1.py b/solutions/ex09/mdp_warmup_TODO_1.py
new file mode 100644
index 0000000..b8ee7db
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_1.py
@@ -0,0 +1 @@
+    q_dict = {a: sum(p * (r + (gamma * v[sp] if not mdp.is_terminal(sp) else 0)) for (sp, r), p in mdp.Psr(s, a).items()) for a in mdp.A(s)}
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_2.py b/solutions/ex09/mdp_warmup_TODO_2.py
new file mode 100644
index 0000000..f605ec3
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_2.py
@@ -0,0 +1 @@
+    raise NotImplementedError("Insert your solution and remove this error.")
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_3.py b/solutions/ex09/mdp_warmup_TODO_3.py
new file mode 100644
index 0000000..c8f9a46
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_3.py
@@ -0,0 +1 @@
+    expected_reward = sum(r * p for (sp, r), p in mdp.Psr(s, a).items())
\ No newline at end of file
diff --git a/solutions/ex09/mdp_warmup_TODO_4.py b/solutions/ex09/mdp_warmup_TODO_4.py
new file mode 100644
index 0000000..bb8d281
--- /dev/null
+++ b/solutions/ex09/mdp_warmup_TODO_4.py
@@ -0,0 +1 @@
+    V_s = sum(Q[s, a] * p for a, p in policy.items())
\ No newline at end of file
diff --git a/solutions/ex09/policy_evaluation_TODO_1.py b/solutions/ex09/policy_evaluation_TODO_1.py
new file mode 100644
index 0000000..290d5ab
--- /dev/null
+++ b/solutions/ex09/policy_evaluation_TODO_1.py
@@ -0,0 +1,2 @@
+    q = value_function2q_function(mdp, s, gamma, v)
+    v_, v[s] = v[s], sum(q[a] * pi_a for a, pi_a in pi[s].items())
\ No newline at end of file
diff --git a/solutions/ex09/policy_iteration_TODO_1.py b/solutions/ex09/policy_iteration_TODO_1.py
new file mode 100644
index 0000000..00c8a95
--- /dev/null
+++ b/solutions/ex09/policy_iteration_TODO_1.py
@@ -0,0 +1,6 @@
+    for s in [mdp.nonterminal_states[i] for i in np.random.permutation(len(mdp.nonterminal_states))]:
+        old_a = pi[s]  # The best action we would take under the current policy
+        Qs = value_function2q_function(mdp, s, gamma, V)
+        pi[s] = max(Qs, key=Qs.get)
+        if old_a != pi[s]:
+            policy_stable = False
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_TODO_1.py b/solutions/ex09/value_iteration_TODO_1.py
new file mode 100644
index 0000000..d07abe4
--- /dev/null
+++ b/solutions/ex09/value_iteration_TODO_1.py
@@ -0,0 +1,2 @@
+    v, V[s] = V[s], max(value_function2q_function(mdp, s, gamma, V).values()) if len(mdp.A(s)) > 0 else 0
+    Delta = max(Delta, np.abs(v - V[s]))
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_TODO_2.py b/solutions/ex09/value_iteration_TODO_2.py
new file mode 100644
index 0000000..89339fe
--- /dev/null
+++ b/solutions/ex09/value_iteration_TODO_2.py
@@ -0,0 +1,2 @@
+    Q = {a: v - (1e-8 * a if isinstance(a, int) else 0) for a, v in value_function2q_function(mdp, s, gamma, V).items()}
+    pi[s] = max(Q, key=Q.get)
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_agent_TODO_1.py b/solutions/ex09/value_iteration_agent_TODO_1.py
new file mode 100644
index 0000000..4909072
--- /dev/null
+++ b/solutions/ex09/value_iteration_agent_TODO_1.py
@@ -0,0 +1 @@
+    self.policy, self.v = value_iteration(mdp, gamma=gamma, **kwargs)
\ No newline at end of file
diff --git a/solutions/ex09/value_iteration_agent_TODO_2.py b/solutions/ex09/value_iteration_agent_TODO_2.py
new file mode 100644
index 0000000..5a41f14
--- /dev/null
+++ b/solutions/ex09/value_iteration_agent_TODO_2.py
@@ -0,0 +1 @@
+    action = self.policy[s]
\ No newline at end of file
--
GitLab
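
For context, here is a minimal, self-contained sketch (not part of the patch) of how the value-iteration snippets above fit together. Only value_function2q_function and the sweep body mirror the patch; the SmallMDP toy class, the theta convergence threshold, and the surrounding driver loop are illustrative assumptions standing in for the course toolbox, which supplies the MDP interface (A, Psr, is_terminal, nonterminal_states) and the real scaffolding.

# Hypothetical toy MDP and driver loop; only value_function2q_function and the
# Bellman backup line mirror the snippets in the patch above.
import numpy as np

class SmallMDP:
    # Two states: 0 (nonterminal) and 1 (terminal). The single action 'go'
    # reaches the terminal state with probability 0.8 (reward 1), otherwise
    # stays in state 0 (reward 0).
    nonterminal_states = [0]

    def A(self, s):
        return ['go']

    def is_terminal(self, s):
        return s == 1

    def Psr(self, s, a):
        # Returns {(next_state, reward): probability}, as in the patch.
        return {(1, 1.0): 0.8, (0, 0.0): 0.2}

def value_function2q_function(mdp, s, gamma, v):
    # Q(s,a) = sum over (s',r) of p(s',r|s,a) * (r + gamma * v[s']),
    # treating terminal successor states as having value 0
    # (mdp_warmup_TODO_1.py).
    return {a: sum(p * (r + (gamma * v[sp] if not mdp.is_terminal(sp) else 0))
                   for (sp, r), p in mdp.Psr(s, a).items())
            for a in mdp.A(s)}

def value_iteration(mdp, gamma=0.99, theta=1e-6):
    V = {s: 0.0 for s in mdp.nonterminal_states}
    while True:
        Delta = 0.0
        for s in mdp.nonterminal_states:
            # Bellman optimality backup, as in value_iteration_TODO_1.py.
            v, V[s] = V[s], max(value_function2q_function(mdp, s, gamma, V).values()) if len(mdp.A(s)) > 0 else 0
            Delta = max(Delta, np.abs(v - V[s]))
        if Delta < theta:
            break
    # Greedy policy extraction, as in value_iteration_TODO_2.py (without the
    # 1e-8 tie-breaking term, which only matters for integer actions).
    pi = {}
    for s in mdp.nonterminal_states:
        Q = value_function2q_function(mdp, s, gamma, V)
        pi[s] = max(Q, key=Q.get)
    return pi, V

if __name__ == "__main__":
    pi, V = value_iteration(SmallMDP(), gamma=0.9)
    print(pi, V)  # Expect V[0] = 0.8 / (1 - 0.9 * 0.2) ≈ 0.9756

The closed-form check in the last comment follows from the fixed point V(0) = 0.8 * 1 + 0.2 * gamma * V(0), which is what the backup converges to on this toy chain.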