Test the policy in "Value Iteration" exercise #205

Open
wants to merge 1 commit into master
52 changes: 51 additions & 1 deletion DP/Value Iteration Solution.ipynb
@@ -173,7 +173,57 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Taken from Policy Evaluation Exercise!\n",
"\n",
"def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
" \"\"\"\n",
" Evaluate a policy given an environment and a full description of the environment's dynamics.\n",
" \n",
" Args:\n",
" policy: [S, A] shaped matrix representing the policy.\n",
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
" env.nS is a number of states in the environment. \n",
" env.nA is a number of actions in the environment.\n",
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
" discount_factor: Gamma discount factor.\n",
" \n",
" Returns:\n",
" Vector of length env.nS representing the value function.\n",
" \"\"\"\n",
" # Start with a random (all 0) value function\n",
" V = np.zeros(env.nS)\n",
" while True:\n",
" delta = 0\n",
" # For each state, perform a \"full backup\"\n",
" for s in range(env.nS):\n",
" v = 0\n",
" # Look at the possible next actions\n",
" for a, action_prob in enumerate(policy[s]):\n",
" # For each action, look at the possible next states...\n",
" for prob, next_state, reward, done in env.P[s][a]:\n",
" # Calculate the expected value. Ref: Sutton book eq. 4.6.\n",
" v += action_prob * prob * (reward + discount_factor * V[next_state])\n",
" # How much our value function changed (across any states)\n",
" delta = max(delta, np.abs(v - V[s]))\n",
" V[s] = v\n",
" # Stop evaluating once our value function change is below a threshold\n",
" if delta < theta:\n",
" break\n",
" return np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test the policy\n",
"eval_v = policy_eval(policy, env)\n",
"np.testing.assert_array_almost_equal(eval_v, expected_v, decimal=2)"
]
}
],
"metadata": {
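For reference: each added `policy_eval` cell performs, for every state, the expected full backup that its in-code comment attributes to Sutton's eq. 4.6. A sketch of that update in the docstring's notation (π for `policy`, γ for `discount_factor`; the subscript k only indexes sweeps and is not a name from the code):

```latex
% Expected full backup computed per state s by policy_eval;
% \pi(a \mid s) = policy[s][a], the (p, s', r) triples come from env.P[s][a],
% and \gamma = discount_factor.
V_{k+1}(s) = \sum_{a} \pi(a \mid s) \sum_{(p,\, s',\, r) \,\in\, \mathrm{env.P}[s][a]} p \left( r + \gamma V_k(s') \right)
```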
57 changes: 57 additions & 0 deletions DP/Value Iteration.ipynb
@@ -149,6 +149,63 @@
"expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n",
"np.testing.assert_array_almost_equal(v, expected_v, decimal=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Taken from Policy Evaluation Exercise!\n",
"\n",
"def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
" \"\"\"\n",
" Evaluate a policy given an environment and a full description of the environment's dynamics.\n",
" \n",
" Args:\n",
" policy: [S, A] shaped matrix representing the policy.\n",
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
" env.nS is a number of states in the environment. \n",
" env.nA is a number of actions in the environment.\n",
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
" discount_factor: Gamma discount factor.\n",
" \n",
" Returns:\n",
" Vector of length env.nS representing the value function.\n",
" \"\"\"\n",
" # Start with a random (all 0) value function\n",
" V = np.zeros(env.nS)\n",
" while True:\n",
" delta = 0\n",
" # For each state, perform a \"full backup\"\n",
" for s in range(env.nS):\n",
" v = 0\n",
" # Look at the possible next actions\n",
" for a, action_prob in enumerate(policy[s]):\n",
" # For each action, look at the possible next states...\n",
" for prob, next_state, reward, done in env.P[s][a]:\n",
" # Calculate the expected value. Ref: Sutton book eq. 4.6.\n",
" v += action_prob * prob * (reward + discount_factor * V[next_state])\n",
" # How much our value function changed (across any states)\n",
" delta = max(delta, np.abs(v - V[s]))\n",
" V[s] = v\n",
" # Stop evaluating once our value function change is below a threshold\n",
" if delta < theta:\n",
" break\n",
" return np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test the policy\n",
"eval_v = policy_eval(policy, env)\n",
"np.testing.assert_array_almost_equal(eval_v, expected_v, decimal=2)"
]
}
],
"metadata": {
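The added test cell depends on `policy`, `env`, and `expected_v` being defined by earlier cells in each notebook, so it only runs in that context. As a stand-alone illustration of what the test checks, here is a minimal sketch against an assumed two-state MDP; the `SimpleNamespace` environment, its transitions, and the `[-1.0, 0.0]` expected values are hypothetical and not part of the notebooks, while the `policy_eval` body is the one from the diff above:

```python
from types import SimpleNamespace

import numpy as np


def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    # Same iterative policy evaluation as in the cells added above,
    # repeated here so the sketch runs on its own.
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return np.array(V)


# Hypothetical two-state MDP shaped like the Gym-style environment the notebooks use:
# state 1 is terminal; the single action in state 0 moves to state 1 with reward -1.
env = SimpleNamespace(
    nS=2,
    nA=1,
    P={
        0: {0: [(1.0, 1, -1.0, True)]},
        1: {0: [(1.0, 1, 0.0, True)]},
    },
)
policy = np.ones([env.nS, env.nA]) / env.nA  # uniform over the (single) action

eval_v = policy_eval(policy, env)
np.testing.assert_array_almost_equal(eval_v, [-1.0, 0.0], decimal=2)
```

Any object exposing `nS`, `nA`, and `P` in the layout described by the docstring can be substituted for the notebooks' environment here.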