Test the policy in "Value Iteration" exercise #205

Open
wants to merge 1 commit into master
52 changes: 51 additions & 1 deletion DP/Value Iteration Solution.ipynb
@@ -173,7 +173,57 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Taken from Policy Evaluation Exercise!\n",
"\n",
"def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
" \"\"\"\n",
" Evaluate a policy given an environment and a full description of the environment's dynamics.\n",
" \n",
" Args:\n",
" policy: [S, A] shaped matrix representing the policy.\n",
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
" env.nS is a number of states in the environment. \n",
" env.nA is a number of actions in the environment.\n",
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
" discount_factor: Gamma discount factor.\n",
" \n",
" Returns:\n",
" Vector of length env.nS representing the value function.\n",
" \"\"\"\n",
" # Start with a random (all 0) value function\n",
" V = np.zeros(env.nS)\n",
" while True:\n",
" delta = 0\n",
" # For each state, perform a \"full backup\"\n",
" for s in range(env.nS):\n",
" v = 0\n",
" # Look at the possible next actions\n",
" for a, action_prob in enumerate(policy[s]):\n",
" # For each action, look at the possible next states...\n",
" for prob, next_state, reward, done in env.P[s][a]:\n",
" # Calculate the expected value. Ref: Sutton book eq. 4.6.\n",
" v += action_prob * prob * (reward + discount_factor * V[next_state])\n",
" # How much our value function changed (across any states)\n",
" delta = max(delta, np.abs(v - V[s]))\n",
" V[s] = v\n",
" # Stop evaluating once our value function change is below a threshold\n",
" if delta < theta:\n",
" break\n",
" return np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test the policy\n",
"eval_v = policy_eval(policy, env)\n",
"np.testing.assert_array_almost_equal(eval_v, expected_v, decimal=2)"
]
}
],
"metadata": {
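For reference: each added `policy_eval` cell performs, for every state, the expected full backup that its in-code comment attributes to Sutton's eq. 4.6. A sketch of that update in the docstring's notation (π for `policy`, γ for `discount_factor`; the subscript k only indexes sweeps and is not a name from the code):

```latex
% Expected full backup computed per state s by policy_eval;
% \pi(a \mid s) = policy[s][a], the (p, s', r) triples come from env.P[s][a],
% and \gamma = discount_factor.
V_{k+1}(s) = \sum_{a} \pi(a \mid s) \sum_{(p,\, s',\, r) \,\in\, \mathrm{env.P}[s][a]} p \left( r + \gamma V_k(s') \right)
```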
57 changes: 57 additions & 0 deletions DP/Value Iteration.ipynb
@@ -149,6 +149,63 @@
"expected_v = np.array([ 0, -1, -2, -3, -1, -2, -3, -2, -2, -3, -2, -1, -3, -2, -1, 0])\n",
"np.testing.assert_array_almost_equal(v, expected_v, decimal=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Taken from Policy Evaluation Exercise!\n",
"\n",
"def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):\n",
" \"\"\"\n",
" Evaluate a policy given an environment and a full description of the environment's dynamics.\n",
" \n",
" Args:\n",
" policy: [S, A] shaped matrix representing the policy.\n",
" env: OpenAI env. env.P represents the transition probabilities of the environment.\n",
" env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).\n",
" env.nS is a number of states in the environment. \n",
" env.nA is a number of actions in the environment.\n",
" theta: We stop evaluation once our value function change is less than theta for all states.\n",
" discount_factor: Gamma discount factor.\n",
" \n",
" Returns:\n",
" Vector of length env.nS representing the value function.\n",
" \"\"\"\n",
" # Start with a random (all 0) value function\n",
" V = np.zeros(env.nS)\n",
" while True:\n",
" delta = 0\n",
" # For each state, perform a \"full backup\"\n",
" for s in range(env.nS):\n",
" v = 0\n",
" # Look at the possible next actions\n",
" for a, action_prob in enumerate(policy[s]):\n",
" # For each action, look at the possible next states...\n",
" for prob, next_state, reward, done in env.P[s][a]:\n",
" # Calculate the expected value. Ref: Sutton book eq. 4.6.\n",
" v += action_prob * prob * (reward + discount_factor * V[next_state])\n",
" # How much our value function changed (across any states)\n",
" delta = max(delta, np.abs(v - V[s]))\n",
" V[s] = v\n",
" # Stop evaluating once our value function change is below a threshold\n",
" if delta < theta:\n",
" break\n",
" return np.array(V)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test the policy\n",
"eval_v = policy_eval(policy, env)\n",
"np.testing.assert_array_almost_equal(eval_v, expected_v, decimal=2)"
]
}
],
"metadata": {
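The added test cell depends on `policy`, `env`, and `expected_v` being defined by earlier cells in each notebook, so it only runs in that context. As a stand-alone illustration of what the test checks, here is a minimal sketch against an assumed two-state MDP; the `SimpleNamespace` environment, its transitions, and the `[-1.0, 0.0]` expected values are hypothetical and not part of the notebooks, while the `policy_eval` body is the one from the diff above:

```python
from types import SimpleNamespace

import numpy as np


def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    # Same iterative policy evaluation as in the cells added above,
    # repeated here so the sketch runs on its own.
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy[s]):
                for prob, next_state, reward, done in env.P[s][a]:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return np.array(V)


# Hypothetical two-state MDP shaped like the Gym-style environment the notebooks use:
# state 1 is terminal; the single action in state 0 moves to state 1 with reward -1.
env = SimpleNamespace(
    nS=2,
    nA=1,
    P={
        0: {0: [(1.0, 1, -1.0, True)]},
        1: {0: [(1.0, 1, 0.0, True)]},
    },
)
policy = np.ones([env.nS, env.nA]) / env.nA  # uniform over the (single) action

eval_v = policy_eval(policy, env)
np.testing.assert_array_almost_equal(eval_v, [-1.0, 0.0], decimal=2)
```

Any object exposing `nS`, `nA`, and `P` in the layout described by the docstring can be substituted for the notebooks' environment here.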