Py Code Example 4.1: Gradient MC Evaluation


##################

# Example 4.1: 4 x 4 gridworld: Gradient Monte Carlo policy evaluation
# algorithm on p. 224

##################

# Find the state-action value function, the state-value function, and the greedy actions.


# equiprobable random policy (all actions equally likely)
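
# Added note (not part of the original listing): gradient Monte Carlo prediction
# nudges the approximator weights w toward the sampled return G_t,
#     w <- w + alpha * [G_t - q_hat(S_t, A_t, w)] * grad_w q_hat(S_t, A_t, w).
# Below, this update is performed implicitly: the Keras network is trained with a
# mean-squared-error loss on (row, col, action) inputs and return targets G_t,
# whose gradient is the same expression up to a constant factor and Adam's
# step-size adaptation.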

# Set random seed for reproducibility.

seed = 543
from numpy import random
random.seed(seed)
from tensorflow import set_random_seed  # TensorFlow 1.x API; TF 2 uses tf.random.set_seed
set_random_seed(seed)

# Import packages and functions.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt

# which(mask): indices where a boolean array is True (analogous to R's which()).
which = lambda status: np.arange(len(status))[status]

# parameter settings

nrow = 4
ncol = 4

discount = 1.0
eps = 1.0e-5

#in_place = False

in_place = True  # use most updated values, with each new value immediately overwriting the old one

# move_increment = [np.array([0, -1]), np.array([-1, 0]), np.array([0, 1]), np.array([1, 0])]

action_word = np.array(["left", "up", "right", "down"])

action = np.array([0, 1, 2, 3])
action_num = 4

action_prob = np.zeros((nrow, ncol, action_num))
action_prob[:,:,:] = 0.25

state_value = np.zeros((nrow, ncol))
state_value_old = np.zeros((nrow, ncol))

# greedy action expressed by one-hot encoding

greedy_action = np.zeros((nrow, ncol, action_num))

def is_terminal(state):
    status = False
    x = state[0]
    y = state[1]
    if (x == 0) and (y == 0):
        status = True
    if (x == nrow - 1) and (y == ncol - 1):
        status = True
    return status

def step(state, action):
    status = is_terminal(state)
    if status == True:
        next_state = state
        reward = 0
    else:
        if action == 0:  # move left
            next_state = state + np.array([0, -1])
        if action == 1:  # move up
            next_state = state + np.array([-1, 0])
        if action == 2:  # move right
            next_state = state + np.array([0, 1])
        if action == 3:  # move down
            next_state = state + np.array([1, 0])
        x = next_state[0]
        y = next_state[1]
        if (x < 0) or (x > nrow - 1):
            next_state = state
        if (y < 0) or (y > ncol - 1):
            next_state = state
        reward = -1
    return {"next_state": next_state, "reward": reward}
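
# Added sanity check (not part of the original listing): an off-grid move should
# leave the state unchanged and still cost a reward of -1.
check = step([1, 0], 0)   # try to move left from the left edge
assert (np.array(check["next_state"]) == np.array([1, 0])).all()
assert check["reward"] == -1
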
# Build the neural network model.

model = Sequential()
model.add(Dense(units = 20, input_dim = 3, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 10, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 10, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'linear'))

model.summary()

# Compile the model.

model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mae'])

# Construct the "must" training data: both terminal states, paired with every action, are anchored to a target value of 0.

x_must = [[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
          [3, 3, 0], [3, 3, 1], [3, 3, 2], [3, 3, 3]]
x_must = np.array(x_must)
y_must = np.zeros(8)

score = model.train_on_batch(x = x_must, y = y_must)

#print("score = ", score)

# main loop
#episode_num = 1000

episode_num = 10000  # for testing purposes only

num_steps = np.zeros(episode_num, dtype = int)

seed = 543
np.random.seed(seed) # Set random seed for reproducibility.

for episode in range(episode_num):
    #print("")
    #print("episode = ", episode)
    # some initialization
    state_trajectory = []
    state_action_trajectory = []
    # random initial state
    status = True
    while status:
        i = int(np.random.choice(range(nrow), size = 1))
        j = int(np.random.choice(range(ncol), size = 1))
        if is_terminal([i, j]):
            continue
        else:
            status = False
    state = [i, j]
    state_trajectory.append(state)
    reward_seq = []
    status = is_terminal(state)
    while status == False:
        # Choose an action.
        prob = action_prob[state[0], state[1], :]
        #print("prob = ", prob)
        move = int(np.random.choice([0, 1, 2, 3], size = 1, p = prob))
        state_action = [state[0], state[1], move]
        state_action_trajectory.append(state_action)
        step_obj = step(state, move)
        next_state = step_obj['next_state']
        reward = step_obj['reward']
        state_trajectory.append(next_state)
        reward_seq.append(reward)
        state = next_state
        status = is_terminal(state)
    #print("state_trajectory = ", state_trajectory)
    #print("state_action_trajectory = ", state_action_trajectory)
    #print("reward_seq = ", reward_seq)
    # Compute the returns backward along the episode and train on them.
    m = len(state_action_trajectory)
    #print("m = ", m)
    x_collect = []
    y_collect = []
    G = 0
    for i in range(m-1, -1, -1):
        #print("i = ", i)
        G = discount * G + reward_seq[i]
        #print("G = ", G)
        k0, k1, k2 = state_action_trajectory[i]
        x_collect = x_collect + [[k0, k1, k2]]
        y_collect = y_collect + [G]
    x_train = np.array(x_collect)
    y_train = np.array(y_collect)
    # Train the model on this single batch.
    score = model.train_on_batch(x = x_train, y = y_train)
    #print("score = ", score)

# Find the state-action values.

state_action_value = np.zeros((nrow, ncol, action_num))

for i in range(nrow):
    for j in range(ncol):
        for k in range(action_num):
            predictor = np.reshape([i, j, k], newshape = (1, 3))
            q_value = model.predict(predictor)[0]
            state_action_value[i, j, k] = q_value
            print("state-action = ", [i, j, k], "state-action value = ", q_value)
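
# Added cross-check (not part of the original listing): the same table can be
# filled with one batched predict() call; allow for float32 rounding differences.
grid = np.array([[i, j, k] for i in range(nrow)
                           for j in range(ncol)
                           for k in range(action_num)])
assert np.allclose(model.predict(grid).reshape(nrow, ncol, action_num),
                   state_action_value, atol = 1e-4)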

# Find the state values and greedy actions.

state_value = np.zeros((nrow, ncol))
greedy_action = np.zeros((nrow, ncol, action_num), dtype = int)

for i in range(nrow):
    for j in range(ncol):
        q_values = state_action_value[i, j, :]
        state_value[i, j] = np.max(q_values)
        index_max = np.argmax(q_values)
        greedy_action[i, j, index_max] = 1
        print("state = ", [i, j], "state value = ", state_value[i, j])

for i in range(nrow):
    for j in range(ncol):
        index_max = which(greedy_action[i, j, :] == 1)
        print("state = ", [i, j], " greedy action = ", action_word[index_max])
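
# Added, optional visualization (not part of the original listing), using the
# matplotlib import above: display the learned state values as a heatmap.
plt.imshow(state_value, cmap = 'viridis')
plt.colorbar(label = 'state value')
plt.title('Gradient MC state-value estimates')
plt.show()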

##################
