Py Code Example 4.1: Gradient MC Evaluation


##################

# Example 4.1: 4 x 4 gridworld: Gradient Monte Carlo policy evaluation
# algorithm on p. 224

##################

# Find the state-action value function, the state-value function, and the greedy actions.


# equiprobable random policy (all actions equally likely)
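
# Added note (not part of the original listing): gradient Monte Carlo prediction
# nudges the approximator weights w toward the sampled return G_t,
#     w <- w + alpha * [G_t - q_hat(S_t, A_t, w)] * grad_w q_hat(S_t, A_t, w).
# Below, this update is performed implicitly: the Keras network is trained with a
# mean-squared-error loss on (row, col, action) inputs and return targets G_t,
# whose gradient is the same expression up to a constant factor and Adam's
# step-size adaptation.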

# Set random seed for reproducibility.

seed = 543
from numpy import random
random.seed(seed)
from tensorflow import set_random_seed  # TensorFlow 1.x API; TF 2 uses tf.random.set_seed
set_random_seed(seed)

# Import packages and functions.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt

# which(mask): indices where a boolean array is True (analogous to R's which()).
which = lambda status: np.arange(len(status))[status]

# parameter settings

nrow = 4
ncol = 4

discount = 1.0
eps = 1.0e-5

#in_place = False

in_place = True  # use most updated values, with each new value immediately overwriting the old one

# move_increment = [np.array([0, -1]), np.array([-1, 0]), np.array([0, 1]), np.array([1, 0])]

action_word = np.array(["left", "up", "right", "down"])

action = np.array([0, 1, 2, 3])
action_num = 4

action_prob = np.zeros((nrow, ncol, action_num))
action_prob[:,:,:] = 0.25

state_value = np.zeros((nrow, ncol))
state_value_old = np.zeros((nrow, ncol))

# greedy action expressed by one-hot encoding

greedy_action = np.zeros((nrow, ncol, action_num))

def is_terminal(state):
    status = False
    x = state[0]
    y = state[1]
    if (x == 0) and (y == 0):
        status = True
    if (x == nrow - 1) and (y == ncol - 1):
        status = True
    return status

def step(state, action):
    status = is_terminal(state)
    if status == True:
        next_state = state
        reward = 0
    else:
        if action == 0:  # move left
            next_state = state + np.array([0, -1])
        if action == 1:  # move up
            next_state = state + np.array([-1, 0])
        if action == 2:  # move right
            next_state = state + np.array([0, 1])
        if action == 3:  # move down
            next_state = state + np.array([1, 0])
        x = next_state[0]
        y = next_state[1]
        if (x < 0) or (x > nrow - 1):
            next_state = state
        if (y < 0) or (y > ncol - 1):
            next_state = state
        reward = -1
    return {"next_state": next_state, "reward": reward}
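
# Added sanity check (not part of the original listing): an off-grid move should
# leave the state unchanged and still cost a reward of -1.
check = step([1, 0], 0)   # try to move left from the left edge
assert (np.array(check["next_state"]) == np.array([1, 0])).all()
assert check["reward"] == -1
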
# Build the neural network model.

model = Sequential()
model.add(Dense(units = 20, input_dim = 3, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 10, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 10, kernel_initializer = 'uniform', activation = 'relu'))
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'linear'))

model.summary()

# Compile the model.

model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mae'])

# Construct the "must" training data: both terminal states, paired with every action, are anchored to a target value of 0.

x_must = [[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3],
          [3, 3, 0], [3, 3, 1], [3, 3, 2], [3, 3, 3]]
x_must = np.array(x_must)
y_must = np.zeros(8)

score = model.train_on_batch(x = x_must, y = y_must)

#print("score = ", score)

# main loop
#episode_num = 1000

episode_num = 10000  # for testing purposes only

num_steps = np.zeros(episode_num, dtype = int)

seed = 543
np.random.seed(seed) # Set random seed for reproducibility.

for episode in range(episode_num):
    #print("")
    #print("episode = ", episode)
    # some initialization
    state_trajectory = []
    state_action_trajectory = []
    # random initial state
    status = True
    while status:
        i = int(np.random.choice(range(nrow), size = 1))
        j = int(np.random.choice(range(ncol), size = 1))
        if is_terminal([i, j]):
            continue
        else:
            status = False
    state = [i, j]
    state_trajectory.append(state)
    reward_seq = []
    status = is_terminal(state)
    while status == False:
        # Choose an action.
        prob = action_prob[state[0], state[1], :]
        #print("prob = ", prob)
        move = int(np.random.choice([0, 1, 2, 3], size = 1, p = prob))
        state_action = [state[0], state[1], move]
        state_action_trajectory.append(state_action)
        step_obj = step(state, move)
        next_state = step_obj['next_state']
        reward = step_obj['reward']
        state_trajectory.append(next_state)
        reward_seq.append(reward)
        state = next_state
        status = is_terminal(state)
    #print("state_trajectory = ", state_trajectory)
    #print("state_action_trajectory = ", state_action_trajectory)
    #print("reward_seq = ", reward_seq)
    # Compute the returns backward along the episode and train on them.
    m = len(state_action_trajectory)
    #print("m = ", m)
    x_collect = []
    y_collect = []
    G = 0
    for i in range(m-1, -1, -1):
        #print("i = ", i)
        G = discount * G + reward_seq[i]
        #print("G = ", G)
        k0, k1, k2 = state_action_trajectory[i]
        x_collect = x_collect + [[k0, k1, k2]]
        y_collect = y_collect + [G]
    x_train = np.array(x_collect)
    y_train = np.array(y_collect)
    # Train the model on this single batch.
    score = model.train_on_batch(x = x_train, y = y_train)
    #print("score = ", score)

# Find the state-action values.

state_action_value = np.zeros((nrow, ncol, action_num))

for i in range(nrow):
    for j in range(ncol):
        for k in range(action_num):
            predictor = np.reshape([i, j, k], newshape = (1, 3))
            q_value = model.predict(predictor)[0]
            state_action_value[i, j, k] = q_value
            print("state-action = ", [i, j, k], "state-action value = ", q_value)
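
# Added cross-check (not part of the original listing): the same table can be
# filled with one batched predict() call; allow for float32 rounding differences.
grid = np.array([[i, j, k] for i in range(nrow)
                           for j in range(ncol)
                           for k in range(action_num)])
assert np.allclose(model.predict(grid).reshape(nrow, ncol, action_num),
                   state_action_value, atol = 1e-4)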

# Find the state values and greedy actions.

state_value = np.zeros((nrow, ncol))
greedy_action = np.zeros((nrow, ncol, action_num), dtype = int)

for i in range(nrow):
    for j in range(ncol):
        q_values = state_action_value[i, j, :]
        state_value[i, j] = np.max(q_values)
        index_max = np.argmax(q_values)
        greedy_action[i, j, index_max] = 1
        print("state = ", [i, j], "state value = ", state_value[i, j])

for i in range(nrow):
    for j in range(ncol):
        index_max = which(greedy_action[i, j, :] == 1)
        print("state = ", [i, j], " greedy action = ", action_word[index_max])
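
# Added, optional visualization (not part of the original listing), using the
# matplotlib import above: display the learned state values as a heatmap.
plt.imshow(state_value, cmap = 'viridis')
plt.colorbar(label = 'state value')
plt.title('Gradient MC state-value estimates')
plt.show()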

##################
