Professional Documents
Culture Documents
Ass1 Merged Merged
Ass1 Merged Merged
Ass1 Merged Merged
In [1]:
import gym
In [2]:
env = gym.make('MountainCar-v0',render_mode="rgb_array")
In [3]:
# Inspect what the agent observes and which actions it is allowed to take
obs_space, action_space = env.observation_space, env.action_space
print(
    "The observation space: {} ".format(obs_space),
    "The action space: {} ".format(action_space),
    sep="\n",
)
In [4]:
In [5]:
import matplotlib.pyplot as plt
In [6]:
# Grab the current frame (the env was created with render_mode="rgb_array"),
# release the rendering resources, then draw the captured frame.
# NOTE(review): recent Gym versions expect env.reset() to have been called
# before env.render() — confirm an earlier cell does this.
env_screen = env.render()
env.close()
plt.imshow(env_screen)
<matplotlib.image.AxesImage at 0x266f66029e0>
In [7]:
import time

# Roll out a random agent and draw every frame.
# NOTE(review): the step budget is a demo value — tune as needed.
num_steps = 200

# reset() returns (observation, info) in the Gym API used by this notebook
obs, info = env.reset()

for _ in range(num_steps):
    # Take a random action (swap in a real policy here)
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    # Wait a bit before the next frame unless you want to see a crazy fast video
    time.sleep(0.001)
    plt.imshow(env.render())

    # If the episode is up, then start another one
    if done:
        obs, info = env.reset()
import gym
In [2]:
env = gym.make('MountainCar-v0',render_mode="rgb_array")
In [3]:
# Inspect what the agent observes and which actions it is allowed to take
obs_space, action_space = env.observation_space, env.action_space
print(
    "The observation space: {} ".format(obs_space),
    "The action space: {} ".format(action_space),
    sep="\n",
)
In [4]:
In [5]:
import matplotlib.pyplot as plt
In [6]:
# Grab the current frame (the env was created with render_mode="rgb_array"),
# release the rendering resources, then draw the captured frame.
# NOTE(review): recent Gym versions expect env.reset() to have been called
# before env.render() — confirm an earlier cell does this.
env_screen = env.render()
env.close()
plt.imshow(env_screen)
<matplotlib.image.AxesImage at 0x266f66029e0>
In [7]:
import time

# Roll out a random agent and draw every frame.
# NOTE(review): the step budget is a demo value — tune as needed.
num_steps = 200

# reset() returns (observation, info) in the Gym API used by this notebook
obs, info = env.reset()

for _ in range(num_steps):
    # Take a random action (swap in a real policy here)
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    # Wait a bit before the next frame unless you want to see a crazy fast video
    time.sleep(0.001)
    plt.imshow(env.render())

    # If the episode is up, then start another one
    if done:
        obs, info = env.reset()
import gym
import random
import matplotlib.pyplot as plt
import numpy as np
In [3]:
In [4]:
# Q-table for the game: one row per state, one column per action, all zeros.
# Sizes are read from the environment instead of hard-coding np.zeros((16, 4)).
nb_states, nb_actions = (
    environment.observation_space.n,  # = 16
    environment.action_space.n,       # = 4
)
qtable = np.zeros(shape=(nb_states, nb_actions))
qtable
Out[4]:
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
In [5]:
# random.choice(["LEFT", "DOWN", "RIGHT", "UP"])
environment.action_space.sample()
Out[5]:
In [6]:
environment.step(2)
environment.render()
In [7]:
# 1. Randomly choose an action
action = environment.action_space.sample()
# 2. Implement this action and move the agent in the desired direction
# NOTE(review): step() is unpacked into five values here. In Gym's 5-tuple step
# API these are (observation, reward, terminated, truncated, info), so `info`
# would hold the truncation flag and `a` the info dict — confirm against the
# installed Gym version.
new_state, reward, done, info,a = environment.step(action)
# Display the results (reward and map)
environment.render()
print(f'Reward = {reward}')
Reward = 0.0
In [8]:
# Start from a fresh Q-table so that re-running this cell retrains from scratch
qtable = np.zeros((environment.observation_space.n, environment.action_space.n))

# Hyperparameters
episodes = 1000        # Total number of episodes
alpha = 0.5            # Learning rate
gamma = 0.9            # Discount factor

# Outcome ("Success" / "Failure") of every episode, plotted at the end of the cell
outcomes = []

print('Q-table before training:')
print(qtable)

# Training
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # An episode counts as a failure until a reward is actually collected
    outcomes.append("Failure")

    # Until the agent gets stuck in a hole or reaches the goal, keep training it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If there is no best action yet (only zeros), take a random one
        else:
            action = environment.action_space.sample()

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        # Stop on a real terminal state as well as on a time-limit cut-off
        done = terminated or truncated

        # Update Q(s,a)
        td_target = reward + gamma * np.max(qtable[new_state])
        qtable[state, action] += alpha * (td_target - qtable[state, action])

        # Move on to the state we just reached
        state = new_state

        # A non-zero reward means this episode succeeded
        if reward:
            outcomes[-1] = "Success"

print()
print('===========================================')
print('Q-table after training:')
print(qtable)

# Plot outcomes
plt.figure(figsize=(12, 5))
plt.xlabel("Run number")
plt.ylabel("Outcome")
ax = plt.gca()
ax.set_facecolor('#efeeea')
plt.bar(range(len(outcomes)), outcomes, color="#0A047A", width=1.0)
plt.show()
Q-table before training:
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]
===========================================
Q-table after training:
[[0. 0. 0.59049 0. ]
[0. 0. 0.6561 0. ]
[0. 0.729 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0.81 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0.2784375 0. ]
[0. 0.9 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 1. 0. ]
[0. 0. 0. 0. ]]
In [9]:
from IPython.display import clear_output
import time
# Replay one episode with the trained Q-table and record the actions taken.
state = environment.reset()[0]
done = False
sequence = []

while not done:
    # Choose the action with the highest value in the current state
    if np.max(qtable[state]) > 0:
        # int() keeps the recorded sequence as plain Python integers
        action = int(np.argmax(qtable[state]))
    # If there is no best action (only zeros), take a random one
    else:
        action = environment.action_space.sample()

    # Add the action to the sequence
    sequence.append(action)

    # Implement this action and move the agent in the desired direction
    new_state, reward, terminated, truncated, info = environment.step(action)
    done = terminated or truncated

    # Move on to the state we just reached
    state = new_state

print(f"Sequence = {sequence}")
Sequence = [2, 2, 1, 1, 1, 2]
In [10]:
episodes = 100
nb_success = 0

# Evaluation: run the learned Q-table without updating it and count the rewards
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # Until the agent gets stuck or reaches the goal, keep stepping it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If there is no best action (only zeros), take a random one
        else:
            action = environment.action_space.sample()

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Move on to the state we just reached
        state = new_state

        # Accumulate the collected reward as the success count
        nb_success += reward

# Share of evaluation episodes that collected a reward
print(f"Success rate = {nb_success / episodes * 100}%")
import gym
import numpy as np
import matplotlib.pyplot as plt
In [2]:
In [3]:
env.reset()
# env.render()
# env.step(2)
Out[3]:
In [4]:
# env.step(2)
In [5]:
# env.step(2)
In [6]:
print(env.observation_space)
env.action_space
Discrete(16)
Out[6]:
Discrete(4)
In [7]:
env.P[2][2]
Out[7]:
[(1.0, 3, 0.0, False)]
In [8]:
def value_iteration(env, gamma=0.9, no_of_iterations=100000, threshold=1e-20):
    """Compute the optimal state-value table of a tabular environment.

    Parameters
    ----------
    env : object exposing ``observation_space.n``, ``action_space.n`` and
        ``P[state][action]`` as a list of
        ``(transition_prob, next_state, reward, done)`` tuples.
    gamma : discount factor.
    no_of_iterations : upper bound on the number of sweeps over all states.
    threshold : stop once the summed absolute change of a sweep is at most this.

    Returns
    -------
    numpy.ndarray with one value per state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    value_table = np.zeros(n_states)

    for i in range(no_of_iterations):
        # Snapshot of the previous sweep: every state is backed up from it
        previous_value_table = np.copy(value_table)

        for state in range(n_states):
            Q_value = []
            for action in range(n_actions):
                # Expected one-step return of each possible transition
                next_states_rewards = [
                    trans_prob * (reward + gamma * previous_value_table[next_state])
                    for trans_prob, next_state, reward, _ in env.P[state][action]
                ]
                Q_value.append(np.sum(next_states_rewards))
            value_table[state] = max(Q_value)

        # Check for convergence, i.e. whether the difference between the previous
        # value table and the updated value table is very small. "Very small" is
        # decided by a threshold: if the difference is no larger than the
        # threshold, break the loop and return the value function as the optimal
        # value function.
        if np.sum(np.fabs(previous_value_table - value_table)) <= threshold:
            break

    return value_table
In [9]:
def extract_policy(value_table, gamma = 0.9, mdp=None):
    """Derive the greedy policy implied by a state-value table.

    Parameters
    ----------
    value_table : sequence indexed by state, holding each state's value.
    gamma : discount factor used in the one-step look-ahead.
    mdp : optional environment exposing ``observation_space.n``,
        ``action_space.n`` and ``P[state][action]`` as a list of
        ``(transition_prob, next_state, reward, done)`` tuples. When omitted,
        the notebook-level ``env`` is used.

    Returns
    -------
    numpy.ndarray of floats holding the chosen action index for every state.
    """
    if mdp is None:
        mdp = env

    n_states = mdp.observation_space.n
    n_actions = mdp.action_space.n
    policy = np.zeros(n_states)

    for state in range(n_states):
        # One-step look-ahead value of every action available in this state
        Q_table = np.zeros(n_actions)
        for action in range(n_actions):
            for trans_prob, next_state, reward, _ in mdp.P[state][action]:
                Q_table[action] += trans_prob * (reward + gamma * value_table[next_state])

        # select the action which has maximum Q value as an optimal action of the state
        policy[state] = np.argmax(Q_table)

    return policy
In [10]:
optimal_value_function = value_iteration(env=env,gamma=0.9)
optimal_value_function
In [11]:
optimal_policy = extract_policy(optimal_value_function, gamma=0.9)
optimal_policy
Out[11]:
Out[11]:
array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])
In [12]:
opt_pol = optimal_policy.reshape(4,4)
print('THE OPTIMAL POLICY IS \n ',opt_pol)
import gym
import pandas as pd
from collections import defaultdict
In [ ]:
env = gym.make('Blackjack-v1')
print(env.reset())
(18, 7, False)
In [ ]:
print(env.action_space)
Discrete(2)
In [ ]:
print(env.observation_space)
In [ ]:
def policy(state):
    """Fixed threshold policy keyed on the first element of the state.

    Returns 0 when ``state[0]`` is greater than 19, otherwise 1.
    NOTE(review): in Gym's Blackjack, ``state[0]`` is presumably the player's
    current sum and actions 0/1 mean stick/hit — confirm against the Gym docs.
    """
    if state[0] > 19:
        return 0
    return 1
In [ ]:
state = env.reset()
print(state)
(20, 4, False)
In [ ]:
print(policy(state))
In [ ]:
num_timesteps = 500
In [ ]:
print(play_episode(env,policy))
# Monte Carlo prediction: for every episode, add the return that follows each
# visited state to that state's running total and bump its visit counter.
# NOTE(review): total_return and N are presumably defaultdicts created in an
# earlier cell — confirm.
num_iterations = 100000
for i in range(num_iterations):
    episode = play_episode(env, policy)
    states, actions, rewards = zip(*episode)
    for t, state in enumerate(states):
        # Return from step t onwards; computed once and reused
        r = sum(rewards[t:])
        total_return[state] += r
        N[state] += 1
In [ ]:
# Turn the two per-state mappings into tables and join them on the state key.
# Note: this rebinds total_return and N to DataFrames.
total_return = pd.DataFrame(list(total_return.items()), columns=['state', 'total_return'])
N = pd.DataFrame(list(N.items()), columns=['state', 'N'])
df = total_return.merge(N, on='state')
df.head()
Out[ ]:
state total_return N
In [ ]:
df['value'] = df['total_return'] / df['N']
df.head()
Out[ ]:
In [ ]:
df[df['state']==(21,9,False)]['value'].values
Out[ ]:
array([0.94723618])
In [ ]:
df[df['state']==(5,8,False)]['value'].values
Out[ ]:
array([-0.63513514])
ASSIGNMENT - 6
In [11]:
import gym
import numpy as np
import random
In [12]:
In [13]:
# Tabular action-value function: every (state, action) pair starts at 0.0
Q = {
    (s, a): 0.0
    for s in range(env.observation_space.n)
    for a in range(env.action_space.n)
}
In [14]:
def epsilon_greedy (state, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Draws one uniform number in [0, 1]; if it is below ``epsilon`` a random
    action is sampled from ``env.action_space``, otherwise the action with the
    largest ``Q[(state, action)]`` is returned (first one wins on ties).
    """
    explore = random.uniform(0,1) < epsilon
    if explore:
        return env.action_space.sample()

    candidate_actions = range(env.action_space.n)
    return max(candidate_actions, key=lambda action: Q[(state, action)])
In [15]:
alpha=0.85
gamma= 0.90
epsilon = 0.8
In [16]:
num_episodes = 50000
num_timesteps= 1000
In [17]:
# Q-learning: act epsilon-greedily, bootstrap from the best next-state value.
for i in range(num_episodes):
    s = env.reset()[0]
    for t in range(num_timesteps):
        a = epsilon_greedy(s, epsilon)

        # step() returns (observation, reward, terminated, truncated, info)
        s_, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated

        # Greedy value of the next state (off-policy target)
        best_next = max(Q[(s_, next_a)] for next_a in range(env.action_space.n))
        Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])

        s = s_
        # End the episode on a terminal state and on a time-limit cut-off alike
        if done:
            break
In [18]:
Q
Out[18]:
{(0, 0): 0.23477961696373423,
(0, 1): 0.22480181183703787,
(0, 2): 0.23961716957752016,
(0, 3): 0.24066398243905854,
(1, 0): 0.2204815896999076,
(1, 1): 0.04017125915710931,
(1, 2): 0.2822227428738474,
(1, 3): 0.22490808477046206,
(2, 0): 0.29961284509447655,
(2, 0): 0.29961284509447655,
(2, 1): 0.32990657866523887,
(2, 2): 0.37292229711147334,
(2, 3): 0.2790710024900863,
(3, 0): 0.25000597793284357,
(3, 1): 0.2575759230383145,
(3, 2): 0.037377204692152305,
(3, 3): 0.32551898596551954,
(4, 0): 0.3804023551933965,
(4, 1): 0.00856676265665978,
(4, 2): 0.5076563484150082,
(4, 3): 0.050394379122136346,
(5, 0): 0.0,
(5, 1): 0.0,
(5, 2): 0.0,
(5, 3): 0.0,
(6, 0): 0.4285911909119954,
(6, 1): 0.0002831967627810342,
(6, 2): 0.692932809233417,
(6, 3): 0.006210297861473632,
(7, 0): 0.0,
(7, 1): 0.0,
(7, 2): 0.0,
(7, 3): 0.0,
(8, 0): 0.4724947380728043,
(8, 1): 0.5292926568861616,
(8, 2): 0.0854144618498413,
(8, 3): 0.4739574281045383,
(9, 0): 0.07482359865928406,
(9, 1): 0.7499983936496128,
(9, 2): 0.5420577123103719,
(9, 3): 0.07627448541464799,
(10, 0): 0.5934266484891275,
(10, 1): 0.8240260740178592,
(10, 2): 0.774672222464751,
(10, 3): 0.09258352148159432,
(11, 0): 0.0,
(11, 1): 0.0,
(11, 2): 0.0,
(11, 3): 0.0,
(12, 0): 0.0,
(12, 1): 0.0,
(12, 2): 0.0,
(12, 3): 0.0,
(13, 0): 0.5646633544813056,
(13, 1): 0.11021736826449313,
(13, 2): 0.6234923802366498,
(13, 3): 0.7062633423537164,
(14, 0): 0.807111703090516,
(14, 1): 0.6835467756183892,
(14, 2): 0.9109203529502328,
(14, 3): 0.7967422343382821,
(15, 0): 0.0,
(15, 1): 0.0,
(15, 2): 0.0,
(15, 3): 0.0}