ASSIGNMENT - 1

In [1]:

import gym

In [2]:
env = gym.make('MountainCar-v0',render_mode="rgb_array")

In [3]:
# Observation and action space
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: {} ".format(obs_space))
print("The action space: {} ".format(action_space))

The observation space: Box([-1.2 -0.07], [0.6 0.07], (2,), float32)


The action space: Discrete(3)
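
For reference, the two entries of the Box observation are the car's position and velocity, and the three discrete actions are push left (0), no push (1), and push right (2). A minimal check of the bounds, using the obs_space variable defined above, might look like this (a sketch, not part of the assignment):

# Inspect the Box bounds; low/high are numpy arrays of [position, velocity]
print("low :", obs_space.low)    # expected: [-1.2  -0.07]
print("high:", obs_space.high)   # expected: [ 0.6   0.07]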

In [4]:

# pip install gym[classic_control]

In [5]:
import matplotlib.pyplot as plt

# reset the environment and see the initial observation
obs = env.reset()
print("The initial observation is {} ".format(obs))

# Sample a random action from the entire action space
random_action = env.action_space.sample()

# Take the action and get the new observation
new_obs, reward, terminated, truncated, info = env.step(random_action)
print("The new observation is {} ".format(new_obs))

The initial observation is (array([-0.5884122, 0. ], dtype=float32), {})


The new observation is [-5.8892918e-01 -5.1695626e-04]

In [6]:
env_screen = env.render()
env.close()

import matplotlib.pyplot as plt


plt.imshow(env_screen)
Out[6]:

<matplotlib.image.AxesImage at 0x266f66029e0>
In [7]:

import time

# Number of steps you run the agent for
num_steps = 150

obs = env.reset()

for step in range(num_steps):
    # take random action, but you can also do something more intelligent
    # action = my_intelligent_agent_fn(obs)
    action = env.action_space.sample()

    # apply the action
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    # Render the env
    env.render()

    # Wait a bit before the next frame unless you want to see a crazy fast video
    time.sleep(0.001)
    plt.imshow(env.render())

    # If the episode is up, then start another one
    if done:
        env.reset()

# Close the env
env.close()
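
The commented-out my_intelligent_agent_fn(obs) in the loop above is only a placeholder. As an illustration (a hypothetical helper, not part of the assignment), a simple hand-coded MountainCar policy could push in the direction the car is already moving:

# Hypothetical replacement for my_intelligent_agent_fn: accelerate in the
# direction of the current velocity. Expects the 2-element observation array
# (e.g. the obs returned by env.step, or env.reset()[0]).
# Actions in MountainCar-v0: 0 = push left, 1 = no push, 2 = push right.
def my_intelligent_agent_fn(obs):
    position, velocity = obs
    return 2 if velocity >= 0 else 0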
ASSIGNMENT - 2
In [1]:

import gym
import random
import matplotlib.pyplot as plt
import numpy as np

In [3]:

# MAKING A GYM ENVIRONMENT FOR THE GAME


environment = gym.make("FrozenLake-v1", is_slippery=False)
environment.reset()
environment.render()

In [4]:
# Q TABLE FOR THE GAME
#1
# qtable = np.zeros((16, 4))
#2
nb_states = environment.observation_space.n # = 16
nb_actions = environment.action_space.n # = 4
qtable = np.zeros((nb_states, nb_actions))
qtable

Out[4]:
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [5]:
# random.choice(["LEFT", "DOWN", "RIGHT", "UP"])
environment.action_space.sample()
Out[5]:

In [6]:
environment.step(2)
environment.render()

In [7]:

action = environment.action_space.sample()

# 2. Implement this action and move the agent in the desired direction
new_state, reward, terminated, truncated, info = environment.step(action)
# Display the results (reward and map)
environment.render()
print(f'Reward = {reward}')

Reward = 0.0

In [8]:

import matplotlib.pyplot as plt


plt.rcParams['figure.dpi'] = 300
plt.rcParams.update({'font.size': 17})

# We re-initialize the Q-table


qtable = np.zeros((environment.observation_space.n, environment.action_space.n))

# Hyperparameters
episodes = 1000 # Total number of episodes
alpha = 0.5 # Learning rate
gamma = 0.9 # Discount factor

# List of outcomes to plot


outcomes = []

print('Q-table before training:')


print(qtable)

# Training
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # By default, we consider our outcome to be a failure
    outcomes.append("Failure")

    # Until the agent gets stuck in a hole or reaches the goal, keep training it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If there's no best action (only zeros), take a random one
        else:
            action = environment.action_space.sample()

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Update Q(s,a)
        qtable[state, action] = qtable[state, action] + \
            alpha * (reward + gamma * np.max(qtable[new_state]) - qtable[state, action])

        # Update our current state
        state = new_state

        # If we have a reward, it means that our outcome is a success
        if reward:
            outcomes[-1] = "Success"

print()
print('===========================================')
print('Q-table after training:')
print(qtable)

# Plot outcomes
plt.figure(figsize=(12, 5))
plt.xlabel("Run number")
plt.ylabel("Outcome")
ax = plt.gca()
ax.set_facecolor('#efeeea')
plt.bar(range(len(outcomes)), outcomes, color="#0A047A", width=1.0)
plt.show()
Q-table before training:
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]

===========================================
Q-table after training:
[[0. 0. 0.59049 0. ]
[0. 0. 0.6561 0. ]
[0. 0.729 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0.81 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0.2784375 0. ]
[0. 0.9 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 1. 0. ]
[0. 0. 0. 0. ]]
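
Note that the training loop above acts greedily whenever a state has any non-zero Q value and only explores on all-zero rows. A common refinement (not used in the assignment) is epsilon-greedy exploration with a decaying epsilon; a minimal sketch, with hypothetical names epsilon and epsilon_decay:

# Sketch only: epsilon-greedy action selection with a decaying exploration rate
epsilon = 1.0
epsilon_decay = 0.001

def choose_action(state):
    if np.random.random() < epsilon:
        return environment.action_space.sample()   # explore
    return np.argmax(qtable[state])                # exploit

# Inside the training loop one would call choose_action(state) and decay epsilon
# once per episode, e.g. epsilon = max(0.05, epsilon - epsilon_decay).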

In [9]:
from IPython.display import clear_output
import time

state = environment.reset()[0]
done = False
sequence = []

while not done:
    # Choose the action with the highest value in the current state
    if np.max(qtable[state]) > 0:
        action = np.argmax(qtable[state])
    # If there's no best action (only zeros), take a random one
    else:
        action = environment.action_space.sample()

    # Add the action to the sequence
    sequence.append(action)

    # Implement this action and move the agent in the desired direction
    new_state, reward, terminated, truncated, info = environment.step(action)
    done = terminated or truncated

    # Update our current state
    state = new_state

    # Update the render
    clear_output(wait=True)
    environment.render()
    time.sleep(1)

print(f"Sequence = {sequence}")

Sequence = [2, 2, 1, 1, 1, 2]
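
The integers in the sequence follow FrozenLake's action encoding (0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP, matching the commented list in cell In [5] above). A one-liner to make the route readable:

# Translate action indices into direction names
directions = ["LEFT", "DOWN", "RIGHT", "UP"]
print([directions[a] for a in sequence])
# should print ['RIGHT', 'RIGHT', 'DOWN', 'DOWN', 'DOWN', 'RIGHT'] for the sequence above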

In [10]:
episodes = 100
nb_success = 0

# Evaluation
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # Until the agent gets stuck or reaches the goal, keep evaluating it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If there's no best action (only zeros), take a random one
        else:
            action = environment.action_space.sample()

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Update our current state
        state = new_state

        # When we get a reward, it means we solved the game
        nb_success += reward

# Let's check our success rate!
print(f"Success rate = {nb_success/episodes*100}%")

Success rate = 100.0%


ASSIGNMENT - 3
In [1]:

import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:

env = gym.make("FrozenLake-v1", render_mode="human" ,is_slippery=False)

In [3]:

env.reset()
# env.render()
# env.step(2)
Out[3]:

(0, {'prob': 1})

In [4]:
# env.step(2)

In [5]:

# env.step(2)

In [6]:
print(env.observation_space)
env.action_space

Discrete(16)
Out[6]:

Discrete(4)

In [7]:
env.P[2][2]
Out[7]:
[(1.0, 3, 0.0, False)]
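
Each tuple in env.P[state][action] is (transition probability, next state, reward, done); because is_slippery=False, every action has exactly one deterministic transition. A quick way to inspect all four actions from state 2 (a sketch using the same env as above):

# Print every transition available from state 2
for a in range(env.action_space.n):
    print(a, env.P[2][a])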

In [8]:

def value_iteration(env, gamma=0.9):

    # initialize value table with zeros
    value_table = np.zeros(env.observation_space.n)

    # set number of iterations and threshold
    no_of_iterations = 100000
    threshold = 1e-20

    for i in range(no_of_iterations):

        # On each iteration, copy the value table to the updated_value_table
        updated_value_table = np.copy(value_table)

        # Now we calculate the Q value for each action in the state
        # and update the value of a state with the maximum Q value
        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward_prob, _ = next_sr
                    next_states_rewards.append(
                        trans_prob * (reward_prob + gamma * updated_value_table[next_state]))

                Q_value.append(np.sum(next_states_rewards))

            value_table[state] = max(Q_value)

        # We check whether we have reached convergence, i.e. whether the difference
        # between our value table and the updated value table is very small. How do
        # we know it is very small? We set a threshold: if the difference is less
        # than the threshold, we break the loop and return the value function as the
        # optimal value function.
        if np.sum(np.fabs(updated_value_table - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d .' % (i + 1))
            break

    return value_table

In [9]:
def extract_policy(value_table, gamma=0.9):

    # initialize the policy with zeros
    policy = np.zeros(env.observation_space.n)

    for state in range(env.observation_space.n):

        # initialize the Q table for a state
        Q_table = np.zeros(env.action_space.n)

        # compute the Q value for all actions in the state
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward_prob, _ = next_sr
                Q_table[action] += trans_prob * (reward_prob + gamma * value_table[next_state])

        # select the action which has the maximum Q value as the optimal action of the state
        policy[state] = np.argmax(Q_table)

    return policy

In [10]:
optimal_value_function = value_iteration(env=env,gamma=0.9)
optimal_value_function

Value-iteration converged at iteration# 7.


Out[10]:
array([0.59049, 0.6561 , 0.729 , 0.6561 , 0.6561 , 0. , 0.81 ,
0. , 0.729 , 0.81 , 0.9 , 0. , 0. , 0.9 ,
1. , 0. ])

In [11]:
optimal_policy = extract_policy(optimal_value_function, gamma=0.9)
optimal_policy
Out[11]:

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])

In [12]:
opt_pol = optimal_policy.reshape(4,4)
print('THE OPTIMAL POLICY IS \n ',opt_pol)

THE OPTIMAL POLICY IS


[[1. 2. 1. 0.]
[1. 0. 1. 0.]
[2. 1. 1. 0.]
[0. 2. 2. 0.]]
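
As a sanity check (a sketch, not part of the assignment, assuming the five-value step API used in the earlier assignments), the extracted policy can be rolled out once to confirm it reaches the goal:

# Roll out optimal_policy on the same env and report the episode return
state = env.reset()[0]
done = False
total_reward = 0.0
while not done:
    action = int(optimal_policy[state])
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    total_reward += reward
print("Return from one rollout:", total_reward)   # expected 1.0 on this deterministic 4x4 map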
ASSIGNMENT - 4
In [1]:

import gym
import pandas as pd
from collections import defaultdict

In [ ]:

env = gym.make('Blackjack-v1')
print(env.reset())

(18, 7, False)

In [ ]:

print(env.action_space)

Discrete(2)

In [ ]:
print(env.observation_space)

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [ ]:
def policy(state):
    return 0 if state[0] > 19 else 1

In [ ]:
state = env.reset()
print(state)

(20, 4, False)

In [ ]:

print(policy(state))

In [ ]:

num_timesteps = 500

def play_episode(env, policy):
    state = env.reset()
    episode = []
    for t in range(num_timesteps):
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        if done:
            break
        state = next_state
    return episode

In [ ]:
print(play_episode(env,policy))

[((12, 10, False), 1, 0.0), ((19, 10, False), 1, -1.0)]


In [ ]:
total_return = defaultdict(float)
N = defaultdict(int)

num_iterations = 100000

for i in range(num_iterations):
    episode = play_episode(env, policy)
    states, actions, rewards = zip(*episode)
    for t, state in enumerate(states):
        R = sum(rewards[t:])          # return from time step t onwards (every-visit MC)
        total_return[state] += R
        N[state] += 1

In [ ]:
total_return = pd.DataFrame(total_return.items(), columns=['state', 'total_return'])
N = pd.DataFrame(N.items(), columns=['state', 'N'])
df = pd.merge(total_return, N, on='state')
df.head()
Out[ ]:

             state  total_return     N
0  (13, 10, False)       -2317.0  3807
1  (20, 10, False)        2628.0  6001
2    (14, 1, True)         -26.0    85
3   (13, 1, False)        -614.0   898
4  (12, 10, False)       -1996.0  3597

In [ ]:
df['value'] = df['total_return'] / df['N']
df.head()
Out[ ]:

             state  total_return     N     value
0  (13, 10, False)       -2317.0  3807 -0.608616
1  (20, 10, False)        2628.0  6001  0.437927
2    (14, 1, True)         -26.0    85 -0.305882
3   (13, 1, False)        -614.0   898 -0.683742
4  (12, 10, False)       -1996.0  3597 -0.554907

In [ ]:
df[df['state']==(21,9,False)]['value'].values

Out[ ]:
array([0.94723618])

In [ ]:
df[df['state']==(5,8,False)]['value'].values
Out[ ]:
array([-0.63513514])
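
The estimation loop above uses every-visit Monte Carlo (each occurrence of a state in an episode contributes a return). A first-visit variant, a common textbook alternative not used in this assignment, would count only the first occurrence per episode; a minimal sketch with hypothetical names (total_return_fv, N_fv):

# Sketch of first-visit Monte Carlo evaluation, reusing play_episode and policy
total_return_fv = defaultdict(float)
N_fv = defaultdict(int)

for i in range(num_iterations):
    episode = play_episode(env, policy)
    states, actions, rewards = zip(*episode)
    seen = set()
    for t, state in enumerate(states):
        if state in seen:          # skip repeat visits within the same episode
            continue
        seen.add(state)
        total_return_fv[state] += sum(rewards[t:])
        N_fv[state] += 1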
ASSIGNMENT - 6
In [11]:

import gym
import numpy as np
import random

In [12]:

env= gym.make('FrozenLake-v1') #, render_mode='human')

In [13]:

Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s, a)] = 0.0

In [14]:
def epsilon_greedy(state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key=lambda x: Q[(state, x)])

In [15]:
alpha=0.85
gamma= 0.90
epsilon = 0.8

In [16]:
num_episodes = 50000
num_timesteps= 1000

In [17]:
for i in range(num_episodes):
    s = env.reset()[0]
    for t in range(num_timesteps):
        a = epsilon_greedy(s, epsilon)
        s_, r, terminated, truncated, info = env.step(a)
        done = terminated or truncated
        a_ = np.argmax([Q[(s_, x)] for x in range(env.action_space.n)])
        Q[(s, a)] += alpha * (r + gamma * Q[(s_, a_)] - Q[(s, a)])
        s = s_
        if done:
            break

In [18]:
Q

Out[18]:
{(0, 0): 0.23477961696373423,
(0, 1): 0.22480181183703787,
(0, 2): 0.23961716957752016,
(0, 3): 0.24066398243905854,
(1, 0): 0.2204815896999076,
(1, 1): 0.04017125915710931,
(1, 2): 0.2822227428738474,
(1, 3): 0.22490808477046206,
(2, 0): 0.29961284509447655,
(2, 1): 0.32990657866523887,
(2, 2): 0.37292229711147334,
(2, 3): 0.2790710024900863,
(3, 0): 0.25000597793284357,
(3, 1): 0.2575759230383145,
(3, 2): 0.037377204692152305,
(3, 3): 0.32551898596551954,
(4, 0): 0.3804023551933965,
(4, 1): 0.00856676265665978,
(4, 2): 0.5076563484150082,
(4, 3): 0.050394379122136346,
(5, 0): 0.0,
(5, 1): 0.0,
(5, 2): 0.0,
(5, 3): 0.0,
(6, 0): 0.4285911909119954,
(6, 1): 0.0002831967627810342,
(6, 2): 0.692932809233417,
(6, 3): 0.006210297861473632,
(7, 0): 0.0,
(7, 1): 0.0,
(7, 2): 0.0,
(7, 3): 0.0,
(8, 0): 0.4724947380728043,
(8, 1): 0.5292926568861616,
(8, 2): 0.0854144618498413,
(8, 3): 0.4739574281045383,
(9, 0): 0.07482359865928406,
(9, 1): 0.7499983936496128,
(9, 2): 0.5420577123103719,
(9, 3): 0.07627448541464799,
(10, 0): 0.5934266484891275,
(10, 1): 0.8240260740178592,
(10, 2): 0.774672222464751,
(10, 3): 0.09258352148159432,
(11, 0): 0.0,
(11, 1): 0.0,
(11, 2): 0.0,
(11, 3): 0.0,
(12, 0): 0.0,
(12, 1): 0.0,
(12, 2): 0.0,
(12, 3): 0.0,
(13, 0): 0.5646633544813056,
(13, 1): 0.11021736826449313,
(13, 2): 0.6234923802366498,
(13, 3): 0.7062633423537164,
(14, 0): 0.807111703090516,
(14, 1): 0.6835467756183892,
(14, 2): 0.9109203529502328,
(14, 3): 0.7967422343382821,
(15, 0): 0.0,
(15, 1): 0.0,
(15, 2): 0.0,
(15, 3): 0.0}
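
The greedy policy implied by the learned Q values can be read off state by state. A sketch (not part of the assignment; variable names are hypothetical) that extracts it and measures its success rate on the slippery default environment:

# Extract the greedy policy from Q and evaluate it over a few episodes
policy = [np.argmax([Q[(s, a)] for a in range(env.action_space.n)])
          for s in range(env.observation_space.n)]

successes = 0
eval_episodes = 100
for _ in range(eval_episodes):
    s = env.reset()[0]
    done = False
    while not done:
        s, r, terminated, truncated, info = env.step(policy[s])
        done = terminated or truncated
    successes += r
print("Success rate:", successes / eval_episodes)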
