ASSIGNMENT - 1

In [1]:

import gym

In [2]:
env = gym.make('MountainCar-v0',render_mode="rgb_array")

In [3]:
# Observation and action space
obs_space = env.observation_space
action_space = env.action_space
print("The observation space: {} ".format(obs_space))
print("The action space: {} ".format(action_space))

The observation space: Box([-1.2 -0.07], [0.6 0.07], (2,), float32)


The action space: Discrete(3)
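
For reference, the two entries of the Box observation are the car's position and velocity, and the three discrete actions are push left (0), no push (1), and push right (2). A minimal check of the bounds, using the obs_space variable defined above, might look like this (a sketch, not part of the assignment):

# Inspect the Box bounds; low/high are numpy arrays of [position, velocity]
print("low :", obs_space.low)    # expected: [-1.2  -0.07]
print("high:", obs_space.high)   # expected: [ 0.6   0.07]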

In [4]:

# pip install gym[classic_control]

In [5]:
import matplotlib.pyplot as plt

# reset the environment and see the initial observation
obs = env.reset()
print("The initial observation is {} ".format(obs))

# Sample a random action from the entire action space
random_action = env.action_space.sample()

# Take the action and get the new observation
new_obs, reward, terminated, truncated, info = env.step(random_action)
print("The new observation is {} ".format(new_obs))

The initial observation is (array([-0.5884122, 0. ], dtype=float32), {})


The new observation is [-5.8892918e-01 -5.1695626e-04]

In [6]:
env_screen = env.render()
env.close()

import matplotlib.pyplot as plt


plt.imshow(env_screen)
Out[6]:

<matplotlib.image.AxesImage at 0x266f66029e0>
In [7]:

import time

# Number of steps you run the agent for
num_steps = 150

obs = env.reset()

for step in range(num_steps):
    # take random action, but you can also do something more intelligent
    # action = my_intelligent_agent_fn(obs)
    action = env.action_space.sample()

    # apply the action
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated

    # Render the env
    env.render()

    # Wait a bit before the next frame unless you want to see a crazy fast video
    time.sleep(0.001)
    plt.imshow(env.render())

    # If the episode is up, then start another one
    if done:
        env.reset()

# Close the env
env.close()
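
The commented-out my_intelligent_agent_fn(obs) in the loop above is only a placeholder. As an illustration (a hypothetical helper, not part of the assignment), a simple hand-coded MountainCar policy could push in the direction the car is already moving:

# Hypothetical replacement for my_intelligent_agent_fn: accelerate in the
# direction of the current velocity. Expects the 2-element observation array
# (e.g. the obs returned by env.step, or env.reset()[0]).
# Actions in MountainCar-v0: 0 = push left, 1 = no push, 2 = push right.
def my_intelligent_agent_fn(obs):
    position, velocity = obs
    return 2 if velocity >= 0 else 0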
ASSIGNMENT - 2
In [1]:

import gym
import random
import matplotlib.pyplot as plt
import numpy as np

In [3]:

# MAKING A GYM ENVIRONMENT FOR THE GAME


environment = gym.make("FrozenLake-v1", is_slippery=False)
environment.reset()
environment.render()

In [4]:
# Q TABLE FOR THE GAME
#1
# qtable = np.zeros((16, 4))
#2
nb_states = environment.observation_space.n # = 16
nb_actions = environment.action_space.n # = 4
qtable = np.zeros((nb_states, nb_actions))
qtable

Out[4]:
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [5]:
# random.choice(["LEFT", "DOWN", "RIGHT", "UP"])
environment.action_space.sample()
Out[5]:

In [6]:
environment.step(2)
environment.render()

In [7]:

action = environment.action_space.sample()

# 2. Implement this action and move the agent in the desired direction
new_state, reward, terminated, truncated, info = environment.step(action)
# Display the results (reward and map)
environment.render()
print(f'Reward = {reward}')

Reward = 0.0

In [8]:

import matplotlib.pyplot as plt


plt.rcParams['figure.dpi'] = 300
plt.rcParams.update({'font.size': 17})

# We re-initialize the Q-table


qtable = np.zeros((environment.observation_space.n, environment.action_space.n))

# Hyperparameters
episodes = 1000 # Total number of episodes
alpha = 0.5 # Learning rate
gamma = 0.9 # Discount factor

# List of outcomes to plot


outcomes = []

print('Q-table before training:')


print(qtable)

# Training
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # By default, we consider our outcome to be a failure
    outcomes.append("Failure")

    # Until the agent gets stuck in a hole or reaches the goal, keep training it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If there's no best action (only zeros), take a random one
        else:
            action = environment.action_space.sample()

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Update Q(s,a)
        qtable[state, action] = qtable[state, action] + \
            alpha * (reward + gamma * np.max(qtable[new_state]) - qtable[state, action])

        # Update our current state
        state = new_state

        # If we have a reward, it means that our outcome is a success
        if reward:
            outcomes[-1] = "Success"

print()
print('===========================================')
print('Q-table after training:')
print(qtable)

# Plot outcomes
plt.figure(figsize=(12, 5))
plt.xlabel("Run number")
plt.ylabel("Outcome")
ax = plt.gca()
ax.set_facecolor('#efeeea')
plt.bar(range(len(outcomes)), outcomes, color="#0A047A", width=1.0)
plt.show()
Q-table before training:
[[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]]

===========================================
Q-table after training:
[[0. 0. 0.59049 0. ]
[0. 0. 0.6561 0. ]
[0. 0.729 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0.81 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0.2784375 0. ]
[0. 0.9 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 0. 0. ]
[0. 0. 1. 0. ]
[0. 0. 0. 0. ]]
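
Note that the training loop above acts greedily whenever a state has any non-zero Q value and only explores on all-zero rows. A common refinement (not used in the assignment) is epsilon-greedy exploration with a decaying epsilon; a minimal sketch, with hypothetical names epsilon and epsilon_decay:

# Sketch only: epsilon-greedy action selection with a decaying exploration rate
epsilon = 1.0
epsilon_decay = 0.001

def choose_action(state):
    if np.random.random() < epsilon:
        return environment.action_space.sample()   # explore
    return np.argmax(qtable[state])                # exploit

# Inside the training loop one would call choose_action(state) and decay epsilon
# once per episode, e.g. epsilon = max(0.05, epsilon - epsilon_decay).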

In [9]:
from IPython.display import clear_output
import time

state = environment.reset()[0]
done = False
sequence = []

while not done:
    # Choose the action with the highest value in the current state
    if np.max(qtable[state]) > 0:
        action = np.argmax(qtable[state])
    # If there's no best action (only zeros), take a random one
    else:
        action = environment.action_space.sample()

    # Add the action to the sequence
    sequence.append(action)

    # Implement this action and move the agent in the desired direction
    new_state, reward, terminated, truncated, info = environment.step(action)
    done = terminated or truncated

    # Update our current state
    state = new_state

    # Update the render
    clear_output(wait=True)
    environment.render()
    time.sleep(1)

print(f"Sequence = {sequence}")

Sequence = [2, 2, 1, 1, 1, 2]
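
The integers in the sequence follow FrozenLake's action encoding (0 = LEFT, 1 = DOWN, 2 = RIGHT, 3 = UP, matching the commented list in cell In [5] above). A one-liner to make the route readable:

# Translate action indices into direction names
directions = ["LEFT", "DOWN", "RIGHT", "UP"]
print([directions[a] for a in sequence])
# should print ['RIGHT', 'RIGHT', 'DOWN', 'DOWN', 'DOWN', 'RIGHT'] for the sequence above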

In [10]:
episodes = 100
nb_success = 0

# Evaluation
for _ in range(episodes):
    state = environment.reset()[0]
    done = False

    # Until the agent gets stuck or reaches the goal, keep evaluating it
    while not done:
        # Choose the action with the highest value in the current state
        if np.max(qtable[state]) > 0:
            action = np.argmax(qtable[state])
        # If there's no best action (only zeros), take a random one
        else:
            action = environment.action_space.sample()

        # Implement this action and move the agent in the desired direction
        new_state, reward, terminated, truncated, info = environment.step(action)
        done = terminated or truncated

        # Update our current state
        state = new_state

        # When we get a reward, it means we solved the game
        nb_success += reward

# Let's check our success rate!
print(f"Success rate = {nb_success/episodes*100}%")

Success rate = 100.0%


ASSIGNMENT - 3
In [1]:

import gym
import numpy as np
import matplotlib.pyplot as plt

In [2]:

env = gym.make("FrozenLake-v1", render_mode="human" ,is_slippery=False)

In [3]:

env.reset()
# env.render()
# env.step(2)
Out[3]:

(0, {'prob': 1})

In [4]:
# env.step(2)

In [5]:

# env.step(2)

In [6]:
print(env.observation_space)
env.action_space

Discrete(16)
Out[6]:

Discrete(4)

In [7]:
env.P[2][2]
Out[7]:
[(1.0, 3, 0.0, False)]
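
Each tuple in env.P[state][action] is (transition probability, next state, reward, done); because is_slippery=False, every action has exactly one deterministic transition. A quick way to inspect all four actions from state 2 (a sketch using the same env as above):

# Print every transition available from state 2
for a in range(env.action_space.n):
    print(a, env.P[2][a])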

In [8]:

def value_iteration(env, gamma=0.9):

    # initialize value table with zeros
    value_table = np.zeros(env.observation_space.n)

    # set number of iterations and threshold
    no_of_iterations = 100000
    threshold = 1e-20

    for i in range(no_of_iterations):

        # On each iteration, copy the value table to the updated_value_table
        updated_value_table = np.copy(value_table)

        # Now we calculate the Q value for each action in the state
        # and update the value of a state with the maximum Q value
        for state in range(env.observation_space.n):
            Q_value = []
            for action in range(env.action_space.n):
                next_states_rewards = []
                for next_sr in env.P[state][action]:
                    trans_prob, next_state, reward_prob, _ = next_sr
                    next_states_rewards.append(
                        trans_prob * (reward_prob + gamma * updated_value_table[next_state]))

                Q_value.append(np.sum(next_states_rewards))

            value_table[state] = max(Q_value)

        # We check whether we have reached convergence, i.e. whether the difference
        # between our value table and the updated value table is very small. How do
        # we know it is very small? We set a threshold: if the difference is less
        # than the threshold, we break the loop and return the value function as the
        # optimal value function.
        if np.sum(np.fabs(updated_value_table - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d .' % (i + 1))
            break

    return value_table

In [9]:
def extract_policy(value_table, gamma=0.9):

    # initialize the policy with zeros
    policy = np.zeros(env.observation_space.n)

    for state in range(env.observation_space.n):

        # initialize the Q table for a state
        Q_table = np.zeros(env.action_space.n)

        # compute the Q value for all actions in the state
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward_prob, _ = next_sr
                Q_table[action] += trans_prob * (reward_prob + gamma * value_table[next_state])

        # select the action which has the maximum Q value as the optimal action of the state
        policy[state] = np.argmax(Q_table)

    return policy

In [10]:
optimal_value_function = value_iteration(env=env,gamma=0.9)
optimal_value_function

Value-iteration converged at iteration# 7.


Out[10]:
array([0.59049, 0.6561 , 0.729 , 0.6561 , 0.6561 , 0. , 0.81 ,
0. , 0.729 , 0.81 , 0.9 , 0. , 0. , 0.9 ,
1. , 0. ])

In [11]:
optimal_policy = extract_policy(optimal_value_function, gamma=0.9)
optimal_policy
Out[11]:

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])

In [12]:
opt_pol = optimal_policy.reshape(4,4)
print('THE OPTIMAL POLICY IS \n ',opt_pol)

THE OPTIMAL POLICY IS


[[1. 2. 1. 0.]
[1. 0. 1. 0.]
[2. 1. 1. 0.]
[0. 2. 2. 0.]]
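
As a sanity check (a sketch, not part of the assignment, assuming the five-value step API used in the earlier assignments), the extracted policy can be rolled out once to confirm it reaches the goal:

# Roll out optimal_policy on the same env and report the episode return
state = env.reset()[0]
done = False
total_reward = 0.0
while not done:
    action = int(optimal_policy[state])
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    total_reward += reward
print("Return from one rollout:", total_reward)   # expected 1.0 on this deterministic 4x4 map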
ASSIGNMENT - 4
In [1]:

import gym
import pandas as pd
from collections import defaultdict

In [ ]:

env = gym.make('Blackjack-v1')
print(env.reset())

(18, 7, False)

In [ ]:

print(env.action_space)

Discrete(2)

In [ ]:
print(env.observation_space)

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [ ]:
def policy(state):
    return 0 if state[0] > 19 else 1

In [ ]:
state = env.reset()
print(state)

(20, 4, False)

In [ ]:

print(policy(state))

In [ ]:

num_timesteps = 500

def play_episode(env, policy):
    state = env.reset()
    episode = []
    for t in range(num_timesteps):
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        if done:
            break
        state = next_state
    return episode

In [ ]:
print(play_episode(env,policy))

[((12, 10, False), 1, 0.0), ((19, 10, False), 1, -1.0)]


In [ ]:
total_return = defaultdict(float)
N = defaultdict(int)

num_iterations = 100000

for i in range(num_iterations):
    episode = play_episode(env, policy)
    states, actions, rewards = zip(*episode)
    for t, state in enumerate(states):
        R = sum(rewards[t:])          # return from time step t onwards (every-visit MC)
        total_return[state] += R
        N[state] += 1

In [ ]:
total_return = pd.DataFrame(total_return.items(), columns=['state', 'total_return'])
N = pd.DataFrame(N.items(), columns=['state', 'N'])
df = pd.merge(total_return, N, on='state')
df.head()
Out[ ]:

             state  total_return     N
0  (13, 10, False)       -2317.0  3807
1  (20, 10, False)        2628.0  6001
2    (14, 1, True)         -26.0    85
3   (13, 1, False)        -614.0   898
4  (12, 10, False)       -1996.0  3597

In [ ]:
df['value'] = df['total_return'] / df['N']
df.head()
Out[ ]:

             state  total_return     N     value
0  (13, 10, False)       -2317.0  3807 -0.608616
1  (20, 10, False)        2628.0  6001  0.437927
2    (14, 1, True)         -26.0    85 -0.305882
3   (13, 1, False)        -614.0   898 -0.683742
4  (12, 10, False)       -1996.0  3597 -0.554907

In [ ]:
df[df['state']==(21,9,False)]['value'].values

Out[ ]:
array([0.94723618])

In [ ]:
df[df['state']==(5,8,False)]['value'].values
Out[ ]:
array([-0.63513514])
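
The estimation loop above uses every-visit Monte Carlo (each occurrence of a state in an episode contributes a return). A first-visit variant, a common textbook alternative not used in this assignment, would count only the first occurrence per episode; a minimal sketch with hypothetical names (total_return_fv, N_fv):

# Sketch of first-visit Monte Carlo evaluation, reusing play_episode and policy
total_return_fv = defaultdict(float)
N_fv = defaultdict(int)

for i in range(num_iterations):
    episode = play_episode(env, policy)
    states, actions, rewards = zip(*episode)
    seen = set()
    for t, state in enumerate(states):
        if state in seen:          # skip repeat visits within the same episode
            continue
        seen.add(state)
        total_return_fv[state] += sum(rewards[t:])
        N_fv[state] += 1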
ASSIGNMENT - 6
In [11]:

import gym
import numpy as np
import random

In [12]:

env= gym.make('FrozenLake-v1') #, render_mode='human')

In [13]:

Q = {}
for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
        Q[(s, a)] = 0.0

In [14]:
def epsilon_greedy(state, epsilon):
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()
    else:
        return max(list(range(env.action_space.n)), key=lambda x: Q[(state, x)])

In [15]:
alpha=0.85
gamma= 0.90
epsilon = 0.8

In [16]:
num_episodes = 50000
num_timesteps= 1000

In [17]:
for i in range(num_episodes):
    s = env.reset()[0]
    for t in range(num_timesteps):
        a = epsilon_greedy(s, epsilon)
        s_, r, terminated, truncated, info = env.step(a)
        done = terminated or truncated
        a_ = np.argmax([Q[(s_, x)] for x in range(env.action_space.n)])
        Q[(s, a)] += alpha * (r + gamma * Q[(s_, a_)] - Q[(s, a)])
        s = s_
        if done:
            break

In [18]:
Q

Out[18]:
{(0, 0): 0.23477961696373423,
(0, 1): 0.22480181183703787,
(0, 2): 0.23961716957752016,
(0, 3): 0.24066398243905854,
(1, 0): 0.2204815896999076,
(1, 1): 0.04017125915710931,
(1, 2): 0.2822227428738474,
(1, 3): 0.22490808477046206,
(2, 0): 0.29961284509447655,
(2, 1): 0.32990657866523887,
(2, 2): 0.37292229711147334,
(2, 3): 0.2790710024900863,
(3, 0): 0.25000597793284357,
(3, 1): 0.2575759230383145,
(3, 2): 0.037377204692152305,
(3, 3): 0.32551898596551954,
(4, 0): 0.3804023551933965,
(4, 1): 0.00856676265665978,
(4, 2): 0.5076563484150082,
(4, 3): 0.050394379122136346,
(5, 0): 0.0,
(5, 1): 0.0,
(5, 2): 0.0,
(5, 3): 0.0,
(6, 0): 0.4285911909119954,
(6, 1): 0.0002831967627810342,
(6, 2): 0.692932809233417,
(6, 3): 0.006210297861473632,
(7, 0): 0.0,
(7, 1): 0.0,
(7, 2): 0.0,
(7, 3): 0.0,
(8, 0): 0.4724947380728043,
(8, 1): 0.5292926568861616,
(8, 2): 0.0854144618498413,
(8, 3): 0.4739574281045383,
(9, 0): 0.07482359865928406,
(9, 1): 0.7499983936496128,
(9, 2): 0.5420577123103719,
(9, 3): 0.07627448541464799,
(10, 0): 0.5934266484891275,
(10, 1): 0.8240260740178592,
(10, 2): 0.774672222464751,
(10, 3): 0.09258352148159432,
(11, 0): 0.0,
(11, 1): 0.0,
(11, 2): 0.0,
(11, 3): 0.0,
(12, 0): 0.0,
(12, 1): 0.0,
(12, 2): 0.0,
(12, 3): 0.0,
(13, 0): 0.5646633544813056,
(13, 1): 0.11021736826449313,
(13, 2): 0.6234923802366498,
(13, 3): 0.7062633423537164,
(14, 0): 0.807111703090516,
(14, 1): 0.6835467756183892,
(14, 2): 0.9109203529502328,
(14, 3): 0.7967422343382821,
(15, 0): 0.0,
(15, 1): 0.0,
(15, 2): 0.0,
(15, 3): 0.0}
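
The greedy policy implied by the learned Q values can be read off state by state. A sketch (not part of the assignment; variable names are hypothetical) that extracts it and measures its success rate on the slippery default environment:

# Extract the greedy policy from Q and evaluate it over a few episodes
policy = [np.argmax([Q[(s, a)] for a in range(env.action_space.n)])
          for s in range(env.observation_space.n)]

successes = 0
eval_episodes = 100
for _ in range(eval_episodes):
    s = env.reset()[0]
    done = False
    while not done:
        s, r, terminated, truncated, info = env.step(policy[s])
        done = terminated or truncated
    successes += r
print("Success rate:", successes / eval_episodes)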
