Q-Learning on Lunar Lander and Frozen Lake

Frozen Lake

import gym
import numpy as np
import time
from IPython.display import clear_output
env = gym.make('FrozenLake-v0')
env.render()
SFFF
FHFH
FFFH
HFFG
numActions = env.action_space.n
numStates = env.observation_space.n

print(numActions,numStates)
4 16
Q = np.zeros((numStates,numActions))

print(Q)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
nE = 10000       # number of training episodes
mpE = 80         # max steps per episode
alpha = 0.01     # learning rate
gamma = 0.99     # discount factor
epsilon = 1      # initial exploration rate
edr = 0.0001     # exploration decay rate
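The training loop below uses epsilon-greedy action selection and the standard tabular Q-learning update Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(r + gamma*max_a' Q(s',a')). Written as a stand-alone helper (a hypothetical refactoring for reference, not used by the notebook), the update step looks like this:

def q_update(Q, state, action, reward, next_state, alpha, gamma):
    # Bootstrapped target: immediate reward plus discounted value of the best next action.
    target = reward + gamma * np.max(Q[next_state, :])
    # Move the old estimate a fraction alpha of the way toward the target.
    Q[state, action] = (1 - alpha) * Q[state, action] + alpha * target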
for e in range(nE):
    state = env.reset()
    done = False
    for step in range(mpE):
        # epsilon-greedy action selection
        if np.random.rand() > epsilon:
            action = np.argmax(Q[state,:])        # exploit the best known action
        else:
            action = env.action_space.sample()    # explore with a random action
        ns,reward,done,info = env.step(action)
        # tabular Q-learning update toward reward + gamma * max_a' Q(ns, a')
        Q[state,action] = (1-alpha)*Q[state,action]+alpha*(reward+gamma*np.max(Q[ns,:]))
        state = ns
        if done:
            break
    # decay the exploration rate after each episode
    epsilon = epsilon*np.exp(-edr*e)
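Note that multiplying epsilon by np.exp(-edr*e) every episode compounds the decay (the exponent grows roughly quadratically with the episode index), so exploration falls off much faster than a plain exponential schedule. If a plain exponential schedule is intended instead, a common alternative (with assumed bounds, not from the original notebook) is:

epsilon_min, epsilon_max = 0.01, 1.0   # assumed bounds for illustration
epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-edr * e)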
Q
array([[5.99111917e-02, 5.39127429e-08, 2.17374482e-07, 5.45540084e-08],
       [2.17128272e-02, 2.60457993e-10, 6.00641319e-14, 1.32357723e-09],
       [3.38990428e-15, 7.16113524e-16, 3.88459030e-16, 7.96722571e-03],
       [1.78960082e-03, 0.00000000e+00, 1.82496912e-20, 1.82496912e-20],
       [6.61165734e-07, 5.97543386e-02, 3.59911930e-07, 4.55951033e-08],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.84880149e-16, 0.00000000e+00, 7.60677264e-02, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.62163328e-08, 1.02864055e-05, 1.53226070e-06, 1.23706488e-01],
       [1.29854182e-05, 1.76902113e-05, 1.90883193e-01, 4.63819943e-06],
       [1.87742049e-04, 1.32186676e-07, 2.35026008e-01, 2.84424843e-07],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 3.87235389e-04, 3.30428548e-01, 4.70975442e-07],
       [6.85822113e-05, 9.99900000e-03, 2.84496388e-06, 5.48603540e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])
# Play a few episodes greedily with the learned Q-table (no exploration, no learning).
for e in range(4):
    state = env.reset()
    done = False
    print("Episode:",e)
    time.sleep(0.1)
    for step in range(mpE):
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)
        action = np.argmax(Q[state,:])  # always take the greedy action
        ns,reward,done,info = env.step(action)
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("Goal")
                time.sleep(3)
            else:
                print("Hole")
                time.sleep(3)
                clear_output(wait=True)
            break
        state = ns
env.close()
  (Up)
SFFF
FHFH
FFFH
HFFG
Goal
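A few rendered episodes are anecdotal; the greedy policy can also be evaluated over many episodes without rendering. A minimal sketch (not part of the original notebook):

wins = 0
n_eval = 1000
env = gym.make('FrozenLake-v0')   # re-create the env, since it was closed above
for _ in range(n_eval):
    state = env.reset()
    for _ in range(mpE):
        state, reward, done, info = env.step(np.argmax(Q[state, :]))
        if done:
            wins += reward == 1   # FrozenLake only returns reward 1 on reaching the goal
            break
env.close()
print("success rate:", wins / n_eval)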

Lunar Lander

!pip install box2d-py
import gym

env = gym.make('LunarLander-v2')

# Actions: no-op, fire left engine, fire main engine, fire right engine
ACTIONS = env.action_space.n

# The landing pad is always at coordinates (0,0). Coordinates are the first two numbers in the state vector.
# Reward for moving from the top of the screen to the landing pad with zero speed is about 100..140 points.
# If the lander moves away from the landing pad, it loses that reward. The episode finishes if the lander
# crashes or comes to rest, receiving an additional -100 or +100 points. Each leg ground contact is +10 points.
# Firing the main engine costs 0.3 points each frame. Solved is 200 points.
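The observation is an 8-dimensional vector: x, y, vx, vy, angle, angular velocity, and two leg-contact flags. A quick random rollout to inspect it (assuming the classic gym API used throughout this notebook, where reset() returns the observation and step() returns (obs, reward, done, info)):

s = env.reset()
print("state vector:", s)   # x, y, vx, vy, angle, angular velocity, left/right leg contact
s, r, done, info = env.step(env.action_space.sample())
print("reward after one random action:", r)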

import numpy as np
import random

def discretize_state(state):
    # Bin the first five state variables (x, y, vx, vy, angle) into 5 buckets each;
    # angular velocity and the two leg-contact flags are ignored.
    dstate = list(state[:5])
    dstate[0] = int(0.5*(state[0]+0.7)*10/2.0) # pos x
    dstate[1] = int(0.5*(state[1]+0.5)*10/2.0) # pos y
    dstate[2] = int(0.5*(state[2]+1.5)*10/3.0) # vel x
    dstate[3] = int(0.5*(state[3]+2)*10/3.0) # vel y
    dstate[4] = int(0.5*(state[4]+3.14159)*10/(2*3.14159)) # angle
    # clamp each bin index to the valid range [0, 4]
    for i in range(5):
        dstate[i] = min(max(dstate[i], 0), 4)
    return tuple(dstate)
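As a quick sanity check of the discretization (again assuming the old gym API where env.reset() returns the raw observation), each reset state should map to a tuple of five bin indices in [0, 4]:

s = env.reset()
print(s[:5], "->", discretize_state(s))   # five floats -> a tuple such as (3, 4, 2, 3, 2)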

def run(num_episodes, alpha, gamma, explore_mult):
    max_rewards = []   # max Q-value in the table after each episode
    last_reward = []   # reward received on the final step of each episode
    qtable = np.subtract(np.zeros((5, 5, 5, 5, 5, ACTIONS)), 100) # start all Q-values at -100
    explore_rate = 1.0
    for episode in range(num_episodes):
        s = env.reset()
        state = discretize_state(s)
        
        for step in range(10000):

            # select action
            if random.random() < explore_rate:
                action = random.choice(range(ACTIONS))
            else:
                action = np.argmax(qtable[state])

            (new_s, reward, done, _) = env.step(action)
            new_state = discretize_state(new_s)

            # update Q
            best_future_q = np.amax(qtable[new_state]) # returns best possible reward from next state
            prior_val = qtable[state + (action,)]
            qtable[state + (action,)] = (1.0-alpha)*prior_val + alpha*(reward + gamma * best_future_q)
            state = new_state
            
            if done or step == 9999:
                last_reward.append(reward)
                break
        
        if explore_rate > 0.01:
            explore_rate *= explore_mult    
        max_rewards.append(np.amax(qtable))
        
    return (max_rewards, last_reward[-50:], qtable) # keep only the final-step rewards of the last 50 episodes


num_episodes = 100
for alpha in [0.05, 0.10, 0.15]:
    for gamma in [0.85, 0.90, 0.95]:
        (max_rewards, last_reward, _) = run(num_episodes=num_episodes, alpha=alpha, gamma=gamma, explore_mult=0.995)
        print("alpha = %.2f, gamma = %.2f, mean last 50 outcomes = %.2f, q max: %.2f, q mean: %.2f" % (alpha, gamma, np.mean(last_reward), np.max(max_rewards), np.mean(max_rewards)))

(max_rewards, last_reward, qtable) = run(num_episodes=200, alpha=0.1, gamma=0.95, explore_mult=0.995)
print("mean last 50 outcomes = %.2f, q max: %.2f, q mean: %.2f" % (np.mean(last_reward), np.max(max_rewards), np.mean(max_rewards)))
np.save('qtable.npy', qtable)
alpha = 0.05, gamma = 0.85, mean last 50 outcomes = -100.00, q max: -9.53, q mean: -26.74
alpha = 0.05, gamma = 0.90, mean last 50 outcomes = -100.00, q max: -35.02, q mean: -54.21
alpha = 0.05, gamma = 0.95, mean last 50 outcomes = -100.00, q max: -33.37, q mean: -64.44
alpha = 0.10, gamma = 0.85, mean last 50 outcomes = -100.00, q max: -1.27, q mean: -23.16
alpha = 0.10, gamma = 0.90, mean last 50 outcomes = -100.00, q max: -1.36, q mean: -25.79
alpha = 0.10, gamma = 0.95, mean last 50 outcomes = -100.00, q max: -24.42, q mean: -53.55
alpha = 0.15, gamma = 0.85, mean last 50 outcomes = -100.00, q max: 3.06, q mean: -11.68
alpha = 0.15, gamma = 0.90, mean last 50 outcomes = -100.00, q max: 6.88, q mean: -23.10
alpha = 0.15, gamma = 0.95, mean last 50 outcomes = -98.00, q max: 6.86, q mean: -31.84
mean last 50 outcomes = -100.00, q max: -3.02, q mean: -26.40
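Since the classic LunarLander-v2 implementation overrides the final step's reward with +100 when the lander comes to rest and -100 when it crashes, the number of successful landings in the last 50 episodes can be read off last_reward directly; a small check (not part of the original run):

last = np.array(last_reward)
print("successful landings: %d / %d" % (np.sum(last == 100), len(last)))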
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

!wget http://www.atarimania.com/roms/Roms.rar
!mkdir /content/ROM/
!unrar e /content/Roms.rar /content/ROM/
!python -m atari_py.import_roms /content/ROM/
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env
# Use best qtable to play the game (no learning anymore)
import gym
import numpy as np

env = wrap_env(gym.make('LunarLander-v2'))
qtable = np.load('qtable.npy')

for i in range(100):
    s = env.reset()
    state = discretize_state(s)
    for step in range(10000):
        env.render()

        # select action
        action = np.argmax(qtable[state])

        (new_s, reward, done, _) = env.step(action)
        new_state = discretize_state(new_s)

        if done or step == 9999:
            break

        state = new_state
            
env.close()
show_video()