Download the notebook or follow along.
import numpy as np
import gym
import seaborn as sns
import pandas as pd
import tensorflow as tf
sns.set()
$Q$-learning as a Regression Problem
When we first learned about $Q$-learning, we used the Bellman equation to learn the $Q$ function: $$ Q(s_t, a_t) \gets Q(s_t, a_t) + \alpha \left( r_t + (1-d_t)\gamma \max_{a_{t+1}} \left( Q(s_{t+1}, a_{t+1}) \right) - Q(s_t, a_t) \right) $$
Compare this to gradient descent for a regression problem with squared-error loss $\left( \hat{y} - y \right)^2$: $$ \theta \gets \theta - 2 \alpha \left( \hat{y} - y \right) \nabla_\theta \hat{y} $$
These methods are essentially analogous: we update the parameters of our function in proportion to the difference between our prediction and the 'true' value. The difference for tabular $Q$-learning is that we effectively have a separate parameter for each state-action pair.
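To make the analogy concrete, here is a minimal sketch of both updates on a single made-up transition (the table size and the values of s_t, a_t, r_t, s_t_next, d_t are placeholders, and numpy comes from the imports above):

# Hypothetical setup: a small Q table and a single transition.
Q = np.zeros((16, 4))
alpha, gamma = 0.5, 0.95
s_t, a_t, r_t, s_t_next, d_t = 0, 1, 1.0, 4, False

target = r_t + (1 - d_t) * gamma * np.max(Q[s_t_next])

# Tabular Q-learning update.
q_learning = Q[s_t, a_t] + alpha * (target - Q[s_t, a_t])

# Gradient descent on 0.5 * (Q[s_t, a_t] - target)**2, treating the single
# table entry Q[s_t, a_t] as the parameter (so the gradient of the
# prediction with respect to it is 1).
gradient_descent = Q[s_t, a_t] - alpha * (Q[s_t, a_t] - target)

assert np.isclose(q_learning, gradient_descent)  # identical updates
Q[s_t, a_t] = q_learning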
If we define the following loss function: $$ L(\theta) = \frac{1}{2} \left( r_t + (1-d_t)\gamma \max_{a_{t+1}} \left( Q(s_{t+1}, a_{t+1}) \right) - Q(s_t, a_t) \right)^2 $$
then the gradient descent update rule is exactly the same as our $Q$-learning update rule!
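To see why, treat the bootstrapped target as a constant (which is exactly what tf.stop_gradient will do in the code below) and differentiate $L$ with respect to the single table entry $Q(s_t, a_t)$: $$ \frac{\partial L}{\partial Q(s_t, a_t)} = -\left( r_t + (1-d_t)\gamma \max_{a_{t+1}} \left( Q(s_{t+1}, a_{t+1}) \right) - Q(s_t, a_t) \right) $$ so the gradient descent step $$ Q(s_t, a_t) \gets Q(s_t, a_t) - \alpha \frac{\partial L}{\partial Q(s_t, a_t)} $$ is precisely the Bellman update we started with.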
FrozenLake with TensorFlow
Before diving into techniques like deep neural networks, I want to show you how we might do $Q$-learning in TensorFlow, using the same FrozenLake-v0 environment from earlier.
class Agent:
    def __init__(self, num_states, num_actions,
                 epsilon_i=1.0,
                 epsilon_f=0.0,
                 n_epsilon=0.1,
                 alpha=0.5,
                 gamma=0.95,
                 hidden_layers=[]):  # unused by this tabular agent
        self.epsilon_i = epsilon_i
        self.epsilon_f = epsilon_f
        self.epsilon = epsilon_i
        self.n_epsilon = n_epsilon
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma

        # The "model" is just a table of Q-values stored as a single variable,
        # trained with plain stochastic gradient descent.
        self.Q = tf.Variable(tf.zeros((num_states, num_actions)), name="Q")
        self.optimizer = tf.keras.optimizers.SGD(alpha)

    def decay_epsilon(self, n):
        # Linearly anneal epsilon from epsilon_i to epsilon_f over the first
        # n_epsilon fraction of training; n is the fraction of steps elapsed.
        self.epsilon = max(
            self.epsilon_f,
            self.epsilon_i - (n / self.n_epsilon) * (self.epsilon_i - self.epsilon_f))

    def act(self, s_t):
        # Epsilon-greedy action selection.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)
        return np.argmax(self.Q[s_t])

    def update(self, s_t, a_t, r_t, s_t_next, d_t):
        # Treat the bootstrapped target as a constant so gradients only flow
        # through the prediction Q(s_t, a_t).
        Q_next = tf.stop_gradient(tf.reduce_max(self.Q[s_t_next]))
        with tf.GradientTape() as tape:
            loss = 0.5 * (r_t + (1 - d_t) * self.gamma * Q_next - self.Q[s_t, a_t]) ** 2
        grads = tape.gradient(loss, [self.Q])
        self.optimizer.apply_gradients(zip(grads, [self.Q]))
def plot(data, window=100):
    # Plot the rolling mean of episode rewards, sampled every `window` episodes.
    sns.lineplot(
        data=data.rolling(window=window).mean()[window - 1::window]
    )
def train(env_name,
          T=100000, alpha=0.8, gamma=0.95,
          epsilon_i=1.0, epsilon_f=0.0, n_epsilon=0.1):
    env = gym.make(env_name)
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    agent = Agent(num_states, num_actions, alpha=alpha, gamma=gamma,
                  epsilon_i=epsilon_i, epsilon_f=epsilon_f, n_epsilon=n_epsilon)

    rewards = []
    episode_rewards = 0
    s_t = env.reset()

    for t in range(T):
        # Act, observe the transition, and update the Q table on it.
        a_t = agent.act(s_t)
        s_t_next, r_t, d_t, info = env.step(a_t)
        agent.update(s_t, a_t, r_t, s_t_next, d_t)
        agent.decay_epsilon(t / T)
        s_t = s_t_next

        episode_rewards += r_t
        if d_t:
            rewards.append(episode_rewards)
            episode_rewards = 0
            s_t = env.reset()

    plot(pd.DataFrame(rewards))
    return agent
train("FrozenLake-v0", T=100000)