I have been learning Reinforcement Learning for about two weeks. Although haven’t go through all the course of Arthur Juliani, I had been able to write a small example of Q-learning now.

This example is about using DNN for Q-value table to solve a path-finding-problem. Actually, the path is more looks like a tree:

The start point is ‘0’, and the destination (or ‘goal’) is ’12’.

The code framework of my example is mainly from Manuel Amunategui’s tutorial but replacing Q-value table with a one-layer-neural-network.

```
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import pylab as plt
MATRIX_SIZE = 15
goal = 12
points_list = [(0,1), (0,2), \
(1,3), (1,4), (2,5), (2,6), \
(3,7), (3,8), (4,9), (4,10), \
(5,11), (5,12), (6,13), (6,14)]
# Build feed-forward network by using 'state' as input, 'best action' as output
state_in = tf.placeholder(tf.int32, [1])
state_oh = slim.one_hot_encoding(state_in, 15)
output = slim.fully_connected(state_oh, 15,
biases_initializer = None, activation_fn = tf.nn.relu,
weights_initializer = tf.ones_initializer())
outputQ = tf.reshape(output, [-1])
chosen_action = tf.argmax(outputQ, 0)
nextQ = tf.placeholder(tf.float32, [15])
loss = tf.reduce_sum(tf.square(nextQ - outputQ))
# Gradient Descent Optimizer usually have better generalization performance
optimizer = tf.train.GradientDescentOptimizer(0.1)
update = optimizer.minimize(loss)
# Build reward matrix
R = np.matrix(np.ones(shape=(MATRIX_SIZE, MATRIX_SIZE)))
# Set extremely low reward (minus) for unconnected nodes
R *= -1000
for point in points_list:
if point[1] == goal:
R[point] = 100
else:
R[point] = 0
if point[0]== goal:
R[point[::-1]] = 100
else:
R[point[::-1]] = 0
R[goal, goal] = 100
# learning parameter
gamma = 0.9
# Epsilon-Greedy Algorithm
e = 0.1
# Training
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
reward_list = []
for j in range(50):
all_reward = 0
for i in range(10):
current_state = np.random.randint(0, 15)
# Use current state to predict best action
action, allQ = sess.run([chosen_action, outputQ],
feed_dict = {state_in: [current_state]})
if np.random.rand(1) < e:
action = np.random.randint(0, 15, dtype = np.int)
new_state = action
Q1 = sess.run(outputQ,
feed_dict = {state_in: [new_state]})
maxQ1 = np.max(Q1)
reward = R[current_state, action]
targetQ = allQ
targetQ[action] = reward + gamma * maxQ1
# Use next state and next Q-values to train neural network
_, read_loss = sess.run([update, loss],
feed_dict = {state_in: [current_state], nextQ: targetQ})
all_reward += reward
reward_list.append(all_reward)
# show curve of reward in different training steps
plt.plot(reward_list)
plt.show()
# Testing
current_state = 0
steps = [current_state]
while current_state != goal:
action = sess.run([chosen_action],
feed_dict = {state_in: [current_state]})
steps.append(action[0])
current_state = action[0]
print("Most efficient path:")
print(steps)
```

The rewards curve in training steps:

And this example will finally report:

Most efficient path: [0, 2, 5, 12]

which is the correct answer.