I have been learning Reinforcement Learning for about two weeks. Although I haven’t gone through all of Arthur Juliani’s course yet, I have now been able to write a small example of Q-learning.
This example uses a DNN in place of the Q-value table to solve a path-finding problem. Actually, the graph looks more like a tree: the start point is ‘0’, and the destination (or ‘goal’) is ‘12’.

The code framework of my example is mainly from Manuel Amunategui’s tutorial, but it replaces the Q-value table with a one-layer neural network.

```import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np
import pylab as plt

MATRIX_SIZE = 15
goal = 12

points_list = [(0,1), (0,2), \
(1,3), (1,4), (2,5), (2,6), \
(3,7), (3,8), (4,9), (4,10), \
(5,11), (5,12), (6,13), (6,14)]

# Build feed-forward network by using 'state' as input, 'best action' as output
state_in = tf.placeholder(tf.int32, )
state_oh = slim.one_hot_encoding(state_in, 15)
output = slim.fully_connected(state_oh, 15,
biases_initializer = None, activation_fn = tf.nn.relu,
weights_initializer = tf.ones_initializer())

outputQ = tf.reshape(output, [-1])
chosen_action = tf.argmax(outputQ, 0)

nextQ = tf.placeholder(tf.float32, )
loss = tf.reduce_sum(tf.square(nextQ - outputQ))
# Gradient Descent Optimizer usually have better generalization performance
update = optimizer.minimize(loss)

# Build reward matrix
R = np.matrix(np.ones(shape=(MATRIX_SIZE, MATRIX_SIZE)))
# Set extremely low reward (minus) for unconnected nodes
R *= -1000

for point in points_list:
if point == goal:
R[point] = 100
else:
R[point] = 0

if point== goal:
R[point[::-1]] = 100
else:
R[point[::-1]] = 0

R[goal, goal] = 100

# learning parameter
gamma = 0.9
# Epsilon-Greedy Algorithm
e = 0.1

# Training
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
reward_list = []
for j in range(50):
all_reward = 0
for i in range(10):
current_state = np.random.randint(0, 15)
# Use current state to predict best action
action, allQ = sess.run([chosen_action, outputQ],
feed_dict = {state_in: [current_state]})
if np.random.rand(1) < e:
action = np.random.randint(0, 15, dtype = np.int)
new_state = action
Q1 = sess.run(outputQ,
feed_dict = {state_in: [new_state]})
maxQ1 = np.max(Q1)
reward = R[current_state, action]
targetQ = allQ
targetQ[action] = reward + gamma * maxQ1
# Use next state and next Q-values to train neural network
feed_dict = {state_in: [current_state], nextQ: targetQ})
all_reward += reward
reward_list.append(all_reward)

# show curve of reward in different training steps
plt.plot(reward_list)
plt.show()

# Testing
current_state = 0
steps = [current_state]

while current_state != goal:
action = sess.run([chosen_action],
feed_dict = {state_in: [current_state]})
steps.append(action)
current_state = action

print("Most efficient path:")
print(steps)
```

The figure above shows the reward curve over the training steps. This example finally reports:

```Most efficient path:
[0, 2, 5, 12]
```