# ddpg_network.py loads the trained DDPG model into its own TensorFlow session and lets external
# modules call its inference function to observe the network outputs. This approach was chosen
# because it makes using multiple neural networks in the same job simpler. However, the current
# approach of handling the rnn_state outside of ddpg_network is not ideal.
import tensorflow as tf
import os
import numpy as np
# hyperparameters
gamma = 0.99 # reward discount factor
h1_actor = 400 # hidden layer 1 size for the actor
h2_actor = 300 # hidden layer 2 size for the actor
h3_actor = 200 # hidden layer 3 size for the actor
h1_critic = 400 # hidden layer 1 size for the critic
h2_critic = 300 # hidden layer 2 size for the critic
h3_critic = 200 # hidden layer 3 size for the critic
lstm_actor = 16 # lstm units for actor
lr_actor = 1e-4 # learning rate for the actor
lr_critic = 1e-3 # learning rate for the critic
lr_decay = 1 # learning rate decay (per episode)
l2_reg_actor = 1e-6 # L2 regularization factor for the actor
l2_reg_critic = 1e-6 # L2 regularization factor for the critic
dropout_actor = 0 # dropout rate for actor (0 = no dropout)
dropout_critic = 0 # dropout rate for critic (0 = no dropout)
num_episodes = 5000 # number of episodes
max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit)
tau = 1e-3 # soft target update rate
train_every = 100 # number of steps to run the policy (and collect experience) before updating network weights
replay_memory_capacity = int(1e6) # capacity of experience replay memory
minibatch_size = 64 # size of minibatch from experience replay memory for updates
initial_noise_scale = 1.0 # scale of the exploration noise process (1.0 is the range of each action dimension)
noise_decay = 0.997 # decay rate (per episode) of the scale of the exploration noise process
exploration_mu = 0.0 # mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt
exploration_sigma = 0.2 # sigma parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt (see the illustrative ou_noise_step below)
V_MIN = 17 # lower velocity bound
V_MAX = 30 # upper velocity bound
restore_from = None # checkpoint to resume training from (None = no restore)
restore_ep = 0 # episode count to resume from
# game parameters
#env = gym.make(env_to_use)
# state/action space shape
N_S = 4 # number of states
N_A = 1 # number of actions
A_BOUND = [-1, 1] # action bounds
MODEL_FILE = 'model-ep-4999-final-final.ckpt'
DATA_DIR = './data/'
LOG_DIR = '/vol/research/safeav/Sampo/condor-a2c/test/log_ddpg/ddpg_ssc_lstm'
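
# The exploration_mu/theta/sigma hyperparameters above parameterize an Ornstein-Uhlenbeck
# noise process, dXt = theta*(mu-Xt)*dt + sigma*dWt, which perturbs the actor's actions
# during training. It is not used in this inference-only module; the helper below is an
# illustrative sketch (an addition, not part of the original training code) of a single
# Euler-Maruyama step of that process.
def ou_noise_step(x, dt=1.0):
    """One discretized step of dXt = theta*(mu-Xt)*dt + sigma*dWt (illustrative only)."""
    return x + exploration_theta * (exploration_mu - x) * dt \
           + exploration_sigma * np.sqrt(dt) * np.random.normal(size=np.shape(x))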
class DdpgNetwork(object):
    """Implements the DDPG network model for estimating host vehicle actions."""

    def __init__(self):
        # set up tf session and model
        rl_graph = tf.Graph()
        rl_config = tf.ConfigProto()
        rl_config.gpu_options.allow_growth = True
        self.sess_rl = tf.Session(graph=rl_graph, config=rl_config)
        self.recurrent = True
        with self.sess_rl.as_default():
            with rl_graph.as_default():
                # BUILD MODEL
                # placeholders
                self.state_ph = tf.placeholder(dtype=tf.float32, shape=[None, N_S], name='state')
                self.action_ph = tf.placeholder(dtype=tf.float32, shape=[None, N_A], name='action')
                self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None], name='reward')
                self.next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None, N_S], name='next_state')
                self.is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None],
                                                         name='is_not_terminal')  # indicators (go into target computation)
                self.is_training_ph = tf.placeholder(dtype=tf.bool, shape=(), name='is_training')  # for dropout

                # create networks
                # actor network
                with tf.variable_scope('actor'):
                    # Policy's outputted action for each state_ph (for generating actions and training the critic)
                    self.actions = self.generate_actor_network(self.state_ph, trainable=True, reuse=False)

                # slow target actor network
                with tf.variable_scope('slow_target_actor', reuse=False):
                    # Slow target policy's outputted action for each next_state_ph (for training the critic);
                    # use stop_gradient to treat the output values as constant targets during backprop
                    slow_target_next_actions = tf.stop_gradient(
                        self.generate_actor_network(self.next_state_ph, trainable=False, reuse=False))

                # The critic networks below are never run during inference; they are built so that
                # every variable stored in the checkpoint exists in this graph when it is restored.
                with tf.variable_scope('critic') as scope:
                    # Critic applied to state_ph and a given action (for training the critic)
                    q_values_of_given_actions = self.generate_critic_network(self.state_ph, self.action_ph,
                                                                             trainable=True, reuse=False)
                    # Critic applied to state_ph and the current policy's outputted actions for state_ph
                    # (for training the actor via the deterministic policy gradient)
                    q_values_of_suggested_actions = self.generate_critic_network(self.state_ph, self.actions,
                                                                                 trainable=True, reuse=True)

                # slow target critic network
                with tf.variable_scope('slow_target_critic', reuse=False):
                    # Slow target critic applied to the slow target actor's outputted actions for next_state_ph;
                    # during training these feed the 1-step TD targets, e.g.
                    # reward_ph + is_not_terminal_ph * gamma * slow_q_values_next (not built here)
                    slow_q_values_next = tf.stop_gradient(
                        self.generate_critic_network(self.next_state_ph, slow_target_next_actions, trainable=False,
                                                     reuse=False))

                # GET SAVED WEIGHTS
                saver = tf.train.Saver()
                checkpoint_path = os.path.join(LOG_DIR, MODEL_FILE)
                saver.restore(self.sess_rl, checkpoint_path)
                print('rl_model: Restored model: %s' % MODEL_FILE)

    # builds the actor network (also used for the slow target actor with the same structure)
    def generate_actor_network(self, s, trainable, reuse):
        hidden = tf.layers.dense(s, h1_actor, activation=tf.nn.relu, trainable=trainable, name='dense', reuse=reuse)
        hidden_drop = tf.layers.dropout(hidden, rate=dropout_actor, training=trainable & self.is_training_ph)
        hidden_2 = tf.layers.dense(hidden_drop, h2_actor, activation=tf.nn.relu, trainable=trainable, name='dense_1',
                                   reuse=reuse)
        hidden_drop_2 = tf.layers.dropout(hidden_2, rate=dropout_actor, training=trainable & self.is_training_ph)
        hidden_3 = tf.layers.dense(hidden_drop_2, h3_actor, activation=tf.nn.relu, trainable=trainable, name='dense_2',
                                   reuse=reuse)
        hidden_drop_3 = tf.layers.dropout(hidden_3, rate=dropout_actor, training=trainable & self.is_training_ph)
        if self.recurrent:
            # Recurrent layer for temporal dependencies; stateful=True carries the LSTM state
            # across session runs, so it must be reset between episodes (see reset_lstm)
            lstm_layer = tf.keras.layers.LSTM(lstm_actor, stateful=True, return_sequences=True)
            if trainable:
                # keep a handle only to the inference actor's LSTM (this method is also called
                # for the slow target actor, which would otherwise overwrite the handle), so
                # reset_lstm() resets the state that inference actually uses
                self.lstm_layer = lstm_layer
            rnn_out = lstm_layer(tf.expand_dims(hidden_drop_3, axis=0))
            rnn_out = tf.reshape(rnn_out, [-1, lstm_actor])
            actions_unscaled = tf.layers.dense(rnn_out, N_A, trainable=trainable, name='dense_3', reuse=reuse)
        else:
            actions_unscaled = tf.layers.dense(hidden_drop_3, N_A, trainable=trainable, name='dense_3', reuse=reuse)
        actions = A_BOUND[0] + tf.nn.sigmoid(actions_unscaled) * (
                A_BOUND[1] - A_BOUND[0])  # bound the actions to the valid range
        return actions

    # builds both the critic network and its slowly-changing target network with the same structure
    def generate_critic_network(self, s, a, trainable, reuse):
        state_action = tf.concat([s, a], axis=1)
        hidden = tf.layers.dense(state_action, h1_critic, activation=tf.nn.relu, trainable=trainable, name='dense',
                                 reuse=reuse)
        hidden_drop = tf.layers.dropout(hidden, rate=dropout_critic, training=trainable & self.is_training_ph)
        hidden_2 = tf.layers.dense(hidden_drop, h2_critic, activation=tf.nn.relu, trainable=trainable, name='dense_1',
                                   reuse=reuse)
        hidden_drop_2 = tf.layers.dropout(hidden_2, rate=dropout_critic, training=trainable & self.is_training_ph)
        hidden_3 = tf.layers.dense(hidden_drop_2, h3_critic, activation=tf.nn.relu, trainable=trainable, name='dense_2',
                                   reuse=reuse)
        hidden_drop_3 = tf.layers.dropout(hidden_3, rate=dropout_critic, training=trainable & self.is_training_ph)
        q_values = tf.layers.dense(hidden_drop_3, 1, trainable=trainable, name='dense_3', reuse=reuse)
        return q_values

    def reset_lstm(self):
        # clear the stateful LSTM's carried-over state; call between episodes
        self.lstm_layer.reset_states()

    def inference(self, s):
        s = np.reshape(s, (1, N_S))  # reshape the state vector into a batch of one
        return self.sess_rl.run(self.actions, {self.state_ph: s,
                                               self.is_training_ph: False})[0]
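

# Illustrative usage (an assumption about the calling convention, not part of the original
# module): an external module constructs the network once, calls inference() with a
# 4-element state vector at each step, and resets the LSTM state between episodes.
if __name__ == '__main__':
    net = DdpgNetwork()  # restores the trained weights from LOG_DIR/MODEL_FILE
    dummy_state = np.zeros(N_S, dtype=np.float32)  # placeholder state, for illustration only
    action = net.inference(dummy_state)  # single action bounded to [A_BOUND[0], A_BOUND[1]]
    print('action:', action)
    net.reset_lstm()  # clear the recurrent state before starting a new episode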