# coding:utf-8
import pdb
import sys
import os
import numpy as np
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from collections import deque
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Reshape, Activation, Convolution2D, \
    AveragePooling2D, MaxPooling2D, Cropping2D, Lambda, Multiply, concatenate
# from keras import backend as K
sys.path.append('/home/wenqi/HybridDelivery')
sys.path.append('/home/wenqi/Ashutosh')
from utils.utils import Params
#params = Params('/home/wenqi/HybridDelivery/experiments/config/MHGD.json')
#params = Params('/home/wenqi/HybridDelivery/experiments/config/DeepPool0.json')
KERAS_BACKEND = 'tensorflow'
#DATA_PATH = '/home/wenqi/Ashutosh'
ENV_NAME = 'duel'
# Normalization parameters
NUM_AGGREGATION = 5
LATITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
LONGITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
LATITUDE_MIN = 40.6003
LATITUDE_MAX = 40.9003
LONGITUDE_MIN = -74.0407
LONGITUDE_MAX = -73.7501
X_SCALE = 100.0
W_SCALE = 100.0
MAP_WIDTH = int((LONGITUDE_MAX - LONGITUDE_MIN) / LONGITUDE_DELTA) + 1
MAP_HEIGHT = int((LATITUDE_MAX - LATITUDE_MIN) / LATITUDE_DELTA) + 1
MAIN_LENGTH = 51
MAIN_DEPTH = 5
AUX_LENGTH = 15 #23
AUX_DEPTH = 11
MAX_MOVE = 7
OUTPUT_LENGTH = 15
STAY_ACTION = OUTPUT_LENGTH * OUTPUT_LENGTH // 2  # integer index of the (0, 0) "stay" move in action_space
#Helper function
def pad_crop(F, x, y, size):
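    # Zero-pad F by (size - 1) / 2 cells on every side, then return the size x size window centred on cell (x, y).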
pad_F = np.pad(F, int((size - 1) / 2), 'constant')
return pad_F[x:x + size, y:y + size]
def build_d_network():
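    # Small fully convolutional network used as the demand predictor: it maps a 6 x 212 x 219 stack of
    # feature planes to a single-channel demand map of the same spatial size.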
input = Input(shape=(6, 212, 219), dtype='float32')
    x = Convolution2D(8, (5, 5), activation='relu', padding='same', data_format="channels_first")(input)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same', data_format="channels_first")(x)
    output = Convolution2D(1, (1, 1), activation='relu', padding='same', data_format="channels_first")(x)
    model = Model(inputs=input, outputs=output)
return model
# Version 3.3
def build_q_network():
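    # Q-network with two inputs: a MAIN_DEPTH x MAIN_LENGTH x MAIN_LENGTH stack of supply/demand maps and an
    # AUX_DEPTH x AUX_LENGTH x AUX_LENGTH stack of auxiliary features. The main branch combines a centre crop
    # with two levels of average pooling, the branches are merged by 1x1 convolutions, and the flattened
    # Q-values are masked by the legal-move map carried in the last aux channel.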
main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32')
aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32')
    c = OUTPUT_LENGTH // 2
    sliced_input = Lambda(lambda x: x[:, :-1, :, :])(main_input)
    ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format="channels_first")(sliced_input)
    ave1 = Cropping2D(cropping=((c, c), (c, c)), data_format="channels_first")(ave)
    ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format="channels_first")(ave)
    gra_test = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)), data_format="channels_first")(sliced_input)
merge1_test_3 = concatenate([gra_test,ave1, ave2],axis=1) # 12 x 23 x 23
x = Convolution2D(16, (5, 5), activation='relu', name='main/conv_1',data_format="channels_first")(merge1_test_3)
x = Convolution2D(32, (3, 3), activation='relu', name='main/conv_2',data_format="channels_first")(x)
main_output = Convolution2D(64, (3, 3), activation='relu', name='main/conv_3',data_format="channels_first")(x)
aux_output = Convolution2D(16, (1, 1), activation='relu', name='ayx/conv',data_format="channels_first")(aux_input)
merge2_test = concatenate([main_output,aux_output],axis=1)
x = Convolution2D(128, (1, 1), activation='relu', name='merge/conv',data_format="channels_first")(merge2_test)
x = Convolution2D(1, (1, 1), name='main/q_value',data_format="channels_first")(x)
z = Flatten()(x)
legal = Flatten()(Lambda(lambda x: x[:, -1:, :, :])(aux_input))
q_values_test1 = Multiply()([z,legal])
    model = Model(inputs=[main_input, aux_input], outputs=q_values_test1)
return main_input, aux_input, q_values_test1, model
class Agent(object):
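    # Dispatch agent: builds the Q-network (plus target network and training ops when training=True),
    # keeps the geohash/grid lookup tables, and manages the replay memory.
    # Typical driver loop (a sketch only; the surrounding simulator is assumed):
    #   agent.reset(requests, dayofweek, minofday)
    #   actions = agent.get_actions(vehicles, requests)   # called once per time step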
def __init__(self, params, geohash_table, time_step, cycle, demand_cycle, training=True, load_network=False):
self.init_params(params)
self.geo_table = geohash_table
self.time_step = time_step
self.cycle = cycle
self.training = training
self.demand_cycle = demand_cycle
self.x_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
self.y_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
self.d_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        for i in range(AUX_LENGTH):
            self.x_matrix[i, :] = i - AUX_LENGTH // 2
            self.y_matrix[:, i] = i - AUX_LENGTH // 2
            for j in range(AUX_LENGTH):
                self.d_matrix[i, j] = np.sqrt((i - AUX_LENGTH // 2) ** 2 + (j - AUX_LENGTH // 2) ** 2) / OUTPUT_LENGTH
self.geo_table['x'] = np.uint8((self.geo_table.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
self.geo_table['y'] = np.uint8((self.geo_table.lat - LATITUDE_MIN) / LATITUDE_DELTA)
self.xy2g = [[list(self.geo_table[(self.geo_table.x == x) & (self.geo_table.y == y)].index)
for y in range(MAP_HEIGHT)] for x in range(MAP_WIDTH)]
self.legal_map = np.zeros((MAP_WIDTH, MAP_HEIGHT))
for x in range(MAP_WIDTH):
for y in range(MAP_HEIGHT):
if self.xy2g[x][y]:
self.legal_map[x, y] = 1
index = pd.MultiIndex.from_tuples([(x, y) for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)], names=['x', 'y'])
self.df = pd.DataFrame(index=index, columns=['X', 'X1', 'X2', 'X_idle', 'W'])
self.action_space = [(x, y) for x in range(-MAX_MOVE, MAX_MOVE + 1) for y in range(-MAX_MOVE, MAX_MOVE + 1)]
self.num_actions = len(self.action_space)
# Create q network
self.s, self.x, self.q_values, q_network = build_q_network()
q_network_weights = q_network.trainable_weights
self.num_iters = 0
self.sess = tf.InteractiveSession()
if self.training:
#for var in q_network_weights:
# tf.histogram_summary(var.name, var)
# Create target network
self.st, self.xt, self.target_q_values, target_network = build_q_network()
target_network_weights = target_network.trainable_weights
# Define target network update operation
self.update_target_network = [target_network_weights[i].assign(q_network_weights[i]) for i in
range(len(target_network_weights))]
# Define loss and gradient update operation
self.a, self.y, self.loss, self.grad_update = self.build_training_op(q_network_weights)
self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
self.summary_writer = tf.summary.FileWriter(logdir = self.SAVE_SUMMARY_PATH, graph = self.sess.graph, filename_suffix=("_"+params.scenario+params.id))
self.epsilon = self.INITIAL_EPSILON
self.epsilon_step = (self.FINAL_EPSILON - self.INITIAL_EPSILON) / self.EXPLORATION_STEPS
self.beta = self.INITIAL_BETA
self.beta_step = (self.FINAL_BETA - self.INITIAL_BETA) / self.ANNEALING_STEPS
self.num_iters -= self.INITIAL_REPLAY_SIZE
self.start_iter = self.num_iters
# Parameters used for summary
self.total_q_max = 0
self.total_loss = 0
# Create state buffer
self.state_buffer = deque()
# Create replay memory
self.replay_memory = deque()
self.replay_memory_weights = deque()
self.replay_memory_keys = [
'minofday', 'dayofweek', 'env', 'pos', 'action',
'reward', 'next_env', 'next_pos', 'delay']
self.saver = tf.train.Saver(q_network_weights)
if not os.path.exists(self.SAVE_NETWORK_PATH):
os.makedirs(self.SAVE_NETWORK_PATH)
        self.sess.run(tf.global_variables_initializer())
# Load network
if load_network:
self.load_network()
# Initialize target network
if self.training:
self.sess.run(self.update_target_network)
else:
self.demand_model = build_d_network()
self.demand_model.load_weights(self.DEMAND_MODEL_PATH)
def init_params(self,params):
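        # Copy the hyperparameters from the Params config object onto the agent and create the summary directory.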
self.DATA_PATH = '/home/wenqi/Ashutosh'
self.GAMMA = params.GAMMA #Checked
#ORIGINAL EXPLORATION_STEPS = 500 # Number of steps over which the initial value of epsilon is linearly annealed to its final value
self.EXPLORATION_STEPS = params.EXPLORATION_STEPS #Checked
self.INITIAL_EPSILON = params.INITIAL_EPSILON #Checked
self.FINAL_EPSILON = params.FINAL_EPSILON #Checked
self.ANNEALING_STEPS = params.ANNEALING_STEPS
self.INITIAL_BETA = params.INITIAL_BETA #Checked
self.FINAL_BETA = params.FINAL_BETA #Checked
self.INITIAL_REPLAY_SIZE = params.INITIAL_REPLAY_SIZE #Checked # Number of steps to populate the replay memory before training starts
self.NUM_REPLAY_MEMORY = params.NUM_REPLAY_MEMORY #Checked # Number of replay memory the agent uses for training
self.SAVE_INTERVAL = params.SAVE_INTERVAL #Checked # The frequency with which the network is saved
self.BATCH_SIZE = params.BATCH_SIZE #Checked # Mini batch size
self.NUM_BATCH = params.NUM_BATCH #Checked # Number of batches
self.SAMPLE_PER_FRAME = params.SAMPLE_PER_FRAME #Checked
self.TARGET_UPDATE_INTERVAL = params.TARGET_UPDATE_INTERVAL #Checked # The frequency with which the target network is updated
self.SUMMARY_INTERVAL = params.SUMMARY_INTERVAL #Checked
self.LEARNING_RATE = params.LEARNING_RATE #Checked # Learning rate used by RMSProp
self.MOMENTUM = params.MOMENTUM #Checked # Momentum used by RMSProp
self.MIN_GRAD = params.MIN_GRAD #Checked # Constant added to the squared gradient in the denominator of the RMSProp update
self.SAVE_NETWORK_PATH = self.DATA_PATH + params.SAVE_NETWORK_PATH #Checked
#SAVE_SUMMARY_PATH = DATA_PATH + params.SAVE_SUMMARY_PATH
self.SAVE_SUMMARY_PATH = params.SAVE_SUMMARY_PATH #Checked
if not os.path.exists(self.SAVE_SUMMARY_PATH):
os.makedirs(self.SAVE_SUMMARY_PATH)
self.DEMAND_MODEL_PATH = params.DEMAND_MODEL_PATH #Checked
def reset(self, requests, dayofweek, minofday):
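        # Reset the episode clock and the rolling request buffer; seed one hour's worth of per-geohash
        # request counts estimated from the requests seen so far.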
self.dayofweek = dayofweek
self.minofday = minofday
self.request_buffer = deque()
self.geo_table['W_1'] = 0
self.geo_table['W_2'] = 0
minutes = (requests.second.values[-1] - requests.second.values[0]) / 60.0
count = requests.groupby('phash')['plat'].count() * self.time_step / minutes
for i in range(int(60 / self.time_step)):
self.request_buffer.append(count.copy())
self.state_buffer = deque()
self.start_iter = self.num_iters
self.total_q_max = 0
self.total_loss = 0
def init_train(self, N, init_memory, summary_duration=5):
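        # Pre-training: fill the replay memory with init_memory and run N training iterations,
        # syncing the target network and printing running averages along the way.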
self.replay_memory = deque()
self.replay_memory_weights = deque()
self.replay_memory.extend(init_memory)
self.replay_memory_weights.extend([len(m[3]) for m in init_memory])
for i in range(N):
if i % self.TARGET_UPDATE_INTERVAL == 0:
self.sess.run(self.update_target_network)
if i % summary_duration == 0:
avg_q_max = self.total_q_max / summary_duration
avg_loss = self.total_loss / summary_duration
print('ITER: {:d} / EPSILON: {:.3f} / BETA: {:.3f} / Q_MAX: {:.3f} / LOSS: {:.3f}'.format(i, self.epsilon, self.beta, avg_q_max, avg_loss))
self.total_q_max = 0
self.total_loss = 0
self.train_network()
def get_actions(self, vehicles, requests):
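        # Main per-step entry point: advance the clock, refresh demand (inference only), build the
        # environment state, train / log / checkpoint when training, and return dispatch actions for
        # all idle vehicles via epsilon-greedy (training) or the greedy policy (inference).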
self.update_time()
if not self.training:
self.update_demand(requests)
env_state, resource = self.preprocess(vehicles)
if self.training:
self.memorize_experience(env_state, vehicles)
if self.num_iters >= 0:
# Update target network
if self.num_iters % self.TARGET_UPDATE_INTERVAL == 0:
self.sess.run(self.update_target_network)
# Train network
#if len(self.replay_memory) == NUM_REPLAY_MEMORY:
self.train_network()
if self.num_iters % self.SUMMARY_INTERVAL == 0:
self.write_summary()
# Save network
if self.num_iters % self.SAVE_INTERVAL == 0:
save_path = self.saver.save(self.sess, self.SAVE_NETWORK_PATH + '/' + ENV_NAME,
global_step=(self.num_iters))
print('Successfully saved: ' + save_path)
# Anneal epsilon linearly over time
if self.num_iters < self.EXPLORATION_STEPS:
self.epsilon += self.epsilon_step
if self.num_iters < self.ANNEALING_STEPS:
self.beta += self.beta_step
if len(resource.index) > 0:
if self.training:
actions = self.e_greedy(env_state, resource)
else:
actions = self.run_policy(env_state, resource)
else:
actions = []
self.num_iters += 1
return actions
def update_time(self):
self.minofday += self.time_step
if self.minofday >= 1440: # 24 hour * 60 minute
self.minofday -= 1440
self.dayofweek = (self.dayofweek + 1) % 7
def update_demand(self, requests):
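        # Maintain a one-hour sliding window of request counts and, every 10 iterations, run the demand
        # model on the two half-hour count maps plus time-of-day features to refresh geo_table['W'].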
if len(self.request_buffer) >= 60 / self.time_step:
self.request_buffer.popleft()
count = requests.groupby('phash')['plat'].count()
self.request_buffer.append(count)
if self.num_iters % 10 == 0:
self.geo_table.loc[:, ['W_1', 'W_2']] = 0
for i, W in enumerate(self.request_buffer):
if i < 30 / self.time_step:
self.geo_table.loc[W.index, 'W_1'] += W.values
else:
self.geo_table.loc[W.index, 'W_2'] += W.values
df = self.geo_table
W_1 = df.pivot(index='x_', columns='y_', values='W_1').fillna(0).values
W_2 = df.pivot(index='x_', columns='y_', values='W_2').fillna(0).values
min = self.minofday / 1440.0
day = self.dayofweek / 7.0
aux_features = [np.sin(min), np.cos(min), np.sin(day), np.cos(day)]
demand = self.demand_model.predict(np.float32([[W_1, W_2] + [np.ones(W_1.shape) * x for x in aux_features]]))[0,0]
self.geo_table['W'] = demand[self.geo_table.x_.values, self.geo_table.y_.values]
return
def preprocess(self, vehicles):
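        # Build the global environment state: grid maps of predicted demand (W), available vehicles (X),
        # vehicles arriving within one and two cycles net of demand (X1, X2), and idle vehicles (X_idle),
        # all scaled; returns the maps together with the idle-vehicle rows used for dispatch.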
vehicles['x'] = np.uint8((vehicles.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
vehicles['y'] = np.uint8((vehicles.lat - LATITUDE_MIN) / LATITUDE_DELTA)
#print (vehicles.shape)
R = vehicles[vehicles.available==1]
R_idle = R[R.idle%self.cycle==0]
R1 = vehicles[vehicles.eta <= self.cycle]
R2 = vehicles[vehicles.eta <= self.cycle * 2]
#print (self.geo_table.shape)
self.geo_table['X'] = R.groupby('dest_geohash')['available'].count()
#print (self.geo_table.shape)
self.geo_table = self.geo_table.fillna(0)
self.geo_table['ratio'] = self.geo_table.X / float(self.geo_table.X.sum() + 1) - self.geo_table.W / float(self.geo_table.W.sum() + 1)
self.df['W'] = self.geo_table.groupby(['x', 'y'])['W'].sum()
self.df['X'] = R.groupby(['x', 'y'])['available'].count()
self.df['X1'] = R1.groupby(['x', 'y'])['available'].count()
self.df['X2'] = R2.groupby(['x', 'y'])['available'].count()
self.df['X_idle'] = R_idle.groupby(['x', 'y'])['available'].count()
self.df = self.df.fillna(0)
self.df['X1'] -= self.df.W / 2.0
self.df['X2'] -= self.df.W
df = self.df.reset_index()
W = df.pivot(index='x', columns='y', values='W').fillna(0).values.astype(np.float32) / W_SCALE
X = df.pivot(index='x', columns='y', values='X').fillna(0).values.astype(np.float32) / X_SCALE
X1 = df.pivot(index='x', columns='y', values='X1').fillna(0).values.astype(np.float32) / X_SCALE
X2 = df.pivot(index='x', columns='y', values='X2').fillna(0).values.astype(np.float32) / X_SCALE
X_idle = df.pivot(index='x', columns='y', values='X_idle').fillna(0).values.astype(np.float32) / X_SCALE
env_state = [W, X, X1, X2, X_idle]
return env_state, R_idle
def e_greedy(self, env_state, resource):
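        # Epsilon-greedy action selection for every idle vehicle: with probability 1 - epsilon take the
        # argmax of the Q-network, otherwise stay with probability beta or move randomly; record the
        # resulting state/action pair in the state buffer for later replay.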
dispatch = []
actions = []
xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if env_state[-1][x, y] > 0]
if self.epsilon < 1:
xy2index = {(x, y):i for i, (x, y) in enumerate(xy_idle)}
aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle))
main_features = np.float32(self.create_main_feature(env_state, xy_idle))
aids = np.argmax(self.q_values.eval(feed_dict={
self.s: np.float32(main_features), self.x: np.float32(aux_features)}), axis=1)
for vid, (x, y) in resource[['x', 'y']].iterrows():
if self.epsilon < np.random.random():
aid = aids[xy2index[(x, y)]]
else:
aid = STAY_ACTION if self.beta >= np.random.random() else np.random.randint(self.num_actions)
action = STAY_ACTION
if aid != STAY_ACTION:
move_x, move_y = self.action_space[aid]
x_ = x + move_x
y_ = y + move_y
if x_ >= 0 and x_ < MAP_WIDTH and y_ >= 0 and y_ < MAP_HEIGHT:
g = self.xy2g[x_][y_]
if len(g) > 0:
###### EDITED HERE #######
gmin = self.geo_table.loc[g, 'ratio'].idxmin()
lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']]
###### EDITED HERE #######
dispatch.append((vid, (lat, lon)))
action = aid
actions.append(action)
state_dict = {}
state_dict['minofday'] = self.minofday
state_dict['dayofweek'] = self.dayofweek
state_dict['vid'] = resource.index
state_dict['env'] = env_state
state_dict['pos'] = resource[['x', 'y']].values.astype(np.uint8)
state_dict['reward'] = resource['reward'].values.astype(np.float32)
state_dict['action'] = np.uint8(actions)
self.state_buffer.append(state_dict)
return dispatch
def run_policy(self, env_state, resource):
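        # Greedy policy used at inference time: pick the argmax action for each idle vehicle one at a
        # time and update the supply maps in place so later vehicles see the earlier relocations.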
dispatch = []
W, X, X1, X2, X_idle = env_state
xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if X_idle[x, y] > 0]
xy2index = {(x, y): i for i, (x, y) in enumerate(xy_idle)}
aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle))
for vid, (x, y) in resource[['x', 'y']].iterrows():
aux_feature = aux_features[[xy2index[(x, y)]]]
main_feature = np.float32(self.create_main_feature(env_state, [(x, y)]))
aid = np.argmax(self.q_values.eval(feed_dict={
self.s: np.float32(main_feature), self.x: np.float32(aux_feature)}), axis=1)[0]
new_x, new_y = x, y
if aid != STAY_ACTION:
move_x, move_y = self.action_space[aid]
x_ = x + move_x
y_ = y + move_y
if x_ >= 0 and x_ < MAP_WIDTH and y_ >= 0 and y_ < MAP_HEIGHT:
g = self.xy2g[x_][y_]
if len(g) > 0:
                        gmin = self.geo_table.loc[g, 'ratio'].idxmin()
lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']]
dispatch.append((vid, (lat, lon)))
new_x, new_y = x_, y_
X1[x, y] -= 1.0 / X_SCALE
X2[x, y] -= 1.0 / X_SCALE
X_idle[x, y] -= 1.0 / X_SCALE
X1[new_x, new_y] += 1.0 / X_SCALE
X2[new_x, new_y] += 1.0 / X_SCALE
return dispatch
def create_main_feature(self, env_state, positions):
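        # Crop a MAIN_LENGTH x MAIN_LENGTH window centred on each vehicle position from every state map.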
features = [[pad_crop(s, x, y, MAIN_LENGTH) for s in env_state]
for x, y in positions]
return features
def create_aux_feature(self, minofday, dayofweek, positions):
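        # Build the AUX_DEPTH x AUX_LENGTH x AUX_LENGTH auxiliary planes for each position: time-of-day and
        # day-of-week encodings, a one-hot centre marker, absolute and relative normalised coordinates,
        # a distance map, and the cropped legal-move mask.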
aux_features = []
min = minofday / 1440.0
day = (dayofweek + int(min)) / 7.0
for i, (x, y) in enumerate(positions):
aux = np.zeros((AUX_DEPTH, AUX_LENGTH, AUX_LENGTH))
aux[0, :, :] = np.sin(min)
aux[1, :, :] = np.cos(min)
aux[2, :, :] = np.sin(day)
aux[3, :, :] = np.cos(day)
aux[4, int(AUX_LENGTH/2), int(AUX_LENGTH/2)] = 1.0
aux[5, :, :] = float(x) / MAP_WIDTH
aux[6, :, :] = float(y) / MAP_HEIGHT
aux[7, :, :] = (float(x) + self.x_matrix) / MAP_WIDTH
aux[8, :, :] = (float(y) + self.y_matrix) / MAP_HEIGHT
aux[9, :, :] = self.d_matrix
legal_map = pad_crop(self.legal_map, x, y, AUX_LENGTH)
legal_map[int(AUX_LENGTH / 2) + 1, int(AUX_LENGTH / 2) + 1] = 1
aux[10, :, :] = legal_map
aux_features.append(aux)
return aux_features
def memorize_experience(self, env_state, vehicles):
# Store transition in replay memory
if len(self.state_buffer) == 0:
return
if (self.state_buffer[0]['minofday'] + self.cycle) % 1440 != self.minofday:
return
state_action = self.state_buffer.popleft()
weight = len(state_action['vid'])
if weight == 0:
return
vdata = vehicles.loc[state_action['vid'], ['geohash', 'reward', 'eta', 'lat', 'lon']]
state_action['reward'] = vdata['reward'].values.astype(np.float32) - state_action['reward']
state_action['delay'] = np.round(vdata['eta'].values / self.cycle).astype(np.uint8)
state_action['next_pos'] = self.geo_table.loc[vdata['geohash'], ['x', 'y']].values.astype(np.uint8)
state_action['next_env'] = env_state
self.replay_memory.append([state_action[key] for key in self.replay_memory_keys])
self.replay_memory_weights.append(weight)
if len(self.replay_memory) > self.NUM_REPLAY_MEMORY:
self.replay_memory.popleft()
self.replay_memory_weights.popleft()
return
def train_network(self):
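        # Sample transitions from the replay memory (weighted by the number of vehicles in each frame),
        # compute Double DQN targets with the online network selecting actions and the target network
        # evaluating them, and run NUM_BATCH gradient steps of size BATCH_SIZE.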
main_batch = []
aux_batch = []
action_batch = []
reward_batch = []
next_main_batch = []
next_aux_batch = []
delay_batch = []
        # Local aliases for the minibatch hyperparameters initialized in init_params.
BATCH_SIZE = self.BATCH_SIZE
NUM_BATCH = self.NUM_BATCH
SAMPLE_PER_FRAME = self.SAMPLE_PER_FRAME
# Sample random minibatch of transition from replay memory
#0 minofday
#1 dayofweek
#2 env
#3 pos
#4 action
#5 reward
#6 next_env
#7 next_pos
#8 delay
weights = np.array(self.replay_memory_weights, dtype=np.float32)
memory_index = np.random.choice(range(len(self.replay_memory)), size=int(BATCH_SIZE*NUM_BATCH/SAMPLE_PER_FRAME), p=weights/weights.sum())
for i in memory_index:
data = self.replay_memory[i]
samples = np.random.randint(self.replay_memory_weights[i], size=SAMPLE_PER_FRAME)
aux_batch += self.create_aux_feature(data[0], data[1], data[3][samples])
next_aux_batch += self.create_aux_feature(data[0] + self.cycle, data[1], data[7][samples])
main_batch += self.create_main_feature(data[2], data[3][samples])
next_main_batch += self.create_main_feature(data[6], data[7][samples])
action_batch += data[4][samples].tolist()
reward_batch += data[5][samples].tolist()
delay_batch += data[8][samples].tolist()
# Double DQN
target_q_batch = self.target_q_values.eval(
feed_dict={
self.st: np.array(next_main_batch),
self.xt: np.array(next_aux_batch)
})
a_batch = np.argmax(self.q_values.eval(
feed_dict={
self.s: np.array(next_main_batch),
self.x: np.array(next_aux_batch)
}), axis=1)
target_q_max_batch = target_q_batch[range(BATCH_SIZE * NUM_BATCH), a_batch]
self.total_q_max += target_q_max_batch.mean()
y_batch = np.array(reward_batch) + self.GAMMA ** (1 + np.array(delay_batch)) * target_q_max_batch
p = np.random.permutation(BATCH_SIZE * NUM_BATCH)
main_batch = np.array(main_batch)[p]
aux_batch = np.array(aux_batch)[p]
action_batch = np.array(action_batch)[p]
y_batch = y_batch[p]
batches = [(main_batch[k:k + BATCH_SIZE], aux_batch[k:k + BATCH_SIZE], action_batch[k:k + BATCH_SIZE], y_batch[k:k + BATCH_SIZE])
for k in range(0, BATCH_SIZE * NUM_BATCH, BATCH_SIZE)]
total_loss = 0
for s, x, a, y in batches:
loss, _ = self.sess.run([self.loss, self.grad_update], feed_dict={
self.s: s,
self.x: x,
self.a: a,
self.y: y
})
total_loss += loss
self.total_loss += total_loss / NUM_BATCH
return
def build_training_op(self, q_network_weights):
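        # Define the TD-error loss on the Q-value of the taken action (Huber-style clipping: quadratic
        # inside [-1, 1], linear outside) and the RMSProp update over the Q-network weights.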
a = tf.placeholder(tf.int64, [None])
y = tf.placeholder(tf.float32, [None])
# Convert action to one hot vector
a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), axis=1)
# Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region
error = tf.abs(y - q_value)
quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
linear_part = error - quadratic_part
loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)
optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE, momentum=self.MOMENTUM, epsilon=self.MIN_GRAD)
grad_update = optimizer.minimize(loss, var_list=q_network_weights)
return a, y, loss, grad_update
def setup_summary(self):
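        # TensorBoard scalars for the running average max Q and average loss, together with the
        # placeholders and assign ops used to feed them from write_summary().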
avg_max_q = tf.Variable(0.)
tf.summary.scalar(ENV_NAME + '/Average Max Q', avg_max_q)
avg_loss = tf.Variable(0.)
tf.summary.scalar(ENV_NAME + '/Average Loss', avg_loss)
summary_vars = [avg_max_q, avg_loss]
summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
summary_op = tf.summary.merge_all()
return summary_placeholders, update_ops, summary_op
def write_summary(self):
if self.num_iters >= 0:
duration = float(self.num_iters - self.start_iter + 1)
avg_q_max = self.total_q_max / duration
avg_loss = self.total_loss / duration
stats = [avg_q_max, avg_loss]
for i in range(len(stats)):
self.sess.run(self.update_ops[i], feed_dict={
self.summary_placeholders[i]: float(stats[i])
})
summary_str = self.sess.run(self.summary_op)
self.summary_writer.add_summary(summary_str, self.num_iters)
# Debug
print('ITER: {0:6d} / EPSILON: {1:.4f} / BETA: {2:.4f} / Q_MAX: {3:.3f} / LOSS: {4:.3f}'.format(
self.num_iters, self.epsilon, self.beta, avg_q_max, avg_loss))
sys.stdout.flush()
self.start_iter = self.num_iters
self.total_q_max = 0
self.total_loss = 0
def load_network(self):
checkpoint = tf.train.get_checkpoint_state(self.SAVE_NETWORK_PATH)
if checkpoint and checkpoint.model_checkpoint_path:
self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
print('Successfully loaded: ' + checkpoint.model_checkpoint_path)
else:
print('Training new network...')
def update_future_demand(self, requests):
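        # Overwrite geo_table['W'] with per-geohash request counts taken directly from the given requests.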
self.geo_table['W'] = 0
W = requests.groupby('phash')['plat'].count()
self.geo_table.loc[W.index, 'W'] += W.values