MultiHopRideSharing/dqn_v3.py
# coding:utf-8
import sys
import os

os.environ['KERAS_BACKEND'] = 'tensorflow'  # must be set before keras is imported

import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Reshape, Activation, Convolution2D, \
    AveragePooling2D, MaxPooling2D, Cropping2D, Lambda, Multiply, concatenate
# from keras import backend as K
# NOTE: the old `merge` layer (used only by the commented-out network versions
# below) was removed in Keras 2 and is therefore no longer imported.

DATA_PATH = '/home/wenqi/Ashutosh'
ENV_NAME = 'duel'

# Normalization parameters
NUM_AGGREGATION = 5
LATITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
LONGITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
LATITUDE_MIN = 40.6003
LATITUDE_MAX = 40.9003
LONGITUDE_MIN = -74.0407
LONGITUDE_MAX = -73.7501
X_SCALE = 100.0
W_SCALE = 100.0
MAP_WIDTH = int((LONGITUDE_MAX - LONGITUDE_MIN) / LONGITUDE_DELTA) + 1
MAP_HEIGHT = int((LATITUDE_MAX - LATITUDE_MIN) / LATITUDE_DELTA) + 1
MAIN_LENGTH = 51
MAIN_DEPTH = 5
AUX_LENGTH = 15  # 23
AUX_DEPTH = 11
MAX_MOVE = 7
OUTPUT_LENGTH = 15
STAY_ACTION = OUTPUT_LENGTH * OUTPUT_LENGTH // 2  # integer index of the no-move action
GAMMA = 0.9
EXPLORATION_STEPS = 500  # Number of steps over which the initial value of epsilon is linearly annealed to its final value
INITIAL_EPSILON = 0.10  # Initial value of epsilon in epsilon-greedy
FINAL_EPSILON = 0.05  # Final value of epsilon in epsilon-greedy
INITIAL_BETA = 0.10  # Initial value of beta in epsilon-greedy
FINAL_BETA = 0.0  # Final value of beta in epsilon-greedy
INITIAL_REPLAY_SIZE = 0  # Number of steps to populate the replay memory before training starts
NUM_REPLAY_MEMORY = 10000  # Number of transitions the replay memory holds for training
SAVE_INTERVAL = 1000  # The frequency with which the network is saved
BATCH_SIZE = 64  # Mini-batch size
NUM_BATCH = 2  # Number of batches per training step
SAMPLE_PER_FRAME = 2
TARGET_UPDATE_INTERVAL = 150  # The frequency with which the target network is updated
SUMMARY_INTERVAL = 60
LEARNING_RATE = 0.00025  # Learning rate used by RMSProp
MOMENTUM = 0.95  # Momentum used by RMSProp
MIN_GRAD = 0.01  # Constant added to the squared gradient in the denominator of the RMSProp update
SAVE_NETWORK_PATH = DATA_PATH + '/saved_networks'
SAVE_SUMMARY_PATH = DATA_PATH + '/summary'
DEMAND_MODEL_PATH = '/home/wenqi/Ashutosh/model.h5'
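
# Hedged sketch (not in the original file): the agent enumerates moves as a
# row-major (2*MAX_MOVE+1) x (2*MAX_MOVE+1) grid, so with MAX_MOVE = 7 there
# are 225 actions and the integer-division STAY_ACTION above lands exactly on
# the centre cell (0, 0). The helper below only re-derives that invariant.
def _example_action_indexing():
    action_space = [(x, y) for x in range(-MAX_MOVE, MAX_MOVE + 1)
                    for y in range(-MAX_MOVE, MAX_MOVE + 1)]
    assert len(action_space) == OUTPUT_LENGTH * OUTPUT_LENGTH == 225
    assert action_space[STAY_ACTION] == (0, 0)  # index 112 is "stay"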


# Helper function
def pad_crop(F, x, y, size):
    """Zero-pad F and return the size x size window centred on F[x, y]."""
    pad_F = np.pad(F, (size - 1) // 2, 'constant')
    return pad_F[x:x + size, y:y + size]
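

# Hedged sketch (not in the original file): a minimal check of pad_crop's
# contract. The window is centred on F[x, y]; cells that fall outside F are
# zero because of the constant padding.
def _example_pad_crop():
    F = np.arange(25, dtype=np.float32).reshape(5, 5)
    inner = pad_crop(F, 2, 2, 3)
    assert inner[1, 1] == F[2, 2]   # window is centred on (x, y)
    corner = pad_crop(F, 0, 0, 3)
    assert corner[0, 0] == 0.0      # out-of-range cells are zero-padded
    assert corner[1, 1] == F[0, 0]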


def build_d_network():
    inputs = Input(shape=(6, 212, 219), dtype='float32')
    x = Convolution2D(8, (5, 5), activation='relu', padding='same', data_format='channels_first')(inputs)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same', data_format='channels_first')(x)
    outputs = Convolution2D(1, (1, 1), activation='relu', padding='same', data_format='channels_first')(x)
    model = Model(inputs=inputs, outputs=outputs)
    return model
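

# Hedged sketch (not in the original file): because every convolution above
# uses 'same' padding, the demand network maps a (batch, 6, 212, 219) stack of
# demand and time-of-day planes to a single (batch, 1, 212, 219) predicted
# demand plane. The shape check below follows from the layer definitions, not
# from the original repository.
def _example_demand_net_shapes():
    model = build_d_network()
    dummy = np.zeros((1, 6, 212, 219), dtype=np.float32)
    pred = model.predict(dummy)
    assert pred.shape == (1, 1, 212, 219)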


# Version 3.3
def build_q_network():
    main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32')
    aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32')
    c = OUTPUT_LENGTH // 2
    sliced_input = Lambda(lambda x: x[:, :-1, :, :])(main_input)
    ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format='channels_first')(sliced_input)
    ave1 = Cropping2D(cropping=((c, c), (c, c)), data_format='channels_first')(ave)
    ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format='channels_first')(ave)
    gra_test = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)), data_format='channels_first')(sliced_input)
    merge1_test_3 = concatenate([gra_test, ave1, ave2], axis=1)  # 12 x 23 x 23
    x = Convolution2D(16, (5, 5), activation='relu', name='main/conv_1', data_format='channels_first')(merge1_test_3)
    x = Convolution2D(32, (3, 3), activation='relu', name='main/conv_2', data_format='channels_first')(x)
    main_output = Convolution2D(64, (3, 3), activation='relu', name='main/conv_3', data_format='channels_first')(x)
    aux_output = Convolution2D(16, (1, 1), activation='relu', name='aux/conv', data_format='channels_first')(aux_input)
    merge2_test = concatenate([main_output, aux_output], axis=1)
    x = Convolution2D(128, (1, 1), activation='relu', name='merge/conv', data_format='channels_first')(merge2_test)
    x = Convolution2D(1, (1, 1), name='main/q_value', data_format='channels_first')(x)
    z = Flatten()(x)
    # The last aux channel is the legality map: Q-values of illegal moves are zeroed out.
    legal = Flatten()(Lambda(lambda x: x[:, -1:, :, :])(aux_input))
    q_values_test1 = Multiply()([z, legal])
    model = Model(inputs=[main_input, aux_input], outputs=q_values_test1)
    return main_input, aux_input, q_values_test1, model
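

# Hedged sketch (not in the original file): tracing the valid convolutions,
# the 4 sliced main channels become 23x23 after the pooling/cropping pyramid
# (4 + 4 + 4 = 12 concatenated channels), shrink to 15x15 through the 5x5 and
# two 3x3 convolutions, and end as one 15x15 Q-value map, i.e. 225 flattened
# Q-values, one per (dx, dy) move. The check below only re-derives the shapes.
def _example_q_net_shapes():
    _, _, _, model = build_q_network()
    main = np.zeros((1, MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype=np.float32)
    aux = np.zeros((1, AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype=np.float32)
    q = model.predict([main, aux])
    assert q.shape == (1, OUTPUT_LENGTH * OUTPUT_LENGTH)  # 225 actions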


# Version 3.4
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1)
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1)
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x)
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x)
# aux_output = Convolution2D(16, 1, 1, activation='relu', name='aux/conv')(aux_input)
# merge2 = merge([main_output, aux_output], mode='concat', concat_axis=1)
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv')(merge2)
#
# v = Convolution2D(1, 1, 1, activation='relu', name='value/conv')(x)
# v = MaxPooling2D(pool_size=(3, 3))(v)
# v = Flatten()(v)
# v = Dense(32, activation='relu', name='value/dense_1')(v)
# v = Dense(1, name='value/dense_2')(v)
# value = Lambda(lambda s: K.expand_dims(s[:, 0], dim=-1),
#                output_shape=(OUTPUT_LENGTH * OUTPUT_LENGTH,), name='value/lambda')(v)
#
# z = Convolution2D(1, 1, 1, name='advantage/conv')(x)
# z = Flatten()(z)
# advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), name='advantage/lambda')(z)

# Version 3.1
# def build_q_network():
#     main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32')
#     aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32')
#
#     c = OUTPUT_LENGTH / 2
#     ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1))(main_input)
#     ave1 = Cropping2D(cropping=((c, c), (c, c)))(ave)
#     ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1))(ave)
#     gra = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)))(main_input)
#
#     merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1)
#     x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1)
#     x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x)
#     main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x)
#     merge2 = merge([main_output, aux_input], mode='concat', concat_axis=1)
#     x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_1')(merge2)
#     x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_2')(x)
#
#     v = Convolution2D(1, 1, 1, activation='relu', name='value/conv')(x)
#     v = MaxPooling2D(pool_size=(3, 3))(v)
#     v = Flatten()(v)
#     v = Dense(32, activation='relu', name='value/dense_1')(v)
#     v = Dense(1, name='value/dense_2')(v)
#     value = Lambda(lambda s: K.expand_dims(s[:, 0], dim=-1),
#                    output_shape=(OUTPUT_LENGTH * OUTPUT_LENGTH,), name='value/lambda')(v)
#
#     z = Convolution2D(1, 1, 1, name='advantage/conv')(x)
#     z = Flatten()(z)
#     advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), name='advantage/lambda')(z)
#
#     q_values = merge([value, advantage], mode='sum')
#     legal = Flatten()(Lambda(lambda a: a[:, -1:, :, :])(aux_input))
#     q_values_legal = merge([q_values, legal], mode='mul')
#
#     model = Model(input=[main_input, aux_input], output=q_values_legal)
#
#     return main_input, aux_input, q_values_legal, model

# Version 3.2
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1)
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1)
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x)
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x)
# merge2 = merge([main_output, aux_input], mode='concat', concat_axis=1)
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_1')(merge2)
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_2')(x)
# z = Convolution2D(1, 1, 1, name='main/q_value')(x)
# z = Flatten()(z)
# legal = Flatten()(Lambda(lambda a: a[:, -1:, :, :])(aux_input))
# q_values_legal = merge([z, legal], mode='mul')


class Agent(object):
    def __init__(self, geohash_table, time_step, cycle, demand_cycle, training=True, load_network=False):
        self.geo_table = geohash_table
        self.time_step = time_step
        self.cycle = cycle
        self.training = training
        self.demand_cycle = demand_cycle

        self.x_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        self.y_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        self.d_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        for i in range(AUX_LENGTH):
            self.x_matrix[i, :] = i - AUX_LENGTH / 2
            self.y_matrix[:, i] = i - AUX_LENGTH / 2
            for j in range(AUX_LENGTH):
                self.d_matrix[i, j] = np.sqrt((i - AUX_LENGTH / 2) ** 2 + (j - AUX_LENGTH / 2) ** 2) / OUTPUT_LENGTH

        self.geo_table['x'] = np.uint8((self.geo_table.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
        self.geo_table['y'] = np.uint8((self.geo_table.lat - LATITUDE_MIN) / LATITUDE_DELTA)
        self.xy2g = [[list(self.geo_table[(self.geo_table.x == x) & (self.geo_table.y == y)].index)
                      for y in range(MAP_HEIGHT)] for x in range(MAP_WIDTH)]
        self.legal_map = np.zeros((MAP_WIDTH, MAP_HEIGHT))
        for x in range(MAP_WIDTH):
            for y in range(MAP_HEIGHT):
                if self.xy2g[x][y]:
                    self.legal_map[x, y] = 1

        index = pd.MultiIndex.from_tuples([(x, y) for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)], names=['x', 'y'])
        self.df = pd.DataFrame(index=index, columns=['X', 'X1', 'X2', 'X_idle', 'W'])
        self.action_space = [(x, y) for x in range(-MAX_MOVE, MAX_MOVE + 1) for y in range(-MAX_MOVE, MAX_MOVE + 1)]
        self.num_actions = len(self.action_space)

        # Create q network
        self.s, self.x, self.q_values, q_network = build_q_network()
        q_network_weights = q_network.trainable_weights
        self.num_iters = 0
        self.sess = tf.InteractiveSession()

        if self.training:
            # for var in q_network_weights:
            #     tf.histogram_summary(var.name, var)

            # Create target network
            self.st, self.xt, self.target_q_values, target_network = build_q_network()
            target_network_weights = target_network.trainable_weights

            # Define target network update operation
            self.update_target_network = [target_network_weights[i].assign(q_network_weights[i])
                                          for i in range(len(target_network_weights))]

            # Define loss and gradient update operation
            self.a, self.y, self.loss, self.grad_update = self.build_training_op(q_network_weights)

            self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
            self.summary_writer = tf.summary.FileWriter(SAVE_SUMMARY_PATH, self.sess.graph)

            self.epsilon = INITIAL_EPSILON
            self.epsilon_step = (FINAL_EPSILON - INITIAL_EPSILON) / EXPLORATION_STEPS
            self.beta = INITIAL_BETA
            self.beta_step = (FINAL_BETA - INITIAL_BETA) / EXPLORATION_STEPS
            self.num_iters -= INITIAL_REPLAY_SIZE
            self.start_iter = self.num_iters

            # Parameters used for summary
            self.total_q_max = 0
            self.total_loss = 0

            # Create state buffer
            self.state_buffer = deque()

            # Create replay memory
            self.replay_memory = deque()
            self.replay_memory_weights = deque()
            self.replay_memory_keys = [
                'minofday', 'dayofweek', 'env', 'pos', 'action',
                'reward', 'next_env', 'next_pos', 'delay']

        self.saver = tf.train.Saver(q_network_weights)
        if not os.path.exists(SAVE_NETWORK_PATH):
            os.makedirs(SAVE_NETWORK_PATH)
        self.sess.run(tf.global_variables_initializer())

        # Load network
        if load_network:
            self.load_network()

        # Initialize target network
        if self.training:
            self.sess.run(self.update_target_network)
        else:
            self.demand_model = build_d_network()
            self.demand_model.load_weights(DEMAND_MODEL_PATH)

    def reset(self, requests, dayofweek, minofday):
        self.dayofweek = dayofweek
        self.minofday = minofday
        self.request_buffer = deque()
        self.geo_table['W_1'] = 0
        self.geo_table['W_2'] = 0
        minutes = (requests.second.values[-1] - requests.second.values[0]) / 60.0
        count = requests.groupby('phash')['plat'].count() * self.time_step / minutes
        for i in range(int(60 / self.time_step)):
            self.request_buffer.append(count.copy())
        self.state_buffer = deque()
        self.start_iter = self.num_iters
        self.total_q_max = 0
        self.total_loss = 0

    def init_train(self, N, init_memory, summary_duration=5):
        self.replay_memory = deque()
        self.replay_memory_weights = deque()
        self.replay_memory.extend(init_memory)
        self.replay_memory_weights.extend([len(m[3]) for m in init_memory])
        for i in range(N):
            if i % TARGET_UPDATE_INTERVAL == 0:
                self.sess.run(self.update_target_network)
            if i % summary_duration == 0:
                avg_q_max = self.total_q_max / summary_duration
                avg_loss = self.total_loss / summary_duration
                print('ITER: {:d} / Q_MAX: {:.3f} / LOSS: {:.3f}'.format(i, avg_q_max, avg_loss))
                self.total_q_max = 0
                self.total_loss = 0
            self.train_network()

    def get_actions(self, vehicles, requests):
        self.update_time()
        if not self.training:
            self.update_demand(requests)
        env_state, resource = self.preprocess(vehicles)
        # print(env_state[0].shape, env_state[1].shape, env_state[2].shape, env_state[3].shape, env_state[4].shape)
        # print(env_state[0].head(5))
        # print(env_state[1].head(5))
        # print(env_state[2].head(5))
        # print(env_state[3].head(5))
        # print(env_state[4].head(5))

        if self.training:
            self.memorize_experience(env_state, vehicles)
            if self.num_iters >= 0:
                # Update target network
                if self.num_iters % TARGET_UPDATE_INTERVAL == 0:
                    self.sess.run(self.update_target_network)

                # Train network
                # if len(self.replay_memory) == NUM_REPLAY_MEMORY:
                self.train_network()

                if self.num_iters % SUMMARY_INTERVAL == 0:
                    self.write_summary()

                # Save network
                if self.num_iters % SAVE_INTERVAL == 0:
                    save_path = self.saver.save(self.sess, SAVE_NETWORK_PATH + '/' + ENV_NAME,
                                                global_step=self.num_iters)
                    print('Successfully saved: ' + save_path)

                # Anneal epsilon linearly over time
                if self.num_iters < EXPLORATION_STEPS:
                    self.epsilon += self.epsilon_step
                    self.beta += self.beta_step

        if len(resource.index) > 0:
            if self.training:
                actions = self.e_greedy(env_state, resource)
            else:
                actions = self.run_policy(env_state, resource)
        else:
            actions = []

        self.num_iters += 1
        return actions

    def update_time(self):
        self.minofday += self.time_step
        if self.minofday >= 1440:  # 24 hours * 60 minutes
            self.minofday -= 1440
            self.dayofweek = (self.dayofweek + 1) % 7

    def update_demand(self, requests):
        if len(self.request_buffer) >= 60 / self.time_step:
            self.request_buffer.popleft()
        count = requests.groupby('phash')['plat'].count()
        self.request_buffer.append(count)

        if self.num_iters % 10 == 0:
            self.geo_table.loc[:, ['W_1', 'W_2']] = 0
            for i, W in enumerate(self.request_buffer):
                if i < 30 / self.time_step:
                    self.geo_table.loc[W.index, 'W_1'] += W.values
                else:
                    self.geo_table.loc[W.index, 'W_2'] += W.values
            df = self.geo_table
            W_1 = df.pivot(index='x_', columns='y_', values='W_1').fillna(0).values
            W_2 = df.pivot(index='x_', columns='y_', values='W_2').fillna(0).values
            min = self.minofday / 1440.0
            day = self.dayofweek / 7.0
            aux_features = [np.sin(min), np.cos(min), np.sin(day), np.cos(day)]
            demand = self.demand_model.predict(np.float32([[W_1, W_2] + [np.ones(W_1.shape) * x for x in aux_features]]))[0, 0]
            self.geo_table['W'] = demand[self.geo_table.x_.values, self.geo_table.y_.values]
        return

    def preprocess(self, vehicles):
        vehicles['x'] = np.uint8((vehicles.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
        vehicles['y'] = np.uint8((vehicles.lat - LATITUDE_MIN) / LATITUDE_DELTA)
        # print(vehicles.shape)

        R = vehicles[vehicles.available == 1]
        R_idle = R[R.idle % self.cycle == 0]
        R1 = vehicles[vehicles.eta <= self.cycle]
        R2 = vehicles[vehicles.eta <= self.cycle * 2]

        # print(self.geo_table.shape)
        self.geo_table['X'] = R.groupby('dest_geohash')['available'].count()
        # print(self.geo_table.shape)
        self.geo_table = self.geo_table.fillna(0)
        self.geo_table['ratio'] = self.geo_table.X / float(self.geo_table.X.sum() + 1) - self.geo_table.W / float(self.geo_table.W.sum() + 1)

        self.df['W'] = self.geo_table.groupby(['x', 'y'])['W'].sum()
        self.df['X'] = R.groupby(['x', 'y'])['available'].count()
        self.df['X1'] = R1.groupby(['x', 'y'])['available'].count()
        self.df['X2'] = R2.groupby(['x', 'y'])['available'].count()
        self.df['X_idle'] = R_idle.groupby(['x', 'y'])['available'].count()
        self.df = self.df.fillna(0)
        self.df['X1'] -= self.df.W / 2.0
        self.df['X2'] -= self.df.W

        df = self.df.reset_index()
        W = df.pivot(index='x', columns='y', values='W').fillna(0).values.astype(np.float32) / W_SCALE
        X = df.pivot(index='x', columns='y', values='X').fillna(0).values.astype(np.float32) / X_SCALE
        X1 = df.pivot(index='x', columns='y', values='X1').fillna(0).values.astype(np.float32) / X_SCALE
        X2 = df.pivot(index='x', columns='y', values='X2').fillna(0).values.astype(np.float32) / X_SCALE
        X_idle = df.pivot(index='x', columns='y', values='X_idle').fillna(0).values.astype(np.float32) / X_SCALE
        env_state = [W, X, X1, X2, X_idle]
        return env_state, R_idle

    def e_greedy(self, env_state, resource):
        dispatch = []
        actions = []
        xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if env_state[-1][x, y] > 0]
        if self.epsilon < 1:
            xy2index = {(x, y): i for i, (x, y) in enumerate(xy_idle)}
            aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle))
            main_features = np.float32(self.create_main_feature(env_state, xy_idle))
            aids = np.argmax(self.q_values.eval(feed_dict={
                self.s: np.float32(main_features), self.x: np.float32(aux_features)}), axis=1)

        for vid, (x, y) in resource[['x', 'y']].iterrows():
            if self.epsilon < np.random.random():
                aid = aids[xy2index[(x, y)]]
            else:
                aid = STAY_ACTION if self.beta >= np.random.random() else np.random.randint(self.num_actions)

            action = STAY_ACTION
            if aid != STAY_ACTION:
                move_x, move_y = self.action_space[aid]
                x_ = x + move_x
                y_ = y + move_y
                if 0 <= x_ < MAP_WIDTH and 0 <= y_ < MAP_HEIGHT:
                    g = self.xy2g[x_][y_]
                    if len(g) > 0:
                        # Dispatch to the geohash with the lowest supply/demand ratio
                        gmin = self.geo_table.loc[g, 'ratio'].idxmin()
                        lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']]
                        dispatch.append((vid, (lat, lon)))
                        action = aid
            actions.append(action)

        state_dict = {}
        state_dict['minofday'] = self.minofday
        state_dict['dayofweek'] = self.dayofweek
        state_dict['vid'] = resource.index
        state_dict['env'] = env_state
        state_dict['pos'] = resource[['x', 'y']].values.astype(np.uint8)
        state_dict['reward'] = resource['reward'].values.astype(np.float32)
        state_dict['action'] = np.uint8(actions)
        self.state_buffer.append(state_dict)
        return dispatch

    def run_policy(self, env_state, resource):
        dispatch = []
        W, X, X1, X2, X_idle = env_state
        xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if X_idle[x, y] > 0]
        xy2index = {(x, y): i for i, (x, y) in enumerate(xy_idle)}
        aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle))
        for vid, (x, y) in resource[['x', 'y']].iterrows():
            aux_feature = aux_features[[xy2index[(x, y)]]]
            main_feature = np.float32(self.create_main_feature(env_state, [(x, y)]))
            aid = np.argmax(self.q_values.eval(feed_dict={
                self.s: np.float32(main_feature), self.x: np.float32(aux_feature)}), axis=1)[0]
            new_x, new_y = x, y
            if aid != STAY_ACTION:
                move_x, move_y = self.action_space[aid]
                x_ = x + move_x
                y_ = y + move_y
                if 0 <= x_ < MAP_WIDTH and 0 <= y_ < MAP_HEIGHT:
                    g = self.xy2g[x_][y_]
                    if len(g) > 0:
                        gmin = self.geo_table.loc[g, 'ratio'].idxmin()
                        lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']]
                        dispatch.append((vid, (lat, lon)))
                        new_x, new_y = x_, y_
            # Update the local supply counts so later vehicles see the new distribution
            X1[x, y] -= 1.0 / X_SCALE
            X2[x, y] -= 1.0 / X_SCALE
            X_idle[x, y] -= 1.0 / X_SCALE
            X1[new_x, new_y] += 1.0 / X_SCALE
            X2[new_x, new_y] += 1.0 / X_SCALE
        return dispatch

    def create_main_feature(self, env_state, positions):
        features = [[pad_crop(s, x, y, MAIN_LENGTH) for s in env_state]
                    for x, y in positions]
        return features

    def create_aux_feature(self, minofday, dayofweek, positions):
        aux_features = []
        min = minofday / 1440.0
        day = (dayofweek + int(min)) / 7.0
        for i, (x, y) in enumerate(positions):
            aux = np.zeros((AUX_DEPTH, AUX_LENGTH, AUX_LENGTH))
            aux[0, :, :] = np.sin(min)
            aux[1, :, :] = np.cos(min)
            aux[2, :, :] = np.sin(day)
            aux[3, :, :] = np.cos(day)
            aux[4, int(AUX_LENGTH / 2), int(AUX_LENGTH / 2)] = 1.0  # one-hot mark of the vehicle's own cell
            aux[5, :, :] = float(x) / MAP_WIDTH
            aux[6, :, :] = float(y) / MAP_HEIGHT
            aux[7, :, :] = (float(x) + self.x_matrix) / MAP_WIDTH
            aux[8, :, :] = (float(y) + self.y_matrix) / MAP_HEIGHT
            aux[9, :, :] = self.d_matrix
            legal_map = pad_crop(self.legal_map, x, y, AUX_LENGTH)
            legal_map[int(AUX_LENGTH / 2) + 1, int(AUX_LENGTH / 2) + 1] = 1
            aux[10, :, :] = legal_map
            aux_features.append(aux)
        return aux_features
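
    # Hedged summary (not in the original file) of the aux channel layout
    # assembled above: channels 0-3 encode time of day and day of week as
    # sin/cos pairs, channel 4 one-hot-marks the vehicle's own cell, channels
    # 5-6 broadcast the absolute (x, y) position, channels 7-8 give the
    # absolute coordinates of every cell in the window, channel 9 is the
    # normalized distance-from-centre map, and channel 10 is the legality
    # mask that build_q_network multiplies into the Q-values.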

    def memorize_experience(self, env_state, vehicles):
        # Store transition in replay memory
        if len(self.state_buffer) == 0:
            return
        if (self.state_buffer[0]['minofday'] + self.cycle) % 1440 != self.minofday:
            return
        state_action = self.state_buffer.popleft()
        weight = len(state_action['vid'])
        if weight == 0:
            return

        vdata = vehicles.loc[state_action['vid'], ['geohash', 'reward', 'eta', 'lat', 'lon']]
        state_action['reward'] = vdata['reward'].values.astype(np.float32) - state_action['reward']
        state_action['delay'] = np.round(vdata['eta'].values / self.cycle).astype(np.uint8)
        state_action['next_pos'] = self.geo_table.loc[vdata['geohash'], ['x', 'y']].values.astype(np.uint8)
        state_action['next_env'] = env_state

        self.replay_memory.append([state_action[key] for key in self.replay_memory_keys])
        self.replay_memory_weights.append(weight)
        if len(self.replay_memory) > NUM_REPLAY_MEMORY:
            self.replay_memory.popleft()
            self.replay_memory_weights.popleft()
        return

    def train_network(self):
        main_batch = []
        aux_batch = []
        action_batch = []
        reward_batch = []
        next_main_batch = []
        next_aux_batch = []
        delay_batch = []

        # Sample a random minibatch of transitions from the replay memory,
        # weighted by the number of vehicles in each frame. Each entry is
        # ordered by self.replay_memory_keys:
        # 0 minofday, 1 dayofweek, 2 env, 3 pos, 4 action,
        # 5 reward, 6 next_env, 7 next_pos, 8 delay
        weights = np.array(self.replay_memory_weights, dtype=np.float32)
        memory_index = np.random.choice(range(len(self.replay_memory)),
                                        size=int(BATCH_SIZE * NUM_BATCH / SAMPLE_PER_FRAME),
                                        p=weights / weights.sum())
        for i in memory_index:
            data = self.replay_memory[i]
            samples = np.random.randint(self.replay_memory_weights[i], size=SAMPLE_PER_FRAME)
            aux_batch += self.create_aux_feature(data[0], data[1], data[3][samples])
            next_aux_batch += self.create_aux_feature(data[0] + self.cycle, data[1], data[7][samples])
            main_batch += self.create_main_feature(data[2], data[3][samples])
            next_main_batch += self.create_main_feature(data[6], data[7][samples])
            action_batch += data[4][samples].tolist()
            reward_batch += data[5][samples].tolist()
            delay_batch += data[8][samples].tolist()

        # Double DQN: the online network picks the next action, the target network evaluates it
        target_q_batch = self.target_q_values.eval(
            feed_dict={
                self.st: np.array(next_main_batch),
                self.xt: np.array(next_aux_batch)
            })
        a_batch = np.argmax(self.q_values.eval(
            feed_dict={
                self.s: np.array(next_main_batch),
                self.x: np.array(next_aux_batch)
            }), axis=1)
        target_q_max_batch = target_q_batch[range(BATCH_SIZE * NUM_BATCH), a_batch]
        self.total_q_max += target_q_max_batch.mean()
        y_batch = np.array(reward_batch) + GAMMA ** (1 + np.array(delay_batch)) * target_q_max_batch

        p = np.random.permutation(BATCH_SIZE * NUM_BATCH)
        main_batch = np.array(main_batch)[p]
        aux_batch = np.array(aux_batch)[p]
        action_batch = np.array(action_batch)[p]
        y_batch = y_batch[p]
        batches = [(main_batch[k:k + BATCH_SIZE], aux_batch[k:k + BATCH_SIZE],
                    action_batch[k:k + BATCH_SIZE], y_batch[k:k + BATCH_SIZE])
                   for k in range(0, BATCH_SIZE * NUM_BATCH, BATCH_SIZE)]

        total_loss = 0
        for s, x, a, y in batches:
            loss, _ = self.sess.run([self.loss, self.grad_update], feed_dict={
                self.s: s,
                self.x: x,
                self.a: a,
                self.y: y
            })
            total_loss += loss
        self.total_loss += total_loss / NUM_BATCH
        return

    def build_training_op(self, q_network_weights):
        a = tf.placeholder(tf.int64, [None])
        y = tf.placeholder(tf.float32, [None])

        # Convert action to a one-hot vector and pick the chosen action's Q-value
        a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), axis=1)

        # Clip the error: the loss is quadratic when the error is in (-1, 1), and linear outside of that region
        error = tf.abs(y - q_value)
        quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
        linear_part = error - quadratic_part
        loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

        optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=MIN_GRAD)
        grad_update = optimizer.minimize(loss, var_list=q_network_weights)
        return a, y, loss, grad_update
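
    # Hedged worked example (not in the original file) of the clipped loss
    # above, which is the Huber loss with delta = 1: for an error of 0.5 the
    # loss is 0.5 * 0.5**2 = 0.125 (purely quadratic), while for an error of
    # 3.0 it is 0.5 * 1.0**2 + 2.0 = 2.5 (the quadratic part saturates at 1
    # and the remaining 2.0 enters linearly), so large TD errors cannot blow
    # up the gradients.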

    def setup_summary(self):
        avg_max_q = tf.Variable(0.)
        tf.summary.scalar(ENV_NAME + '/Average Max Q', avg_max_q)
        avg_loss = tf.Variable(0.)
        tf.summary.scalar(ENV_NAME + '/Average Loss', avg_loss)
        summary_vars = [avg_max_q, avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def write_summary(self):
        if self.num_iters >= 0:
            duration = float(self.num_iters - self.start_iter + 1)
            avg_q_max = self.total_q_max / duration
            avg_loss = self.total_loss / duration
            stats = [avg_q_max, avg_loss]
            for i in range(len(stats)):
                self.sess.run(self.update_ops[i], feed_dict={
                    self.summary_placeholders[i]: float(stats[i])
                })
            summary_str = self.sess.run(self.summary_op)
            self.summary_writer.add_summary(summary_str, self.num_iters)

            # Debug
            print('ITER: {0:6d} / EPSILON: {1:.4f} / BETA: {2:.4f} / Q_MAX: {3:.3f} / LOSS: {4:.3f}'.format(
                self.num_iters, self.epsilon, self.beta, avg_q_max, avg_loss))
            sys.stdout.flush()

        self.start_iter = self.num_iters
        self.total_q_max = 0
        self.total_loss = 0

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(SAVE_NETWORK_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print('Successfully loaded: ' + checkpoint.model_checkpoint_path)
        else:
            print('Training new network...')

    def update_future_demand(self, requests):
        self.geo_table['W'] = 0
        W = requests.groupby('phash')['plat'].count()
        self.geo_table.loc[W.index, 'W'] += W.values
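

# Hedged sketch (not in the original file): a plain-numpy rendering of the
# delayed Double DQN target computed in train_network above,
#     y = r + GAMMA ** (1 + delay) * Q_target(s', argmax_a Q_online(s', a)),
# where `delay` is the number of cycles the vehicle's ETA spans. The arrays
# here are made-up stand-ins for the two networks' outputs.
def _example_double_dqn_target():
    rewards = np.array([1.0, 0.0], dtype=np.float32)
    delays = np.array([0, 2], dtype=np.uint8)
    online_q = np.array([[0.2, 0.9], [0.7, 0.1]], dtype=np.float32)   # picks the action
    target_q = np.array([[0.5, 1.0], [2.0, 0.3]], dtype=np.float32)   # evaluates it
    a = np.argmax(online_q, axis=1)                                   # -> [1, 0]
    q_max = target_q[range(len(a)), a]                                # -> [1.0, 2.0]
    y = rewards + GAMMA ** (1 + delays) * q_max
    # y[0] = 1.0 + 0.9**1 * 1.0 = 1.9 ; y[1] = 0.0 + 0.9**3 * 2.0 = 1.458
    assert np.allclose(y, [1.9, 1.458])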