Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
MultiHopRideSharing/dqn_v3.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
687 lines (587 sloc)
30.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding:utf-8 | |
import sys | |
import os | |
import numpy as np | |
import pandas as pd | |
import tensorflow as tf | |
from collections import deque | |
from keras.models import Model | |
from keras.layers import Input, Flatten, Dense, merge, Reshape, Activation, Convolution2D, \ | |
AveragePooling2D, MaxPooling2D, Cropping2D, Lambda, Multiply, concatenate | |
# from keras import backend as K | |
KERAS_BACKEND = 'tensorflow'
DATA_PATH = '/home/wenqi/Ashutosh'
ENV_NAME = 'duel'

# --- Grid / normalization parameters ---
NUM_AGGREGATION = 5
# Cell size in degrees: the base resolution (0.3 deg / 218 cells) aggregated 5x.
LATITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
LONGITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
# Bounding box (roughly NYC).
LATITUDE_MIN = 40.6003
LATITUDE_MAX = 40.9003
LONGITUDE_MIN = -74.0407
LONGITUDE_MAX = -73.7501
# Scale factors used to normalize vehicle counts (X) and demand (W) channels.
X_SCALE = 100.0
W_SCALE = 100.0
MAP_WIDTH = int((LONGITUDE_MAX - LONGITUDE_MIN) / LONGITUDE_DELTA) + 1
MAP_HEIGHT = int((LATITUDE_MAX - LATITUDE_MIN) / LATITUDE_DELTA) + 1

# --- Network input geometry ---
MAIN_LENGTH = 51   # side length of the main (wide) state crop
MAIN_DEPTH = 5     # channels of the main input (W, X, X1, X2, X_idle)
AUX_LENGTH = 15    # side length of the auxiliary crop (was 23 in an earlier version)
AUX_DEPTH = 11     # channels of the auxiliary input
MAX_MOVE = 7       # max relocation distance per axis, in cells
OUTPUT_LENGTH = 15 # action grid side: moves in [-MAX_MOVE, MAX_MOVE] per axis
# Index of the "stay in place" action, i.e. the center cell of the
# OUTPUT_LENGTH x OUTPUT_LENGTH action grid.
# BUGFIX: must be integer division — under Python 3, `/` yields 112.5, which
# can never equal an integer action id, silently breaking the stay-action logic.
STAY_ACTION = OUTPUT_LENGTH * OUTPUT_LENGTH // 2

# --- RL hyperparameters ---
GAMMA = 0.9
EXPLORATION_STEPS = 500  # Number of steps over which epsilon/beta are linearly annealed to their final values
INITIAL_EPSILON = 0.10   # Initial value of epsilon in epsilon-greedy
FINAL_EPSILON = 0.05     # Final value of epsilon in epsilon-greedy
INITIAL_BETA = 0.10      # Initial probability of choosing "stay" when exploring
FINAL_BETA = 0.0         # Final value of beta
INITIAL_REPLAY_SIZE = 0  # Number of steps to populate the replay memory before training starts
NUM_REPLAY_MEMORY = 10000  # Max number of replay-memory entries kept
SAVE_INTERVAL = 1000     # The frequency with which the network is saved
BATCH_SIZE = 64          # Mini batch size
NUM_BATCH = 2            # Number of batches per training step
SAMPLE_PER_FRAME = 2     # Transitions sampled from each chosen replay frame
TARGET_UPDATE_INTERVAL = 150  # The frequency with which the target network is updated
SUMMARY_INTERVAL = 60
LEARNING_RATE = 0.00025  # Learning rate used by RMSProp
MOMENTUM = 0.95          # Momentum used by RMSProp
MIN_GRAD = 0.01          # Constant added to the squared gradient in the denominator of the RMSProp update
SAVE_NETWORK_PATH = DATA_PATH + '/saved_networks'
SAVE_SUMMARY_PATH = DATA_PATH + '/summary'
DEMAND_MODEL_PATH = '/home/wenqi/Ashutosh/model.h5'
#Helper function | |
def pad_crop(F, x, y, size):
    """Return a `size` x `size` window of 2-D array F anchored at (x, y).

    F is first surrounded by a zero border of width (size-1)//2, so windows
    that would run past the edge of F are zero-filled instead of truncated.
    """
    margin = int((size - 1) / 2)
    padded = np.pad(F, margin, 'constant')
    window = padded[x:x + size, y:y + size]
    return window
def build_d_network():
    """Build the demand-prediction CNN.

    Takes a 6-channel 212x219 map stack (two recent demand maps plus four
    constant time-of-day/day-of-week planes, channels-first) and outputs a
    single-channel predicted-demand map of the same spatial size.
    Weights are loaded from DEMAND_MODEL_PATH by the caller.
    """
    # Renamed from `input`/`output` to avoid shadowing the builtins, and
    # migrated Keras 1 kwargs (`border_mode`, Model(input=, output=)) to the
    # Keras 2 API (`padding`, Model(inputs=, outputs=)).
    inp = Input(shape=(6, 212, 219), dtype='float32')
    x = Convolution2D(8, (5, 5), activation='relu', padding='same', data_format="channels_first")(inp)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same', data_format="channels_first")(x)
    out = Convolution2D(1, (1, 1), activation='relu', padding='same', data_format="channels_first")(x)
    model = Model(inputs=inp, outputs=out)
    return model
# Version 3.3 | |
# Version 3.3
def build_q_network():
    """Build the Q-network (version 3.3).

    Inputs (channels-first):
      main_input: (MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH) wide environment crop.
      aux_input:  (AUX_DEPTH, AUX_LENGTH, AUX_LENGTH) auxiliary features; the
                  last channel is a 0/1 legal-move mask.

    Returns (main_input, aux_input, q_values_tensor, model), where the output
    is a flat vector of OUTPUT_LENGTH*OUTPUT_LENGTH Q-values with illegal
    cells zeroed by the mask.
    """
    main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32')
    aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32')
    # Half-width of the action window; integer division (Py2's `/` was floor here).
    c = OUTPUT_LENGTH // 2
    # Drop the last main channel (X_idle) for the multi-scale pyramid.
    sliced_input = Lambda(lambda x: x[:, :-1, :, :])(main_input)
    # Multi-scale representation: raw crop + one and two levels of average pooling,
    # cropped so all three align on the same AUX_LENGTH x AUX_LENGTH center.
    ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format="channels_first")(sliced_input)
    ave1 = Cropping2D(cropping=((c, c), (c, c)), data_format="channels_first")(ave)
    ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format="channels_first")(ave)
    gra_test = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)), data_format="channels_first")(sliced_input)
    merge1_test_3 = concatenate([gra_test, ave1, ave2], axis=1)  # 12 x 23 x 23
    x = Convolution2D(16, (5, 5), activation='relu', name='main/conv_1', data_format="channels_first")(merge1_test_3)
    x = Convolution2D(32, (3, 3), activation='relu', name='main/conv_2', data_format="channels_first")(x)
    main_output = Convolution2D(64, (3, 3), activation='relu', name='main/conv_3', data_format="channels_first")(x)
    # NOTE(review): 'ayx/conv' looks like a typo for 'aux/conv', but the name is
    # kept so existing checkpoints keep restoring variables by name.
    aux_output = Convolution2D(16, (1, 1), activation='relu', name='ayx/conv', data_format="channels_first")(aux_input)
    merge2_test = concatenate([main_output, aux_output], axis=1)
    x = Convolution2D(128, (1, 1), activation='relu', name='merge/conv', data_format="channels_first")(merge2_test)
    x = Convolution2D(1, (1, 1), name='main/q_value', data_format="channels_first")(x)
    z = Flatten()(x)
    # Zero out Q-values of illegal destination cells via the mask channel.
    legal = Flatten()(Lambda(lambda x: x[:, -1:, :, :])(aux_input))
    q_values_test1 = Multiply()([z, legal])
    # Keras 2 API: `inputs`/`outputs` (Keras 1 `input`/`output` kwargs are gone).
    model = Model(inputs=[main_input, aux_input], outputs=q_values_test1)
    return main_input, aux_input, q_values_test1, model
# Version 3.4 | |
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1) | |
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1) | |
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x) | |
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x) | |
# aux_output = Convolution2D(16, 1, 1, activation='relu', name='aux/conv')(aux_input) | |
# merge2 = merge([main_output, aux_output], mode='concat', concat_axis=1) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv')(merge2) | |
# | |
# v = Convolution2D(1, 1, 1, activation='relu', name='value/conv')(x) | |
# v = MaxPooling2D(pool_size=(3, 3))(v) | |
# v = Flatten()(v) | |
# v = Dense(32, activation='relu', name='value/dense_1')(v) | |
# v = Dense(1, name='value/dense_2')(v) | |
# value = Lambda(lambda s: K.expand_dims(s[:, 0], dim=-1), | |
# output_shape=(OUTPUT_LENGTH * OUTPUT_LENGTH,), name='value/lambda')(v) | |
# | |
# z = Convolution2D(1, 1, 1, name='advantage/conv')(x) | |
# z = Flatten()(z) | |
# advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), name='advantage/lambda')(z) | |
# Vesrsion 3.1 | |
# def build_q_network(): | |
# main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32') | |
# aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32') | |
# | |
# c = OUTPUT_LENGTH / 2 | |
# ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1))(main_input) | |
# ave1 = Cropping2D(cropping=((c, c), (c, c)))(ave) | |
# ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1))(ave) | |
# gra = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)))(main_input) | |
# | |
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1) | |
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1) | |
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x) | |
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x) | |
# merge2 = merge([main_output, aux_input], mode='concat', concat_axis=1) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_1')(merge2) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_2')(x) | |
# | |
# v = Convolution2D(1, 1, 1, activation='relu', name='value/conv')(x) | |
# v = MaxPooling2D(pool_size=(3, 3))(v) | |
# v = Flatten()(v) | |
# v = Dense(32, activation='relu', name='value/dense_1')(v) | |
# v = Dense(1, name='value/dense_2')(v) | |
# value = Lambda(lambda s: K.expand_dims(s[:, 0], dim=-1), | |
# output_shape=(OUTPUT_LENGTH*OUTPUT_LENGTH,), name='value/lambda')(v) | |
# | |
# z = Convolution2D(1, 1, 1, name='advantage/conv')(x) | |
# z = Flatten()(z) | |
# advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), name='advantage/lambda')(z) | |
# | |
# q_values = merge([value, advantage], mode='sum') | |
# legal = Flatten()(Lambda(lambda a: a[:, -1:, :, :])(aux_input)) | |
# q_values_legal = merge([q_values, legal], mode='mul') | |
# | |
# model = Model(input=[main_input, aux_input], output=q_values_legal) | |
# | |
# return main_input, aux_input, q_values_legal, model | |
# Version 3.2 | |
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1) | |
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1) | |
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x) | |
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x) | |
# merge2 = merge([main_output, aux_input], mode='concat', concat_axis=1) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_1')(merge2) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_2')(x) | |
# z = Convolution2D(1, 1, 1, name='main/q_value')(x) | |
# z = Flatten()(z) | |
# legal = Flatten()(Lambda(lambda a: a[:, -1:, :, :])(aux_input)) | |
# q_values_legal = merge([z, legal], mode='mul') | |
class Agent(object):
    """DQN-based fleet-dispatch agent.

    Maintains a geohash lookup table, a grid representation of the city, the
    online/target Q-networks (training mode), or a demand-prediction CNN
    (inference mode).
    """

    def __init__(self, geohash_table, time_step, cycle, demand_cycle, training=True, load_network=False):
        """Set up lookup tables, the TF session and networks.

        geohash_table: DataFrame indexed by geohash with at least `lat`/`lon`
            columns (assumed — TODO confirm against caller).
        time_step: minutes per simulation step.
        cycle: dispatch cycle length in minutes.
        demand_cycle: demand update cycle (stored; not used in this file).
        training: build target network + training ops when True, otherwise
            load the demand model.
        load_network: restore weights from the latest checkpoint when True.
        """
        self.geo_table = geohash_table
        self.time_step = time_step
        self.cycle = cycle
        self.training = training
        self.demand_cycle = demand_cycle
        # Relative-offset matrices for the auxiliary features: x/y offsets and
        # normalized Euclidean distance from the (float) center of the window.
        # NOTE(review): AUX_LENGTH/2 is 7.5 under Python 3 (true division) but 7
        # under Python 2 — confirm which was intended; the rest of the file uses
        # int(AUX_LENGTH/2) for the center cell.
        self.x_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        self.y_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        self.d_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        for i in range(AUX_LENGTH):
            self.x_matrix[i, :] = i - AUX_LENGTH/2
            self.y_matrix[:, i] = i - AUX_LENGTH/2
            for j in range(AUX_LENGTH):
                self.d_matrix[i, j] = np.sqrt((i - AUX_LENGTH/2)**2 + (j - AUX_LENGTH/2)**2) / OUTPUT_LENGTH
        # Discretize each geohash onto the (x, y) grid.
        self.geo_table['x'] = np.uint8((self.geo_table.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
        self.geo_table['y'] = np.uint8((self.geo_table.lat - LATITUDE_MIN) / LATITUDE_DELTA)
        # xy2g[x][y] -> list of geohash index labels falling in that cell.
        self.xy2g = [[list(self.geo_table[(self.geo_table.x == x) & (self.geo_table.y == y)].index)
                      for y in range(MAP_HEIGHT)] for x in range(MAP_WIDTH)]
        # 0/1 map of cells that contain at least one geohash (legal destinations).
        self.legal_map = np.zeros((MAP_WIDTH, MAP_HEIGHT))
        for x in range(MAP_WIDTH):
            for y in range(MAP_HEIGHT):
                if self.xy2g[x][y]:
                    self.legal_map[x, y] = 1
        # Per-cell state frame used by preprocess(): supply/demand channels.
        index = pd.MultiIndex.from_tuples([(x, y) for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)], names=['x', 'y'])
        self.df = pd.DataFrame(index=index, columns=['X', 'X1', 'X2', 'X_idle', 'W'])
        # Relocation actions: all (dx, dy) moves within MAX_MOVE per axis.
        self.action_space = [(x, y) for x in range(-MAX_MOVE, MAX_MOVE + 1) for y in range(-MAX_MOVE, MAX_MOVE + 1)]
        self.num_actions = len(self.action_space)
        # Create q network
        self.s, self.x, self.q_values, q_network = build_q_network()
        q_network_weights = q_network.trainable_weights
        self.num_iters = 0
        self.sess = tf.InteractiveSession()
        if self.training:
            # Create target network
            self.st, self.xt, self.target_q_values, target_network = build_q_network()
            target_network_weights = target_network.trainable_weights
            # Define target network update operation
            self.update_target_network = [target_network_weights[i].assign(q_network_weights[i]) for i in
                                          range(len(target_network_weights))]
            # Define loss and gradient update operation
            self.a, self.y, self.loss, self.grad_update = self.build_training_op(q_network_weights)
            self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
            self.summary_writer = tf.summary.FileWriter(SAVE_SUMMARY_PATH, self.sess.graph)
            # Linear annealing schedules for epsilon and beta.
            self.epsilon = INITIAL_EPSILON
            self.epsilon_step = (FINAL_EPSILON - INITIAL_EPSILON) / EXPLORATION_STEPS
            self.beta = INITIAL_BETA
            self.beta_step = (FINAL_BETA - INITIAL_BETA) / EXPLORATION_STEPS
            # Negative iteration counter until the replay memory is warmed up.
            self.num_iters -= INITIAL_REPLAY_SIZE
            self.start_iter = self.num_iters
            # Parameters used for summary
            self.total_q_max = 0
            self.total_loss = 0
            # Create state buffer
            self.state_buffer = deque()
            # Create replay memory
            self.replay_memory = deque()
            self.replay_memory_weights = deque()
            self.replay_memory_keys = [
                'minofday', 'dayofweek', 'env', 'pos', 'action',
                'reward', 'next_env', 'next_pos', 'delay']
        self.saver = tf.train.Saver(q_network_weights)
        if not os.path.exists(SAVE_NETWORK_PATH):
            os.makedirs(SAVE_NETWORK_PATH)
        # NOTE(review): tf.initialize_all_variables() is the deprecated TF<1.0
        # spelling of tf.global_variables_initializer() — kept unchanged here.
        self.sess.run(tf.initialize_all_variables())
        # Load network
        if load_network:
            self.load_network()
        # Initialize target network
        if self.training:
            self.sess.run(self.update_target_network)
        else:
            # Inference mode: demand must be predicted rather than observed.
            self.demand_model = build_d_network()
            self.demand_model.load_weights(DEMAND_MODEL_PATH)
def reset(self, requests, dayofweek, minofday): | |
self.dayofweek = dayofweek | |
self.minofday = minofday | |
self.request_buffer = deque() | |
self.geo_table['W_1'] = 0 | |
self.geo_table['W_2'] = 0 | |
minutes = (requests.second.values[-1] - requests.second.values[0]) / 60.0 | |
count = requests.groupby('phash')['plat'].count() * self.time_step / minutes | |
for i in range(int(60 / self.time_step)): | |
self.request_buffer.append(count.copy()) | |
self.state_buffer = deque() | |
self.start_iter = self.num_iters | |
self.total_q_max = 0 | |
self.total_loss = 0 | |
def init_train(self, N, init_memory, summary_duration=5): | |
self.replay_memory = deque() | |
self.replay_memory_weights = deque() | |
self.replay_memory.extend(init_memory) | |
self.replay_memory_weights.extend([len(m[3]) for m in init_memory]) | |
for i in range(N): | |
if i % TARGET_UPDATE_INTERVAL == 0: | |
self.sess.run(self.update_target_network) | |
if i % summary_duration == 0: | |
avg_q_max = self.total_q_max / summary_duration | |
avg_loss = self.total_loss / summary_duration | |
print('ITER: {:d} / Q_MAX: {:.3f} / LOSS: {:.3f}'.format(i, avg_q_max, avg_loss)) | |
self.total_q_max = 0 | |
self.total_loss = 0 | |
self.train_network() | |
    def get_actions(self, vehicles, requests):
        """Advance one control step and return dispatch orders for idle vehicles.

        Advances the clock, refreshes demand (inference mode), builds the grid
        state, and — in training mode — stores experience, trains, writes
        summaries, checkpoints, and anneals exploration. Returns a list of
        (vehicle_id, (lat, lon)) dispatches (empty when no vehicle is idle).
        """
        self.update_time()
        if not self.training:
            # Demand is predicted from the request stream only at inference time;
            # during training W is assumed to be set elsewhere — TODO confirm.
            self.update_demand(requests)
        env_state, resource = self.preprocess(vehicles)
        if self.training:
            self.memorize_experience(env_state, vehicles)
            # num_iters starts negative until the replay warm-up has elapsed.
            if self.num_iters >= 0:
                # Update target network
                if self.num_iters % TARGET_UPDATE_INTERVAL == 0:
                    self.sess.run(self.update_target_network)
                # Train network
                self.train_network()
                if self.num_iters % SUMMARY_INTERVAL == 0:
                    self.write_summary()
                # Save network
                if self.num_iters % SAVE_INTERVAL == 0:
                    save_path = self.saver.save(self.sess, SAVE_NETWORK_PATH + '/' + ENV_NAME,
                                                global_step=(self.num_iters))
                    print('Successfully saved: ' + save_path)
                # Anneal epsilon linearly over time
                if self.num_iters < EXPLORATION_STEPS:
                    self.epsilon += self.epsilon_step
                    self.beta += self.beta_step
        if len(resource.index) > 0:
            if self.training:
                actions = self.e_greedy(env_state, resource)
            else:
                actions = self.run_policy(env_state, resource)
        else:
            actions = []
        self.num_iters += 1
        return actions
def update_time(self): | |
self.minofday += self.time_step | |
if self.minofday >= 1440: # 24 hour * 60 minute | |
self.minofday -= 1440 | |
self.dayofweek = (self.dayofweek + 1) % 7 | |
def update_demand(self, requests): | |
if len(self.request_buffer) >= 60 / self.time_step: | |
self.request_buffer.popleft() | |
count = requests.groupby('phash')['plat'].count() | |
self.request_buffer.append(count) | |
if self.num_iters % 10 == 0: | |
self.geo_table.loc[:, ['W_1', 'W_2']] = 0 | |
for i, W in enumerate(self.request_buffer): | |
if i < 30 / self.time_step: | |
self.geo_table.loc[W.index, 'W_1'] += W.values | |
else: | |
self.geo_table.loc[W.index, 'W_2'] += W.values | |
df = self.geo_table | |
W_1 = df.pivot(index='x_', columns='y_', values='W_1').fillna(0).values | |
W_2 = df.pivot(index='x_', columns='y_', values='W_2').fillna(0).values | |
min = self.minofday / 1440.0 | |
day = self.dayofweek / 7.0 | |
aux_features = [np.sin(min), np.cos(min), np.sin(day), np.cos(day)] | |
demand = self.demand_model.predict(np.float32([[W_1, W_2] + [np.ones(W_1.shape) * x for x in aux_features]]))[0,0] | |
self.geo_table['W'] = demand[self.geo_table.x_.values, self.geo_table.y_.values] | |
return | |
    def preprocess(self, vehicles):
        """Build the grid-shaped environment channels from the vehicle table.

        Returns (env_state, R_idle) where env_state = [W, X, X1, X2, X_idle],
        each a MAP_WIDTH x MAP_HEIGHT float32 array (scaled by W_SCALE/X_SCALE),
        and R_idle is the subset of vehicles eligible for dispatch this cycle.

        Mutates `vehicles` (adds x/y columns), self.geo_table (X, ratio) and
        self.df in place.
        """
        # Discretize vehicle positions onto the grid.
        vehicles['x'] = np.uint8((vehicles.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
        vehicles['y'] = np.uint8((vehicles.lat - LATITUDE_MIN) / LATITUDE_DELTA)
        R = vehicles[vehicles.available==1]
        # Idle vehicles are dispatched only once per cycle.
        R_idle = R[R.idle%self.cycle==0]
        # Vehicles that will become available within one / two cycles.
        R1 = vehicles[vehicles.eta <= self.cycle]
        R2 = vehicles[vehicles.eta <= self.cycle * 2]
        self.geo_table['X'] = R.groupby('dest_geohash')['available'].count()
        self.geo_table = self.geo_table.fillna(0)
        # Supply-demand imbalance per geohash (add-one smoothed); used to pick
        # the most undersupplied geohash within a destination cell.
        self.geo_table['ratio'] = self.geo_table.X / float(self.geo_table.X.sum() + 1) - self.geo_table.W / float(self.geo_table.W.sum() + 1)
        self.df['W'] = self.geo_table.groupby(['x', 'y'])['W'].sum()
        self.df['X'] = R.groupby(['x', 'y'])['available'].count()
        self.df['X1'] = R1.groupby(['x', 'y'])['available'].count()
        self.df['X2'] = R2.groupby(['x', 'y'])['available'].count()
        self.df['X_idle'] = R_idle.groupby(['x', 'y'])['available'].count()
        self.df = self.df.fillna(0)
        # Net supply after expected demand over one / two cycles.
        self.df['X1'] -= self.df.W / 2.0
        self.df['X2'] -= self.df.W
        df = self.df.reset_index()
        W = df.pivot(index='x', columns='y', values='W').fillna(0).values.astype(np.float32) / W_SCALE
        X = df.pivot(index='x', columns='y', values='X').fillna(0).values.astype(np.float32) / X_SCALE
        X1 = df.pivot(index='x', columns='y', values='X1').fillna(0).values.astype(np.float32) / X_SCALE
        X2 = df.pivot(index='x', columns='y', values='X2').fillna(0).values.astype(np.float32) / X_SCALE
        X_idle = df.pivot(index='x', columns='y', values='X_idle').fillna(0).values.astype(np.float32) / X_SCALE
        env_state = [W, X, X1, X2, X_idle]
        return env_state, R_idle
def e_greedy(self, env_state, resource): | |
dispatch = [] | |
actions = [] | |
xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if env_state[-1][x, y] > 0] | |
if self.epsilon < 1: | |
xy2index = {(x, y):i for i, (x, y) in enumerate(xy_idle)} | |
aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle)) | |
main_features = np.float32(self.create_main_feature(env_state, xy_idle)) | |
aids = np.argmax(self.q_values.eval(feed_dict={ | |
self.s: np.float32(main_features), self.x: np.float32(aux_features)}), axis=1) | |
for vid, (x, y) in resource[['x', 'y']].iterrows(): | |
if self.epsilon < np.random.random(): | |
aid = aids[xy2index[(x, y)]] | |
else: | |
aid = STAY_ACTION if self.beta >= np.random.random() else np.random.randint(self.num_actions) | |
action = STAY_ACTION | |
if aid != STAY_ACTION: | |
move_x, move_y = self.action_space[aid] | |
x_ = x + move_x | |
y_ = y + move_y | |
if x_ >= 0 and x_ < MAP_WIDTH and y_ >= 0 and y_ < MAP_HEIGHT: | |
g = self.xy2g[x_][y_] | |
if len(g) > 0: | |
gmin = self.geo_table.loc[g, 'ratio'].argmin() | |
lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']] | |
dispatch.append((vid, (lat, lon))) | |
action = aid | |
actions.append(action) | |
state_dict = {} | |
state_dict['minofday'] = self.minofday | |
state_dict['dayofweek'] = self.dayofweek | |
state_dict['vid'] = resource.index | |
state_dict['env'] = env_state | |
state_dict['pos'] = resource[['x', 'y']].values.astype(np.uint8) | |
state_dict['reward'] = resource['reward'].values.astype(np.float32) | |
state_dict['action'] = np.uint8(actions) | |
self.state_buffer.append(state_dict) | |
return dispatch | |
def run_policy(self, env_state, resource): | |
dispatch = [] | |
W, X, X1, X2, X_idle = env_state | |
xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if X_idle[x, y] > 0] | |
xy2index = {(x, y): i for i, (x, y) in enumerate(xy_idle)} | |
aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle)) | |
for vid, (x, y) in resource[['x', 'y']].iterrows(): | |
aux_feature = aux_features[[xy2index[(x, y)]]] | |
main_feature = np.float32(self.create_main_feature(env_state, [(x, y)])) | |
aid = np.argmax(self.q_values.eval(feed_dict={ | |
self.s: np.float32(main_feature), self.x: np.float32(aux_feature)}), axis=1)[0] | |
new_x, new_y = x, y | |
if aid != STAY_ACTION: | |
move_x, move_y = self.action_space[aid] | |
x_ = x + move_x | |
y_ = y + move_y | |
if x_ >= 0 and x_ < MAP_WIDTH and y_ >= 0 and y_ < MAP_HEIGHT: | |
g = self.xy2g[x_][y_] | |
if len(g) > 0: | |
gmin = self.geo_table.loc[g, 'ratio'].argmin() | |
lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']] | |
dispatch.append((vid, (lat, lon))) | |
new_x, new_y = x_, y_ | |
X1[x, y] -= 1.0 / X_SCALE | |
X2[x, y] -= 1.0 / X_SCALE | |
X_idle[x, y] -= 1.0 / X_SCALE | |
X1[new_x, new_y] += 1.0 / X_SCALE | |
X2[new_x, new_y] += 1.0 / X_SCALE | |
return dispatch | |
def create_main_feature(self, env_state, positions): | |
features = [[pad_crop(s, x, y, MAIN_LENGTH) for s in env_state] | |
for x, y in positions] | |
return features | |
def create_aux_feature(self, minofday, dayofweek, positions): | |
aux_features = [] | |
min = minofday / 1440.0 | |
day = (dayofweek + int(min)) / 7.0 | |
for i, (x, y) in enumerate(positions): | |
aux = np.zeros((AUX_DEPTH, AUX_LENGTH, AUX_LENGTH)) | |
aux[0, :, :] = np.sin(min) | |
aux[1, :, :] = np.cos(min) | |
aux[2, :, :] = np.sin(day) | |
aux[3, :, :] = np.cos(day) | |
aux[4, int(AUX_LENGTH/2), int(AUX_LENGTH/2)] = 1.0 | |
aux[5, :, :] = float(x) / MAP_WIDTH | |
aux[6, :, :] = float(y) / MAP_HEIGHT | |
aux[7, :, :] = (float(x) + self.x_matrix) / MAP_WIDTH | |
aux[8, :, :] = (float(y) + self.y_matrix) / MAP_HEIGHT | |
aux[9, :, :] = self.d_matrix | |
legal_map = pad_crop(self.legal_map, x, y, AUX_LENGTH) | |
legal_map[int(AUX_LENGTH / 2) + 1, int(AUX_LENGTH / 2) + 1] = 1 | |
aux[10, :, :] = legal_map | |
aux_features.append(aux) | |
return aux_features | |
    def memorize_experience(self, env_state, vehicles):
        """Complete the oldest buffered (state, action) pair and store it.

        The pair buffered by e_greedy one cycle ago is joined with the current
        vehicle table (reward delta, arrival delay, next position) and the
        current env_state as the next state, then appended to the replay
        memory. No-op until a buffered entry is exactly one cycle old.
        """
        # Store transition in replay memory
        if len(self.state_buffer) == 0:
            return
        # Only consume the entry recorded exactly one cycle ago (modulo midnight).
        if (self.state_buffer[0]['minofday'] + self.cycle) % 1440 != self.minofday:
            return
        state_action = self.state_buffer.popleft()
        # Sampling weight = number of vehicles in this frame.
        weight = len(state_action['vid'])
        if weight == 0:
            return
        vdata = vehicles.loc[state_action['vid'], ['geohash', 'reward', 'eta', 'lat', 'lon']]
        # Reward earned since the action was taken (cumulative reward delta).
        state_action['reward'] = vdata['reward'].values.astype(np.float32) - state_action['reward']
        # Remaining travel time, in whole cycles, used to discount the target.
        state_action['delay'] = np.round(vdata['eta'].values / self.cycle).astype(np.uint8)
        state_action['next_pos'] = self.geo_table.loc[vdata['geohash'], ['x', 'y']].values.astype(np.uint8)
        state_action['next_env'] = env_state
        self.replay_memory.append([state_action[key] for key in self.replay_memory_keys])
        self.replay_memory_weights.append(weight)
        # Evict the oldest frame once capacity is exceeded.
        if len(self.replay_memory) > NUM_REPLAY_MEMORY:
            self.replay_memory.popleft()
            self.replay_memory_weights.popleft()
        return
    def train_network(self):
        """Run one training step: sample a minibatch and apply a Double-DQN update.

        Frames are sampled proportionally to their vehicle counts; from each
        frame SAMPLE_PER_FRAME vehicle transitions are drawn, for a total of
        BATCH_SIZE * NUM_BATCH examples split into NUM_BATCH gradient steps.
        """
        main_batch = []
        aux_batch = []
        action_batch = []
        reward_batch = []
        next_main_batch = []
        next_aux_batch = []
        delay_batch = []
        # Replay record layout (see self.replay_memory_keys):
        # 0 minofday, 1 dayofweek, 2 env, 3 pos, 4 action,
        # 5 reward, 6 next_env, 7 next_pos, 8 delay
        weights = np.array(self.replay_memory_weights, dtype=np.float32)
        # Frames with more vehicles are sampled more often.
        memory_index = np.random.choice(range(len(self.replay_memory)), size=int(BATCH_SIZE*NUM_BATCH/SAMPLE_PER_FRAME), p=weights/weights.sum())
        for i in memory_index:
            data = self.replay_memory[i]
            samples = np.random.randint(self.replay_memory_weights[i], size=SAMPLE_PER_FRAME)
            aux_batch += self.create_aux_feature(data[0], data[1], data[3][samples])
            next_aux_batch += self.create_aux_feature(data[0] + self.cycle, data[1], data[7][samples])
            main_batch += self.create_main_feature(data[2], data[3][samples])
            next_main_batch += self.create_main_feature(data[6], data[7][samples])
            action_batch += data[4][samples].tolist()
            reward_batch += data[5][samples].tolist()
            delay_batch += data[8][samples].tolist()
        # Double DQN: the online network selects the next action, the target
        # network evaluates it.
        target_q_batch = self.target_q_values.eval(
            feed_dict={
                self.st: np.array(next_main_batch),
                self.xt: np.array(next_aux_batch)
            })
        a_batch = np.argmax(self.q_values.eval(
            feed_dict={
                self.s: np.array(next_main_batch),
                self.x: np.array(next_aux_batch)
            }), axis=1)
        target_q_max_batch = target_q_batch[range(BATCH_SIZE * NUM_BATCH), a_batch]
        self.total_q_max += target_q_max_batch.mean()
        # n-step style discount: the arrival delay adds extra cycles of GAMMA.
        y_batch = np.array(reward_batch) + GAMMA ** (1 + np.array(delay_batch)) * target_q_max_batch
        # Shuffle before splitting into NUM_BATCH gradient steps.
        p = np.random.permutation(BATCH_SIZE * NUM_BATCH)
        main_batch = np.array(main_batch)[p]
        aux_batch = np.array(aux_batch)[p]
        action_batch = np.array(action_batch)[p]
        y_batch = y_batch[p]
        batches = [(main_batch[k:k + BATCH_SIZE], aux_batch[k:k + BATCH_SIZE], action_batch[k:k + BATCH_SIZE], y_batch[k:k + BATCH_SIZE])
                   for k in range(0, BATCH_SIZE * NUM_BATCH, BATCH_SIZE)]
        total_loss = 0
        for s, x, a, y in batches:
            loss, _ = self.sess.run([self.loss, self.grad_update], feed_dict={
                self.s: s,
                self.x: x,
                self.a: a,
                self.y: y
            })
            total_loss += loss
        self.total_loss += total_loss / NUM_BATCH
        return
def build_training_op(self, q_network_weights): | |
a = tf.placeholder(tf.int64, [None]) | |
y = tf.placeholder(tf.float32, [None]) | |
# Convert action to one hot vector | |
a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0) | |
q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), reduction_indices=1) | |
# Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region | |
error = tf.abs(y - q_value) | |
quadratic_part = tf.clip_by_value(error, 0.0, 1.0) | |
linear_part = error - quadratic_part | |
loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part) | |
optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=MIN_GRAD) | |
grad_update = optimizer.minimize(loss, var_list=q_network_weights) | |
return a, y, loss, grad_update | |
def setup_summary(self): | |
avg_max_q = tf.Variable(0.) | |
tf.summary.scalar(ENV_NAME + '/Average Max Q', avg_max_q) | |
avg_loss = tf.Variable(0.) | |
tf.summary.scalar(ENV_NAME + '/Average Loss', avg_loss) | |
summary_vars = [avg_max_q, avg_loss] | |
summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))] | |
update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))] | |
summary_op = tf.summary.merge_all() | |
return summary_placeholders, update_ops, summary_op | |
def write_summary(self): | |
if self.num_iters >= 0: | |
duration = float(self.num_iters - self.start_iter + 1) | |
avg_q_max = self.total_q_max / duration | |
avg_loss = self.total_loss / duration | |
stats = [avg_q_max, avg_loss] | |
for i in range(len(stats)): | |
self.sess.run(self.update_ops[i], feed_dict={ | |
self.summary_placeholders[i]: float(stats[i]) | |
}) | |
summary_str = self.sess.run(self.summary_op) | |
self.summary_writer.add_summary(summary_str, self.num_iters) | |
# Debug | |
print('ITER: {0:6d} / EPSILON: {1:.4f} / BETA: {2:.4f} / Q_MAX: {3:.3f} / LOSS: {4:.3f}'.format( | |
self.num_iters, self.epsilon, self.beta, avg_q_max, avg_loss)) | |
sys.stdout.flush() | |
self.start_iter = self.num_iters | |
self.total_q_max = 0 | |
self.total_loss = 0 | |
def load_network(self): | |
checkpoint = tf.train.get_checkpoint_state(SAVE_NETWORK_PATH) | |
if checkpoint and checkpoint.model_checkpoint_path: | |
self.saver.restore(self.sess, checkpoint.model_checkpoint_path) | |
print('Successfully loaded: ' + checkpoint.model_checkpoint_path) | |
else: | |
print('Training new network...') | |
def update_future_demand(self, requests): | |
self.geo_table['W'] = 0 | |
W = requests.groupby('phash')['plat'].count() | |
self.geo_table.loc[W.index, 'W'] += W.values |