Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
MultiHopRideSharing/dqn_v3.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
687 lines (587 sloc)
30.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding:utf-8 | |
import sys | |
import os | |
import numpy as np | |
import pandas as pd | |
import tensorflow as tf | |
from collections import deque | |
from keras.models import Model | |
from keras.layers import Input, Flatten, Dense, merge, Reshape, Activation, Convolution2D, \ | |
AveragePooling2D, MaxPooling2D, Cropping2D, Lambda, Multiply, concatenate | |
# from keras import backend as K | |
KERAS_BACKEND = 'tensorflow'
DATA_PATH = '/home/wenqi/Ashutosh'
ENV_NAME = 'duel'

# --- Grid / normalization parameters ---
NUM_AGGREGATION = 5
# Cell size in degrees: the base resolution (0.3 deg / 218 cells) aggregated 5x.
LATITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
LONGITUDE_DELTA = 0.3 / 218 * NUM_AGGREGATION
# Bounding box (roughly NYC).
LATITUDE_MIN = 40.6003
LATITUDE_MAX = 40.9003
LONGITUDE_MIN = -74.0407
LONGITUDE_MAX = -73.7501
# Scale factors used to normalize vehicle counts (X) and demand (W) channels.
X_SCALE = 100.0
W_SCALE = 100.0
MAP_WIDTH = int((LONGITUDE_MAX - LONGITUDE_MIN) / LONGITUDE_DELTA) + 1
MAP_HEIGHT = int((LATITUDE_MAX - LATITUDE_MIN) / LATITUDE_DELTA) + 1

# --- Network input geometry ---
MAIN_LENGTH = 51   # side length of the main (wide) state crop
MAIN_DEPTH = 5     # channels of the main input (W, X, X1, X2, X_idle)
AUX_LENGTH = 15    # side length of the auxiliary crop (was 23 in an earlier version)
AUX_DEPTH = 11     # channels of the auxiliary input
MAX_MOVE = 7       # max relocation distance per axis, in cells
OUTPUT_LENGTH = 15 # action grid side: moves in [-MAX_MOVE, MAX_MOVE] per axis
# Index of the "stay in place" action, i.e. the center cell of the
# OUTPUT_LENGTH x OUTPUT_LENGTH action grid.
# BUGFIX: must be integer division — under Python 3, `/` yields 112.5, which
# can never equal an integer action id, silently breaking the stay-action logic.
STAY_ACTION = OUTPUT_LENGTH * OUTPUT_LENGTH // 2

# --- RL hyperparameters ---
GAMMA = 0.9
EXPLORATION_STEPS = 500  # Number of steps over which epsilon/beta are linearly annealed to their final values
INITIAL_EPSILON = 0.10   # Initial value of epsilon in epsilon-greedy
FINAL_EPSILON = 0.05     # Final value of epsilon in epsilon-greedy
INITIAL_BETA = 0.10      # Initial probability of choosing "stay" when exploring
FINAL_BETA = 0.0         # Final value of beta
INITIAL_REPLAY_SIZE = 0  # Number of steps to populate the replay memory before training starts
NUM_REPLAY_MEMORY = 10000  # Max number of replay-memory entries kept
SAVE_INTERVAL = 1000     # The frequency with which the network is saved
BATCH_SIZE = 64          # Mini batch size
NUM_BATCH = 2            # Number of batches per training step
SAMPLE_PER_FRAME = 2     # Transitions sampled from each chosen replay frame
TARGET_UPDATE_INTERVAL = 150  # The frequency with which the target network is updated
SUMMARY_INTERVAL = 60
LEARNING_RATE = 0.00025  # Learning rate used by RMSProp
MOMENTUM = 0.95          # Momentum used by RMSProp
MIN_GRAD = 0.01          # Constant added to the squared gradient in the denominator of the RMSProp update
SAVE_NETWORK_PATH = DATA_PATH + '/saved_networks'
SAVE_SUMMARY_PATH = DATA_PATH + '/summary'
DEMAND_MODEL_PATH = '/home/wenqi/Ashutosh/model.h5'
#Helper function | |
def pad_crop(F, x, y, size):
    """Return a `size` x `size` window of 2-D array F anchored at (x, y).

    F is first surrounded by a zero border of width (size-1)//2, so windows
    that would run past the edge of F are zero-filled instead of truncated.
    """
    margin = int((size - 1) / 2)
    padded = np.pad(F, margin, 'constant')
    window = padded[x:x + size, y:y + size]
    return window
def build_d_network():
    """Build the demand-prediction CNN.

    Takes a 6-channel 212x219 map stack (two recent demand maps plus four
    constant time-of-day/day-of-week planes, channels-first) and outputs a
    single-channel predicted-demand map of the same spatial size.
    Weights are loaded from DEMAND_MODEL_PATH by the caller.
    """
    # Renamed from `input`/`output` to avoid shadowing the builtins, and
    # migrated Keras 1 kwargs (`border_mode`, Model(input=, output=)) to the
    # Keras 2 API (`padding`, Model(inputs=, outputs=)).
    inp = Input(shape=(6, 212, 219), dtype='float32')
    x = Convolution2D(8, (5, 5), activation='relu', padding='same', data_format="channels_first")(inp)
    x = Convolution2D(16, (3, 3), activation='relu', padding='same', data_format="channels_first")(x)
    out = Convolution2D(1, (1, 1), activation='relu', padding='same', data_format="channels_first")(x)
    model = Model(inputs=inp, outputs=out)
    return model
# Version 3.3 | |
# Version 3.3
def build_q_network():
    """Build the Q-network (version 3.3).

    Inputs (channels-first):
      main_input: (MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH) wide environment crop.
      aux_input:  (AUX_DEPTH, AUX_LENGTH, AUX_LENGTH) auxiliary features; the
                  last channel is a 0/1 legal-move mask.

    Returns (main_input, aux_input, q_values_tensor, model), where the output
    is a flat vector of OUTPUT_LENGTH*OUTPUT_LENGTH Q-values with illegal
    cells zeroed by the mask.
    """
    main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32')
    aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32')
    # Half-width of the action window; integer division (Py2's `/` was floor here).
    c = OUTPUT_LENGTH // 2
    # Drop the last main channel (X_idle) for the multi-scale pyramid.
    sliced_input = Lambda(lambda x: x[:, :-1, :, :])(main_input)
    # Multi-scale representation: raw crop + one and two levels of average pooling,
    # cropped so all three align on the same AUX_LENGTH x AUX_LENGTH center.
    ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format="channels_first")(sliced_input)
    ave1 = Cropping2D(cropping=((c, c), (c, c)), data_format="channels_first")(ave)
    ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1), data_format="channels_first")(ave)
    gra_test = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)), data_format="channels_first")(sliced_input)
    merge1_test_3 = concatenate([gra_test, ave1, ave2], axis=1)  # 12 x 23 x 23
    x = Convolution2D(16, (5, 5), activation='relu', name='main/conv_1', data_format="channels_first")(merge1_test_3)
    x = Convolution2D(32, (3, 3), activation='relu', name='main/conv_2', data_format="channels_first")(x)
    main_output = Convolution2D(64, (3, 3), activation='relu', name='main/conv_3', data_format="channels_first")(x)
    # NOTE(review): 'ayx/conv' looks like a typo for 'aux/conv', but the name is
    # kept so existing checkpoints keep restoring variables by name.
    aux_output = Convolution2D(16, (1, 1), activation='relu', name='ayx/conv', data_format="channels_first")(aux_input)
    merge2_test = concatenate([main_output, aux_output], axis=1)
    x = Convolution2D(128, (1, 1), activation='relu', name='merge/conv', data_format="channels_first")(merge2_test)
    x = Convolution2D(1, (1, 1), name='main/q_value', data_format="channels_first")(x)
    z = Flatten()(x)
    # Zero out Q-values of illegal destination cells via the mask channel.
    legal = Flatten()(Lambda(lambda x: x[:, -1:, :, :])(aux_input))
    q_values_test1 = Multiply()([z, legal])
    # Keras 2 API: `inputs`/`outputs` (Keras 1 `input`/`output` kwargs are gone).
    model = Model(inputs=[main_input, aux_input], outputs=q_values_test1)
    return main_input, aux_input, q_values_test1, model
# Version 3.4 | |
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1) | |
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1) | |
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x) | |
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x) | |
# aux_output = Convolution2D(16, 1, 1, activation='relu', name='aux/conv')(aux_input) | |
# merge2 = merge([main_output, aux_output], mode='concat', concat_axis=1) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv')(merge2) | |
# | |
# v = Convolution2D(1, 1, 1, activation='relu', name='value/conv')(x) | |
# v = MaxPooling2D(pool_size=(3, 3))(v) | |
# v = Flatten()(v) | |
# v = Dense(32, activation='relu', name='value/dense_1')(v) | |
# v = Dense(1, name='value/dense_2')(v) | |
# value = Lambda(lambda s: K.expand_dims(s[:, 0], dim=-1), | |
# output_shape=(OUTPUT_LENGTH * OUTPUT_LENGTH,), name='value/lambda')(v) | |
# | |
# z = Convolution2D(1, 1, 1, name='advantage/conv')(x) | |
# z = Flatten()(z) | |
# advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), name='advantage/lambda')(z) | |
# Vesrsion 3.1 | |
# def build_q_network(): | |
# main_input = Input(shape=(MAIN_DEPTH, MAIN_LENGTH, MAIN_LENGTH), dtype='float32') | |
# aux_input = Input(shape=(AUX_DEPTH, AUX_LENGTH, AUX_LENGTH), dtype='float32') | |
# | |
# c = OUTPUT_LENGTH / 2 | |
# ave = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1))(main_input) | |
# ave1 = Cropping2D(cropping=((c, c), (c, c)))(ave) | |
# ave2 = AveragePooling2D(pool_size=(OUTPUT_LENGTH, OUTPUT_LENGTH), strides=(1, 1))(ave) | |
# gra = Cropping2D(cropping=((c * 2, c * 2), (c * 2, c * 2)))(main_input) | |
# | |
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1) | |
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1) | |
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x) | |
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x) | |
# merge2 = merge([main_output, aux_input], mode='concat', concat_axis=1) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_1')(merge2) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_2')(x) | |
# | |
# v = Convolution2D(1, 1, 1, activation='relu', name='value/conv')(x) | |
# v = MaxPooling2D(pool_size=(3, 3))(v) | |
# v = Flatten()(v) | |
# v = Dense(32, activation='relu', name='value/dense_1')(v) | |
# v = Dense(1, name='value/dense_2')(v) | |
# value = Lambda(lambda s: K.expand_dims(s[:, 0], dim=-1), | |
# output_shape=(OUTPUT_LENGTH*OUTPUT_LENGTH,), name='value/lambda')(v) | |
# | |
# z = Convolution2D(1, 1, 1, name='advantage/conv')(x) | |
# z = Flatten()(z) | |
# advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), name='advantage/lambda')(z) | |
# | |
# q_values = merge([value, advantage], mode='sum') | |
# legal = Flatten()(Lambda(lambda a: a[:, -1:, :, :])(aux_input)) | |
# q_values_legal = merge([q_values, legal], mode='mul') | |
# | |
# model = Model(input=[main_input, aux_input], output=q_values_legal) | |
# | |
# return main_input, aux_input, q_values_legal, model | |
# Version 3.2 | |
# merge1 = merge([gra, ave1, ave2], mode='concat', concat_axis=1) | |
# x = Convolution2D(16, 5, 5, activation='relu', name='main/conv_1')(merge1) | |
# x = Convolution2D(32, 3, 3, activation='relu', name='main/conv_2')(x) | |
# main_output = Convolution2D(64, 3, 3, activation='relu', name='main/conv_3')(x) | |
# merge2 = merge([main_output, aux_input], mode='concat', concat_axis=1) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_1')(merge2) | |
# x = Convolution2D(128, 1, 1, activation='relu', name='merge/conv_2')(x) | |
# z = Convolution2D(1, 1, 1, name='main/q_value')(x) | |
# z = Flatten()(z) | |
# legal = Flatten()(Lambda(lambda a: a[:, -1:, :, :])(aux_input)) | |
# q_values_legal = merge([z, legal], mode='mul') | |
class Agent(object):
    """DQN-based fleet-dispatch agent.

    Maintains a geohash lookup table, a grid representation of the city, the
    online/target Q-networks (training mode), or a demand-prediction CNN
    (inference mode).
    """

    def __init__(self, geohash_table, time_step, cycle, demand_cycle, training=True, load_network=False):
        """Set up lookup tables, the TF session and networks.

        geohash_table: DataFrame indexed by geohash with at least `lat`/`lon`
            columns (assumed — TODO confirm against caller).
        time_step: minutes per simulation step.
        cycle: dispatch cycle length in minutes.
        demand_cycle: demand update cycle (stored; not used in this file).
        training: build target network + training ops when True, otherwise
            load the demand model.
        load_network: restore weights from the latest checkpoint when True.
        """
        self.geo_table = geohash_table
        self.time_step = time_step
        self.cycle = cycle
        self.training = training
        self.demand_cycle = demand_cycle
        # Relative-offset matrices for the auxiliary features: x/y offsets and
        # normalized Euclidean distance from the (float) center of the window.
        # NOTE(review): AUX_LENGTH/2 is 7.5 under Python 3 (true division) but 7
        # under Python 2 — confirm which was intended; the rest of the file uses
        # int(AUX_LENGTH/2) for the center cell.
        self.x_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        self.y_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        self.d_matrix = np.zeros((AUX_LENGTH, AUX_LENGTH))
        for i in range(AUX_LENGTH):
            self.x_matrix[i, :] = i - AUX_LENGTH/2
            self.y_matrix[:, i] = i - AUX_LENGTH/2
            for j in range(AUX_LENGTH):
                self.d_matrix[i, j] = np.sqrt((i - AUX_LENGTH/2)**2 + (j - AUX_LENGTH/2)**2) / OUTPUT_LENGTH
        # Discretize each geohash onto the (x, y) grid.
        self.geo_table['x'] = np.uint8((self.geo_table.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
        self.geo_table['y'] = np.uint8((self.geo_table.lat - LATITUDE_MIN) / LATITUDE_DELTA)
        # xy2g[x][y] -> list of geohash index labels falling in that cell.
        self.xy2g = [[list(self.geo_table[(self.geo_table.x == x) & (self.geo_table.y == y)].index)
                      for y in range(MAP_HEIGHT)] for x in range(MAP_WIDTH)]
        # 0/1 map of cells that contain at least one geohash (legal destinations).
        self.legal_map = np.zeros((MAP_WIDTH, MAP_HEIGHT))
        for x in range(MAP_WIDTH):
            for y in range(MAP_HEIGHT):
                if self.xy2g[x][y]:
                    self.legal_map[x, y] = 1
        # Per-cell state frame used by preprocess(): supply/demand channels.
        index = pd.MultiIndex.from_tuples([(x, y) for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)], names=['x', 'y'])
        self.df = pd.DataFrame(index=index, columns=['X', 'X1', 'X2', 'X_idle', 'W'])
        # Relocation actions: all (dx, dy) moves within MAX_MOVE per axis.
        self.action_space = [(x, y) for x in range(-MAX_MOVE, MAX_MOVE + 1) for y in range(-MAX_MOVE, MAX_MOVE + 1)]
        self.num_actions = len(self.action_space)
        # Create q network
        self.s, self.x, self.q_values, q_network = build_q_network()
        q_network_weights = q_network.trainable_weights
        self.num_iters = 0
        self.sess = tf.InteractiveSession()
        if self.training:
            # Create target network
            self.st, self.xt, self.target_q_values, target_network = build_q_network()
            target_network_weights = target_network.trainable_weights
            # Define target network update operation
            self.update_target_network = [target_network_weights[i].assign(q_network_weights[i]) for i in
                                          range(len(target_network_weights))]
            # Define loss and gradient update operation
            self.a, self.y, self.loss, self.grad_update = self.build_training_op(q_network_weights)
            self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
            self.summary_writer = tf.summary.FileWriter(SAVE_SUMMARY_PATH, self.sess.graph)
            # Linear annealing schedules for epsilon and beta.
            self.epsilon = INITIAL_EPSILON
            self.epsilon_step = (FINAL_EPSILON - INITIAL_EPSILON) / EXPLORATION_STEPS
            self.beta = INITIAL_BETA
            self.beta_step = (FINAL_BETA - INITIAL_BETA) / EXPLORATION_STEPS
            # Negative iteration counter until the replay memory is warmed up.
            self.num_iters -= INITIAL_REPLAY_SIZE
            self.start_iter = self.num_iters
            # Parameters used for summary
            self.total_q_max = 0
            self.total_loss = 0
            # Create state buffer
            self.state_buffer = deque()
            # Create replay memory
            self.replay_memory = deque()
            self.replay_memory_weights = deque()
            self.replay_memory_keys = [
                'minofday', 'dayofweek', 'env', 'pos', 'action',
                'reward', 'next_env', 'next_pos', 'delay']
        self.saver = tf.train.Saver(q_network_weights)
        if not os.path.exists(SAVE_NETWORK_PATH):
            os.makedirs(SAVE_NETWORK_PATH)
        # NOTE(review): tf.initialize_all_variables() is the deprecated TF<1.0
        # spelling of tf.global_variables_initializer() — kept unchanged here.
        self.sess.run(tf.initialize_all_variables())
        # Load network
        if load_network:
            self.load_network()
        # Initialize target network
        if self.training:
            self.sess.run(self.update_target_network)
        else:
            # Inference mode: demand must be predicted rather than observed.
            self.demand_model = build_d_network()
            self.demand_model.load_weights(DEMAND_MODEL_PATH)
def reset(self, requests, dayofweek, minofday): | |
self.dayofweek = dayofweek | |
self.minofday = minofday | |
self.request_buffer = deque() | |
self.geo_table['W_1'] = 0 | |
self.geo_table['W_2'] = 0 | |
minutes = (requests.second.values[-1] - requests.second.values[0]) / 60.0 | |
count = requests.groupby('phash')['plat'].count() * self.time_step / minutes | |
for i in range(int(60 / self.time_step)): | |
self.request_buffer.append(count.copy()) | |
self.state_buffer = deque() | |
self.start_iter = self.num_iters | |
self.total_q_max = 0 | |
self.total_loss = 0 | |
def init_train(self, N, init_memory, summary_duration=5): | |
self.replay_memory = deque() | |
self.replay_memory_weights = deque() | |
self.replay_memory.extend(init_memory) | |
self.replay_memory_weights.extend([len(m[3]) for m in init_memory]) | |
for i in range(N): | |
if i % TARGET_UPDATE_INTERVAL == 0: | |
self.sess.run(self.update_target_network) | |
if i % summary_duration == 0: | |
avg_q_max = self.total_q_max / summary_duration | |
avg_loss = self.total_loss / summary_duration | |
print('ITER: {:d} / Q_MAX: {:.3f} / LOSS: {:.3f}'.format(i, avg_q_max, avg_loss)) | |
self.total_q_max = 0 | |
self.total_loss = 0 | |
self.train_network() | |
    def get_actions(self, vehicles, requests):
        """Advance one control step and return dispatch orders for idle vehicles.

        Advances the clock, refreshes demand (inference mode), builds the grid
        state, and — in training mode — stores experience, trains, writes
        summaries, checkpoints, and anneals exploration. Returns a list of
        (vehicle_id, (lat, lon)) dispatches (empty when no vehicle is idle).
        """
        self.update_time()
        if not self.training:
            # Demand is predicted from the request stream only at inference time;
            # during training W is assumed to be set elsewhere — TODO confirm.
            self.update_demand(requests)
        env_state, resource = self.preprocess(vehicles)
        if self.training:
            self.memorize_experience(env_state, vehicles)
            # num_iters starts negative until the replay warm-up has elapsed.
            if self.num_iters >= 0:
                # Update target network
                if self.num_iters % TARGET_UPDATE_INTERVAL == 0:
                    self.sess.run(self.update_target_network)
                # Train network
                self.train_network()
                if self.num_iters % SUMMARY_INTERVAL == 0:
                    self.write_summary()
                # Save network
                if self.num_iters % SAVE_INTERVAL == 0:
                    save_path = self.saver.save(self.sess, SAVE_NETWORK_PATH + '/' + ENV_NAME,
                                                global_step=(self.num_iters))
                    print('Successfully saved: ' + save_path)
                # Anneal epsilon linearly over time
                if self.num_iters < EXPLORATION_STEPS:
                    self.epsilon += self.epsilon_step
                    self.beta += self.beta_step
        if len(resource.index) > 0:
            if self.training:
                actions = self.e_greedy(env_state, resource)
            else:
                actions = self.run_policy(env_state, resource)
        else:
            actions = []
        self.num_iters += 1
        return actions
def update_time(self): | |
self.minofday += self.time_step | |
if self.minofday >= 1440: # 24 hour * 60 minute | |
self.minofday -= 1440 | |
self.dayofweek = (self.dayofweek + 1) % 7 | |
def update_demand(self, requests): | |
if len(self.request_buffer) >= 60 / self.time_step: | |
self.request_buffer.popleft() | |
count = requests.groupby('phash')['plat'].count() | |
self.request_buffer.append(count) | |
if self.num_iters % 10 == 0: | |
self.geo_table.loc[:, ['W_1', 'W_2']] = 0 | |
for i, W in enumerate(self.request_buffer): | |
if i < 30 / self.time_step: | |
self.geo_table.loc[W.index, 'W_1'] += W.values | |
else: | |
self.geo_table.loc[W.index, 'W_2'] += W.values | |
df = self.geo_table | |
W_1 = df.pivot(index='x_', columns='y_', values='W_1').fillna(0).values | |
W_2 = df.pivot(index='x_', columns='y_', values='W_2').fillna(0).values | |
min = self.minofday / 1440.0 | |
day = self.dayofweek / 7.0 | |
aux_features = [np.sin(min), np.cos(min), np.sin(day), np.cos(day)] | |
demand = self.demand_model.predict(np.float32([[W_1, W_2] + [np.ones(W_1.shape) * x for x in aux_features]]))[0,0] | |
self.geo_table['W'] = demand[self.geo_table.x_.values, self.geo_table.y_.values] | |
return | |
    def preprocess(self, vehicles):
        """Build the grid-shaped environment channels from the vehicle table.

        Returns (env_state, R_idle) where env_state = [W, X, X1, X2, X_idle],
        each a MAP_WIDTH x MAP_HEIGHT float32 array (scaled by W_SCALE/X_SCALE),
        and R_idle is the subset of vehicles eligible for dispatch this cycle.

        Mutates `vehicles` (adds x/y columns), self.geo_table (X, ratio) and
        self.df in place.
        """
        # Discretize vehicle positions onto the grid.
        vehicles['x'] = np.uint8((vehicles.lon - LONGITUDE_MIN) / LONGITUDE_DELTA)
        vehicles['y'] = np.uint8((vehicles.lat - LATITUDE_MIN) / LATITUDE_DELTA)
        R = vehicles[vehicles.available==1]
        # Idle vehicles are dispatched only once per cycle.
        R_idle = R[R.idle%self.cycle==0]
        # Vehicles that will become available within one / two cycles.
        R1 = vehicles[vehicles.eta <= self.cycle]
        R2 = vehicles[vehicles.eta <= self.cycle * 2]
        self.geo_table['X'] = R.groupby('dest_geohash')['available'].count()
        self.geo_table = self.geo_table.fillna(0)
        # Supply-demand imbalance per geohash (add-one smoothed); used to pick
        # the most undersupplied geohash within a destination cell.
        self.geo_table['ratio'] = self.geo_table.X / float(self.geo_table.X.sum() + 1) - self.geo_table.W / float(self.geo_table.W.sum() + 1)
        self.df['W'] = self.geo_table.groupby(['x', 'y'])['W'].sum()
        self.df['X'] = R.groupby(['x', 'y'])['available'].count()
        self.df['X1'] = R1.groupby(['x', 'y'])['available'].count()
        self.df['X2'] = R2.groupby(['x', 'y'])['available'].count()
        self.df['X_idle'] = R_idle.groupby(['x', 'y'])['available'].count()
        self.df = self.df.fillna(0)
        # Net supply after expected demand over one / two cycles.
        self.df['X1'] -= self.df.W / 2.0
        self.df['X2'] -= self.df.W
        df = self.df.reset_index()
        W = df.pivot(index='x', columns='y', values='W').fillna(0).values.astype(np.float32) / W_SCALE
        X = df.pivot(index='x', columns='y', values='X').fillna(0).values.astype(np.float32) / X_SCALE
        X1 = df.pivot(index='x', columns='y', values='X1').fillna(0).values.astype(np.float32) / X_SCALE
        X2 = df.pivot(index='x', columns='y', values='X2').fillna(0).values.astype(np.float32) / X_SCALE
        X_idle = df.pivot(index='x', columns='y', values='X_idle').fillna(0).values.astype(np.float32) / X_SCALE
        env_state = [W, X, X1, X2, X_idle]
        return env_state, R_idle
def e_greedy(self, env_state, resource): | |
dispatch = [] | |
actions = [] | |
xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if env_state[-1][x, y] > 0] | |
if self.epsilon < 1: | |
xy2index = {(x, y):i for i, (x, y) in enumerate(xy_idle)} | |
aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle)) | |
main_features = np.float32(self.create_main_feature(env_state, xy_idle)) | |
aids = np.argmax(self.q_values.eval(feed_dict={ | |
self.s: np.float32(main_features), self.x: np.float32(aux_features)}), axis=1) | |
for vid, (x, y) in resource[['x', 'y']].iterrows(): | |
if self.epsilon < np.random.random(): | |
aid = aids[xy2index[(x, y)]] | |
else: | |
aid = STAY_ACTION if self.beta >= np.random.random() else np.random.randint(self.num_actions) | |
action = STAY_ACTION | |
if aid != STAY_ACTION: | |
move_x, move_y = self.action_space[aid] | |
x_ = x + move_x | |
y_ = y + move_y | |
if x_ >= 0 and x_ < MAP_WIDTH and y_ >= 0 and y_ < MAP_HEIGHT: | |
g = self.xy2g[x_][y_] | |
if len(g) > 0: | |
gmin = self.geo_table.loc[g, 'ratio'].argmin() | |
lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']] | |
dispatch.append((vid, (lat, lon))) | |
action = aid | |
actions.append(action) | |
state_dict = {} | |
state_dict['minofday'] = self.minofday | |
state_dict['dayofweek'] = self.dayofweek | |
state_dict['vid'] = resource.index | |
state_dict['env'] = env_state | |
state_dict['pos'] = resource[['x', 'y']].values.astype(np.uint8) | |
state_dict['reward'] = resource['reward'].values.astype(np.float32) | |
state_dict['action'] = np.uint8(actions) | |
self.state_buffer.append(state_dict) | |
return dispatch | |
def run_policy(self, env_state, resource): | |
dispatch = [] | |
W, X, X1, X2, X_idle = env_state | |
xy_idle = [(x, y) for y in range(MAP_HEIGHT) for x in range(MAP_WIDTH) if X_idle[x, y] > 0] | |
xy2index = {(x, y): i for i, (x, y) in enumerate(xy_idle)} | |
aux_features = np.float32(self.create_aux_feature(self.minofday, self.dayofweek, xy_idle)) | |
for vid, (x, y) in resource[['x', 'y']].iterrows(): | |
aux_feature = aux_features[[xy2index[(x, y)]]] | |
main_feature = np.float32(self.create_main_feature(env_state, [(x, y)])) | |
aid = np.argmax(self.q_values.eval(feed_dict={ | |
self.s: np.float32(main_feature), self.x: np.float32(aux_feature)}), axis=1)[0] | |
new_x, new_y = x, y | |
if aid != STAY_ACTION: | |
move_x, move_y = self.action_space[aid] | |
x_ = x + move_x | |
y_ = y + move_y | |
if x_ >= 0 and x_ < MAP_WIDTH and y_ >= 0 and y_ < MAP_HEIGHT: | |
g = self.xy2g[x_][y_] | |
if len(g) > 0: | |
gmin = self.geo_table.loc[g, 'ratio'].argmin() | |
lat, lon = self.geo_table.loc[gmin, ['lat', 'lon']] | |
dispatch.append((vid, (lat, lon))) | |
new_x, new_y = x_, y_ | |
X1[x, y] -= 1.0 / X_SCALE | |
X2[x, y] -= 1.0 / X_SCALE | |
X_idle[x, y] -= 1.0 / X_SCALE | |
X1[new_x, new_y] += 1.0 / X_SCALE | |
X2[new_x, new_y] += 1.0 / X_SCALE | |
return dispatch | |
def create_main_feature(self, env_state, positions): | |
features = [[pad_crop(s, x, y, MAIN_LENGTH) for s in env_state] | |
for x, y in positions] | |
return features | |
def create_aux_feature(self, minofday, dayofweek, positions): | |
aux_features = [] | |
min = minofday / 1440.0 | |
day = (dayofweek + int(min)) / 7.0 | |
for i, (x, y) in enumerate(positions): | |
aux = np.zeros((AUX_DEPTH, AUX_LENGTH, AUX_LENGTH)) | |
aux[0, :, :] = np.sin(min) | |
aux[1, :, :] = np.cos(min) | |
aux[2, :, :] = np.sin(day) | |
aux[3, :, :] = np.cos(day) | |
aux[4, int(AUX_LENGTH/2), int(AUX_LENGTH/2)] = 1.0 | |
aux[5, :, :] = float(x) / MAP_WIDTH | |
aux[6, :, :] = float(y) / MAP_HEIGHT | |
aux[7, :, :] = (float(x) + self.x_matrix) / MAP_WIDTH | |
aux[8, :, :] = (float(y) + self.y_matrix) / MAP_HEIGHT | |
aux[9, :, :] = self.d_matrix | |
legal_map = pad_crop(self.legal_map, x, y, AUX_LENGTH) | |
legal_map[int(AUX_LENGTH / 2) + 1, int(AUX_LENGTH / 2) + 1] = 1 | |
aux[10, :, :] = legal_map | |
aux_features.append(aux) | |
return aux_features | |
    def memorize_experience(self, env_state, vehicles):
        """Complete the oldest buffered (state, action) pair and store it.

        The pair buffered by e_greedy one cycle ago is joined with the current
        vehicle table (reward delta, arrival delay, next position) and the
        current env_state as the next state, then appended to the replay
        memory. No-op until a buffered entry is exactly one cycle old.
        """
        # Store transition in replay memory
        if len(self.state_buffer) == 0:
            return
        # Only consume the entry recorded exactly one cycle ago (modulo midnight).
        if (self.state_buffer[0]['minofday'] + self.cycle) % 1440 != self.minofday:
            return
        state_action = self.state_buffer.popleft()
        # Sampling weight = number of vehicles in this frame.
        weight = len(state_action['vid'])
        if weight == 0:
            return
        vdata = vehicles.loc[state_action['vid'], ['geohash', 'reward', 'eta', 'lat', 'lon']]
        # Reward earned since the action was taken (cumulative reward delta).
        state_action['reward'] = vdata['reward'].values.astype(np.float32) - state_action['reward']
        # Remaining travel time, in whole cycles, used to discount the target.
        state_action['delay'] = np.round(vdata['eta'].values / self.cycle).astype(np.uint8)
        state_action['next_pos'] = self.geo_table.loc[vdata['geohash'], ['x', 'y']].values.astype(np.uint8)
        state_action['next_env'] = env_state
        self.replay_memory.append([state_action[key] for key in self.replay_memory_keys])
        self.replay_memory_weights.append(weight)
        # Evict the oldest frame once capacity is exceeded.
        if len(self.replay_memory) > NUM_REPLAY_MEMORY:
            self.replay_memory.popleft()
            self.replay_memory_weights.popleft()
        return
    def train_network(self):
        """Run one training step: sample a minibatch and apply a Double-DQN update.

        Frames are sampled proportionally to their vehicle counts; from each
        frame SAMPLE_PER_FRAME vehicle transitions are drawn, for a total of
        BATCH_SIZE * NUM_BATCH examples split into NUM_BATCH gradient steps.
        """
        main_batch = []
        aux_batch = []
        action_batch = []
        reward_batch = []
        next_main_batch = []
        next_aux_batch = []
        delay_batch = []
        # Replay record layout (see self.replay_memory_keys):
        # 0 minofday, 1 dayofweek, 2 env, 3 pos, 4 action,
        # 5 reward, 6 next_env, 7 next_pos, 8 delay
        weights = np.array(self.replay_memory_weights, dtype=np.float32)
        # Frames with more vehicles are sampled more often.
        memory_index = np.random.choice(range(len(self.replay_memory)), size=int(BATCH_SIZE*NUM_BATCH/SAMPLE_PER_FRAME), p=weights/weights.sum())
        for i in memory_index:
            data = self.replay_memory[i]
            samples = np.random.randint(self.replay_memory_weights[i], size=SAMPLE_PER_FRAME)
            aux_batch += self.create_aux_feature(data[0], data[1], data[3][samples])
            next_aux_batch += self.create_aux_feature(data[0] + self.cycle, data[1], data[7][samples])
            main_batch += self.create_main_feature(data[2], data[3][samples])
            next_main_batch += self.create_main_feature(data[6], data[7][samples])
            action_batch += data[4][samples].tolist()
            reward_batch += data[5][samples].tolist()
            delay_batch += data[8][samples].tolist()
        # Double DQN: the online network selects the next action, the target
        # network evaluates it.
        target_q_batch = self.target_q_values.eval(
            feed_dict={
                self.st: np.array(next_main_batch),
                self.xt: np.array(next_aux_batch)
            })
        a_batch = np.argmax(self.q_values.eval(
            feed_dict={
                self.s: np.array(next_main_batch),
                self.x: np.array(next_aux_batch)
            }), axis=1)
        target_q_max_batch = target_q_batch[range(BATCH_SIZE * NUM_BATCH), a_batch]
        self.total_q_max += target_q_max_batch.mean()
        # n-step style discount: the arrival delay adds extra cycles of GAMMA.
        y_batch = np.array(reward_batch) + GAMMA ** (1 + np.array(delay_batch)) * target_q_max_batch
        # Shuffle before splitting into NUM_BATCH gradient steps.
        p = np.random.permutation(BATCH_SIZE * NUM_BATCH)
        main_batch = np.array(main_batch)[p]
        aux_batch = np.array(aux_batch)[p]
        action_batch = np.array(action_batch)[p]
        y_batch = y_batch[p]
        batches = [(main_batch[k:k + BATCH_SIZE], aux_batch[k:k + BATCH_SIZE], action_batch[k:k + BATCH_SIZE], y_batch[k:k + BATCH_SIZE])
                   for k in range(0, BATCH_SIZE * NUM_BATCH, BATCH_SIZE)]
        total_loss = 0
        for s, x, a, y in batches:
            loss, _ = self.sess.run([self.loss, self.grad_update], feed_dict={
                self.s: s,
                self.x: x,
                self.a: a,
                self.y: y
            })
            total_loss += loss
        self.total_loss += total_loss / NUM_BATCH
        return
def build_training_op(self, q_network_weights): | |
a = tf.placeholder(tf.int64, [None]) | |
y = tf.placeholder(tf.float32, [None]) | |
# Convert action to one hot vector | |
a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0) | |
q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), reduction_indices=1) | |
# Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region | |
error = tf.abs(y - q_value) | |
quadratic_part = tf.clip_by_value(error, 0.0, 1.0) | |
linear_part = error - quadratic_part | |
loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part) | |
optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, momentum=MOMENTUM, epsilon=MIN_GRAD) | |
grad_update = optimizer.minimize(loss, var_list=q_network_weights) | |
return a, y, loss, grad_update | |
def setup_summary(self): | |
avg_max_q = tf.Variable(0.) | |
tf.summary.scalar(ENV_NAME + '/Average Max Q', avg_max_q) | |
avg_loss = tf.Variable(0.) | |
tf.summary.scalar(ENV_NAME + '/Average Loss', avg_loss) | |
summary_vars = [avg_max_q, avg_loss] | |
summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))] | |
update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))] | |
summary_op = tf.summary.merge_all() | |
return summary_placeholders, update_ops, summary_op | |
def write_summary(self): | |
if self.num_iters >= 0: | |
duration = float(self.num_iters - self.start_iter + 1) | |
avg_q_max = self.total_q_max / duration | |
avg_loss = self.total_loss / duration | |
stats = [avg_q_max, avg_loss] | |
for i in range(len(stats)): | |
self.sess.run(self.update_ops[i], feed_dict={ | |
self.summary_placeholders[i]: float(stats[i]) | |
}) | |
summary_str = self.sess.run(self.summary_op) | |
self.summary_writer.add_summary(summary_str, self.num_iters) | |
# Debug | |
print('ITER: {0:6d} / EPSILON: {1:.4f} / BETA: {2:.4f} / Q_MAX: {3:.3f} / LOSS: {4:.3f}'.format( | |
self.num_iters, self.epsilon, self.beta, avg_q_max, avg_loss)) | |
sys.stdout.flush() | |
self.start_iter = self.num_iters | |
self.total_q_max = 0 | |
self.total_loss = 0 | |
def load_network(self): | |
checkpoint = tf.train.get_checkpoint_state(SAVE_NETWORK_PATH) | |
if checkpoint and checkpoint.model_checkpoint_path: | |
self.saver.restore(self.sess, checkpoint.model_checkpoint_path) | |
print('Successfully loaded: ' + checkpoint.model_checkpoint_path) | |
else: | |
print('Training new network...') | |
def update_future_demand(self, requests): | |
self.geo_table['W'] = 0 | |
W = requests.groupby('phash')['plat'].count() | |
self.geo_table.loc[W.index, 'W'] += W.values |