diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c5b5606
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+Results/
+.DS_Store
+.idea/
+Scripts/__pycache__/
\ No newline at end of file
diff --git a/Display/Fig1a.png b/Display/Fig1a.png
new file mode 100644
index 0000000..41bd0c3
Binary files /dev/null and b/Display/Fig1a.png differ
diff --git a/Display/Fig1b.png b/Display/Fig1b.png
new file mode 100644
index 0000000..beb5a31
Binary files /dev/null and b/Display/Fig1b.png differ
diff --git a/Main.py b/Main.py
new file mode 100644
index 0000000..f1d4f19
--- /dev/null
+++ b/Main.py
@@ -0,0 +1,80 @@
+from Scripts.Algorithm import train, evaluateMARLNonLocal, evaluateMARLLocal
+from Scripts.Parameters import ParseInput
+import time
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+
+if __name__ == '__main__':
+    args = ParseInput()
+
+    t0 = time.time()
+
+    indexN = 0
+    valueLocalArray = np.zeros(args.numN)
+    valueLocalArraySD = np.zeros(args.numN)
+
+    valueNonLocalArray = np.zeros(args.numN)
+    valueNonLocalArraySD = np.zeros(args.numN)
+
+    ErrorArray = np.zeros(args.numN)
+    ErrorArraySD = np.zeros(args.numN)
+
+    NVec = np.zeros(args.numN)
+
+    if args.train:
+        print('Training is in progress.')
+        train(args)
+
+    print('Evaluation is in progress.')
+    # Sweep over N = minN, minN + divN, ..., averaging over maxSeed independent runs.
+    while indexN < args.numN:
+        N = args.minN + indexN * args.divN
+        NVec[indexN] = N
+
+        for _ in range(0, args.maxSeed):
+            valueLocal = evaluateMARLLocal(args, N)
+            valueLocal = np.array(valueLocal.detach())
+
+            valueLocalArray[indexN] += valueLocal/args.maxSeed
+            valueLocalArraySD[indexN] += valueLocal ** 2 / args.maxSeed
+
+            valueNonLocal = evaluateMARLNonLocal(args, N)
+            valueNonLocal = np.array(valueNonLocal.detach())
+
+            valueNonLocalArray[indexN] += valueNonLocal/args.maxSeed
+            valueNonLocalArraySD[indexN] += valueNonLocal**2/args.maxSeed
+
+            Error = np.abs(valueNonLocal - valueLocal)
+            ErrorArray[indexN] += Error/args.maxSeed
+            ErrorArraySD[indexN] += Error**2/args.maxSeed
+
+        indexN += 1
+        print(f'N: {N}')
+
+    # Convert accumulated second moments into standard deviations.
+    valueLocalArraySD = np.sqrt(np.maximum(0, valueLocalArraySD - valueLocalArray ** 2))
+    valueNonLocalArraySD = np.sqrt(np.maximum(0, valueNonLocalArraySD - valueNonLocalArray ** 2))
+    ErrorArraySD = np.sqrt(np.maximum(0, ErrorArraySD - ErrorArray ** 2))
+
+    if not os.path.exists('Results'):
+        os.mkdir('Results')
+
+    plt.figure()
+    plt.xlabel('N')
+    plt.ylabel('Values')
+    plt.plot(NVec, valueLocalArray, label='Local')
+    plt.fill_between(NVec, valueLocalArray - valueLocalArraySD, valueLocalArray + valueLocalArraySD, alpha=0.3)
+    plt.plot(NVec, valueNonLocalArray, label='Non-Local')
+    plt.fill_between(NVec, valueNonLocalArray - valueNonLocalArraySD, valueNonLocalArray + valueNonLocalArraySD, alpha=0.3)
+    plt.legend()
+    plt.savefig(f'Results/Values.png')
+
+    plt.figure()
+    plt.xlabel('N')
+    plt.ylabel('Error')
+    plt.plot(NVec, ErrorArray)
+    plt.fill_between(NVec, ErrorArray - ErrorArraySD, ErrorArray + ErrorArraySD, alpha=0.3)
+    plt.savefig(f'Results/Error.png')
+
+    t1 = time.time()
+
+    print(f'Elapsed time is {t1-t0} sec')
diff --git a/Models/Actor.pkl b/Models/Actor.pkl
new file mode 100644
index 0000000..48bc9f3
Binary files /dev/null and b/Models/Actor.pkl differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e9458d4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# Introduction
+
+This repository contains the code used to generate the numerical results in the following paper:
+
+"On the Near-Optimality of Local Policies in Large Cooperative Multi-Agent Reinforcement
Learning", Transactions on Machine Learning Research, 2022. + +# Parameters + +Various parameters used in the experiments can be found in Scripts/Parameters.py file. + +# Results + +Generated results will be stored in Results folder (will be created on the fly). +Some pre-generated results are available for display in the Display folder. Specifically, +Fig. 1 depicts the percentage error between the values generated by local and non-local policies in an N-agent system +as a function of N. + +# Run Experiments + +``` +python3 Main.py +``` + +# Command Line Options + +Various command line options are given below: + +``` +--train : if training is required from scratch, otherwise a pre-trained model will be used +--minN : minimum value of N +--numN : number of N values +--divN : difference between two consecutive N values +--maxSeed: number of random seeds +``` \ No newline at end of file diff --git a/Scripts/Algorithm.py b/Scripts/Algorithm.py new file mode 100644 index 0000000..89fa20b --- /dev/null +++ b/Scripts/Algorithm.py @@ -0,0 +1,341 @@ +import torch +import torch.optim as optim +import torch.nn.functional as F +import torch.nn as nn +from torch.distributions.categorical import Categorical +import os +import copy + + +class Actor(nn.Module): + def __init__(self, state_size, action_size, hidden_size=32): + super(Actor, self).__init__() + self.state_size = 2*state_size # one-hot state + mean-distribution + self.action_size = action_size + self.hidden_size = hidden_size + self.linear1 = nn.Linear(self.state_size, self.hidden_size) + self.linear2 = nn.Linear(self.hidden_size, self.hidden_size) + self.linear3 = nn.Linear(self.hidden_size, self.action_size) + + def forward(self, state, state_dist): + state_joined = torch.cat([state, state_dist]) + output = F.relu(self.linear1(state_joined)) + output = F.relu(self.linear2(output)) + output = F.softmax(self.linear3(output), dim=-1) + return output + + +def train(args): + actor = Actor(args.num_states, args.num_actions, args.hidden_size) + NumActParam = 2*args.num_states * args.hidden_size + args.hidden_size + args.hidden_size**2 + args.hidden_size + args.hidden_size*args.num_actions + args.num_actions + optimizer = optim.Adam(list(actor.parameters())) + + # Floating point representation of states + states_float = torch.tensor(range(0, args.num_states)).float() + + for j in range(args.J): + + w = torch.zeros(NumActParam) + w_avg = torch.zeros(NumActParam) + + for _ in range(args.L): + + # Initial state distribution + curr_state_dist = torch.ones(args.num_states) / args.num_states + curr_state = Categorical(curr_state_dist).sample().long() + + """ ------------ Sampling (x, mu, u) ------------ """ + FLAG = False + while not FLAG: + if torch.rand(1) > args.gamma: + FLAG = True + """ --------- Update Subroutine -------------- """ + + """ ------------ Current State ------------------- """ + curr_state_one_hot = torch.zeros(args.num_states) + curr_state_one_hot[curr_state] = 1 + + """ ------------- Mean of Current State Distribution ------------- """ + curr_state_dist_mean = torch.dot(states_float, curr_state_dist) + + """ ------------- Current Action ------------------ """ + policy = Categorical(actor(curr_state_one_hot, curr_state_dist)) + curr_action = policy.sample().long() + + """ ------------- Next State --------------- """ + fraction = 1 - (curr_state_dist_mean/args.num_states) + if curr_action == 0: + next_state = curr_state + else: + chi = torch.rand(1) + next_state = curr_state + (chi * fraction * (args.num_states - 1 - curr_state)).long() 
+                next_state_one_hot = torch.zeros(args.num_states)
+                next_state_one_hot[next_state] = 1
+
+                """ -------------- Next State Distribution ------------- """
+
+                next_state_dist = torch.zeros(args.num_states)
+                for state_t in range(0, args.num_states):
+                    one_hot_state_t = torch.zeros(args.num_states)
+                    one_hot_state_t[state_t] = 1
+
+                    for action_t in range(0, args.num_actions):
+                        dist_vec = torch.zeros(args.num_states)
+                        if action_t == 0:
+                            dist_vec[state_t] = 1
+                        else:
+                            prob_mass = 1/(fraction * (args.num_states - 1 - state_t))
+                            total_prob = torch.tensor(1.0)
+                            state_t_plus_1 = state_t
+                            while total_prob > 0 and state_t_plus_1 < args.num_states:
+                                dist_vec[state_t_plus_1] = torch.minimum(prob_mass, total_prob)
+                                total_prob -= torch.minimum(prob_mass, total_prob)
+                                state_t_plus_1 += 1
+
+                        prob = actor(one_hot_state_t, curr_state_dist)[action_t] * curr_state_dist[state_t]
+                        next_state_dist += dist_vec * prob
+
+                """ --------------------- Update ------------------ """
+                curr_state = copy.copy(next_state)
+                curr_state_dist = copy.copy(next_state_dist)
+
+            """ ------------ Sampling Advantage Functions ---------- """
+            FLAG = False
+            SumRewards = torch.tensor([0.])
+
+            while not FLAG:
+                if torch.rand(1) > args.gamma:
+                    FLAG = True
+                """ --------- Update Subroutine -------------- """
+
+                """ ------------ Current State ------------------- """
+                curr_state_one_hot = torch.zeros(args.num_states)
+                curr_state_one_hot[curr_state] = 1
+
+                """ ------------- Mean of Current State Distribution ------------- """
+                curr_state_dist_mean = torch.dot(states_float, curr_state_dist)
+
+                """ ------------- Current Action ------------------ """
+                policy = Categorical(actor(curr_state_one_hot, curr_state_dist))
+                curr_action = policy.sample().long()
+
+                """ ------------- Next State --------------- """
+                fraction = 1 - (curr_state_dist_mean/args.num_states)
+                if curr_action == 0:
+                    next_state = curr_state
+                else:
+                    chi = torch.rand(1)
+                    next_state = curr_state + (chi * fraction * (args.num_states - 1 - curr_state)).long()
+                next_state_one_hot = torch.zeros(args.num_states)
+                next_state_one_hot[next_state] = 1
+
+                """ -------------- Next State Distribution ------------- """
+
+                next_state_dist = torch.zeros(args.num_states)
+                for state_t in range(0, args.num_states):
+                    one_hot_state_t = torch.zeros(args.num_states)
+                    one_hot_state_t[state_t] = 1
+
+                    for action_t in range(0, args.num_actions):
+                        dist_vec = torch.zeros(args.num_states)
+                        if action_t == 0:
+                            dist_vec[state_t] = 1
+                        else:
+                            prob_mass = 1/(fraction * (args.num_states - 1 - state_t))
+                            total_prob = torch.tensor(1.0)
+                            state_t_plus_1 = state_t
+                            while total_prob > 0 and state_t_plus_1 < args.num_states:
+                                dist_vec[state_t_plus_1] = torch.minimum(prob_mass, total_prob)
+                                total_prob -= torch.minimum(prob_mass, total_prob)
+                                state_t_plus_1 += 1
+
+                        prob = actor(one_hot_state_t, curr_state_dist)[action_t] * curr_state_dist[state_t]
+                        next_state_dist += dist_vec * prob
+
+                """ -------------- SumRewards Update ---------- """
+                SumRewards += args.alpha_r * curr_state - args.beta_r * curr_state_dist_mean - args.lambda_r * curr_action
+
+                """ --------------------- Update ------------------ """
+                curr_state = copy.copy(next_state)
+                curr_state_dist = copy.copy(next_state_dist)
+
+            Value_R = 0
+            Q_R = 0
+
+            if torch.rand(1) < 0.5:
+                Value_R = SumRewards
+            else:
+                Q_R = SumRewards
+
+            Advantage_R = 2*(Q_R-Value_R)
+
+            # Gradient Update for the Sub-Problem
+            log_prob = policy.log_prob(curr_action)
+            optimizer.zero_grad()
+            log_prob.backward()
+
+            phi_grads = []
+            for f in actor.parameters():
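+                # Flatten this layer's gradient of log pi(u | x, mu); torch.cat below stacks
+                # them into the single score vector used in the stochastic update of w.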
+                phi_grads.append(f.grad.view(-1))
+            phi_grads = torch.cat(phi_grads)
+
+            h_grads = (torch.dot(w, phi_grads)-Advantage_R)*phi_grads
+
+            w = w - args.alpha * h_grads
+            w_avg += w/args.L
+
+        # Natural policy gradient step: apply the matching slice of the averaged
+        # direction w_avg to each parameter tensor, element by element.
+        offset = 0
+        for phi in actor.parameters():
+            numel = phi.numel()
+            phi.data -= (args.eta/(1-args.gamma)) * w_avg[offset:offset + numel].view_as(phi)
+            offset += numel
+
+    if not os.path.exists('Models'):
+        os.mkdir('Models')
+    torch.save(actor.state_dict(), f'Models/Actor.pkl')
+
+
+def evaluateMARLLocal(args, N):
+    actor = Actor(args.num_states, args.num_actions)
+
+    if not os.path.exists(f'Models/Actor.pkl'):
+        raise ValueError('Model does not exist.')
+    actor.load_state_dict(torch.load(f'Models/Actor.pkl'))
+
+    # Initial state distribution
+    init_state_dist = torch.ones(args.num_states)/args.num_states
+
+    # Initial infinite population mean-field state distribution
+    curr_mf_state_dist = torch.ones(args.num_states) / args.num_states
+
+    # Current Joint State
+    curr_joint_state = Categorical(init_state_dist).sample([N]).long()
+    next_joint_state = torch.zeros(N).long()
+
+    # Floating point representation of states
+    states_float = torch.tensor(range(0, args.num_states)).float()
+
+    # Doubly Stochastic Interaction Matrix
+    W = torch.ones([N, N])/N
+
+    ValueRewardMARL = 0
+    curr_gamma = 1
+
+    for iter_count in range(args.run_eval):
+        curr_average_reward = 0
+
+        curr_joint_state_one_hot = torch.zeros([N, args.num_states])
+        curr_joint_state_one_hot[range(0, N), curr_joint_state] = 1
+
+        curr_state_dist = torch.matmul(W, curr_joint_state_one_hot)
+
+        for agent_index in range(0, N):
+            agent_state = curr_joint_state[agent_index]
+            agent_state_one_hot = curr_joint_state_one_hot[agent_index, :]
+            agent_state_dist = curr_state_dist[agent_index, :]
+            agent_state_dist_mean = torch.dot(states_float, agent_state_dist)
+            """ ------- Local Policy --------- """
+            agent_action = Categorical(actor(agent_state_one_hot, curr_mf_state_dist)).sample()
+
+            agent_reward = args.alpha_r * agent_state - args.beta_r * agent_state_dist_mean - args.lambda_r * agent_action
+            curr_average_reward += agent_reward/N
+
+            # Next State for the agent
+            if agent_action == 1:
+                chi = torch.rand(1)
+                fraction = 1 - (agent_state_dist_mean/args.num_states)
+                next_joint_state[agent_index] = curr_joint_state[agent_index] + (chi*fraction*(args.num_states - 1 - curr_joint_state[agent_index])).long()
+            else:
+                next_joint_state[agent_index] = curr_joint_state[agent_index]
+
+        ValueRewardMARL += curr_gamma*args.gamma*curr_average_reward
+        curr_gamma *= args.gamma
+
+        """ --------------- Mean-Field Update ------------ """
+        curr_mf_state_dist_mean = torch.dot(states_float, curr_mf_state_dist)
+        mf_fraction = 1 - (curr_mf_state_dist_mean / args.num_states)
+
+        next_mf_state_dist = torch.zeros(args.num_states)
+
+        for state_t in range(0, args.num_states):
+            one_hot_state_t = torch.zeros(args.num_states)
+            one_hot_state_t[state_t] = 1
+
+            for action_t in range(0, args.num_actions):
+                dist_vec = torch.zeros(args.num_states)
+                if action_t == 0:
+                    dist_vec[state_t] = 1
+                else:
+                    prob_mass = 1 / (mf_fraction * (args.num_states - 1 - state_t))
+                    total_prob = torch.tensor(1.0)
+                    state_t_plus_1 = state_t
+                    while total_prob > 0 and state_t_plus_1 < args.num_states:
+                        dist_vec[state_t_plus_1] = torch.minimum(prob_mass, total_prob)
+                        total_prob -= torch.minimum(prob_mass, total_prob)
+                        state_t_plus_1 += 1
+
+                prob = actor(one_hot_state_t, curr_mf_state_dist)[action_t] * curr_mf_state_dist[state_t]
+                next_mf_state_dist += dist_vec * prob
+
+        """ ----------- Update -------------------- """
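+        # Advance the finite-agent joint state together with the infinite-population
+        # mean-field distribution before the next evaluation step.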
+        curr_joint_state = copy.copy(next_joint_state)
+        curr_mf_state_dist = copy.copy(next_mf_state_dist)
+
+    return ValueRewardMARL
+
+
+def evaluateMARLNonLocal(args, N):
+    actor = Actor(args.num_states, args.num_actions)
+
+    if not os.path.exists(f'Models/Actor.pkl'):
+        raise ValueError('Model does not exist.')
+    actor.load_state_dict(torch.load(f'Models/Actor.pkl'))
+
+    # Initial state distribution
+    init_state_dist = torch.ones(args.num_states)/args.num_states
+
+    # Current Joint State
+    curr_joint_state = Categorical(init_state_dist).sample([N]).long()
+    next_joint_state = torch.zeros(N).long()
+
+    # Floating point representation of states
+    states_float = torch.tensor(range(0, args.num_states)).float()
+
+    # Doubly Stochastic Interaction Matrix
+    W = torch.ones([N, N])/N
+
+    ValueRewardMARL = 0
+    curr_gamma = 1
+
+    for iter_count in range(args.run_eval):
+        curr_average_reward = 0
+
+        curr_joint_state_one_hot = torch.zeros([N, args.num_states])
+        curr_joint_state_one_hot[range(0, N), curr_joint_state] = 1
+
+        curr_state_dist = torch.matmul(W, curr_joint_state_one_hot)
+
+        for agent_index in range(0, N):
+            agent_state = curr_joint_state[agent_index]
+            agent_state_one_hot = curr_joint_state_one_hot[agent_index, :]
+            agent_state_dist = curr_state_dist[agent_index, :]
+            agent_state_dist_mean = torch.dot(states_float, agent_state_dist)
+            agent_action = Categorical(actor(agent_state_one_hot, agent_state_dist)).sample()
+
+            agent_reward = args.alpha_r * agent_state - args.beta_r * agent_state_dist_mean - args.lambda_r * agent_action
+            curr_average_reward += agent_reward/N
+
+            # Next State for the agent
+            if agent_action == 1:
+                chi = torch.rand(1)
+                fraction = 1 - (agent_state_dist_mean/args.num_states)
+                next_joint_state[agent_index] = curr_joint_state[agent_index] + (chi*fraction*(args.num_states - 1 - curr_joint_state[agent_index])).long()
+            else:
+                next_joint_state[agent_index] = curr_joint_state[agent_index]
+
+        ValueRewardMARL += curr_gamma*args.gamma*curr_average_reward
+        curr_gamma *= args.gamma
+
+        """ ----------- State Update -------------------- """
+        curr_joint_state = copy.copy(next_joint_state)
+
+    return ValueRewardMARL
diff --git a/Scripts/Parameters.py b/Scripts/Parameters.py
new file mode 100644
index 0000000..b4f4b0d
--- /dev/null
+++ b/Scripts/Parameters.py
@@ -0,0 +1,35 @@
+import argparse
+
+
+def ParseInput():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--train', action='store_true', help='enable training from scratch')
+
+    """ ---------- Simulation Parameters ---------- """
+    parser.add_argument('--minN', type=int, default=5, dest='minN', help='minimum value of N')
+    parser.add_argument('--numN', type=int, default=20, dest='numN', help='number of N values')
+    parser.add_argument('--divN', type=int, default=5, dest='divN', help='difference between two consecutive N values')
+    parser.add_argument('--maxSeed', type=int, default=25, dest='maxSeed', help='number of random seeds')
+
+    args = parser.parse_args()
+
+    """ ---------- Algorithm Hyperparameters ------- """
+
+    args.num_actions = 2
+    args.num_states = 10
+    args.J = 10 ** 2  # Number of iterations for training the neural network based policy
+    args.L = 10 ** 2  # Number of samples per iteration for the inner natural gradient sub-problem
+    args.run_eval = 10 ** 2  # Number of iterations for evaluating a policy
+    args.gamma = 0.9  # Discount factor
+
+    """ --------- Reward Parameters --------- """
+    args.alpha_r = 1
+    args.beta_r = 0.5
+    args.lambda_r = 0.5
+
+    """ ----------- Learning Parameters --------- """
+    args.alpha = 10**-3
+    args.eta = 10**-3
+    args.hidden_size = 32
+
+    return args
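For a quick sanity check outside of Main.py, the snippet below is a minimal sketch of how one might load the shipped Models/Actor.pkl and query the trained policy directly; the Namespace fields simply mirror the defaults in Scripts/Parameters.py, and the state index 3 is an arbitrary choice.

```
import torch
from argparse import Namespace
from Scripts.Algorithm import Actor, evaluateMARLLocal, evaluateMARLNonLocal

# Hyperparameters copied from the defaults in Scripts/Parameters.py.
args = Namespace(num_states=10, num_actions=2, hidden_size=32, gamma=0.9,
                 run_eval=10 ** 2, alpha_r=1, beta_r=0.5, lambda_r=0.5)

# Action probabilities of the trained policy for an agent in state 3
# under a uniform mean-field distribution.
actor = Actor(args.num_states, args.num_actions, args.hidden_size)
actor.load_state_dict(torch.load('Models/Actor.pkl'))
state_one_hot = torch.zeros(args.num_states)
state_one_hot[3] = 1
mean_field = torch.ones(args.num_states) / args.num_states
print(actor(state_one_hot, mean_field))

# Discounted values of the local and non-local executions in a 10-agent system;
# the gap between them is the quantity Main.py averages and plots as Results/Error.png.
print(evaluateMARLLocal(args, N=10))
print(evaluateMARLNonLocal(args, N=10))
```

Main.py averages these stochastic estimates over maxSeed repetitions before plotting, so a single call will fluctuate from run to run.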