#!/usr/bin/env python
# coding: utf-8
import numpy as np
import cvxpy as cp
from tqdm import tqdm
from EnvGrid import Action
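# O-REPS-style online learning over occupancy measures on a tabular grid environment
# (EnvGrid), with optional optimistic loss predictors, compared against an
# epsilon-greedy SARSA baseline.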
class StateIter():
    def __init__(self, width, height):
        self.width = width
        self.height = height

    def obs_to_index(self, obs):
        # ind = obs[0] + obs[1]*self.height + obs[2]*self.height*self.width + obs[3]*self.height*self.width*self.height
        ind = obs[0] + obs[1]*self.height
        if int(ind) != ind:
            print(ind, int(ind))
        return int(ind)

    def index_to_obs(self, index):
        temp = index
        obs = []
        obs.append(temp % self.height)
        temp = (temp - obs[-1]) / self.height
        obs.append(temp % self.width)
        temp = (temp - obs[-1]) / self.width
        # obs.append(temp%self.height)
        # temp = (temp - obs[-1])/self.height
        # obs.append(temp%self.width)
        # temp = (temp - obs[-1])/self.width
        if temp != 0:
            print(temp)
            raise Exception(f"index {index} is out of bounds for map size {self.height} x {self.width}")
        # np.int was removed in recent NumPy versions; use the builtin int dtype instead
        return np.array(obs, dtype=int)
    def get_next_index(self, index, action):
        obs = self.index_to_obs(index)
        if action == Action.LEFT:
            if obs[1] < 1:
                return index
            # return self.obs_to_index(np.array([obs[0], obs[1]-1, obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0], obs[1]-1]))
        elif action == Action.RIGHT:
            if obs[1] > self.width-2:
                return index
            # return self.obs_to_index(np.array([obs[0], obs[1]+1, obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0], obs[1]+1]))
        elif action == Action.DOWN:
            if obs[0] > self.height-2:
                return index
            # return self.obs_to_index(np.array([obs[0]+1, obs[1], obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0]+1, obs[1]]))
        elif action == Action.UP:
            if obs[0] < 1:
                return index
            # return self.obs_to_index(np.array([obs[0]-1, obs[1], obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0]-1, obs[1]]))
        else:
            raise Exception(f"Invalid action number: {action}")
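# OREPS: online learning over occupancy measures with optional optimistic loss
# predictors. Per print_interval it records the learner's score and episode length,
# and it returns episode indices, the gap between a SARSA comparator's greedy scores
# and the learner's recorded scores, and per-episode prediction errors.
# Arguments as used below: eta is the step size of the exponential update; gamma is
# added to the occupancy measure in the denominator of the loss estimates;
# separation keeps one occupancy measure per time step; force_move regenerates the
# map every force_move episodes; optimistic_predictor selects how M_prime is built
# ("latest", "average", "super", "super_random"; anything else leaves it at zero);
# reset_period controls how often the learned predictor statistics are cleared.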
def OREPS(env, eta, num_episodes=10000, print_interval=20, gamma=1.4e-3, r=0.0, separation=False, force_move=None, optimistic_predictor="off", reset_period=1):
    state_dim = env.config.height*env.config.width
    state_iter = StateIter(env.config.width, env.config.height)
    score, episode_length, prediction_error = 0, 0, 0
    length_list, score_list, episode_list, pred_error_list = [], [], [], []
    # For training
    obs_list, action_list, reward_list = [], [], []
    if env.state_dim != 4:
        raise Exception("OREPS only supports state_dim==4")
    if separation:
        occupancy_measure = np.ones((env.timeout, state_dim, env.action_dim))
        l_hat = np.zeros((env.timeout, state_dim, env.action_dim))
    else:
        occupancy_measure = np.ones((1, state_dim, env.action_dim))
        l_hat = np.zeros((1, state_dim, env.action_dim))
    occupancy_measure = occupancy_measure / occupancy_measure.sum()
    M_prime = np.zeros((1, state_dim, env.action_dim))
    visitation = np.zeros((state_dim, env.action_dim))
    saved_preset = []
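    # Main loop: each episode is played with the policy induced by the current
    # occupancy measure, loss estimates are built from the observed rewards, and the
    # occupancy measure is updated by an exponentially weighted (O-REPS style) step.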
    for t in tqdm(range(num_episodes)):
        if t == 0:
            obs, _ = env.reset(True)
            saved_preset.append(env.save_map())
            if optimistic_predictor in ["super", "super_random"]:
                for state in range(state_dim):
                    for action in range(env.action_dim):
                        try:
                            reward = env.get_reward(state_iter.index_to_obs(state), action)
                        except:
                            import IPython
                            IPython.embed()
                            exit()
                        if optimistic_predictor == "super_random" and np.random.random() < 0.01:
                            M_prime[0, state, action] = 0
                        else:
                            M_prime[0, state, action] = max(min(-reward, 1), 0)
        else:
            obs, _ = env.reset()
        M_t = M_prime
        obs_list, action_list, reward_list = [], [], []
        done, truncated = False, False
        # Run episode
        k = 0
        while not done and not truncated:
            obs_list.append(obs)
            try:
                pi = occupancy_measure[k, state_iter.obs_to_index(obs), :]
                pi = pi / pi.sum()
                action = np.random.choice(env.action_dim, p=pi)
            except:
                import IPython
                IPython.embed()
                exit()
            obs, reward, done, truncated, _ = env.step(action)
            # if truncated:
            #     print(f"policy distribution: {pi}")
            #     import IPython
            #     IPython.embed()
            #     exit()
            action_list.append(action)
            reward_list.append(reward)
            score += reward
            episode_length += 1
            if separation:
                k += 1
        # Accumulate prediction error (c_t - M_t)
        prediction_error = 0
        for state in range(state_dim):
            for action in range(env.action_dim):
                try:
                    reward = env.get_reward(state_iter.index_to_obs(state), action)
                except:
                    import IPython
                    IPython.embed()
                    exit()
                prediction_error += max(min(-reward, 1), 0) - M_t[0, state, action]
        pred_error_list.append(prediction_error)
        # Reset env for the next episode
        if force_move and (t+1) % force_move == 0:
            obs, _ = env.reset(True)
            saved_preset.append(env.save_map())
            # Perfect M_t, M_prime
            if optimistic_predictor in ["super", "super_random"]:
                for state in range(state_dim):
                    for action in range(env.action_dim):
                        try:
                            reward = env.get_reward(state_iter.index_to_obs(state), action)
                        except:
                            import IPython
                            IPython.embed()
                            exit()
                        if optimistic_predictor == "super_random" and np.random.random() < 0.01:
                            M_prime[0, state, action] = 0
                            prediction_error += max(min(-reward, 1), 0)
                        else:
                            M_prime[0, state, action] = max(min(-reward, 1), 0)
        if separation:
            l_hat = np.zeros((env.timeout, state_dim, env.action_dim))
        else:
            l_hat = np.zeros((1, state_dim, env.action_dim))
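        # Build loss estimates for the visited (state, action) pairs: the instantaneous
        # loss is -reward clipped to [0, 1]; l_hat is (loss - M_t) divided by the
        # occupancy measure plus gamma (a bias term in the spirit of implicit
        # exploration), with M_t added back. The "latest"/"average" predictors
        # also update M_prime from the observed losses along the way.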
        # k keeps the value left from the rollout above (0 unless separation is enabled)
        for tmp_obs, action, reward in zip(obs_list, action_list, reward_list):
            tmp_idx = state_iter.obs_to_index(tmp_obs)
            if occupancy_measure[k, tmp_idx, action] <= 0:
                print(f"state: {tmp_obs}")
                print(f"action: {action}")
                print(f"reward: {reward}")
            prev_lhat = l_hat[k, tmp_idx, action]
            if reward > 0:
                if optimistic_predictor == "average":
                    M_prime[0, tmp_idx, action] = M_prime[0, tmp_idx, action]*visitation[tmp_idx, action]/(visitation[tmp_idx, action]+1)
                    visitation[tmp_idx, action] += 1
                elif optimistic_predictor == "latest":
                    M_prime[0, tmp_idx, action] = 0
                l_hat[k, tmp_idx, action] = -M_t[0, tmp_idx, action]/(occupancy_measure[k, tmp_idx, action]+gamma) + M_t[0, tmp_idx, action]
            elif reward < -1:
                if optimistic_predictor == "latest":
                    M_prime[0, tmp_idx, action] = 1
                elif optimistic_predictor == "average":
                    M_prime[0, tmp_idx, action] = (M_prime[0, tmp_idx, action]*visitation[tmp_idx, action]+1)/(visitation[tmp_idx, action]+1)
                    visitation[tmp_idx, action] += 1
                l_hat[k, tmp_idx, action] = (1-M_t[0, tmp_idx, action])/(occupancy_measure[k, tmp_idx, action]+gamma) + M_t[0, tmp_idx, action]
            else:
                if optimistic_predictor == "latest":
                    M_prime[0, tmp_idx, action] = -reward
                elif optimistic_predictor == "average":
                    M_prime[0, tmp_idx, action] = (M_prime[0, tmp_idx, action]*visitation[tmp_idx, action]-reward)/(visitation[tmp_idx, action]+1)
                    visitation[tmp_idx, action] += 1
                l_hat[k, tmp_idx, action] = (-reward-M_t[0, tmp_idx, action])/(occupancy_measure[k, tmp_idx, action]+gamma) + M_t[0, tmp_idx, action]
            if np.exp(-l_hat[k, tmp_idx, action]*eta) <= 0:
                # If the occupancy measure is too small, l_hat blows up and would drive the
                # occupancy measure to zero on the next update. In plain O-REPS gamma=0.0,
                # so the occupancy measure in the denominator must never reach zero.
                # Skip the l_hat update when the occupancy measure is too small.
                # print(f"l_hat must be finite: eta {eta}, state {tmp_idx}, action {action}, reward {reward}, occupancy measure {occupancy_measure[k,tmp_idx,action]}, M_prime: {M_prime[0,tmp_idx,action]}")
                l_hat[k, tmp_idx, action] = prev_lhat
                # return episode_list, length_list, score_list
                # raise Exception(f"l_hat must be finite: eta {eta}, state {tmp_idx}, action {action}, reward {reward}, occupancy measure {occupancy_measure[k,tmp_idx,action]}")
        if force_move and (t+1) % (force_move*reset_period) == 0 and optimistic_predictor in ["average", "latest"]:
            M_prime = np.zeros((1, state_dim, env.action_dim))
            visitation = np.zeros((state_dim, env.action_dim))
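        # The update is normalized through per-state potentials v_hat, which appear to
        # play the role of value-function-like Lagrange multipliers for the flow
        # constraints in the O-REPS derivation; they are found by minimizing the
        # log-partition objective below with cvxpy.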
        # Solve for v_hat using cvxpy
        if separation:
            v_temp = cp.Variable((l_hat.shape[0], state_dim))
            # Set objective function
            z_t = 0
            for k in range(l_hat.shape[0]-1):
                z_k = 0
                for state in range(state_dim):
                    for action in range(env.action_dim):
                        z_k += occupancy_measure[k, state, action]*cp.exp(-eta*(l_hat[k, state, action] + M_prime[0, state, action] - M_t[0, state, action]) + v_temp[k, state] - v_temp[k+1, state_iter.get_next_index(state, action)])
                z_t += cp.log(z_k)
            prob = cp.Problem(cp.Minimize(z_t))
            # prob.solve()
            prob.solve(qcp=True, solver=cp.SCS)
            v_hat = np.zeros((l_hat.shape[0], state_dim))
            for k, v_k in enumerate(v_temp.value):
                for state, value in enumerate(v_k):
                    v_hat[k, state] = value
        else:
            v_temp = cp.Variable(state_dim)
            # Set constraints
            # constraints = []
            # for state in range(state_dim):
            #     for action in range(env.action_dim):
            #         constraints.append(cp.exp(v_temp[state]-v_temp[state_iter.get_next_index(state,action)])==bellman[state,action])
            # Set objective function
            z_0 = 0
            for state in range(state_dim):
                for action in range(env.action_dim):
                    z_0 += occupancy_measure[0, state, action]*cp.exp(-eta*(l_hat[0, state, action] + M_prime[0, state, action] - M_t[0, state, action]) + v_temp[state] - v_temp[state_iter.get_next_index(state, action)])
            # prob = cp.Problem(cp.Minimize(z_0), constraints)
            prob = cp.Problem(cp.Minimize(z_0))
            # prob.solve()
            # prob.solve(qcp=True)
            prob.solve(qcp=True, solver=cp.SCS)
            v_hat = np.zeros((1, state_dim))
            for state, value in enumerate(v_temp.value):
                v_hat[0, state] = value
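        # Exponential-weights step: re-weight every (step, state, action) entry of the
        # occupancy measure by exp(-eta*(l_hat + M_prime - M_t) + v_hat(s) - v_hat(s'))
        # and divide by the normalization constant Z, which is computed first.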
        Z = 0
        for k in range(l_hat.shape[0]):
            for ind in range(state_dim):
                for action in range(env.action_dim):
                    delta = -eta*(l_hat[k, ind, action] + M_prime[0, ind, action] - M_t[0, ind, action])
                    if separation:
                        delta = delta - v_hat[k+1, state_iter.get_next_index(ind, action)] + v_hat[k, ind]
                    else:
                        delta = delta - v_hat[0, state_iter.get_next_index(ind, action)] + v_hat[0, ind]
                    Z = Z + occupancy_measure[k, ind, action]*np.exp(delta)
                    if np.isnan(Z):
                        print(f"delta: {delta}")
                        print(f"Z: {Z}")
                        print(f"np.exp(delta): {np.exp(delta)}")
                        print(f"l_hat: {l_hat[k,ind,action]}")
                        print(f"M_t: {M_t[0,ind,action]}")
                        print(f"M_prime: {M_prime[0,ind,action]}")
                        print(f"v_hat: {v_hat[k,ind]}")
                        print(f"v_hat next: {v_hat[k,state_iter.get_next_index(ind,action)]}")
                        print(f"eta: {eta}")
        for k in range(l_hat.shape[0]):
            for ind in range(state_dim):
                for action in range(env.action_dim):
                    delta = -eta*(l_hat[k, ind, action] + M_prime[0, ind, action] - M_t[0, ind, action])
                    if separation:
                        delta = delta - v_hat[k+1, state_iter.get_next_index(ind, action)] + v_hat[k, ind]
                    else:
                        delta = delta - v_hat[0, state_iter.get_next_index(ind, action)] + v_hat[0, ind]
                    occupancy_measure[k, ind, action] = occupancy_measure[k, ind, action]*np.exp(delta) / Z
                    if np.isnan(occupancy_measure[k, ind, action]):
                        print(f"delta: {delta}")
                        print(f"Z: {Z}")
                        print(f"np.exp(delta): {np.exp(delta)}")
                        print(f"l_hat: {l_hat[k,ind,action]}")
                        print(f"M_t: {M_t[0,ind,action]}")
                        print(f"M_prime: {M_prime[0,ind,action]}")
                        print(f"v_hat: {v_hat[k,ind]}")
                        print(f"v_hat next: {v_hat[k,state_iter.get_next_index(ind,action)]}")
                        print(f"eta: {eta}")
        if (t+1) % print_interval == 0:
            if len(episode_list) == 0:
                prev_length = episode_length/(t+1)
                prev_score = score/(t+1)
                # prev_pred_error = prediction_error/(t+1)
            else:
                prev_length = (prev_length*(t+1-print_interval) + episode_length)/(t+1)
                prev_score = (prev_score*(t+1-print_interval) + score)/(t+1)
                # prev_pred_error = (prev_pred_error*(t+1-print_interval) + prediction_error)/(t+1)
            length_list.append(prev_length)
            score_list.append(prev_score)
            episode_list.append(t)
            # min_q_list.append(min_q)
            # pred_error_list.append(prev_pred_error)
            score, episode_length, prediction_error = 0, 0, 0
        if np.isnan(occupancy_measure).any():
            break
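    # After training, build a comparator policy. The saved map presets are replayed
    # so the comparator faces the same sequence of environments as the learner.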
    # Find a fixed optimal policy using SARSA
    epsilon = 0.1
    alpha = 0.1
    gamma = 0.9  # SARSA discount factor (re-uses the name of the exploration constant above, which is no longer needed)
    obs_list, action_list = [], []
    Q = np.zeros((state_dim, env.action_dim))
    for episode in range(num_episodes):
        preset = saved_preset[episode % len(saved_preset)]
        obs, _ = env.load_map(preset)
        obs_idx = state_iter.obs_to_index(obs)
        if np.random.random() < epsilon:
            action = np.random.choice(env.action_dim)
        else:
            action = np.argmax(Q[obs_idx, :])
        done, truncated = False, False
        # Run episode
        while not done and not truncated:
            next_obs, reward, done, truncated, _ = env.step(action)
            next_obs_idx = state_iter.obs_to_index(next_obs)
            if np.random.random() < epsilon:
                next_action = np.random.choice(env.action_dim)
            else:
                next_action = np.argmax(Q[next_obs_idx, :])
            obs_idx = state_iter.obs_to_index(obs)
            Q[obs_idx, action] = Q[obs_idx, action] + alpha*(reward + gamma*Q[next_obs_idx, next_action] - Q[obs_idx, action])
            action = next_action
            obs = next_obs
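    # Evaluate the SARSA policy greedily on each preset and record its score at the
    # same print intervals as the learner, so optimal_list and score_list align.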
    optimal_list = []
    preset_id = 0
    optimal_score = 0  # also covers the case where force_move is None and the score is never reset below
    for episode in range(num_episodes):
        if force_move and (episode+1) % force_move == 0:
            preset_id += 1
        if force_move and episode % force_move == 0:
            optimal_score = 0
        obs, _ = env.load_map(saved_preset[preset_id])
        done, truncated = False, False
        while not done and not truncated:
            obs_idx = state_iter.obs_to_index(obs)
            action = np.argmax(Q[obs_idx, :])
            obs, reward, done, truncated, _ = env.step(action)
            optimal_score += reward
        if (episode+1) % print_interval == 0 and len(optimal_list) < len(score_list):
            optimal_list.append(optimal_score)
    return episode_list, np.array(optimal_list)-np.array(score_list), pred_error_list