#!/usr/bin/env python
# coding: utf-8

import numpy as np
import cvxpy as cp
from tqdm import tqdm

from EnvGrid import Action

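# This module implements an O-REPS-style online learner for a grid MDP: the policy is
# kept as an occupancy measure over (state, action) pairs (optionally one copy per
# in-episode time step when separation=True), per-episode losses are estimated from the
# observed trajectory, and the measure is updated by a relative-entropy step whose dual
# variables are solved with cvxpy. The optimistic variants ("average", "latest", "super",
# "super_random") additionally maintain a loss predictor M_prime used inside the update.
# This summary is inferred from the code below.
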
class StateIter:
    def __init__(self, width, height):
        self.width = width
        self.height = height

    def obs_to_index(self, obs):
        # Column-major flattening of obs = (row, column); the 4-component variant is kept for reference.
        # ind = obs[0] + obs[1]*self.height + obs[2]*self.height*self.width + obs[3]*self.height*self.width*self.height
        ind = obs[0] + obs[1]*self.height
        if int(ind) != ind:
            print(ind, int(ind))
        return int(ind)

    def index_to_obs(self, index):
        temp = index
        obs = []
        obs.append(temp % self.height)
        temp = (temp - obs[-1]) / self.height
        obs.append(temp % self.width)
        temp = (temp - obs[-1]) / self.width
        # obs.append(temp%self.height)
        # temp = (temp - obs[-1])/self.height
        # obs.append(temp%self.width)
        # temp = (temp - obs[-1])/self.width
        if temp != 0:
            print(temp)
            raise Exception(f"index {index} is out of bounds for map size {self.height} x {self.width}")
        return np.array(obs, dtype=int)  # np.int was removed in recent NumPy; use the builtin int
    def get_next_index(self, index, action):
        obs = self.index_to_obs(index)
        if action == Action.LEFT:
            if obs[1] < 1:
                return index
            # return self.obs_to_index(np.array([obs[0], obs[1]-1, obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0], obs[1]-1]))
        elif action == Action.RIGHT:
            if obs[1] > self.width-2:
                return index
            # return self.obs_to_index(np.array([obs[0], obs[1]+1, obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0], obs[1]+1]))
        elif action == Action.DOWN:
            if obs[0] > self.height-2:
                return index
            # return self.obs_to_index(np.array([obs[0]+1, obs[1], obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0]+1, obs[1]]))
        elif action == Action.UP:
            if obs[0] < 1:
                return index
            # return self.obs_to_index(np.array([obs[0]-1, obs[1], obs[2], obs[3]]))
            return self.obs_to_index(np.array([obs[0]-1, obs[1]]))
        else:
            raise Exception(f"Invalid action number: {action}")

def OREPS(env, eta, num_episodes=10000, print_interval=20, gamma=1.4e-3, r=0.0, separation=False,
          force_move=None, optimistic_predictor="off", reset_period=1):
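    """Run O-REPS with an optional optimistic loss predictor on `env`.

    Parameter descriptions are best-effort readings of the code below:
      eta                  -- step size of the entropic update.
      num_episodes         -- number of training episodes.
      print_interval       -- how often running statistics are appended to the output lists.
      gamma                -- implicit-exploration offset added to the occupancy measure in
                              the loss estimator (not a discount factor).
      r                    -- unused in this version.
      separation           -- if True, keep one occupancy-measure layer per in-episode time
                              step (env.timeout layers); otherwise a single layer is shared.
      force_move           -- if set, the map is re-randomised every force_move episodes.
      optimistic_predictor -- "off", "average", "latest", "super" or "super_random".
      reset_period         -- multiplies force_move to decide when M_prime and the visitation
                              counts are cleared for the "average"/"latest" predictors.

    Returns (episode_list, baseline-minus-score regret curve, per-episode prediction errors).
    """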
    state_dim = env.config.height*env.config.width
    state_iter = StateIter(env.config.width, env.config.height)
    score, episode_length, prediction_error = 0, 0, 0
    length_list, score_list, episode_list, pred_error_list = [], [], [], []
    # For training
    obs_list, action_list, reward_list = [], [], []
    if env.state_dim != 4:
        raise Exception("OREPS only supports state_dim==4")
    if separation:
        occupancy_measure = np.ones((env.timeout, state_dim, env.action_dim))
        l_hat = np.zeros((env.timeout, state_dim, env.action_dim))
    else:
        occupancy_measure = np.ones((1, state_dim, env.action_dim))
        l_hat = np.zeros((1, state_dim, env.action_dim))
    occupancy_measure = occupancy_measure / occupancy_measure.sum()
    M_prime = np.zeros((1, state_dim, env.action_dim))
    visitation = np.zeros((state_dim, env.action_dim))
    saved_preset = []
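    # Main loop: occupancy_measure is the current policy as a distribution over
    # (layer, state, action); l_hat holds the loss estimate built from each episode;
    # M_t is the optimistic prediction used in the current update and M_prime the one
    # prepared for the next; the visitation counts support the "average" predictor.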
    for t in tqdm(range(num_episodes)):
        if t == 0:
            obs, _ = env.reset(True)
            saved_preset.append(env.save_map())
            if optimistic_predictor in ["super", "super_random"]:
                for state in range(state_dim):
                    for action in range(env.action_dim):
                        try:
                            reward = env.get_reward(state_iter.index_to_obs(state), action)
                        except:
                            import IPython
                            IPython.embed()
                            exit()
                        if optimistic_predictor == "super_random" and np.random.random() < 0.01:
                            M_prime[0, state, action] = 0
                        else:
                            M_prime[0, state, action] = max(min(-reward, 1), 0)
        else:
            obs, _ = env.reset()
        M_t = M_prime
        obs_list, action_list, reward_list = [], [], []
        done, truncated = False, False
        # Run episode
        k = 0
        while not done and not truncated:
            obs_list.append(obs)
            try:
                pi = occupancy_measure[k, state_iter.obs_to_index(obs), :]
                pi = pi / pi.sum()
                action = np.random.choice(env.action_dim, p=pi)
            except:
                import IPython
                IPython.embed()
                exit()
            obs, reward, done, truncated, _ = env.step(action)
            # if truncated:
            #     print(f"policy distribution: {pi}")
            #     import IPython
            #     IPython.embed()
            #     exit()
            action_list.append(action)
            reward_list.append(reward)
            score += reward
            episode_length += 1
            if separation:
                k += 1
        # Accumulate prediction error (c_t - M_t)
        prediction_error = 0
        for state in range(state_dim):
            for action in range(env.action_dim):
                try:
                    reward = env.get_reward(state_iter.index_to_obs(state), action)
                except:
                    import IPython
                    IPython.embed()
                    exit()
                prediction_error += max(min(-reward, 1), 0) - M_t[0, state, action]
        pred_error_list.append(prediction_error)
        # Reset env for the next episode
        if force_move and (t+1) % force_move == 0:
            obs, _ = env.reset(True)
            saved_preset.append(env.save_map())
            # Perfect M_t, M_prime
            if optimistic_predictor in ["super", "super_random"]:
                for state in range(state_dim):
                    for action in range(env.action_dim):
                        try:
                            reward = env.get_reward(state_iter.index_to_obs(state), action)
                        except:
                            import IPython
                            IPython.embed()
                            exit()
                        if optimistic_predictor == "super_random" and np.random.random() < 0.01:
                            M_prime[0, state, action] = 0
                            prediction_error += max(min(-reward, 1), 0)
                        else:
                            M_prime[0, state, action] = max(min(-reward, 1), 0)
        if separation:
            l_hat = np.zeros((env.timeout, state_dim, env.action_dim))
        else:
            l_hat = np.zeros((1, state_dim, env.action_dim))
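        # Loss estimate with optimistic correction and implicit exploration: writing the
        # observed cost as c = clip(-reward, 0, 1), each visited (state, action) gets
        #   l_hat = (c - M_t) / (occupancy_measure + gamma) + M_t,
        # which is what the three reward branches below compute; gamma keeps the estimate
        # bounded when the occupancy measure is very small.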
        for tmp_obs, action, reward in zip(obs_list, action_list, reward_list):
            tmp_idx = state_iter.obs_to_index(tmp_obs)
            if occupancy_measure[k, tmp_idx, action] <= 0:
                print(f"state: {tmp_obs}")
                print(f"action: {action}")
                print(f"reward: {reward}")
            prev_lhat = l_hat[k, tmp_idx, action]
            if reward > 0:
                if optimistic_predictor == "average":
                    M_prime[0, tmp_idx, action] = M_prime[0, tmp_idx, action]*visitation[tmp_idx, action]/(visitation[tmp_idx, action]+1)
                    visitation[tmp_idx, action] += 1
                elif optimistic_predictor == "latest":
                    M_prime[0, tmp_idx, action] = 0
                l_hat[k, tmp_idx, action] = -M_t[0, tmp_idx, action]/(occupancy_measure[k, tmp_idx, action]+gamma) + M_t[0, tmp_idx, action]
            elif reward < -1:
                if optimistic_predictor == "latest":
                    M_prime[0, tmp_idx, action] = 1
                elif optimistic_predictor == "average":
                    M_prime[0, tmp_idx, action] = (M_prime[0, tmp_idx, action]*visitation[tmp_idx, action]+1)/(visitation[tmp_idx, action]+1)
                    visitation[tmp_idx, action] += 1
                l_hat[k, tmp_idx, action] = (1-M_t[0, tmp_idx, action])/(occupancy_measure[k, tmp_idx, action]+gamma) + M_t[0, tmp_idx, action]
            else:
                if optimistic_predictor == "latest":
                    M_prime[0, tmp_idx, action] = -reward
                elif optimistic_predictor == "average":
                    M_prime[0, tmp_idx, action] = (M_prime[0, tmp_idx, action]*visitation[tmp_idx, action]-reward)/(visitation[tmp_idx, action]+1)
                    visitation[tmp_idx, action] += 1
                l_hat[k, tmp_idx, action] = (-reward-M_t[0, tmp_idx, action])/(occupancy_measure[k, tmp_idx, action]+gamma) + M_t[0, tmp_idx, action]
            if np.exp(-l_hat[k, tmp_idx, action]*eta) <= 0:
                # If the occupancy measure is too small, l_hat explodes and would drive the
                # occupancy measure to zero on the next update. In plain O-REPS gamma is 0.0
                # and the occupancy measure (the denominator) must never be zero, so when the
                # occupancy measure is too small we simply do not update the l_hat value.
                # print(f"l_hat must be finite: eta {eta}, state {tmp_idx}, action {action}, reward {reward}, occupancy measure {occupancy_measure[k,tmp_idx,action]}, M_prime: {M_prime[0,tmp_idx,action]}")
                l_hat[k, tmp_idx, action] = prev_lhat
                # return episode_list, length_list, score_list
                # raise Exception(f"l_hat must be finite: eta {eta}, state {tmp_idx}, action {action}, reward {reward}, occupancy measure {occupancy_measure[k,tmp_idx,action]}")
        if force_move and (t+1) % (force_move*reset_period) == 0 and optimistic_predictor in ["average", "latest"]:
            M_prime = np.zeros((1, state_dim, env.action_dim))
            visitation = np.zeros((state_dim, env.action_dim))
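        # Relative-entropy (mirror-descent) step: the next occupancy measure is proportional to
        #   occupancy_measure * exp(-eta*(l_hat + M_prime - M_t) + v(x) - v(x')),
        # where x' is the deterministic successor of (x, a). The dual variables v_hat (one per
        # state, or per layer and state when separation=True) are obtained by minimising the
        # log-partition objective below with cvxpy.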
        # Solve for v_hat using cvxpy
        if separation:
            v_temp = cp.Variable((l_hat.shape[0], state_dim))
            # Set objective function
            z_t = 0
            for k in range(l_hat.shape[0]-1):
                z_k = 0
                for state in range(state_dim):
                    for action in range(env.action_dim):
                        z_k += occupancy_measure[k, state, action]*cp.exp(-eta*(l_hat[k, state, action] + M_prime[0, state, action] - M_t[0, state, action]) + v_temp[k, state] - v_temp[k+1, state_iter.get_next_index(state, action)])
                z_t += cp.log(z_k)
            prob = cp.Problem(cp.Minimize(z_t))
            # prob.solve()
            prob.solve(qcp=True, solver=cp.SCS)
            v_hat = np.zeros((l_hat.shape[0], state_dim))
            for k, v_k in enumerate(v_temp.value):
                for state, value in enumerate(v_k):
                    v_hat[k, state] = value
        else:
            v_temp = cp.Variable(state_dim)
            # Set constraints
            # constraints = []
            # for state in range(state_dim):
            #     for action in range(env.action_dim):
            #         constraints.append(cp.exp(v_temp[state]-v_temp[state_iter.get_next_index(state,action)])==bellman[state,action])
            # Set objective function
            z_0 = 0
            for state in range(state_dim):
                for action in range(env.action_dim):
                    z_0 += occupancy_measure[0, state, action]*cp.exp(-eta*(l_hat[0, state, action] + M_prime[0, state, action] - M_t[0, state, action]) + v_temp[state] - v_temp[state_iter.get_next_index(state, action)])
            # prob = cp.Problem(cp.Minimize(z_0), constraints)
            prob = cp.Problem(cp.Minimize(z_0))
            # prob.solve()
            # prob.solve(qcp=True)
            prob.solve(qcp=True, solver=cp.SCS)
            v_hat = np.zeros((1, state_dim))
            for state, value in enumerate(v_temp.value):
                v_hat[0, state] = value
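        # With v_hat fixed, apply the update in closed form: first accumulate the
        # normaliser Z, then rescale every entry of the occupancy measure by exp(delta)/Z.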
        Z = 0
        for k in range(l_hat.shape[0]):
            for ind in range(state_dim):
                for action in range(env.action_dim):
                    delta = -eta*(l_hat[k, ind, action]+M_prime[0, ind, action]-M_t[0, ind, action])
                    if separation:
                        delta = delta - v_hat[k+1, state_iter.get_next_index(ind, action)] + v_hat[k, ind]
                    else:
                        delta = delta - v_hat[0, state_iter.get_next_index(ind, action)] + v_hat[0, ind]
                    Z = Z + occupancy_measure[k, ind, action]*np.exp(delta)
                    if np.isnan(Z):
                        print(f"delta: {delta}")
                        print(f"Z: {Z}")
                        print(f"np.exp(delta): {np.exp(delta)}")
                        print(f"l_hat: {l_hat[k,ind,action]}")
                        print(f"M_t: {M_t[0,ind,action]}")
                        print(f"M_prime: {M_prime[0,ind,action]}")
                        print(f"v_hat: {v_hat[k,ind]}")
                        print(f"v_hat next: {v_hat[k,state_iter.get_next_index(ind,action)]}")
                        print(f"eta: {eta}")
        for k in range(l_hat.shape[0]):
            for ind in range(state_dim):
                for action in range(env.action_dim):
                    delta = -eta*(l_hat[k, ind, action]+M_prime[0, ind, action]-M_t[0, ind, action])
                    if separation:
                        delta = delta - v_hat[k+1, state_iter.get_next_index(ind, action)] + v_hat[k, ind]
                    else:
                        delta = delta - v_hat[0, state_iter.get_next_index(ind, action)] + v_hat[0, ind]
                    occupancy_measure[k, ind, action] = occupancy_measure[k, ind, action]*np.exp(delta) / Z
                    if np.isnan(occupancy_measure[k, ind, action]):
                        print(f"delta: {delta}")
                        print(f"Z: {Z}")
                        print(f"np.exp(delta): {np.exp(delta)}")
                        print(f"l_hat: {l_hat[k,ind,action]}")
                        print(f"M_t: {M_t[0,ind,action]}")
                        print(f"M_prime: {M_prime[0,ind,action]}")
                        print(f"v_hat: {v_hat[k,ind]}")
                        print(f"v_hat next: {v_hat[k,state_iter.get_next_index(ind,action)]}")
                        print(f"eta: {eta}")
        if (t+1) % print_interval == 0:
            if len(episode_list) == 0:
                prev_length = episode_length/(t+1)
                prev_score = score/(t+1)
                # prev_pred_error = prediction_error/(t+1)
            else:
                prev_length = (prev_length*(t+1-print_interval) + episode_length)/(t+1)
                prev_score = (prev_score*(t+1-print_interval) + score)/(t+1)
                # prev_pred_error = (prev_pred_error*(t+1-print_interval) + prediction_error)/(t+1)
            length_list.append(prev_length)
            score_list.append(prev_score)
            episode_list.append(t)
            # min_q_list.append(min_q)
            # pred_error_list.append(prev_pred_error)
            score, episode_length, prediction_error = 0, 0, 0
        if np.isnan(occupancy_measure).any():
            break
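    # Baseline for the regret curve returned below: train a tabular SARSA agent on the same
    # saved maps and later replay it greedily; the second return value is the difference
    # between this baseline's score and the score collected by OREPS. The baseline is an
    # empirical comparator, not necessarily the exact optimal policy.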
    # Find a fixed optimal policy using SARSA
    epsilon = 0.1
    alpha = 0.1
    gamma = 0.9  # note: gamma is rebound here as the SARSA discount factor
    obs_list, action_list = [], []
    Q = np.zeros((state_dim, env.action_dim))
    for episode in range(num_episodes):
        preset = saved_preset[episode % len(saved_preset)]
        obs, _ = env.load_map(preset)
        obs_idx = state_iter.obs_to_index(obs)
        if np.random.random() < epsilon:
            action = np.random.choice(env.action_dim)
        else:
            action = np.argmax(Q[obs_idx, :])
        done, truncated = False, False
        # Run episode
        while not done and not truncated:
            next_obs, reward, done, truncated, _ = env.step(action)
            next_obs_idx = state_iter.obs_to_index(next_obs)
            if np.random.random() < epsilon:
                next_action = np.random.choice(env.action_dim)
            else:
                next_action = np.argmax(Q[next_obs_idx, :])
            obs_idx = state_iter.obs_to_index(obs)
            Q[obs_idx, action] = Q[obs_idx, action] + alpha*(reward + gamma*Q[next_obs_idx, next_action] - Q[obs_idx, action])
            action = next_action
            obs = next_obs
    optimal_list = []
    preset_id = 0
    optimal_score = 0  # ensure it is defined even when force_move is not set
    for episode in range(num_episodes):
        if force_move and (episode+1) % force_move == 0:
            preset_id += 1
        if force_move and episode % force_move == 0:
            optimal_score = 0
        obs, _ = env.load_map(saved_preset[preset_id])
        done, truncated = False, False
        while not done and not truncated:
            obs_idx = state_iter.obs_to_index(obs)
            action = np.argmax(Q[obs_idx, :])
            obs, reward, done, truncated, _ = env.step(action)
            optimal_score += reward
        if (episode+1) % print_interval == 0 and len(optimal_list) < len(score_list):
            optimal_list.append(optimal_score)

    return episode_list, np.array(optimal_list)-np.array(score_list), pred_error_list
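
# Hypothetical invocation sketch. The environment construction is an assumption and is not
# part of this file; any env exposing config.height/width, state_dim, action_dim, timeout,
# reset, step, get_reward, save_map and load_map in the way used above should work:
#
#   env = ...  # build the EnvGrid environment here (constructor not shown in this file)
#   episodes, regret, pred_err = OREPS(env, eta=0.5, num_episodes=2000,
#                                      force_move=100, optimistic_predictor="latest")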