# code_sharing/GPR_Optimizer.py
import numpy as np
import json
import pickle
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from skopt import Optimizer
from skopt.learning.gaussian_process.kernels import ConstantKernel, Matern, Sum, RBF, WhiteKernel
from skopt.learning import GaussianProcessRegressor
from skopt.plots import plot_evaluations, plot_convergence, plot_objective, plot_gaussian_process, plot_histogram, plot_objective_2D
from skopt import expected_minimum
from scipy.stats import multivariate_normal
import time
#from DE_Optimizer import DifferentialEvolution
#import sys
#import os
#from skopt.benchmarks import branin, hart6
#from skopt.space.space import Real, Integer
#from functools import partial
#from skopt import gp_minimize
#from skopt import callbacks
#from skopt import gp_minimize, forest_minimize, dummy_minimize
#from skopt.callbacks import CheckpointSaver
#from skopt.learning import ExtraTreesRegressor
#from joblib import Parallel, delayed
#import time
''' Future plans
Domain reduction
    Include a scale factor that only searches over a smaller region around the best point.
    Some GPR libraries slowly decrease the domain size to help with convergence.
    Need to make a new GPR class for this; the previous one can't be modified.
Differential evolution switch
    A switch that turns on differential evolution (DE). The DE algorithm will take the best
    points to create the first population, then run for a fixed number of points. This could
    be useful to run after the random points to get better points and help make a better guess
    of the hyperparameters. Or, if GPR is failing, DE can be turned on temporarily since it is
    very robust, but slow.
'''
class GPR_Optimizer(Optimizer):
    """GPR_Optimizer is based on the Optimizer class of the skopt library
    (scikit-optimize.github.io/stable).
    This class allows you to supply the measurement noise for each data point.
    1D Example:
        opt = GPR_Optimizer(dimensions=[(-2.,2.)],  # parameter space
                            length_scale=[1], length_scale_bounds=[(.01,10.)],
                            noise_level=0.1, noise_level_bounds=(0.01,1),
                            n_initial_points=10,
                            kappa=10)
        next_x = opt.ask()  # The first n_initial_points are random, then the GPR optimizer starts.
        f_val = objective(next_x)  # objective is the cost function
        res = opt.tell(next_x, f_val, error, fit=True)  # Report the measured value and its error.
    """
    def __init__(self, dimensions, length_scale, length_scale_bounds, noise_level=0.1, noise_level_bounds=[1e-3,2e-1], n_initial_points=10, x_initial=[],
                 acq_func="LCB", kappa=4*1.96, xi=4*0.01, alpha=[], normalize_y=True, sigma_value_bounds=(1e-3,1.0), sigma_value=0.5,
                 n_restarts_optimizer=0, acq_optimizer='sampling', verbose=True, config_path='optimizer_config.json', domain_reduction=1, best_point=[], hp_fit_n_points=10000):
        '''
        Initialize the GaussianProcessRegressor.
        I use RBF + WhiteKernel. The default skopt kernel is "1.0 * RBF(1.0)".
        The kernel's hyperparameters are optimized during fitting at each tell(). Another common
        option is kernel = 1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 1.0), nu=2.5),
        which is a generalization of RBF (equivalent when nu = inf) but takes longer to compute.
        alpha (default 1e-10) is the variance = noise**2 added to the diagonal of the kernel during fitting.
        alpha can be a constant, in which case it is the same for all points, or an array
        the same length as the data. Some sources say it is equivalent to white noise added
        by a WhiteKernel, but the WhiteKernel has a noise hyperparameter which
        is tuned every time opt.tell(...) is called.
        If you set noise = "gaussian", skopt adds a WhiteKernel. Below I
        add a WhiteKernel manually in order to set noise_level_bounds explicitly.
        Note also that ConstantKernel(constant_value=1**2, constant_value_bounds=constant_value_bounds)*RBF() is equivalent to 1 * RBF(),
        except that you can explicitly set the starting value and bounds, which by default are 1.0 and (1e-5, 1e5).

        Parameters
        ----------
        dimensions : list of bounds, one per parameter
            Example: [(-2.,2.), ..., ndims]
            Dimensions can be a list of (0., 1.) for reals and (0, 1) for integers. Other options such as categories are available.
        length_scale : initial length scale(s).
            Float or list of length ndims, in normalized units from 0 to 1.
        length_scale_bounds : bounds for the length scales.
            Either a single pair such as [1e-3, 1] or a list of bounds, one per parameter. In normalized units from 0 to 1.
        noise_level : float, initial value of the noise hyperparameter of the WhiteKernel, which is Gaussian noise added to the RBF kernel.
            Example: 1e-3 or None.
            If noise_level = 0 or None, then the WhiteKernel is not included and the noise comes only from alpha.
            NEW: when normalize_y = True, this is actually noise/std(y), i.e. the fractional noise relative to the std of your data.
        noise_level_bounds : bounds for the noise hyperparameter.
            Example: [1e-3, 1]
        n_initial_points : number of initial random points.
            The sampling method is determined by 'initial_point_generator' in optimizer.py.
        acq_func : acquisition function that determines the next best point.
            LCB = lower confidence bound. Uses kappa, not xi.
            PI = probability of improvement. PI doesn't care how much better predictions are than previous values, since it gives equal reward. Uses kappa, not xi.
            EI = expected improvement. Uses xi, which is the desired improvement. Does not use kappa.
            "gp_hedge" uses all three, then favors the ones that improve.
            Advice: if exploration is the priority, use LCB. If exploitation, use PI. For a balance, use EI.
        kappa : controls exploration for local optimization. High kappa (even inf) favors exploration.
            Default 1.96. kappa controls how much of the variance in the predicted value is taken into account.
            kappa is used by LCB and PI. xi is used by EI and is the improvement goal to shoot for over the previous best value.
            The code currently uses this kappa to set both. Maybe in the future make xi its own variable.
        xi : float 0 to 1, parameter for the EI acquisition function.
            Determines the desired improvement; xi=0.01 is a 1% change.
        alpha : diagonal of the covariance matrix, which is the variance = err^2.
            float or array-like, optional (default: 1e-10).
            The tell function can update a list of alphas that gives the variance of every point.
            If alpha is too small, the optimizer model fits will struggle.
            ONLY USE alpha BY ENTERING IT WITH tell(), OR NORMALIZE_Y WILL MESS IT UP.
            If using normalize_y = True, then this is the fractional variance relative to the std of your data.
        x_initial : [] or [(0,0)] for 2D. List of initial points to be taken first.
        sigma_value : initial sigma of the constant (amplitude) factor of the RBF kernel.
            The kernel is sigma^2 exp(-|x - x'|^2 / (2 l^2)); the constant_value is sigma^2. It allows larger outputs, increasing the flexibility of the GPR model. Important to set it similar to the expected scale.
        sigma_value_bounds : bounds for sigma_value (squared internally to give the constant_value bounds).
        acq_optimizer : 'lbfgs' or 'sampling'. For finding the hyperparameters.
            Takes 10,000 samples of hyperparameters, including the current setting. 'sampling' uses the best.
            'lbfgs' takes the best 'n_restarts_optimizer' points and performs gradient-based lbfgs optimization,
            then picks the best. 'lbfgs' is slow for high dimensions.
        n_restarts_optimizer : e.g. 4, see acq_optimizer. If 'lbfgs' is selected, the number of points to perform lbfgs on.
        verbose : default True, whether to print while running.
        normalize_y : default True. When normalize_y is True, the y data has its mean subtracted and is divided by its std at every call of tell(), right before new
            hyperparameters are fit. That normalization does not change the noise level, so noise_level should be interpreted as noise_level/std(y). The same would hold
            for alpha, except that I now always enter alpha=error^2 with tell(), and I have changed tell() to normalize alpha by the std as well. So just enter the error by using
            tell(x, y, y_err), where y_err is in the same units as y. If there is no noise, still put in a small value of y_err; do not use y_err=None or 0,
            since a small nonzero y_err is needed for the fitting to converge easily.
        '''
        # Check types
        dimensions = [tuple(float(num) for num in tpl) for tpl in dimensions] # make sure dimensions is a list of tuples of floats
# Save to class | |
self.dimensions = dimensions | |
self.length_scale = length_scale | |
self.length_scale_bounds = length_scale_bounds | |
self.noise_level = noise_level | |
self.noise_level_bounds = noise_level_bounds | |
self.n_initial_points = n_initial_points | |
self.x_initial = x_initial | |
self.acq_func = acq_func | |
self.kappa = kappa | |
self.xi = xi | |
self.alpha = alpha | |
self.sigma_value_bounds = sigma_value_bounds | |
self.sigma_value = sigma_value | |
self.n_restarts_optimizer = n_restarts_optimizer | |
self.acq_optimizer = acq_optimizer | |
self.verbose = verbose | |
self.hp_fit_n_points = hp_fit_n_points | |
self.domain_reduction = domain_reduction | |
self.best_point = best_point | |
self.config_path = config_path | |
# Define the constant value bounds for the amplitude parameter | |
#constant_value_bounds = (1e-3, 1e3) # Square since the kernel uses the square of the value, default = 1e-5 to 1e5 | |
# constant value is variance, so square sigmas | |
constant_value = sigma_value**2 | |
constant_value_bounds = ( sigma_value_bounds[0]**2, sigma_value_bounds[1]**2) | |
if noise_level == 0 or noise_level is None: | |
# If no noise, don't add WhiteKernel | |
kernel = ConstantKernel(constant_value=constant_value, constant_value_bounds=constant_value_bounds) * RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds) | |
print("Using no noise.") | |
else: | |
# If noise is present, add WhiteKernel | |
kernel = ConstantKernel(constant_value=constant_value, constant_value_bounds=constant_value_bounds) * RBF(length_scale=length_scale, length_scale_bounds=length_scale_bounds) + WhiteKernel(noise_level=noise_level, noise_level_bounds=noise_level_bounds) #starting hp. fitting at each run. | |
        gpr = GaussianProcessRegressor(kernel=kernel,
                alpha=alpha,
                normalize_y=normalize_y,
                # normalize_y subtracts the mean and normalizes the std of y to 1. It is done at the beginning of fit, before alpha is applied.
                # In _gpr.py of sklearn: y = (y - self._y_train_mean) / self._y_train_std
                # These can be accessed at opt.models[-1].y_train_std_
                # It is a bit odd because it adjusts the y data but does not change theta or alpha,
                # so if the normalization changes a lot, the hyperparameters and alpha have to change as well.
                # From the sklearn docs: whether the target values y are normalized, i.e., the mean of the
                # observed target values becomes zero. This parameter should be set to
                # True if the target values' mean is expected to differ considerably from
                # zero. When enabled, the normalization effectively modifies the GP's
                # prior based on the data, which contradicts the likelihood principle;
                # normalization is thus disabled per default.
                #noise="gaussian", # noise="gaussian" adds a WhiteKernel, but I added it manually. "off" doesn't work, so leave it out entirely.
                n_restarts_optimizer=n_restarts_optimizer
                # The number of restarts of the optimizer for finding the kernel's hyperparameters. The first run
                # of the optimizer is performed from the kernel's initial parameters,
                # the remaining ones (if any) from thetas sampled log-uniform randomly
                # from the space of allowed theta-values. If greater than 0, all bounds
                # must be finite. Note that n_restarts_optimizer == 0 implies that one
                # run is performed.
                )
        # If domain_reduction is not 1 and best_point is not empty, set dimensions to the reduced dimensions right here!
super().__init__( | |
dimensions, # dimensions, | |
base_estimator= gpr, #"GP", #gpr, # default "GP" # base estimator, also GP, RF, ET, GBRT | |
n_initial_points=n_initial_points, #default 10 | |
acq_optimizer=acq_optimizer, #'sampling', 'lbfgs', or auto (which chooses between two). 'lbfgs' is slow for high dim and many points | |
initial_point_generator="lhs", # random (default), sobol, halton, hammersly, lhs (latin hypercube sampling), grid. See optimizer.py for more details. | |
acq_func=acq_func, #(randomly choose LCB, EI, or PI) #"gp_hedge", "LCB", "EI", "PI". | |
acq_func_kwargs = {"xi":xi, "kappa":kappa}, # kappa for LCB, xi for EI. | |
            acq_optimizer_kwargs = {"n_points":hp_fit_n_points, "n_jobs":2} # settings to speed up tell(). n_jobs sets the number of jobs in Parallel. Sampling of the hp space is always done with 'n_points' points. If lbfgs, then the 'n_restarts_optimizer' best of those points are used as starts for 20 iterations of lbfgs optimization. #"n_restarts_optimizer":1,
) | |
        self.error = [] # List of errors of the measured points. Must be a float or an array with the same length as the number of data points.
        self.x_initial = x_initial # initial points to run
        self.verbose = verbose # Whether to print statements
        self.mean_for_y_norm = 0 # NOT USING. Mean and std for normalizing all the y data, so that you don't have to change the ConstantKernel value.
        self.std_for_y_norm = 1 # NOT USING
    def ask(self, *args):
        ''' Same as the Optimizer class.
        Obtains the next point. If fewer than n_initial_points have been taken, the point comes from the initial_point_generator.
        Otherwise the acq_func (LCB, EI, or PI) is used to find the next point.
        The "gp_hedge" method uses all three methods with gains based on which improves.
        If called twice in a row, it gives the same value until tell() is called.
        '''
        # Run points from x_initial until the list is empty. Removes each element of x_initial as it is used.
        if self.x_initial: # Check if the list is not empty
            next_x = self.x_initial.pop(0) # Remove and get the first element
            # first_element is now the removed element
            # x_initial is now the list without its first element
            print("Removed element:", next_x)
        else:
            next_x = super().ask(*args) # very fast
            #print("next_x: ", next_x)
        return next_x
    def tell(self, next_x, f_val, error=None, fit=True):
        ''' Reports a measured value. Inherits from the Optimizer class, but also allows you to put in point-dependent noise, e.g. if the noise differs
        between points or you know the noise from repeated measurements.
        next_x : list, parameters of the measured point
        f_val : measured value
        error : if an error is included, it is used to define a point-specific alpha. If error=0, then all the noise is in the initial alpha parameter and the WhiteKernel.
        fit : whether to refit the hyperparameters at the end
        Note: tell() is the most time-consuming call. It can be sped up by reducing 'n_restarts_optimizer' and maybe the sampling method.
        If normalize_y = True, then gpr.py will normalize the y data by subtracting the mean and dividing by the std every time tell() is called.
        You can access the std and mean from the last iteration with self.models[-1].y_train_std_
        It does not normalize the noise or alpha. I have set up alpha such that alpha = error^2 / std^2, so we can keep the error
        in the units of the cost that we enter.
        '''
# normalize | |
f_val = self.normalize_y(f_val) | |
if error is not None: | |
error = error/self.std_for_y_norm # normalize | |
        # update alpha
        if error is not None:
            self.error = np.append(self.error, error)
            # if self.models: # check if not empty
            #     std = self.models[-1].y_train_std_ # the last used standard deviation of the y data
            # else:
            if len(self.yi) > 3:
                std = np.std(self.yi) # standard deviation of the y data. Used in tell() in gpr.py to normalize the data.
                self.base_estimator_.alpha = self.error**2 / std**2 # use this to update alpha during the scan. The fit always clones the base estimator.
        #self.update_next() # use this if x differs from the latest ask value.
start_time = time.time() | |
res = super().tell(next_x, f_val, fit=fit) | |
end_time = time.time() | |
if np.abs(end_time - start_time) > 1: | |
print("elapsed time for tell: ", end_time - start_time) | |
if len(self.models) != 0: | |
            hp = np.exp(self.models[-1].kernel_.theta) # get the latest hyperparameters as a list. Note that the y data is normalized, so alpha and the noise level are normalized by the std.
likelihood = self.models[-1].log_marginal_likelihood_value_ # float, log-marginal likelihood of hyperparameters | |
if self.verbose: | |
formatted_hp = [f"{x:.2e}" for x in hp] | |
formatted_likelihood = f"{likelihood:.1f}" | |
print("[hp]: ", formatted_hp, ' LML: ', formatted_likelihood) | |
return res | |
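    # Sketch of how a per-point error bar enters the model (comment only; the names
    # y_measured and y_err below are placeholders, not attributes of this class):
    #   opt.tell(next_x, y_measured, error=y_err)
    # internally stores y_err and, once a few points exist, sets
    #   base_estimator_.alpha = error**2 / std(y)**2
    # so the GPR sees a per-point variance in the same normalized units as the y data.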
def normalize_y(self, y, inverse=False): | |
''' Subtract mean and divide by std in order to make y data have std = 1, and mean = 0. | |
# This is so ConstantValue can always be close to 1. ''' | |
mean = self.mean_for_y_norm | |
std = self.std_for_y_norm | |
if inverse is False: | |
return (y - mean)/std | |
if inverse is True: | |
return y*std + mean | |
############# PLOTTING ############################### | |
def expected_minimum(self, **kwargs): | |
'''Options: n_random_starts : int, default=20 | |
The number of random starts for the minimization of the surrogate | |
model.''' | |
        # Calculate the expected minimum of the model
x_exp, best_fun = expected_minimum(self.get_result(), **kwargs) | |
return x_exp, best_fun | |
    def get_hyperparameters(self):
        ''' Get a list of all hyperparameters and likelihoods for all models.
        theta is np.log(hyperparameter), so I take np.exp(theta).
        The hyperparameters are (annoyingly) in opt.models[i].kernel_.theta
        To see the latest kernel with its parameters: opt.models[-1].kernel_
        This shows you what each hyperparameter is.
        To see just the values, use np.exp(opt.models[-1].kernel_.theta)
        I think these parameters are arranged left to right as in print(opt.models[-1].kernel_)
        The magnitude of the LML isn't necessarily meaningful on its own; it is mainly useful for comparison.
        LML values are typically negative: the probability is between 0 and 1, and taking the log gives a negative number.
        '''
        opt = self
        hp = [np.exp(opt.models[i].kernel_.theta) for i in range(len(opt.models))] # there are n_iter - n_initial_points + 1 models.
        likelihood = [opt.models[i].log_marginal_likelihood_value_ for i in range(len(opt.models))]
        print(opt.models[-1].kernel_) # Print the latest kernel
        return hp, likelihood
def plot_hyperparameters(self): | |
''' plot the hyperparameters vs iteration''' | |
hp, likelihood = self.get_hyperparameters() | |
# create an array of indices from 0 to the length of the list | |
lst = hp | |
x = np.arange(len(lst)) | |
# loop over the number of elements in each array (assuming they are all the same length) | |
for i in range(len(lst[0])): | |
# create an empty array to store the ith element of each array | |
y = np.empty(len(lst)) | |
# loop over the list of arrays and fill the y array with the ith element | |
for j in range(len(lst)): | |
y[j] = lst[j][i] | |
# plot x vs y with a label | |
if i==0: | |
# Constant value hyperparameter | |
label='sigma' # this is sqrt of ConstantValue | |
y = np.sqrt(np.abs(y)) # | |
elif i==len(lst[0])-1 and self.noise_level is not None: | |
label='noise' | |
#y = y*self.std_for_y_norm | |
else: | |
label = f'length scale {i+1-1}' | |
plt.plot(x, y, label=label) | |
plt.yscale('log') | |
plt.xlabel('run (after initial)') | |
plt.ylabel('hyperparameter') | |
plt.title(str(self.models[-1].kernel_)) | |
plt.grid() | |
# add a legend and show the plot | |
plt.legend() | |
plt.show() | |
def predict(): | |
''' add later ''' | |
# Predict output for X. | |
# In addition to the mean of the predictive distribution, also its | |
# standard deviation (return_std=True) or covariance (return_cov=True), | |
# the gradient of the mean and the standard-deviation with respect to X | |
# can be optionally provided. | |
return 0 | |
    def plot_LML(self):
        ''' Plots the log-marginal likelihood as a function of noise level and length scale.
        Only works for 1D.
        This was taken from a scikit-learn example.'''
opt = self | |
hp_final = np.exp(opt.models[-1].kernel_.theta) | |
print("final noise parameters:", hp_final ) | |
gpr = opt.models[-1] | |
print(gpr.kernel_) | |
length_scale = np.logspace(-2, 2, num=50) | |
noise_level = np.logspace(-2, 1, num=50) | |
length_scale_grid, noise_level_grid = np.meshgrid(length_scale, noise_level) | |
log_marginal_likelihood = [ | |
gpr.log_marginal_likelihood(theta=np.log([hp_final[0], scale, noise])) # insert first hp here (amp of RBF) | |
for scale, noise in zip(length_scale_grid.ravel(), noise_level_grid.ravel()) | |
] | |
log_marginal_likelihood = np.reshape( | |
log_marginal_likelihood, newshape=noise_level_grid.shape | |
) | |
vmin, vmax = (-log_marginal_likelihood).min(), 4*50 # set max height here. | |
level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), num=50), decimals=1) | |
plt.contour( | |
length_scale_grid, | |
noise_level_grid, | |
-log_marginal_likelihood, | |
levels=level, | |
norm=LogNorm(vmin=vmin, vmax=vmax), | |
) | |
plt.colorbar() | |
plt.xscale("log") | |
plt.yscale("log") | |
plt.xlabel("Length-scale") | |
plt.ylabel("Noise-level") | |
plt.title("Log-marginal-likelihood") | |
plt.show() | |
def plot_convergence(self, **kwargs): | |
# from skopt.plots | |
# if normalize_y = True, this is still in unnormalized units | |
res = self.get_result() | |
plot_convergence(res, **kwargs) # plot at every iteration the best point. I want to plot each point. | |
def plot_objective(self, **kwargs): | |
        '''
        Partial dependence plot. See https://scikit-optimize.github.io/stable/auto_examples/plots/partial-dependence-plot.html#sphx-glr-auto-examples-plots-partial-dependence-plot-py
        Partial dependence plots average over the other dimensions rather than showing cross-sections, so this is not the same thing as plotting the cross-sections.
        It shows how the objective function changes with the values of one or two hyperparameters, while marginalizing over the values of all other hyperparameters. This is useful for understanding the relationship between the hyperparameters and the objective function.
        This function takes a long time for > 3 dimensions; see the options below to speed it up. Calculating partial dependence is expensive.
        Objective Function Minimum: the lowest value in the partial dependence plot represents the minimum value of the objective function with respect to the hyperparameter(s) being plotted, while averaging out the effects of the other hyperparameters. This value indicates the most optimal (lowest) average outcome for the given range(s) of the hyperparameter(s) in question.
        Best Average Performance: in hyperparameter optimization, this minimum value suggests the best average performance that can be expected from the model when the hyperparameter(s) is/are set within the specific range(s) shown in the plot. This is particularly useful when you want to understand how sensitive the model's performance is to changes in one or two specific hyperparameters.
        Not Necessarily the Global Optimum: it's important to note that this value does not necessarily represent the global optimum of the entire hyperparameter space. Partial dependence plots marginalize over other dimensions, meaning they average out the effects of other hyperparameters. Therefore, the lowest value in these plots is more about understanding the average behavior of the model in relation to specific hyperparameters rather than pinpointing the exact best hyperparameter combination.
def plot_objective(result, levels=10, n_points=40, n_samples=250, size=2, | |
zscale='linear', dimensions=None, sample_source='random', | |
minimum='result', n_minimum_search=None, plot_dims=None, | |
show_points=True, cmap='viridis_r'): | |
Plot a 2-d matrix with so-called Partial Dependence plots | |
of the objective function. This shows the influence of each | |
search-space dimension on the objective function. | |
This uses the last fitted model for estimating the objective function. | |
The diagonal shows the effect of a single dimension on the | |
objective function, while the plots below the diagonal show | |
the effect on the objective function when varying two dimensions. | |
The Partial Dependence is calculated by averaging the objective value | |
for a number of random samples in the search-space, | |
while keeping one or two dimensions fixed at regular intervals. This | |
averages out the effect of varying the other dimensions and shows | |
the influence of one or two dimensions on the objective function. | |
Also shown are small black dots for the points that were sampled | |
during optimization. | |
A red star indicates per default the best observed minimum, but | |
this can be changed by changing argument ´minimum´. | |
.. note:: | |
The Partial Dependence plot is only an estimation of the surrogate | |
model which in turn is only an estimation of the true objective | |
function that has been optimized. This means the plots show | |
an "estimate of an estimate" and may therefore be quite imprecise, | |
especially if few samples have been collected during the | |
optimization | |
(e.g. less than 100-200 samples), and in regions of the search-space | |
that have been sparsely sampled (e.g. regions away from the optimum). | |
This means that the plots may change each time you run the | |
optimization and they should not be considered completely reliable. | |
These compromises are necessary because we cannot evaluate the | |
expensive objective function in order to plot it, so we have to use | |
the cheaper surrogate model to plot its contour. And in order to | |
show search-spaces with 3 dimensions or more in a 2-dimensional | |
plot, | |
we further need to map those dimensions to only 2-dimensions using | |
the Partial Dependence, which also causes distortions in the plots. | |
Parameters | |
---------- | |
result : `OptimizeResult` | |
The optimization results from calling e.g. `gp_minimize()`. | |
levels : int, default=10 | |
Number of levels to draw on the contour plot, passed directly | |
to `plt.contourf()`. | |
n_points : int, default=40 | |
Number of points at which to evaluate the partial dependence | |
along each dimension. | |
n_samples : int, default=250 | |
Number of samples to use for averaging the model function | |
at each of the `n_points` when `sample_method` is set to 'random'. | |
size : float, default=2 | |
Height (in inches) of each facet. | |
zscale : str, default='linear' | |
Scale to use for the z axis of the contour plots. Either 'linear' | |
or 'log'. | |
dimensions : list of str, default=None | |
Labels of the dimension | |
variables. `None` defaults to `space.dimensions[i].name`, or | |
if also `None` to `['X_0', 'X_1', ..]`. | |
plot_dims : list of str and int, default=None | |
List of dimension names or dimension indices from the | |
search-space dimensions to be included in the plot. | |
If `None` then use all dimensions except constant ones | |
from the search-space. | |
sample_source : str or list of floats, default='random' | |
Defines to samples generation to use for averaging the model function | |
at each of the `n_points`. | |
A partial dependence plot is only generated, when `sample_source` | |
is set to 'random' and `n_samples` is sufficient. | |
`sample_source` can also be a list of | |
floats, which is then used for averaging. | |
Valid strings: | |
- 'random' - `n_samples` random samples will used | |
- 'result' - Use only the best observed parameters | |
- 'expected_minimum' - Parameters that gives the best | |
minimum Calculated using scipy's minimize method. | |
This method currently does not work with categorical values. | |
- 'expected_minimum_random' - Parameters that gives the | |
best minimum when using naive random sampling. | |
Works with categorical values. | |
minimum : str or list of floats, default = 'result' | |
Defines the values for the red points in the plots. | |
Valid strings: | |
- 'result' - Use best observed parameters | |
- 'expected_minimum' - Parameters that gives the best | |
minimum Calculated using scipy's minimize method. | |
This method currently does not work with categorical values. | |
- 'expected_minimum_random' - Parameters that gives the | |
best minimum when using naive random sampling. | |
Works with categorical values | |
n_minimum_search : int, default = None | |
Determines how many points should be evaluated | |
to find the minimum when using 'expected_minimum' or | |
'expected_minimum_random'. Parameter is used when | |
`sample_source` and/or `minimum` is set to | |
'expected_minimum' or 'expected_minimum_random'. | |
show_points: bool, default = True | |
Choose whether to show evaluated points in the | |
contour plots. | |
cmap: str or Colormap, default = 'viridis_r' | |
Color map for contour plots. Passed directly to | |
`plt.contourf()` | |
Returns | |
------- | |
ax : `Matplotlib.Axes` | |
A 2-d matrix of Axes-objects with the sub-plots. | |
''' | |
# from skopt.plots | |
# if normalize_y = True, this is still in unnormalized units | |
res = self.get_result() | |
plot_objective(res, **kwargs) # plot at every iteration the best point. I want to plot each point. | |
def plot_histogram(self, **kwargs): | |
# from skopt.plots | |
# if normalize_y = True, this is still in unnormalized units | |
res = self.get_result() | |
plot_histogram(res, **kwargs) # plot at every iteration the best point. I want to plot each point. | |
def plot_gaussian_process(self, **kwargs): | |
# from skopt.plots | |
res = self.get_result() | |
#plot_gaussian_process(res) # plot at every iteration the best point. I want to plot each point. | |
plot_gaussian_process(res, #objective=objective, | |
#noise_level=0.001, | |
show_next_point=False, | |
show_acq_func=True, **kwargs) | |
def plot_evaluations(self, **kwargs): | |
# from skopt.plots | |
res = self.get_result() | |
plot_evaluations(res, **kwargs) # plot at every iteration the best point. I want to plot each point. | |
    def plot_cross_section(self, xpoint=None, best_type='measured'):
        ''' Plot the cross-sections through the best measured point. Still working on this.
        plot_objective plots the partial dependence; sometimes I just want to see a line for each parameter
        through the best point.
        xpoint : point at which to plot the cross-sections, in the units of dimensions.
        Later, make it possible to use either the best measured or the best predicted point.
        '''
# Get the latest gpr model | |
res = self.get_result() | |
gpr_model = res.models[-1] | |
# Create a list of bounds for each dimension | |
bounds_list = [dim.bounds for dim in self.space.dimensions] | |
num_params = len(bounds_list) | |
bounds_list_norm = [(0.0, 1.0)]*num_params # because predict(x) accepts 0 to 1 | |
        # The best point needs to be converted to (0, 1) coordinates for each dimension.
        # Then bounds_list_norm is just (0, 1) for all dimensions.
        if xpoint is None:
            if best_type=='measured':
                x = res.x # Best point found by the optimizer [x0, x1, ....]
            elif best_type=='model':
                x, _ = self.expected_minimum() # use the minimum of the model, not the measured minimum
        else:
            x = xpoint # manually entered point
# Convert x to normalized 0 to 1 coordinates, because predict(x) accepts that | |
x_norm = [(value - min_bound) / (max_bound - min_bound) for value, (min_bound, max_bound) in zip(x, bounds_list)] | |
for i, bounds in enumerate(bounds_list_norm): | |
# Create a grid of values for the current hyperparameter | |
#x_grid = np.linspace(bounds[0], bounds[1], num=100) # gpr.predict(x) requires x from 0 to 1 | |
x_grid = np.linspace(0, 1, num=100) # range has to be 0 to 1 for every parameter | |
# Prepare the input for the model | |
x_input = np.tile(x_norm, (len(x_grid), 1)) # Create a matrix where each row is the point x | |
x_input[:, i] = x_grid # Replace the ith column with the grid values | |
# Predict using the GPR model | |
y_pred, sigma = gpr_model.predict(x_input, return_std=True) # gpr.predict(x) requires x from 0 to 1 | |
# Plotting | |
plt.plot(x_grid, y_pred, label=f'variable {i + 1}') | |
plt.fill_between(x_grid, y_pred - sigma, y_pred + sigma, alpha=0.2) | |
plt.xlabel('Parameter (normalized to (0,1)) ') | |
plt.ylabel('GPR model estimate') | |
plt.title('Cross-sections through the best point') | |
plt.legend() | |
plt.show() | |
def plot_2D_cross_sections(self): | |
""" | |
Plot 2D cross-sections through the optimization space. | |
This function generates 2D cross-section plots through the best point found by the optimizer. | |
If the optimization space consists of only two parameters, a contour plot of the entire space is shown. | |
For more than two parameters, the function plots cross-sections for each pair of parameters, | |
holding the other parameters at their values in the best point. | |
""" | |
figures=[] | |
# Get the latest gpr model | |
res = self.get_result() | |
gpr_model = res.models[-1] | |
x_best = res.x | |
bounds_list = [dim.bounds for dim in self.space.dimensions] | |
# Function to normalize a point | |
def normalize_point(point, bounds_list): | |
return [(x - low) / (high - low) for x, (low, high) in zip(point, bounds_list)] | |
# Normalize the best point | |
x_best_norm = normalize_point(x_best, bounds_list) | |
# Create the plots | |
num_params = len(x_best) | |
if num_params == 2: | |
# Just plot the entire space for 2 parameters | |
x_grid, y_grid = np.meshgrid( | |
np.linspace(0, 1, 100), | |
np.linspace(0, 1, 100) | |
) | |
xy_grid = np.vstack([x_grid.ravel(), y_grid.ravel()]).T | |
predictions, sigma = gpr_model.predict(xy_grid, return_std=True) | |
# Reshape for plotting | |
predictions = predictions.reshape(x_grid.shape) | |
sigma = sigma.reshape(x_grid.shape) | |
# Plot | |
plt.figure() | |
cp = plt.contourf(x_grid, y_grid, predictions, alpha=0.7) | |
plt.colorbar(cp) | |
plt.title('2D Cross-section of the entire space') | |
plt.xlabel('Parameter 1') | |
plt.ylabel('Parameter 2') | |
plt.show() | |
else: | |
# Plot 2D cross-sections for each pair of parameters, holding other parameters at best | |
for i in range(num_params): | |
for j in range(i+1, num_params): | |
x_grid, y_grid = np.meshgrid( | |
np.linspace(0, 1, 100), | |
np.linspace(0, 1, 100) | |
) | |
# Create a grid for predictions | |
grid = np.tile(x_best_norm, (100 * 100, 1)) | |
grid[:, i] = x_grid.ravel() | |
grid[:, j] = y_grid.ravel() | |
# Make predictions | |
predictions, sigma = gpr_model.predict(grid, return_std=True) | |
# Reshape for plotting | |
predictions = predictions.reshape(x_grid.shape) | |
sigma = sigma.reshape(x_grid.shape) | |
# Plot | |
plt.figure() | |
cp = plt.contourf(x_grid, y_grid, predictions, alpha=0.7) | |
plt.colorbar(cp) | |
plt.title(f'2D Cross-section between Parameter {i+1} and Parameter {j+1}') | |
plt.xlabel(f'Parameter {i+1}') | |
plt.ylabel(f'Parameter {j+1}') | |
plt.show() | |
return figures | |
def save(self, file = 'hloop.pkl'): | |
with open(file, 'wb') as f: | |
pickle.dump(self, f) | |
def load(self, file): | |
with open(file, 'rb') as f: | |
hloop_restored = pickle.load(f) | |
return hloop_restored | |
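    # Save/restore sketch (comment only). Pickling the whole optimizer keeps Xi, yi and the
    # fitted models, so a run can be resumed later:
    #   opt.save('hloop.pkl')
    #   opt_restored = opt.load('hloop.pkl')   # returns a new GPR_Optimizer instance
    #   next_x = opt_restored.ask()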
def write_to_file(self, config_path='optimizer_config.json'): | |
''' | |
Write json file with gpr parameters that can be adjusted. | |
''' | |
config = { | |
'parameters': { | |
'length_scale': self.length_scale, | |
'length_scale_bounds': self.length_scale_bounds, | |
'dimensions': self.dimensions, | |
'noise_level': self.noise_level, | |
'noise_level_bounds': self.noise_level_bounds, | |
'sigma_value':self.sigma_value, | |
'sigma_value_bounds':self.sigma_value_bounds, | |
                'alpha': self.alpha,
                'kappa': self.kappa,
                'xi': self.xi,
                'domain_reduction': self.domain_reduction  # load_optimizer_from_file() reads this key, so write it as well
}, | |
'data': { | |
'x_list': self.Xi, | |
'y_list': self.yi | |
} | |
} | |
with open(config_path, 'w') as file: | |
json.dump(config, file, indent=4) | |
def update_file(self, config_path='optimizer_config.json'): | |
''' Why do we need write_to_file and update_file? - Jon''' | |
self.write_to_file(config_path) | |
def check_file_format(self, config_path='optimizer_config.json'): | |
'''Checks if the JSON file can be read. ''' | |
try: | |
with open(config_path, 'r') as file: | |
data = json.load(file) | |
return True | |
except Exception as e: | |
print(f"Error in file format: {e}") | |
return False | |
def load_optimizer_from_file(self, config_path='optimizer_config.json'): | |
""" | |
Loads a GPR (Gaussian Process Regression) Optimizer configuration from a JSON file and initializes the optimizer with these settings. | |
This function reads a JSON configuration file that specifies the parameters for a GPR Optimizer. The JSON file should contain two main sections: 'parameters', which includes all the necessary parameters for the GPR Optimizer, and 'data', which includes the data points (x_list and y_list) to be used by the optimizer. | |
Parameters: | |
- config_path (str): The path to the JSON configuration file. Defaults to 'optimizer_config.json'. | |
Returns: | |
- opt: An instance of the GPR_Optimizer class, initialized with the parameters and data from the configuration file. | |
Example usage: | |
optimizer = MyClass() # Assuming this method is part of MyClass | |
opt = optimizer.load_optimizer_from_file("path/to/config.json") | |
Note: | |
- The function prints "loading config..." to the console once the configuration is successfully loaded. | |
- Ensure the JSON file exists at the specified path and adheres to the expected format. | |
""" | |
with open(config_path, 'r') as file: | |
config = json.load(file) | |
        # Also allow for domain reduction here.
        # If domain_reduction changed, then define best_point with self.expected_minimum or Xi[argmin(self.yi)],
        # then define dimensions centered around that:
        # take the dimensions from config['parameters']['dimensions'], then find the new dimensions.
opt = GPR_Optimizer( | |
dimensions=config['parameters']['dimensions'], | |
length_scale=config['parameters']['length_scale'], | |
length_scale_bounds=config['parameters']['length_scale_bounds'], | |
noise_level=config['parameters']['noise_level'], | |
noise_level_bounds=config['parameters']['noise_level_bounds'], | |
sigma_value=config['parameters']['sigma_value'], | |
sigma_value_bounds= config['parameters']['sigma_value_bounds'], | |
alpha=config['parameters']['alpha'], | |
kappa=config['parameters']['kappa'], | |
xi=config['parameters']['xi'], | |
n_initial_points=1, # turn off randomly sampled initial points. Test it being 0. | |
acq_func = self.acq_func, | |
normalize_y = self.normalize_y, | |
n_restarts_optimizer = self.n_restarts_optimizer, | |
acq_optimizer = self.acq_optimizer, | |
verbose = self.verbose, | |
config_path = self.config_path, | |
domain_reduction = config['parameters']['domain_reduction'], | |
best_point = [] , | |
hp_fit_n_points = self.hp_fit_n_points | |
) | |
# Reload the data points | |
opt.Xi=config['data']['x_list'] | |
opt.yi=config['data']['y_list'] | |
print("loading config...") | |
# Filter out Xi, yi that are outside new bounds. Also need to do for domain_reduction | |
# if config['parameters']['dimensions'] is not the same as self.dimensions, | |
# then filter out Xi,yi data. Will want to keep data. opt.Xi_all and opt.Yi_all. | |
# What if I use tell( list) here, will that naturally do the filtering? | |
return opt | |
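    # Config round-trip sketch (comment only), assuming the JSON layout written by write_to_file(),
    # including a 'domain_reduction' entry in 'parameters':
    #   opt.write_to_file('optimizer_config.json')        # dump parameters plus Xi/yi
    #   new_opt = opt.load_optimizer_from_file('optimizer_config.json')
    #   next_x = new_opt.ask()                            # continues from the reloaded data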
######### BENCHMARKING ############################### | |
def calculate_distances(points, single_point): | |
""" | |
Calculate the Euclidean distances from a list of points to a single point. | |
:param points: List of points (each point is a list of coordinates). | |
:param single_point: A single point (list of coordinates). | |
:return: List of distances. | |
""" | |
# Convert lists to numpy arrays for efficient computation | |
points_array = np.array(points) | |
single_point_array = np.array(single_point) | |
diff = points_array - single_point_array # Calculate the differences and square them | |
squared_diff = diff ** 2 | |
distances = np.sqrt(np.sum(squared_diff, axis=1)) # Sum the squared differences and take the square root | |
return distances | |
def multivariate_gaussian_signal(x, ndims=1, amplitude=1, sigma=0.1, noisetype=None, background_noise=None, N_gaussian = 1): | |
'''Simulates noisy data using multivariate distribution with covariance matrix. | |
    Gives a single peak. The shape of the peak is given by the covariance matrix.
    Noise options:
        shot noise (Poisson distribution),
        Gaussian noise proportional to sqrt(signal)/N_gaussian, for example from adding several Poissonian signals together,
        constant Gaussian noise with amplitude 'background_noise'.
    Parameters :
        x : list of points
        ndims : 1-5, number of dimensions
        amplitude : amplitude of the Gaussian, i.e. its max value.
        sigma : 0.1, standard deviation; FWHM = 2.35*sigma
        noisetype : None / 'shot' / 'gaussian'
        background_noise : None, or e.g. 1; constant Gaussian noise amplitude.
        N_gaussian : e.g. 10; if noisetype = 'gaussian', the number of averages.
    Output :
        negative float (negated so that it works with a minimizer).
    '''
# 1D Gaussian objective, shot noise | |
if 1: | |
# Peak at center with no covariance | |
mean = np.zeros(ndims) # Assuming the mean vector is zero | |
covariance_matrix = np.diag(sigma**2 * np.ones(ndims)) | |
elif 0: | |
# Randomize means and covariance | |
mean = np.random.randn(ndims) # Assuming the mean vector is zero | |
A = np.random.randn(ndims, ndims) | |
covariance_matrix = sigma**2 * np.dot(A, A.T) # to make a positive semi-definite matrix, which is what a covariance matrix must be. | |
normalizing_factor = 1 / np.sqrt((2 * np.pi) ** ndims * np.linalg.det(covariance_matrix)) # normalization to make max 1 | |
val = multivariate_normal(mean, covariance_matrix).pdf(x) / normalizing_factor | |
val = amplitude * val | |
if noisetype=='shot': | |
# Generate random number from Poisson distribution with mean of 'val' | |
n = 1 # Number of points to generate | |
mu = np.abs(val) # Mean of the Poisson distribution | |
val = np.random.poisson(mu, n)[0] | |
elif noisetype=='gaussian': | |
# Gaussian noise proportional to sqrt(signal)/N | |
# This for example would model the average of N shot noise signals, each with a max signal of amplitude. | |
val = val + np.sqrt(np.abs(val))/ N_gaussian * np.random.randn() | |
if background_noise is not None: | |
# Constant gaussian noise | |
val = val + background_noise * np.random.randn() | |
val = np.abs(val) | |
return -val # because minimizing | |
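    # Benchmark sketch (comment only): use the simulated Gaussian peak as a test objective, e.g.
    #   x = [0.05, -0.02]
    #   y = GPR_Optimizer.multivariate_gaussian_signal(x, ndims=2, amplitude=100, sigma=0.1, noisetype='shot')
    # which returns the negative (noisy) signal at x, suitable for a minimizer.
    # If these benchmark helpers are defined at module level rather than in the class, drop the class prefix.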
    def ci_wilson(self, k, n, z=1.96, eqmode=0):
        ''' Calculates the binomial proportion confidence interval using the Wilson score method without continuity correction.
        # Equations for eqmode == 1 from: https://en.wikipedia.org/w/index.php?title=Binomial_proportion_confidence_interval&oldid=1101942017#Wilson_score_interval
        # Equations for eqmode == 0 from: https://www.evanmiller.org/how-not-to-sort-by-average-rating.html
        # The results should be close to:
        #   from statsmodels.stats.proportion import proportion_confint
        #   proportion_confint(k, n, alpha=0.05, method='wilson')
        # z = 1.44 gives roughly an 85% interval, z = 1.96 gives 95% '''
if n == 0: | |
return 0 | |
p_hat = float(k) / n #mean | |
z2 = z**2 | |
if eqmode == 0: | |
ci_l = (p_hat + z2/(2*n) - z*np.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n) | |
else: | |
ci_l = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) - (z / (1 + z2/n)) * np.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2))))) | |
if eqmode == 0: | |
ci_u = (p_hat + z2/(2*n) + z*np.sqrt(max(0.0, (p_hat*(1 - p_hat) + z2/(4*n))/n))) / (1 + z2 / n) | |
else: | |
ci_u = (1.0 / (1.0 + z2/n)) * (p_hat + z2/(2*n)) + (z / (1 + z2/n)) * np.sqrt(max(0.0, (p_hat*(1 - p_hat)/n + z2/(4*(n**2))))) | |
return [ci_l, ci_u] | |
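    # Example (comment only): on a GPR_Optimizer instance opt, k = 8 successes out of n = 10 gives p_hat = 0.8, and
    #   opt.ci_wilson(8, 10, z=1.96)   # -> roughly [0.49, 0.94]
    # i.e. a 95% Wilson interval that is asymmetric around p_hat, as expected for p near 1.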
def binary_simulation(self, mu, Nruns): | |
''' Simulates binary data with Nruns of either 0 or 1 and mean mu. | |
mu: between 0 and 1 | |
Nruns: number of runs | |
# Outputs simulated mean, Wilson 86% confint [lcb, ucb], and the max sigma.''' | |
data = np.random.binomial(1, mu, Nruns) | |
mean = np.mean(data) | |
num_ones = np.count_nonzero(data == 1) | |
confint = self.ci_wilson(num_ones,data.size, z=1.44) # 1.44 for 86% conf, 1.96 for 95% | |
sigma = np.max(abs(mean - confint)) # take max of lc and uc bounds | |
return mean, sigma, confint | |
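# Minimal self-test sketch (not part of the original module): optimize a noisy 2D quadratic.
# The objective below is a stand-in for a real experiment; adjust the bounds, length scales and
# noise levels to your own problem before trusting the result.
if __name__ == "__main__":
    def noisy_quadratic(x, noise=0.05):
        # Simple bowl with its minimum at (0.5, -0.3) plus Gaussian measurement noise.
        value = (x[0] - 0.5)**2 + (x[1] + 0.3)**2 + noise * np.random.randn()
        return value, noise

    opt = GPR_Optimizer(dimensions=[(-2., 2.), (-2., 2.)],
                        length_scale=[0.3, 0.3],
                        length_scale_bounds=[(0.01, 1.), (0.01, 1.)],
                        noise_level=0.1, noise_level_bounds=(1e-3, 1.),
                        n_initial_points=10, kappa=1.96)
    for _ in range(40):
        next_x = opt.ask()
        f_val, f_err = noisy_quadratic(next_x)
        opt.tell(next_x, f_val, error=f_err)  # report the value and its error bar
    res = opt.get_result()
    print("best measured point:", res.x, "value:", res.fun)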