From 4768241dba1bae5d693fc68e9d4047635c100ff8 Mon Sep 17 00:00:00 2001
From: Gaurav S Deshmukh
Date: Sun, 24 Sep 2023 00:32:53 -0400
Subject: [PATCH 1/3] Added Model class

---
 src/data.py     |  1 +
 src/models.py   |  2 +-
 src/samplers.py | 12 ++++++++++--
 src/utils.py    | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/src/data.py b/src/data.py
index 01e92a1..aa1a23d 100644
--- a/src/data.py
+++ b/src/data.py
@@ -352,6 +352,7 @@ def load_datapoints(atoms, process_dict):
     #     ])
     print(dataset[0][-1].x)
     print(dataset.df_name_idx.head())
+    print(dataset[0][-1].name)
 
     # Create datapoint
     atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif")
diff --git a/src/models.py b/src/models.py
index 8ff3fd6..6f2d727 100644
--- a/src/models.py
+++ b/src/models.py
@@ -110,7 +110,7 @@ def init_conv_layers(self):
                 gnn.CGConv(
                     channels=self.conv_size[i],
                     dim=self.num_edge_features[i],
-                    batch_norm=True,
+                    batch_norm=False,
                 ),
                 nn.LeakyReLU(inplace=True),
             ]
diff --git a/src/samplers.py b/src/samplers.py
index d6d79ef..ffa364b 100644
--- a/src/samplers.py
+++ b/src/samplers.py
@@ -67,9 +67,17 @@ def create_samplers(self, sample_config):
         randomizer.shuffle(idx_array)
 
         # Get indices
-        train_size = int(np.ceil(sample_config["train"] * self.dataset_size))
+        # Values < 1. are treated as fractions of the dataset; values >= 1. are
+        # assumed to be absolute counts, so the sizes are always defined.
+        if sample_config["train"] < 1.:
+            train_size = int(np.ceil(sample_config["train"] * self.dataset_size))
+        else:
+            train_size = int(sample_config["train"])
         train_idx = idx_array[:train_size]
-        val_size = int(np.ceil(sample_config["val"] * self.dataset_size))
+        if sample_config["val"] < 1.:
+            val_size = int(np.floor(sample_config["val"] * self.dataset_size))
+        else:
+            val_size = int(sample_config["val"])
         val_idx = idx_array[train_size : train_size + val_size]
         test_idx = idx_array[train_size + val_size :]
 
diff --git a/src/utils.py b/src/utils.py
index 2e73f2f..5b2c6b2 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,6 +5,9 @@
 
 import numpy as np
 import torch
+from torch.utils.data import SubsetRandomSampler
+from torch_geometric.loader import DataLoader
+
 from featurizers import (
     OneHotEncoder,
     list_of_edge_featurizers,
@@ -133,6 +136,38 @@ def featurize_atoms(
         "edge_indices": edge_indices,
     }
 
+def create_dataloaders(proc_data, sample_idx, batch_size, num_proc=0):
+    """Create training, validation, and/or test dataloaders.
+
+    Parameters
+    ----------
+    proc_data: AtomsDataset or AtomsDatapoints
+        Processed dataset object
+    sample_idx: dict
+        A dictionary with "train", "val", and "test" indices returned by a Sampler
+        object.
+    batch_size: int
+        Batch size
+    num_proc: int (default = 0)
+        Number of cores to be used for parallelization. Defaults to serial.
+
+    Returns
+    -------
+    dataloader_dict: dict
+        Dictionary of "train", "val", and "test" dataloaders
+    """
+    # Create dataloader dict
+    dataloader_dict = {"train": [], "val": [], "test": []}
+
+    for key in dataloader_dict.keys():
+        if sample_idx[key].shape[0] > 0:
+            sampler = SubsetRandomSampler(sample_idx[key])
+            dataloader_dict[key] = DataLoader(dataset=proc_data,
+                                              batch_size=batch_size,
+                                              sampler=sampler,
+                                              num_workers=num_proc)
+
+    return dataloader_dict
 
 if __name__ == "__main__":
     from ase.io import read

From 8875133a7b6d26deacd0e0aab05c3e3be59e6cb2 Mon Sep 17 00:00:00 2001
From: Gaurav S Deshmukh
Date: Sun, 24 Sep 2023 00:34:43 -0400
Subject: [PATCH 2/3] Added train.py

---
 src/train.py | 468 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 468 insertions(+)
 create mode 100644 src/train.py

diff --git a/src/train.py b/src/train.py
new file mode 100644
index 0000000..32539f9
--- /dev/null
+++ b/src/train.py
@@ -0,0 +1,468 @@
+"""Train and test the model."""
+
+from copy import deepcopy
+from pathlib import Path
+
+import numpy as np
+import torch
+
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+
+from models import MultiGCN
+
+class Standardizer:
+    def __init__(self, X):
+        """
+        Class to standardize outputs.
+
+        Parameters
+        ----------
+        X: torch.Tensor
+            Tensor of outputs
+        """
+        self.mean = torch.mean(X)
+        self.std = torch.std(X)
+
+    def standardize(self, X):
+        """
+        Convert a non-standardized output to a standardized output.
+
+        Parameters
+        ----------
+        X: torch.Tensor
+            Tensor of non-standardized outputs
+
+        Returns
+        -------
+        Z: torch.Tensor
+            Tensor of standardized outputs
+
+        """
+        Z = (X - self.mean) / (self.std)
+        return Z
+
+    def restore(self, Z):
+        """
+        Restore a standardized output to the non-standardized output.
+
+        Parameters
+        ----------
+        Z: torch.Tensor
+            Tensor of standardized outputs
+
+        Returns
+        -------
+        X: torch.Tensor
+            Tensor of non-standardized outputs
+
+        """
+        X = self.mean + Z * self.std
+        return X
+
+    def get_state(self):
+        """
+        Return dictionary of the state of the Standardizer.
+
+        Returns
+        -------
+        dict
+            Dictionary with the mean and std of the outputs
+
+        """
+        return {"mean" : self.mean, "std" : self.std}
+
+    def set_state(self, state):
+        """
+        Load a dictionary containing the state of the Standardizer.
+
+        Parameters
+        ----------
+        state : dict
+            Dictionary containing mean and std
+        """
+        self.mean = state["mean"]
+        self.std = state["std"]
+
+class Model:
+    """Wrapper class for a MultiGCN model that allows training and prediction."""
+    def __init__(self, global_config, partition_configs, model_path):
+        """Initialize a MultiGCN model.
+
+        Parameters
+        ----------
+        global_config: dict
+            Global configuration dictionary. Should contain the following keys:
+            gpu (whether to use GPU, bool), loss_function (str; only "mse"),
+            metric_function (str; "mae" or "mse"), optimizer (str; "adam" or
+            "sgd"), learning_rate (model learning rate, float), lr_milestones
+            (milestones when learning rate is to be decreased, list; optional)
+        partition_configs: List[Dict]
+            List of dictionaries containing parameters for the GNN for each
+            partition. The number of GNNs is inferred from the length of the
+            list. Each partition config should contain the following keys:
+            n_conv (number of convolutional layers, int), n_hidden (number of
+            hidden layers, int), conv_size (feature size before convolution, int),
+            hidden_size (number of nodes per hidden layer, int), dropout (dropout
+            probability for hidden layers, float), conv_type (type of convolution
+            layer, str; currently only "CGConv" is supported), pool_type
+            (type of pooling layer, str; currently "add" and "mean" are supported),
+            num_node_features (number of node features, int), num_edge_features
+            (number of edge features, int).
+        model_path: str
+            Path where the model is to be saved
+        """
+        # Create model
+        self.model = MultiGCN(partition_configs)
+
+        # Create model path
+        self.make_directory_structure(model_path)
+
+        # Set GPU status
+        self.use_gpu = global_config["gpu"]
+
+        # Set loss function
+        if global_config["loss_function"] == "mse":
+            self.loss_fn = torch.nn.MSELoss()
+        else:
+            raise ValueError(
+                "Incorrect loss function. Currently only 'mse' is supported"
+            )
+
+        # Set metric function
+        if global_config["metric_function"] == "mae":
+            self.metric_fn = mean_absolute_error
+        elif global_config["metric_function"] == "mse":
+            self.metric_fn = mean_squared_error
+
+        # Set optimizer
+        if global_config["optimizer"].lower().strip() == "adam":
+            self.optimizer = torch.optim.Adam(
+                self.model.parameters(),
+                lr=global_config["learning_rate"],
+            )
+        elif global_config["optimizer"].lower().strip() == "sgd":
+            self.optimizer = torch.optim.SGD(
+                self.model.parameters(),
+                lr=global_config["learning_rate"],
+            )
+
+        # Set scheduler
+        if "lr_milestones" in global_config.keys():
+            self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
+                optimizer=self.optimizer,
+                milestones=global_config["lr_milestones"]
+            )
+        else:
+            self.scheduler = None
+
+    def make_directory_structure(self, model_path):
+        """Make directory structure to store models and results."""
+        self.model_path = Path(model_path)
+        self.model_save_path = self.model_path / "models"
+        self.model_results_path = self.model_path / "results"
+        self.model_save_path.mkdir(parents=True, exist_ok=True)
+        self.model_results_path.mkdir(parents=True, exist_ok=True)
+
+    def init_standardizer(self, targets):
+        """Initialize the Standardizer using training targets (typically).
+
+        Parameters
+        ----------
+        targets: np.ndarray or torch.Tensor
+            Array of training outputs
+        """
+        self.standardizer = Standardizer(targets)
+
+    def train_epoch(self, dataloader):
+        """Train the model for a single epoch.
+
+        Parameters
+        ----------
+        dataloader: torch_geometric.loader.DataLoader
+            Training dataloader
+        """
+        # Variables to store average stats
+        avg_loss = 0
+        avg_metric = 0
+        count = 0
+
+        # Enable train mode of model
+        self.model.train()
+
+        # Go over each batch in the dataloader
+        for data_objects in dataloader:
+            # Standardize output
+            y = data_objects[0].y
+            y_std = self.standardizer.standardize(y)
+
+            # Transfer to GPU (if True)
+            if self.use_gpu:
+                nn_output = y_std.cuda()
+                nn_input = [d.cuda() for d in data_objects]
+            else:
+                nn_output = y_std
+                nn_input = data_objects
+
+            # Compute prediction
+            pred_dict = self.model(nn_input)
+
+            # Calculate loss
+            loss = self.loss_fn(nn_output, pred_dict["output"])
+            avg_loss += loss.item()
+
+            # Calculate metric (detach before handing the tensor to sklearn)
+            y_pred = self.standardizer.restore(pred_dict["output"].detach().cpu())
+            metric = self.metric_fn(y, y_pred)
+            avg_metric += metric
+
+            # Set zero gradient for all the tensors
+            self.optimizer.zero_grad()
+
+            # Perform backward propagation
+            loss.backward()
+
+            # Update weights and biases
+            self.optimizer.step()
+
+            # Increase count
+            count += 1
+
+        # Update scheduler (once per epoch) if not None
+        if self.scheduler is not None:
+            self.scheduler.step()
+
+        # Calculate average loss and metric
+        avg_loss = avg_loss / count
+        avg_metric = avg_metric / count
+
+        return avg_loss, avg_metric
+
+    def validate(self, dataloader):
+        """Validate/test the model.
+
+        Parameters
+        ----------
+        dataloader: torch_geometric.loader.DataLoader
+            Validation/test dataloader
+        """
+        # Variables to store average stats
+        avg_loss = 0
+        avg_metric = 0
+        count = 0
+
+        # Enable eval mode of model
+        self.model.eval()
+
+        # Go over each batch in the dataloader
+        for data_objects in dataloader:
+            # Standardize output
+            y = data_objects[0].y
+            y_std = self.standardizer.standardize(y)
+
+            # Transfer to GPU (if True)
+            if self.use_gpu:
+                nn_output = y_std.cuda()
+                nn_input = [d.cuda() for d in data_objects]
+            else:
+                nn_output = y_std
+                nn_input = data_objects
+
+            # Compute prediction
+            pred_dict = self.model(nn_input)
+
+            # Calculate loss
+            loss = self.loss_fn(nn_output, pred_dict["output"])
+            avg_loss += loss.item()
+
+            # Calculate metric (detach before handing the tensor to sklearn)
+            y_pred = self.standardizer.restore(pred_dict["output"].detach().cpu())
+            metric = self.metric_fn(y, y_pred)
+            avg_metric += metric
+
+            # Increase count
+            count += 1
+
+        # Calculate average loss and metric
+        avg_loss = avg_loss / count
+        avg_metric = avg_metric / count
+
+        return avg_loss, avg_metric
+
+    def predict(self, dataset, indices, return_targets=False):
+        """Predict outputs from the model.
+
+        Parameters
+        ----------
+        dataset: AtomsDataset or AtomsDatapoints
+            Processed dataset object
+        indices: list or np.ndarray
+            List of indices for datapoints for which predictions are to be made
+        return_targets: bool (default = False)
+            If True, true targets are returned; if False, all targets remain 0.
+
+        Returns
+        -------
+        predictions_dict: dict
+            Dictionary containing "targets", "predictions", and "indices" (a copy
+            of indices).
+        """
+        # Create arrays
+        targets = np.zeros(len(indices))
+        predictions = np.zeros(len(indices))
+
+        # Enable eval mode of model
+        self.model.eval()
+
+        # Go over each datapoint
+        for i, idx in enumerate(indices):
+            # Get data objects
+            data_objects = dataset.get(idx)
+
+            # Record the target (if requested)
+            if return_targets:
+                targets[i] = data_objects[0].y.cpu()
+
+            # Transfer to GPU (if True)
+            if self.use_gpu:
+                nn_input = [d.cuda() for d in data_objects]
+            else:
+                nn_input = data_objects
+
+            # Compute prediction
+            pred_dict = self.model(nn_input)
+            predictions[i] = self.standardizer.restore(pred_dict["output"].cpu())
+
+        predictions_dict = {"targets": targets, "predictions": predictions,
+                            "indices": indices}
+
+        return predictions_dict
+
+    def save(self, epoch, best_status=None):
+        """Save the current state of the model as a dictionary.
+
+        The dictionary contains the epoch, model state dict, optimizer state dict,
+        and standardizer state dict.
+
+        Parameters
+        ----------
+        epoch: int
+            Current epoch
+        best_status: bool
+            If True, the model is also saved as "best.pt".
+        """
+        save_dict = {
+            "epoch": epoch,
+            "model_state_dict": self.model.state_dict(),
+            "optimizer_state_dict": self.optimizer.state_dict(),
+            "standardizer_state_dict": self.standardizer.get_state(),
+        }
+        save_path = self.model_save_path / f"model_{epoch}.pt"
+        torch.save(save_dict, save_path)
+        if best_status:
+            save_path = self.model_save_path / "best.pt"
+            torch.save(save_dict, save_path)
+
+    def load(self, epoch=None, best_status=None):
+        """Load a model saved at a particular epoch or the best model.
+
+        If best_status is True, epoch is ignored and the best model is loaded.
+
+        Parameters
+        ----------
+        epoch: int
+            Model at this epoch is loaded
+        best_status: bool
+            If this is True, the best model is loaded
+        """
+        # Load path
+        if best_status:
+            load_path = self.model_save_path / "best.pt"
+        else:
+            load_path = self.model_save_path / f"model_{epoch}.pt"
+
+        # Load the dictionary
+        load_dict = torch.load(load_path)
+
+        # Set state dicts
+        self.model.load_state_dict(load_dict["model_state_dict"])
+        self.standardizer.set_state(load_dict["standardizer_state_dict"])
+
+    def train(self, epochs, dataloader_dict, verbose=False):
+        """Train a model for the given number of epochs.
+
+        The training is performed with best-model checkpointing, i.e., the metric
+        function is evaluated at every epoch and the model with the best value for
+        this metric is loaded after training for testing.
+
+        Parameters
+        ----------
+        epochs: int
+            Total number of epochs
+        dataloader_dict: dict
+            Dictionary of train, val, and test dataloaders
+        verbose: bool
+            If True, progress is printed for every epoch.
+
+        Returns
+        -------
+        results_dict: Dict[Dict]
+            Dictionary of dictionaries. The outer dictionary contains the keys
+            "loss" and "metric" and the inner dictionaries contain the keys
+            "train", "val", and "test".
+        """
+        # Create empty lists
+        train_losses = []
+        train_metrics = []
+        val_losses = []
+        val_metrics = []
+
+        # Initialize best validation metric
+        prev_val_metric = 1e9
+        best_status = False
+
+        # Train and validate model
+        for i in range(epochs):
+            # Train
+            train_loss, train_metric = self.train_epoch(dataloader_dict["train"])
+
+            # Validate
+            val_loss, val_metric = self.validate(dataloader_dict["val"])
+
+            # Check if model is best
+            if val_metric < prev_val_metric:
+                best_status = True
+                prev_val_metric = deepcopy(val_metric)
+            else:
+                best_status = False
+
+            # Save model
+            self.save(i, best_status)
+
+            # Save losses and metrics
+            train_losses.append(train_loss)
+            val_losses.append(val_loss)
+            train_metrics.append(train_metric)
+            val_metrics.append(val_metric)
+
+            # Print progress
+            if verbose:
+                print(f"Epoch {i}: train metric = {train_metric:.4f}, "
+                      f"val metric = {val_metric:.4f}")
+
+        # Load the best model
+        self.load(best_status=True)
+
+        # Test the model
+        test_loss, test_metric = self.validate(dataloader_dict["test"])
+
+        loss_dict = {
+            "train": train_losses, "val": val_losses, "test": test_loss
+        }
+        metric_dict = {
+            "train": train_metrics, "val": val_metrics, "test": test_metric
+        }
+
+        results_dict = {"loss": loss_dict, "metric": metric_dict}
+
+        return results_dict
+    
\ No newline at end of file

From b212fae58383d79dcbd6ce5838cbc6f7a44a85cd Mon Sep 17 00:00:00 2001
From: Gaurav S Deshmukh
Date: Sun, 24 Sep 2023 00:37:03 -0400
Subject: [PATCH 3/3] Fix codestyle

---
 src/train.py | 72 ++++++++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/src/train.py b/src/train.py
index 32539f9..2e9ab02 100644
--- a/src/train.py
+++ b/src/train.py
@@ -5,31 +5,32 @@
 
 import numpy as np
 import torch
-
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
 from models import MultiGCN
 
+
 class Standardizer:
+    """Class to standardize targets."""
     def __init__(self, X):
         """
         Class to standardize outputs.
-        
+
         Parameters
         ----------
-        X: torch.Tensor 
+        X: torch.Tensor
             Tensor of outputs
         """
         self.mean = torch.mean(X)
         self.std = torch.std(X)
-    
+
     def standardize(self, X):
         """
         Convert a non-standardized output to a standardized output.
 
         Parameters
         ----------
-        X: torch.Tensor 
+        X: torch.Tensor
             Tensor of non-standardized outputs
 
         Returns
@@ -40,7 +41,7 @@ def standardize(self, X):
         """
         Z = (X - self.mean) / (self.std)
         return Z
-    
+
     def restore(self, Z):
         """
         Restore a standardized output to the non-standardized output.
@@ -52,13 +53,13 @@ def restore(self, Z):
 
         Returns
         -------
-        X: torch.Tensor 
+        X: torch.Tensor
             Tensor of non-standardized outputs
 
         """
         X = self.mean + Z * self.std
         return X
-    
+
     def get_state(self):
         """
         Return dictionary of the state of the Standardizer.
@@ -69,8 +70,8 @@ def get_state(self):
 
         """
-        return {"mean" : self.mean, "std" : self.std}
-    
+        return {"mean": self.mean, "std": self.std}
+
     def set_state(self, state):
         """
         Load a dictionary containing the state of the Standardizer.
 
@@ -78,13 +79,15 @@ def set_state(self, state):
         Parameters
         ----------
         state : dict
-            Dictionary containing mean and std 
+            Dictionary containing mean and std
         """
         self.mean = state["mean"]
         self.std = state["std"]
 
+
 class Model:
     """Wrapper class for a MultiGCN model that allows training and prediction."""
+
     def __init__(self, global_config, partition_configs, model_path):
         """Initialize a MultiGCN model.
@@ -116,7 +119,7 @@ def __init__(self, global_config, partition_configs, model_path):
 
         # Create model path
         self.make_directory_structure(model_path)
-    
+
         # Set GPU status
         self.use_gpu = global_config["gpu"]
 
@@ -127,7 +130,7 @@ def __init__(self, global_config, partition_configs, model_path):
             raise ValueError(
                 "Incorrect loss function. Currently only 'mse' is supported"
             )
-    
+
         # Set metric function
         if global_config["metric_function"] == "mae":
             self.metric_fn = mean_absolute_error
@@ -149,8 +152,7 @@ def __init__(self, global_config, partition_configs, model_path):
         # Set scheduler
         if "lr_milestones" in global_config.keys():
             self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
-                optimizer=self.optimizer,
-                milestones=global_config["lr_milestones"]
+                optimizer=self.optimizer, milestones=global_config["lr_milestones"]
             )
         else:
             self.scheduler = None
@@ -236,7 +238,7 @@ def train_epoch(self, dataloader):
         avg_loss = avg_loss / count
         avg_metric = avg_metric / count
 
         return avg_loss, avg_metric
-    
+
     def validate(self, dataloader):
         """Validate/test the model.
@@ -332,11 +334,14 @@ def predict(self, dataset, indices, return_targets=False):
             pred_dict = self.model(nn_input)
             predictions[i] = self.standardizer.restore(pred_dict["output"].cpu())
 
-        predictions_dict = {"targets": targets, "predictions": predictions, 
-                            "indices": indices}
-        
+        predictions_dict = {
+            "targets": targets,
+            "predictions": predictions,
+            "indices": indices,
+        }
+
         return predictions_dict
-    
+
     def save(self, epoch, best_status=None):
@@ -364,7 +369,7 @@ def save(self, epoch, best_status=None):
 
     def load(self, epoch=None, best_status=None):
         """Load a model saved at a particular epoch or the best model.
-        
+
         If best_status is True, epoch is ignored and the best model is loaded.
 
         Parameters
@@ -379,7 +384,7 @@ def load(self, epoch=None, best_status=None):
         if best_status:
             load_path = self.model_save_path / "best.pt"
         else:
             load_path = self.model_save_path / f"model_{epoch}.pt"
-        
+
         # Load the dictionary
         load_dict = torch.load(load_path)
@@ -392,7 +397,7 @@ def train(self, epochs, dataloader_dict, verbose=False):
 
         The training is performed with best-model checkpointing, i.e., the metric
         function is evaluated at every epoch and the model with the best value for
-        this metric is loaded after training for testing. 
+        this metric is loaded after training for testing.
 
         Parameters
         ----------
@@ -422,22 +427,22 @@ def train(self, epochs, dataloader_dict, verbose=False):
 
         # Train and validate model
         for i in range(epochs):
-            # Train 
+            # Train
             train_loss, train_metric = self.train_epoch(dataloader_dict["train"])
-            
+
             # Validate
             val_loss, val_metric = self.validate(dataloader_dict["val"])
-            
+
             # Check if model is best
             if val_metric < prev_val_metric:
                 best_status = True
                 prev_val_metric = deepcopy(val_metric)
             else:
                 best_status = False
-            
+
             # Save model
             self.save(i, best_status)
-            
+
             # Save losses and metrics
             train_losses.append(train_loss)
             val_losses.append(val_loss)
@@ -455,14 +460,9 @@ def train(self, epochs, dataloader_dict, verbose=False):
         # Test the model
         test_loss, test_metric = self.validate(dataloader_dict["test"])
 
-        loss_dict = {
-            "train": train_losses, "val": val_losses, "test": test_loss
-        }
-        metric_dict = {
-            "train": train_metrics, "val": val_metrics, "test": test_metric
-        }
+        loss_dict = {"train": train_losses, "val": val_losses, "test": test_loss}
+        metric_dict = {"train": train_metrics, "val": val_metrics, "test": test_metric}
 
         results_dict = {"loss": loss_dict, "metric": metric_dict}
-
         return results_dict
-    
\ No newline at end of file
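
For a quick end-to-end picture of how the pieces in this series fit together, here is
a minimal usage sketch tying patch 1's create_dataloaders to patch 2's Model wrapper.
The AtomsDataset and Sampler construction, the shape of the value returned by
create_samplers, and the concrete config values are assumptions inferred from the
docstrings above, not verified repo API:

    # Minimal usage sketch. AtomsDataset, Sampler, and the exact return shape
    # of create_samplers are assumptions inferred from the docstrings above.
    from data import AtomsDataset        # assumed import path
    from samplers import Sampler         # assumed class name
    from train import Model
    from utils import create_dataloaders

    dataset = AtomsDataset(...)          # processed dataset object (assumed ctor)

    # Values < 1 are fractions of the dataset; values >= 1 are absolute counts
    # (matching the guard in samplers.py). Assumed to return a dict of index
    # arrays: {"train": ..., "val": ..., "test": ...}
    sampler = Sampler(dataset)
    sample_idx = sampler.create_samplers({"train": 0.8, "val": 0.1})

    dataloader_dict = create_dataloaders(dataset, sample_idx, batch_size=32)

    global_config = {
        "gpu": False,
        "loss_function": "mse",          # only "mse" is supported
        "metric_function": "mae",        # "mae" or "mse"
        "optimizer": "adam",             # "adam" or "sgd"
        "learning_rate": 1e-3,
        "lr_milestones": [50, 75],       # optional; epochs at which LR drops
    }
    partition_configs = [
        {
            "n_conv": 3,
            "n_hidden": 2,
            "conv_size": 64,
            "hidden_size": 128,
            "dropout": 0.2,
            "conv_type": "CGConv",       # only "CGConv" is supported
            "pool_type": "add",          # "add" or "mean"
            "num_node_features": 16,
            "num_edge_features": 4,
        }
    ]

    model = Model(global_config, partition_configs, model_path="runs/example")

    # Fit the standardizer on training targets only, so validation/test
    # standardization does not leak statistics from held-out data.
    train_targets = ...                  # torch.Tensor of training y's (repo-specific)
    model.init_standardizer(train_targets)

    results_dict = model.train(epochs=100, dataloader_dict=dataloader_dict,
                               verbose=True)
    predictions_dict = model.predict(dataset, sample_idx["test"],
                                     return_targets=True)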