diff --git a/.gitignore b/.gitignore index 513abe4..1acfb60 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ data/* !data/dband_centers.csv __pycache__ *.cif +trained_models +*.pt +*__init__.py \ No newline at end of file diff --git a/src/data.py b/src/data.py index aa1a23d..093b964 100644 --- a/src/data.py +++ b/src/data.py @@ -10,9 +10,9 @@ from ase.io import read from torch_geometric.data import Data, Dataset -from constants import REPO_PATH -from featurizers import OneHotEncoder -from utils import featurize_atoms, partition_structure +from .constants import REPO_PATH +from .featurizers import OneHotEncoder +from .utils import featurize_atoms, partition_structure_by_layers class AtomsDataset(Dataset): @@ -60,7 +60,7 @@ def __init__(self, root, prop_csv): def process_data( self, - z_cutoffs, + layer_cutoffs, node_features, edge_features, max_atoms=None, @@ -75,8 +75,8 @@ def process_data( Parameters ---------- - z_cutoffs: list or np.ndarray - List of z-coordinates based on which atomic structures are + layer_cutoffs: list or np.ndarray + List of layer cutoffs based on which atomic structures are partitioned. The number of partitions is equal to one more than the length of z_cutoffs. node_features: list[list] @@ -115,7 +115,7 @@ def process_data( atoms = read(str(file_path)) # Partition structure - part_atoms = partition_structure(atoms, z_cutoffs) + part_atoms = partition_structure_by_layers(atoms, layer_cutoffs) # Featurize partitions data_objects = [] @@ -191,7 +191,7 @@ def __init__(self, atoms): def process_data( self, - z_cutoffs, + layer_cutoffs, node_features, edge_features, max_atoms=None, @@ -206,8 +206,8 @@ def process_data( Parameters ---------- - z_cutoffs: list or np.ndarray - List of z-coordinates based on which atomic structures are + layer_cutoffs: list or np.ndarray + List of layer cutoffs based on which atomic structures are partitioned. The number of partitions is equal to one more than the length of z_cutoffs. 
node_features: list[list] @@ -229,7 +229,7 @@ def process_data( # Iterate over files and process them for atoms_obj in self.atoms: # Partition structure - part_atoms = partition_structure(atoms_obj, z_cutoffs) + part_atoms = partition_structure_by_layers(atoms_obj, layer_cutoffs) # Featurize partitions data_objects = [] @@ -337,28 +337,11 @@ def load_datapoints(atoms, process_dict): data_root_path = Path(REPO_PATH) / "data" / "S_calcs" prop_csv_path = data_root_path / "name_prop.csv" - # Create dataset - dataset = AtomsDataset(data_root_path, prop_csv_path) - # dataset.process_data(z_cutoffs=[13., 20.], - # node_features=[ - # ["atomic_number", "dband_center"], - # ["atomic_number", "reactivity"], - # ["atomic_number", "reactivity"], - # ], - # edge_features=[ - # ["bulk_bond_distance"], - # ["surface_bond_distance"], - # ["adsorbate_bond_distance"], - # ]) - print(dataset[0][-1].x) - print(dataset.df_name_idx.head()) - print(dataset[0][-1].name) - # Create datapoint atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif") datapoint = AtomsDatapoints(atoms) datapoint.process_data( - z_cutoffs=[13.0, 20.0], + layer_cutoffs=[3, 6], node_features=[ ["atomic_number", "dband_center"], ["atomic_number", "reactivity"], diff --git a/src/featurizers.py b/src/featurizers.py index 4ca0109..bd749f5 100644 --- a/src/featurizers.py +++ b/src/featurizers.py @@ -9,8 +9,8 @@ from mendeleev import element from torch.nn.functional import one_hot -from constants import DBAND_FILE_PATH -from graphs import AtomsGraph +from .constants import DBAND_FILE_PATH +from .graphs import AtomsGraph class OneHotEncoder: diff --git a/src/models.py b/src/models.py index 6f2d727..97f6cbc 100644 --- a/src/models.py +++ b/src/models.py @@ -45,7 +45,7 @@ def __init__(self, partition_configs): # Initialize layers # Initial transform - self.init_transform = [] + self.init_transform = nn.ModuleList() for i in range(self.n_partitions): self.init_transform.append( nn.Sequential( @@ -58,12 +58,12 @@ def __init__(self, partition_configs): self.init_conv_layers() # Pooling layers - self.pool_layers = [] + self.pool_layers = nn.ModuleList() for i in range(self.n_partitions): - self.pool_layers.append(gnn.pool.global_add_pool) + self.pool_layers.append(gnn.aggr.SumAggregation()) # Pool transform - self.pool_transform = [] + self.pool_transform = nn.ModuleList() for i in range(self.n_partitions): self.pool_transform.append( nn.Sequential( @@ -73,7 +73,7 @@ def __init__(self, partition_configs): ) # Hidden layers - self.hidden_layers = [] + self.hidden_layers = nn.ModuleList() for i in range(self.n_partitions): self.hidden_layers.append( nn.Sequential( @@ -98,10 +98,12 @@ def __init__(self, partition_configs): self.final_lin_transform = nn.Linear(self.n_partitions, 1, bias=False) with torch.no_grad(): self.final_lin_transform.weight.copy_(torch.ones(self.n_partitions)) + for p in self.final_lin_transform.parameters(): + p.requires_grad = False def init_conv_layers(self): """Initialize convolutional layers.""" - self.conv_layers = [] + self.conv_layers = nn.ModuleList() for i in range(self.n_partitions): part_conv_layers = [] for j in range(self.n_conv[i]): @@ -110,7 +112,7 @@ def init_conv_layers(self): gnn.CGConv( channels=self.conv_size[i], dim=self.num_edge_features[i], - batch_norm=False, + batch_norm=True, ), nn.LeakyReLU(inplace=True), ] @@ -151,7 +153,7 @@ def forward(self, data_objects): conv_data = layer(conv_data) # Apply pooling layer - pooled_data = self.pool_layers[i](x=conv_data, batch=None) + pooled_data = 
self.pool_layers[i](x=conv_data, index=data.batch) # Apply pool-to-hidden transform hidden_data = self.pool_transform[i](pooled_data) @@ -163,8 +165,8 @@ def forward(self, data_objects): contributions.append(hidden_data) # Apply final transformation - contributions = torch.cat(contributions, dim=-1) - output = self.final_lin_transform(contributions) + contributions = torch.cat(contributions) + output = self.final_lin_transform(contributions.view(-1, self.n_partitions)) return {"output": output, "contributions": contributions} diff --git a/src/train.py b/src/train.py index 2e9ab02..150ce5c 100644 --- a/src/train.py +++ b/src/train.py @@ -7,15 +7,24 @@ import torch from sklearn.metrics import mean_absolute_error, mean_squared_error -from models import MultiGCN +from .models import MultiGCN class Standardizer: """Class to standardize targets.""" - def __init__(self, X): + + def __init__(self): """ Class to standardize outputs. + Initialize with dummy values. + """ + self.mean = 0 + self.std = 0.1 + + def initialize(self, X): + """Initialize mean and std based on the given tensor. + Parameters ---------- X: torch.Tensor @@ -60,6 +69,24 @@ def restore(self, Z): X = self.mean + Z * self.std return X + def restore_cont(self, Z): + """ + Restore a standardized contribution to the non-standardized contribution. + + Parameters + ---------- + Z: torch.Tensor + Tensor of standardized contributions + + Returns + ------- + X: torch.Tensor + Tensor of non-standardized contributions + + """ + X = (self.mean / Z.shape[0]) + Z * self.std + return X + def get_state(self): """ Return dictionary of the state of the Standardizer. @@ -122,7 +149,8 @@ def __init__(self, global_config, partition_configs, model_path): # Set GPU status self.use_gpu = global_config["gpu"] - + if self.use_gpu: + self.model.cuda() # Set loss function if global_config["loss_function"] == "mse": self.loss_fn = torch.nn.MSELoss() @@ -145,18 +173,21 @@ def __init__(self, global_config, partition_configs, model_path): ) elif global_config["optimizer"].lower().strip() == "sgd": self.optimizer = torch.optim.SGD( - self.model_parameters(), + self.model.parameters(), lr=global_config["learning_rate"], ) # Set scheduler if "lr_milestones" in global_config.keys(): - self.scheduler = torch.optim.MultiStepLR( + self.scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer=self.optimizer, milestones=global_config["lr_milestones"] ) else: self.scheduler = None + # Set standardizer + self.standardizer = Standardizer() + def make_directory_structure(self, model_path): """Make directory structure to store models and results.""" self.model_path = Path(model_path) @@ -173,7 +204,7 @@ def init_standardizer(self, targets): targets: np.ndarray or torch.Tensor Array of training outputs """ - self.standardizer = Standardizer(targets) + self.standardizer.initialize(torch.Tensor(targets)) def train_epoch(self, dataloader): """Train the model for a single epoch.
@@ -209,11 +240,11 @@ def train_epoch(self, dataloader): pred_dict = self.model(nn_input) # Calculate loss - loss = self.loss_fn(nn_output, pred_dict["output"]) + loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1)) avg_loss += loss # Calculate metric - y_pred = self.standardizer.restore(pred_dict["output"].cpu()) + y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach()) metric = self.metric_fn(y, y_pred) avg_metric += metric @@ -273,11 +304,11 @@ def validate(self, dataloader): pred_dict = self.model(nn_input) # Calculate loss - loss = self.loss_fn(nn_output, pred_dict["output"]) + loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1)) avg_loss += loss # Calculate metric - y_pred = self.standardizer.restore(pred_dict["output"].cpu()) + y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach()) metric = self.metric_fn(y, y_pred) avg_metric += metric @@ -305,12 +336,14 @@ def predict(self, dataset, indices, return_targets=False): Returns ------- prediction_dict: dict - Dictionary containing "targets", "predictions", and "indices" (copy of - predict_idx). + Dictionary containing "targets", "predictions", "contributions" and + "indices" (copy of predict_idx). """ # Create arrays + n_partitions = len(dataset.get(indices[0])) targets = np.zeros(len(indices)) predictions = np.zeros(len(indices)) + contributions = np.zeros((len(indices), n_partitions)) # Enable eval mode of model self.model.eval() @@ -318,7 +351,7 @@ def predict(self, dataset, indices, return_targets=False): # Go over each batch in the dataloader for i, idx in enumerate(indices): # Get data objects - data_objects = dataset.get(i) + data_objects = dataset.get(idx) # Standardize output if return_targets: @@ -332,12 +365,19 @@ def predict(self, dataset, indices, return_targets=False): # Compute prediction pred_dict = self.model(nn_input) - predictions[i] = self.standardizer.restore(pred_dict["output"].cpu()) + predictions[i] = self.standardizer.restore( + pred_dict["output"].cpu().detach() + ) + conts_std = pred_dict["contributions"].cpu().detach() + contributions[i, :] = ( + self.standardizer.restore_cont(conts_std).numpy().flatten() + ) predictions_dict = { "targets": targets, "predictions": predictions, "indices": indices, + "contributions": contributions, } return predictions_dict @@ -444,11 +484,20 @@ def train(self, epochs, dataloader_dict, verbose=False): self.save(i, best_status) # Save losses and metrics - train_losses.append(train_loss) - val_losses.append(val_loss) + train_losses.append(train_loss.cpu().detach().numpy()) + val_losses.append(val_loss.cpu().detach().numpy()) train_metrics.append(train_metric) val_metrics.append(val_metric) + # Print, if verbose + if verbose: + print( + f"Epoch: [{i}] Training loss: [{train_loss:.3f}] " + + f"Training metric: [{train_metric:.3f}] " + + f"Validation loss: [{val_loss:.3f}] " + + f"Validation metric: [{val_metric:.3f}]" + ) + # Load the best model self.load(best_status=True) diff --git a/src/utils.py b/src/utils.py index 5b2c6b2..de98478 100644 --- a/src/utils.py +++ b/src/utils.py @@ -8,12 +8,12 @@ from torch.utils.data import SubsetRandomSampler from torch_geometric.loader import DataLoader -from featurizers import ( +from .featurizers import ( OneHotEncoder, list_of_edge_featurizers, list_of_node_featurizers, ) -from graphs import AtomsGraph +from .graphs import AtomsGraph def partition_structure(atoms, z_cutoffs): @@ -51,6 +51,47 @@ def partition_structure(atoms, z_cutoffs): return part_atoms +def 
partition_structure_by_layers(atoms, layer_cutoffs): + """Partition atomic structure into bulk, surface, and/or adsorbates by layers. + + Parameters + ---------- + atoms: ase.Atoms object + The structure to be partitioned + layer_cutoffs: list or np.ndarray + List of layer cutoffs. xy planes are placed above the specified layer + cutoffs to partition atoms above and below them. The length of layer_cutoffs + should be equal to one less than the number of partitions. + """ + # Set number of partitions equal to 1 more than the length of layer_cutoffs + n_partitions = int(len(layer_cutoffs) + 1) + + # Calculate interlayer distance + z_array = np.unique(np.sort(atoms.get_positions()[:, -1])) + z_min = z_array.min() + d_interlayer = abs(z_array[1] - z_array[0]) + + # Convert layer cutoffs to z cutoffs and bound them with 0 and infinity + z_cutoffs = z_min + (np.array(layer_cutoffs) - 1) * d_interlayer + 0.1 + z_cutoffs = np.insert(z_cutoffs, 0, 0) + z_cutoffs = np.insert(z_cutoffs, len(z_cutoffs), np.inf) + + # Get positions + pos = atoms.get_positions() + + # Iterate over number of partitions + part_atoms = [] + for i in range(n_partitions): + part_idx = ( + np.argwhere((pos[:, -1] >= z_cutoffs[i]) & (pos[:, -1] < z_cutoffs[i + 1])) + .flatten() + .tolist() + ) + part_atoms.append(part_idx) + + return part_atoms + + def featurize_atoms( atoms, select_idx, @@ -136,6 +177,7 @@ def featurize_atoms( "edge_indices": edge_indices, } + def create_dataloaders(proc_data, sample_idx, batch_size, num_proc=0): """Create training, validation, and/or test dataloaders. @@ -150,7 +192,7 @@ def create_dataloaders(proc_data, sample_idx, batch_size, num_proc=0): Batch size num_proc: int (default = 0) Number of cores to be used for parallelization. Defaults to serial. - + Returns ------- dataloader_dict: dict @@ -160,15 +202,18 @@ def create_dataloaders(proc_data, sample_idx, batch_size, num_proc=0): dataloader_dict = {"train": [], "val": [], "test": []} for key in dataloader_dict.keys(): - if sample_idx[key].shape[0] > 0.: + if sample_idx[key].shape[0] > 0.0: sampler = SubsetRandomSampler(sample_idx[key]) - dataloader_dict[key] = DataLoader(dataset=proc_data, - batch_size=batch_size, - sampler=sampler, - num_workers=num_proc) - + dataloader_dict[key] = DataLoader( + dataset=proc_data, + batch_size=batch_size, + sampler=sampler, + num_workers=num_proc, + ) + return dataloader_dict + if __name__ == "__main__": from ase.io import read diff --git a/workflows/basic_train_val_test.py b/workflows/basic_train_val_test.py new file mode 100644 index 0000000..50b2a65 --- /dev/null +++ b/workflows/basic_train_val_test.py @@ -0,0 +1,94 @@ +"""Basic workflow to train, validate, and test a model.""" + +import torch + +from ..src.constants import REPO_PATH +from ..src.data import AtomsDataset +from ..src.samplers import RandomSampler +from ..src.train import Model +from ..src.utils import create_dataloaders + +# Set seeds +seed = 0 +torch.manual_seed(seed) + +# Create dataset +dataset_path = REPO_PATH / "data" / "S_calcs" +prop_csv_path = dataset_path / "name_prop.csv" +dataset = AtomsDataset(root=dataset_path, prop_csv=prop_csv_path) + +# Process dataset +# dataset.process_data(layer_cutoffs=[3, 6], +# node_features=[ +# ["atomic_number", "dband_center", "coordination"], +# ["atomic_number", "reactivity", "coordination"], +# ["atomic_number", "reactivity", "coordination"], +# ], +# edge_features=[ +# ["bulk_bond_distance"], +# ["surface_bond_distance"], +# ["adsorbate_bond_distance"], +# ]) + +# Create sampler +sample_config = {"train": 0.8, "val": 0.1, "test": 0.1} +sampler = 
RandomSampler(seed, dataset.len()) +sample_idx = sampler.create_samplers(sample_config) + +# Create dataloaders +dataloader_dict = create_dataloaders(dataset, sample_idx, batch_size=32) + +# Create model +global_config = { + "gpu": True, + "loss_function": "mse", + "metric_function": "mae", + "learning_rate": 0.001, + "optimizer": "adam", + "lr_milestones": [75], +} +partition_configs = [ + { + "n_conv": 3, + "n_hidden": 1, + "hidden_size": 20, + "conv_size": 20, + "dropout": 0.1, + "num_node_features": dataset[0][0].num_node_features, + "num_edge_features": dataset[0][0].num_edge_features, + "conv_type": "CGConv", + }, + { + "n_conv": 5, + "n_hidden": 2, + "hidden_size": 50, + "conv_size": 50, + "dropout": 0.1, + "num_node_features": dataset[0][1].num_node_features, + "num_edge_features": dataset[0][1].num_edge_features, + "conv_type": "CGConv", + }, + { + "n_conv": 5, + "n_hidden": 2, + "hidden_size": 50, + "conv_size": 50, + "dropout": 0.1, + "num_node_features": dataset[0][2].num_node_features, + "num_edge_features": dataset[0][2].num_edge_features, + "conv_type": "CGConv", + }, +] + +model_path = REPO_PATH / "trained_models" / "S_binary_calcs" +model = Model(global_config, partition_configs, model_path) +model.init_standardizer([dataset[i][0].y for i in sample_idx["train"]]) +results_dict = model.train(100, dataloader_dict, verbose=True) +print(f"Test metric: {results_dict['metric']['test']}") + +# Load model +model.load(best_status=True) + +# Make predictions on a structure +pred_dict = model.predict(dataset, [0, 100, 200, 500], return_targets=True) +print(pred_dict)
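Note (outside the patch): a minimal sketch of how the new layer-based partitioning maps layer indices onto z cutoffs, assuming an ideal slab with evenly spaced layers. The slab and the layer_cutoffs values below are illustrative; the cutoff conversion simply reproduces what partition_structure_by_layers does, with layers counted from the bottom of the slab.

import numpy as np
from ase.build import fcc111

# Illustrative 9-layer Pt(111) slab; layer 1 is the bottom-most layer
atoms = fcc111("Pt", size=(2, 2, 9), vacuum=10.0)
layer_cutoffs = [3, 6]  # same values as in the example workflow

# Same conversion as partition_structure_by_layers: a plane is placed just
# above each cutoff layer, then the cutoffs are bounded by 0 and infinity
z = np.unique(np.sort(atoms.get_positions()[:, -1]))
z_min, d_interlayer = z.min(), abs(z[1] - z[0])
z_cutoffs = z_min + (np.array(layer_cutoffs) - 1) * d_interlayer + 0.1
z_cutoffs = np.concatenate(([0.0], z_cutoffs, [np.inf]))

pos = atoms.get_positions()
for i in range(len(z_cutoffs) - 1):
    idx = np.flatnonzero((pos[:, -1] >= z_cutoffs[i]) & (pos[:, -1] < z_cutoffs[i + 1]))
    print(f"partition {i}: {len(idx)} atoms")  # layers 1-3, 4-6, 7-9 -> 12 atoms each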
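Note (outside the patch): a quick check of the pooling change, assuming torch_geometric >= 2.1; the tensors below are made up. gnn.aggr.SumAggregation is an nn.Module (unlike the functional global_add_pool it replaces), so it can be registered in the new nn.ModuleList, and calling it with index=data.batch gives the same per-graph sums.

import torch
from torch_geometric.nn import aggr

pool = aggr.SumAggregation()           # module-based equivalent of global_add_pool
x = torch.ones(5, 4)                   # 5 nodes with 4 features each
batch = torch.tensor([0, 0, 0, 1, 1])  # node-to-graph assignment, as in data.batch
out = pool(x, index=batch)             # shape (2, 4): one summed row per graph
print(out)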
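Note (outside the patch): a short sketch of why Standardizer.restore_cont divides the mean by the number of contributions, with made-up numbers. Since final_lin_transform is frozen to all-ones weights, the standardized output is the sum of the standardized partition contributions, so adding mean / n to each restored contribution keeps the per-partition values summing to the restored total.

import torch

mean, std = -1.25, 0.40                     # illustrative Standardizer state
conts_std = torch.tensor([0.3, -0.1, 0.5])  # standardized contributions, one per partition
output_std = conts_std.sum()                # frozen all-ones final layer

total = mean + output_std * std                      # Standardizer.restore
conts = mean / conts_std.shape[0] + conts_std * std  # Standardizer.restore_cont

print(float(total))        # about -0.97
print(float(conts.sum()))  # matches the restored total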