diff --git a/.gitignore b/.gitignore
index 513abe4..1acfb60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@ data/*
 !data/dband_centers.csv
 __pycache__
 *.cif
+trained_models
+*.pt
+*__init__.py
\ No newline at end of file
diff --git a/src/data.py b/src/data.py
index aa1a23d..093b964 100644
--- a/src/data.py
+++ b/src/data.py
@@ -10,9 +10,9 @@
 from ase.io import read
 from torch_geometric.data import Data, Dataset
 
-from constants import REPO_PATH
-from featurizers import OneHotEncoder
-from utils import featurize_atoms, partition_structure
+from .constants import REPO_PATH
+from .featurizers import OneHotEncoder
+from .utils import featurize_atoms, partition_structure_by_layers
 
 
 class AtomsDataset(Dataset):
@@ -60,7 +60,7 @@ def __init__(self, root, prop_csv):
 
     def process_data(
         self,
-        z_cutoffs,
+        layer_cutoffs,
         node_features,
         edge_features,
         max_atoms=None,
@@ -75,8 +75,8 @@ def process_data(
 
         Parameters
         ----------
-        z_cutoffs: list or np.ndarray
-            List of z-coordinates based on which atomic structures are
+        layer_cutoffs: list or np.ndarray
+            List of layer cutoffs based on which atomic structures are
             partitioned. The number of partitions is equal to one more than
             the length of z_cutoffs.
         node_features: list[list]
@@ -115,7 +115,7 @@
             atoms = read(str(file_path))
 
             # Partition structure
-            part_atoms = partition_structure(atoms, z_cutoffs)
+            part_atoms = partition_structure_by_layers(atoms, layer_cutoffs)
 
             # Featurize partitions
             data_objects = []
@@ -191,7 +191,7 @@ def __init__(self, atoms):
 
     def process_data(
         self,
-        z_cutoffs,
+        layer_cutoffs,
         node_features,
         edge_features,
         max_atoms=None,
@@ -206,8 +206,8 @@ def process_data(
 
         Parameters
         ----------
-        z_cutoffs: list or np.ndarray
-            List of z-coordinates based on which atomic structures are
+        layer_cutoffs: list or np.ndarray
+            List of layer cutoffs based on which atomic structures are
             partitioned. The number of partitions is equal to one more than
             the length of z_cutoffs.
         node_features: list[list]
@@ -229,7 +229,7 @@ def process_data(
         # Iterate over files and process them
         for atoms_obj in self.atoms:
             # Partition structure
-            part_atoms = partition_structure(atoms_obj, z_cutoffs)
+            part_atoms = partition_structure_by_layers(atoms_obj, layer_cutoffs)
 
             # Featurize partitions
             data_objects = []
@@ -337,28 +337,11 @@ def load_datapoints(atoms, process_dict):
     data_root_path = Path(REPO_PATH) / "data" / "S_calcs"
     prop_csv_path = data_root_path / "name_prop.csv"
 
-    # Create dataset
-    dataset = AtomsDataset(data_root_path, prop_csv_path)
-    # dataset.process_data(z_cutoffs=[13., 20.],
-    #                      node_features=[
-    #                          ["atomic_number", "dband_center"],
-    #                          ["atomic_number", "reactivity"],
-    #                          ["atomic_number", "reactivity"],
-    #                      ],
-    #                      edge_features=[
-    #                          ["bulk_bond_distance"],
-    #                          ["surface_bond_distance"],
-    #                          ["adsorbate_bond_distance"],
-    #                      ])
-    print(dataset[0][-1].x)
-    print(dataset.df_name_idx.head())
-    print(dataset[0][-1].name)
-
     # Create datapoint
     atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif")
     datapoint = AtomsDatapoints(atoms)
     datapoint.process_data(
-        z_cutoffs=[13.0, 20.0],
+        layer_cutoffs=[3, 6],
         node_features=[
             ["atomic_number", "dband_center"],
             ["atomic_number", "reactivity"],
diff --git a/src/featurizers.py b/src/featurizers.py
index 4ca0109..bd749f5 100644
--- a/src/featurizers.py
+++ b/src/featurizers.py
@@ -9,8 +9,8 @@
 from mendeleev import element
 from torch.nn.functional import one_hot
 
-from constants import DBAND_FILE_PATH
-from graphs import AtomsGraph
+from .constants import DBAND_FILE_PATH
+from .graphs import AtomsGraph
 
 
 class OneHotEncoder:
diff --git a/src/models.py b/src/models.py
index 6f2d727..657792f 100644
--- a/src/models.py
+++ b/src/models.py
@@ -151,7 +151,7 @@ def forward(self, data_objects):
                 conv_data = layer(conv_data)
 
             # Apply pooling layer
-            pooled_data = self.pool_layers[i](x=conv_data, batch=None)
+            pooled_data = self.pool_layers[i](x=conv_data, batch=data.batch)
 
             # Apply pool-to-hidden transform
             hidden_data = self.pool_transform[i](pooled_data)
@@ -163,8 +163,8 @@
             contributions.append(hidden_data)
 
         # Apply final transformation
-        contributions = torch.cat(contributions, dim=-1)
-        output = self.final_lin_transform(contributions)
+        contributions = torch.cat(contributions)
+        output = self.final_lin_transform(contributions.view(-1, 3))
 
         return {"output": output, "contributions": contributions}
diff --git a/src/train.py b/src/train.py
index 2e9ab02..9af85a1 100644
--- a/src/train.py
+++ b/src/train.py
@@ -7,7 +7,7 @@
 import torch
 from sklearn.metrics import mean_absolute_error, mean_squared_error
 
-from models import MultiGCN
+from .models import MultiGCN
 
 
 class Standardizer:
@@ -122,6 +122,8 @@ def __init__(self, global_config, partition_configs, model_path):
 
         # Set GPU status
         self.use_gpu = global_config["gpu"]
+        if self.use_gpu:
+            self.model.cuda()
 
         # Set loss function
         if global_config["loss_function"] == "mse":
@@ -151,7 +153,7 @@ def __init__(self, global_config, partition_configs, model_path):
 
         # Set scheduler
         if "lr_milestones" in global_config.keys():
-            self.scheduler = torch.optim.MultiStepLR(
+            self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
                 optimizer=self.optimizer, milestones=global_config["lr_milestones"]
             )
         else:
@@ -173,7 +175,7 @@ def init_standardizer(self, targets):
         targets: np.ndarray or torch.Tensor
             Array of training outputs
         """
-        self.standardizer = Standardizer(targets)
+        self.standardizer = Standardizer(torch.Tensor(targets))
 
     def train_epoch(self, dataloader):
         """Train the model for a single epoch.
@@ -209,11 +211,11 @@ def train_epoch(self, dataloader):
             pred_dict = self.model(nn_input)
 
             # Calculate loss
-            loss = self.loss_fn(nn_output, pred_dict["output"])
+            loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1))
             avg_loss += loss
 
             # Calculate metric
-            y_pred = self.standardizer.restore(pred_dict["output"].cpu())
+            y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach())
             metric = self.metric_fn(y, y_pred)
             avg_metric += metric
 
@@ -273,11 +275,11 @@ def validate(self, dataloader):
             pred_dict = self.model(nn_input)
 
             # Calculate loss
-            loss = self.loss_fn(nn_output, pred_dict["output"])
+            loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1))
             avg_loss += loss
 
             # Calculate metric
-            y_pred = self.standardizer.restore(pred_dict["output"].cpu())
+            y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach())
             metric = self.metric_fn(y, y_pred)
             avg_metric += metric
 
@@ -318,7 +320,7 @@ def predict(self, dataset, indices, return_targets=False):
         # Go over each batch in the dataloader
         for i, idx in enumerate(indices):
             # Get data objects
-            data_objects = dataset.get(i)
+            data_objects = dataset.get(idx)
 
             # Standardize output
             if return_targets:
@@ -449,6 +451,11 @@ def train(self, epochs, dataloader_dict, verbose=False):
             train_metrics.append(train_metric)
             val_metrics.append(val_metric)
 
+            # Print progress, if verbose
+            if verbose:
+                print(f"Epoch: [{i}]\tTraining Loss: [{train_loss}]"
+                      f"\tValidation Loss: [{val_loss}]")
+
         # Load the best model
         self.load(best_status=True)
 
diff --git a/src/utils.py b/src/utils.py
index 5b2c6b2..a834949 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -8,12 +8,12 @@
 from torch.utils.data import SubsetRandomSampler
 from torch_geometric.loader import DataLoader
 
-from featurizers import (
+from .featurizers import (
     OneHotEncoder,
     list_of_edge_featurizers,
     list_of_node_featurizers,
 )
-from graphs import AtomsGraph
+from .graphs import AtomsGraph
 
 
 def partition_structure(atoms, z_cutoffs):
@@ -50,6 +50,46 @@ def partition_structure(atoms, z_cutoffs):
     return part_atoms
 
 
+def partition_structure_by_layers(atoms, layer_cutoffs):
+    """Partition atomic structure into bulk, surface, and/or adsorbates by layers.
+
+    Parameters
+    ----------
+    atoms: ase.Atoms object
+        The structure to be partitioned
+    layer_cutoffs: list or np.ndarray
+        List of layer cutoffs. xy planes are placed above the specified layer
+        cutoffs to partition atoms above and below them. The length of
+        layer_cutoffs should be equal to one less than the number of partitions.
+ """ + # Set number of partitions equal to 1 more than the length of z_cutoffs + n_partitions = int(len(layer_cutoffs) + 1) + + # Calculate interlayer distance + z_array = np.unique(np.sort(atoms.get_positions()[:, -1])) + z_min = z_array.min() + d_interlayer = abs(z_array[1] - z_array[0]) + + # Add 0 and infinity to cutoffs + z_cutoffs = z_min + (np.array(layer_cutoffs) - 1) * d_interlayer + 0.1 + z_cutoffs = np.insert(z_cutoffs, 0, 0) + z_cutoffs = np.insert(z_cutoffs, len(z_cutoffs), np.inf) + + # Get positions + pos = atoms.get_positions() + + # Iterate over number of partitions + part_atoms = [] + for i in range(n_partitions): + part_idx = ( + np.argwhere((pos[:, -1] >= z_cutoffs[i]) & (pos[:, -1] < z_cutoffs[i + 1])) + .flatten() + .tolist() + ) + part_atoms.append(part_idx) + + return part_atoms + def featurize_atoms( atoms, diff --git a/workflows/basic_train_val_test.py b/workflows/basic_train_val_test.py new file mode 100644 index 0000000..82a6739 --- /dev/null +++ b/workflows/basic_train_val_test.py @@ -0,0 +1,87 @@ +"""Basic workflow to train, validate, and test a model.""" + +import torch + +from ..src.constants import REPO_PATH +from ..src.data import AtomsDataset +from ..src.samplers import RandomSampler +from ..src.utils import create_dataloaders +from ..src.train import Model + +# Set seeds +seed = 0 +torch.manual_seed(seed) + +# Create dataset +dataset_path = REPO_PATH / "data" / "S_calcs" +prop_csv_path = dataset_path / "name_prop.csv" +dataset = AtomsDataset(root=dataset_path, prop_csv=prop_csv_path) + +# Process dataset +# dataset.process_data(layer_cutoffs=[3, 6], +# node_features=[ +# ["atomic_number", "dband_center", "coordination"], +# ["atomic_number", "reactivity", "coordination"], +# ["atomic_number", "reactivity", "coordination"], +# ], +# edge_features=[ +# ["bulk_bond_distance"], +# ["surface_bond_distance"], +# ["adsorbate_bond_distance"], +# ]) + +# Create sampler +sample_config = {"train": 0.6, "val": 0.2, "test": 0.2} +sampler = RandomSampler(seed, dataset.len()) +sample_idx = sampler.create_samplers(sample_config) + +# Create dataloaders +dataloader_dict = create_dataloaders(dataset, sample_idx, batch_size=32) + +# Create model +global_config = { + "gpu": False, + "loss_function": "mse", + "metric_function": "mae", + "learning_rate": 0.1, + "optimizer": "adam", + "lr_milestones": [3, 10] +} +partition_configs = [ + { + "n_conv": 3, + "n_hidden": 3, + "hidden_size": 30, + "conv_size": 40, + "dropout": 0.1, + "num_node_features": dataset[0][0].num_node_features, + "num_edge_features": dataset[0][0].num_edge_features, + "conv_type": "CGConv", + }, + { + "n_conv": 3, + "n_hidden": 3, + "hidden_size": 30, + "conv_size": 40, + "dropout": 0.1, + "num_node_features": dataset[0][1].num_node_features, + "num_edge_features": dataset[0][1].num_edge_features, + "conv_type": "CGConv", + }, + { + "n_conv": 3, + "n_hidden": 3, + "hidden_size": 30, + "conv_size": 40, + "dropout": 0.1, + "num_node_features": dataset[0][2].num_node_features, + "num_edge_features": dataset[0][2].num_edge_features, + "conv_type": "CGConv", + }, +] + +model_path = REPO_PATH / "trained_models" / "S_binary_calcs" +model = Model(global_config, partition_configs, model_path) +model.init_standardizer([dataset[i][0].y for i in sample_idx["train"]]) +results_dict = model.train(100, dataloader_dict, verbose=True) +print(results_dict) \ No newline at end of file