Basic workflow on cpu
Gaurav S Deshmukh committed Sep 25, 2023
1 parent b6b8292 commit 037ca9d
Showing 7 changed files with 164 additions and 44 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -5,3 +5,6 @@ data/*
!data/dband_centers.csv
__pycache__
*.cif
trained_models
*.pt
*__init__.py
41 changes: 12 additions & 29 deletions src/data.py
@@ -10,9 +10,9 @@
from ase.io import read
from torch_geometric.data import Data, Dataset

from constants import REPO_PATH
from featurizers import OneHotEncoder
from utils import featurize_atoms, partition_structure
from .constants import REPO_PATH
from .featurizers import OneHotEncoder
from .utils import featurize_atoms, partition_structure_by_layers


class AtomsDataset(Dataset):
@@ -60,7 +60,7 @@ def __init__(self, root, prop_csv):

def process_data(
self,
z_cutoffs,
layer_cutoffs,
node_features,
edge_features,
max_atoms=None,
@@ -75,8 +75,8 @@
Parameters
----------
z_cutoffs: list or np.ndarray
List of z-coordinates based on which atomic structures are
layer_cutoffs: list or np.ndarray
List of layer cutoffs based on which atomic structures are
partitioned. The number of partitions is equal to one more than the
length of z_cutoffs.
node_features: list[list]
@@ -115,7 +115,7 @@ def process_data(
atoms = read(str(file_path))

# Partition structure
part_atoms = partition_structure(atoms, z_cutoffs)
part_atoms = partition_structure_by_layers(atoms, layer_cutoffs)

# Featurize partitions
data_objects = []
@@ -191,7 +191,7 @@ def __init__(self, atoms):

def process_data(
self,
z_cutoffs,
layer_cutoffs,
node_features,
edge_features,
max_atoms=None,
@@ -206,8 +206,8 @@
Parameters
----------
z_cutoffs: list or np.ndarray
List of z-coordinates based on which atomic structures are
layer_cutoffs: list or np.ndarray
List of layer cutoffs based on which atomic structures are
partitioned. The number of partitions is equal to one more than the
length of z_cutoffs.
node_features: list[list]
@@ -229,7 +229,7 @@
# Iterate over files and process them
for atoms_obj in self.atoms:
# Partition structure
part_atoms = partition_structure(atoms_obj, z_cutoffs)
part_atoms = partition_structure_by_layers(atoms_obj, layer_cutoffs)

# Featurize partitions
data_objects = []
@@ -337,28 +337,11 @@ def load_datapoints(atoms, process_dict):
data_root_path = Path(REPO_PATH) / "data" / "S_calcs"
prop_csv_path = data_root_path / "name_prop.csv"

# Create dataset
dataset = AtomsDataset(data_root_path, prop_csv_path)
# dataset.process_data(z_cutoffs=[13., 20.],
# node_features=[
# ["atomic_number", "dband_center"],
# ["atomic_number", "reactivity"],
# ["atomic_number", "reactivity"],
# ],
# edge_features=[
# ["bulk_bond_distance"],
# ["surface_bond_distance"],
# ["adsorbate_bond_distance"],
# ])
print(dataset[0][-1].x)
print(dataset.df_name_idx.head())
print(dataset[0][-1].name)

# Create datapoint
atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif")
datapoint = AtomsDatapoints(atoms)
datapoint.process_data(
z_cutoffs=[13.0, 20.0],
layer_cutoffs=[3, 6],
node_features=[
["atomic_number", "dband_center"],
["atomic_number", "reactivity"],
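For reference, the renamed argument also changes the meaning of the cutoffs: they are now layer indices counted from the bottom of the slab rather than absolute z-coordinates in Å. A minimal sketch of the updated call, reusing the paths and feature lists that appear in the __main__ block above (a sketch only; the feature names are taken from this diff, not checked against the featurizer registry):

    dataset = AtomsDataset(root=data_root_path, prop_csv=prop_csv_path)
    dataset.process_data(
        layer_cutoffs=[3, 6],  # planes placed above layers 3 and 6 -> bulk / surface / adsorbate partitions
        node_features=[
            ["atomic_number", "dband_center"],   # bulk partition
            ["atomic_number", "reactivity"],     # surface partition
            ["atomic_number", "reactivity"],     # adsorbate partition
        ],
        edge_features=[
            ["bulk_bond_distance"],
            ["surface_bond_distance"],
            ["adsorbate_bond_distance"],
        ],
    )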
4 changes: 2 additions & 2 deletions src/featurizers.py
@@ -9,8 +9,8 @@
from mendeleev import element
from torch.nn.functional import one_hot

from constants import DBAND_FILE_PATH
from graphs import AtomsGraph
from .constants import DBAND_FILE_PATH
from .graphs import AtomsGraph


class OneHotEncoder:
6 changes: 3 additions & 3 deletions src/models.py
@@ -151,7 +151,7 @@ def forward(self, data_objects):
conv_data = layer(conv_data)

# Apply pooling layer
pooled_data = self.pool_layers[i](x=conv_data, batch=None)
pooled_data = self.pool_layers[i](x=conv_data, batch=data.batch)

# Apply pool-to-hidden transform
hidden_data = self.pool_transform[i](pooled_data)
@@ -163,8 +163,8 @@
contributions.append(hidden_data)

# Apply final transformation
contributions = torch.cat(contributions, dim=-1)
output = self.final_lin_transform(contributions)
contributions = torch.cat(contributions)
output = self.final_lin_transform(contributions.view(-1, 3))

return {"output": output, "contributions": contributions}

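The forward() fix threads the mini-batch assignment vector (data.batch) into pooling, so each graph in a batch is pooled on its own instead of every node collapsing into a single graph. A hedged illustration with torch_geometric's global_mean_pool, used here only as a stand-in for whatever operator pool_layers actually wraps:

    import torch
    from torch_geometric.nn import global_mean_pool

    x = torch.arange(6, dtype=torch.float).view(6, 1)   # features for 6 nodes spread over 2 graphs
    batch = torch.tensor([0, 0, 0, 1, 1, 1])            # data.batch: graph index of every node

    print(global_mean_pool(x, batch=None).shape)  # torch.Size([1, 1]): all nodes pooled into one graph
    print(global_mean_pool(x, batch).shape)       # torch.Size([2, 1]): one pooled vector per graph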
23 changes: 15 additions & 8 deletions src/train.py
@@ -7,7 +7,7 @@
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error

from models import MultiGCN
from .models import MultiGCN


class Standardizer:
@@ -122,6 +122,8 @@ def __init__(self, global_config, partition_configs, model_path):

# Set GPU status
self.use_gpu = global_config["gpu"]
if self.use_gpu:
self.model.cuda()

# Set loss function
if global_config["loss_function"] == "mse":
@@ -151,7 +153,7 @@ def __init__(self, global_config, partition_configs, model_path):

# Set scheduler
if "lr_milestones" in global_config.keys():
self.scheduler = torch.optim.MultiStepLR(
self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
optimizer=self.optimizer, milestones=global_config["lr_milestones"]
)
else:
Expand All @@ -173,7 +175,7 @@ def init_standardizer(self, targets):
targets: np.ndarray or torch.Tensor
Array of training outputs
"""
self.standardizer = Standardizer(targets)
self.standardizer = Standardizer(torch.Tensor(targets))

def train_epoch(self, dataloader):
"""Train the model for a single epoch.
@@ -209,11 +211,11 @@ def train_epoch(self, dataloader):
pred_dict = self.model(nn_input)

# Calculate loss
loss = self.loss_fn(nn_output, pred_dict["output"])
loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1))
avg_loss += loss

# Calculate metric
y_pred = self.standardizer.restore(pred_dict["output"].cpu())
y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach())
metric = self.metric_fn(y, y_pred)
avg_metric += metric

@@ -273,11 +275,11 @@ def validate(self, dataloader):
pred_dict = self.model(nn_input)

# Calculate loss
loss = self.loss_fn(nn_output, pred_dict["output"])
loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1))
avg_loss += loss

# Calculate metric
y_pred = self.standardizer.restore(pred_dict["output"].cpu())
y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach())
metric = self.metric_fn(y, y_pred)
avg_metric += metric

@@ -318,7 +320,7 @@ def predict(self, dataset, indices, return_targets=False):
# Go over each batch in the dataloader
for i, idx in enumerate(indices):
# Get data objects
data_objects = dataset.get(i)
data_objects = dataset.get(idx)

# Standardize output
if return_targets:
@@ -449,6 +451,11 @@ def train(self, epochs, dataloader_dict, verbose=False):
train_metrics.append(train_metric)
val_metrics.append(val_metric)

# Print, if verbose
if verbose:
print(f"Epoch: [{i}]\tTraining Loss: [{train_loss}]\
\tValidation Loss: [{val_loss}]")

# Load the best model
self.load(best_status=True)

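Two of the train.py fixes above are shape and autograd details: the 1-D target vector is unsqueezed so the loss compares (N, 1) predictions against (N, 1) targets instead of broadcasting, and predictions are detached before being restored and scored with scikit-learn. A small hedged sketch of both points with made-up tensors (sizes and names are illustrative):

    import torch
    from sklearn.metrics import mean_absolute_error

    pred = torch.randn(8, 1, requires_grad=True)          # model output, shape (N, 1)
    target = torch.randn(8)                                # raw targets, shape (N,)

    loss = torch.nn.MSELoss()(pred, target.unsqueeze(1))   # unsqueeze avoids broadcasting (N, 1) vs (N,) to (N, N)
    loss.backward()

    y_pred = pred.cpu().detach().numpy()                   # detach before handing the tensor to scikit-learn
    print(mean_absolute_error(target.numpy(), y_pred))

The scheduler change is of the same corrective flavour: MultiStepLR lives under torch.optim.lr_scheduler, not directly under torch.optim.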
44 changes: 42 additions & 2 deletions src/utils.py
@@ -8,12 +8,12 @@
from torch.utils.data import SubsetRandomSampler
from torch_geometric.loader import DataLoader

from featurizers import (
from .featurizers import (
OneHotEncoder,
list_of_edge_featurizers,
list_of_node_featurizers,
)
from graphs import AtomsGraph
from .graphs import AtomsGraph


def partition_structure(atoms, z_cutoffs):
@@ -50,6 +50,46 @@ def partition_structure(atoms, z_cutoffs):

return part_atoms

def partition_structure_by_layers(atoms, layer_cutoffs):
"""Partition atomic structue into bulk, surface, and/or adsorbates by layers.
Parameters
----------
atoms: ase.Atoms object
The structure to be partitioned
layer_cutoffs: list or np.ndarray
List of layer cutoffs. xy planes are placed above the specified layer
cutoffs to partition atoms above and below them. The length of layer_cutoffs
should be equal to one less than the number of partitions.
"""
# Set number of partitions equal to 1 more than the length of layer_cutoffs
n_partitions = int(len(layer_cutoffs) + 1)

# Calculate interlayer distance
z_array = np.unique(np.sort(atoms.get_positions()[:, -1]))
z_min = z_array.min()
d_interlayer = abs(z_array[1] - z_array[0])

# Add 0 and infinity to cutoffs
z_cutoffs = z_min + (np.array(layer_cutoffs) - 1) * d_interlayer + 0.1
z_cutoffs = np.insert(z_cutoffs, 0, 0)
z_cutoffs = np.insert(z_cutoffs, len(z_cutoffs), np.inf)

# Get positions
pos = atoms.get_positions()

# Iterate over number of partitions
part_atoms = []
for i in range(n_partitions):
part_idx = (
np.argwhere((pos[:, -1] >= z_cutoffs[i]) & (pos[:, -1] < z_cutoffs[i + 1]))
.flatten()
.tolist()
)
part_atoms.append(part_idx)

return part_atoms


def featurize_atoms(
atoms,
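The new layer-based partitioner infers the interlayer spacing from the two lowest distinct z-coordinates and converts the layer indices into z-cutoffs internally. A minimal usage sketch on an ASE-built slab (the Pt(111) slab and the src.utils import path are assumptions for illustration):

    from ase.build import fcc111
    from src.utils import partition_structure_by_layers  # assumes src/ is importable as a package

    # 4-layer Pt(111) slab with 4 atoms per layer; element and size are illustrative
    slab = fcc111("Pt", size=(2, 2, 4), vacuum=10.0)

    # layer_cutoffs=[2] places one xy plane just above the 2nd layer from the bottom,
    # yielding two index lists: atoms in layers 1-2 and atoms in layers 3-4
    partitions = partition_structure_by_layers(slab, layer_cutoffs=[2])
    print([len(p) for p in partitions])  # expected: [8, 8]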
87 changes: 87 additions & 0 deletions workflows/basic_train_val_test.py
@@ -0,0 +1,87 @@
"""Basic workflow to train, validate, and test a model."""

import torch

from ..src.constants import REPO_PATH
from ..src.data import AtomsDataset
from ..src.samplers import RandomSampler
from ..src.utils import create_dataloaders
from ..src.train import Model

# Set seeds
seed = 0
torch.manual_seed(seed)

# Create dataset
dataset_path = REPO_PATH / "data" / "S_calcs"
prop_csv_path = dataset_path / "name_prop.csv"
dataset = AtomsDataset(root=dataset_path, prop_csv=prop_csv_path)

# Process dataset
# dataset.process_data(layer_cutoffs=[3, 6],
# node_features=[
# ["atomic_number", "dband_center", "coordination"],
# ["atomic_number", "reactivity", "coordination"],
# ["atomic_number", "reactivity", "coordination"],
# ],
# edge_features=[
# ["bulk_bond_distance"],
# ["surface_bond_distance"],
# ["adsorbate_bond_distance"],
# ])

# Create sampler
sample_config = {"train": 0.6, "val": 0.2, "test": 0.2}
sampler = RandomSampler(seed, dataset.len())
sample_idx = sampler.create_samplers(sample_config)

# Create dataloaders
dataloader_dict = create_dataloaders(dataset, sample_idx, batch_size=32)

# Create model
global_config = {
"gpu": False,
"loss_function": "mse",
"metric_function": "mae",
"learning_rate": 0.1,
"optimizer": "adam",
"lr_milestones": [3, 10]
}
partition_configs = [
{
"n_conv": 3,
"n_hidden": 3,
"hidden_size": 30,
"conv_size": 40,
"dropout": 0.1,
"num_node_features": dataset[0][0].num_node_features,
"num_edge_features": dataset[0][0].num_edge_features,
"conv_type": "CGConv",
},
{
"n_conv": 3,
"n_hidden": 3,
"hidden_size": 30,
"conv_size": 40,
"dropout": 0.1,
"num_node_features": dataset[0][1].num_node_features,
"num_edge_features": dataset[0][1].num_edge_features,
"conv_type": "CGConv",
},
{
"n_conv": 3,
"n_hidden": 3,
"hidden_size": 30,
"conv_size": 40,
"dropout": 0.1,
"num_node_features": dataset[0][2].num_node_features,
"num_edge_features": dataset[0][2].num_edge_features,
"conv_type": "CGConv",
},
]

model_path = REPO_PATH / "trained_models" / "S_binary_calcs"
model = Model(global_config, partition_configs, model_path)
model.init_standardizer([dataset[i][0].y for i in sample_idx["train"]])
results_dict = model.train(100, dataloader_dict, verbose=True)
print(results_dict)
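Because the workflow pulls its dependencies in with package-relative imports (from ..src.constants import ...), it has to be launched as a module inside a package rather than as a plain script. One hedged way to invoke it, where <repo_package> is a hypothetical placeholder for however the repository root is exposed as a package:

    # run from the directory that contains the repository folder (hypothetical layout)
    python -m <repo_package>.workflows.basic_train_val_test

If the repository is not laid out as a package, an alternative is to drop the leading dots from the imports (e.g. from src.data import AtomsDataset) and put the repository root on PYTHONPATH.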
