From 4cced6523e8b3da0dc68e37f72a6bd154e13a4e0 Mon Sep 17 00:00:00 2001
From: Gaurav S Deshmukh <deshmukg@gilbreth-fe03.rcac.purdue.edu>
Date: Tue, 19 Sep 2023 17:05:09 -0400
Subject: [PATCH] Added AtomsDataset

---
 src/data.py        | 116 +++++++++++++++++++++++++++++++++++++++++++++
 src/featurizers.py |   2 +-
 src/utils.py       |  12 ++---
 3 files changed, 120 insertions(+), 10 deletions(-)
 create mode 100644 src/data.py

diff --git a/src/data.py b/src/data.py
new file mode 100644
index 0000000..528b64c
--- /dev/null
+++ b/src/data.py
@@ -0,0 +1,116 @@
+"Store graph data using PyTorch Geometric abstractions."
+
+import csv
+from pathlib import Path
+
+import torch
+
+from ase.io import read
+from torch_geometric.data import Data, Dataset
+
+from utils import partition_structure, featurize_atoms
+from featurizers import OneHotEncoder
+
+class AtomsDataset(Dataset):
+    """Class to hold a dataset containing graphs of atomic_structures."""
+    def __init__(self, root, prop_csv):
+        """Initialize an AtomsDataset.
+
+        Atomic structures stored as .cif files in the root directory are loaded.
+
+        Parameters
+        ----------
+        root: str
+            Path to the directory in which atomic structures are stored
+        prop_csv: str
+            Path to the file mapping atomic structure filename and property.
+            This file will typically have two columns, the first with the
+            names of the cif files (without .cif) and the second with the
+            corresponding target property values.
+        """
+        # NOTE(review): torch_geometric's Dataset may call process() without
+        # arguments during __init__ and expects len()/get() -- TODO confirm.
+        super().__init__(root)
+
+        # Read csv
+        self.prop_csv = prop_csv
+        self.names = []
+        self.props = []
+        with open(self.prop_csv, "r", newline="") as f:
+            csv_reader = csv.reader(f)
+            for row in csv_reader:
+                self.names.append(str(row[0]))
+                self.props.append(float(row[1]))
+
+        # Create name to property map
+        self.map_name_prop = dict(zip(self.names, self.props))
+
+    def process(self, z_cutoffs, node_features, edge_features, max_atoms=12,
+                encoder=OneHotEncoder()):
+        """Process raw data in the root directory into PyTorch Data and save.
+
+        Each atomic structure in the root directory is partitioned based on the
+        given z_cutoffs and each partition is featurized according to the given
+        node_features and edge_features. The featurized graphs are converted
+        into Data objects and stored in the "processed" directory under root.
+
+        Parameters
+        ----------
+        z_cutoffs: list or np.ndarray
+            List of z-coordinates based on which atomic structures are
+            partitioned. The number of partitions is equal to one more than the
+            length of z_cutoffs.
+        node_features: list[list]
+            List of lists of node featurization methods to be used for each
+            partition. For e.g., specify [["atomic_number", "dband_center"],
+            ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for
+            a typical bulk + surface + adsorbate partition.
+        edge_features: list[list]
+            List of lists of edge featurization methods to be used for each
+            partition. For e.g., specify [["bulk_bond_distance"],
+            ["surface_bond_distance"], ["adsorbate_bond_distance"]] for
+            a typical bulk + surface + adsorbate partition.
+        max_atoms: int
+            Forwarded unchanged to featurize_atoms for every partition.
+        encoder: OneHotEncoder object
+            Encoder to convert properties to vectors
+        """
+        # Resolve input and output directories. mkdir() returns None, so the
+        # Path object is kept separately from the directory creation call.
+        root_path = Path(self.root)
+        processed_path = Path(self.processed_dir)
+        processed_path.mkdir(parents=True, exist_ok=True)
+
+        # Iterate over files and process them
+        for name in self.names:
+            # Read structure. The full file name is built before joining since
+            # "/" binds tighter than "+" (Path + str would raise TypeError).
+            file_path = root_path / f"{name}.cif"
+            atoms = read(str(file_path))
+
+            # Partition structure
+            part_atoms = partition_structure(atoms, z_cutoffs)
+
+            # Featurize partitions
+            data_objects = []
+            for i, part_idx in enumerate(part_atoms):
+                feat_dict = featurize_atoms(
+                    atoms,
+                    part_idx,
+                    node_features=node_features[i],
+                    edge_features=edge_features[i],
+                    max_atoms=max_atoms,
+                    encoder=encoder,
+                )
+
+                # Convert to Data object
+                data_obj = Data(
+                    x=feat_dict["node_tensor"],
+                    edge_index=feat_dict["edge_indices"],
+                    edge_attr=feat_dict["edge_tensor"],
+                    y=torch.Tensor([self.map_name_prop[name]]),
+                )
+                data_objects.append(data_obj)
+
+            # Save data objects
+            torch.save(data_objects, processed_path / f"{name}.pt")
\ No newline at end of file
diff --git a/src/featurizers.py b/src/featurizers.py
index b4c6d65..4ca0109 100644
--- a/src/featurizers.py
+++ b/src/featurizers.py
@@ -588,7 +588,7 @@ def __init__(self, encoder, min=0, max=4, n_intervals=16):
     @staticmethod
     def name():
         """Return the name of the featurizer."""
-        return "adsorbate_distance"
+        return "adsorbate_bond_distance"
 
 
 list_of_node_featurizers = [
diff --git a/src/utils.py b/src/utils.py
index 3ecf32b..0d3592a 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -13,26 +13,20 @@
 from graphs import AtomsGraph
 
 
-def partition_structure(atoms, n_partitions, z_cutoffs):
+def partition_structure(atoms, z_cutoffs):
     """Partition atomic structue into bulk, surface, and/or adsorbates.
 
     Parameters
     ----------
     atoms: ase.Atoms object
         The structure to be partitioned
-    n_partitions: int
-        Number of partitions
     z_cutoffs: list or np.ndarray
         List of z-coordinate cutoffs. xy planes are placed at the specified
         cutoffs to partition atoms above and below them. The length of z-cutoffs
         should be equal to one less than the number of partitions.
     """
-    # Check if length of z_cutoffs is equal to n_paritions
-    if len(z_cutoffs) != n_partitions - 1:
-        raise ValueError(
-            "The length of z_cutoffs must be equal to\
-                         one less than the number of partitions"
-        )
+    # Set number of partitions equal to 1 more than the length of z_cutoffs
+    n_partitions = int(len(z_cutoffs) + 1)
 
     # Add 0 and infinity to cutoffs
     z_cutoffs = np.insert(z_cutoffs, 0, 0)