From 33293ea6053544b40db1a96af14bfddb9f41f427 Mon Sep 17 00:00:00 2001
From: Gaurav S Deshmukh <deshmukg@gilbreth-fe02.rcac.purdue.edu>
Date: Fri, 22 Sep 2023 11:53:27 -0400
Subject: [PATCH] Added AtomsDatapoint

---
 src/data.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/src/data.py b/src/data.py
index fb3ec17..c482e22 100644
--- a/src/data.py
+++ b/src/data.py
@@ -8,6 +8,7 @@
 import tqdm
 
 from ase.io import read
+from ase import Atoms
 from torch_geometric.data import Data, Dataset
 
 from utils import partition_structure, featurize_atoms
@@ -156,6 +157,103 @@ def processed_status(self):
         else:
             return False
 
+class AtomsDatapoints:
+    """Class to hold atomic structures as datapoints (without targets).
+    
+    The main difference between this class and AtomsDataset is that this is
+    initialized with a list of atoms objects (as opposed to a directory with
+    files containing atomic structures) without any targets specified. This is
+    useful to make predictions on atomic structures for which true target values
+    are not known, i.e., previously unseen structures.
+    """
+    def __init__(self, atoms):
+        """Initialize an AtomsDatapoints object.
+
+        Atomic structures provided in the list are initialized.
+
+        Parameters
+        ----------
+        atoms: ase.Atoms object or a list of ase.Atoms objects
+            Structures for which predictions are to be made.
+        """
+        # If single object, convert to list
+        if isinstance(atoms, Atoms):
+            atoms = [atoms]
+
+        # Save object
+        self.atoms = atoms
+        self.data = []
+
+    def process_data(self, z_cutoffs, node_features, edge_features, pad=False,
+                max_atoms=12, encoder=OneHotEncoder()):
+        """Process list of Atoms objects into PyTorch Data and save.
+
+        Each atomic structure in self.atoms is partitioned based on the
+        given z_cutoffs and each partition is featurized according to the given
+        node_features and edge_features. The featurized graphs are converted
+        into Data objects and appended to self.data (one list per structure).
+
+        Parameters
+        ----------
+        z_cutoffs: list or np.ndarray
+            List of z-coordinates based on which atomic structures are
+            partitioned. The number of partitions is equal to one more than the
+            length of z_cutoffs.
+        node_features: list[list]
+            List of lists of node featurization methods to be used for each
+            partition. E.g., specify [["atomic_number", "dband_center"], 
+            ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for
+            a typical bulk + surface + adsorbate partition.
+        edge_features: list[list]
+            List of lists of edge featurization methods to be used for each
+            partition. E.g., specify [["bulk_bond_distance"], 
+            ["surface_bond_distance"], ["adsorbate_bond_distance"]] for
+            a typical bulk + surface + adsorbate partition.
+        pad: bool
+            Whether to pad the graph with empty nodes to make total nodes add
+            up to max_atoms
+        max_atoms: int
+            Maximum number of nodes in graph. Only used if pad is True.
+        encoder: OneHotEncoder object
+            Encoder to convert properties to vectors
+        """
+        # Iterate over structures and process them
+        for atoms_obj in self.atoms:
+            # Partition structure
+            part_atoms = partition_structure(atoms_obj, z_cutoffs)
+
+            # Featurize partitions
+            data_objects = []
+            for j, part_idx in enumerate(part_atoms):
+                feat_dict = featurize_atoms(
+                    atoms_obj,
+                    part_idx,
+                    node_features=node_features[j],
+                    edge_features=edge_features[j],
+                    pad=pad,
+                    max_atoms=max_atoms,
+                    encoder=encoder
+                )
+
+                # Convert to Data object
+                data_obj = Data(
+                    x=feat_dict["node_tensor"],
+                    edge_index=feat_dict["edge_indices"],
+                    edge_attr=feat_dict["edge_tensor"],
+                )
+                data_objects.append(data_obj)
+
+            # Save data objects
+            self.data.append(data_objects)
+
+    def len(self):
+        """Return size of the dataset."""
+        return len(self.data)
+    
+    def get(self, i):
+        """Fetch the processed graph(s) at the i-th index."""
+        data_objects = self.data[i]
+        return data_objects
 
 if __name__ == "__main__":
     # Get path to root directory