From 4cced6523e8b3da0dc68e37f72a6bd154e13a4e0 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Tue, 19 Sep 2023 17:05:09 -0400 Subject: [PATCH 01/12] Added AtomsDataset --- src/data.py | 116 +++++++++++++++++++++++++++++++++++++++++++++ src/featurizers.py | 2 +- src/utils.py | 12 ++--- 3 files changed, 120 insertions(+), 10 deletions(-) create mode 100644 src/data.py diff --git a/src/data.py b/src/data.py new file mode 100644 index 0000000..528b64c --- /dev/null +++ b/src/data.py @@ -0,0 +1,116 @@ +"Store graph data using PyTorch Geometric abstractions." + +import csv +from pathlib import Path + +import torch + +from ase.io import read +from torch_geometric.data import Data, Dataset + +from utils import partition_structure, featurize_atoms +from featurizers import OneHotEncoder + +class AtomsDataset(Dataset): + """Class to hold a dataset containing graphs of atomic_structures.""" + def __init__(self, root, prop_csv): + """Initialize an AtomsDataset. + + Atomic structures stored as .cif files in the root directory are loaded. + + Paramters + --------- + root: str + Path to the directory in which atomic structures are stored + pro_csv: str + Path to the file mapping atomic structure filename and property. + This filename will typically have two columns, the first with the + names of the cif files (without .cif) and the second with the + corresponding target property values. + """ + super().__init__(root) + + # Read csv + self.prop_csv = prop_csv + self.names = [] + self.props = [] + with open(self.prop_csv, "r") as f: + csv_reader = csv.reader(f) + for row in csv_reader: + self.names.append(str(row[0])) + self.props.append(float(row[1])) + + # Create name to property map + self.map_name_prop = { + name: prop for name, prop in zip(self.names, self.props) + } + + def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, + encoder=OneHotEncoder()): + """Process raw data in the root directory into PyTorch Data and save. + + Each atomic structure in the root directory is partitioned based on the + given z_cutoffs and each partition is featurized according to the given + node_features and edge_features. The featurized graphs are converted + into Data objects and stored in the "processed" directory under root. + + Parameters + ---------- + z_cutoffs: list or np.ndarray + List of z-coordinates based on which atomic structures are + partitioned. The number of partitions is equal to one more than the + length of z_cutoffs. + node_features: list[list] + List of lists of node featurization methods to be used for each + partition. For e.g., specify [["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for + a typical bulk + surface + adsorbate partition. + edge_features: list[list] + List of lists of edge featurization methods to be used for each + partition. For e.g., specify [["bulk_bond_distance"], + ["surface_bond_distance"], ["adsorbate_bond_distance"]] for + a typical bulk + surface + adsorbate partition. + encoder: OneHotEncoder object + Encoder to convert properties to vectors + """ + # Root path + root_path = Path(self.root) + + # Create processed path if it doesn't exist + processed_path = Path(self.processed_dir).mkdir(exist_ok=True) + + # Iterate over files and process them + for name in self.names: + # Set file path + file_path = root_path / name + ".cif" + + # Read structure + atoms = read(str(file_path)) + + # Partition structure + part_atoms = partition_structure(atoms, z_cutoffs) + + # Featurize partitions + data_objects = [] + for i, part_idx in enumerate(part_atoms): + feat_dict = featurize_atoms( + atoms, + part_idx, + node_features=node_features[i], + edge_features=edge_features[i],\ + max_atoms=max_atoms, + encoder=encoder + ) + + # Convert to Data object + data_obj = Data( + x=feat_dict["node_tensor"], + edge_index=feat_dict["edge_indices"], + edge_attr=feat_dict["edge_tensor"], + y=torch.Tensor([self.map_name_prop[name]]) + ) + data_objects.append(data_obj) + + # Save data objects + torch.save(data_objects, processed_path / name + ".pt") + \ No newline at end of file diff --git a/src/featurizers.py b/src/featurizers.py index b4c6d65..4ca0109 100644 --- a/src/featurizers.py +++ b/src/featurizers.py @@ -588,7 +588,7 @@ def __init__(self, encoder, min=0, max=4, n_intervals=16): @staticmethod def name(): """Return the name of the featurizer.""" - return "adsorbate_distance" + return "adsorbate_bond_distance" list_of_node_featurizers = [ diff --git a/src/utils.py b/src/utils.py index 3ecf32b..0d3592a 100644 --- a/src/utils.py +++ b/src/utils.py @@ -13,26 +13,20 @@ from graphs import AtomsGraph -def partition_structure(atoms, n_partitions, z_cutoffs): +def partition_structure(atoms, z_cutoffs): """Partition atomic structue into bulk, surface, and/or adsorbates. Parameters ---------- atoms: ase.Atoms object The structure to be partitioned - n_partitions: int - Number of partitions z_cutoffs: list or np.ndarray List of z-coordinate cutoffs. xy planes are placed at the specified cutoffs to partition atoms above and below them. The length of z-cutoffs should be equal to one less than the number of partitions. """ - # Check if length of z_cutoffs is equal to n_paritions - if len(z_cutoffs) != n_partitions - 1: - raise ValueError( - "The length of z_cutoffs must be equal to\ - one less than the number of partitions" - ) + # Set number of partitions equal to 1 more than the length of z_cutoffs + n_partitions = int(len(z_cutoffs) + 1) # Add 0 and infinity to cutoffs z_cutoffs = np.insert(z_cutoffs, 0, 0) From 77c5620f7b2a5a2fb7f8ad712448660c58a415e3 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Tue, 19 Sep 2023 17:23:27 -0400 Subject: [PATCH 02/12] Added option to not pad graphs --- src/graphs.py | 6 ++++-- src/utils.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/graphs.py b/src/graphs.py index 40a09b3..a3ac282 100644 --- a/src/graphs.py +++ b/src/graphs.py @@ -14,7 +14,7 @@ class AtomsGraph: """Create graph representation of a collection of atoms.""" - def __init__(self, atoms, select_idx, max_atoms=50): + def __init__(self, atoms, select_idx, pad=True, max_atoms=50): """Initialize variables of the class. Parameters @@ -32,6 +32,7 @@ def __init__(self, atoms, select_idx, max_atoms=50): # Save parameters self.atoms = atoms self.select_idx = select_idx + self.pad = pad self.max_atoms = max_atoms # Create graph @@ -108,7 +109,8 @@ def create_graph(self): graph.add_edge(n, self.map_idx_node[nn], bond_distance=bond_dist) # Pad graph - graph = self.pad_graph(graph) + if self.pad: + graph = self.pad_graph(graph) # Add coordination numbers for n in graph.nodes(): diff --git a/src/utils.py b/src/utils.py index 0d3592a..8436778 100644 --- a/src/utils.py +++ b/src/utils.py @@ -53,6 +53,7 @@ def featurize_atoms( select_idx, node_features, edge_features, + pad=True, max_atoms=50, encoder=OneHotEncoder(), ): @@ -74,6 +75,9 @@ def featurize_atoms( Names of edge featurizers to use (current options: bulk_bond_distance, surface_bond_distance, adsorbate_bond_distance). All of these encode bond distance using a one-hot encoder, but the bounds for each vary. + pad: bool + If True, the graph is padded to ensure the number of nodes is equal to + max_atoms. In that case, the blank nodes have all 0s in their node tensors. max_atoms: int (default = 50) Maximum number of allowed atoms. If the number of atoms in the graph are fewer than this number, the graph is padded with empty nodes. This is @@ -89,7 +93,8 @@ def featurize_atoms( corresponding tensors as values. """ # Create graph - atoms_graph = AtomsGraph(atoms=atoms, select_idx=select_idx, max_atoms=max_atoms) + atoms_graph = AtomsGraph(atoms=atoms, select_idx=select_idx, max_atoms=max_atoms, + pad=pad) # Collect node featurizers node_feats = [] @@ -140,7 +145,7 @@ def featurize_atoms( atoms = read("CONTCAR") - part_atoms = partition_structure(atoms, 3, z_cutoffs=[15, 23.5]) + part_atoms = partition_structure(atoms, z_cutoffs=[15, 23.5]) print(part_atoms) feat_dict = featurize_atoms( @@ -149,5 +154,6 @@ def featurize_atoms( ["atomic_number", "dband_center"], ["bulk_bond_distance"], max_atoms=34, + pad=False, ) print(feat_dict) From 003f6698dfdffebd8d38433d68cf76fc6c145765 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Wed, 20 Sep 2023 11:15:22 -0400 Subject: [PATCH 03/12] Added len and get methods --- src/data.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/data.py b/src/data.py index 528b64c..708640b 100644 --- a/src/data.py +++ b/src/data.py @@ -29,6 +29,7 @@ def __init__(self, root, prop_csv): corresponding target property values. """ super().__init__(root) + self.root_path = Path(self.root) # Read csv self.prop_csv = prop_csv @@ -73,16 +74,13 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, encoder: OneHotEncoder object Encoder to convert properties to vectors """ - # Root path - root_path = Path(self.root) - # Create processed path if it doesn't exist - processed_path = Path(self.processed_dir).mkdir(exist_ok=True) + self.processed_path = Path(self.processed_dir).mkdir(exist_ok=True) # Iterate over files and process them - for name in self.names: + for i, name in enumerate(self.names): # Set file path - file_path = root_path / name + ".cif" + file_path = self.root_path / name + ".cif" # Read structure atoms = read(str(file_path)) @@ -92,12 +90,12 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, # Featurize partitions data_objects = [] - for i, part_idx in enumerate(part_atoms): + for j, part_idx in enumerate(part_atoms): feat_dict = featurize_atoms( atoms, part_idx, - node_features=node_features[i], - edge_features=edge_features[i],\ + node_features=node_features[j], + edge_features=edge_features[j],\ max_atoms=max_atoms, encoder=encoder ) @@ -112,5 +110,13 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, data_objects.append(data_obj) # Save data objects - torch.save(data_objects, processed_path / name + ".pt") - \ No newline at end of file + torch.save(data_objects, self.processed_path / f"data_{i}.pt") + + def len(self): + """Return size of the dataset.""" + return len(self.names) + + def get(self, i): + """Fetch the processed graph(s) at the i-th index.""" + data_objects = torch.load(self.processed_path / f"data_{i}.pt") + return data_objects \ No newline at end of file From b3e924674f41f64cc8559e7975505f1557f9c9de Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Wed, 20 Sep 2023 11:47:56 -0400 Subject: [PATCH 04/12] Add name of struct --- src/data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/data.py b/src/data.py index 708640b..a09e111 100644 --- a/src/data.py +++ b/src/data.py @@ -109,6 +109,9 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, ) data_objects.append(data_obj) + # Add name of structure + data_objects.append(name) + # Save data objects torch.save(data_objects, self.processed_path / f"data_{i}.pt") From c8adcb0ae4b9c453914dc7b74b858eba57ca9459 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Wed, 20 Sep 2023 13:26:09 -0400 Subject: [PATCH 05/12] Updated gitignore --- .gitignore | 1 + src/data.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 94d6dd9..bc0c661 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ *.csv !data/dband_centers.csv __pycache__ +*.cif diff --git a/src/data.py b/src/data.py index a09e111..c322fb5 100644 --- a/src/data.py +++ b/src/data.py @@ -25,7 +25,7 @@ def __init__(self, root, prop_csv): pro_csv: str Path to the file mapping atomic structure filename and property. This filename will typically have two columns, the first with the - names of the cif files (without .cif) and the second with the + names of the cif files and the second with the corresponding target property values. """ super().__init__(root) @@ -80,7 +80,7 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, # Iterate over files and process them for i, name in enumerate(self.names): # Set file path - file_path = self.root_path / name + ".cif" + file_path = self.root_path / name # Read structure atoms = read(str(file_path)) From e98d9988c15b78f2184118462e8c1623e0e78555 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Thu, 21 Sep 2023 19:07:29 -0400 Subject: [PATCH 06/12] AtomsDataset works --- .gitignore | 1 + src/data.py | 48 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index bc0c661..513abe4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *POSCAR* *CONTCAR* *.csv +data/* !data/dband_centers.csv __pycache__ *.cif diff --git a/src/data.py b/src/data.py index c322fb5..671537d 100644 --- a/src/data.py +++ b/src/data.py @@ -4,12 +4,14 @@ from pathlib import Path import torch +import tqdm from ase.io import read from torch_geometric.data import Data, Dataset from utils import partition_structure, featurize_atoms from featurizers import OneHotEncoder +from constants import REPO_PATH class AtomsDataset(Dataset): """Class to hold a dataset containing graphs of atomic_structures.""" @@ -22,7 +24,7 @@ def __init__(self, root, prop_csv): --------- root: str Path to the directory in which atomic structures are stored - pro_csv: str + prop_csv: str Path to the file mapping atomic structure filename and property. This filename will typically have two columns, the first with the names of the cif files and the second with the @@ -31,6 +33,10 @@ def __init__(self, root, prop_csv): super().__init__(root) self.root_path = Path(self.root) + # Create processed path if it doesn't exist + self.processed_path = Path(self.processed_dir) + self.processed_path.mkdir(exist_ok=True) + # Read csv self.prop_csv = prop_csv self.names = [] @@ -46,8 +52,8 @@ def __init__(self, root, prop_csv): name: prop for name, prop in zip(self.names, self.props) } - def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, - encoder=OneHotEncoder()): + def process_data(self, z_cutoffs, node_features, edge_features, pad=False, + max_atoms=12, encoder=OneHotEncoder()): """Process raw data in the root directory into PyTorch Data and save. Each atomic structure in the root directory is partitioned based on the @@ -71,14 +77,17 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, partition. For e.g., specify [["bulk_bond_distance"], ["surface_bond_distance"], ["adsorbate_bond_distance"]] for a typical bulk + surface + adsorbate partition. + pad: bool + Whether to pad the graph with empty nodes to make total nodes add + up to max_atoms + max_atoms: int + Maximum number of nodes in graph. Only used if pad is True. encoder: OneHotEncoder object Encoder to convert properties to vectors """ - # Create processed path if it doesn't exist - self.processed_path = Path(self.processed_dir).mkdir(exist_ok=True) - # Iterate over files and process them - for i, name in enumerate(self.names): + for i, name in tqdm.tqdm(enumerate(self.names), desc="Processing data", + total=len(self.names)): # Set file path file_path = self.root_path / name @@ -95,7 +104,8 @@ def process(self, z_cutoffs, node_features, edge_features, max_atoms=12, atoms, part_idx, node_features=node_features[j], - edge_features=edge_features[j],\ + edge_features=edge_features[j], + pad=pad, max_atoms=max_atoms, encoder=encoder ) @@ -122,4 +132,24 @@ def len(self): def get(self, i): """Fetch the processed graph(s) at the i-th index.""" data_objects = torch.load(self.processed_path / f"data_{i}.pt") - return data_objects \ No newline at end of file + return data_objects + +if __name__ == "__main__": + # Get path to root directory + data_root_path = Path(REPO_PATH) / "data" / "S_calcs" + prop_csv_path = data_root_path / "name_prop.csv" + + # Create dataset + dataset = AtomsDataset(data_root_path, prop_csv_path) + dataset.process_data(z_cutoffs=[13., 20.], + node_features=[ + ["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], + ["atomic_number", "reactivity"], + ], + edge_features=[ + ["bulk_bond_distance"], + ["surface_bond_distance"], + ["adsorbate_bond_distance"], + ]) + print(dataset[0][-2].x) \ No newline at end of file From ffac10bcc565e63e31472a29999f8c15ebd53a80 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Thu, 21 Sep 2023 19:10:50 -0400 Subject: [PATCH 07/12] Blank change --- src/data.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/data.py b/src/data.py index 671537d..28a9049 100644 --- a/src/data.py +++ b/src/data.py @@ -141,15 +141,15 @@ def get(self, i): # Create dataset dataset = AtomsDataset(data_root_path, prop_csv_path) - dataset.process_data(z_cutoffs=[13., 20.], - node_features=[ - ["atomic_number", "dband_center"], - ["atomic_number", "reactivity"], - ["atomic_number", "reactivity"], - ], - edge_features=[ - ["bulk_bond_distance"], - ["surface_bond_distance"], - ["adsorbate_bond_distance"], - ]) + # dataset.process_data(z_cutoffs=[13., 20.], + # node_features=[ + # ["atomic_number", "dband_center"], + # ["atomic_number", "reactivity"], + # ["atomic_number", "reactivity"], + # ], + # edge_features=[ + # ["bulk_bond_distance"], + # ["surface_bond_distance"], + # ["adsorbate_bond_distance"], + # ]) print(dataset[0][-2].x) \ No newline at end of file From 964dfcd9e2ea56b1a0ed0c7b328334408c010b7a Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Thu, 21 Sep 2023 19:30:42 -0400 Subject: [PATCH 08/12] Add processed flag --- src/data.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/data.py b/src/data.py index 28a9049..423fd26 100644 --- a/src/data.py +++ b/src/data.py @@ -36,6 +36,7 @@ def __init__(self, root, prop_csv): # Create processed path if it doesn't exist self.processed_path = Path(self.processed_dir) self.processed_path.mkdir(exist_ok=True) + self.process_flag = False # Read csv self.prop_csv = prop_csv @@ -124,6 +125,9 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, # Save data objects torch.save(data_objects, self.processed_path / f"data_{i}.pt") + + # Set process flag to true + self.process_flag = True def len(self): """Return size of the dataset.""" @@ -133,6 +137,11 @@ def get(self, i): """Fetch the processed graph(s) at the i-th index.""" data_objects = torch.load(self.processed_path / f"data_{i}.pt") return data_objects + + def processed_status(self): + """Check if the dataset is processed.""" + return self.process_flag + if __name__ == "__main__": # Get path to root directory From dc8f977ecb6e2936bf513a2e2d418d67b530f7fe Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Thu, 21 Sep 2023 19:59:32 -0400 Subject: [PATCH 09/12] Added name index dataframe --- src/data.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/src/data.py b/src/data.py index 423fd26..fb3ec17 100644 --- a/src/data.py +++ b/src/data.py @@ -3,6 +3,7 @@ import csv from pathlib import Path +import pandas as pd import torch import tqdm @@ -36,7 +37,6 @@ def __init__(self, root, prop_csv): # Create processed path if it doesn't exist self.processed_path = Path(self.processed_dir) self.processed_path.mkdir(exist_ok=True) - self.process_flag = False # Read csv self.prop_csv = prop_csv @@ -53,6 +53,11 @@ def __init__(self, root, prop_csv): name: prop for name, prop in zip(self.names, self.props) } + # Load index.csv if processed + self.index_path = self.processed_path / "index.csv" + if self.processed_status(): + self.df_name_idx = pd.read_csv(self.index_path) + def process_data(self, z_cutoffs, node_features, edge_features, pad=False, max_atoms=12, encoder=OneHotEncoder()): """Process raw data in the root directory into PyTorch Data and save. @@ -86,9 +91,18 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, encoder: OneHotEncoder object Encoder to convert properties to vectors """ + # Create empty dataframe to store index and name correspondence + self.df_name_idx = pd.DataFrame( + {"index": [0] * len(self.names), "name": [""] * len(self.names)} + ) + # Iterate over files and process them for i, name in tqdm.tqdm(enumerate(self.names), desc="Processing data", total=len(self.names)): + # Map index to name + self.df_name_idx.loc[i, "index"] = i + self.df_name_idx.loc[i, "name"] = name + # Set file path file_path = self.root_path / name @@ -120,14 +134,11 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, ) data_objects.append(data_obj) - # Add name of structure - data_objects.append(name) - # Save data objects torch.save(data_objects, self.processed_path / f"data_{i}.pt") - # Set process flag to true - self.process_flag = True + # Save name-index dataframe + self.df_name_idx.to_csv(self.index_path, index=None) def len(self): """Return size of the dataset.""" @@ -140,7 +151,10 @@ def get(self, i): def processed_status(self): """Check if the dataset is processed.""" - return self.process_flag + if Path(self.index_path).exists(): + return True + else: + return False if __name__ == "__main__": @@ -161,4 +175,5 @@ def processed_status(self): # ["surface_bond_distance"], # ["adsorbate_bond_distance"], # ]) - print(dataset[0][-2].x) \ No newline at end of file + print(dataset[0][-1].x) + print(dataset.df_name_idx.head()) \ No newline at end of file From 33293ea6053544b40db1a96af14bfddb9f41f427 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Fri, 22 Sep 2023 11:53:27 -0400 Subject: [PATCH 10/12] Added AtomsDatapoint --- src/data.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/src/data.py b/src/data.py index fb3ec17..c482e22 100644 --- a/src/data.py +++ b/src/data.py @@ -8,6 +8,7 @@ import tqdm from ase.io import read +from ase import Atoms from torch_geometric.data import Data, Dataset from utils import partition_structure, featurize_atoms @@ -156,6 +157,103 @@ def processed_status(self): else: return False +class AtomsDatapoints: + """Class to hold atomic structures as a datapoints (without targets). + + This main difference between this class and AtomsDataset is that this is + initialized with a list of atoms objects (as opposed to a directory with + files containing atomic structures) without any targets specified. This is + useful to make predictions on atomic structures for which true target values + are not known, i.e., previously unseen structures. + """ + def __init__(self, atoms): + """Initialize an AtomsDatapoint. + + Atomic structures provided in the list are initialized. + + Paramters + --------- + atoms: ase.Atoms object or a list of ase.Atoms objects + Structures for which predictions are to be made. + """ + # If single object, convert to list + if isinstance(atoms, Atoms): + atoms = [atoms] + + # Save object + self.atoms = atoms + self.data = [] + + def process_data(self, z_cutoffs, node_features, edge_features, pad=False, + max_atoms=12, encoder=OneHotEncoder()): + """Process list of Atoms objects into PyTorch Data and save. + + Each atomic structure in the root directory is partitioned based on the + given z_cutoffs and each partition is featurized according to the given + node_features and edge_features. The featurized graphs are converted + into Data objects and stored in the "processed" directory under root. + + Parameters + ---------- + z_cutoffs: list or np.ndarray + List of z-coordinates based on which atomic structures are + partitioned. The number of partitions is equal to one more than the + length of z_cutoffs. + node_features: list[list] + List of lists of node featurization methods to be used for each + partition. For e.g., specify [["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for + a typical bulk + surface + adsorbate partition. + edge_features: list[list] + List of lists of edge featurization methods to be used for each + partition. For e.g., specify [["bulk_bond_distance"], + ["surface_bond_distance"], ["adsorbate_bond_distance"]] for + a typical bulk + surface + adsorbate partition. + pad: bool + Whether to pad the graph with empty nodes to make total nodes add + up to max_atoms + max_atoms: int + Maximum number of nodes in graph. Only used if pad is True. + encoder: OneHotEncoder object + Encoder to convert properties to vectors + """ + # Iterate over files and process them + for atoms_obj in self.atoms: + # Partition structure + part_atoms = partition_structure(atoms_obj, z_cutoffs) + + # Featurize partitions + data_objects = [] + for j, part_idx in enumerate(part_atoms): + feat_dict = featurize_atoms( + atoms_obj, + part_idx, + node_features=node_features[j], + edge_features=edge_features[j], + pad=pad, + max_atoms=max_atoms, + encoder=encoder + ) + + # Convert to Data object + data_obj = Data( + x=feat_dict["node_tensor"], + edge_index=feat_dict["edge_indices"], + edge_attr=feat_dict["edge_tensor"], + ) + data_objects.append(data_obj) + + # Save data objects + self.data.append(data_objects) + + def len(self): + """Return size of the dataset.""" + return len(self.data) + + def get(self, i): + """Fetch the processed graph(s) at the i-th index.""" + data_objects = self.data[i] + return data_objects if __name__ == "__main__": # Get path to root directory From 986380fef182cbfa52993d18438a4beafee5cea4 Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Fri, 22 Sep 2023 12:23:06 -0400 Subject: [PATCH 11/12] Added data functions --- src/data.py | 111 ++++++++++++++++++++++++++++++++++++++++++-------- src/graphs.py | 12 +++--- src/utils.py | 19 +++------ 3 files changed, 106 insertions(+), 36 deletions(-) diff --git a/src/data.py b/src/data.py index c482e22..c1920c1 100644 --- a/src/data.py +++ b/src/data.py @@ -59,8 +59,8 @@ def __init__(self, root, prop_csv): if self.processed_status(): self.df_name_idx = pd.read_csv(self.index_path) - def process_data(self, z_cutoffs, node_features, edge_features, pad=False, - max_atoms=12, encoder=OneHotEncoder()): + def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, + encoder=OneHotEncoder()): """Process raw data in the root directory into PyTorch Data and save. Each atomic structure in the root directory is partitioned based on the @@ -84,11 +84,9 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, partition. For e.g., specify [["bulk_bond_distance"], ["surface_bond_distance"], ["adsorbate_bond_distance"]] for a typical bulk + surface + adsorbate partition. - pad: bool - Whether to pad the graph with empty nodes to make total nodes add - up to max_atoms - max_atoms: int - Maximum number of nodes in graph. Only used if pad is True. + max_atoms: int (default = None) + Maximum number of nodes in graph. If a value is provided, graphs are + padded to make sure the total number of nodes matches max_atoms. encoder: OneHotEncoder object Encoder to convert properties to vectors """ @@ -121,7 +119,6 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, part_idx, node_features=node_features[j], edge_features=edge_features[j], - pad=pad, max_atoms=max_atoms, encoder=encoder ) @@ -184,8 +181,8 @@ def __init__(self, atoms): self.atoms = atoms self.data = [] - def process_data(self, z_cutoffs, node_features, edge_features, pad=False, - max_atoms=12, encoder=OneHotEncoder()): + def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, + encoder=OneHotEncoder()): """Process list of Atoms objects into PyTorch Data and save. Each atomic structure in the root directory is partitioned based on the @@ -209,11 +206,9 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, partition. For e.g., specify [["bulk_bond_distance"], ["surface_bond_distance"], ["adsorbate_bond_distance"]] for a typical bulk + surface + adsorbate partition. - pad: bool - Whether to pad the graph with empty nodes to make total nodes add - up to max_atoms - max_atoms: int - Maximum number of nodes in graph. Only used if pad is True. + max_atoms: int (default is None) + Maximum number of nodes in graph. If a value is provided, graphs are + padded to make sure the total number of nodes matches max_atoms. encoder: OneHotEncoder object Encoder to convert properties to vectors """ @@ -230,7 +225,6 @@ def process_data(self, z_cutoffs, node_features, edge_features, pad=False, part_idx, node_features=node_features[j], edge_features=edge_features[j], - pad=pad, max_atoms=max_atoms, encoder=encoder ) @@ -255,6 +249,73 @@ def get(self, i): data_objects = self.data[i] return data_objects +def load_dataset(root, prop_csv, process_dict=None): + """Load an AtomsDataset at the path given by root. + + If process_dict is provided, the process_data method of AtomsDataset is called + to convert the atomic structures to graphs based on the given parameters in + process_dict. This should be used when the dataset is created for the first + time. + + Parameters + ---------- + root: str + Path to the dataset + prop_csv: str + Path to the file mapping atomic structure filename and property. + This filename will typically have two columns, the first with the + names of the cif files and the second with the + corresponding target property values. + process_dict: dict (default = None) + If this is provided, atomic structures at root will be processed into + graphs and stored under a "processed" subdirectory. Only use this when + creating a new dataset. This should contain the following keys: z_cutoffs, + node_features, edge_features, max_atoms (optional), encoder (optional). + Refer to the documentation of process_atoms for more information regarding + these parameters. + + Returns + ------- + dataset: AtomsDataset + Initialized AtomsDataset object + """ + dataset = AtomsDataset(root, prop_csv) + if process_dict is not None: + dataset.process_data(**process_dict) + + return dataset + +def load_datapoints(atoms, process_dict): + """Load AtomsDatapoints for the provided ase.Atoms or list of ase.Atoms. + + If process_dict is provided, the process_data method of AtomsDatapoints is called + to convert the atomic structures to graphs based on the given parameters in + process_dict. This should be used when the dataset is created for the first + time. + + Parameters + ---------- + atoms: ase.Atoms object or a list of ase.Atoms objects + Structures for which predictions are to be made. + process_dict: dict + Parameters to process the provided Atoms objects into graphs. + This should contain the following keys: z_cutoffs, node_features, + edge_features, max_atoms (optional), encoder (optional). Refer to the + documentation of process_atoms for more information regarding these + parameters. + + Returns + ------- + datapoints: AtomsDatapoints + Initialized AtomsDatapoints object + """ + datapoints = AtomsDatapoints(atoms) + if process_dict is not None: + datapoints.process_data(**process_dict) + + return datapoints + + if __name__ == "__main__": # Get path to root directory data_root_path = Path(REPO_PATH) / "data" / "S_calcs" @@ -274,4 +335,20 @@ def get(self, i): # ["adsorbate_bond_distance"], # ]) print(dataset[0][-1].x) - print(dataset.df_name_idx.head()) \ No newline at end of file + print(dataset.df_name_idx.head()) + + # Create datapoint + atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif") + datapoint = AtomsDatapoints(atoms) + datapoint.process_data(z_cutoffs=[13., 20.], + node_features=[ + ["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], + ["atomic_number", "reactivity"], + ], + edge_features=[ + ["bulk_bond_distance"], + ["surface_bond_distance"], + ["adsorbate_bond_distance"], + ]) + print(datapoint.get(0)) \ No newline at end of file diff --git a/src/graphs.py b/src/graphs.py index a3ac282..7e0f747 100644 --- a/src/graphs.py +++ b/src/graphs.py @@ -14,7 +14,7 @@ class AtomsGraph: """Create graph representation of a collection of atoms.""" - def __init__(self, atoms, select_idx, pad=True, max_atoms=50): + def __init__(self, atoms, select_idx, max_atoms=None): """Initialize variables of the class. Parameters @@ -25,14 +25,14 @@ def __init__(self, atoms, select_idx, pad=True, max_atoms=50): List of indices of atoms that are to be included in the graph neighbor_list: ase.neighborlist.NeighborList object Neighbor list that defines bonds between atoms - max_atoms: int (default = 50) - The maximum number of atoms in the graph. Graphs that have fewer - atoms are padded with 0s to reach this value. + max_atoms: int (default = None) + The maximum number of atoms in the graph. If it is not None, graphs + that have fewer nodes than max_atoms are padded with 0s to ensure + that the total number of nodes is equal to max_atoms. """ # Save parameters self.atoms = atoms self.select_idx = select_idx - self.pad = pad self.max_atoms = max_atoms # Create graph @@ -109,7 +109,7 @@ def create_graph(self): graph.add_edge(n, self.map_idx_node[nn], bond_distance=bond_dist) # Pad graph - if self.pad: + if self.max_atoms is not None: graph = self.pad_graph(graph) # Add coordination numbers diff --git a/src/utils.py b/src/utils.py index 8436778..2e73f2f 100644 --- a/src/utils.py +++ b/src/utils.py @@ -53,8 +53,7 @@ def featurize_atoms( select_idx, node_features, edge_features, - pad=True, - max_atoms=50, + max_atoms=None, encoder=OneHotEncoder(), ): """Featurize atoms and bonds with the chosen featurizers. @@ -75,14 +74,10 @@ def featurize_atoms( Names of edge featurizers to use (current options: bulk_bond_distance, surface_bond_distance, adsorbate_bond_distance). All of these encode bond distance using a one-hot encoder, but the bounds for each vary. - pad: bool - If True, the graph is padded to ensure the number of nodes is equal to - max_atoms. In that case, the blank nodes have all 0s in their node tensors. - max_atoms: int (default = 50) - Maximum number of allowed atoms. If the number of atoms in the graph are - fewer than this number, the graph is padded with empty nodes. This is - required to make the sizes of the node feature tensors consistent across - structures. + max_atoms: int (default = None) + Maximum number of allowed atoms. If it is not None, graphs + that have fewer nodes than max_atoms are padded with 0s to ensure + that the total number of nodes is equal to max_atoms. encoder: encoder object from featurizers.py Currently only the OneHotEncoder is supported @@ -93,8 +88,7 @@ def featurize_atoms( corresponding tensors as values. """ # Create graph - atoms_graph = AtomsGraph(atoms=atoms, select_idx=select_idx, max_atoms=max_atoms, - pad=pad) + atoms_graph = AtomsGraph(atoms=atoms, select_idx=select_idx, max_atoms=max_atoms) # Collect node featurizers node_feats = [] @@ -154,6 +148,5 @@ def featurize_atoms( ["atomic_number", "dband_center"], ["bulk_bond_distance"], max_atoms=34, - pad=False, ) print(feat_dict) From e3c22417ecae7afef710b40fbd1db640e210255c Mon Sep 17 00:00:00 2001 From: Gaurav S Deshmukh Date: Fri, 22 Sep 2023 12:25:41 -0400 Subject: [PATCH 12/12] Fixed codestyle --- src/data.py | 108 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/src/data.py b/src/data.py index c1920c1..e9561af 100644 --- a/src/data.py +++ b/src/data.py @@ -1,4 +1,4 @@ -"Store graph data using PyTorch Geometric abstractions." +"""Store graph data using PyTorch Geometric abstractions.""" import csv from pathlib import Path @@ -6,17 +6,18 @@ import pandas as pd import torch import tqdm - -from ase.io import read from ase import Atoms +from ase.io import read from torch_geometric.data import Data, Dataset -from utils import partition_structure, featurize_atoms -from featurizers import OneHotEncoder from constants import REPO_PATH +from featurizers import OneHotEncoder +from utils import featurize_atoms, partition_structure + class AtomsDataset(Dataset): """Class to hold a dataset containing graphs of atomic_structures.""" + def __init__(self, root, prop_csv): """Initialize an AtomsDataset. @@ -50,17 +51,21 @@ def __init__(self, root, prop_csv): self.props.append(float(row[1])) # Create name to property map - self.map_name_prop = { - name: prop for name, prop in zip(self.names, self.props) - } + self.map_name_prop = {name: prop for name, prop in zip(self.names, self.props)} # Load index.csv if processed self.index_path = self.processed_path / "index.csv" if self.processed_status(): self.df_name_idx = pd.read_csv(self.index_path) - def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, - encoder=OneHotEncoder()): + def process_data( + self, + z_cutoffs, + node_features, + edge_features, + max_atoms=None, + encoder=OneHotEncoder(), + ): """Process raw data in the root directory into PyTorch Data and save. Each atomic structure in the root directory is partitioned based on the @@ -76,12 +81,12 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, length of z_cutoffs. node_features: list[list] List of lists of node featurization methods to be used for each - partition. For e.g., specify [["atomic_number", "dband_center"], + partition. For e.g., specify [["atomic_number", "dband_center"], ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for a typical bulk + surface + adsorbate partition. edge_features: list[list] List of lists of edge featurization methods to be used for each - partition. For e.g., specify [["bulk_bond_distance"], + partition. For e.g., specify [["bulk_bond_distance"], ["surface_bond_distance"], ["adsorbate_bond_distance"]] for a typical bulk + surface + adsorbate partition. max_atoms: int (default = None) @@ -96,8 +101,9 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, ) # Iterate over files and process them - for i, name in tqdm.tqdm(enumerate(self.names), desc="Processing data", - total=len(self.names)): + for i, name in tqdm.tqdm( + enumerate(self.names), desc="Processing data", total=len(self.names) + ): # Map index to name self.df_name_idx.loc[i, "index"] = i self.df_name_idx.loc[i, "name"] = name @@ -120,7 +126,7 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, node_features=node_features[j], edge_features=edge_features[j], max_atoms=max_atoms, - encoder=encoder + encoder=encoder, ) # Convert to Data object @@ -128,7 +134,7 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, x=feat_dict["node_tensor"], edge_index=feat_dict["edge_indices"], edge_attr=feat_dict["edge_tensor"], - y=torch.Tensor([self.map_name_prop[name]]) + y=torch.Tensor([self.map_name_prop[name]]), ) data_objects.append(data_obj) @@ -137,16 +143,16 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, # Save name-index dataframe self.df_name_idx.to_csv(self.index_path, index=None) - + def len(self): """Return size of the dataset.""" return len(self.names) - + def get(self, i): """Fetch the processed graph(s) at the i-th index.""" data_objects = torch.load(self.processed_path / f"data_{i}.pt") return data_objects - + def processed_status(self): """Check if the dataset is processed.""" if Path(self.index_path).exists(): @@ -154,15 +160,17 @@ def processed_status(self): else: return False + class AtomsDatapoints: """Class to hold atomic structures as a datapoints (without targets). - + This main difference between this class and AtomsDataset is that this is initialized with a list of atoms objects (as opposed to a directory with files containing atomic structures) without any targets specified. This is useful to make predictions on atomic structures for which true target values are not known, i.e., previously unseen structures. """ + def __init__(self, atoms): """Initialize an AtomsDatapoint. @@ -181,8 +189,14 @@ def __init__(self, atoms): self.atoms = atoms self.data = [] - def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, - encoder=OneHotEncoder()): + def process_data( + self, + z_cutoffs, + node_features, + edge_features, + max_atoms=None, + encoder=OneHotEncoder(), + ): """Process list of Atoms objects into PyTorch Data and save. Each atomic structure in the root directory is partitioned based on the @@ -198,12 +212,12 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, length of z_cutoffs. node_features: list[list] List of lists of node featurization methods to be used for each - partition. For e.g., specify [["atomic_number", "dband_center"], + partition. For e.g., specify [["atomic_number", "dband_center"], ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for a typical bulk + surface + adsorbate partition. edge_features: list[list] List of lists of edge featurization methods to be used for each - partition. For e.g., specify [["bulk_bond_distance"], + partition. For e.g., specify [["bulk_bond_distance"], ["surface_bond_distance"], ["adsorbate_bond_distance"]] for a typical bulk + surface + adsorbate partition. max_atoms: int (default is None) @@ -226,7 +240,7 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, node_features=node_features[j], edge_features=edge_features[j], max_atoms=max_atoms, - encoder=encoder + encoder=encoder, ) # Convert to Data object @@ -243,12 +257,13 @@ def process_data(self, z_cutoffs, node_features, edge_features, max_atoms=None, def len(self): """Return size of the dataset.""" return len(self.data) - + def get(self, i): """Fetch the processed graph(s) at the i-th index.""" data_objects = self.data[i] return data_objects + def load_dataset(root, prop_csv, process_dict=None): """Load an AtomsDataset at the path given by root. @@ -273,7 +288,7 @@ def load_dataset(root, prop_csv, process_dict=None): node_features, edge_features, max_atoms (optional), encoder (optional). Refer to the documentation of process_atoms for more information regarding these parameters. - + Returns ------- dataset: AtomsDataset @@ -285,6 +300,7 @@ def load_dataset(root, prop_csv, process_dict=None): return dataset + def load_datapoints(atoms, process_dict): """Load AtomsDatapoints for the provided ase.Atoms or list of ase.Atoms. @@ -297,13 +313,13 @@ def load_datapoints(atoms, process_dict): ---------- atoms: ase.Atoms object or a list of ase.Atoms objects Structures for which predictions are to be made. - process_dict: dict + process_dict: dict Parameters to process the provided Atoms objects into graphs. - This should contain the following keys: z_cutoffs, node_features, - edge_features, max_atoms (optional), encoder (optional). Refer to the + This should contain the following keys: z_cutoffs, node_features, + edge_features, max_atoms (optional), encoder (optional). Refer to the documentation of process_atoms for more information regarding these parameters. - + Returns ------- datapoints: AtomsDatapoints @@ -312,7 +328,7 @@ def load_datapoints(atoms, process_dict): datapoints = AtomsDatapoints(atoms) if process_dict is not None: datapoints.process_data(**process_dict) - + return datapoints @@ -323,7 +339,7 @@ def load_datapoints(atoms, process_dict): # Create dataset dataset = AtomsDataset(data_root_path, prop_csv_path) - # dataset.process_data(z_cutoffs=[13., 20.], + # dataset.process_data(z_cutoffs=[13., 20.], # node_features=[ # ["atomic_number", "dband_center"], # ["atomic_number", "reactivity"], @@ -340,15 +356,17 @@ def load_datapoints(atoms, process_dict): # Create datapoint atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif") datapoint = AtomsDatapoints(atoms) - datapoint.process_data(z_cutoffs=[13., 20.], - node_features=[ - ["atomic_number", "dband_center"], - ["atomic_number", "reactivity"], - ["atomic_number", "reactivity"], - ], - edge_features=[ - ["bulk_bond_distance"], - ["surface_bond_distance"], - ["adsorbate_bond_distance"], - ]) - print(datapoint.get(0)) \ No newline at end of file + datapoint.process_data( + z_cutoffs=[13.0, 20.0], + node_features=[ + ["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], + ["atomic_number", "reactivity"], + ], + edge_features=[ + ["bulk_bond_distance"], + ["surface_bond_distance"], + ["adsorbate_bond_distance"], + ], + ) + print(datapoint.get(0))