diff --git a/.gitignore b/.gitignore index 94d6dd9..513abe4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *POSCAR* *CONTCAR* *.csv +data/* !data/dband_centers.csv __pycache__ +*.cif diff --git a/src/data.py b/src/data.py new file mode 100644 index 0000000..e9561af --- /dev/null +++ b/src/data.py @@ -0,0 +1,372 @@ +"""Store graph data using PyTorch Geometric abstractions.""" + +import csv +from pathlib import Path + +import pandas as pd +import torch +import tqdm +from ase import Atoms +from ase.io import read +from torch_geometric.data import Data, Dataset + +from constants import REPO_PATH +from featurizers import OneHotEncoder +from utils import featurize_atoms, partition_structure + + +class AtomsDataset(Dataset): + """Class to hold a dataset containing graphs of atomic_structures.""" + + def __init__(self, root, prop_csv): + """Initialize an AtomsDataset. + + Atomic structures stored as .cif files in the root directory are loaded. + + Paramters + --------- + root: str + Path to the directory in which atomic structures are stored + prop_csv: str + Path to the file mapping atomic structure filename and property. + This filename will typically have two columns, the first with the + names of the cif files and the second with the + corresponding target property values. + """ + super().__init__(root) + self.root_path = Path(self.root) + + # Create processed path if it doesn't exist + self.processed_path = Path(self.processed_dir) + self.processed_path.mkdir(exist_ok=True) + + # Read csv + self.prop_csv = prop_csv + self.names = [] + self.props = [] + with open(self.prop_csv, "r") as f: + csv_reader = csv.reader(f) + for row in csv_reader: + self.names.append(str(row[0])) + self.props.append(float(row[1])) + + # Create name to property map + self.map_name_prop = {name: prop for name, prop in zip(self.names, self.props)} + + # Load index.csv if processed + self.index_path = self.processed_path / "index.csv" + if self.processed_status(): + self.df_name_idx = pd.read_csv(self.index_path) + + def process_data( + self, + z_cutoffs, + node_features, + edge_features, + max_atoms=None, + encoder=OneHotEncoder(), + ): + """Process raw data in the root directory into PyTorch Data and save. + + Each atomic structure in the root directory is partitioned based on the + given z_cutoffs and each partition is featurized according to the given + node_features and edge_features. The featurized graphs are converted + into Data objects and stored in the "processed" directory under root. + + Parameters + ---------- + z_cutoffs: list or np.ndarray + List of z-coordinates based on which atomic structures are + partitioned. The number of partitions is equal to one more than the + length of z_cutoffs. + node_features: list[list] + List of lists of node featurization methods to be used for each + partition. For e.g., specify [["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for + a typical bulk + surface + adsorbate partition. + edge_features: list[list] + List of lists of edge featurization methods to be used for each + partition. For e.g., specify [["bulk_bond_distance"], + ["surface_bond_distance"], ["adsorbate_bond_distance"]] for + a typical bulk + surface + adsorbate partition. + max_atoms: int (default = None) + Maximum number of nodes in graph. If a value is provided, graphs are + padded to make sure the total number of nodes matches max_atoms. + encoder: OneHotEncoder object + Encoder to convert properties to vectors + """ + # Create empty dataframe to store index and name correspondence + self.df_name_idx = pd.DataFrame( + {"index": [0] * len(self.names), "name": [""] * len(self.names)} + ) + + # Iterate over files and process them + for i, name in tqdm.tqdm( + enumerate(self.names), desc="Processing data", total=len(self.names) + ): + # Map index to name + self.df_name_idx.loc[i, "index"] = i + self.df_name_idx.loc[i, "name"] = name + + # Set file path + file_path = self.root_path / name + + # Read structure + atoms = read(str(file_path)) + + # Partition structure + part_atoms = partition_structure(atoms, z_cutoffs) + + # Featurize partitions + data_objects = [] + for j, part_idx in enumerate(part_atoms): + feat_dict = featurize_atoms( + atoms, + part_idx, + node_features=node_features[j], + edge_features=edge_features[j], + max_atoms=max_atoms, + encoder=encoder, + ) + + # Convert to Data object + data_obj = Data( + x=feat_dict["node_tensor"], + edge_index=feat_dict["edge_indices"], + edge_attr=feat_dict["edge_tensor"], + y=torch.Tensor([self.map_name_prop[name]]), + ) + data_objects.append(data_obj) + + # Save data objects + torch.save(data_objects, self.processed_path / f"data_{i}.pt") + + # Save name-index dataframe + self.df_name_idx.to_csv(self.index_path, index=None) + + def len(self): + """Return size of the dataset.""" + return len(self.names) + + def get(self, i): + """Fetch the processed graph(s) at the i-th index.""" + data_objects = torch.load(self.processed_path / f"data_{i}.pt") + return data_objects + + def processed_status(self): + """Check if the dataset is processed.""" + if Path(self.index_path).exists(): + return True + else: + return False + + +class AtomsDatapoints: + """Class to hold atomic structures as a datapoints (without targets). + + This main difference between this class and AtomsDataset is that this is + initialized with a list of atoms objects (as opposed to a directory with + files containing atomic structures) without any targets specified. This is + useful to make predictions on atomic structures for which true target values + are not known, i.e., previously unseen structures. + """ + + def __init__(self, atoms): + """Initialize an AtomsDatapoint. + + Atomic structures provided in the list are initialized. + + Paramters + --------- + atoms: ase.Atoms object or a list of ase.Atoms objects + Structures for which predictions are to be made. + """ + # If single object, convert to list + if isinstance(atoms, Atoms): + atoms = [atoms] + + # Save object + self.atoms = atoms + self.data = [] + + def process_data( + self, + z_cutoffs, + node_features, + edge_features, + max_atoms=None, + encoder=OneHotEncoder(), + ): + """Process list of Atoms objects into PyTorch Data and save. + + Each atomic structure in the root directory is partitioned based on the + given z_cutoffs and each partition is featurized according to the given + node_features and edge_features. The featurized graphs are converted + into Data objects and stored in the "processed" directory under root. + + Parameters + ---------- + z_cutoffs: list or np.ndarray + List of z-coordinates based on which atomic structures are + partitioned. The number of partitions is equal to one more than the + length of z_cutoffs. + node_features: list[list] + List of lists of node featurization methods to be used for each + partition. For e.g., specify [["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], ["atomic_number", "reactivity"]] for + a typical bulk + surface + adsorbate partition. + edge_features: list[list] + List of lists of edge featurization methods to be used for each + partition. For e.g., specify [["bulk_bond_distance"], + ["surface_bond_distance"], ["adsorbate_bond_distance"]] for + a typical bulk + surface + adsorbate partition. + max_atoms: int (default is None) + Maximum number of nodes in graph. If a value is provided, graphs are + padded to make sure the total number of nodes matches max_atoms. + encoder: OneHotEncoder object + Encoder to convert properties to vectors + """ + # Iterate over files and process them + for atoms_obj in self.atoms: + # Partition structure + part_atoms = partition_structure(atoms_obj, z_cutoffs) + + # Featurize partitions + data_objects = [] + for j, part_idx in enumerate(part_atoms): + feat_dict = featurize_atoms( + atoms_obj, + part_idx, + node_features=node_features[j], + edge_features=edge_features[j], + max_atoms=max_atoms, + encoder=encoder, + ) + + # Convert to Data object + data_obj = Data( + x=feat_dict["node_tensor"], + edge_index=feat_dict["edge_indices"], + edge_attr=feat_dict["edge_tensor"], + ) + data_objects.append(data_obj) + + # Save data objects + self.data.append(data_objects) + + def len(self): + """Return size of the dataset.""" + return len(self.data) + + def get(self, i): + """Fetch the processed graph(s) at the i-th index.""" + data_objects = self.data[i] + return data_objects + + +def load_dataset(root, prop_csv, process_dict=None): + """Load an AtomsDataset at the path given by root. + + If process_dict is provided, the process_data method of AtomsDataset is called + to convert the atomic structures to graphs based on the given parameters in + process_dict. This should be used when the dataset is created for the first + time. + + Parameters + ---------- + root: str + Path to the dataset + prop_csv: str + Path to the file mapping atomic structure filename and property. + This filename will typically have two columns, the first with the + names of the cif files and the second with the + corresponding target property values. + process_dict: dict (default = None) + If this is provided, atomic structures at root will be processed into + graphs and stored under a "processed" subdirectory. Only use this when + creating a new dataset. This should contain the following keys: z_cutoffs, + node_features, edge_features, max_atoms (optional), encoder (optional). + Refer to the documentation of process_atoms for more information regarding + these parameters. + + Returns + ------- + dataset: AtomsDataset + Initialized AtomsDataset object + """ + dataset = AtomsDataset(root, prop_csv) + if process_dict is not None: + dataset.process_data(**process_dict) + + return dataset + + +def load_datapoints(atoms, process_dict): + """Load AtomsDatapoints for the provided ase.Atoms or list of ase.Atoms. + + If process_dict is provided, the process_data method of AtomsDatapoints is called + to convert the atomic structures to graphs based on the given parameters in + process_dict. This should be used when the dataset is created for the first + time. + + Parameters + ---------- + atoms: ase.Atoms object or a list of ase.Atoms objects + Structures for which predictions are to be made. + process_dict: dict + Parameters to process the provided Atoms objects into graphs. + This should contain the following keys: z_cutoffs, node_features, + edge_features, max_atoms (optional), encoder (optional). Refer to the + documentation of process_atoms for more information regarding these + parameters. + + Returns + ------- + datapoints: AtomsDatapoints + Initialized AtomsDatapoints object + """ + datapoints = AtomsDatapoints(atoms) + if process_dict is not None: + datapoints.process_data(**process_dict) + + return datapoints + + +if __name__ == "__main__": + # Get path to root directory + data_root_path = Path(REPO_PATH) / "data" / "S_calcs" + prop_csv_path = data_root_path / "name_prop.csv" + + # Create dataset + dataset = AtomsDataset(data_root_path, prop_csv_path) + # dataset.process_data(z_cutoffs=[13., 20.], + # node_features=[ + # ["atomic_number", "dband_center"], + # ["atomic_number", "reactivity"], + # ["atomic_number", "reactivity"], + # ], + # edge_features=[ + # ["bulk_bond_distance"], + # ["surface_bond_distance"], + # ["adsorbate_bond_distance"], + # ]) + print(dataset[0][-1].x) + print(dataset.df_name_idx.head()) + + # Create datapoint + atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif") + datapoint = AtomsDatapoints(atoms) + datapoint.process_data( + z_cutoffs=[13.0, 20.0], + node_features=[ + ["atomic_number", "dband_center"], + ["atomic_number", "reactivity"], + ["atomic_number", "reactivity"], + ], + edge_features=[ + ["bulk_bond_distance"], + ["surface_bond_distance"], + ["adsorbate_bond_distance"], + ], + ) + print(datapoint.get(0)) diff --git a/src/featurizers.py b/src/featurizers.py index b4c6d65..4ca0109 100644 --- a/src/featurizers.py +++ b/src/featurizers.py @@ -588,7 +588,7 @@ def __init__(self, encoder, min=0, max=4, n_intervals=16): @staticmethod def name(): """Return the name of the featurizer.""" - return "adsorbate_distance" + return "adsorbate_bond_distance" list_of_node_featurizers = [ diff --git a/src/graphs.py b/src/graphs.py index 40a09b3..7e0f747 100644 --- a/src/graphs.py +++ b/src/graphs.py @@ -14,7 +14,7 @@ class AtomsGraph: """Create graph representation of a collection of atoms.""" - def __init__(self, atoms, select_idx, max_atoms=50): + def __init__(self, atoms, select_idx, max_atoms=None): """Initialize variables of the class. Parameters @@ -25,9 +25,10 @@ def __init__(self, atoms, select_idx, max_atoms=50): List of indices of atoms that are to be included in the graph neighbor_list: ase.neighborlist.NeighborList object Neighbor list that defines bonds between atoms - max_atoms: int (default = 50) - The maximum number of atoms in the graph. Graphs that have fewer - atoms are padded with 0s to reach this value. + max_atoms: int (default = None) + The maximum number of atoms in the graph. If it is not None, graphs + that have fewer nodes than max_atoms are padded with 0s to ensure + that the total number of nodes is equal to max_atoms. """ # Save parameters self.atoms = atoms @@ -108,7 +109,8 @@ def create_graph(self): graph.add_edge(n, self.map_idx_node[nn], bond_distance=bond_dist) # Pad graph - graph = self.pad_graph(graph) + if self.max_atoms is not None: + graph = self.pad_graph(graph) # Add coordination numbers for n in graph.nodes(): diff --git a/src/utils.py b/src/utils.py index 3ecf32b..2e73f2f 100644 --- a/src/utils.py +++ b/src/utils.py @@ -13,26 +13,20 @@ from graphs import AtomsGraph -def partition_structure(atoms, n_partitions, z_cutoffs): +def partition_structure(atoms, z_cutoffs): """Partition atomic structue into bulk, surface, and/or adsorbates. Parameters ---------- atoms: ase.Atoms object The structure to be partitioned - n_partitions: int - Number of partitions z_cutoffs: list or np.ndarray List of z-coordinate cutoffs. xy planes are placed at the specified cutoffs to partition atoms above and below them. The length of z-cutoffs should be equal to one less than the number of partitions. """ - # Check if length of z_cutoffs is equal to n_paritions - if len(z_cutoffs) != n_partitions - 1: - raise ValueError( - "The length of z_cutoffs must be equal to\ - one less than the number of partitions" - ) + # Set number of partitions equal to 1 more than the length of z_cutoffs + n_partitions = int(len(z_cutoffs) + 1) # Add 0 and infinity to cutoffs z_cutoffs = np.insert(z_cutoffs, 0, 0) @@ -59,7 +53,7 @@ def featurize_atoms( select_idx, node_features, edge_features, - max_atoms=50, + max_atoms=None, encoder=OneHotEncoder(), ): """Featurize atoms and bonds with the chosen featurizers. @@ -80,11 +74,10 @@ def featurize_atoms( Names of edge featurizers to use (current options: bulk_bond_distance, surface_bond_distance, adsorbate_bond_distance). All of these encode bond distance using a one-hot encoder, but the bounds for each vary. - max_atoms: int (default = 50) - Maximum number of allowed atoms. If the number of atoms in the graph are - fewer than this number, the graph is padded with empty nodes. This is - required to make the sizes of the node feature tensors consistent across - structures. + max_atoms: int (default = None) + Maximum number of allowed atoms. If it is not None, graphs + that have fewer nodes than max_atoms are padded with 0s to ensure + that the total number of nodes is equal to max_atoms. encoder: encoder object from featurizers.py Currently only the OneHotEncoder is supported @@ -146,7 +139,7 @@ def featurize_atoms( atoms = read("CONTCAR") - part_atoms = partition_structure(atoms, 3, z_cutoffs=[15, 23.5]) + part_atoms = partition_structure(atoms, z_cutoffs=[15, 23.5]) print(part_atoms) feat_dict = featurize_atoms(