Basic workflow on cpu
Gaurav S Deshmukh committed Sep 25, 2023
1 parent b6b8292 commit 037ca9d
Showing 7 changed files with 164 additions and 44 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -5,3 +5,6 @@ data/*
!data/dband_centers.csv
__pycache__
*.cif
trained_models
*.pt
*__init__.py
41 changes: 12 additions & 29 deletions src/data.py
@@ -10,9 +10,9 @@
from ase.io import read
from torch_geometric.data import Data, Dataset

from constants import REPO_PATH
from featurizers import OneHotEncoder
from utils import featurize_atoms, partition_structure
from .constants import REPO_PATH
from .featurizers import OneHotEncoder
from .utils import featurize_atoms, partition_structure_by_layers


class AtomsDataset(Dataset):
@@ -60,7 +60,7 @@ def __init__(self, root, prop_csv):

def process_data(
self,
z_cutoffs,
layer_cutoffs,
node_features,
edge_features,
max_atoms=None,
@@ -75,8 +75,8 @@
Parameters
----------
z_cutoffs: list or np.ndarray
List of z-coordinates based on which atomic structures are
layer_cutoffs: list or np.ndarray
List of layer cutoffs based on which atomic structures are
partitioned. The number of partitions is equal to one more than the
length of z_cutoffs.
node_features: list[list]
@@ -115,7 +115,7 @@ def process_data(
atoms = read(str(file_path))

# Partition structure
part_atoms = partition_structure(atoms, z_cutoffs)
part_atoms = partition_structure_by_layers(atoms, layer_cutoffs)

# Featurize partitions
data_objects = []
@@ -191,7 +191,7 @@ def __init__(self, atoms):

def process_data(
self,
z_cutoffs,
layer_cutoffs,
node_features,
edge_features,
max_atoms=None,
@@ -206,8 +206,8 @@
Parameters
----------
z_cutoffs: list or np.ndarray
List of z-coordinates based on which atomic structures are
layer_cutoffs: list or np.ndarray
List of layer cutoffs based on which atomic structures are
partitioned. The number of partitions is equal to one more than the
length of z_cutoffs.
node_features: list[list]
@@ -229,7 +229,7 @@
# Iterate over files and process them
for atoms_obj in self.atoms:
# Partition structure
part_atoms = partition_structure(atoms_obj, z_cutoffs)
part_atoms = partition_structure_by_layers(atoms_obj, layer_cutoffs)

# Featurize partitions
data_objects = []
@@ -337,28 +337,11 @@ def load_datapoints(atoms, process_dict):
data_root_path = Path(REPO_PATH) / "data" / "S_calcs"
prop_csv_path = data_root_path / "name_prop.csv"

# Create dataset
dataset = AtomsDataset(data_root_path, prop_csv_path)
# dataset.process_data(z_cutoffs=[13., 20.],
# node_features=[
# ["atomic_number", "dband_center"],
# ["atomic_number", "reactivity"],
# ["atomic_number", "reactivity"],
# ],
# edge_features=[
# ["bulk_bond_distance"],
# ["surface_bond_distance"],
# ["adsorbate_bond_distance"],
# ])
print(dataset[0][-1].x)
print(dataset.df_name_idx.head())
print(dataset[0][-1].name)

# Create datapoint
atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif")
datapoint = AtomsDatapoints(atoms)
datapoint.process_data(
z_cutoffs=[13.0, 20.0],
layer_cutoffs=[3, 6],
node_features=[
["atomic_number", "dband_center"],
["atomic_number", "reactivity"],
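For reference, the renamed argument also changes the meaning of the cutoffs: they are now layer indices counted from the bottom of the slab rather than absolute z-coordinates in Å. A minimal sketch of the updated call, reusing the paths and feature lists that appear in the __main__ block above (a sketch only; the feature names are taken from this diff, not checked against the featurizer registry):

    dataset = AtomsDataset(root=data_root_path, prop_csv=prop_csv_path)
    dataset.process_data(
        layer_cutoffs=[3, 6],  # planes placed above layers 3 and 6 -> bulk / surface / adsorbate partitions
        node_features=[
            ["atomic_number", "dband_center"],   # bulk partition
            ["atomic_number", "reactivity"],     # surface partition
            ["atomic_number", "reactivity"],     # adsorbate partition
        ],
        edge_features=[
            ["bulk_bond_distance"],
            ["surface_bond_distance"],
            ["adsorbate_bond_distance"],
        ],
    )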
4 changes: 2 additions & 2 deletions src/featurizers.py
@@ -9,8 +9,8 @@
from mendeleev import element
from torch.nn.functional import one_hot

from constants import DBAND_FILE_PATH
from graphs import AtomsGraph
from .constants import DBAND_FILE_PATH
from .graphs import AtomsGraph


class OneHotEncoder:
6 changes: 3 additions & 3 deletions src/models.py
@@ -151,7 +151,7 @@ def forward(self, data_objects):
conv_data = layer(conv_data)

# Apply pooling layer
pooled_data = self.pool_layers[i](x=conv_data, batch=None)
pooled_data = self.pool_layers[i](x=conv_data, batch=data.batch)

# Apply pool-to-hidden transform
hidden_data = self.pool_transform[i](pooled_data)
@@ -163,8 +163,8 @@
contributions.append(hidden_data)

# Apply final transformation
contributions = torch.cat(contributions, dim=-1)
output = self.final_lin_transform(contributions)
contributions = torch.cat(contributions)
output = self.final_lin_transform(contributions.view(-1, 3))

return {"output": output, "contributions": contributions}

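The forward() fix threads the mini-batch assignment vector (data.batch) into pooling, so each graph in a batch is pooled on its own instead of every node collapsing into a single graph. A hedged illustration with torch_geometric's global_mean_pool, used here only as a stand-in for whatever operator pool_layers actually wraps:

    import torch
    from torch_geometric.nn import global_mean_pool

    x = torch.arange(6, dtype=torch.float).view(6, 1)   # features for 6 nodes spread over 2 graphs
    batch = torch.tensor([0, 0, 0, 1, 1, 1])            # data.batch: graph index of every node

    print(global_mean_pool(x, batch=None).shape)  # torch.Size([1, 1]): all nodes pooled into one graph
    print(global_mean_pool(x, batch).shape)       # torch.Size([2, 1]): one pooled vector per graph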
23 changes: 15 additions & 8 deletions src/train.py
@@ -7,7 +7,7 @@
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error

from models import MultiGCN
from .models import MultiGCN


class Standardizer:
@@ -122,6 +122,8 @@ def __init__(self, global_config, partition_configs, model_path):

# Set GPU status
self.use_gpu = global_config["gpu"]
if self.use_gpu:
self.model.cuda()

# Set loss function
if global_config["loss_function"] == "mse":
@@ -151,7 +153,7 @@ def __init__(self, global_config, partition_configs, model_path):

# Set scheduler
if "lr_milestones" in global_config.keys():
self.scheduler = torch.optim.MultiStepLR(
self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
optimizer=self.optimizer, milestones=global_config["lr_milestones"]
)
else:
Expand All @@ -173,7 +175,7 @@ def init_standardizer(self, targets):
targets: np.ndarray or torch.Tensor
Array of training outputs
"""
self.standardizer = Standardizer(targets)
self.standardizer = Standardizer(torch.Tensor(targets))

def train_epoch(self, dataloader):
"""Train the model for a single epoch.
@@ -209,11 +211,11 @@ def train_epoch(self, dataloader):
pred_dict = self.model(nn_input)

# Calculate loss
loss = self.loss_fn(nn_output, pred_dict["output"])
loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1))
avg_loss += loss

# Calculate metric
y_pred = self.standardizer.restore(pred_dict["output"].cpu())
y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach())
metric = self.metric_fn(y, y_pred)
avg_metric += metric

@@ -273,11 +275,11 @@ def validate(self, dataloader):
pred_dict = self.model(nn_input)

# Calculate loss
loss = self.loss_fn(nn_output, pred_dict["output"])
loss = self.loss_fn(pred_dict["output"], nn_output.unsqueeze(1))
avg_loss += loss

# Calculate metric
y_pred = self.standardizer.restore(pred_dict["output"].cpu())
y_pred = self.standardizer.restore(pred_dict["output"].cpu().detach())
metric = self.metric_fn(y, y_pred)
avg_metric += metric

@@ -318,7 +320,7 @@ def predict(self, dataset, indices, return_targets=False):
# Go over each batch in the dataloader
for i, idx in enumerate(indices):
# Get data objects
data_objects = dataset.get(i)
data_objects = dataset.get(idx)

# Standardize output
if return_targets:
@@ -449,6 +451,11 @@ def train(self, epochs, dataloader_dict, verbose=False):
train_metrics.append(train_metric)
val_metrics.append(val_metric)

# Print, if verbose
if verbose:
print(f"Epoch: [{i}]\tTraining Loss: [{train_loss}]\
\tValidation Loss: [{val_loss}]")

# Load the best model
self.load(best_status=True)

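Two of the train.py fixes above are shape and autograd details: the 1-D target vector is unsqueezed so the loss compares (N, 1) predictions against (N, 1) targets instead of broadcasting, and predictions are detached before being restored and scored with scikit-learn. A small hedged sketch of both points with made-up tensors (sizes and names are illustrative):

    import torch
    from sklearn.metrics import mean_absolute_error

    pred = torch.randn(8, 1, requires_grad=True)          # model output, shape (N, 1)
    target = torch.randn(8)                                # raw targets, shape (N,)

    loss = torch.nn.MSELoss()(pred, target.unsqueeze(1))   # unsqueeze avoids broadcasting (N, 1) vs (N,) to (N, N)
    loss.backward()

    y_pred = pred.cpu().detach().numpy()                   # detach before handing the tensor to scikit-learn
    print(mean_absolute_error(target.numpy(), y_pred))

The scheduler change is of the same corrective flavour: MultiStepLR lives under torch.optim.lr_scheduler, not directly under torch.optim.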
44 changes: 42 additions & 2 deletions src/utils.py
@@ -8,12 +8,12 @@
from torch.utils.data import SubsetRandomSampler
from torch_geometric.loader import DataLoader

from featurizers import (
from .featurizers import (
OneHotEncoder,
list_of_edge_featurizers,
list_of_node_featurizers,
)
from graphs import AtomsGraph
from .graphs import AtomsGraph


def partition_structure(atoms, z_cutoffs):
@@ -50,6 +50,46 @@ def partition_structure(atoms, z_cutoffs):

return part_atoms

def partition_structure_by_layers(atoms, layer_cutoffs):
"""Partition atomic structue into bulk, surface, and/or adsorbates by layers.
Parameters
----------
atoms: ase.Atoms object
The structure to be partitioned
layer_cutoffs: list or np.ndarray
List of layer cutoffs. xy planes are placed above the specified layer
cutoffs to partition atoms above and below them. The length of layer_cutoffs
should be equal to one less than the number of partitions.
"""
# Set number of partitions equal to 1 more than the length of layer_cutoffs
n_partitions = int(len(layer_cutoffs) + 1)

# Calculate interlayer distance
z_array = np.unique(np.sort(atoms.get_positions()[:, -1]))
z_min = z_array.min()
d_interlayer = abs(z_array[1] - z_array[0])

# Add 0 and infinity to cutoffs
z_cutoffs = z_min + (np.array(layer_cutoffs) - 1) * d_interlayer + 0.1
z_cutoffs = np.insert(z_cutoffs, 0, 0)
z_cutoffs = np.insert(z_cutoffs, len(z_cutoffs), np.inf)

# Get positions
pos = atoms.get_positions()

# Iterate over number of partitions
part_atoms = []
for i in range(n_partitions):
part_idx = (
np.argwhere((pos[:, -1] >= z_cutoffs[i]) & (pos[:, -1] < z_cutoffs[i + 1]))
.flatten()
.tolist()
)
part_atoms.append(part_idx)

return part_atoms


def featurize_atoms(
atoms,
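The new layer-based partitioner infers the interlayer spacing from the two lowest distinct z-coordinates and converts the layer indices into z-cutoffs internally. A minimal usage sketch on an ASE-built slab (the Pt(111) slab and the src.utils import path are assumptions for illustration):

    from ase.build import fcc111
    from src.utils import partition_structure_by_layers  # assumes src/ is importable as a package

    # 4-layer Pt(111) slab with 4 atoms per layer; element and size are illustrative
    slab = fcc111("Pt", size=(2, 2, 4), vacuum=10.0)

    # layer_cutoffs=[2] places one xy plane just above the 2nd layer from the bottom,
    # yielding two index lists: atoms in layers 1-2 and atoms in layers 3-4
    partitions = partition_structure_by_layers(slab, layer_cutoffs=[2])
    print([len(p) for p in partitions])  # expected: [8, 8]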
87 changes: 87 additions & 0 deletions workflows/basic_train_val_test.py
@@ -0,0 +1,87 @@
"""Basic workflow to train, validate, and test a model."""

import torch

from ..src.constants import REPO_PATH
from ..src.data import AtomsDataset
from ..src.samplers import RandomSampler
from ..src.utils import create_dataloaders
from ..src.train import Model

# Set seeds
seed = 0
torch.manual_seed(seed)

# Create dataset
dataset_path = REPO_PATH / "data" / "S_calcs"
prop_csv_path = dataset_path / "name_prop.csv"
dataset = AtomsDataset(root=dataset_path, prop_csv=prop_csv_path)

# Process dataset
# dataset.process_data(layer_cutoffs=[3, 6],
# node_features=[
# ["atomic_number", "dband_center", "coordination"],
# ["atomic_number", "reactivity", "coordination"],
# ["atomic_number", "reactivity", "coordination"],
# ],
# edge_features=[
# ["bulk_bond_distance"],
# ["surface_bond_distance"],
# ["adsorbate_bond_distance"],
# ])

# Create sampler
sample_config = {"train": 0.6, "val": 0.2, "test": 0.2}
sampler = RandomSampler(seed, dataset.len())
sample_idx = sampler.create_samplers(sample_config)

# Create dataloaders
dataloader_dict = create_dataloaders(dataset, sample_idx, batch_size=32)

# Create model
global_config = {
"gpu": False,
"loss_function": "mse",
"metric_function": "mae",
"learning_rate": 0.1,
"optimizer": "adam",
"lr_milestones": [3, 10]
}
partition_configs = [
{
"n_conv": 3,
"n_hidden": 3,
"hidden_size": 30,
"conv_size": 40,
"dropout": 0.1,
"num_node_features": dataset[0][0].num_node_features,
"num_edge_features": dataset[0][0].num_edge_features,
"conv_type": "CGConv",
},
{
"n_conv": 3,
"n_hidden": 3,
"hidden_size": 30,
"conv_size": 40,
"dropout": 0.1,
"num_node_features": dataset[0][1].num_node_features,
"num_edge_features": dataset[0][1].num_edge_features,
"conv_type": "CGConv",
},
{
"n_conv": 3,
"n_hidden": 3,
"hidden_size": 30,
"conv_size": 40,
"dropout": 0.1,
"num_node_features": dataset[0][2].num_node_features,
"num_edge_features": dataset[0][2].num_edge_features,
"conv_type": "CGConv",
},
]

model_path = REPO_PATH / "trained_models" / "S_binary_calcs"
model = Model(global_config, partition_configs, model_path)
model.init_standardizer([dataset[i][0].y for i in sample_idx["train"]])
results_dict = model.train(100, dataloader_dict, verbose=True)
print(results_dict)
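Because the workflow pulls its dependencies in with package-relative imports (from ..src.constants import ...), it has to be launched as a module inside a package rather than as a plain script. One hedged way to invoke it, where <repo_package> is a hypothetical placeholder for however the repository root is exposed as a package:

    # run from the directory that contains the repository folder (hypothetical layout)
    python -m <repo_package>.workflows.basic_train_val_test

If the repository is not laid out as a package, an alternative is to drop the leading dots from the imports (e.g. from src.data import AtomsDataset) and put the repository root on PYTHONPATH.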
