diff --git a/src/data.py b/src/data.py
index 01e92a1..aa1a23d 100644
--- a/src/data.py
+++ b/src/data.py
@@ -352,6 +352,7 @@ def load_datapoints(atoms, process_dict):
     # ])
     print(dataset[0][-1].x)
     print(dataset.df_name_idx.head())
+    print(dataset[0][-1].name)

     # Create datapoint
     atoms = read(data_root_path / "Pt_3_Rh_9_-7-7-S.cif")
diff --git a/src/models.py b/src/models.py
index 8ff3fd6..6f2d727 100644
--- a/src/models.py
+++ b/src/models.py
@@ -110,7 +110,7 @@ def init_conv_layers(self):
                 gnn.CGConv(
                     channels=self.conv_size[i],
                     dim=self.num_edge_features[i],
-                    batch_norm=True,
+                    batch_norm=False,
                 ),
                 nn.LeakyReLU(inplace=True),
             ]
diff --git a/src/samplers.py b/src/samplers.py
index d6d79ef..ffa364b 100644
--- a/src/samplers.py
+++ b/src/samplers.py
@@ -67,9 +67,17 @@ def create_samplers(self, sample_config):
         randomizer.shuffle(idx_array)

         # Get indices
-        train_size = int(np.ceil(sample_config["train"] * self.dataset_size))
+        # Fractions (< 1) are scaled by the dataset size; values >= 1 are
+        # treated as absolute sample counts, so the sizes are always defined.
+        if sample_config["train"] < 1.0:
+            train_size = int(np.ceil(sample_config["train"] * self.dataset_size))
+        else:
+            train_size = int(sample_config["train"])
         train_idx = idx_array[:train_size]
-        val_size = int(np.ceil(sample_config["val"] * self.dataset_size))
+        if sample_config["val"] < 1.0:
+            val_size = int(np.floor(sample_config["val"] * self.dataset_size))
+        else:
+            val_size = int(sample_config["val"])
         val_idx = idx_array[train_size : train_size + val_size]
         test_idx = idx_array[train_size + val_size :]

diff --git a/src/utils.py b/src/utils.py
index 2e73f2f..5b2c6b2 100644
--- a/src/utils.py
+++ b/src/utils.py
@@ -5,6 +5,9 @@
 import numpy as np
 import torch

+from torch.utils.data import SubsetRandomSampler
+from torch_geometric.loader import DataLoader
+
 from featurizers import (
     OneHotEncoder,
     list_of_edge_featurizers,
@@ -133,6 +136,40 @@ def featurize_atoms(
         "edge_indices": edge_indices,
     }


+def create_dataloaders(proc_data, sample_idx, batch_size, num_proc=0):
+    """Create training, validation, and/or test dataloaders.
+
+    Parameters
+    ----------
+    proc_data: AtomsDataset or AtomsDatapoints
+        Processed dataset object.
+    sample_idx: dict
+        A dictionary with "train", "val", and "test" index arrays returned by
+        a Sampler object.
+    batch_size: int
+        Batch size.
+    num_proc: int (default = 0)
+        Number of workers used for data loading. Defaults to serial loading.
+
+    Returns
+    -------
+    dataloader_dict: dict
+        Dictionary of "train", "val", and "test" dataloaders.
+    """
+    # Create dataloader dict; splits with no indices keep the empty list
+    dataloader_dict = {"train": [], "val": [], "test": []}
+
+    for key in dataloader_dict.keys():
+        if sample_idx[key].shape[0] > 0:
+            sampler = SubsetRandomSampler(sample_idx[key])
+            dataloader_dict[key] = DataLoader(dataset=proc_data,
+                                              batch_size=batch_size,
+                                              sampler=sampler,
+                                              num_workers=num_proc)
+
+    return dataloader_dict
+
+
 if __name__ == "__main__":
     from ase.io import read
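
For reviewers, a minimal usage sketch of the new create_dataloaders helper. The toy dataset, the split sizes, and the import path (assuming src/ is on PYTHONPATH) are illustrative assumptions, not part of this change:

import numpy as np
import torch
from torch_geometric.data import Data

from utils import create_dataloaders  # hypothetical import; assumes src/ on PYTHONPATH

# Toy stand-in for the processed dataset: ten single-edge graphs
dataset = [
    Data(x=torch.randn(4, 3), edge_index=torch.tensor([[0, 1], [1, 0]]))
    for _ in range(10)
]

# Index dict in the shape create_dataloaders expects: shuffled indices
# split into "train"/"val"/"test" arrays (here a hand-rolled 6/2/2 split)
rng = np.random.default_rng(seed=0)
idx = rng.permutation(len(dataset))
sample_idx = {"train": idx[:6], "val": idx[6:8], "test": idx[8:]}

loaders = create_dataloaders(dataset, sample_idx, batch_size=2)
for batch in loaders["train"]:
    print(batch.num_graphs)  # prints 2 three times (6 train graphs, batch_size=2)

Note that a split whose index array is empty keeps its empty-list placeholder instead of a DataLoader, so callers should check before iterating.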