# Source code for ccsd.data.preprocess_for_nspdk

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""preprocess_for_nspdk.py: preprocess the test molecules for NSPDK.

Adapted from Jo, J. et al. (2022)
"""

import argparse
import os
import sys
from time import perf_counter

sys.path.insert(0, os.getcwd())

import json
import pickle

import pandas as pd

from ccsd.src.parsers.parser_preprocess import ParserPreprocess
from ccsd.src.utils.mol_utils import mols_to_nx, smiles_to_mols


def preprocess_nspdk(
    args: argparse.Namespace, print_elapsed_time: bool = True
) -> None:
    """Preprocess the test molecules for NSPDK.

    Reads the held-out indices from ``<folder>/data/valid_idx_<dataset>.json``,
    selects the matching SMILES strings from ``<folder>/data/<dataset>.csv``,
    converts them into graphs and pickles the resulting list to
    ``<folder>/data/<dataset>_test_nx.pkl``. ``<dataset>`` is lower-cased in
    every file name.

    Args:
        args (argparse.Namespace): arguments. Only ``args.dataset`` and
            ``args.folder`` are read.
        print_elapsed_time (bool, True): if True, print the elapsed time to
            preprocess the test molecules. Defaults to True.

    Raises:
        ValueError: raise an error if the dataset is not supported.
            Molecule dataset supported: QM9, ZINC250k
    """
    dataset = args.dataset
    folder = args.folder
    start_time = perf_counter()

    # Name of the CSV column holding the SMILES, per supported dataset
    smiles_columns = {"QM9": "SMILES1", "ZINC250k": "smiles"}

    # Validate the dataset *before* touching the file system, so that an
    # unsupported name raises the documented ValueError rather than a
    # FileNotFoundError on the missing index file
    if dataset not in smiles_columns:
        raise ValueError(f"[ERROR] Unexpected value. Dataset {dataset} not supported.")
    col = smiles_columns[dataset]

    data_dir = os.path.join(folder, "data")
    name = dataset.lower()

    # Load the test indices (stored in the "valid_idx_*" file)
    with open(
        os.path.join(data_dir, f"valid_idx_{name}.json"), encoding="utf-8"
    ) as f:
        test_idx = json.load(f)

    # Special case for QM9: the indices are nested under "valid_idxs" and
    # are cast to int before being used as positions
    if dataset == "QM9":
        test_idx = [int(i) for i in test_idx["valid_idxs"]]

    # Load the molecules
    smiles = pd.read_csv(os.path.join(data_dir, f"{name}.csv"))[col]

    # Get the test molecules (one positional selection, order preserved)
    test_smiles = smiles.iloc[test_idx].tolist()

    # Convert the test molecules into graphs
    nx_graphs = mols_to_nx(smiles_to_mols(test_smiles))
    print(f"Converted the test molecules into {len(nx_graphs)} graphs")

    # Save the graphs
    with open(os.path.join(data_dir, f"{name}_test_nx.pkl"), "wb") as f:
        pickle.dump(nx_graphs, f)

    # Print the elapsed time
    if print_elapsed_time:
        print(f"Total {perf_counter() - start_time:.2f} sec elapsed")
if __name__ == "__main__":
    # Script entry point: read the command-line arguments through the
    # project parser, then run the preprocessing on them
    cli_args = ParserPreprocess().parse()
    preprocess_nspdk(cli_args)