Source code for pytoda.preprocessing.smi

"""Processing utilities for .smi files."""
import os
import sys
from typing import List

import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw

from ..files import read_smi
from ..smiles.transforms import Canonicalization


def filter_invalid_smi(
    input_filepath: str, output_filepath: str, chunk_size: int = 100000
):
    """
    Copy the parseable SMILES of a .smi file to another file, chunk by chunk.

    The input is consumed in chunks of ``chunk_size`` rows. For every chunk,
    the rows whose SMILES yield a truthy result from ``Chem.MolFromSmiles``
    are written as tab-separated ``SMILES<TAB>index`` lines, without header.
    The output file is opened in append mode, so content already present at
    ``output_filepath`` is kept and new lines are added after it.

    Args:
        input_filepath (str): path to the .smi file to process.
        output_filepath (str): path where to store the filtered .smi file.
        chunk_size (int): size of the SMILES chunk. Defaults to 100000.
    """
    for chunk in read_smi(input_filepath, chunk_size=chunk_size):
        valid_rows = []
        for name, record in chunk.iterrows():
            smiles = record['SMILES']
            # Keep the row only if RDKit managed to build a molecule from it.
            if Chem.MolFromSmiles(smiles):
                valid_rows.append([smiles, name])

        # One append per chunk keeps memory bounded by the chunk size.
        filtered = pd.DataFrame(valid_rows)
        filtered.to_csv(
            output_filepath, sep='\t', mode='a', header=None, index=None
        )


def find_undesired_smiles_files(
    undesired_filepath: str,
    data_filepath: str,
    save_matches: bool = False,
    file=sys.stdout,
    **smi_kwargs,
):
    """
    Report which SMILES of a data file occur in a file of undesired SMILES.

    The undesired SMILES are canonicalized once; every SMILES in the data
    file is then canonicalized and looked up among them. Each hit is printed
    to ``file``. If nothing matches, a notice is printed and the function
    returns without saving anything.

    Args:
        undesired_filepath (str): Path to .smi file with a header at first row.
        data_filepath (str): Path to .csv file with a column 'SMILES'.
        save_matches (bool, optional): Whether found matches should be plotted and
            saved. The grid image (5 molecules per row, at most 50 molecules)
            is written to 'undesired_molecules.pdf' in the directory of
            ``data_filepath``. Defaults to False.
        file (optional): Stream the report messages are printed to. Defaults
            to ``sys.stdout``.
        **smi_kwargs: Additional keyword arguments forwarded to ``read_smi``
            when loading ``undesired_filepath``.

    Returns:
        None
    """
    canonicalizer = Canonicalization()

    # Read undesired data. A frozenset gives constant-time membership tests
    # instead of scanning a list once per data row; with canonical=True,
    # find_undesired_smiles only performs an `in` check on this collection.
    undesired = read_smi(undesired_filepath, **smi_kwargs)
    undesired_smiles = frozenset(
        undesired['SMILES'].apply(canonicalizer).tolist()
    )

    # Read data filepath
    df = pd.read_csv(data_filepath)

    matches, idxs = [], []
    for idx, row in df.iterrows():
        smiles = row['SMILES']
        if find_undesired_smiles(smiles, undesired_smiles, canonical=True):
            print(f'Found {smiles} in list of undesired SMILES.', file=file)
            matches.append(smiles)
            idxs.append(idx)

    if not matches:
        print('No matches found, shutting down.', file=file)
        return

    if save_matches:
        grid = Draw.MolsToGridImage(
            [Chem.MolFromSmiles(s) for s in matches],
            molsPerRow=5,
            maxMols=50,
            legends=[f'Idx: {i}, {s}' for i, s in zip(idxs, matches)],
        )
        grid.save(
            os.path.join(os.path.dirname(data_filepath), 'undesired_molecules.pdf')
        )


def find_undesired_smiles(
    smiles: str, undesired_smiles: List, canonical: bool = False
) -> bool:
    """
    Check whether a SMILES occurs in a collection of SMILES after
    canonicalization.

    The seed SMILES is always canonicalized before the lookup. The comparison
    collection is canonicalized as well unless the caller states, via
    ``canonical``, that this has already been done.

    Args:
        smiles (str): Seed SMILES.
        undesired_smiles (List): List of SMILES for comparison
        canonical (bool, optional): Whether comparison list was canonicalized.
            Defaults to False.

    Returns:
        bool: Whether SMILES was present in undesired_smiles.
    """
    to_canonical = Canonicalization()

    if canonical:
        # Caller guarantees the entries are already in canonical form.
        reference = undesired_smiles
    else:
        reference = [to_canonical(entry) for entry in undesired_smiles]

    return to_canonical(smiles) in reference