# Source code for pytoda.preprocessing.crawlers

import logging
import urllib.request as urllib_request
from itertools import filterfalse
from typing import Iterable, List, Tuple, Union
from urllib.error import HTTPError, URLError

import pubchempy as pcp
from pubchempy import BadRequestError, PubChemHTTPError
from unidecode import unidecode

from pytoda.smiles.transforms import Canonicalization

# Module-level logger used by the crawler helpers below.
logger = logging.getLogger(__name__)

# Pieces of the PubChem request URL assembled in get_smiles_from_pubchem as
# <PUBCHEM_START>/<query_type>/<drug>/<PUBCHEM_MID>/<option>/<PUBCHEM_END>.
PUBCHEM_START = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound'
PUBCHEM_MID = 'property'
PUBCHEM_END = 'TXT'


def get_smiles_from_pubchem(
    drug: Union[str, int],
    query_type: str = 'name',
    use_isomeric: bool = True,
    kekulize: bool = False,
    sanitize: bool = True,
) -> str:
    """
    Uses the PubChem database to retrieve the SMILES of a drug name given as string
    (default) or a PubChem ID.

    Args:
        drug (Union[str, int]): a drug name, or a PubChem ID given as a string
            or an integer.
        query_type (str): Either 'name' or 'cid'.
            Identifies whether the argument provided as drug is a name (e.g 'Tacrine') or
            a PubChem ID (1935). Defaults to name.
        use_isomeric (bool, optional): If available, returns the isomeric
            SMILES, not the canonical one. Defaults to True.
        kekulize (bool, optional): whether kekulization is used. PubChem uses
            kekulization per default, so setting this to 'True' will not
            perform any operation on the retrieved SMILES.
            NOTE: Setting it to 'False' will convert aromatic atoms to lower-
            case characters and *induces a RDKit dependency*.
            Defaults to False.
        sanitize (bool, optional): Whether the SMILES is sanitized when it is
            parsed with RDKit (only relevant if kekulize is False).
            Defaults to True.

    Returns:
        str: The SMILES string of the drug, or an empty string if none of
            the requested SMILES properties could be retrieved.

    Raises:
        ValueError: If both kekulize and sanitize are False.
        TypeError: If drug is neither a str nor an int.
    """

    if not kekulize and not sanitize:
        raise ValueError(
            'If Kekulize is False, molecule has to be sanitize '
            '(sanitize cannot be False).'
        )

    # bool is a subclass of int but is not a meaningful drug identifier, so it
    # is rejected explicitly.
    if isinstance(drug, bool) or not isinstance(drug, (str, int)):
        raise TypeError(
            f'Please insert drug of type str or int, given was {type(drug)}({drug}).'
        )
    if not kekulize:
        # Imported lazily so that RDKit is only required when it is needed.
        from rdkit import Chem

    # Properties are tried in order; CanonicalSMILES is always the fallback.
    options = ['CanonicalSMILES']
    if use_isomeric:
        options = ['IsomericSMILES'] + options

    # Parse name: transliterate to ASCII and escape spaces for the URL.
    if isinstance(drug, str):
        drug = unidecode(drug).strip().replace(' ', '%20')

    # Search in PubChem for the compound
    for option in options:
        path = (
            f'{PUBCHEM_START}/{query_type}/{drug}/'
            f'{PUBCHEM_MID}/{option}/{PUBCHEM_END}'
        )
        try:
            # The context manager closes the HTTP response deterministically.
            with urllib_request.urlopen(path) as response:
                smiles = response.read().decode('UTF-8').split()[0]
        except HTTPError:
            # This property could not be fetched; try the next one (if any).
            continue

        if not kekulize:
            smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles, sanitize=sanitize))
        return smiles

    # Every option (including the CanonicalSMILES fallback) failed.
    logger.warning('Did not find any result for drug: %s', drug)
    return ''


def remove_pubchem_smiles(smiles_list: Iterable[str]) -> List[str]:
    """
    Function for removing PubChem molecules from an iterable of smiles.

    Args:
        smiles_list (Iterable[str]): many SMILES strings.

    Returns:
        List[str]: Filtered list of SMILES, all SMILES pointing to PubChem
            molecules are removed.

    Raises:
        TypeError: If smiles_list is not an Iterable.
    """

    if not isinstance(smiles_list, Iterable):
        raise TypeError(f'Please pass Iterable, not {type(smiles_list)}')

    canonicalizer = Canonicalization(sanitize=False)
    # A SMILES is kept only if neither the string as given nor its
    # canonicalized form (sanity check) is reported by is_pubchem. The
    # canonical form is only queried when the raw string was not matched.
    return [
        smiles
        for smiles in smiles_list
        if not is_pubchem(smiles) and not is_pubchem(canonicalizer(smiles))
    ]


def query_pubchem(smiles: str) -> Tuple[bool, int]:
    """
    Queries pubchem for a given SMILES.

    Args:
        smiles (str): A SMILES string.

    Returns:
        Tuple[bool, int]:
            bool: Whether or not SMILES is known to PubChem.

            int: PubChem ID of matched SMILES, -1 if SMILES was not found.
                Instead, -2 means an error in the PubChem query.

    Raises:
        TypeError: If smiles is not a str.
    """

    if not isinstance(smiles, str):
        raise TypeError(f'Please pass str, not {type(smiles)}')

    try:
        matches = pcp.get_compounds(smiles, 'smiles')
    except BadRequestError:
        logger.warning('Skipping SMILES. BadRequestError with: %s', smiles)
    except HTTPError:
        logger.warning('Skipping SMILES. HTTPError with: %s', smiles)
    except TimeoutError:
        logger.warning('Skipping SMILES. TimeoutError with: %s', smiles)
    except ConnectionResetError:
        logger.warning('Skipping SMILES. ConnectionResetError with: %s', smiles)
    except PubChemHTTPError:
        logger.warning('Skipping SMILES, server busy. with: %s', smiles)
    except URLError:
        logger.error('Skipping SMILES, Network unreachable %s', smiles)
    else:
        # An empty result list is reported as "not found" instead of letting
        # the index access raise an uncaught IndexError.
        if not matches:
            return (False, -1)
        cid = matches[0].cid
        return (False, -1) if cid is None else (True, cid)

    # Reached only when one of the handled query errors occurred.
    return (False, -2)


def is_pubchem(smiles: str) -> bool:
    """Whether a given SMILES is known to PubChem.

    Args:
        smiles (str): A SMILES string.

    Returns:
        bool: Whether or not SMILES is known to PubChem, i.e. the first
            element of the tuple returned by query_pubchem.
    """
    known, _ = query_pubchem(smiles)
    return known