
"""SMILES processing utilities."""
import codecs
import logging
import re
import warnings

from importlib_resources import as_file, files
from selfies import split_selfies as split_selfies_
from SmilesPE.pretokenizer import kmer_tokenizer
from SmilesPE.tokenizer import SPE_Tokenizer

from ..types import Dict, Tokenizer, Tokens

logger = logging.getLogger(__name__)

# Regex for character-level SMILES tokenization (bracket atoms,
# two-character elements and single-character tokens).
SMILES_TOKENIZER = re.compile(
    r'(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|'
    r'-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])'
)
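
# Illustrative sketch (not from the original source): re.split with the
# capture group returns the matched tokens interleaved with (empty)
# separators, so filtering empty strings yields the token sequence.
# Multi-character tokens like 'Br', 'Cl' and bracket atoms stay intact:
# >>> [t for t in SMILES_TOKENIZER.split('Brc1ccccc1') if t]
# ['Br', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1']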

# Load the pretrained SMILES Pair Encoding vocabulary shipped with the
# package metadata.
with as_file(files('pytoda.smiles.metadata').joinpath('spe_chembl.txt')) as filepath:
    SPE_TOKENIZER = SPE_Tokenizer(codecs.open(str(filepath)))


def tokenize_smiles(smiles: str, regexp=SMILES_TOKENIZER, *args, **kwargs) -> Tokens:
    """Tokenize a character-level SMILES string.

    Args:
        smiles (str): a SMILES representation.
        regexp (re.Pattern): optionally pass a regexp for the tokenization.
            Defaults to SMILES_TOKENIZER.
        args (): ignored, for backwards compatibility.
        kwargs (): ignored, for backwards compatibility.

    Returns:
        Tokens: the tokenized SMILES.
    """
    return [token for token in regexp.split(smiles) if token]
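
# Usage sketch for tokenize_smiles (illustrative values, default regex):
# >>> tokenize_smiles('CC(=O)O')
# ['C', 'C', '(', '=', 'O', ')', 'O']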

def kmer_smiles_tokenizer(
    smiles: str, k: int = 2, stride: int = 1, *args, **kwargs
) -> Tokens:
    """K-Mer SMILES tokenization following SMILES PE (Li et al. 2020):

    Li, Xinhao, and Denis Fourches. "SMILES Pair Encoding: A Data-Driven
    Substructure Tokenization Algorithm for Deep Learning." (2020).

    Args:
        smiles (str): SMILES string to be tokenized.
        k (int): Positive integer denoting the tuple/k-gram lengths. Defaults
            to 2 (bigrams).
        stride (int, optional): Stride used for k-mer generation. Higher
            values result in fewer tokens. Defaults to 1 (densely
            overlapping).
        args (): Optional arguments for `kmer_tokenizer`.
        kwargs (): Optional keyword arguments for `kmer_tokenizer`.

    Returns:
        Tokens: Tokenized SMILES sequence (list of str).
    """
    return kmer_tokenizer(smiles, ngram=k, stride=stride, *args, **kwargs)
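
# Usage sketch (assuming SmilesPE's kmer_tokenizer slides a window of k
# atom-level tokens and joins each window; values are illustrative):
# >>> kmer_smiles_tokenizer('CCO', k=2)
# ['CC', 'CO']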

def spe_smiles_tokenizer(smiles: str) -> Tokens:
    """Pretrained SMILES Pair Encoding tokenizer following (Li et al. 2020).

    Splits a SMILES string into tokens of substructures of varying lengths,
    depending on the occurrence of tokens in the ChEMBL dataset.

    Li, Xinhao, and Denis Fourches. "SMILES Pair Encoding: A Data-Driven
    Substructure Tokenization Algorithm for Deep Learning." (2020).

    Args:
        smiles (str): SMILES string to be tokenized.

    Returns:
        Tokens: SMILES tokenized into substructures (list of str).
    """
    return SPE_TOKENIZER.tokenize(smiles).split(' ')
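
# Usage sketch. The splits depend on the pretrained spe_chembl.txt
# vocabulary, so the tokens below are hypothetical:
# >>> spe_smiles_tokenizer('CC(=O)Oc1ccccc1C(=O)O')
# ['CC(=O)O', 'c1ccccc1', 'C(=O)O']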

def tokenize_selfies(selfies: str) -> Tokens:
    """Tokenize SELFIES.

    NOTE: Code adapted from selfies package (`def selfies_to_hot`):
    https://github.com/aspuru-guzik-group/selfies

    Args:
        selfies (str): a SELFIES representation (character-level).

    Returns:
        Tokens: the tokenized SELFIES.
    """
    warnings.warn(
        "tokenize_selfies will be deprecated in favor of `split_selfies`",
        DeprecationWarning,
    )
    try:
        selfies = selfies.replace('.', '[.]')  # to allow parsing unbound atoms
        selfies_char_list_pre = selfies[1:-1].split('][')
        return [
            '[' + selfies_element + ']' for selfies_element in selfies_char_list_pre
        ]
    except Exception:
        logger.warning(f'Error in tokenizing {selfies}. Returning empty list.')
        return ['']
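
# Usage sketch (splits on the '][' boundaries; also emits a
# DeprecationWarning):
# >>> tokenize_selfies('[C][C][O]')
# ['[C]', '[C]', '[O]']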

def split_selfies(selfies: str) -> Tokens:
    """Tokenize SELFIES, wrapping generator as list.

    Args:
        selfies (str): a SELFIES representation (character-level).

    Returns:
        Tokens: the tokenized SELFIES.
    """
    return list(split_selfies_(selfies))
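
# Usage sketch, delegating to the selfies package's split_selfies:
# >>> split_selfies('[C][O][C]')
# ['[C]', '[O]', '[C]']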

TOKENIZER_FUNCTIONS: Dict[str, Tokenizer] = {
    'smiles': tokenize_smiles,
    'kmer_smiles': kmer_smiles_tokenizer,
    'spe_smiles': spe_smiles_tokenizer,
    'selfies': split_selfies,
}
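
# Usage sketch: select a tokenizer by language key (keys as defined above):
# >>> TOKENIZER_FUNCTIONS['smiles']('CCO')
# ['C', 'C', 'O']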