# Source code for pytoda.smiles.tests.test_transforms

"""Testing SMILES transforms."""
import unittest

import numpy as np
import torch

from pytoda.smiles.smiles_language import SMILESTokenizer
from pytoda.smiles.transforms import (
    AugmentTensor,
    Kekulize,
    NotKekulize,
    RemoveIsomery,
    Selfies,
)


class TestTransforms(unittest.TestCase):
    """Testing transforms."""

    def _assert_flag_grid(self, transform_cls, grid, sanitize: bool) -> None:
        """Check a transform class against expectations per flag combination.

        Args:
            transform_cls: callable building a transform from the keyword
                arguments ``all_bonds_explicit``, ``all_hs_explicit`` and
                ``sanitize``.
            grid: iterable of ``(all_bonds_explicit, all_hs_explicit, cases)``
                entries, where ``cases`` is an iterable of
                ``(smiles, ground_truth)`` pairs.
            sanitize: value forwarded as the ``sanitize`` keyword argument.
        """
        for all_bonds_explicit, all_hs_explicit, cases in grid:
            for smiles, ground_truth in cases:
                # subTest labels a failure with the exact configuration; on
                # runners that support subtests the remaining cases still run.
                with self.subTest(
                    smiles=smiles,
                    all_bonds_explicit=all_bonds_explicit,
                    all_hs_explicit=all_hs_explicit,
                    sanitize=sanitize,
                ):
                    # A fresh transform is built for every case, as before.
                    transform = transform_cls(
                        all_bonds_explicit=all_bonds_explicit,
                        all_hs_explicit=all_hs_explicit,
                        sanitize=sanitize,
                    )
                    self.assertEqual(transform(smiles), ground_truth)

    def test_kekulize(self) -> None:
        """Test Kekulize with ``sanitize`` switched on and off."""
        for sanitize in (True, False):
            self._test_kekulize(sanitize)

    def _test_kekulize(self, sanitize: bool) -> None:
        """Test Kekulize."""
        # Entries: (all_bonds_explicit, all_hs_explicit, expectations).
        grid = (
            (
                False,
                False,
                (
                    ('c1cnoc1', 'C1=CON=C1'),
                    ('[O-][n+]1ccccc1S', '[O-][N+]1=CC=CC=C1S'),
                    ('c1snnc1-c1ccccn1', 'C1=C(C2=CC=CC=N2)N=NS1'),
                ),
            ),
            (
                True,
                False,
                (
                    ('c1cnoc1', 'C1=C-O-N=C-1'),
                    ('[O-][n+]1ccccc1S', '[O-]-[N+]1=C-C=C-C=C-1-S'),
                    ('c1snnc1-c1ccccn1', 'C1=C(-C2=C-C=C-C=N-2)-N=N-S-1'),
                ),
            ),
            (
                False,
                True,
                (
                    ('c1cnoc1', '[CH]1=[CH][O][N]=[CH]1'),
                    (
                        '[O-][n+]1ccccc1S',
                        '[O-][N+]1=[CH][CH]=[CH][CH]=[C]1[SH]',
                    ),
                    (
                        'c1snnc1-c1ccccn1',
                        '[CH]1=[C]([C]2=[CH][CH]=[CH][CH]=[N]2)[N]=[N][S]1',
                    ),
                ),
            ),
            (
                True,
                True,
                (
                    ('c1cnoc1', '[CH]1=[CH]-[O]-[N]=[CH]-1'),
                    (
                        '[O-][n+]1ccccc1S',
                        '[O-]-[N+]1=[CH]-[CH]=[CH]-[CH]=[C]-1-[SH]',
                    ),
                    (
                        'c1snnc1-c1ccccn1',
                        '[CH]1=[C](-[C]2=[CH]-[CH]=[CH]-[CH]=[N]-2)-[N]=[N]-[S]-1',
                    ),
                ),
            ),
        )
        self._assert_flag_grid(Kekulize, grid, sanitize)

    def test_non_kekulize(self) -> None:
        """Test NotKekulize with ``sanitize`` switched on and off."""
        for sanitize in (True, False):
            self._test_non_kekulize(sanitize)

    def _test_non_kekulize(self, sanitize: bool) -> None:
        """Test NotKekulize."""
        # Entries: (all_bonds_explicit, all_hs_explicit, expectations).
        grid = (
            (
                False,
                False,
                (
                    ('c1cnoc1', 'c1cnoc1'),
                    ('[O-][n+]1ccccc1S', '[O-][n+]1ccccc1S'),
                    ('c1snnc1-c1ccccn1', 'c1snnc1-c1ccccn1'),
                ),
            ),
            (
                True,
                False,
                (
                    ('c1cnoc1', 'c1:c:n:o:c:1'),
                    ('[O-][n+]1ccccc1S', '[O-]-[n+]1:c:c:c:c:c:1-S'),
                    ('c1snnc1-c1ccccn1', 'c1:s:n:n:c:1-c1:c:c:c:c:n:1'),
                ),
            ),
            (
                False,
                True,
                (
                    ('c1cnoc1', '[cH]1[cH][n][o][cH]1'),
                    ('[O-][n+]1ccccc1S', '[O-][n+]1[cH][cH][cH][cH][c]1[SH]'),
                    (
                        'c1snnc1-c1ccccn1',
                        '[cH]1[s][n][n][c]1-[c]1[cH][cH][cH][cH][n]1',
                    ),
                ),
            ),
            (
                True,
                True,
                (
                    ('c1cnoc1', '[cH]1:[cH]:[n]:[o]:[cH]:1'),
                    (
                        '[O-][n+]1ccccc1S',
                        '[O-]-[n+]1:[cH]:[cH]:[cH]:[cH]:[c]:1-[SH]',
                    ),
                    (
                        'c1snnc1-c1ccccn1',
                        '[cH]1:[s]:[n]:[n]:[c]:1-[c]1:[cH]:[cH]:[cH]:[cH]:[n]:1',
                    ),
                ),
            ),
        )
        self._assert_flag_grid(NotKekulize, grid, sanitize)

    def test_remove_isomery(self) -> None:
        """Test RemoveIsomery."""
        smiles = 'c1ccc(/C=C/[C@H](C)O)cc1'
        # Entries: (bonddir, chirality, expected output for ``smiles``).
        cases = (
            (False, False, 'c1ccc(/C=C/[C@H](C)O)cc1'),
            (False, True, 'c1ccc(/C=C/C(C)O)cc1'),
            (True, False, 'c1ccc(C=C[C@H](C)O)cc1'),
            (True, True, 'c1ccc(C=CC(C)O)cc1'),
        )
        for bonddir, chirality, ground_truth in cases:
            with self.subTest(bonddir=bonddir, chirality=chirality):
                transform = RemoveIsomery(
                    bonddir=bonddir, chirality=chirality, sanitize=True
                )
                self.assertEqual(transform(smiles), ground_truth)

    def test_augment_tensor(self) -> None:
        """Test AugmentTensor for every sanitize/canonical combination."""
        for sanitize in (True, False):
            for canonical in (False, True):
                self._test_augment_tensor(sanitize, canonical)

    def _test_augment_tensor(self, sanitize: bool, canonical: bool) -> None:
        """Test AugmentTensor."""
        smiles = 'NCCS'
        # Outputs expected from successive augmentations after seeding numpy
        # with 0; shared by the single-sequence and the batched check below.
        expected_augmentations = ['C(S)CN', 'NCCS', 'SCCN', 'C(N)CS', 'C(CS)N']

        smiles_language = SMILESTokenizer(
            add_start_and_stop=True, padding=False, canonical=canonical
        )
        smiles_language.add_smiles(smiles)

        np.random.seed(0)
        transform = AugmentTensor(smiles_language, sanitize=sanitize)

        def _transform_without_encoding(smile):
            """Encode SMILES by skipping transforms in smiles language.

            This is needed because the new
            SMILESTokenizer.smiles_to_token_indexes also implements SMILES
            transforms. This could silently canonicalize all SMILES if
            canonical=True in the smiles language object. Hence it's
            inappropriate to generate the ground truth and replace by this
            method.
            """
            return smiles_language.transform_encoding(
                [
                    smiles_language.token_to_index[t]
                    for t in smiles_language.smiles_tokenizer(smile)
                ]
            )

        token_indexes_tensor = _transform_without_encoding(smiles)
        for augmented_smile in expected_augmentations:
            with self.subTest(
                augmented_smile=augmented_smile,
                sanitize=sanitize,
                canonical=canonical,
            ):
                ground_truth = _transform_without_encoding(augmented_smile)
                self.assertSequenceEqual(
                    transform(token_indexes_tensor).tolist(),
                    ground_truth.tolist(),
                )

        # Now test calling with a tensor of several SMILES
        # Include the padding of the sequence (right padding)
        pl = 5  # padding_length
        single_smiles_tensor = torch.unsqueeze(
            torch.nn.functional.pad(
                token_indexes_tensor,
                (0, pl),
                value=smiles_language.padding_index,
            ),
            0,
        )
        seq_len = single_smiles_tensor.shape[1]  # sequence_length
        # One batch row per expected augmentation.
        multi_smiles_tensor = torch.cat(
            [single_smiles_tensor] * len(expected_augmentations)
        )

        # Re-seed so the batched call sees the same random sequence as above.
        np.random.seed(0)
        augmented = transform(multi_smiles_tensor)
        for ind, augmented_smile in enumerate(expected_augmentations):
            with self.subTest(
                row=ind,
                augmented_smile=augmented_smile,
                sanitize=sanitize,
                canonical=canonical,
            ):
                ground_truth = _transform_without_encoding(augmented_smile)
                # Right-pad the expectation to the batched sequence length.
                ground_truth = torch.nn.functional.pad(
                    ground_truth,
                    pad=(0, seq_len - len(ground_truth)),
                    value=smiles_language.padding_index,
                )
                self.assertSequenceEqual(
                    augmented[ind].tolist(), ground_truth.tolist()
                )

    def test_selfies(self) -> None:
        """Test SELFIES."""
        selfie = Selfies()
        cases = (
            ('c1cnoc1', '[C][C][=N][O][C][=Ring1][Branch1]'),
            (
                '[O-][n+]1ccccc1S',
                '[O-1][N+1][=C][C][=C][C][=C][Ring1][=Branch1][S]',
            ),
            (
                'c1snnc1-c1ccccn1',
                '[C][S][N][=N][C][=Ring1][Branch1][C][=C][C][=C][C][=N][Ring1][=Branch1]',
            ),
        )
        for smiles, ground_truth in cases:
            with self.subTest(smiles=smiles):
                self.assertEqual(selfie(smiles), ground_truth)
# Run the test suite only when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()