Source code for pytoda.smiles.tests.test_processing

"""Testing SMILES processing."""
import unittest

import selfies as sf

from pytoda.smiles.processing import (
    kmer_smiles_tokenizer,
    spe_smiles_tokenizer,
    split_selfies,
    tokenize_selfies,
    tokenize_smiles,
)
from pytoda.smiles.transforms import Selfies


class TestProcessing(unittest.TestCase):
    """Testing processing.

    Regression tests for the SMILES / SELFIES tokenizers exposed by
    ``pytoda.smiles.processing``. Every table-driven case runs inside
    ``self.subTest`` so that one failing input neither hides the remaining
    inputs nor leaves the failure report without the offending parameters.
    """

    def test_tokenize_smiles(self) -> None:
        """Test tokenize_smiles."""
        # The expected values keep bracket atoms such as '[O-]' as one token.
        cases = [
            ('c1cnoc1', ['c', '1', 'c', 'n', 'o', 'c', '1']),
            (
                '[O-][n+]1ccccc1S',
                ['[O-]', '[n+]', '1', 'c', 'c', 'c', 'c', 'c', '1', 'S'],
            ),
            (
                'c1snnc1-c1ccccn1',
                [
                    'c',
                    '1',
                    's',
                    'n',
                    'n',
                    'c',
                    '1',
                    '-',
                    'c',
                    '1',
                    'c',
                    'c',
                    'c',
                    'c',
                    'n',
                    '1',
                ],
            ),
        ]
        for smiles, ground_truth in cases:
            with self.subTest(smiles=smiles):
                self.assertListEqual(tokenize_smiles(smiles), ground_truth)

    def test_kmer_smiles_tokenizer(self) -> None:
        """Test kmer_smiles_tokenizer."""
        smiles = ['c1ccc(/C=C/[C@H](C)O)cc', '[O-][n+]1ccccc1S', 'c1snnc1-c1ccccn1']

        # With k=1 and stride=1 the k-mer tokenizer has to reproduce the
        # plain SMILES tokenizer exactly.
        for s in smiles:
            with self.subTest(k=1, stride=1, smiles=s):
                self.assertListEqual(
                    tokenize_smiles(s), kmer_smiles_tokenizer(s, k=1, stride=1)
                )

        # One entry per (k, stride) setting; the inner lists are the expected
        # k-mers for the corresponding entry of `smiles`, in the same order.
        cases = [
            (
                2,
                2,
                [
                    ['c1', 'cc', 'c(', '/C', '=C', '/[C@H]', '(C', ')O', ')c'],
                    ['[O-][n+]', '1c', 'cc', 'cc', '1S'],
                    ['c1', 'sn', 'nc', '1-', 'c1', 'cc', 'cc', 'n1'],
                ],
            ),
            (
                2,
                1,
                [
                    [
                        'c1',
                        '1c',
                        'cc',
                        'cc',
                        'c(',
                        '(/',
                        '/C',
                        'C=',
                        '=C',
                        'C/',
                        '/[C@H]',
                        '[C@H](',
                        '(C',
                        'C)',
                        ')O',
                        'O)',
                        ')c',
                        'cc',
                    ],
                    ['[O-][n+]', '[n+]1', '1c', 'cc', 'cc', 'cc', 'cc', 'c1', '1S'],
                    [
                        'c1',
                        '1s',
                        'sn',
                        'nn',
                        'nc',
                        'c1',
                        '1-',
                        '-c',
                        'c1',
                        '1c',
                        'cc',
                        'cc',
                        'cc',
                        'cn',
                        'n1',
                    ],
                ],
            ),
            (
                4,
                4,
                [
                    ['c1cc', 'c(/C', '=C/[C@H]', '(C)O'],
                    ['[O-][n+]1c', 'cccc'],
                    ['c1sn', 'nc1-', 'c1cc', 'ccn1'],
                ],
            ),
            (
                4,
                2,
                [
                    [
                        'c1cc',
                        'ccc(',
                        'c(/C',
                        '/C=C',
                        '=C/[C@H]',
                        '/[C@H](C',
                        '(C)O',
                        ')O)c',
                    ],
                    ['[O-][n+]1c', '1ccc', 'cccc', 'cc1S'],
                    ['c1sn', 'snnc', 'nc1-', '1-c1', 'c1cc', 'cccc', 'ccn1'],
                ],
            ),
        ]
        for k, stride, expected in cases:
            for s, gt in zip(smiles, expected):
                with self.subTest(k=k, stride=stride, smiles=s):
                    self.assertListEqual(
                        kmer_smiles_tokenizer(s, k=k, stride=stride), gt
                    )

    def test_spe_smiles_tokenizer(self) -> None:
        """Test spe_smiles_tokenizer."""
        cases = [
            ('c1ccc(/C=C/[C@H](C)O)cc', ['c1ccc(', '/C=C/', '[C@H](C)', 'O)', 'cc']),
            ('[O-][n+]1ccccc1S', ['[O-]', '[n+]1', 'ccccc1', 'S']),
            ('c1snnc1-c1ccccn1', ['c1s', 'nnc1', '-', 'c1ccccn1']),
        ]
        for smiles, ground_truth in cases:
            with self.subTest(smiles=smiles):
                self.assertListEqual(spe_smiles_tokenizer(smiles), ground_truth)

    def test_selfies_split(self) -> None:
        """Test tokenization by selfies package has not changed."""
        benzene = 'c1ccccc1'
        encoded_selfies = sf.encoder(benzene)
        # '[c][c][c][c][c][c][Ring1][Branch1_1]' v0.2.4
        # '[C][=C][C][=C][C][=C][Ring1][Branch1_2]' v1.0.2 (no aromatic)
        # sf.split_selfies returns a generator, hence the list() wrapping.
        symbols_benzene = list(sf.split_selfies(encoded_selfies))
        # Before selfies 2.0.0 the last token was [Branch1_2].
        self.assertListEqual(
            symbols_benzene,
            ['[C]', '[=C]', '[C]', '[=C]', '[C]', '[=C]', '[Ring1]', '[=Branch1]'],
        )

        cases = [
            (
                'c1cnoc1',
                # Before selfies 2.0.0 the last two tokens were [Expl=Ring1]
                # and [Branch1_1].
                ['[C]', '[C]', '[=N]', '[O]', '[C]', '[=Ring1]', '[Branch1]'],
            ),
            (
                '[O-][n+]1ccccc1S',
                # Before selfies 2.0.0 these were [O-expl], [N+expl] and
                # [=Branch1_2].
                [
                    '[O-1]',
                    '[N+1]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[Ring1]',
                    '[=Branch1]',
                    '[S]',
                ],
            ),
            (
                'c1snnc1-c1ccccn1',
                # Before selfies 2.0.0 these were [Expl=Ring1], [Branch1_1]
                # and [Branch1_2].
                [
                    '[C]',
                    '[S]',
                    '[N]',
                    '[=N]',
                    '[C]',
                    '[=Ring1]',
                    '[Branch1]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=C]',
                    '[C]',
                    '[=N]',
                    '[Ring1]',
                    '[=Branch1]',
                ],
            ),
        ]
        for smiles, ground_truth in cases:
            with self.subTest(smiles=smiles):
                self.assertListEqual(
                    # pytoda's split_selfies is the list wrapping version
                    split_selfies(sf.encoder(smiles)),
                    ground_truth,
                )

    def test_tokenize_selfies_match(self) -> None:
        """Test deprecated tokenize_selfies."""
        for smiles in ['c1cnoc1', '[O-][n+]1ccccc1S', 'c1snnc1-c1ccccn1']:
            with self.subTest(smiles=smiles):
                transform = Selfies()
                selfies = transform(smiles)
                # The deprecated helper must agree with the selfies package.
                self.assertListEqual(
                    tokenize_selfies(selfies), list(sf.split_selfies(selfies))
                )

    def test_package_tokenizer(self) -> None:
        """Test package tokenizer."""
        # Imported locally, as in the original test.
        from pytoda.smiles import vocab

        # Pins the size of the vocabulary shipped with the package.
        self.assertEqual(len(vocab), 575)
# Run the test suite only when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()