# Source code for pytoda.datasets.tests.test_polymer_dataset

"""Testing AnnotatedDataset dataset with eager backend."""
import os
import unittest

import numpy as np

from pytoda.datasets import PolymerTokenizerDataset
from pytoda.tests.utils import TestFileContent


class TestPolymerTokenizerDataset(unittest.TestCase):
    """Test PolymerTokenizerDataset on small monomer/catalyst fixtures.

    Every test writes three temporary files through ``TestFileContent``: one
    SMILES file per entity (``<SMILES><tab><identifier>`` on each line) and a
    comma-separated annotation file whose ``monomer`` and ``catalyst`` columns
    hold identifiers next to the label columns.
    """

    # Monomer SMILES file content shared by all tests.
    _MONOMERS = os.linesep.join(
        ['CCO	CHEMBL545', 'C	CHEMBL17564', 'CO	CHEMBL14688', 'NCCS	CHEMBL602']
    )
    # Catalyst SMILES file content containing one long molecule.
    _LONG_CATALYSTS = os.linesep.join(
        [
            'N#CCCC1CCCC1=NNc1ccc([N+](=O)[O-])cc1	CHEMBL543',
            'CC	CHEMBL17',
            'NCCSCCCCC	CHEMBL6402',
        ]
    )
    # Catalyst SMILES file content used by the ``__getitem__`` tests.
    _SHORT_CATALYSTS = os.linesep.join(
        ['c1ccc([N+])cc1	CHEMBL543', 'CC	CHEMBL17', 'NCCSCCCCC	CHEMBL6402']
    )
    # Annotation rows whose identifiers all exist in the SMILES fixtures.
    _ANNOTATIONS = os.linesep.join(
        [
            'label_0,label_1,monomer,catalyst',
            '2.3,3.4,CHEMBL545,CHEMBL17',
            '4.5,5.6,CHEMBL17564,CHEMBL6402',
            '6.7,7.8,CHEMBL602,CHEMBL6402',
        ]
    )

    @staticmethod
    def _token_indexes(dataset, *tokens):
        """Return the index of each token in the dataset vocabulary.

        Args:
            dataset: object exposing ``smiles_language.token_to_index``.
            *tokens: token strings to look up.

        Returns:
            list: one index per token, in the order the tokens were given.

        Raises:
            KeyError: if a token is missing from ``token_to_index``.
        """
        token_to_index = dataset.smiles_language.token_to_index
        return [token_to_index[token] for token in tokens]

    @staticmethod
    def _flat(tensor):
        """Return ``tensor`` flattened into a plain Python list."""
        return tensor.numpy().flatten().tolist()

    def test___len__(self) -> None:
        """Test __len__ when one annotation row uses unknown identifiers."""
        # CHEMBL54556 and CHEMBL5434 appear in neither SMILES fixture; the
        # expected length below is 3 although the file holds 4 data rows.
        annotations = os.linesep.join(
            [self._ANNOTATIONS, '6.7,7.8,CHEMBL54556,CHEMBL5434']
        )

        with TestFileContent(self._MONOMERS) as monomer_file, \
                TestFileContent(self._LONG_CATALYSTS) as catalyst_file, \
                TestFileContent(annotations) as annotation_file:
            polymer_dataset = PolymerTokenizerDataset(
                monomer_file.filename,
                catalyst_file.filename,
                annotations_filepath=annotation_file.filename,
                entity_names=['monomer', 'cATalysT'],
            )

            self.assertEqual(len(polymer_dataset), 3)

    def test_smiles_params(self) -> None:
        """Test SMILES parameters given once or as one value per entity.

        ``all_bonds_explicit`` is passed as a single value, while
        ``all_hs_explicit``, ``sanitize`` and ``padding_length`` are passed as
        two-element lists. The expected tokens show explicit bonds for both
        entities but explicit hydrogens for the monomer only.
        """
        with TestFileContent(self._MONOMERS) as monomer_file, \
                TestFileContent(self._LONG_CATALYSTS) as catalyst_file, \
                TestFileContent(self._ANNOTATIONS) as annotation_file:
            polymer_dataset = PolymerTokenizerDataset(
                monomer_file.filename,
                catalyst_file.filename,
                annotations_filepath=annotation_file.filename,
                entity_names=['monomer', 'cATalysT'],
                all_bonds_explicit=True,
                all_hs_explicit=[True, False],
                sanitize=[True, False],
                padding_length=[9, None],
            )

            smiles_language = polymer_dataset.smiles_language
            pad_ind = smiles_language.padding_index
            (
                monomer_start_ind,
                monomer_stop_ind,
                catalyst_start_ind,
                catalyst_stop_ind,
                ch3_ind,
                oh_ind,
                ch2_ind,
                b_ind,
                c_ind,
            ) = self._token_indexes(
                polymer_dataset,
                '<MONOMER_START>',
                '<MONOMER_STOP>',
                '<CATALYST_START>',
                '<CATALYST_STOP>',
                '[CH3]',
                '[OH]',
                '[CH2]',
                '-',
                'C',
            )

            # test first sample
            monomer, catalyst, labels = polymer_dataset[0]

            # CCO -> [CH3]-[CH2]-[OH], 9 tokens including padding
            self.assertEqual(
                self._flat(monomer),
                [pad_ind] * 2
                + [
                    monomer_start_ind,
                    ch3_ind,
                    b_ind,
                    ch2_ind,
                    b_ind,
                    oh_ind,
                    monomer_stop_ind,
                ],
            )
            self.assertEqual(
                smiles_language.token_indexes_to_smiles(monomer),
                '[CH3]-[CH2]-[OH]',
            )

            # CC -> C-C, 51 tokens including padding. No padding length is
            # given for the catalyst; 51 is presumably the longest catalyst
            # sequence in the fixture (original note: "longest = 51").
            self.assertEqual(
                self._flat(catalyst),
                [pad_ind] * 46
                + [catalyst_start_ind, c_ind, b_ind, c_ind, catalyst_stop_ind],
            )
            self.assertEqual(
                smiles_language.token_indexes_to_smiles(catalyst),
                'C-C',
            )
            self.assertTrue(np.allclose(self._flat(labels), [2.3, 3.4]))

    def test___getitem__(self) -> None:
        """Test __getitem__."""
        with TestFileContent(self._MONOMERS) as monomer_file, \
                TestFileContent(self._SHORT_CATALYSTS) as catalyst_file, \
                TestFileContent(self._ANNOTATIONS) as annotation_file:
            polymer_dataset = PolymerTokenizerDataset(
                monomer_file.filename,
                catalyst_file.filename,
                annotations_filepath=annotation_file.filename,
                entity_names=['monomer', 'cATalysT'],
                remove_bonddir=True,
            )

            pad_ind = polymer_dataset.smiles_language.padding_index
            (
                monomer_start_ind,
                monomer_stop_ind,
                catalyst_start_ind,
                catalyst_stop_ind,
                c_ind,
                o_ind,
                n_ind,
                s_ind,
            ) = self._token_indexes(
                polymer_dataset,
                '<MONOMER_START>',
                '<MONOMER_STOP>',
                '<CATALYST_START>',
                '<CATALYST_STOP>',
                'C',
                'O',
                'N',
                'S',
            )

            # test first sample: monomer CCO, catalyst CC
            monomer, catalyst, labels = polymer_dataset[0]

            self.assertEqual(
                self._flat(monomer),
                [pad_ind] * 8
                + [monomer_start_ind, c_ind, c_ind, o_ind, monomer_stop_ind],
            )
            self.assertEqual(
                self._flat(catalyst),
                [pad_ind] * 9
                + [catalyst_start_ind, c_ind, c_ind, catalyst_stop_ind],
            )
            self.assertTrue(np.allclose(self._flat(labels), [2.3, 3.4]))

            # test third sample: monomer NCCS, catalyst NCCSCCCCC
            monomer, catalyst, labels = polymer_dataset[2]

            self.assertEqual(
                self._flat(monomer),
                [pad_ind] * 7
                + [
                    monomer_start_ind,
                    n_ind,
                    c_ind,
                    c_ind,
                    s_ind,
                    monomer_stop_ind,
                ],
            )
            self.assertEqual(
                self._flat(catalyst),
                [pad_ind] * 2
                + [catalyst_start_ind, n_ind, c_ind, c_ind, s_ind]
                + [c_ind] * 5
                + [catalyst_stop_ind],
            )
            self.assertTrue(np.allclose(self._flat(labels), [6.7, 7.8]))

    def test___getitem___with_annotation_column_names(self) -> None:
        """Test __getitem__ with annotations_column_names in the annotation."""
        # The annotation file carries an extra 'index' column and two label
        # columns; only 'label_0' is requested, so one label is expected.
        annotations = os.linesep.join(
            [
                'index,label_0,label_1,monomer,catalyst',
                '0,2.3,3.4,CHEMBL545,CHEMBL6402',
                '1,4.5,5.6,CHEMBL17564,CHEMBL543',
                '1,6.7,7.8,CHEMBL602,CHEMBL17',
            ]
        )

        with TestFileContent(self._MONOMERS) as monomer_file, \
                TestFileContent(self._SHORT_CATALYSTS) as catalyst_file, \
                TestFileContent(annotations) as annotation_file:
            polymer_dataset = PolymerTokenizerDataset(
                monomer_file.filename,
                catalyst_file.filename,
                annotations_filepath=annotation_file.filename,
                entity_names=['monomer', 'cATalysT'],
                annotations_column_names=['label_0'],
                remove_bonddir=True,
            )

            pad_ind = polymer_dataset.smiles_language.padding_index
            (
                monomer_start_ind,
                monomer_stop_ind,
                catalyst_start_ind,
                catalyst_stop_ind,
                c_ind,
                o_ind,
                n_ind,
                s_ind,
            ) = self._token_indexes(
                polymer_dataset,
                '<MONOMER_START>',
                '<MONOMER_STOP>',
                '<CATALYST_START>',
                '<CATALYST_STOP>',
                'C',
                'O',
                'N',
                'S',
            )

            # test first sample: monomer CCO, catalyst NCCSCCCCC
            monomer, catalyst, labels = polymer_dataset[0]

            self.assertEqual(
                self._flat(monomer),
                [pad_ind] * 8
                + [monomer_start_ind, c_ind, c_ind, o_ind, monomer_stop_ind],
            )
            self.assertEqual(
                self._flat(catalyst),
                [pad_ind] * 2
                + [catalyst_start_ind, n_ind, c_ind, c_ind, s_ind]
                + [c_ind] * 5
                + [catalyst_stop_ind],
            )
            self.assertTrue(np.allclose(self._flat(labels), [2.3]))

            # test third sample: monomer NCCS, catalyst CC
            monomer, catalyst, labels = polymer_dataset[2]

            self.assertEqual(
                self._flat(monomer),
                [pad_ind] * 7
                + [
                    monomer_start_ind,
                    n_ind,
                    c_ind,
                    c_ind,
                    s_ind,
                    monomer_stop_ind,
                ],
            )
            self.assertEqual(
                self._flat(catalyst),
                [pad_ind] * 9
                + [catalyst_start_ind, c_ind, c_ind, catalyst_stop_ind],
            )
            self.assertTrue(np.allclose(self._flat(labels), [6.7]))


# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()