"""Testing DrugAffinityDataset."""
import os
import unittest

import numpy as np
from torch.utils.data import DataLoader

from pytoda.datasets import DrugAffinityDataset
from pytoda.tests.utils import TestFileContent

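# Two header variants exercised by setUp: the leading empty field is the
# unnamed CSV index column; the remaining names are passed as ``column_names``.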
COLUMN_NAMES = [',ligand_name,sequence_id,label', ',drug,protein,class']
DRUG_AFFINITY_CONTENT = os.linesep.join(
    [
        '0,CHEMBL14688,name=20S proteasome chymotrypsin-like-organism=Homo sapiens,1',  # noqa
        '1,CHEMBL14688,name=21S proteasome chymotrypsin-like-organism=Homo sapiens,0',  # noqa
        '2,CHEMBL17564,name=20S proteasome chymotrypsin-like-organism=Homo sapiens,0',  # noqa
        '3,CHEMBL17564,name=21S proteasome chymotrypsin-like-organism=Homo sapiens,1',  # noqa
    ]
)
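# .smi-style fixture: one SMILES string and compound name per line,
# tab-separated. Only CHEMBL14688 ('CO') and CHEMBL17564 ('C') are
# referenced by the affinity rows above.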
SMILES_CONTENT = os.linesep.join(
    ['CCO\tCHEMBL545', 'C\tCHEMBL17564', 'CO\tCHEMBL14688', 'NCCS\tCHEMBL602']
)
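# Protein fixture: one sequence and annotation-style identifier per line,
# tab-separated. The affinity rows above reference the 20S and 21S entries.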
PROTEIN_SEQUENCE_CONTENT = os.linesep.join(
    [
        'ABC\tname=20S proteasome chymotrypsin-like-organism=Homo sapiens',  # noqa
        'DEFG\tname=21S proteasome chymotrypsin-like-organism=Homo sapiens',  # noqa
        'CDEF\tname=22S proteasome chymotrypsin-like-organism=Homo sapiens',  # noqa
        'XZSDASDFF\tname=23S proteasome chymotrypsin-like-organism=Homo sapiens',  # noqa
    ]
)

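# TestDrugAffinityDatasetLazyBackend below inherits these tests unchanged and
# only swaps the backend, so every assertion runs against both backends.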
class TestDrugAffinityDatasetEagerBackend(unittest.TestCase):
    """Testing DrugAffinityDataset with eager backend."""

    def setUp(self):
        self.backend = 'eager'
        print(f'backend is {self.backend}')
        self.smiles_content = SMILES_CONTENT
        self.protein_sequence_content = PROTEIN_SEQUENCE_CONTENT
        for column_names in COLUMN_NAMES:
            self.drug_affinity_content = os.linesep.join(
                [column_names, DRUG_AFFINITY_CONTENT]
            )
            with TestFileContent(self.drug_affinity_content) as drug_affinity_file:
                with TestFileContent(self.smiles_content) as smiles_file:
                    with TestFileContent(
                        self.protein_sequence_content
                    ) as protein_sequence_file:
                        self.drug_affinity_dataset = DrugAffinityDataset(
                            drug_affinity_file.filename,
                            smiles_file.filename,
                            protein_sequence_file.filename,
                            backend=self.backend,
                            column_names=column_names.split(',')[1:],
                        )

    def test___len__(self) -> None:
        """Test __len__."""
        self.assertEqual(len(self.drug_affinity_dataset), 4)

    def test___getitem__(self) -> None:
        """Test __getitem__."""
        smiles_language = self.drug_affinity_dataset.smiles_dataset.smiles_language
        protein_language = (
            self.drug_affinity_dataset.protein_sequence_dataset.protein_language
        )
        smiles_padding_index = smiles_language.padding_index
        smiles_c_index = smiles_language.token_to_index['C']
        smiles_o_index = smiles_language.token_to_index['O']
        protein_sequence_padding_index = protein_language.padding_index
        protein_sequence_a_index = protein_language.token_to_index['A']
        protein_sequence_b_index = protein_language.token_to_index['B']
        protein_sequence_c_index = protein_language.token_to_index['C']
        # Sample 0: CHEMBL14688 ('CO') against the 20S proteasome sequence
        # ('ABC'), label 1. SMILES are left-padded to the longest SMILES in
        # the fixture (4 tokens), proteins to the longest sequence (9 tokens).
        (
            smiles_indexes_tensor,
            protein_sequence_indexes_tensor,
            label_tensor,
        ) = self.drug_affinity_dataset[0]
        np.testing.assert_almost_equal(
            smiles_indexes_tensor.numpy(),
            np.array(
                [
                    smiles_padding_index,
                    smiles_padding_index,
                    smiles_c_index,
                    smiles_o_index,
                ]
            ),
        )
        np.testing.assert_almost_equal(
            protein_sequence_indexes_tensor.numpy(),
            np.array(
                6 * [protein_sequence_padding_index]
                + [
                    protein_sequence_a_index,
                    protein_sequence_b_index,
                    protein_sequence_c_index,
                ]
            ),
        )
        np.testing.assert_almost_equal(label_tensor.numpy(), np.array([1], dtype=int))

    def test_data_loader(self) -> None:
        """Test data_loader."""
        data_loader = DataLoader(
            self.drug_affinity_dataset, batch_size=2, shuffle=True
        )
        for (
            batch_index,
            (smiles_indexes_batch, protein_sequence_indexes_batch, label_batch),
        ) in enumerate(data_loader):
            self.assertEqual(smiles_indexes_batch.size(), (2, 4))
            self.assertEqual(protein_sequence_indexes_batch.size(), (2, 9))
            self.assertEqual(label_batch.size(), (2, 1))
            # Safeguard only: with 4 samples and batch_size=2 the loader
            # yields just 2 batches, so this break is never reached.
            if batch_index > 4:
                break

class TestDrugAffinityDatasetLazyBackend(TestDrugAffinityDatasetEagerBackend):
    """Testing DrugAffinityDataset with lazy backend."""

    def setUp(self):
        self.backend = 'lazy'
        print(f'backend is {self.backend}')
        self.smiles_content = SMILES_CONTENT
        self.protein_sequence_content = PROTEIN_SEQUENCE_CONTENT
        for column_names in COLUMN_NAMES:
            self.drug_affinity_content = os.linesep.join(
                [column_names, DRUG_AFFINITY_CONTENT]
            )
            with TestFileContent(self.drug_affinity_content) as drug_affinity_file:
                with TestFileContent(self.smiles_content) as smiles_file:
                    with TestFileContent(
                        self.protein_sequence_content
                    ) as protein_sequence_file:
                        self.drug_affinity_dataset = DrugAffinityDataset(
                            drug_affinity_file.filename,
                            smiles_file.filename,
                            protein_sequence_file.filename,
                            backend=self.backend,
                            column_names=column_names.split(',')[1:],
                        )

if __name__ == '__main__':
    unittest.main()