Source code for pytoda.datasets.tests.test_protein_protein_interaction_dataset

"""Testing ProteinProteinInteractionDataset """
import os
import unittest

import numpy as np

from pytoda.datasets import ProteinProteinInteractionDataset
from pytoda.proteins import ProteinFeatureLanguage, ProteinLanguage
from pytoda.tests.utils import TestFileContent


[docs]class TestProteinProteinInteractionDataset(unittest.TestCase):
    """Testing annotated dataset."""

[docs]    def test___len__(self) -> None:

        content_entity_1 = os.linesep.join(['CCE	ID1', 'KCPR	ID3', 'NCCS	ID2'])
        content_entity_2 = os.linesep.join(
            ['EGK	ID3', 'S	ID1', 'FGAAV	ID2', 'NCCS	ID4']
        )

        annotated_content = os.linesep.join(
            [
                'label_0,label_1,tcr,peptide',
                '2.3,3.4,ID3,ID4',
                '4.5,5.6,ID2,ID1',
                '6.7,7.8,ID1,ID2',
            ]
        )

        with TestFileContent(content_entity_1, suffix='.smi') as a_test_file:
            with TestFileContent(content_entity_2, suffix='.smi') as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:

                    for sequence_filetype in ['.smi', 'infer']:
                        ppi_dataset = ProteinProteinInteractionDataset(
                            [a_test_file.filename, another_test_file.filename],
                            ['tcr', 'peptide'],
                            annotation_file.filename,
                            sequence_filetypes=sequence_filetype,
                        )

                    self.assertEqual(len(ppi_dataset), 3)

        # Test for length if some sequences are not there
        annotated_content = os.linesep.join(
            [
                'label_0,label_1,tcr,peptide',
                '2.3,3.4,ID3,ID4',
                '4.5,5.6,ID2,ID1',
                '6.7,7.8,ID1,ID2',
                '6.7,7.8,ID7,ID2',
                '3.14,1.61,oh,no',
            ]
        )
        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [a_test_file.filename, another_test_file.filename],
                        ['tcr', 'peptide'],
                        annotation_file.filename,
                        sequence_filetypes='.smi',
                    )

                    self.assertEqual(len(ppi_dataset), 3)
                    # ppi_dataset.masks_df to inspect

[docs]    def test___getitem__(self) -> None:
        """Test __getitem__."""

        content_entity_1 = os.linesep.join(['CCE	ID1', 'KCPR	ID3', 'NCCS	ID2'])
        content_entity_2 = os.linesep.join(
            ['EGK	ID3', 'S	ID1', 'FGAAV	ID2', 'NCCS	ID4']
        )

        annotated_content = os.linesep.join(
            [
                'label_0,label_1,tcr,peptide',
                '2.3,3.4,ID3,ID4',
                '4.5,5.6,ID2,ID1',
                '6.7,7.8,ID1,ID2',
            ]
        )

        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [a_test_file.filename, another_test_file.filename],
                        ['tcr', 'peptide'],
                        annotation_file.filename,
                        sequence_filetypes='.smi',
                    )

                    # test first sample
                    tcr, peptide, label = ppi_dataset[0]

                    tok_to_idx = ppi_dataset.protein_languages[0].token_to_index

                    self.assertEqual(
                        tcr.numpy().flatten().tolist(),
                        [
                            tok_to_idx['K'],
                            tok_to_idx['C'],
                            tok_to_idx['P'],
                            tok_to_idx['R'],
                        ],
                    )
                    self.assertEqual(
                        peptide.numpy().flatten().tolist(),
                        [
                            tok_to_idx['<PAD>'],
                            tok_to_idx['N'],
                            tok_to_idx['C'],
                            tok_to_idx['C'],
                            tok_to_idx['S'],
                        ],
                    )
                    self.assertTrue(
                        np.allclose(label.numpy().flatten().tolist(), [2.3, 3.4])
                    )

        # Test for non-case-matching entity names
        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [a_test_file.filename, another_test_file.filename],
                        ['TCR', 'Peptide'],
                        annotation_file.filename,
                        sequence_filetypes='.smi',
                    )

                    # test first sample
                    tcr, peptide, label = ppi_dataset[0]

                    tok_to_idx = ppi_dataset.protein_languages[0].token_to_index

                    self.assertEqual(
                        tcr.numpy().flatten().tolist(),
                        [
                            tok_to_idx['K'],
                            tok_to_idx['C'],
                            tok_to_idx['P'],
                            tok_to_idx['R'],
                        ],
                    )
                    self.assertEqual(
                        peptide.numpy().flatten().tolist(),
                        [
                            tok_to_idx['<PAD>'],
                            tok_to_idx['N'],
                            tok_to_idx['C'],
                            tok_to_idx['C'],
                            tok_to_idx['S'],
                        ],
                    )
                    self.assertTrue(
                        np.allclose(label.numpy().flatten().tolist(), [2.3, 3.4])
                    )

        # Switch label columns
        annotated_content = os.linesep.join(
            [
                'label_0,label_1,peptIDE,tcR',
                '2.3,3.4,ID3,ID4',
                '4.5,5.6,ID2,ID1',
                '6.7,7.8,ID1,ID2',
            ]
        )
        content_entity_1 = os.linesep.join(['CCE	ID1', 'KCPR	ID3', 'NCCS	ID2'])
        content_entity_2 = os.linesep.join(
            ['EGK	ID3', 'S	ID1', 'FGAAV	ID2', 'NCCS	ID4']
        )

        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [a_test_file.filename, another_test_file.filename],
                        ['tcr', 'peptide'],
                        annotation_file.filename,
                        sequence_filetypes='.smi',
                    )
                    self.assertEqual(len(ppi_dataset), 2)

                    # test first sample
                    tcr, peptide, label = ppi_dataset[-1]

                    tok_to_idx = ppi_dataset.protein_languages[0].token_to_index
                    self.assertTrue(
                        np.allclose(label.numpy().flatten().tolist(), [6.7, 7.8])
                    )
                    self.assertEqual(
                        tcr.numpy().flatten().tolist(),
                        [
                            tok_to_idx['N'],
                            tok_to_idx['C'],
                            tok_to_idx['C'],
                            tok_to_idx['S'],
                        ],
                    )
                    self.assertEqual(
                        peptide.numpy().flatten().tolist(),
                        [
                            tok_to_idx['<PAD>'],
                            tok_to_idx['<PAD>'],
                            tok_to_idx['<PAD>'],
                            tok_to_idx['<PAD>'],
                            tok_to_idx['S'],
                        ],
                    )

        # Only one annotation column
        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [a_test_file.filename, another_test_file.filename],
                        ['tcr', 'peptide'],
                        annotation_file.filename,
                        sequence_filetypes='.smi',
                        annotations_column_names=['label_0'],
                    )
                    self.assertEqual(len(ppi_dataset), 2)

                    # test first sample
                    tcr, peptide, label = ppi_dataset[-1]

                    tok_to_idx = ppi_dataset.protein_languages[0].token_to_index
                    self.assertTrue(
                        np.allclose(label.numpy().flatten().tolist(), [6.7])
                    )

        # Annotation colum given as index
        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [a_test_file.filename, another_test_file.filename],
                        ['tcr', 'peptide'],
                        annotation_file.filename,
                        sequence_filetypes='.smi',
                        annotations_column_names=[1],
                    )
                    self.assertEqual(len(ppi_dataset), 2)

                    # test first sample
                    tcr, peptide, label = ppi_dataset[-1]

                    tok_to_idx = ppi_dataset.protein_languages[0].token_to_index
                    self.assertTrue(
                        np.allclose(label.numpy().flatten().tolist(), [7.8])
                    )
        # Test for giving only one protein sequence entity
        with TestFileContent(content_entity_2) as a_test_file:
            with TestFileContent(annotated_content) as annotation_file:
                ppi_dataset = ProteinProteinInteractionDataset(
                    [a_test_file.filename],
                    ['peptide'],
                    annotation_file.filename,
                    sequence_filetypes='.smi',
                    annotations_column_names=[1],
                )
                self.assertEqual(len(ppi_dataset), 3)

                # test last sample
                data_tuple = ppi_dataset[-1]
                self.assertEqual(len(data_tuple), 2)

        # Test for giving three entities
        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(content_entity_2) as third_test_file:
                    with TestFileContent(annotated_content) as annotation_file:

                        # Test passing no protein language
                        ppi_dataset = ProteinProteinInteractionDataset(
                            [
                                a_test_file.filename,
                                another_test_file.filename,
                                third_test_file.filename,
                            ],
                            entity_names=['tcr', 'peptide', 'peptide'],
                            labels_filepath=annotation_file.filename,
                            sequence_filetypes='.smi',
                            annotations_column_names=[1],
                        )
                        self.assertEqual(len(ppi_dataset), 2)

                        # test last sample
                        data_tuple = ppi_dataset[-1]
                        self.assertEqual(len(data_tuple), 4)
                        self.assertListEqual(
                            data_tuple[1].tolist(), data_tuple[2].tolist()
                        )

                        # Test passing single protein language
                        for dic in ['iupac', 'unirep', 'human-kinase-alignment']:
                            for s in [False, True]:
                                lang = ProteinLanguage(
                                    amino_acid_dict=dic, add_start_and_stop=s
                                )
                                ppi_dataset = ProteinProteinInteractionDataset(
                                    [
                                        a_test_file.filename,
                                        another_test_file.filename,
                                        third_test_file.filename,
                                    ],
                                    entity_names=['tcr', 'peptide', 'peptide'],
                                    protein_languages=lang,
                                    labels_filepath=annotation_file.filename,
                                    sequence_filetypes='.smi',
                                    annotations_column_names=[1],
                                    add_start_and_stops=s,
                                    iterate_datasets=True,
                                )
                                self.assertEqual(len(ppi_dataset), 2)

                                # test last sample
                                data_tuple = ppi_dataset[-1]
                                self.assertEqual(len(data_tuple), 4)
                                self.assertListEqual(
                                    data_tuple[1].tolist(), data_tuple[2].tolist()
                                )
                                # Tensor should be 1D
                                self.assertEqual(1, len(data_tuple[0].shape))

                        # Test passing single protein_feature_language
                        for dic in ['blosum', 'binary_features', 'float_features']:
                            for s in [False, True]:
                                lang = ProteinFeatureLanguage(
                                    features=dic, add_start_and_stop=s
                                )

                                ppi_dataset = ProteinProteinInteractionDataset(
                                    [
                                        a_test_file.filename,
                                        another_test_file.filename,
                                        third_test_file.filename,
                                    ],
                                    entity_names=['tcr', 'peptide', 'peptide'],
                                    protein_languages=lang,
                                    labels_filepath=annotation_file.filename,
                                    sequence_filetypes='.smi',
                                    annotations_column_names=[1],
                                    add_start_and_stops=s,
                                    iterate_datasets=True,
                                )
                                self.assertEqual(len(ppi_dataset), 2)

                                # test last sample
                                data_tuple = ppi_dataset[-1]
                                self.assertEqual(len(data_tuple), 4)
                                self.assertListEqual(
                                    data_tuple[1].tolist(), data_tuple[2].tolist()
                                )
                                # Tensor should be 2D
                                self.assertEqual(2, len(data_tuple[0].shape))

                        # Test passing mixture of protein and protein_feature languages
                        for dic1 in ['iupac', 'unirep', 'human-kinase-alignment']:
                            for dic2 in ['blosum', 'float_features']:

                                lang1 = ProteinLanguage(amino_acid_dict=dic1)
                                lang2 = ProteinFeatureLanguage(features=dic2)

                                ppi_dataset = ProteinProteinInteractionDataset(
                                    [
                                        a_test_file.filename,
                                        another_test_file.filename,
                                        third_test_file.filename,
                                    ],
                                    entity_names=['tcr', 'peptide', 'peptide'],
                                    protein_languages=[lang1, lang2, lang2],
                                    labels_filepath=annotation_file.filename,
                                    sequence_filetypes='.smi',
                                    annotations_column_names=[1],
                                    add_start_and_stops=True,
                                    iterate_datasets=True,
                                )
                                self.assertEqual(len(ppi_dataset), 2)

                                # test last sample
                                data_tuple = ppi_dataset[-1]
                                self.assertEqual(len(data_tuple), 4)
                                self.assertListEqual(
                                    data_tuple[1].tolist(), data_tuple[2].tolist()
                                )
                                # Tensor dimensions
                                for i, g in zip(range(len(data_tuple) - 1), (1, 2, 2)):
                                    self.assertEqual(g, len(data_tuple[i].shape))

                                # Testing alternative ordering
                                ppi_dataset = ProteinProteinInteractionDataset(
                                    [
                                        a_test_file.filename,
                                        another_test_file.filename,
                                        third_test_file.filename,
                                    ],
                                    entity_names=['tcr', 'peptide', 'peptide'],
                                    protein_languages=[lang2, lang1, lang2],
                                    labels_filepath=annotation_file.filename,
                                    sequence_filetypes='.smi',
                                    annotations_column_names=[1],
                                    add_start_and_stops=True,
                                    iterate_datasets=True,
                                )
                                self.assertEqual(len(ppi_dataset), 2)

                                # test last sample
                                data_tuple = ppi_dataset[-2]
                                self.assertEqual(len(data_tuple), 4)
                                # Test whether decoding with PL and PFL gives the same
                                feat1 = data_tuple[1].long().tolist()
                                if dic2 == 'blosum':
                                    feat0 = [
                                        tuple(x) for x in data_tuple[0].long().tolist()
                                    ]

                                    feat2 = [
                                        tuple(x) for x in data_tuple[2].long().tolist()
                                    ]
                                elif dic2 == 'float_features':
                                    feat0 = [
                                        tuple([round(xx, 2) for xx in x])
                                        for x in data_tuple[0].double().tolist()
                                    ]
                                    feat2 = [
                                        tuple([round(xx, 2) for xx in x])
                                        for x in data_tuple[2].double().tolist()
                                    ]

                                self.assertEqual(
                                    'CCE',
                                    ppi_dataset.protein_languages[
                                        0
                                    ].token_indexes_to_sequence(feat0),
                                )
                                self.assertEqual(
                                    'FGAAV',
                                    ppi_dataset.protein_languages[
                                        1
                                    ].token_indexes_to_sequence(feat1),
                                )
                                self.assertEqual(
                                    'FGAAV',
                                    ppi_dataset.protein_languages[
                                        2
                                    ].token_indexes_to_sequence(feat2),
                                )
                                # Tensor dimensions
                                for i, g in zip(range(len(data_tuple) - 1), (2, 1, 2)):
                                    self.assertEqual(g, len(data_tuple[i].shape))

        # Test for using different padding lengths
        padding_lengths = [8, 6]
        with TestFileContent(content_entity_1) as a_test_file:
            with TestFileContent(content_entity_2) as another_test_file:
                with TestFileContent(annotated_content) as annotation_file:
                    ppi_dataset = ProteinProteinInteractionDataset(
                        [
                            a_test_file.filename,
                            another_test_file.filename,
                        ],
                        ['tcr', 'peptide'],
                        annotation_file.filename,
                        padding_lengths=padding_lengths,
                        sequence_filetypes='.smi',
                    )
                    self.assertEqual(len(ppi_dataset), 2)

                    # test last sample
                    data_tuple = ppi_dataset[-1]
                    self.assertEqual(len(data_tuple), 3)
                    for i, p in enumerate(padding_lengths):
                        self.assertEqual(len(data_tuple[i]), p)


if __name__ == '__main__':
    unittest.main()