# Source code for pytoda.proteins.tests.test_protein_language

"""Testing ProteinLanguage."""
import os
import unittest

from pytoda.proteins.processing import (
    HUMAN_KINASE_ALIGNMENT_VOCAB,
    IUPAC_VOCAB,
    UNIREP_VOCAB,
)
from pytoda.proteins.protein_language import ProteinLanguage
from pytoda.tests.utils import TestFileContent


class TestProteinLanguage(unittest.TestCase):
    """Testing ProteinLanguage."""

    def test__update_max_token_sequence_length(self) -> None:
        """Test _update_max_token_sequence_length.

        For each constructor configuration, check the value of
        ``max_token_sequence_length`` right after construction and again
        after adding a three-residue sequence.
        """
        sequence = 'EGK'
        # (constructor kwargs, length after construction, length after add)
        cases = [
            ({'add_start_and_stop': False}, 0, 3),
            ({'add_start_and_stop': True}, 2, 5),
            ({'add_start_and_stop': False, 'amino_acid_dict': 'unirep'}, 0, 3),
            # UniRep vocabulary together with start/stop tokens.
            ({'add_start_and_stop': True, 'amino_acid_dict': 'unirep'}, 2, 5),
        ]
        for kwargs, initial_length, final_length in cases:
            with self.subTest(**kwargs):
                protein_language = ProteinLanguage(**kwargs)
                self.assertEqual(
                    protein_language.max_token_sequence_length, initial_length
                )
                protein_language.add_sequence(sequence)
                self.assertEqual(
                    protein_language.max_token_sequence_length, final_length
                )

    def test_add_file(self) -> None:
        """Test add_file.

        Covers a two-column sequence/identifier file and a ``.fasta`` file.
        """
        # NOTE(review): the two columns appear separated by a single space
        # here; confirm this matches the delimiter add_file expects (a tab
        # may have been lost when this source was extracted).
        content = os.linesep.join(['EGK ID3', 'S ID1', 'FGAAV ID2', 'NCCS ID4'])
        with TestFileContent(content) as a_test_file:
            protein_language = ProteinLanguage()
            protein_language.add_file(a_test_file.filename, index_col=1)
            self.assertEqual(protein_language.max_token_sequence_length, 7)

        # Test parsing of .fasta file
        # Two records; the first one has 3 * 60 + 24 = 204 residues and is
        # the longest, which is the value asserted below.
        content = r""">sp|Q6GZX0|005R_FRG3G Uncharacterized protein 005R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-005R PE=4 SV=1
MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS
NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED
QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT
REFVDKDAQEFQDFLNSLDASLLS
>sp|Q91G88|006L_IIV6 Putative KilA-N domain-containing protein 006L OS=Invertebrate iridescent virus 6 OX=176652 GN=IIV6-006L PE=3 SV=1
MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL
IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII
"""
        with TestFileContent(content) as a_test_file:
            protein_language = ProteinLanguage(add_start_and_stop=False)
            protein_language.add_file(a_test_file.filename, file_type='.fasta')
            self.assertEqual(protein_language.max_token_sequence_length, 204)

    def test_sequence_to_token_indexes(self) -> None:
        """Test sequence_to_token_indexes.

        Encodes a sequence with the default, UniRep and human kinase
        alignment vocabularies, with and without start/stop tokens, and
        finally encodes a sequence containing an unknown character.
        """
        # Default vocabulary
        sequence = 'CCO'
        protein_language = ProteinLanguage(add_start_and_stop=False)
        protein_language.add_sequence(sequence)
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(sequence),
            [IUPAC_VOCAB['C'], IUPAC_VOCAB['C'], IUPAC_VOCAB['O']],
        )
        protein_language = ProteinLanguage(add_start_and_stop=True)
        protein_language.add_sequence(sequence)
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(sequence),
            [
                IUPAC_VOCAB['<START>'],
                IUPAC_VOCAB['C'],
                IUPAC_VOCAB['C'],
                IUPAC_VOCAB['O'],
                IUPAC_VOCAB['<STOP>'],
            ],
        )

        # UniRep
        protein_language = ProteinLanguage(
            add_start_and_stop=False, amino_acid_dict='unirep'
        )
        protein_language.add_sequence(sequence)
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(sequence),
            [UNIREP_VOCAB['C'], UNIREP_VOCAB['C'], UNIREP_VOCAB['O']],
        )
        protein_language = ProteinLanguage(
            add_start_and_stop=True, amino_acid_dict='unirep'
        )
        protein_language.add_sequence(sequence)
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(sequence),
            [
                UNIREP_VOCAB['<START>'],
                UNIREP_VOCAB['C'],
                UNIREP_VOCAB['C'],
                UNIREP_VOCAB['O'],
                UNIREP_VOCAB['<STOP>'],
            ],
        )

        # human kinase alignment
        sequence = 'AC-C'
        protein_language = ProteinLanguage(
            add_start_and_stop=False, amino_acid_dict='human-kinase-alignment'
        )
        protein_language.add_sequence(sequence)
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(sequence),
            [
                HUMAN_KINASE_ALIGNMENT_VOCAB['A'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['C'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['-'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['C'],
            ],
        )
        protein_language = ProteinLanguage(
            add_start_and_stop=True, amino_acid_dict='human-kinase-alignment'
        )
        protein_language.add_sequence(sequence)
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(sequence),
            [
                HUMAN_KINASE_ALIGNMENT_VOCAB['<START>'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['A'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['C'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['-'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['C'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['<STOP>'],
            ],
        )

        # Test encoding an unknown character
        # Reuses the start/stop human kinase alignment language from above.
        new_seq = 'Aq-C'
        self.assertListEqual(
            protein_language.sequence_to_token_indexes(new_seq),
            [
                HUMAN_KINASE_ALIGNMENT_VOCAB['<START>'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['A'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['<UNK>'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['-'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['C'],
                HUMAN_KINASE_ALIGNMENT_VOCAB['<STOP>'],
            ],
        )

    def test_token_indexes_to_sequence(self) -> None:
        """Test token_indexes_to_sequence.

        For each vocabulary, decodes the plain token indexes and then the
        same indexes wrapped in ``<START>``/``<STOP>``; both are expected to
        decode to the bare sequence.
        """
        # Default vocabulary
        sequence = 'CCO'
        protein_language = ProteinLanguage()
        protein_language.add_sequence(sequence)
        token_indexes = [protein_language.token_to_index[token] for token in sequence]
        self.assertEqual(
            protein_language.token_indexes_to_sequence(token_indexes), 'CCO'
        )
        # The wrapped indexes are built before the language is re-created.
        token_indexes = (
            [protein_language.token_to_index['<START>']]
            + token_indexes
            + [protein_language.token_to_index['<STOP>']]
        )
        protein_language = ProteinLanguage(add_start_and_stop=True)
        protein_language.add_sequence(sequence)
        self.assertEqual(
            protein_language.token_indexes_to_sequence(token_indexes), 'CCO'
        )

        # UniRep
        protein_language = ProteinLanguage(amino_acid_dict='unirep')
        protein_language.add_sequence(sequence)
        token_indexes = [protein_language.token_to_index[token] for token in sequence]
        self.assertEqual(
            protein_language.token_indexes_to_sequence(token_indexes), 'CCO'
        )
        token_indexes = (
            [protein_language.token_to_index['<START>']]
            + token_indexes
            + [protein_language.token_to_index['<STOP>']]
        )
        protein_language = ProteinLanguage(
            add_start_and_stop=True, amino_acid_dict='unirep'
        )
        protein_language.add_sequence(sequence)
        self.assertEqual(
            protein_language.token_indexes_to_sequence(token_indexes), 'CCO'
        )

        # human kinase alignment
        sequence = 'AC-C'
        protein_language = ProteinLanguage(amino_acid_dict='human-kinase-alignment')
        protein_language.add_sequence(sequence)
        token_indexes = [protein_language.token_to_index[token] for token in sequence]
        self.assertEqual(
            protein_language.token_indexes_to_sequence(token_indexes), sequence
        )
        token_indexes = (
            [protein_language.token_to_index['<START>']]
            + token_indexes
            + [protein_language.token_to_index['<STOP>']]
        )
        protein_language = ProteinLanguage(
            add_start_and_stop=True, amino_acid_dict='human-kinase-alignment'
        )
        protein_language.add_sequence(sequence)
        self.assertEqual(
            protein_language.token_indexes_to_sequence(token_indexes), sequence
        )
if __name__ == '__main__':
    # Run the test cases defined in this module when executed as a script.
    unittest.main()