# Source code for pytoda.proteins.protein_feature_language

"""Protein language handling."""
from ..types import Tokenizer
from .processing import AA_FEAT, AA_PROPERTIES_NUM, BLOSUM62, BLOSUM62_NORM
from .protein_language import ProteinLanguage


class IndexesToSequenceError(Exception):
    """Raised when token indexes cannot be mapped back to a sequence.

    Used by ``token_indexes_to_sequence_raise`` for feature alphabets whose
    token-to-feature mapping is not unique and therefore not invertible.
    """


def token_indexes_to_sequence_raise(token_indexes: list) -> str:
    """
    Replacement for ``token_indexes_to_sequence`` that always raises.

    Meant to be assigned onto an instance (monkey patch) when the chosen
    feature alphabet cannot be inverted back to amino acid tokens.

    Args:
        token_indexes (list): ignored; present only so the signature matches
            the method this function stands in for.

    Raises:
        IndexesToSequenceError: always.
    """
    raise IndexesToSequenceError(
        'token_indexes_to_sequence not implemented for '
        'binary_features since mapping is not unique.'
    )


class ProteinFeatureLanguage(ProteinLanguage):
    """
    ProteinFeatureLanguage class.

    ProteinFeatureLanguage handles Protein data and translates from text to
    feature space, i.e. every token is looked up in a table whose values are
    feature collections of length ``number_of_features``.
    """

    def __init__(
        self,
        name: str = 'protein-feature-language',
        features: str = 'blosum',
        tokenizer: Tokenizer = list,
        add_start_and_stop: bool = True,
    ) -> None:
        """
        Initialize Protein feature language.

        Args:
            name (str): name of the ProteinFeatureLanguage.
            features (str): Feature alphabet choice. Defaults to 'blosum',
                alternatives are 'binary_features', 'float_features' and 'blosum_norm'.
            tokenizer (Tokenizer): This needs to be a function used to tokenize
                the amino acid sequences. The default is list which simply
                splits the sequence character-by-character.
            add_start_and_stop (bool): stored on the instance as
                ``self.add_start_and_stop``. Presumably controls whether
                start/stop tokens are added around sequences by the parent
                class - TODO confirm against ProteinLanguage. Defaults to
                True.

        Raises:
            ValueError: if ``features`` is not one of the supported choices.
        """
        # NOTE(review): ProteinLanguage.__init__ is not called here; the
        # attributes are set directly and `setup_dict` is invoked at the end.
        # Presumably intentional because the parent selects a different kind
        # of vocabulary - confirm before adding a super().__init__() call.
        self.name = name
        self.feat = features
        self.add_start_and_stop = add_start_and_stop

        if self.feat == 'binary_features':
            self.token_to_index = AA_PROPERTIES_NUM
            # The binary feature mapping is not unique, so decoding back to a
            # sequence is disabled for this instance only: the instance
            # attribute shadows the method defined on the class.
            self.token_indexes_to_sequence = token_indexes_to_sequence_raise
        elif self.feat == 'float_features':
            self.token_to_index = AA_FEAT
        elif self.feat == 'blosum':
            self.token_to_index = BLOSUM62
        elif self.feat == 'blosum_norm':
            self.token_to_index = BLOSUM62_NORM
        else:
            raise ValueError(
                "Choose dict as 'binary_features', 'float_features', 'blosum' or "
                f"'blosum_norm' (given was {features})."
            )

        # Every token maps to an equally long feature collection; the entry
        # for '<START>' is used to read off that length.
        self.number_of_features = len(self.token_to_index['<START>'])
        self.tokenizer = tokenizer

        # `setup_dict` is not defined in this class; it is expected to come
        # from ProteinLanguage and presumably derives `index_to_token` and
        # `sequence_tokens` from `token_to_index` - TODO confirm.
        self.setup_dict()

    def token_indexes_to_sequence(self, token_indexes: list) -> str:
        """
        Transform a list of tuples of token indexes into amino acid sequence.

        Entries that are not contained in ``self.sequence_tokens`` are
        skipped, so they do not show up in the returned string.

        Args:
            token_indexes (list): a list of tuples, one tuple per AA and each
                tuple has length self.number_of_features

        Returns:
            str: an amino acid sequence representation.
        """
        return ''.join(
            [
                self.index_to_token.get(token_index, '')
                for token_index in token_indexes
                if token_index in self.sequence_tokens
            ]
        )

    @property
    def method(self) -> str:
        """A string denoting the language encoding method."""
        return self.feat