Source code for pytoda.proteins.transforms

"""Amino Acid Sequence transforms."""
from itertools import groupby
from pathlib import Path
from typing import Dict, List, Tuple, Union

import numpy as np

from ..files import read_smi
from ..transforms import Transform
from ..types import Indexes
from .processing import REAL_AAS
from .protein_language import ProteinLanguage


class SequenceToTokenIndexes(Transform):
    """Transform Sequence to token indexes using Sequence language."""

    def __init__(self, protein_language: ProteinLanguage) -> None:
        """
        Initialize a Sequence to token indexes object.

        Args:
            protein_language (ProteinLanguage): a Protein language. Its
                `sequence_to_token_indexes` method performs the tokenization.
        """
        self.protein_language = protein_language

    def __call__(self, smiles: str) -> Indexes:
        """
        Apply the Sequence tokenization transformation.

        Args:
            smiles (str): a Sequence representation. NOTE: despite its name,
                this argument is forwarded as a protein sequence; the name is
                kept unchanged so that keyword callers keep working.

        Returns:
            Indexes: indexes representation for the Sequence provided.
        """
        return self.protein_language.sequence_to_token_indexes(smiles)


class ReplaceByFullProteinSequence(Transform):
    """
    A transform to replace short amino acid sequences with the full protein sequence.
    For example, replace active site sequence of a kinase with its full sequence.
    """

    def __init__(self, alignment_path: Union[str, Path]) -> None:
        """
        Loads alignment info with two "classes" (or types) of residues.

        Args:
            alignment_path (Union[str, Path]): path to `.smi` or `.tsv` file which
                allows to map between shortened and full, aligned sequences. Do
                not use a header in the file.

                NOTE: By convention, residues in upper case are important and will be
                kept and residues in lower case are less important and are (usually)
                discarded.
                NOTE: The first column has to be the full protein sequence (use upper
                case only for residues to be used). E.g., ggABCggDEFgg
                NOTE: The second column has to be the condensed sequence (ABCDEF).
                NOTE: The third column has to be a protein id (can be duplicated).

        Raises:
            TypeError: if `alignment_path` is neither a string nor a Path.
            ValueError: if the same combination of protein id and short
                sequence occurs in more than one row of the file.
        """
        if not isinstance(alignment_path, (str, Path)):
            raise TypeError(
                f"alignment_path must be string or Path, not {type(alignment_path)}"
            )
        self.alignment_path = alignment_path

        # NOTE(review): the column indexing and element-wise `+` below assume
        # read_smi returns a pandas-like table of strings - confirm.
        alignment_info = read_smi(
            self.alignment_path,
            index_col=None,
            header=None,
            names=['full_sequence', 'short_sequence', 'id'],
        )
        # We use a combination of ID and the shortened sequence as keys to
        # enable support if proteins have >1 short sequence.
        lookup_keys = alignment_info['id'] + '_' + alignment_info['short_sequence']
        self.id_to_full = dict(zip(lookup_keys, alignment_info['full_sequence']))

        # A dict silently collapses repeated keys, so fewer entries than rows
        # means at least one (id, short_sequence) pair was listed twice.
        if len(self.id_to_full) < len(alignment_info):
            raise ValueError(
                'Duplicate (id, short_sequence) pairs not allowed in: '
                f'{self.alignment_path}'
            )

    def __call__(self, sample_dict: Dict) -> str:
        """
        Replace the shortened sequence (usually uppercase only) with an aligned
        sequence where usually uppercase is for residues of interest and lowercase
        for the remaining ones.

        Args:
            sample_dict (Dict): a dictionary with the following keys:
                - 'id': the protein id.
                - 'sequence': the shortened protein sequence. (E.g., ABCDEF)
            NOTE: This has to be a dictionary because otherwise the shortened protein
                sequence has to be unique.

        Returns:
            str: the full protein sequence (e.g., abABChijDEF).

        Raises:
            KeyError: if `sample_dict` lacks one of the keys above, or if the
                id/sequence combination was not present in the alignment file.
        """
        return self.id_to_full[f"{sample_dict['id']}_{sample_dict['sequence']}"]


def extract_active_sites_info(
    aligned_seq: str,
) -> Tuple[str, List[str], List[str], List[str]]:
    """
    Processes and extracts useful information from an aligned protein sequence.
    Expects lower case amino acids to be outside of the relevant area (e.g., active site)
    and upper case amino acids to be inside it.

    A character counts as "upper case" when it compares `<= 'Z'`; everything
    else counts as "lower case". The sequence is cut into maximal runs of
    characters of the same kind.

    Args:
        aligned_seq: A string containing the aligned protein sequence including
            lower case amino acids and high case amino acids. May be empty, in
            which case all returned lists are empty.

    Returns:
        4-Tuple of:
        aligned_seq (str): The input sequence.
        non_active_sites (List[str]): A list of strings, one item for each contiguous
            subsequence NOT belonging to active site.
        active_sites (List[str]): A list of strings, one item for each contiguous
            subsequence belonging to active site.
        all_seqs (List[str]): A list of strings, one item for each contiguous
            subsequence that either belongs to the active site or not, in the
            order in which they occur in `aligned_seq`.
    """
    non_active_sites: List[str] = []
    active_sites: List[str] = []
    all_seqs: List[str] = []

    # groupby yields maximal runs of equal key, so no in-band separator
    # character is needed and no input character can be mistaken for one.
    for is_active, run in groupby(aligned_seq, key=lambda c: c <= 'Z'):
        segment = ''.join(run)
        all_seqs.append(segment)
        if is_active:
            active_sites.append(segment)
        else:
            non_active_sites.append(segment)

    return aligned_seq, non_active_sites, active_sites, all_seqs


def verify_aligned_info(sequence: str) -> None:
    """
    Verify that the sequence is aligned.

    A sequence is considered aligned when it contains at least one ASCII
    upper case letter (A-Z) and at least one ASCII lower case letter (a-z).

    Args:
        sequence: An amino acid sequence.

    Raises:
        TypeError: If `sequence` is not a string.
        ValueError: If alignment could not be detected.
    """
    if not isinstance(sequence, str):
        raise TypeError(f'sequence must be a string, not {type(sequence)}.')

    # `any` stops at the first match, so each scan ends as early as possible.
    found_upper_case = any('A' <= c <= 'Z' for c in sequence)
    found_lower_case = any('a' <= c <= 'z' for c in sequence)

    if not (found_lower_case and found_upper_case):
        raise ValueError(
            'Expected aligned residues sequence! '
            'Did you forget to use ReplaceByFullProteinSequence?'
        )


class ProteinAugmentFlipSubstrs(Transform):
    """
    Augment a protein sequence by randomly flipping each contiguous upper case
    (active site) subsequence. Lower case subsequences are never flipped.
    """

    def __init__(self, p: float = 0.5) -> None:
        """
        Args:
            p (float): Probability that reverting occurs. Values outside of
                [0, 1] are clipped into that range.

        Raises:
            TypeError: if `p` is not a float.
        """
        if not isinstance(p, float):
            raise TypeError(f'Please pass float, not {type(p)}.')
        self._p = np.clip(p, 0.0, 1.0)

    def __call__(self, sequence: str) -> str:
        """
        Apply the transform.

        Args:
            sequence (str): an aligned sequence (example: abcDEfgHI).

        Returns:
            str: an aligned sequence with optional flipping (example: abcEDfgHI).
        """
        verify_aligned_info(sequence)
        # Only the ordered list of all contiguous subsequences is needed here.
        *_, all_seqs = extract_active_sites_info(sequence)

        # The short-circuiting `and` makes sure a random number is drawn only
        # for upper case (active site) subsequences, one draw per subsequence.
        return ''.join(
            substr[::-1]
            if substr[0] <= 'Z' and np.random.rand() < self._p
            else substr
            for substr in all_seqs
        )


class MutateResidues(Transform):
    """
    Augment a protein sequence by injecting (possibly different) noise to residues
    inside and outside the relevant part (e.g., active site).
    NOTE: Noise means single-residue point mutations.
    """

    def __init__(self, mutate_upper: float = 0.01, mutate_lower: float = 0.1) -> None:
        """
        Args:
            mutate_upper (float): probability for mutating uppercase residues.
            mutate_lower (float): probability for mutating lowercase residues.

        Raises:
            TypeError: if either probability is not a float.
        """
        if not isinstance(mutate_upper, float):
            raise TypeError(
                f'Please pass float for mutate_upper, not {type(mutate_upper)}.'
            )
        self.mutate_upper = mutate_upper

        if not isinstance(mutate_lower, float):
            raise TypeError(
                f'Please pass float for mutate_lower, not {type(mutate_lower)}.'
            )
        self.mutate_lower = mutate_lower
        self.num_aas = len(REAL_AAS)

    def __call__(self, sequence: str) -> str:
        """
        Apply the transform.

        Every residue is replaced, with the probability that belongs to its
        subsequence, by an entry of `REAL_AAS` drawn uniformly at random. The
        drawn entry may be equal to the residue it replaces.

        Args:
            sequence (str): an aligned sequence (example: acDEFg).

        Returns:
            str: a possibly mutated aligned sequence (example: afDEFg).
        """
        verify_aligned_info(sequence)
        # Only the ordered list of all contiguous subsequences is needed here.
        *_, all_seqs = extract_active_sites_info(sequence)

        mutated = []
        for curr_sub_seq in all_seqs:
            # The case of the first character decides for the whole
            # subsequence: upper case means inside an active site.
            in_active_site = curr_sub_seq[0] <= 'Z'
            mutate_prob = self.mutate_upper if in_active_site else self.mutate_lower
            for c in curr_sub_seq:
                if np.random.rand() < mutate_prob:
                    new_aa = REAL_AAS[np.random.randint(self.num_aas)]
                    # Lower-casing keeps the residue outside the active site.
                    c = new_aa if in_active_site else new_aa.lower()
                mutated.append(c)

        return ''.join(mutated)


class ProteinAugmentSwapSubstrs(Transform):
    """
    Augment a protein sequence by randomly swapping neighboring upper case
    (active site) subsequences. Lower case subsequences keep their position.
    """

    def __init__(self, p: float = 0.2) -> None:
        """
        Args:
            p (float): Probability that any substr switches places with its
                "neighbour". Values outside of [0, 1] are clipped into that range.

        Raises:
            TypeError: if `p` is not a float.
        """
        if not isinstance(p, float):
            raise TypeError(f'Please pass float, not {type(p)}.')
        self._p = np.clip(p, 0.0, 1.0)

    def __call__(self, sequence: str) -> str:
        """
        Apply the transform.

        Args:
            sequence (str): an aligned sequence (example: abCDefGHi).

        Returns:
            str: an aligned sequence with swapped substrings (example: abGHefCDi).
        """
        verify_aligned_info(sequence)
        _, _, active_sites, all_seqs = extract_active_sites_info(sequence)

        # Work on a copy and walk it left to right, swapping each pair of
        # neighbours with probability p. Swaps are applied one after another,
        # so a subsequence can be carried along by several consecutive swaps.
        reordered = list(active_sites)
        for pos in range(len(reordered) - 1):
            if np.random.rand() < self._p:
                reordered[pos], reordered[pos + 1] = reordered[pos + 1], reordered[pos]

        # Fill every upper case slot with the next reordered active site.
        next_active_site = iter(reordered)
        return ''.join(
            next(next_active_site) if substr[0] <= 'Z' else substr
            for substr in all_seqs
        )