# Source code for pytoda.files

"""Utilities for file handling."""
import os
from itertools import repeat, takewhile
from typing import Sequence

import pandas as pd


def count_file_lines(filepath: str, buffer_size: int = 1024 * 1024) -> int:
    """
    Count lines in a file without persisting it in memory.

    The file is read in binary mode, at most ``buffer_size`` bytes at a
    time, and line-feed bytes are counted. A final line that is not
    terminated by a line feed is counted as a line as well. An empty file
    has zero lines.

    Args:
        filepath (str): path to the file.
        buffer_size (int): maximum number of bytes read per chunk.

    Returns:
        int: Number of lines in the file.
    """
    # Count the single LF byte rather than os.linesep: it terminates both
    # "\n" and "\r\n" lines on every platform and, being one byte long, can
    # never be split across two consecutive buffers.
    new_line = b'\n'
    number_of_lines = 0
    last_byte = b''
    with open(filepath, 'rb') as fp:
        raw_fp = fp.raw
        while True:
            buffer = raw_fp.read(buffer_size)
            if not buffer:
                break
            number_of_lines += buffer.count(new_line)
            last_byte = buffer[-1:]
    # A non-empty file whose last line lacks a terminator still has that line.
    # `last_byte` stays empty for an empty file, so nothing is added then.
    if last_byte and last_byte != new_line:
        number_of_lines += 1
    return number_of_lines


def read_smi(
    filepath: str,
    chunk_size: int = None,
    index_col: int = 1,
    names: Sequence[str] = ['SMILES'],
    header: int = None,
    *args,
    **kwargs,
) -> pd.DataFrame:
    """
    Read a .smi (or .csv file with tab-separated values) in a pd.DataFrame.

    This is a thin wrapper around `pd.read_csv` with a fixed tab separator.

    Args:
        filepath (str): path to a .smi file.
        chunk_size (int): size of the chunk. Defaults to None, a.k.a. no
            chunking. It is forwarded as `chunksize` to `pd.read_csv`; when
            it is set, `pd.read_csv` returns an iterator over chunks
            instead of a single pd.DataFrame.
        index_col (int): Data column used for indexing, defaults to 1.
        names (Sequence[str]): User-assigned names given to the columns.
            The default list is only read, never modified.
        header (int): Row number to use as column names. Defaults to None.
        args (): Optional arguments for `pd.read_csv`, passed positionally
            right after `filepath`.
        kwargs (): Optional keyword arguments for `pd.read_csv`.

    Returns:
        pd.DataFrame: a pd.DataFrame containing the data of the .smi file
            where the index is the index_col column.

    Raises:
        IndexError: if the `pd.read_csv` call raises an IndexError; the
            original exception is attached as the cause.
    """
    try:
        return pd.read_csv(
            filepath,
            *args,
            sep='\t',
            header=header,
            index_col=index_col,
            names=names,
            chunksize=chunk_size,
            **kwargs,
        )
    except IndexError as error:
        # Re-raise with a hint while keeping the pandas error as the explicit
        # cause, so the original traceback is not lost.
        raise IndexError(
            'Pandas does not understand the .smi file. The most common '
            'reason is a wrong delimiter (has to be \\t)'
        ) from error