# Source code for pytoda.tests.test_data_splitter

"""Testing data splitter."""
import os
import tempfile
import unittest
from typing import Tuple

import pandas as pd

from pytoda.data_splitter import csv_data_splitter
from pytoda.tests.utils import TestFileContent
from pytoda.types import Files


class TestDataSplitter(unittest.TestCase):
    """Testing csv data splitting."""

    def _read_dfs(self, filepaths: Files) -> Tuple[pd.DataFrame, ...]:
        """
        Read data frames from a list of files.

        Each file is parsed as csv with its first column used as the index.

        Args:
            filepaths (Files): a list of files.

        Returns:
            Tuple[pd.DataFrame, ...]: a tuple of data frames, one per file,
                in the same order as filepaths.
        """
        return tuple(pd.read_csv(filepath, index_col=0) for filepath in filepaths)

    def test_data_splitter(self) -> None:
        """
        Test csv_data_splitter.

        Two csv files with partially overlapping columns are split in
        'random' and 'file' mode for several test fractions. The train and
        test files returned by csv_data_splitter are read back and their
        shapes are compared against the expected ones.
        """
        a_content = os.linesep.join(
            [
                'genes,A,B,C,D',
                'sample_3,9.45,4.984,7.016,8.336',
                'sample_2,7.188,0.695,10.34,6.047',
                'sample_1,9.25,6.133,5.047,5.6',
            ]
        )
        another_content = os.linesep.join(
            [
                'genes,B,C,D,E,F',
                'sample_10,4.918,0.0794,1.605,3.463,10.18',
                'sample_11,3.043,8.56,1.961,0.6226,5.027',
                'sample_12,4.76,1.124,6.06,0.3743,11.05',
                'sample_13,0.626,5.164,4.277,4.414,2.7',
            ]
        )
        # Each case pairs a split configuration with the expected
        # [train shape, test shape]. Keeping them in one table avoids a
        # separate running index that has to stay aligned with loop order.
        cases = [
            ('random', 0.1, [(6, 6), (1, 6)]),
            ('random', 0.2, [(5, 6), (2, 6)]),
            ('random', 0.5, [(3, 6), (4, 6)]),
            ('file', 0.1, [(3, 4), (4, 5)]),
            ('file', 0.2, [(3, 4), (4, 5)]),
            ('file', 0.5, [(3, 4), (4, 5)]),
        ]
        with tempfile.TemporaryDirectory() as directory:
            for mode, test_fraction, expected_shapes in cases:
                # subTest labels a failure with the configuration that
                # produced it.
                with self.subTest(mode=mode, test_fraction=test_fraction):
                    # Fresh input files are created for every case.
                    with TestFileContent(a_content) as a_test_file:
                        with TestFileContent(another_content) as another_test_file:
                            train_filepath, test_filepath = csv_data_splitter(
                                [a_test_file.filename, another_test_file.filename],
                                directory,
                                'general',
                                mode=mode,
                                test_fraction=test_fraction,
                            )
                            train_df, test_df = self._read_dfs(
                                [train_filepath, test_filepath]
                            )
                            self.assertEqual(
                                expected_shapes, [train_df.shape, test_df.shape]
                            )
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()