# Source code for pytoda.datasets.tests.test_polymer_dataset

"""Testing PolymerTokenizerDataset with eager backend."""
import os
import unittest

import numpy as np

from pytoda.datasets import PolymerTokenizerDataset
from pytoda.tests.utils import TestFileContent


class TestPolymerTokenizerDataset(unittest.TestCase):
    """Test PolymerTokenizerDataset on small monomer/catalyst fixtures.

    Every test writes a monomer .smi file, a catalyst .smi file and a .csv
    annotation file, builds a dataset from them and checks its length or the
    token indexes and labels of individual samples.
    """

    # Fixture rows for the .smi files, one '<SMILES>\t<identifier>' per line.
    # The separator is written as an explicit escape so that editors and
    # formatters cannot silently turn it into a space.
    # NOTE(review): assumes pytoda reads .smi files as tab-separated -- confirm
    # against pytoda.files.read_smi.
    MONOMER_LINES = (
        'CCO\tCHEMBL545',
        'C\tCHEMBL17564',
        'CO\tCHEMBL14688',
        'NCCS\tCHEMBL602',
    )
    # Catalysts whose first entry is long enough to dominate the padding.
    LONG_CATALYST_LINES = (
        'N#CCCC1CCCC1=NNc1ccc([N+](=O)[O-])cc1\tCHEMBL543',
        'CC\tCHEMBL17',
        'NCCSCCCCC\tCHEMBL6402',
    )
    # Same identifiers as above, with a short first entry.
    SHORT_CATALYST_LINES = (
        'c1ccc([N+])cc1\tCHEMBL543',
        'CC\tCHEMBL17',
        'NCCSCCCCC\tCHEMBL6402',
    )
    # Annotation rows: two labels followed by monomer and catalyst identifiers.
    ANNOTATION_LINES = (
        'label_0,label_1,monomer,catalyst',
        '2.3,3.4,CHEMBL545,CHEMBL17',
        '4.5,5.6,CHEMBL17564,CHEMBL6402',
        '6.7,7.8,CHEMBL602,CHEMBL6402',
    )

    def _make_dataset(
        self, catalyst_lines, annotation_lines, **dataset_kwargs
    ) -> 'PolymerTokenizerDataset':
        """Write the fixtures to files and build a dataset from them.

        Args:
            catalyst_lines: rows of the catalyst .smi file.
            annotation_lines: rows of the annotation .csv file, header first.
            **dataset_kwargs: extra keyword arguments forwarded unchanged to
                PolymerTokenizerDataset.

        Returns:
            The dataset built from MONOMER_LINES, ``catalyst_lines`` and
            ``annotation_lines``.
        """
        # Imported locally so the helper does not depend on a module-level
        # import that the rest of the file does not need.
        from contextlib import ExitStack

        files = ExitStack()
        # Registered before any file is created: the TestFileContent context
        # managers are exited when the test finishes, whether it passes, fails
        # or the dataset constructor raises. Until then the files stay
        # available to the assertions.
        self.addCleanup(files.close)

        monomer_file, catalyst_file, annotation_file = [
            files.enter_context(TestFileContent(os.linesep.join(lines)))
            for lines in (self.MONOMER_LINES, catalyst_lines, annotation_lines)
        ]
        return PolymerTokenizerDataset(
            monomer_file.filename,
            catalyst_file.filename,
            annotations_filepath=annotation_file.filename,
            # The odd casing is kept from the original tests: the assertions
            # still look up '<CATALYST_START>' and the 'catalyst' column.
            # NOTE(review): presumably exercises case-insensitive entity
            # names -- confirm.
            entity_names=['monomer', 'cATalysT'],
            **dataset_kwargs,
        )

    @staticmethod
    def _token_indexes(dataset, *tokens) -> list:
        """Return the index of each token in the dataset's SMILES language."""
        token_to_index = dataset.smiles_language.token_to_index
        return [token_to_index[token] for token in tokens]

    @staticmethod
    def _left_pad(token_indexes, length, pad_ind) -> list:
        """Return ``token_indexes`` preceded by padding up to ``length`` items."""
        return [pad_ind] * (length - len(token_indexes)) + list(token_indexes)

    @staticmethod
    def _flat(tensor) -> list:
        """Return the tensor's values as a flat Python list."""
        return tensor.numpy().flatten().tolist()

    def test___len__(self) -> None:
        """Test __len__ with one annotation row that matches no molecule."""
        annotation_lines = self.ANNOTATION_LINES + (
            # Neither identifier occurs in the .smi fixtures, so only the
            # three rows before it are expected to be counted.
            '6.7,7.8,CHEMBL54556,CHEMBL5434',
        )
        polymer_dataset = self._make_dataset(
            self.LONG_CATALYST_LINES, annotation_lines
        )

        self.assertEqual(len(polymer_dataset), 3)

    def test_smiles_params(self) -> None:
        """Test samples when SMILES parameters are given per entity."""
        polymer_dataset = self._make_dataset(
            self.LONG_CATALYST_LINES,
            self.ANNOTATION_LINES,
            all_bonds_explicit=True,
            all_hs_explicit=[True, False],
            sanitize=[True, False],
            padding_length=[9, None],
        )
        smiles_language = polymer_dataset.smiles_language
        pad_ind = smiles_language.padding_index
        (
            monomer_start_ind,
            monomer_stop_ind,
            catalyst_start_ind,
            catalyst_stop_ind,
        ) = self._token_indexes(
            polymer_dataset,
            '<MONOMER_START>',
            '<MONOMER_STOP>',
            '<CATALYST_START>',
            '<CATALYST_STOP>',
        )
        ch3_ind, oh_ind, ch2_ind, b_ind, c_ind = self._token_indexes(
            polymer_dataset, '[CH3]', '[OH]', '[CH2]', '-', 'C'
        )

        # test first sample
        monomer, catalyst, labels = polymer_dataset[0]

        # CCO -> CH3CH2OH, padded to the requested monomer length of 9
        self.assertEqual(
            self._flat(monomer),
            self._left_pad(
                [
                    monomer_start_ind,
                    ch3_ind,
                    b_ind,
                    ch2_ind,
                    b_ind,
                    oh_ind,
                    monomer_stop_ind,
                ],
                9,
                pad_ind,
            ),
        )
        self.assertEqual(
            smiles_language.token_indexes_to_smiles(monomer),
            '[CH3]-[CH2]-[OH]',
        )

        # CC; no catalyst padding length was requested (None) and the
        # expected sequence is 51 entries long (longest = 51)
        self.assertEqual(
            self._flat(catalyst),
            self._left_pad(
                [catalyst_start_ind, c_ind, b_ind, c_ind, catalyst_stop_ind],
                51,
                pad_ind,
            ),
        )
        self.assertEqual(
            smiles_language.token_indexes_to_smiles(catalyst),
            'C-C',
        )
        self.assertTrue(np.allclose(self._flat(labels), [2.3, 3.4]))

    def test___getitem__(self) -> None:
        """Test __getitem__."""
        polymer_dataset = self._make_dataset(
            self.SHORT_CATALYST_LINES,
            self.ANNOTATION_LINES,
            remove_bonddir=True,
        )
        pad_ind = polymer_dataset.smiles_language.padding_index
        (
            monomer_start_ind,
            monomer_stop_ind,
            catalyst_start_ind,
            catalyst_stop_ind,
        ) = self._token_indexes(
            polymer_dataset,
            '<MONOMER_START>',
            '<MONOMER_STOP>',
            '<CATALYST_START>',
            '<CATALYST_STOP>',
        )
        c_ind, o_ind, n_ind, s_ind = self._token_indexes(
            polymer_dataset, 'C', 'O', 'N', 'S'
        )
        # Every expected sequence in this test is 13 entries long.
        length = 13

        # test first sample: monomer CCO, catalyst CC
        monomer, catalyst, labels = polymer_dataset[0]
        self.assertEqual(
            self._flat(monomer),
            self._left_pad(
                [monomer_start_ind, c_ind, c_ind, o_ind, monomer_stop_ind],
                length,
                pad_ind,
            ),
        )
        self.assertEqual(
            self._flat(catalyst),
            self._left_pad(
                [catalyst_start_ind, c_ind, c_ind, catalyst_stop_ind],
                length,
                pad_ind,
            ),
        )
        self.assertTrue(np.allclose(self._flat(labels), [2.3, 3.4]))

        # test third sample: monomer NCCS, catalyst NCCSCCCCC
        monomer, catalyst, labels = polymer_dataset[2]
        self.assertEqual(
            self._flat(monomer),
            self._left_pad(
                [
                    monomer_start_ind,
                    n_ind,
                    c_ind,
                    c_ind,
                    s_ind,
                    monomer_stop_ind,
                ],
                length,
                pad_ind,
            ),
        )
        self.assertEqual(
            self._flat(catalyst),
            self._left_pad(
                [catalyst_start_ind, n_ind, c_ind, c_ind, s_ind]
                + [c_ind] * 5
                + [catalyst_stop_ind],
                length,
                pad_ind,
            ),
        )
        self.assertTrue(np.allclose(self._flat(labels), [6.7, 7.8]))

    def test___getitem___with_annotation_column_names(self) -> None:
        """Test __getitem__ with annotations_column_names in the annotation."""
        annotation_lines = (
            'index,label_0,label_1,monomer,catalyst',
            '0,2.3,3.4,CHEMBL545,CHEMBL6402',
            '1,4.5,5.6,CHEMBL17564,CHEMBL543',
            '1,6.7,7.8,CHEMBL602,CHEMBL17',
        )
        polymer_dataset = self._make_dataset(
            self.SHORT_CATALYST_LINES,
            annotation_lines,
            annotations_column_names=['label_0'],
            remove_bonddir=True,
        )
        pad_ind = polymer_dataset.smiles_language.padding_index
        (
            monomer_start_ind,
            monomer_stop_ind,
            catalyst_start_ind,
            catalyst_stop_ind,
        ) = self._token_indexes(
            polymer_dataset,
            '<MONOMER_START>',
            '<MONOMER_STOP>',
            '<CATALYST_START>',
            '<CATALYST_STOP>',
        )
        c_ind, o_ind, n_ind, s_ind = self._token_indexes(
            polymer_dataset, 'C', 'O', 'N', 'S'
        )
        # Every expected sequence in this test is 13 entries long.
        length = 13

        # test first sample: monomer CCO, catalyst NCCSCCCCC, label_0 only
        monomer, catalyst, labels = polymer_dataset[0]
        self.assertEqual(
            self._flat(monomer),
            self._left_pad(
                [monomer_start_ind, c_ind, c_ind, o_ind, monomer_stop_ind],
                length,
                pad_ind,
            ),
        )
        self.assertEqual(
            self._flat(catalyst),
            self._left_pad(
                [catalyst_start_ind, n_ind, c_ind, c_ind, s_ind]
                + [c_ind] * 5
                + [catalyst_stop_ind],
                length,
                pad_ind,
            ),
        )
        self.assertTrue(np.allclose(self._flat(labels), [2.3]))

        # test third sample: monomer NCCS, catalyst CC, label_0 only
        monomer, catalyst, labels = polymer_dataset[2]
        self.assertEqual(
            self._flat(monomer),
            self._left_pad(
                [
                    monomer_start_ind,
                    n_ind,
                    c_ind,
                    c_ind,
                    s_ind,
                    monomer_stop_ind,
                ],
                length,
                pad_ind,
            ),
        )
        self.assertEqual(
            self._flat(catalyst),
            self._left_pad(
                [catalyst_start_ind, c_ind, c_ind, catalyst_stop_ind],
                length,
                pad_ind,
            ),
        )
        self.assertTrue(np.allclose(self._flat(labels), [6.7]))
# Run the test suite when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()