Source code for pytoda.datasets.utils.utils

"""Utils for the dataset module."""
from copy import copy

from ...types import Any, Files, Hashable, List, Tuple
from ..base_dataset import AnyBaseDataset, ConcatKeyDataset
from .factories import BACKGROUND_TENSOR_FACTORY


def sizeof_fmt(num, suffix='B'):
    """
    Human readable file size.

    Source: https://stackoverflow.com/a/1094933
    """
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)
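
# Illustrative behavior of sizeof_fmt (values computed from the loop above,
# not part of the original module): each iteration divides by 1024 until
# the value fits the current binary prefix.
#
#   sizeof_fmt(0)         # -> '0.0B'
#   sizeof_fmt(1536)      # -> '1.5KiB'
#   sizeof_fmt(10 ** 10)  # -> '9.3GiB'
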
def concatenate_file_based_datasets(
    filepaths: Files, dataset_class: type, **kwargs
) -> ConcatKeyDataset:
    r"""
    Concatenate file-based datasets into a single one, with the ability
    to get the source dataset of items.

    Args:
        filepaths (Files): list of filepaths.
        dataset_class (type): dataset class reading from file. Supports
            KeyDataset and DatasetDelegator. For a pure
            torch.utils.data.Dataset the returned instance can still be
            used like a `pytoda.datasets.TransparentConcatDataset`, but
            methods depending on key lookup will fail.
        kwargs (dict): additional args for
            dataset_class.__init__(filepath, \**kwargs).

    Returns:
        ConcatKeyDataset: the concatenated dataset.
    """
    return ConcatKeyDataset(
        datasets=[dataset_class(filepath, **kwargs) for filepath in filepaths]
    )
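
# Usage sketch (hedged): the filepaths below are placeholders; any
# dataset_class whose __init__ takes a filepath as its first argument
# works, e.g. pytoda.datasets.SMILESDataset with .smi files.
#
#   from pytoda.datasets import SMILESDataset
#
#   dataset = concatenate_file_based_datasets(
#       filepaths=['train_a.smi', 'train_b.smi'],  # hypothetical files
#       dataset_class=SMILESDataset,
#   )
#   # Items from both files are addressable through one ConcatKeyDataset,
#   # which also tracks the source dataset of each item.
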
def indexed(dataset: AnyBaseDataset) -> AnyBaseDataset:
    """
    Return a mutated shallow copy of the passed dataset instance, where
    indexing behavior is changed to additionally return the index.
    """
    default_getitem = dataset.__getitem__  # bound method
    default_from_key = dataset.get_item_from_key  # bound method

    def return_item_index_tuple(self, index: int) -> Tuple[Any, int]:
        return_index = len(self) + index if index < 0 else index
        return default_getitem(index), return_index

    def return_item_index_tuple_from_key(self, key: Hashable) -> Tuple[Any, int]:
        """Prevent `get_item_from_key` from calling the new indexed __getitem__."""
        return default_from_key(key), dataset.get_index(key)

    methods = {
        '__getitem__': return_item_index_tuple,
        'get_item_from_key': return_item_index_tuple_from_key,
    }
    ds = copy(dataset)
    ds.__class__ = type(
        f'Indexed{type(dataset).__name__}', (dataset.__class__,), methods
    )
    return ds
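
# Usage sketch (illustrative): `indexed` patches a dynamically created
# subclass onto a shallow copy, so the original dataset is untouched and
# negative indices are normalized before being returned.
#
#   ds = indexed(dataset)
#   item, index = ds[-1]  # index == len(ds) - 1, item == dataset[-1]
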
def keyed(dataset: AnyBaseDataset) -> AnyBaseDataset:
    """
    Return a mutated shallow copy of the passed dataset instance, where
    indexing behavior is changed to additionally return the key.
    """
    default_getitem = dataset.__getitem__  # bound method
    default_from_key = dataset.get_item_from_key  # bound method

    def return_item_key_tuple(self, index: int) -> Tuple[Any, Hashable]:
        return default_getitem(index), dataset.get_key(index)

    def return_item_key_tuple_from_key(self, key: Hashable) -> Tuple[Any, Hashable]:
        """Prevent `get_item_from_key` from calling the new keyed __getitem__."""
        return default_from_key(key), key

    methods = {
        '__getitem__': return_item_key_tuple,
        'get_item_from_key': return_item_key_tuple_from_key,
    }
    ds = copy(dataset)
    ds.__class__ = type(
        f'Keyed{type(dataset).__name__}', (dataset.__class__,), methods
    )
    return ds
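
# Usage sketch (illustrative): `keyed` is the key-returning counterpart of
# `indexed`; get_item_from_key deliberately bypasses the patched
# __getitem__ to avoid returning nested tuples.
#
#   ds = keyed(dataset)
#   item, key = ds[0]                      # key == dataset.get_key(0)
#   item, key = ds.get_item_from_key(key)  # same tuple, looked up by key
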
def pad_item(
    item: Tuple,
    padding_modes: List[str],
    padding_values: List,
    max_length: int,
) -> Tuple:
    """Padding function for a single item of a batch.

    Args:
        item (Tuple): Tuple returned by the __getitem__ function of a
            Dataset class.
        padding_modes (List[str]): The type of padding to perform for each
            datum in item. Options are 'constant' for constant value padding,
            and 'range' to fill the tensor with a range of values.
        padding_values (List): The values with which to fill the background
            tensor for padding. Can be a constant value or a range depending
            on the datum to pad in item.
        max_length (int): The maximum length to which the datum should be
            padded.

    Returns:
        Tuple: Tuple of tensors padded according to the given specifications.

    NOTE:
        pad_item uses the trailing dimensions as the repetitions argument
        for range_tensor(), since the 'length' of the set is covered by the
        value_range. That is, if a tensor of shape (5,) is required for
        padding_mode 'range', then () is passed as shape into the
        range_tensor function, which repeats range(5) exactly once, giving
        a (5,) tensor.
    """
    # For each tensor in the item we determine the output dimensions.
    max_sizes = [datum.size() for datum in item]
    out_dimses = [
        (max_length, *max_sizes[i][1:])
        if padding_modes[i] == 'constant'
        else (*max_sizes[i][1:],)
        for i in range(len(max_sizes))
    ]
    out_tensors = [
        BACKGROUND_TENSOR_FACTORY[mode](value, out_dims)
        for out_dims, mode, value in zip(out_dimses, padding_modes, padding_values)
    ]
    for datum_index, tensor in enumerate(item):
        length = tensor.size(0)
        out_tensors[datum_index][:length, ...] = tensor
    return out_tensors
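
# Usage sketch (hedged, assuming torch tensors and a 'constant' entry in
# BACKGROUND_TENSOR_FACTORY that builds a value-filled tensor of the given
# shape, as used above):
#
#   import torch
#
#   seq = torch.ones(3, 4)  # a datum with 3 steps of 4 features
#   padded, = pad_item((seq,), ['constant'], [0], max_length=5)
#   # padded.shape == (5, 4); rows 0-2 hold seq, rows 3-4 stay 0.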