Skip to content

dataloader

The base dataloader class.

This class defines a base structure for loading data from files and provides utility methods for configuration management and label encoding.

Attributes:

Name Type Description
name str

The name of the dataloader instance.

train_batch_size int

The batch size for training data.

test_batch_size int

The batch size for testing data.

Methods:

Name Description
__init__

Initializes the base dataloader class.

from_config

Instantiates a dataloader object from a configuration dictionary.

to_config

Exports the dataloader object to a configuration dictionary.

encode_str_labels

Encodes string labels into numeric representations, optionally as one-hot vectors.

load

Abstract method to load data from a file, to be implemented by subclasses.

Source code in tinybig/data/base_data.py
class dataloader:
    """
    The base dataloader class.

    This class defines a base structure for loading data from files and provides utility methods
    for configuration management and label encoding.

    Attributes
    ----------
    name: str
        The name of the dataloader instance.
    train_batch_size: int
        The batch size for training data.
    test_batch_size: int
        The batch size for testing data.

    Methods
    -------
    __init__(train_batch_size: int, test_batch_size: int, name: str = 'base_dataloader', *args, **kwargs)
        Initializes the base dataloader class.
    from_config(configs: dict)
        Instantiates a dataloader object from a configuration dictionary.
    to_config()
        Exports the dataloader object to a configuration dictionary.
    encode_str_labels(labels: Union[List, Tuple, np.array], one_hot: bool = False, device: str = 'cpu')
        Encodes string labels into numeric representations, optionally as one-hot vectors.
    load(*args, **kwargs)
        Abstract method to load data from a file, to be implemented by subclasses.
    """
    def __init__(self, train_batch_size: int, test_batch_size: int, name: str = 'base_dataloader', *args, **kwargs):
        """
        Initializes the base dataloader class.

        Parameters
        ----------
        train_batch_size: int
            The batch size for training data.
        test_batch_size: int
            The batch size for testing data.
        name: str, default = 'base_dataloader'
            The name of the dataloader instance.
        *args, **kwargs
            Accepted but not used by this base initializer.

        Returns
        -------
        None
        """
        self.name = name
        self.train_batch_size = train_batch_size
        self.test_batch_size = test_batch_size

    @staticmethod
    def from_config(configs: dict):
        """
        Instantiates a dataloader object from a configuration dictionary.

        Parameters
        ----------
        configs: dict
            The configuration dictionary containing 'data_class' and optional 'data_parameters'.
            A missing or None 'data_parameters' entry is treated as "no parameters".

        Returns
        -------
        dataloader
            An instance of the dataloader class specified in the configuration.

        Raises
        ------
        ValueError
            If the provided configuration is None or lacks the 'data_class' key.
        """
        if configs is None:
            raise ValueError("configs cannot be None")
        # Raise explicitly instead of asserting: asserts are stripped under `python -O`
        # and would surface as AssertionError rather than the documented ValueError.
        if 'data_class' not in configs:
            raise ValueError("configs must contain the 'data_class' key")
        class_name = configs['data_class']
        # An explicit `data_parameters: None` (e.g. an empty config entry) cannot be
        # unpacked with **, so normalise it to an empty dict.
        parameters = configs.get('data_parameters') or {}
        return config.get_obj_from_str(class_name)(**parameters)

    def to_config(self):
        """
        Exports the dataloader object to a configuration dictionary.

        Returns
        -------
        dict
            A dictionary with the class name under 'data_class' and every instance
            attribute under 'data_parameters'.
        """
        # NOTE(review): only the bare class name (no module path) is exported; whether
        # `config.get_obj_from_str` in `from_config` can resolve it is not visible
        # here -- confirm that the exported config round-trips.
        class_name = self.__class__.__name__
        attributes = {attr: getattr(self, attr) for attr in self.__dict__}

        return {
            "data_class": class_name,
            "data_parameters": attributes
        }

    @staticmethod
    def encode_str_labels(labels: Union[List, Tuple, np.array], one_hot: bool = False, device: str = 'cpu'):
        """
        Encodes string labels into numeric representations.

        Class indices are assigned in sorted label order, so the same set of labels always
        yields the same encoding. Labels that cannot be ordered against each other fall back
        to first-occurrence order.

        Parameters
        ----------
        labels: Union[List, Tuple, np.array]
            The list of string labels to encode.
        one_hot: bool, default = False
            Whether to encode labels as one-hot vectors.
        device: str, default = 'cpu'
            The device to use for the encoded tensor.

        Returns
        -------
        torch.Tensor
            Encoded labels as a `torch.long` tensor: one class index per label, or one
            one-hot row per label when `one_hot` is True.

        Raises
        ------
        ValueError
            If the labels are None or empty.
        """
        # `len(...) == 0` rather than truthiness: numpy arrays have no unambiguous truth value.
        if labels is None or len(labels) == 0:
            raise ValueError("labels cannot be None or empty")

        # Iterating a bare set gives an order that depends on string hash randomisation,
        # which would make the label -> index mapping differ between runs.
        try:
            classes = sorted(set(labels))
        except TypeError:
            # Mixed, non-comparable label types: keep a deterministic first-seen order.
            classes = list(dict.fromkeys(labels))

        class_index = {c: i for i, c in enumerate(classes)}
        indices = np.array([class_index[label] for label in labels])
        if one_hot:
            # Build the identity matrix once and pick one row per label.
            encoded_labels = np.identity(len(classes), dtype=np.int64)[indices]
        else:
            encoded_labels = indices
        return torch.tensor(encoded_labels, dtype=torch.long, device=device)

    @abstractmethod
    def load(self, *args, **kwargs):
        """
        Abstract method for loading data from a file.

        This method must be implemented in subclasses to define specific data loading logic.
        Because this class does not use an ABC metaclass, the `abstractmethod` marker is
        not enforced at instantiation time; the base implementation simply does nothing.

        Returns
        -------
        None
        """
        pass

__init__(train_batch_size, test_batch_size, name='base_dataloader', *args, **kwargs)

Initializes the base dataloader class.

Parameters:

Name Type Description Default
train_batch_size int

The batch size for training data.

required
test_batch_size int

The batch size for testing data.

required
name str

The name of the dataloader instance.

'base_dataloader'

Returns:

Type Description
None
Source code in tinybig/data/base_data.py
def __init__(self, train_batch_size: int, test_batch_size: int, name: str = 'base_dataloader', *args, **kwargs):
    """
    Set up the base dataloader with its name and batch sizes.

    Parameters
    ----------
    train_batch_size: int
        The batch size for training data.
    test_batch_size: int
        The batch size for testing data.
    name: str, default = 'base_dataloader'
        The name of the dataloader instance.
    *args, **kwargs
        Accepted but not used by this initializer.

    Returns
    -------
    None
    """
    # Targets are assigned left to right, so the attributes are created in the order
    # name, train_batch_size, test_batch_size.
    self.name, self.train_batch_size, self.test_batch_size = name, train_batch_size, test_batch_size

encode_str_labels(labels, one_hot=False, device='cpu') staticmethod

Encodes string labels into numeric representations, optionally as one-hot vectors.

Parameters:

Name Type Description Default
labels Union[List, Tuple, array]

The list of string labels to encode.

required
one_hot bool

Whether to encode labels as one-hot vectors.

False
device str

The device to use for the encoded tensor.

'cpu'

Returns:

Type Description
Tensor

Encoded labels as a tensor.

Raises:

Type Description
ValueError

If the labels are None or empty.

Source code in tinybig/data/base_data.py
@staticmethod
def encode_str_labels(labels: Union[List, Tuple, np.array], one_hot: bool = False, device: str = 'cpu'):
    """
    Encodes string labels into numeric representations.

    Class indices are assigned in sorted label order, so the same set of labels always
    yields the same encoding. Labels that cannot be ordered against each other fall back
    to first-occurrence order.

    Parameters
    ----------
    labels: Union[List, Tuple, np.array]
        The list of string labels to encode.
    one_hot: bool, default = False
        Whether to encode labels as one-hot vectors.
    device: str, default = 'cpu'
        The device to use for the encoded tensor.

    Returns
    -------
    torch.Tensor
        Encoded labels as a `torch.long` tensor: one class index per label, or one
        one-hot row per label when `one_hot` is True.

    Raises
    ------
    ValueError
        If the labels are None or empty.
    """
    # `len(...) == 0` rather than truthiness: numpy arrays have no unambiguous truth value.
    if labels is None or len(labels) == 0:
        raise ValueError("labels cannot be None or empty")

    # Iterating a bare set gives an order that depends on string hash randomisation,
    # which would make the label -> index mapping differ between runs.
    try:
        classes = sorted(set(labels))
    except TypeError:
        # Mixed, non-comparable label types: keep a deterministic first-seen order.
        classes = list(dict.fromkeys(labels))

    class_index = {c: i for i, c in enumerate(classes)}
    indices = np.array([class_index[label] for label in labels])
    if one_hot:
        # Build the identity matrix once and pick one row per label.
        encoded_labels = np.identity(len(classes), dtype=np.int64)[indices]
    else:
        encoded_labels = indices
    return torch.tensor(encoded_labels, dtype=torch.long, device=device)

from_config(configs) staticmethod

Instantiates a dataloader object from a configuration dictionary.

Parameters:

Name Type Description Default
configs dict

The configuration dictionary containing 'data_class' and optional 'data_parameters'.

required

Returns:

Type Description
dataloader

An instance of the dataloader class specified in the configuration.

Raises:

Type Description
ValueError

If the provided configuration is None or lacks the 'data_class' key.

Source code in tinybig/data/base_data.py
@staticmethod
def from_config(configs: dict):
    """
    Instantiates a dataloader object from a configuration dictionary.

    Parameters
    ----------
    configs: dict
        The configuration dictionary containing 'data_class' and optional 'data_parameters'.
        A missing or None 'data_parameters' entry is treated as "no parameters".

    Returns
    -------
    dataloader
        An instance of the dataloader class specified in the configuration.

    Raises
    ------
    ValueError
        If the provided configuration is None or lacks the 'data_class' key.
    """
    if configs is None:
        raise ValueError("configs cannot be None")
    # Raise explicitly instead of asserting: asserts are stripped under `python -O`
    # and would surface as AssertionError rather than the documented ValueError.
    if 'data_class' not in configs:
        raise ValueError("configs must contain the 'data_class' key")
    class_name = configs['data_class']
    # An explicit `data_parameters: None` (e.g. an empty config entry) cannot be
    # unpacked with **, so normalise it to an empty dict.
    parameters = configs.get('data_parameters') or {}
    return config.get_obj_from_str(class_name)(**parameters)

load(*args, **kwargs) abstractmethod

Abstract method for loading data from a file.

This method must be implemented in subclasses to define specific data loading logic.

Returns:

Type Description
None
Source code in tinybig/data/base_data.py
@abstractmethod
def load(self, *args, **kwargs):
    """
    Placeholder for loading data from a file.

    Marked abstract: subclasses are expected to override it with their own data loading
    logic. This base implementation ignores its arguments and does nothing.

    Returns
    -------
    None
    """
    return None

to_config()

Exports the dataloader object to a configuration dictionary.

Returns:

Type Description
dict

A dictionary containing the class name and attributes of the dataloader instance.

Source code in tinybig/data/base_data.py
def to_config(self):
    """
    Serialise this dataloader into a configuration dictionary.

    Returns
    -------
    dict
        A dictionary with the instance's class name under "data_class" and a copy of
        its instance attributes under "data_parameters".
    """
    # Snapshot every instance attribute, keeping the order in which they were set.
    parameters = {}
    for key in vars(self):
        parameters[key] = getattr(self, key)

    return {"data_class": self.__class__.__name__, "data_parameters": parameters}