Skip to content

sst2

Bases: text_dataloader

A dataloader class for the SST-2 dataset.

This class provides methods to load and preprocess the SST-2 dataset, which contains sentiment classification labels for sentences.

Attributes:

Name Type Description
name str, default = 'sst2'

The name of the dataset.

train_batch_size int, default = 64

The batch size for training data.

test_batch_size int, default = 64

The batch size for testing data.

max_seq_len int, default = 32

The maximum sequence length for text data.

Methods:

Name Description
__init__

Initializes the SST-2 dataset dataloader.

load_datapipe

Loads training and testing pipelines for the SST-2 dataset.

get_class_number

Returns the number of classes in the SST-2 dataset (2).

get_train_number

Returns the number of training examples (67,349).

get_test_number

Returns the number of testing examples (872).

get_idx_to_label

Returns the mapping from indices to labels.

Source code in tinybig/data/text_dataloader_torchtext.py
class sst2(text_dataloader):
    """
    Dataloader for the SST-2 sentence-level sentiment classification dataset.

    Besides building the train/test data pipelines, the class exposes the
    fixed statistics of the dataset (class count, split sizes and the
    index-to-label mapping) through static helper methods.

    Attributes
    ----------
    name : str, default = 'sst2'
        Name used to identify the dataset.
    train_batch_size : int, default = 64
        Batch size used for the training split.
    test_batch_size : int, default = 64
        Batch size used for the testing split.
    max_seq_len : int, default = 32
        Maximum sequence length applied to the text data.

    Methods
    -------
    __init__
        Forwards the configuration to the parent text dataloader.
    load_datapipe
        Builds the training ("train") and testing ("dev") pipelines.
    get_class_number
        Returns the number of classes (2).
    get_train_number
        Returns the number of training examples (67,349).
    get_test_number
        Returns the number of testing examples (872).
    get_idx_to_label
        Returns the index-to-label mapping.
    """
    def __init__(self, name='sst2', train_batch_size=64, test_batch_size=64, max_seq_len: int = 32):
        """
        Initializes the SST-2 dataloader by delegating to the parent class.

        Parameters
        ----------
        name : str, default = 'sst2'
            Name used to identify the dataset.
        train_batch_size : int, default = 64
            Batch size used for the training split.
        test_batch_size : int, default = 64
            Batch size used for the testing split.
        max_seq_len : int, default = 32
            Maximum sequence length applied to the text data.
        """
        super().__init__(
            name=name,
            train_batch_size=train_batch_size,
            test_batch_size=test_batch_size,
            max_seq_len=max_seq_len,
        )

    @staticmethod
    def load_datapipe(cache_dir='./data/', *args, **kwargs):
        """
        Builds the training and testing pipelines of the SST-2 dataset.

        Parameters
        ----------
        cache_dir : str, default = './data/'
            Root directory handed to ``SST2`` for both splits.
        *args : tuple
            Accepted for interface compatibility; not used.
        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        tuple
            ``(train_datapipe, test_datapipe)``, built from the "train" and
            "dev" splits respectively.
        """
        # The "dev" split serves as the testing pipeline; the training
        # pipeline is constructed first, then the testing one.
        return tuple(
            SST2(root=cache_dir, split=split_name)
            for split_name in ("train", "dev")
        )

    @staticmethod
    def get_class_number(*args, **kwargs):
        """
        Reports how many classes the SST-2 dataset has.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used.
        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        int
            Always 2.
        """
        class_count = 2
        return class_count

    @staticmethod
    def get_train_number(*args, **kwargs):
        """
        Reports how many training examples the SST-2 dataset has.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used.
        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        int
            Always 67,349.
        """
        train_size = 67_349
        return train_size

    @staticmethod
    def get_test_number(*args, **kwargs):
        """
        Reports how many testing examples the SST-2 dataset has.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used.
        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        int
            Always 872.
        """
        test_size = 872
        return test_size

    @staticmethod
    def get_idx_to_label(*args, **kwargs):
        """
        Provides the index-to-label mapping of the SST-2 dataset.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used.
        **kwargs : dict
            Accepted for interface compatibility; not used.

        Returns
        -------
        dict
            Identity mapping over the two class indices: ``{0: 0, 1: 1}``.
        """
        # Each class index maps to itself.
        return {class_idx: class_idx for class_idx in range(2)}

get_class_number(*args, **kwargs) staticmethod

Returns the number of classes in the SST-2 dataset.

Parameters:

Name Type Description Default
*args tuple

Additional arguments.

()
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
int

The number of classes (2).

Source code in tinybig/data/text_dataloader_torchtext.py
@staticmethod
def get_class_number(*args, **kwargs):
    """
    Reports how many classes the SST-2 dataset has.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used.
    **kwargs : dict
        Accepted for interface compatibility; not used.

    Returns
    -------
    int
        Always 2.
    """
    class_count = 2
    return class_count

get_idx_to_label(*args, **kwargs) staticmethod

Returns the mapping from indices to labels for the SST-2 dataset.

Parameters:

Name Type Description Default
*args tuple

Additional arguments.

()
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
dict

A dictionary mapping indices to labels.

Source code in tinybig/data/text_dataloader_torchtext.py
@staticmethod
def get_idx_to_label(*args, **kwargs):
    """
    Provides the index-to-label mapping of the SST-2 dataset.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used.
    **kwargs : dict
        Accepted for interface compatibility; not used.

    Returns
    -------
    dict
        Identity mapping over the two class indices: ``{0: 0, 1: 1}``.
    """
    # Each class index maps to itself.
    return {class_idx: class_idx for class_idx in range(2)}

get_test_number(*args, **kwargs) staticmethod

Returns the number of testing examples in the SST-2 dataset.

Parameters:

Name Type Description Default
*args tuple

Additional arguments.

()
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
int

The number of testing examples (872).

Source code in tinybig/data/text_dataloader_torchtext.py
@staticmethod
def get_test_number(*args, **kwargs):
    """
    Reports how many testing examples the SST-2 dataset has.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used.
    **kwargs : dict
        Accepted for interface compatibility; not used.

    Returns
    -------
    int
        Always 872.
    """
    test_size = 872
    return test_size

get_train_number(*args, **kwargs) staticmethod

Returns the number of training examples in the SST-2 dataset.

Parameters:

Name Type Description Default
*args tuple

Additional arguments.

()
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
int

The number of training examples (67,349).

Source code in tinybig/data/text_dataloader_torchtext.py
@staticmethod
def get_train_number(*args, **kwargs):
    """
    Reports how many training examples the SST-2 dataset has.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used.
    **kwargs : dict
        Accepted for interface compatibility; not used.

    Returns
    -------
    int
        Always 67,349.
    """
    train_size = 67_349
    return train_size

load_datapipe(cache_dir='./data/', *args, **kwargs) staticmethod

Loads training and testing pipelines for the SST-2 dataset.

Parameters:

Name Type Description Default
cache_dir str

Directory to store cached data.

'./data/'
*args tuple

Additional arguments.

()
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
tuple

A tuple containing training and testing data pipelines.

Source code in tinybig/data/text_dataloader_torchtext.py
@staticmethod
def load_datapipe(cache_dir='./data/', *args, **kwargs):
    """
    Builds the training and testing pipelines of the SST-2 dataset.

    Parameters
    ----------
    cache_dir : str, default = './data/'
        Root directory handed to ``SST2`` for both splits.
    *args : tuple
        Accepted for interface compatibility; not used.
    **kwargs : dict
        Accepted for interface compatibility; not used.

    Returns
    -------
    tuple
        ``(train_datapipe, test_datapipe)``, built from the "train" and
        "dev" splits respectively.
    """
    # The "dev" split serves as the testing pipeline; the training
    # pipeline is constructed first, then the testing one.
    return tuple(
        SST2(root=cache_dir, split=split_name)
        for split_name in ("train", "dev")
    )