imdb

Bases: text_dataloader

A dataloader class for the IMDB dataset.

This class provides methods to load and preprocess the IMDB dataset, which contains movie reviews labeled as positive or negative.

Attributes:

Name	Type	Description
`name`	`str, default = 'imdb'`	The name of the dataset.
`train_batch_size`	`int, default = 64`	The batch size for training data.
`test_batch_size`	`int, default = 64`	The batch size for testing data.
`max_seq_len`	`int, default = 512`	The maximum sequence length for text data.

Methods:

Name	Description
`__init__`	Initializes the IMDB dataset dataloader.
`load`	Loads the IMDB dataset with reversed (label, text) ordering.
`load_datapipe`	Loads training and testing pipelines for the IMDB dataset.
`get_class_number`	Returns the number of classes in the IMDB dataset (2).
`get_train_number`	Returns the number of training examples (25,000).
`get_test_number`	Returns the number of testing examples (25,000).
`get_idx_to_label`	Returns the mapping from indices to labels.

Source code in tinybig/data/text_dataloader_torchtext.py

class imdb(text_dataloader):
    """
    Dataloader for the IMDB movie-review dataset.

    The IMDB dataset contains movie reviews labeled as positive or negative.
    This class specializes ``text_dataloader`` for it: it supplies the
    train/test datapipes and the fixed dataset statistics (class count and
    split sizes).

    Attributes
    ----------
    name : str, default = 'imdb'
        The name of the dataset.
    train_batch_size : int, default = 64
        The batch size for training data.
    test_batch_size : int, default = 64
        The batch size for testing data.
    max_seq_len : int, default = 512
        The maximum sequence length for text data.

    Methods
    -------
    __init__
        Initializes the IMDB dataset dataloader.
    load
        Loads the IMDB dataset, always forcing ``xy_reversed=True``.
    load_datapipe
        Builds the training and testing datapipes for the IMDB dataset.
    get_class_number
        Returns the number of classes in the IMDB dataset (2).
    get_train_number
        Returns the number of training examples (25,000).
    get_test_number
        Returns the number of testing examples (25,000).
    get_idx_to_label
        Returns the fixed mapping ``{1: 0, 2: 1}``.
    """
    def __init__(self, name='imdb', train_batch_size=64, test_batch_size=64, max_seq_len: int = 512):
        """
        Initializes the IMDB dataset dataloader.

        All arguments are forwarded unchanged, by keyword, to the parent
        class initializer.

        Parameters
        ----------
        name : str, default = 'imdb'
            The name of the dataset.
        train_batch_size : int, default = 64
            The batch size for training data.
        test_batch_size : int, default = 64
            The batch size for testing data.
        max_seq_len : int, default = 512
            The maximum sequence length for text data.
        """
        base_settings = dict(
            name=name,
            train_batch_size=train_batch_size,
            test_batch_size=test_batch_size,
            max_seq_len=max_seq_len,
        )
        super().__init__(**base_settings)

    def load(self, *args, **kwargs):
        """
        Loads the IMDB dataset with reversed (label, text) ordering.

        ``xy_reversed`` is always set to ``True`` before delegating to the
        parent ``load``; a value supplied by the caller is overridden.

        Parameters
        ----------
        *args : tuple
            Positional arguments forwarded to the parent ``load``.
        **kwargs : dict
            Keyword arguments forwarded to the parent ``load``.

        Returns
        -------
        dict
            The result of the parent ``load`` (a dictionary containing
            training and testing DataLoaders).
        """
        # The override is unconditional: callers cannot switch it off.
        forwarded = {**kwargs, 'xy_reversed': True}
        return super().load(*args, **forwarded)

    @staticmethod
    def load_datapipe(cache_dir='./data/', *args, **kwargs):
        """
        Builds the training and testing datapipes for the IMDB dataset.

        Parameters
        ----------
        cache_dir : str, default = './data/'
            Directory passed to ``IMDB`` as ``root``.
        *args : tuple
            Accepted for interface compatibility; not used here.
        **kwargs : dict
            Accepted for interface compatibility; not used here.

        Returns
        -------
        tuple
            ``(train_datapipe, test_datapipe)``, in that order.
        """
        # "train" is constructed first, then "test", each under the same root.
        train_datapipe, test_datapipe = (
            IMDB(root=cache_dir, split=split_name)
            for split_name in ("train", "test")
        )
        return train_datapipe, test_datapipe

    @staticmethod
    def get_class_number(*args, **kwargs):
        """
        Returns the number of classes in the IMDB dataset.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used here.
        **kwargs : dict
            Accepted for interface compatibility; not used here.

        Returns
        -------
        int
            The number of classes (2).
        """
        class_count = 2
        return class_count

    @staticmethod
    def get_train_number(*args, **kwargs):
        """
        Returns the number of training examples in the IMDB dataset.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used here.
        **kwargs : dict
            Accepted for interface compatibility; not used here.

        Returns
        -------
        int
            The number of training examples (25,000).
        """
        train_size = 25_000
        return train_size

    @staticmethod
    def get_test_number(*args, **kwargs):
        """
        Returns the number of testing examples in the IMDB dataset.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used here.
        **kwargs : dict
            Accepted for interface compatibility; not used here.

        Returns
        -------
        int
            The number of testing examples (25,000).
        """
        test_size = 25_000
        return test_size

    @staticmethod
    def get_idx_to_label(*args, **kwargs):
        """
        Returns the fixed label mapping for the IMDB dataset.

        Parameters
        ----------
        *args : tuple
            Accepted for interface compatibility; not used here.
        **kwargs : dict
            Accepted for interface compatibility; not used here.

        Returns
        -------
        dict
            The mapping ``{1: 0, 2: 1}``.
        """
        # NOTE(review): keys look like the dataset's raw labels (1, 2) and
        # values like zero-based class ids -- confirm against the parent class.
        return dict(zip((1, 2), (0, 1)))

`__init__(name='imdb', train_batch_size=64, test_batch_size=64, max_seq_len=512)`

Initializes the IMDB dataset dataloader.

Parameters:

Name	Type	Description	Default
`name`	`str`	The name of the dataset.	`= 'imdb'`
`train_batch_size`	`int`	The batch size for training data.	`= 64`
`test_batch_size`	`int`	The batch size for testing data.	`= 64`
`max_seq_len`	`int`	The maximum sequence length for text data.	`= 512`

Source code in tinybig/data/text_dataloader_torchtext.py

def __init__(self, name='imdb', train_batch_size=64, test_batch_size=64, max_seq_len: int = 512):
    """
    Initializes the IMDB dataset dataloader.

    All arguments are forwarded unchanged, by keyword, to the parent
    class initializer.

    Parameters
    ----------
    name : str, default = 'imdb'
        The name of the dataset.
    train_batch_size : int, default = 64
        The batch size for training data.
    test_batch_size : int, default = 64
        The batch size for testing data.
    max_seq_len : int, default = 512
        The maximum sequence length for text data.
    """
    base_settings = dict(
        name=name,
        train_batch_size=train_batch_size,
        test_batch_size=test_batch_size,
        max_seq_len=max_seq_len,
    )
    super().__init__(**base_settings)

`get_class_number(*args, **kwargs)` `staticmethod`

Returns the number of classes in the IMDB dataset.

Parameters:

Name	Type	Description	Default
`*args`	`tuple`	Additional arguments.	`()`
`**kwargs`	`dict`	Additional keyword arguments.	`{}`

Returns:

Type	Description
`int`	The number of classes (2).

Source code in tinybig/data/text_dataloader_torchtext.py

@staticmethod
def get_class_number(*args, **kwargs):
    """
    Returns the number of classes in the IMDB dataset.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used here.
    **kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    int
        The number of classes (2).
    """
    class_count = 2
    return class_count

`get_idx_to_label(*args, **kwargs)` `staticmethod`

Returns the mapping from indices to labels for the IMDB dataset.

Parameters:

Name	Type	Description	Default
`*args`	`tuple`	Additional arguments.	`()`
`**kwargs`	`dict`	Additional keyword arguments.	`{}`

Returns:

Type	Description
`dict`	A dictionary mapping indices to labels.

Source code in tinybig/data/text_dataloader_torchtext.py

@staticmethod
def get_idx_to_label(*args, **kwargs):
    """
    Returns the fixed label mapping for the IMDB dataset.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used here.
    **kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    dict
        The mapping ``{1: 0, 2: 1}``.
    """
    # NOTE(review): keys look like the dataset's raw labels (1, 2) and
    # values like zero-based class ids -- confirm against the parent class.
    return dict(zip((1, 2), (0, 1)))

`get_test_number(*args, **kwargs)` `staticmethod`

Returns the number of testing examples in the IMDB dataset.

Parameters:

Name	Type	Description	Default
`*args`	`tuple`	Additional arguments.	`()`
`**kwargs`	`dict`	Additional keyword arguments.	`{}`

Returns:

Type	Description
`int`	The number of testing examples (25,000).

Source code in tinybig/data/text_dataloader_torchtext.py

@staticmethod
def get_test_number(*args, **kwargs):
    """
    Returns the number of testing examples in the IMDB dataset.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used here.
    **kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    int
        The number of testing examples (25,000).
    """
    test_size = 25_000
    return test_size

`get_train_number(*args, **kwargs)` `staticmethod`

Returns the number of training examples in the IMDB dataset.

Parameters:

Name	Type	Description	Default
`*args`	`tuple`	Additional arguments.	`()`
`**kwargs`	`dict`	Additional keyword arguments.	`{}`

Returns:

Type	Description
`int`	The number of training examples (25,000).

Source code in tinybig/data/text_dataloader_torchtext.py

@staticmethod
def get_train_number(*args, **kwargs):
    """
    Returns the number of training examples in the IMDB dataset.

    Parameters
    ----------
    *args : tuple
        Accepted for interface compatibility; not used here.
    **kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    int
        The number of training examples (25,000).
    """
    train_size = 25_000
    return train_size

`load(*args, **kwargs)`

Loads the IMDB dataset with reversed (label, text) ordering.

Parameters:

Name	Type	Description	Default
`*args`	`tuple`	Additional arguments.	`()`
`**kwargs`	`dict`	Additional keyword arguments.	`{}`

Returns:

Type	Description
`dict`	A dictionary containing training and testing DataLoaders.

Source code in tinybig/data/text_dataloader_torchtext.py

def load(self, *args, **kwargs):
    """
    Loads the IMDB dataset with reversed (label, text) ordering.

    ``xy_reversed`` is always set to ``True`` before delegating to the
    parent ``load``; a value supplied by the caller is overridden.

    Parameters
    ----------
    *args : tuple
        Positional arguments forwarded to the parent ``load``.
    **kwargs : dict
        Keyword arguments forwarded to the parent ``load``.

    Returns
    -------
    dict
        The result of the parent ``load`` (a dictionary containing
        training and testing DataLoaders).
    """
    # The override is unconditional: callers cannot switch it off.
    forwarded = {**kwargs, 'xy_reversed': True}
    return super().load(*args, **forwarded)

`load_datapipe(cache_dir='./data/', *args, **kwargs)` `staticmethod`

Loads training and testing pipelines for the IMDB dataset.

Parameters:

Name	Type	Description	Default
`cache_dir`	`str`	Directory to store cached data.	`= './data/'`
`*args`	`tuple`	Additional arguments.	`()`
`**kwargs`	`dict`	Additional keyword arguments.	`{}`

Returns:

Type	Description
`tuple`	A tuple containing training and testing data pipelines.

Source code in tinybig/data/text_dataloader_torchtext.py

@staticmethod
def load_datapipe(cache_dir='./data/', *args, **kwargs):
    """
    Builds the training and testing datapipes for the IMDB dataset.

    Parameters
    ----------
    cache_dir : str, default = './data/'
        Directory passed to ``IMDB`` as ``root``.
    *args : tuple
        Accepted for interface compatibility; not used here.
    **kwargs : dict
        Accepted for interface compatibility; not used here.

    Returns
    -------
    tuple
        ``(train_datapipe, test_datapipe)``, in that order.
    """
    # "train" is constructed first, then "test", each under the same root.
    train_datapipe, test_datapipe = (
        IMDB(root=cache_dir, split=split_name)
        for split_name in ("train", "test")
    )
    return train_datapipe, test_datapipe

imdb

__init__(name='imdb', train_batch_size=64, test_batch_size=64, max_seq_len=512)

get_class_number(*args, **kwargs) staticmethod

get_idx_to_label(*args, **kwargs) staticmethod

get_test_number(*args, **kwargs) staticmethod

get_train_number(*args, **kwargs) staticmethod

load(*args, **kwargs)

load_datapipe(cache_dir='./data/', *args, **kwargs) staticmethod

`__init__(name='imdb', train_batch_size=64, test_batch_size=64, max_seq_len=512)`

`get_class_number(*args, **kwargs)` `staticmethod`

`get_idx_to_label(*args, **kwargs)` `staticmethod`

`get_test_number(*args, **kwargs)` `staticmethod`

`get_train_number(*args, **kwargs)` `staticmethod`

`load(*args, **kwargs)`

`load_datapipe(cache_dir='./data/', *args, **kwargs)` `staticmethod`