Skip to content


Bases: dataloader

A dataloader class for handling tabular datasets.

This class extends the base dataloader class to support loading, processing, normalizing, and splitting tabular datasets for machine learning tasks.


Name Type Description
data_contents (list or tuple, optional)

Raw tabular data contents, directly provided or loaded from a file.

train_batch_size int

The batch size for training datasets.

test_batch_size int

The batch size for testing datasets.


Name Description
__init__

Initializes the tabular dataloader.

load_file

Loads data from a file.

load_raw_data

Loads raw data either from data_contents or a file.

process_data

Processes raw tabular data into structured formats.

load_complete_data

Loads and processes the entire dataset into feature and label tensors.

normalize

Normalizes a tensor using specified normalization techniques.

split

Splits the dataset into training and testing datasets.

load

Main method to load, normalize, and split tabular data.

Source code in tinybig/data/
class tabular_dataloader(dataloader):
    """
    A dataloader class for handling tabular datasets.

    This class extends the base `dataloader` class to support loading, processing,
    normalizing, and splitting tabular datasets for machine learning tasks.

    Attributes
    ----------
    data_contents : list or tuple, optional
        Raw tabular data contents, directly provided or loaded from a file.
    train_batch_size : int
        The batch size for training datasets.
    test_batch_size : int
        The batch size for testing datasets.

    Methods
    -------
    __init__(name, data_contents, train_batch_size, test_batch_size, ...)
        Initializes the tabular dataloader.
    load_file(cache_dir, filename)
        Loads data from a file.
    load_raw_data(data_contents, cache_dir, filename)
        Loads raw data either from `data_contents` or a file.
    process_data(data_contents, cache_dir, filename, str_converted_to_numerical, ...)
        Processes raw tabular data into structured formats.
    load_complete_data(data_contents, cache_dir, filename, ...)
        Loads and processes the entire dataset into feature and label tensors.
    normalize(input, normalize_type, normalize_range)
        Normalizes a tensor using specified normalization techniques.
    split(X, y, split_type, train_percentage, fold, random_state, shuffle)
        Splits the dataset into training and testing datasets.
    load(cache_dir, filename, split_type, train_percentage, fold, random_state, shuffle, ...)
        Main method to load, normalize, and split tabular data.
    """

    def __init__(self, name='tabular_dataloader', data_contents: list | tuple = None,
                 train_batch_size=64, test_batch_size=64, *args, **kwargs):
        """
        Initializes the tabular dataloader.

        Parameters
        ----------
        name : str, default = 'tabular_dataloader'
            The name of the dataloader instance.
        data_contents : list or tuple, optional
            Raw data contents provided directly.
        train_batch_size : int, default = 64
            The batch size for training datasets.
        test_batch_size : int, default = 64
            The batch size for testing datasets.
        *args, **kwargs
            Forwarded unchanged to the base `dataloader` initializer.
        """
        super().__init__(*args, name=name, train_batch_size=train_batch_size,
                         test_batch_size=test_batch_size, **kwargs)
        self.data_contents = data_contents

    def load_file(self, cache_dir: str = None, filename: str = None):
        """
        Loads data from a file.

        The path is built by plain concatenation ``cache_dir + filename``, so
        ``cache_dir`` has to end with a path separator already.

        Parameters
        ----------
        cache_dir : str
            The directory where the file is located.
        filename : str
            The name of the file to be loaded.

        Returns
        -------
        list
            A list of rows read from the file (line endings are kept).

        Raises
        ------
        ValueError
            If `cache_dir` or `filename` is None, or the file does not exist.
        """
        if cache_dir is None or filename is None:
            raise ValueError('Both cache_dir and filename need to be provided to load a file...')
        file_path = cache_dir + filename
        if not os.path.exists(file_path):
            raise ValueError("The provided file path {} doesn't exist...".format(file_path))
        # The context manager closes the handle even if reading fails.
        with open(file_path, 'r') as f:
            return f.readlines()

    def load_raw_data(self, data_contents=None, cache_dir: str = None, filename: str = None):
        """
        Loads raw data either from `data_contents` or from a file.

        Parameters
        ----------
        data_contents : list or tuple, optional
            The raw data contents directly provided.
        cache_dir : str, optional
            The directory where the file is located.
        filename : str, optional
            The name of the file to be loaded.

        Returns
        -------
        list or tuple or None
            The raw data contents; None when neither in-memory contents nor a
            complete (cache_dir, filename) pair is available.
        """
        if data_contents is None:
            data_contents = self.data_contents
        if data_contents is not None:
            return data_contents
        if cache_dir is not None and filename is not None:
            return self.load_file(cache_dir=cache_dir, filename=filename)
        # Nothing to load from: keep returning None as before, but explicitly.
        return None

    def process_data(self, data_contents=None, cache_dir: str = None, filename: str = None,
                     str_converted_to_numerical: bool = True, *args, **kwargs):
        """
        Processes raw tabular data into structured formats.

        The first row is a comma-separated header whose entries look like
        ``<name>_<type>``; every following row is a comma-separated record.

        Parameters
        ----------
        data_contents : list or tuple, optional
            The raw data contents directly provided (one string per row).
        cache_dir : str, optional
            The directory where the file is located.
        filename : str, optional
            The name of the file to be processed.
        str_converted_to_numerical : bool, default = True
            Whether to convert string columns to numerical values.

        Returns
        -------
        dict
            A dictionary containing processed data with metadata ('profile')
            and contents ('contents').

        Raises
        ------
        ValueError
            If there are no rows at all or a header entry has no '_' separator.
        """
        rows = data_contents if data_contents is not None else self.load_file(cache_dir=cache_dir, filename=filename)
        if not rows:
            raise ValueError('The tabular data is empty, a header row is required...')

        int_types = {'int', 'integer'}
        float_types = {'float', 'double'}
        str_types = {'str', 'string', 'strings'}

        column_names, column_value_types = [], []
        for column in rows[0].rstrip('\r\n').split(','):
            # Split on the LAST underscore so names such as "sepal_length_float" parse.
            name, separator, value_type = column.rpartition('_')
            if not separator:
                raise ValueError("The header entry '{}' is not in the 'name_type' format...".format(column))
            column_names.append(name)
            column_value_types.append(value_type)

        column_codings = [None] * len(column_names)
        contents = []
        nan_columns = set()
        for row in rows[1:]:
            row = row.rstrip('\r\n')
            if not row:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            record = []
            for index, feature in enumerate(row.split(',')):
                value_type = column_value_types[index].lower()
                if value_type in int_types:
                    try:
                        feature = int(feature)
                    except (TypeError, ValueError):
                        nan_columns.add(index)
                        feature = np.nan
                elif value_type in float_types:
                    try:
                        feature = float(feature)
                    except (TypeError, ValueError):
                        nan_columns.add(index)
                        feature = np.nan
                elif value_type in str_types and str_converted_to_numerical:
                    if column_codings[index] is None:
                        column_codings[index] = {}
                    # Codes are handed out in order of first appearance.
                    feature = column_codings[index].setdefault(feature, len(column_codings[index]))
                record.append(feature)
            contents.append(record)

        data_dict = {
            'profile': {
                'str_converted_to_numerical': str_converted_to_numerical,
                'column_names': list(column_names),
                'feature_names': column_names,
                'column_value_types': column_value_types,
                'column_codings': column_codings,
                'row_number': len(contents),
                'column_number': len(column_names),
            },
            'contents': contents,
        }

        if nan_columns:
            warnings.warn(
                'The loaded dataset may contain NaN elements, and the columns containing nan are listed as follows: {}'.format(
                    [column_names[index] for index in sorted(nan_columns)]),
                UserWarning)
        return data_dict

    def load_complete_data(self, data_contents: list | tuple = None, cache_dir: str = None,
                           filename: str = None, *args, **kwargs):
        """
        Loads and processes the entire dataset into feature and label tensors.

        The first column is treated as an instance id and left out, the last
        column is the label, and everything in between forms the features.

        Parameters
        ----------
        data_contents : list or tuple, optional
            Raw data contents provided directly.
        cache_dir : str, optional
            The directory where the file is located.
        filename : str, optional
            The name of the file to be loaded.

        Returns
        -------
        dict
            A dictionary containing feature tensor `X`, label tensor `y`, and metadata.
        """
        data_contents = data_contents if data_contents is not None else self.data_contents
        raw_data = self.process_data(*args, data_contents=data_contents, cache_dir=cache_dir, filename=filename,
                                     str_converted_to_numerical=True, **kwargs)

        profile = raw_data['profile']
        contents = np.array(raw_data['contents'])
        column_names = profile['feature_names']
        column_value_types = profile['column_value_types']
        column_codings = profile['column_codings']

        X = torch.Tensor(contents[:, 1:-1])
        y = torch.Tensor(contents[:, -1])

        return {
            'X': X,
            'y': y,
            'profile': {
                # process_data does not record a name, so fall back to this loader's own.
                'name': profile.get('name', getattr(self, 'name', None)),
                'instance_number': profile['row_number'],
                'feature_number': profile['column_number'] - 2,
                'feature_codings': column_codings[1:-1],
                'label_codings': column_codings[-1],
                'feature_names': column_names[1:-1],
                'label_names': column_names[-1],
                'feature_value_types': column_value_types[1:-1],
                'label_value_types': column_value_types[-1]
            }
        }

    @staticmethod
    def normalize(input: torch.Tensor, normalize_type=None, normalize_range: list | tuple = None):
        """
        Normalizes a tensor using specified normalization techniques.

        Statistics are computed along dimension 0, i.e. per column.

        Parameters
        ----------
        input : torch.Tensor
            The input tensor to be normalized.
        normalize_type : str, optional
            The type of normalization ('min_max' or 'mean_std'); any other
            value returns `input` unchanged.
        normalize_range : list or tuple, optional
            The (min, max) range for min-max normalization, (0, 1) if None.

        Returns
        -------
        torch.Tensor
            The normalized tensor.
        """
        if normalize_type == 'min_max':
            min_value, max_value = (0, 1) if normalize_range is None else normalize_range
            column_min = torch.min(input, dim=0, keepdim=True).values
            column_max = torch.max(input, dim=0, keepdim=True).values
            span = column_max - column_min
            # A constant column has zero span; divide by 1 so it maps to min_value instead of NaN.
            span = torch.where(span == 0, torch.ones_like(span), span)
            return (input - column_min) / span * (max_value - min_value) + min_value
        if normalize_type == 'mean_std':
            X_mean = torch.mean(input, dim=0, keepdim=True)
            X_std = torch.std(input, dim=0, keepdim=True)
            # Same guard for zero spread: centre the column without scaling it.
            X_std = torch.where(X_std == 0, torch.ones_like(X_std), X_std)
            return (input - X_mean) / X_std
        return input

    def split(self, X, y, split_type='train_test_split', train_percentage=0.9, fold=10, random_state=1234, shuffle=True):
        """
        Splits the dataset into training and testing datasets.

        Parameters
        ----------
        X : torch.Tensor
            The feature tensor.
        y : torch.Tensor
            The label tensor.
        split_type : str, default = 'train_test_split'
            The type of splitting ('train_test_split', or 'KFold' / 'cross_validation').
        train_percentage : float, default = 0.9
            The proportion of data to be used for training in train-test split.
        fold : int, default = 10
            Number of folds for cross-validation.
        random_state : int, default = 1234
            Random state for reproducibility.
        shuffle : bool, default = True
            Whether to shuffle the data before splitting.

        Returns
        -------
        tuple
            Training and testing DataLoaders (or dictionaries keyed by fold index
            in case of KFold); (None, None) for an unrecognised `split_type`.
        """
        train_loader, test_loader = None, None
        if split_type == 'train_test_split':
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                train_size=int(train_percentage * len(X)),
                random_state=random_state, shuffle=shuffle
            )
            train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
            test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
            train_loader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
        elif split_type in ['KFold', 'cross_validation']:
            # KFold rejects a random_state when shuffle is False, so only forward it when shuffling.
            kf = KFold(n_splits=fold, random_state=random_state if shuffle else None, shuffle=shuffle)
            train_loader, test_loader = {}, {}
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]
                train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
                test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
                train_loader[i] = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
                test_loader[i] = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
        return train_loader, test_loader

    def load(self, cache_dir: str = './data/', filename: str = 'classic_dataset_filename',
             split_type: str = 'train_test_split', train_percentage: float = 0.9, fold: int = 10,
             random_state: int = 123, shuffle: bool = True,
             normalize_X: bool = False, normalize_y: bool = False,
             normalize_type: str = 'min_max', normalize_range: list | tuple = (0, 1), *args, **kwargs):
        """
        Main method to load, normalize, and split tabular data.

        Parameters
        ----------
        cache_dir : str, default = './data/'
            The directory where the data file is located.
        filename : str, default = 'classic_dataset_filename'
            The name of the data file.
        split_type : str, default = 'train_test_split'
            The type of splitting ('train_test_split' or 'KFold').
        train_percentage : float, default = 0.9
            The proportion of data to be used for training in train-test split.
        fold : int, default = 10
            Number of folds for cross-validation.
        random_state : int, default = 123
            Random state for reproducibility.
        shuffle : bool, default = True
            Whether to shuffle the data before splitting.
        normalize_X : bool, default = False
            Whether to normalize the feature tensor.
        normalize_y : bool, default = False
            Whether to normalize the label tensor.
        normalize_type : str, default = 'min_max'
            The type of normalization ('min_max' or 'mean_std').
        normalize_range : list or tuple, default = (0, 1)
            The range for min-max normalization.
        *args, **kwargs
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        dict
            A dictionary containing training and testing DataLoaders, and metadata.
        """
        complete_data = self.load_complete_data(data_contents=self.data_contents, cache_dir=cache_dir, filename=filename)
        X, y = complete_data['X'], complete_data['y']

        if normalize_X:
            X = self.normalize(input=X, normalize_type=normalize_type, normalize_range=normalize_range)
        if normalize_y:
            y = self.normalize(input=y, normalize_type=normalize_type, normalize_range=normalize_range)

        train_loader, test_loader = self.split(X=X, y=y, split_type=split_type, train_percentage=train_percentage,
                                               fold=fold, random_state=random_state, shuffle=shuffle)

        return {
            'train_loader': train_loader,
            'test_loader': test_loader,
            'profile': complete_data['profile']
        }

__init__(name='tabular_dataloader', data_contents=None, train_batch_size=64, test_batch_size=64, *args, **kwargs)

Initializes the tabular dataloader.


Name Type Description Default
name str

The name of the dataloader instance.

= 'tabular_dataloader'
data_contents list or tuple

Raw data contents provided directly.

train_batch_size int

The batch size for training datasets.

= 64
test_batch_size int

The batch size for testing datasets.

= 64


Type Description
Source code in tinybig/data/
def __init__(self, name='tabular_dataloader', data_contents: list | tuple = None,
             train_batch_size=64, test_batch_size=64, *args, **kwargs):
    """
    Initializes the tabular dataloader.

    Parameters
    ----------
    name : str, default = 'tabular_dataloader'
        The name of the dataloader instance.
    data_contents : list or tuple, optional
        Raw data contents provided directly.
    train_batch_size : int, default = 64
        The batch size for training datasets.
    test_batch_size : int, default = 64
        The batch size for testing datasets.
    *args, **kwargs
        Forwarded unchanged to the base class initializer.
    """
    super().__init__(*args, name=name, train_batch_size=train_batch_size,
                     test_batch_size=test_batch_size, **kwargs)
    self.data_contents = data_contents

load(cache_dir='./data/', filename='classic_dataset_filename', split_type='train_test_split', train_percentage=0.9, fold=10, random_state=123, shuffle=True, normalize_X=False, normalize_y=False, normalize_type='min_max', normalize_range=(0, 1), *args, **kwargs)

Main method to load, normalize, and split tabular data.


Name Type Description Default
cache_dir str

The directory where the data file is located.

= './data/'
filename str

The name of the data file.

= 'classic_dataset_filename'
split_type str

The type of splitting ('train_test_split' or 'KFold').

= 'train_test_split'
train_percentage float

The proportion of data to be used for training in train-test split.

= 0.9
fold int

Number of folds for cross-validation.

= 10
random_state int

Random state for reproducibility.

= 123
shuffle bool

Whether to shuffle the data before splitting.

= True
normalize_X bool

Whether to normalize the feature tensor.

= False
normalize_y bool

Whether to normalize the label tensor.

= False
normalize_type str

The type of normalization ('min_max' or 'mean_std').

= 'min_max'
normalize_range list or tuple

The range for min-max normalization.

= (0, 1)


Type Description

A dictionary containing training and testing DataLoaders, and metadata.

Source code in tinybig/data/
def load(self, cache_dir: str = './data/', filename: str = 'classic_dataset_filename',
         split_type: str = 'train_test_split', train_percentage: float = 0.9, fold: int = 10,
         random_state: int = 123, shuffle: bool = True,
         normalize_X: bool = False, normalize_y: bool = False,
         normalize_type: str = 'min_max', normalize_range: list | tuple = (0, 1), *args, **kwargs):
    """
    Main method to load, normalize, and split tabular data.

    Parameters
    ----------
    cache_dir : str, default = './data/'
        The directory where the data file is located.
    filename : str, default = 'classic_dataset_filename'
        The name of the data file.
    split_type : str, default = 'train_test_split'
        The type of splitting ('train_test_split' or 'KFold').
    train_percentage : float, default = 0.9
        The proportion of data to be used for training in train-test split.
    fold : int, default = 10
        Number of folds for cross-validation.
    random_state : int, default = 123
        Random state for reproducibility.
    shuffle : bool, default = True
        Whether to shuffle the data before splitting.
    normalize_X : bool, default = False
        Whether to normalize the feature tensor.
    normalize_y : bool, default = False
        Whether to normalize the label tensor.
    normalize_type : str, default = 'min_max'
        The type of normalization ('min_max' or 'mean_std').
    normalize_range : list or tuple, default = (0, 1)
        The range for min-max normalization.
    *args, **kwargs
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    dict
        A dictionary containing training and testing DataLoaders, and metadata.
    """
    complete_data = self.load_complete_data(data_contents=self.data_contents, cache_dir=cache_dir, filename=filename)
    X, y = complete_data['X'], complete_data['y']

    if normalize_X:
        X = self.normalize(input=X, normalize_type=normalize_type, normalize_range=normalize_range)
    if normalize_y:
        y = self.normalize(input=y, normalize_type=normalize_type, normalize_range=normalize_range)

    train_loader, test_loader = self.split(X=X, y=y, split_type=split_type, train_percentage=train_percentage,
                                           fold=fold, random_state=random_state, shuffle=shuffle)

    return {
        'train_loader': train_loader,
        'test_loader': test_loader,
        'profile': complete_data['profile']
    }

load_complete_data(data_contents=None, cache_dir=None, filename=None, *args, **kwargs)

Loads and processes the entire dataset into feature and label tensors.


Name Type Description Default
data_contents list or tuple

Raw data contents provided directly.

cache_dir str

The directory where the file is located.

filename str

The name of the file to be loaded.



Type Description

A dictionary containing feature tensor X, label tensor y, and metadata.

Source code in tinybig/data/
def load_complete_data(self, data_contents: list | tuple = None, cache_dir: str = None,
                       filename: str = None, *args, **kwargs):
    """
    Loads and processes the entire dataset into feature and label tensors.

    The first column is treated as an instance id and left out, the last
    column is the label, and everything in between forms the features.

    Parameters
    ----------
    data_contents : list or tuple, optional
        Raw data contents provided directly.
    cache_dir : str, optional
        The directory where the file is located.
    filename : str, optional
        The name of the file to be loaded.

    Returns
    -------
    dict
        A dictionary containing feature tensor `X`, label tensor `y`, and metadata.
    """
    data_contents = data_contents if data_contents is not None else self.data_contents
    raw_data = self.process_data(*args, data_contents=data_contents, cache_dir=cache_dir, filename=filename,
                                 str_converted_to_numerical=True, **kwargs)

    profile = raw_data['profile']
    contents = np.array(raw_data['contents'])
    column_names = profile['feature_names']
    column_value_types = profile['column_value_types']
    column_codings = profile['column_codings']

    X = torch.Tensor(contents[:, 1:-1])
    y = torch.Tensor(contents[:, -1])

    return {
        'X': X,
        'y': y,
        'profile': {
            # The processed profile may not carry a name, so fall back to this loader's own.
            'name': profile.get('name', getattr(self, 'name', None)),
            'instance_number': profile['row_number'],
            'feature_number': profile['column_number'] - 2,
            'feature_codings': column_codings[1:-1],
            'label_codings': column_codings[-1],
            'feature_names': column_names[1:-1],
            'label_names': column_names[-1],
            'feature_value_types': column_value_types[1:-1],
            'label_value_types': column_value_types[-1]
        }
    }

load_file(cache_dir=None, filename=None)

Loads data from a file.


Name Type Description Default
cache_dir str

The directory where the file is located.

filename str

The name of the file to be loaded.



Type Description

A list of rows read from the file.


Type Description
ValueError

If the specified file does not exist.

Source code in tinybig/data/
def load_file(self, cache_dir: str = None, filename: str = None):
    """
    Loads data from a file.

    The path is built by plain concatenation ``cache_dir + filename``, so
    ``cache_dir`` has to end with a path separator already.

    Parameters
    ----------
    cache_dir : str
        The directory where the file is located.
    filename : str
        The name of the file to be loaded.

    Returns
    -------
    list
        A list of rows read from the file (line endings are kept).

    Raises
    ------
    ValueError
        If `cache_dir` or `filename` is None, or the file does not exist.
    """
    if cache_dir is None or filename is None:
        raise ValueError('Both cache_dir and filename need to be provided to load a file...')
    file_path = cache_dir + filename
    if not os.path.exists(file_path):
        raise ValueError("The provided file path {} doesn't exist...".format(file_path))
    # The context manager closes the handle even if reading fails.
    with open(file_path, 'r') as f:
        return f.readlines()

load_raw_data(data_contents=None, cache_dir=None, filename=None)

Loads raw data either from data_contents or from a file.


Name Type Description Default
data_contents list or tuple

The raw data contents directly provided.

cache_dir str

The directory where the file is located.

filename str

The name of the file to be loaded.



Type Description
list or tuple

The raw data contents.

Source code in tinybig/data/
def load_raw_data(self, data_contents=None, cache_dir: str = None, filename: str = None):
    """
    Loads raw data either from `data_contents` or from a file.

    Parameters
    ----------
    data_contents : list or tuple, optional
        The raw data contents directly provided.
    cache_dir : str, optional
        The directory where the file is located.
    filename : str, optional
        The name of the file to be loaded.

    Returns
    -------
    list or tuple or None
        The raw data contents; None when neither in-memory contents nor a
        complete (cache_dir, filename) pair is available.
    """
    if data_contents is None:
        data_contents = self.data_contents
    if data_contents is not None:
        return data_contents
    if cache_dir is not None and filename is not None:
        return self.load_file(cache_dir=cache_dir, filename=filename)
    # Nothing to load from: keep returning None as before, but explicitly.
    return None

normalize(input, normalize_type=None, normalize_range=None) staticmethod

Normalizes a tensor using specified normalization techniques.


Name Type Description Default
input Tensor

The input tensor to be normalized.

normalize_type str

The type of normalization ('min_max' or 'mean_std').

normalize_range list or tuple

The range for min-max normalization.



Type Description

The normalized tensor.

Source code in tinybig/data/
def normalize(input: torch.Tensor, normalize_type=None, normalize_range: list | tuple=None):
    Normalizes a tensor using specified normalization techniques.

    input : torch.Tensor
        The input tensor to be normalized.
    normalize_type : str, optional
        The type of normalization ('min_max' or 'mean_std').
    normalize_range : list or tuple, optional
        The range for min-max normalization.

        The normalized tensor.
    if normalize_type == 'min_max':
        if normalize_range is None:
            min_value, max_value = (0, 1)
            min_value, max_value = normalize_range
        X_std = (input - input.numpy().min(axis=0)) / (input.numpy().max(axis=0) - input.numpy().min(axis=0))
        X_scaled = X_std * (max_value - min_value) + min_value
    elif normalize_type == 'mean_std':
        X_mean = torch.mean(input, dim=0, keepdim=True)
        X_std = torch.std(input, dim=0, keepdim=True)
        X_scaled = (input - X_mean) / X_std
        X_scaled = input
    return X_scaled

process_data(data_contents=None, cache_dir=None, filename=None, str_converted_to_numerical=True, *args, **kwargs)

Processes raw tabular data into structured formats.


Name Type Description Default
data_contents list or tuple

The raw data contents directly provided.

cache_dir str

The directory where the file is located.

filename str

The name of the file to be processed.

str_converted_to_numerical bool

Whether to convert string columns to numerical values.

= True


Type Description

A dictionary containing processed data with metadata and contents.

Source code in tinybig/data/
def process_data(self, data_contents=None, cache_dir: str = None, filename: str = None,
                 str_converted_to_numerical: bool = True, *args, **kwargs):
    """
    Processes raw tabular data into structured formats.

    The first row is a comma-separated header whose entries look like
    ``<name>_<type>``; every following row is a comma-separated record.

    Parameters
    ----------
    data_contents : list or tuple, optional
        The raw data contents directly provided (one string per row).
    cache_dir : str, optional
        The directory where the file is located.
    filename : str, optional
        The name of the file to be processed.
    str_converted_to_numerical : bool, default = True
        Whether to convert string columns to numerical values.

    Returns
    -------
    dict
        A dictionary containing processed data with metadata ('profile')
        and contents ('contents').

    Raises
    ------
    ValueError
        If there are no rows at all or a header entry has no '_' separator.
    """
    rows = data_contents if data_contents is not None else self.load_file(cache_dir=cache_dir, filename=filename)
    if not rows:
        raise ValueError('The tabular data is empty, a header row is required...')

    int_types = {'int', 'integer'}
    float_types = {'float', 'double'}
    str_types = {'str', 'string', 'strings'}

    column_names, column_value_types = [], []
    for column in rows[0].rstrip('\r\n').split(','):
        # Split on the LAST underscore so names such as "sepal_length_float" parse.
        name, separator, value_type = column.rpartition('_')
        if not separator:
            raise ValueError("The header entry '{}' is not in the 'name_type' format...".format(column))
        column_names.append(name)
        column_value_types.append(value_type)

    column_codings = [None] * len(column_names)
    contents = []
    nan_columns = set()
    for row in rows[1:]:
        row = row.rstrip('\r\n')
        if not row:
            # Blank lines (e.g. a trailing newline) carry no record.
            continue
        record = []
        for index, feature in enumerate(row.split(',')):
            value_type = column_value_types[index].lower()
            if value_type in int_types:
                try:
                    feature = int(feature)
                except (TypeError, ValueError):
                    nan_columns.add(index)
                    feature = np.nan
            elif value_type in float_types:
                try:
                    feature = float(feature)
                except (TypeError, ValueError):
                    nan_columns.add(index)
                    feature = np.nan
            elif value_type in str_types and str_converted_to_numerical:
                if column_codings[index] is None:
                    column_codings[index] = {}
                # Codes are handed out in order of first appearance.
                feature = column_codings[index].setdefault(feature, len(column_codings[index]))
            record.append(feature)
        contents.append(record)

    data_dict = {
        'profile': {
            'str_converted_to_numerical': str_converted_to_numerical,
            'column_names': list(column_names),
            'feature_names': column_names,
            'column_value_types': column_value_types,
            'column_codings': column_codings,
            'row_number': len(contents),
            'column_number': len(column_names),
        },
        'contents': contents,
    }

    if nan_columns:
        warnings.warn(
            'The loaded dataset may contain NaN elements, and the columns containing nan are listed as follows: {}'.format(
                [column_names[index] for index in sorted(nan_columns)]),
            UserWarning)
    return data_dict

split(X, y, split_type='train_test_split', train_percentage=0.9, fold=10, random_state=1234, shuffle=True)

Splits the dataset into training and testing datasets.


Name Type Description Default
X Tensor

The feature tensor.

y Tensor

The label tensor.

split_type str

The type of splitting ('train_test_split' or 'KFold').

= 'train_test_split'
train_percentage float

The proportion of data to be used for training in train-test split.

= 0.9
fold int

Number of folds for cross-validation.

= 10
random_state int

Random state for reproducibility.

= 1234
shuffle bool

Whether to shuffle the data before splitting.

= True


Type Description

Training and testing DataLoaders (or dictionaries in case of KFold).

Source code in tinybig/data/
def split(self, X, y, split_type='train_test_split', train_percentage=0.9, fold=10, random_state=1234, shuffle=True):
    """
    Splits the dataset into training and testing datasets.

    Parameters
    ----------
    X : torch.Tensor
        The feature tensor.
    y : torch.Tensor
        The label tensor.
    split_type : str, default = 'train_test_split'
        The type of splitting ('train_test_split', or 'KFold' / 'cross_validation').
    train_percentage : float, default = 0.9
        The proportion of data to be used for training in train-test split.
    fold : int, default = 10
        Number of folds for cross-validation.
    random_state : int, default = 1234
        Random state for reproducibility.
    shuffle : bool, default = True
        Whether to shuffle the data before splitting.

    Returns
    -------
    tuple
        Training and testing DataLoaders (or dictionaries keyed by fold index
        in case of KFold); (None, None) for an unrecognised `split_type`.
    """
    train_loader, test_loader = None, None
    if split_type == 'train_test_split':
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            train_size=int(train_percentage * len(X)),
            random_state=random_state, shuffle=shuffle
        )
        train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
        test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
        train_loader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
    elif split_type in ['KFold', 'cross_validation']:
        # KFold rejects a random_state when shuffle is False, so only forward it when shuffling.
        kf = KFold(n_splits=fold, random_state=random_state if shuffle else None, shuffle=shuffle)
        train_loader, test_loader = {}, {}
        for i, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]
            train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
            test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
            train_loader[i] = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
            test_loader[i] = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
    return train_loader, test_loader