tabular_dataloader

Bases: dataloader

A dataloader class for handling tabular datasets.

This class extends the base dataloader class to support loading, processing, normalizing, and splitting tabular datasets for machine learning tasks.

Attributes:

data_contents : list or tuple, optional
    Raw tabular data contents, directly provided or loaded from a file.
train_batch_size : int
    The batch size for training datasets.
test_batch_size : int
    The batch size for testing datasets.

Methods:

__init__
    Initializes the tabular dataloader.
load_file
    Loads data from a file.
load_raw_data
    Loads raw data either from data_contents or a file.
process_data
    Processes raw tabular data into structured formats.
load_complete_data
    Loads and processes the entire dataset into feature and label tensors.
normalize
    Normalizes a tensor using specified normalization techniques.
split
    Splits the dataset into training and testing datasets.
load
    Main method to load, normalize, and split tabular data.

Source code in tinybig/data/tabular_dataloader.py
class tabular_dataloader(dataloader):
    """
    A dataloader class for handling tabular datasets.

    This class extends the base `dataloader` class to support loading, processing,
    normalizing, and splitting tabular datasets for machine learning tasks.

    Attributes
    ----------
    data_contents : list or tuple, optional
        Raw tabular data contents, directly provided or loaded from a file.
    train_batch_size : int
        The batch size for training datasets.
    test_batch_size : int
        The batch size for testing datasets.

    Methods
    -------
    __init__(name, data_contents, train_batch_size, test_batch_size, ...)
        Initializes the tabular dataloader.
    load_file(cache_dir, filename)
        Loads data from a file.
    load_raw_data(data_contents, cache_dir, filename)
        Loads raw data either from `data_contents` or a file.
    process_data(data_contents, cache_dir, filename, str_converted_to_numerical, ...)
        Processes raw tabular data into structured formats.
    load_complete_data(data_contents, cache_dir, filename, ...)
        Loads and processes the entire dataset into feature and label tensors.
    normalize(input, normalize_type, normalize_range)
        Normalizes a tensor using specified normalization techniques.
    split(X, y, split_type, train_percentage, fold, random_state, shuffle)
        Splits the dataset into training and testing datasets.
    load(cache_dir, filename, split_type, train_percentage, fold, random_state, shuffle, ...)
        Main method to load, normalize, and split tabular data.
    """
    def __init__(self, name='tabular_dataloader', data_contents: list | tuple = None,
                 train_batch_size=64, test_batch_size=64, *args, **kwargs):
        """
        Initializes the tabular dataloader.

        Parameters
        ----------
        name : str, default = 'tabular_dataloader'
            The name of the dataloader instance.
        data_contents : list or tuple, optional
            Raw data contents provided directly.
        train_batch_size : int, default = 64
            The batch size for training datasets.
        test_batch_size : int, default = 64
            The batch size for testing datasets.

        Returns
        -------
        None
        """
        super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size, *args, **kwargs)
        self.data_contents = data_contents

    def load_file(self, cache_dir: str = None, filename: str = None):
        """
        Loads data from a file.

        Parameters
        ----------
        cache_dir : str
            The directory where the file is located.
        filename : str
            The name of the file to be loaded.

        Returns
        -------
        list
            A list of rows read from the file.

        Raises
        ------
        ValueError
            If the specified file does not exist.
        """
        file_path = cache_dir + filename
        if not os.path.exists(file_path):
            raise ValueError("The provided file path {} doesn't exist...".format(file_path))
        with open(file_path, 'r') as f:
            rows = f.readlines()
        return rows

    def load_raw_data(self, data_contents=None, cache_dir: str = None, filename: str = None):
        """
        Loads raw data either from `data_contents` or from a file.

        Parameters
        ----------
        data_contents : list or tuple, optional
            The raw data contents directly provided.
        cache_dir : str, optional
            The directory where the file is located.
        filename : str, optional
            The name of the file to be loaded.

        Returns
        -------
        list or tuple
            The raw data contents.
        """
        data_contents = data_contents if data_contents is not None else self.data_contents
        if data_contents is not None:
            return data_contents
        elif cache_dir is not None and filename is not None:
            return self.load_file(cache_dir=cache_dir, filename=filename)

    def process_data(self, data_contents=None, cache_dir: str = None, filename: str = None,
                     str_converted_to_numerical: bool = True, *args, **kwargs):
        """
        Processes raw tabular data into structured formats.

        Parameters
        ----------
        data_contents : list or tuple, optional
            The raw data contents directly provided.
        cache_dir : str, optional
            The directory where the file is located.
        filename : str, optional
            The name of the file to be processed.
        str_converted_to_numerical : bool, default = True
            Whether to convert string columns to numerical values.

        Returns
        -------
        dict
            A dictionary containing processed data with metadata and contents.
        """
        rows = data_contents if data_contents is not None else self.load_file(cache_dir=cache_dir, filename=filename)

        data_dict = {
            'profile': {
                'name': self.name,
                'str_converted_to_numerical': str_converted_to_numerical,
                'column_names': [],  #[list of column names],
                'column_value_types': [],  #[list of column value types, e.g., float, int, str, etc.],
                'column_codings': [],  #[list of column index: {raw_values: coding_values}]
            },
            'contents': []  # [[vector of instance 1], [vector of instance 2], ... ]
        }


        headline = rows[0]
        column_names, column_value_types = [], []
        feature_name_value_type_list = headline.strip('\n').split(',')
        for feature_name_value_type in feature_name_value_type_list:
            name, value_type = feature_name_value_type.split('_')
            column_names.append(name)
            column_value_types.append(value_type)

        column_codings = [None]*len(column_names)
        contents = []
        nan_detect = False
        nan_columns = {}
        for row_index in range(1, len(rows)):
            row = rows[row_index]
            x = []
            features = row.strip('\n').split(',')
            for index in range(len(features)):
                feature = features[index]
                feature_value_type = column_value_types[index]
                if feature_value_type in ['int', 'integer', 'Int', 'Integer', 'INT', 'INTEGER']:
                    try:
                        feature = int(feature)
                    except:
                        nan_detect = True
                        nan_columns[index] = 1
                        feature = np.nan
                elif feature_value_type in ['float', 'Float', 'double', 'Double', 'FLOAT', 'DOUBLE']:
                    try:
                        feature = float(feature)
                    except:
                        nan_detect = True
                        nan_columns[index] = 1
                        feature = np.nan
                elif feature_value_type in ['str', 'string', 'Str', 'String', 'strings', 'Strings', 'STR', 'STRING', 'STRINGS']:
                    if str_converted_to_numerical:
                        if column_codings[index] is None:
                            column_codings[index] = {}
                        if feature not in column_codings[index]:
                            column_codings[index][feature] = len(column_codings[index])
                        feature = column_codings[index][feature]
                x.append(feature)
            contents.append(x)
        data_dict['profile']['feature_names'] = column_names
        data_dict['profile']['column_value_types'] = column_value_types
        data_dict['profile']['column_codings'] = column_codings
        data_dict['profile']['row_number'] = len(contents)
        data_dict['profile']['column_number'] = len(column_names)
        data_dict['contents'] = contents

        if nan_detect:
            warnings.warn('The loaded dataset may contain NaN elements, and the columns containing nan are listed as follows', UserWarning)
            print(nan_columns.keys())
        return data_dict

    def load_complete_data(self, data_contents: list | tuple = None, cache_dir: str = None,
                           filename: str = None, *args, **kwargs):
        """
        Loads and processes the entire dataset into feature and label tensors.

        Parameters
        ----------
        data_contents : list or tuple, optional
            Raw data contents provided directly.
        cache_dir : str, optional
            The directory where the file is located.
        filename : str, optional
            The name of the file to be loaded.

        Returns
        -------
        dict
            A dictionary containing feature tensor `X`, label tensor `y`, and metadata.
        """
        data_contents = data_contents if data_contents is not None else self.data_contents
        raw_data = self.process_data(data_contents=data_contents, cache_dir=cache_dir, filename=filename,
                                     str_converted_to_numerical=True, *args, **kwargs)

        contents = np.array(raw_data['contents'])
        row_number = raw_data['profile']['row_number']
        column_number = raw_data['profile']['column_number']
        column_names = raw_data['profile']['feature_names']
        column_value_types = raw_data['profile']['column_value_types']
        ids = contents[:, 0]
        X = torch.Tensor(contents[:, 1:-1])
        y = torch.Tensor(contents[:, -1])

        return {
            'X': X,
            'y': y,
            'profile': {
                'name': raw_data['profile']['name'],
                'instance_number': row_number,
                'feature_number': column_number - 2,
                'feature_codings': raw_data['profile']['column_codings'][1:-1],
                'label_codings': raw_data['profile']['column_codings'][-1],
                'feature_names': column_names[1:-1],
                'label_names': column_names[-1],
                'feature_value_types': column_value_types[1:-1],
                'label_value_types': column_value_types[-1]
            }
        }

    @staticmethod
    def normalize(input: torch.Tensor, normalize_type=None, normalize_range: list | tuple=None):
        """
        Normalizes a tensor using specified normalization techniques.

        Parameters
        ----------
        input : torch.Tensor
            The input tensor to be normalized.
        normalize_type : str, optional
            The type of normalization ('min_max' or 'mean_std').
        normalize_range : list or tuple, optional
            The range for min-max normalization.

        Returns
        -------
        torch.Tensor
            The normalized tensor.
        """
        if normalize_type == 'min_max':
            if normalize_range is None:
                min_value, max_value = (0, 1)
            else:
                min_value, max_value = normalize_range
            X_std = (input - input.numpy().min(axis=0)) / (input.numpy().max(axis=0) - input.numpy().min(axis=0))
            X_scaled = X_std * (max_value - min_value) + min_value
        elif normalize_type == 'mean_std':
            X_mean = torch.mean(input, dim=0, keepdim=True)
            X_std = torch.std(input, dim=0, keepdim=True)
            X_scaled = (input - X_mean) / X_std
        else:
            X_scaled = input
        return X_scaled

    def split(self, X, y, split_type='train_test_split', train_percentage=0.9, fold=10, random_state=1234, shuffle=True):
        """
        Splits the dataset into training and testing datasets.

        Parameters
        ----------
        X : torch.Tensor
            The feature tensor.
        y : torch.Tensor
            The label tensor.
        split_type : str, default = 'train_test_split'
            The type of splitting ('train_test_split' or 'KFold').
        train_percentage : float, default = 0.9
            The proportion of data to be used for training in train-test split.
        fold : int, default = 10
            Number of folds for cross-validation.
        random_state : int, default = 1234
            Random state for reproducibility.
        shuffle : bool, default = True
            Whether to shuffle the data before splitting.

        Returns
        -------
        tuple
            Training and testing DataLoaders (or dictionaries in case of KFold).
        """
        train_loader, test_loader = None, None
        if split_type == 'train_test_split':
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                train_size=int(train_percentage * len(X)),
                random_state=random_state, shuffle=shuffle
            )
            train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
            test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
            train_loader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
            test_loader = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
        elif split_type in ['KFold', 'cross_validation']:
            kf = KFold(n_splits=fold, random_state=random_state, shuffle=shuffle)
            train_loader, test_loader = {}, {}
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                X_train, y_train = X[train_index], y[train_index]
                X_test, y_test = X[test_index], y[test_index]
                train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
                test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
                train_loader[i] = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
                test_loader[i] = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
        return train_loader, test_loader

    def load(self, cache_dir: str = './data/', filename: str = 'classic_dataset_filename',
             split_type: str = 'train_test_split', train_percentage: float = 0.9, fold: int = 10,
             random_state: int = 123, shuffle: bool = True,
             normalize_X: bool = False, normalize_y: bool = False,
             normalize_type: str = 'min_max', normalize_range: list | tuple = (0, 1), *args, **kwargs):
        """
        Main method to load, normalize, and split tabular data.

        Parameters
        ----------
        cache_dir : str, default = './data/'
            The directory where the data file is located.
        filename : str, default = 'classic_dataset_filename'
            The name of the data file.
        split_type : str, default = 'train_test_split'
            The type of splitting ('train_test_split' or 'KFold').
        train_percentage : float, default = 0.9
            The proportion of data to be used for training in train-test split.
        fold : int, default = 10
            Number of folds for cross-validation.
        random_state : int, default = 123
            Random state for reproducibility.
        shuffle : bool, default = True
            Whether to shuffle the data before splitting.
        normalize_X : bool, default = False
            Whether to normalize the feature tensor.
        normalize_y : bool, default = False
            Whether to normalize the label tensor.
        normalize_type : str, default = 'min_max'
            The type of normalization ('min_max' or 'mean_std').
        normalize_range : list or tuple, default = (0, 1)
            The range for min-max normalization.

        Returns
        -------
        dict
            A dictionary containing training and testing DataLoaders, and metadata.
        """
        complete_data = self.load_complete_data(data_contents=self.data_contents, cache_dir=cache_dir, filename=filename)
        X, y = complete_data['X'], complete_data['y']

        if normalize_X:
            X = self.normalize(input=X, normalize_type=normalize_type, normalize_range=normalize_range)
        if normalize_y:
            y = self.normalize(input=y, normalize_type=normalize_type, normalize_range=normalize_range)

        train_loader, test_loader = self.split(X=X, y=y, split_type=split_type, train_percentage=train_percentage,
                                               fold=fold, random_state=random_state, shuffle=shuffle)

        return {
            'train_loader': train_loader,
            'test_loader': test_loader,
            'profile': complete_data['profile']
        }
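
The listing below is a minimal, hypothetical usage sketch rather than part of the library source: the directory, file name, and column layout are assumptions, chosen to follow the CSV convention that process_data expects (a header of name_type pairs, an id column first, and the label column last).

from tinybig.data.tabular_dataloader import tabular_dataloader

# Hypothetical CSV at ./data/example.csv with a header such as:
#   id_int,age_float,color_str,label_int
# followed by one comma-separated row per instance.
loader = tabular_dataloader(name='example_loader', train_batch_size=64, test_batch_size=64)
result = loader.load(
    cache_dir='./data/', filename='example.csv',
    split_type='train_test_split', train_percentage=0.9,
    normalize_X=True, normalize_type='min_max', normalize_range=(0, 1)
)
for X_batch, y_batch in result['train_loader']:
    print(X_batch.shape, y_batch.shape)  # e.g., torch.Size([64, 2]) torch.Size([64, 1])
    break

With split_type='KFold', the returned 'train_loader' and 'test_loader' entries are dictionaries of DataLoaders keyed by fold index instead of single DataLoaders.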

__init__(name='tabular_dataloader', data_contents=None, train_batch_size=64, test_batch_size=64, *args, **kwargs)

Initializes the tabular dataloader.

Parameters:

name : str, default = 'tabular_dataloader'
    The name of the dataloader instance.
data_contents : list or tuple, default = None
    Raw data contents provided directly.
train_batch_size : int, default = 64
    The batch size for training datasets.
test_batch_size : int, default = 64
    The batch size for testing datasets.

Returns:

None

Source code in tinybig/data/tabular_dataloader.py
def __init__(self, name='tabular_dataloader', data_contents: list | tuple = None,
             train_batch_size=64, test_batch_size=64, *args, **kwargs):
    """
    Initializes the tabular dataloader.

    Parameters
    ----------
    name : str, default = 'tabular_dataloader'
        The name of the dataloader instance.
    data_contents : list or tuple, optional
        Raw data contents provided directly.
    train_batch_size : int, default = 64
        The batch size for training datasets.
    test_batch_size : int, default = 64
        The batch size for testing datasets.

    Returns
    -------
    None
    """
    super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size, *args, **kwargs)
    self.data_contents = data_contents
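
A brief construction sketch (the rows below are illustrative): when data_contents is supplied, it is later consumed by process_data in the same form a raw file would be, i.e., a list of text rows whose header encodes each column as name_type.

from tinybig.data.tabular_dataloader import tabular_dataloader

rows = [
    'id_int,height_float,label_int\n',
    '0,1.7,0\n',
    '1,1.9,1\n',
]
loader = tabular_dataloader(data_contents=rows, train_batch_size=2, test_batch_size=2)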

load(cache_dir='./data/', filename='classic_dataset_filename', split_type='train_test_split', train_percentage=0.9, fold=10, random_state=123, shuffle=True, normalize_X=False, normalize_y=False, normalize_type='min_max', normalize_range=(0, 1), *args, **kwargs)

Main method to load, normalize, and split tabular data.

Parameters:

cache_dir : str, default = './data/'
    The directory where the data file is located.
filename : str, default = 'classic_dataset_filename'
    The name of the data file.
split_type : str, default = 'train_test_split'
    The type of splitting ('train_test_split' or 'KFold').
train_percentage : float, default = 0.9
    The proportion of data to be used for training in train-test split.
fold : int, default = 10
    Number of folds for cross-validation.
random_state : int, default = 123
    Random state for reproducibility.
shuffle : bool, default = True
    Whether to shuffle the data before splitting.
normalize_X : bool, default = False
    Whether to normalize the feature tensor.
normalize_y : bool, default = False
    Whether to normalize the label tensor.
normalize_type : str, default = 'min_max'
    The type of normalization ('min_max' or 'mean_std').
normalize_range : list or tuple, default = (0, 1)
    The range for min-max normalization.

Returns:

dict
    A dictionary containing training and testing DataLoaders, and metadata.

Source code in tinybig/data/tabular_dataloader.py
def load(self, cache_dir: str = './data/', filename: str = 'classic_dataset_filename',
         split_type: str = 'train_test_split', train_percentage: float = 0.9, fold: int = 10,
         random_state: int = 123, shuffle: bool = True,
         normalize_X: bool = False, normalize_y: bool = False,
         normalize_type: str = 'min_max', normalize_range: list | tuple = (0, 1), *args, **kwargs):
    """
    Main method to load, normalize, and split tabular data.

    Parameters
    ----------
    cache_dir : str, default = './data/'
        The directory where the data file is located.
    filename : str, default = 'classic_dataset_filename'
        The name of the data file.
    split_type : str, default = 'train_test_split'
        The type of splitting ('train_test_split' or 'KFold').
    train_percentage : float, default = 0.9
        The proportion of data to be used for training in train-test split.
    fold : int, default = 10
        Number of folds for cross-validation.
    random_state : int, default = 123
        Random state for reproducibility.
    shuffle : bool, default = True
        Whether to shuffle the data before splitting.
    normalize_X : bool, default = False
        Whether to normalize the feature tensor.
    normalize_y : bool, default = False
        Whether to normalize the label tensor.
    normalize_type : str, default = 'min_max'
        The type of normalization ('min_max' or 'mean_std').
    normalize_range : list or tuple, default = (0, 1)
        The range for min-max normalization.

    Returns
    -------
    dict
        A dictionary containing training and testing DataLoaders, and metadata.
    """
    complete_data = self.load_complete_data(data_contents=self.data_contents, cache_dir=cache_dir, filename=filename)
    X, y = complete_data['X'], complete_data['y']

    if normalize_X:
        X = self.normalize(input=X, normalize_type=normalize_type, normalize_range=normalize_range)
    if normalize_y:
        y = self.normalize(input=y, normalize_type=normalize_type, normalize_range=normalize_range)

    train_loader, test_loader = self.split(X=X, y=y, split_type=split_type, train_percentage=train_percentage,
                                           fold=fold, random_state=random_state, shuffle=shuffle)

    return {
        'train_loader': train_loader,
        'test_loader': test_loader,
        'profile': complete_data['profile']
    }
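
A hypothetical cross-validation call (directory and file name are assumptions); with split_type='KFold', the returned loaders are dictionaries keyed by fold index, as produced by split below.

from tinybig.data.tabular_dataloader import tabular_dataloader

loader = tabular_dataloader()
result = loader.load(cache_dir='./data/', filename='example.csv',
                     split_type='KFold', fold=5, shuffle=True, random_state=42)
for fold_id, fold_train_loader in result['train_loader'].items():
    print(fold_id, len(fold_train_loader.dataset))  # fold index and training-set size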

load_complete_data(data_contents=None, cache_dir=None, filename=None, *args, **kwargs)

Loads and processes the entire dataset into feature and label tensors.

Parameters:

data_contents : list or tuple, default = None
    Raw data contents provided directly.
cache_dir : str, default = None
    The directory where the file is located.
filename : str, default = None
    The name of the file to be loaded.

Returns:

dict
    A dictionary containing feature tensor X, label tensor y, and metadata.

Source code in tinybig/data/tabular_dataloader.py
def load_complete_data(self, data_contents: list | tuple = None, cache_dir: str = None,
                       filename: str = None, *args, **kwargs):
    """
    Loads and processes the entire dataset into feature and label tensors.

    Parameters
    ----------
    data_contents : list or tuple, optional
        Raw data contents provided directly.
    cache_dir : str, optional
        The directory where the file is located.
    filename : str, optional
        The name of the file to be loaded.

    Returns
    -------
    dict
        A dictionary containing feature tensor `X`, label tensor `y`, and metadata.
    """
    data_contents = data_contents if data_contents is not None else self.data_contents
    raw_data = self.process_data(data_contents=data_contents, cache_dir=cache_dir, filename=filename,
                                 str_converted_to_numerical=True, *args, **kwargs)

    contents = np.array(raw_data['contents'])
    row_number = raw_data['profile']['row_number']
    column_number = raw_data['profile']['column_number']
    column_names = raw_data['profile']['feature_names']
    column_value_types = raw_data['profile']['column_value_types']
    ids = contents[:, 0]
    X = torch.Tensor(contents[:, 1:-1])
    y = torch.Tensor(contents[:, -1])

    return {
        'X': X,
        'y': y,
        'profile': {
            'name': raw_data['profile']['name'],
            'instance_number': row_number,
            'feature_number': column_number - 2,
            'feature_codings': raw_data['profile']['column_codings'][1:-1],
            'label_codings': raw_data['profile']['column_codings'][-1],
            'feature_names': column_names[1:-1],
            'label_names': column_names[-1],
            'feature_value_types': column_value_types[1:-1],
            'label_value_types': column_value_types[-1]
        }
    }
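
An illustrative call (the file details are hypothetical); the returned dictionary bundles the full feature tensor, the label tensor, and a profile describing the columns. The first (id) and last (label) columns are stripped from the feature tensor.

from tinybig.data.tabular_dataloader import tabular_dataloader

loader = tabular_dataloader()
complete = loader.load_complete_data(cache_dir='./data/', filename='example.csv')
X, y = complete['X'], complete['y']           # shapes: (n, d) and (n,)
print(complete['profile']['feature_names'])   # column names, excluding the id and label columns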

load_file(cache_dir=None, filename=None)

Loads data from a file.

Parameters:

cache_dir : str, default = None
    The directory where the file is located.
filename : str, default = None
    The name of the file to be loaded.

Returns:

list
    A list of rows read from the file.

Raises:

ValueError
    If the specified file does not exist.

Source code in tinybig/data/tabular_dataloader.py
def load_file(self, cache_dir: str = None, filename: str = None):
    """
    Loads data from a file.

    Parameters
    ----------
    cache_dir : str
        The directory where the file is located.
    filename : str
        The name of the file to be loaded.

    Returns
    -------
    list
        A list of rows read from the file.

    Raises
    ------
    ValueError
        If the specified file does not exist.
    """
    file_path = cache_dir + filename
    if not os.path.exists(file_path):
        raise ValueError("The provided file path {} doesn't exist...".format(file_path))
    with open(file_path, 'r') as f:
        rows = f.readlines()
    return rows
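
A small sketch with hypothetical paths; note that cache_dir and filename are concatenated directly, so the directory string should end with a path separator.

from tinybig.data.tabular_dataloader import tabular_dataloader

loader = tabular_dataloader()
rows = loader.load_file(cache_dir='./data/', filename='example.csv')
print(rows[0])  # the header line, e.g. 'id_int,age_float,color_str,label_int\n'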

load_raw_data(data_contents=None, cache_dir=None, filename=None)

Loads raw data either from data_contents or from a file.

Parameters:

data_contents : list or tuple, default = None
    The raw data contents directly provided.
cache_dir : str, default = None
    The directory where the file is located.
filename : str, default = None
    The name of the file to be loaded.

Returns:

list or tuple
    The raw data contents.

Source code in tinybig/data/tabular_dataloader.py
def load_raw_data(self, data_contents=None, cache_dir: str = None, filename: str = None):
    """
    Loads raw data either from `data_contents` or from a file.

    Parameters
    ----------
    data_contents : list or tuple, optional
        The raw data contents directly provided.
    cache_dir : str, optional
        The directory where the file is located.
    filename : str, optional
        The name of the file to be loaded.

    Returns
    -------
    list or tuple
        The raw data contents.
    """
    data_contents = data_contents if data_contents is not None else self.data_contents
    if data_contents is not None:
        return data_contents
    elif cache_dir is not None and filename is not None:
        return self.load_file(cache_dir=cache_dir, filename=filename)
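
A brief sketch of the two paths (rows and file names are illustrative): directly provided contents are returned as-is, and the file fallback is only used when no in-memory contents exist.

from tinybig.data.tabular_dataloader import tabular_dataloader

loader = tabular_dataloader()
rows = ['id_int,height_float,label_int\n', '0,1.7,0\n', '1,1.9,1\n']
print(loader.load_raw_data(data_contents=rows))  # returned unchanged
# With no in-memory contents, it falls back to load_file:
# loader.load_raw_data(cache_dir='./data/', filename='example.csv')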

normalize(input, normalize_type=None, normalize_range=None) staticmethod

Normalizes a tensor using specified normalization techniques.

Parameters:

input : Tensor, required
    The input tensor to be normalized.
normalize_type : str, default = None
    The type of normalization ('min_max' or 'mean_std').
normalize_range : list or tuple, default = None
    The range for min-max normalization.

Returns:

Tensor
    The normalized tensor.

Source code in tinybig/data/tabular_dataloader.py
@staticmethod
def normalize(input: torch.Tensor, normalize_type=None, normalize_range: list | tuple=None):
    """
    Normalizes a tensor using specified normalization techniques.

    Parameters
    ----------
    input : torch.Tensor
        The input tensor to be normalized.
    normalize_type : str, optional
        The type of normalization ('min_max' or 'mean_std').
    normalize_range : list or tuple, optional
        The range for min-max normalization.

    Returns
    -------
    torch.Tensor
        The normalized tensor.
    """
    if normalize_type == 'min_max':
        if normalize_range is None:
            min_value, max_value = (0, 1)
        else:
            min_value, max_value = normalize_range
        X_std = (input - input.numpy().min(axis=0)) / (input.numpy().max(axis=0) - input.numpy().min(axis=0))
        X_scaled = X_std * (max_value - min_value) + min_value
    elif normalize_type == 'mean_std':
        X_mean = torch.mean(input, dim=0, keepdim=True)
        X_std = torch.std(input, dim=0, keepdim=True)
        X_scaled = (input - X_mean) / X_std
    else:
        X_scaled = input
    return X_scaled
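
A small sketch of both modes on a toy tensor (values are illustrative); normalization is applied column-wise, and any other normalize_type returns the input unchanged.

import torch
from tinybig.data.tabular_dataloader import tabular_dataloader

X = torch.tensor([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
# Column-wise min-max scaling into [0, 1].
print(tabular_dataloader.normalize(X, normalize_type='min_max', normalize_range=(0, 1)))
# -> [[0.0, 0.0], [0.5, 0.5], [1.0, 1.0]]
# Column-wise standardization (zero mean, unit sample standard deviation).
print(tabular_dataloader.normalize(X, normalize_type='mean_std'))
# -> [[-1.0, -1.0], [0.0, 0.0], [1.0, 1.0]]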

process_data(data_contents=None, cache_dir=None, filename=None, str_converted_to_numerical=True, *args, **kwargs)

Processes raw tabular data into structured formats.

Parameters:

data_contents : list or tuple, default = None
    The raw data contents directly provided.
cache_dir : str, default = None
    The directory where the file is located.
filename : str, default = None
    The name of the file to be processed.
str_converted_to_numerical : bool, default = True
    Whether to convert string columns to numerical values.

Returns:

dict
    A dictionary containing processed data with metadata and contents.

Source code in tinybig/data/tabular_dataloader.py
def process_data(self, data_contents=None, cache_dir: str = None, filename: str = None,
                 str_converted_to_numerical: bool = True, *args, **kwargs):
    """
    Processes raw tabular data into structured formats.

    Parameters
    ----------
    data_contents : list or tuple, optional
        The raw data contents directly provided.
    cache_dir : str, optional
        The directory where the file is located.
    filename : str, optional
        The name of the file to be processed.
    str_converted_to_numerical : bool, default = True
        Whether to convert string columns to numerical values.

    Returns
    -------
    dict
        A dictionary containing processed data with metadata and contents.
    """
    rows = data_contents if data_contents is not None else self.load_file(cache_dir=cache_dir, filename=filename)

    data_dict = {
        'profile': {
            'name': self.name,
            'str_converted_to_numerical': str_converted_to_numerical,
            'column_names': [],  #[list of column names],
            'column_value_types': [],  #[list of column value types, e.g., float, int, str, etc.],
            'column_codings': [],  #[list of column index: {raw_values: coding_values}]
        },
        'contents': []  # [[vector of instance 1], [vector of instance 2], ... ]
    }


    headline = rows[0]
    column_names, column_value_types = [], []
    feature_name_value_type_list = headline.strip('\n').split(',')
    for feature_name_value_type in feature_name_value_type_list:
        name, value_type = feature_name_value_type.split('_')
        column_names.append(name)
        column_value_types.append(value_type)

    column_codings = [None]*len(column_names)
    contents = []
    nan_detect = False
    nan_columns = {}
    for row_index in range(1, len(rows)):
        row = rows[row_index]
        x = []
        features = row.strip('\n').split(',')
        for index in range(len(features)):
            feature = features[index]
            feature_value_type = column_value_types[index]
            if feature_value_type in ['int', 'integer', 'Int', 'Integer', 'INT', 'INTEGER']:
                try:
                    feature = int(feature)
                except:
                    nan_detect = True
                    nan_columns[index] = 1
                    feature = np.nan
            elif feature_value_type in ['float', 'Float', 'double', 'Double', 'FLOAT', 'DOUBLE']:
                try:
                    feature = float(feature)
                except:
                    nan_detect = True
                    nan_columns[index] = 1
                    feature = np.nan
            elif feature_value_type in ['str', 'string', 'Str', 'String', 'strings', 'Strings', 'STR', 'STRING', 'STRINGS']:
                if str_converted_to_numerical:
                    if column_codings[index] is None:
                        column_codings[index] = {}
                    if feature not in column_codings[index]:
                        column_codings[index][feature] = len(column_codings[index])
                    feature = column_codings[index][feature]
            x.append(feature)
        contents.append(x)
    data_dict['profile']['feature_names'] = column_names
    data_dict['profile']['column_value_types'] = column_value_types
    data_dict['profile']['column_codings'] = column_codings
    data_dict['profile']['row_number'] = len(contents)
    data_dict['profile']['column_number'] = len(column_names)
    data_dict['contents'] = contents

    if nan_detect:
        warnings.warn('The loaded dataset may contain NaN elements, and the columns containing nan are listed as follows', UserWarning)
        print(nan_columns.keys())
    return data_dict
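
An illustrative run on in-memory rows (the column names and values are hypothetical): the header encodes each column as name_type, and string columns are integer-coded when str_converted_to_numerical is True, with the codings recorded per column.

from tinybig.data.tabular_dataloader import tabular_dataloader

rows = [
    'id_int,color_str,label_int\n',
    '0,red,1\n',
    '1,blue,0\n',
    '2,red,1\n',
]
loader = tabular_dataloader()
data_dict = loader.process_data(data_contents=rows)
print(data_dict['profile']['column_codings'][1])  # {'red': 0, 'blue': 1}
print(data_dict['contents'])                      # [[0, 0, 1], [1, 1, 0], [2, 0, 1]]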

split(X, y, split_type='train_test_split', train_percentage=0.9, fold=10, random_state=1234, shuffle=True)

Splits the dataset into training and testing datasets.

Parameters:

X : Tensor, required
    The feature tensor.
y : Tensor, required
    The label tensor.
split_type : str, default = 'train_test_split'
    The type of splitting ('train_test_split' or 'KFold').
train_percentage : float, default = 0.9
    The proportion of data to be used for training in train-test split.
fold : int, default = 10
    Number of folds for cross-validation.
random_state : int, default = 1234
    Random state for reproducibility.
shuffle : bool, default = True
    Whether to shuffle the data before splitting.

Returns:

tuple
    Training and testing DataLoaders (or dictionaries in case of KFold).

Source code in tinybig/data/tabular_dataloader.py
def split(self, X, y, split_type='train_test_split', train_percentage=0.9, fold=10, random_state=1234, shuffle=True):
    """
    Splits the dataset into training and testing datasets.

    Parameters
    ----------
    X : torch.Tensor
        The feature tensor.
    y : torch.Tensor
        The label tensor.
    split_type : str, default = 'train_test_split'
        The type of splitting ('train_test_split' or 'KFold').
    train_percentage : float, default = 0.9
        The proportion of data to be used for training in train-test split.
    fold : int, default = 10
        Number of folds for cross-validation.
    random_state : int, default = 1234
        Random state for reproducibility.
    shuffle : bool, default = True
        Whether to shuffle the data before splitting.

    Returns
    -------
    tuple
        Training and testing DataLoaders (or dictionaries in case of KFold).
    """
    train_loader, test_loader = None, None
    if split_type == 'train_test_split':
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            train_size=int(train_percentage * len(X)),
            random_state=random_state, shuffle=shuffle
        )
        train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
        test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
        train_loader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
    elif split_type in ['KFold', 'cross_validation']:
        kf = KFold(n_splits=fold, random_state=random_state, shuffle=shuffle)
        train_loader, test_loader = {}, {}
        for i, (train_index, test_index) in enumerate(kf.split(X)):
            X_train, y_train = X[train_index], y[train_index]
            X_test, y_test = X[test_index], y[test_index]
            train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
            test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
            train_loader[i] = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
            test_loader[i] = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
    return train_loader, test_loader
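
A short sketch of both split modes on randomly generated tensors (purely illustrative; in practice X and y would come from load_complete_data).

import torch
from tinybig.data.tabular_dataloader import tabular_dataloader

loader = tabular_dataloader(train_batch_size=16, test_batch_size=16)
X, y = torch.randn(100, 5), torch.randint(0, 2, (100,)).float()

# Train/test split: two DataLoaders.
train_loader, test_loader = loader.split(X, y, split_type='train_test_split', train_percentage=0.9)

# K-fold cross-validation: two dictionaries of DataLoaders keyed by fold index.
train_folds, test_folds = loader.split(X, y, split_type='KFold', fold=5)
print(len(train_folds))  # 5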