Skip to content

function_dataloader

Bases: dataloader

A dataloader class for handling mathematical functions and equations.

This class extends the base dataloader class to provide functionality for loading equations, processing variables, and generating datasets based on mathematical formulas.

Attributes:

Name Type Description
name str

The name of the dataloader instance.

function_list list

A list of string representations of mathematical functions or equations.

equation_index int

The index of the currently selected equation.

Methods:

Name Description
__init__

Initializes the function dataloader.

load_equation

Loads and processes an equation from the function_list by its index.

load_all_equations

Loads and processes all equations in the function_list.

generate_data

Generates synthetic data based on a given formula and variable ranges.

load

Loads training and testing datasets for a specified equation.

Source code in tinybig/data/function_dataloader.py
class function_dataloader(dataloader):
    """
    A dataloader class for handling mathematical functions and equations.

    This class extends the base `dataloader` class to provide functionality for loading equations,
    processing variables, and generating datasets based on mathematical formulas.

    Each entry of `function_list` is a comma-separated record laid out as
    ``file_name,number,output,formula,variable_num`` followed by one
    ``name,low,high`` triple per variable.

    Attributes
    ----------
    name: str
        The name of the dataloader instance.
    function_list: list
        A list of string representations of mathematical functions or equations.
    equation_index: int
        The index of the currently selected equation.

    Methods
    -------
    __init__(name='function_dataloader', function_list: list = None, equation_index: int = 0, ...)
        Initializes the function dataloader.
    load_equation(index: int = 0)
        Loads and processes an equation from the `function_list` by its index.
    load_all_equations()
        Loads and processes all equations in the `function_list`.
    generate_data(formula: str, variables: dict, num: int = 2000, value_range: list = (0, 1), ...)
        Generates synthetic data based on a given formula and variable ranges.
    load(equation_index: int = None, ...)
        Loads training and testing datasets for a specified equation.
    """
    def __init__(self, name='function_dataloader', function_list: list = None, equation_index: int = 0, train_batch_size=64, test_batch_size=64):
        """
        Initializes the function dataloader with a list of equations and configurations.

        Parameters
        ----------
        name: str, default = 'function_dataloader'
            The name of the dataloader instance.
        function_list: list, default = None
            A list of string representations of mathematical functions or equations.
            When omitted (or None), the instance starts with its own empty list.
        equation_index: int, default = 0
            The index of the currently selected equation.
        train_batch_size: int, default = 64
            The batch size for training data.
        test_batch_size: int, default = 64
            The batch size for testing data.

        Returns
        -------
        None
        """
        super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size)
        self.equation_index = equation_index
        # A `[]` default in the signature is a single object shared by every call;
        # building the empty list here keeps default-constructed instances independent.
        self.function_list = function_list if function_list is not None else []

    def load_equation(self, index: int = 0):
        """
        Loads and processes a specific equation from the function list.

        Parameters
        ----------
        index: int, default = 0
            The index of the equation to load. If None, the default `equation_index` is used.
            If `equation_index` is None as well, all equations are loaded instead.

        Returns
        -------
        tuple
            A tuple containing the processed equation dictionary and the original string representation.
            (When both `index` and `equation_index` are None, the dictionary produced by
            `load_all_equations` is returned instead.)

        Raises
        ------
        AssertionError
            If the index is out of range of the `function_list`.
        ValueError
            If a numeric field of the record cannot be converted by `int` / `float`.
        IndexError
            If the record has fewer comma-separated fields than its variable count requires.
        """
        index = index if index is not None else self.equation_index

        if index is None:
            return self.load_all_equations()

        # Raised explicitly instead of through an `assert` statement so that the
        # range check is still performed when Python runs with -O.
        if index not in range(0, len(self.function_list)):
            raise AssertionError(
                'index {} is out of range for a function_list of length {}'.format(index, len(self.function_list)))

        str_equation = self.function_list[index]
        equation_contents = str_equation.split(',')
        processed_equation = {
            'equ_file_name': equation_contents[0],
            'equ_number': int(equation_contents[1]),
            'equ_output': equation_contents[2],
            'equ_formula': equation_contents[3],
            'equ_variable_num': int(equation_contents[4]),
            'equ_variables': {},
        }
        # Variables are stored as (name, low, high) triples starting at field 5.
        for var_index in range(0, processed_equation['equ_variable_num']):
            offset = 5 + var_index * 3
            var_name = equation_contents[offset]
            if var_name is None or var_name == '':
                # An empty name ends the variable list early.
                break
            processed_equation['equ_variables'][var_index] = {
                'var_name': var_name,
                'var_low': float(equation_contents[offset + 1]),
                'var_high': float(equation_contents[offset + 2])
            }
        return processed_equation, str_equation

    def load_all_equations(self):
        """
        Loads and processes all equations in the `function_list`.

        Returns
        -------
        dict
            A dictionary where each key is an equation index, and the value is a tuple containing the
            processed equation dictionary and the original string representation.
        """
        return {
            index: self.load_equation(index=index)
            for index in range(0, len(self.function_list))
        }

    @staticmethod
    def generate_data(formula: str, variables: dict, num: int = 2000, value_range: list = (0, 1),
                      normalize_X: bool = False, normalize_y: bool = False, *args, **kwargs):
        """
        Generates synthetic data based on a formula and variable ranges.

        Parameters
        ----------
        formula: str
            The mathematical formula to generate data for.
        variables: dict
            A dictionary whose values are dictionaries with the keys
            'var_name', 'var_low' and 'var_high'.
        num: int, default = 2000
            The number of data samples to generate.
        value_range: list, default = (0, 1)
            The fallback (low, high) bounds used for a variable whose
            'var_low' / 'var_high' entry is the empty string.
        normalize_X: bool, default = False
            Whether to normalize the input features (subtract the mean and divide by the
            standard deviation computed along dim 0).
        normalize_y: bool, default = False
            Whether to normalize the output values in the same way.

        Returns
        -------
        tuple
            A tuple containing input features `X` and output values `y` as tensors,
            with one entry per generated sample.

        Raises
        ------
        AssertionError
            If variable ranges are not properly specified.
        """
        var_name_list = []
        var_value_space = []
        for key in variables:
            spec = variables[key]
            var_name_list.append(spec['var_name'])
            var_low = spec['var_low'] if spec['var_low'] != '' else value_range[0]
            var_high = spec['var_high'] if spec['var_high'] != '' else value_range[1]
            # Explicit raise (not an `assert` statement) so the check survives python -O.
            if var_low is None or var_high is None:
                raise AssertionError(
                    'variable {!r} has no usable value range'.format(spec['var_name']))
            # torch.rand draws from [0, 1); rescale the draws into [var_low, var_high).
            var_value_space.append(var_low + (var_high - var_low) * torch.rand(num))

        # Held in a separate name so the `variables` argument is not overwritten.
        variable_names = ' '.join(var_name_list)
        func = function.string_to_function(formula, variable_names)

        X = []
        y = []
        # Each zip item holds one sampled value per variable, i.e. one data point.
        for var_values in zip(*var_value_space):
            X.append(var_values)
            y.append(func(*var_values))
        X = torch.Tensor(X)
        y = torch.Tensor(y)

        if normalize_X:
            X_mean = torch.mean(X, dim=0, keepdim=True)
            X_std = torch.std(X, dim=0, keepdim=True)
            X = (X - X_mean)/X_std
        if normalize_y:
            y_mean = torch.mean(y, dim=0, keepdim=True)
            y_std = torch.std(y, dim=0, keepdim=True)
            y = (y - y_mean) / y_std

        return X, y

    def load(self, equation_index: int = None, cache_dir='./data/', num=2000,
             train_percentage=0.5, random_state=1234, shuffle=False, *args, **kwargs):
        """
        Loads training and testing datasets for a specified equation.

        Parameters
        ----------
        equation_index: int, default = None
            The index of the equation to load. If None, the default `equation_index` is used.
        cache_dir: str, default = './data/'
            The directory for caching data. Accepted but not used by this method.
        num: int, default = 2000
            The number of data samples to generate.
        train_percentage: float, default = 0.5
            The fraction of the generated samples to use for training; the training set
            size is `int(train_percentage * num_samples)`.
        random_state: int, default = 1234
            The random seed forwarded to `train_test_split`.
        shuffle: bool, default = False
            Whether to shuffle the data before splitting.

        Returns
        -------
        dict
            A dictionary with the keys 'train_loader', 'test_loader' and 'str_equation'.

        Raises
        ------
        ValueError
            If the `equation_index` is invalid or out of range.
        """
        equation_index = equation_index if equation_index is not None else self.equation_index

        if type(equation_index) is not int or equation_index not in range(0, len(self.function_list)):
            raise ValueError('The equation_index needs to be an integer from 0 to {}, '
                             'its current value {} is out of range...'.format(len(self.function_list)-1, equation_index))

        processed_equation, str_equation = self.load_equation(index=equation_index)

        X, y = self.generate_data(
            formula=processed_equation['equ_formula'],
            variables=processed_equation['equ_variables'],
            num=num,
            normalize_X=False,
            normalize_y=False,
        )
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            train_size=int(train_percentage*len(X)),
            random_state=random_state, shuffle=shuffle
        )

        # Targets get a trailing dimension of size 1 before being wrapped in datasets.
        train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
        test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))
        # Only the training loader reshuffles its samples.
        train_loader = DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False)
        return {'train_loader': train_loader, 'test_loader': test_loader, 'str_equation': str_equation}

__init__(name='function_dataloader', function_list=[], equation_index=0, train_batch_size=64, test_batch_size=64)

Initializes the function dataloader with a list of equations and configurations.

Parameters:

Name Type Description Default
name

The name of the dataloader instance.

'function_dataloader'
function_list list

A list of string representations of mathematical functions or equations.

[]
equation_index int

The index of the currently selected equation.

0
train_batch_size

The batch size for training data.

64
test_batch_size

The batch size for testing data.

64

Returns:

Type Description
None
Source code in tinybig/data/function_dataloader.py
def __init__(self, name='function_dataloader', function_list: list = None, equation_index: int = 0, train_batch_size=64, test_batch_size=64):
    """
    Initializes the function dataloader with a list of equations and configurations.

    Parameters
    ----------
    name: str, default = 'function_dataloader'
        The name of the dataloader instance.
    function_list: list, default = None
        A list of string representations of mathematical functions or equations.
        When omitted (or None), the instance starts with its own empty list.
    equation_index: int, default = 0
        The index of the currently selected equation.
    train_batch_size: int, default = 64
        The batch size for training data.
    test_batch_size: int, default = 64
        The batch size for testing data.

    Returns
    -------
    None
    """
    super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size)
    self.equation_index = equation_index
    # A `[]` default in the signature is a single object shared by every call;
    # building the empty list here keeps default-constructed instances independent.
    self.function_list = function_list if function_list is not None else []

generate_data(formula, variables, num=2000, value_range=(0, 1), normalize_X=False, normalize_y=False, *args, **kwargs) staticmethod

Generates synthetic data based on a formula and variable ranges.

Parameters:

Name Type Description Default
formula str

The mathematical formula to generate data for.

required
variables dict

A dictionary of variable names and their respective ranges.

required
num int

The number of data samples to generate.

2000
value_range list

The default range for variables if not specified in variables.

(0, 1)
normalize_X bool

Whether to normalize the input features.

False
normalize_y bool

Whether to normalize the output values.

False

Returns:

Type Description
tuple

A tuple containing input features X and output values y as tensors.

Raises:

Type Description
AssertionError

If variable ranges are not properly specified.

Source code in tinybig/data/function_dataloader.py
@staticmethod
def generate_data(formula: str, variables: dict, num: int = 2000, value_range: list = (0, 1),
                  normalize_X: bool = False, normalize_y: bool = False, *args, **kwargs):
    """
    Sample random inputs for every variable and evaluate a formula on them.

    Parameters
    ----------
    formula: str
        The formula string handed to `function.string_to_function`.
    variables: dict
        A dictionary whose values are dictionaries with the keys
        'var_name', 'var_low' and 'var_high'.
    num: int, default = 2000
        How many samples to draw per variable.
    value_range: list, default = (0, 1)
        Fallback (low, high) bounds, used when a variable's 'var_low' /
        'var_high' entry is the empty string.
    normalize_X: bool, default = False
        If True, subtract the dim-0 mean from `X` and divide by the dim-0 standard deviation.
    normalize_y: bool, default = False
        If True, apply the same standardization to `y`.

    Returns
    -------
    tuple
        `(X, y)` as tensors: the sampled inputs and the formula outputs.

    Raises
    ------
    AssertionError
        If a lower or upper bound resolves to None.
    """
    names = []
    columns = []
    for key in variables:
        spec = variables[key]
        names.append(spec['var_name'])
        low = value_range[0] if spec['var_low'] == '' else spec['var_low']
        high = value_range[1] if spec['var_high'] == '' else spec['var_high']
        assert var_bounds_present(low, high) if False else (low is not None and high is not None)
        # torch.rand draws from [0, 1); rescale the draws into [low, high).
        columns.append(low + (high - low) * torch.rand(num))

    func = function.string_to_function(formula, ' '.join(names))

    # Each zip item holds one sampled value per variable, i.e. one data point.
    rows = []
    outputs = []
    for point in zip(*columns):
        rows.append(point)
        outputs.append(func(*point))
    X = torch.Tensor(rows)
    y = torch.Tensor(outputs)

    if normalize_X:
        X = (X - torch.mean(X, dim=0, keepdim=True))/torch.std(X, dim=0, keepdim=True)
    if normalize_y:
        y = (y - torch.mean(y, dim=0, keepdim=True)) / torch.std(y, dim=0, keepdim=True)

    return X, y

load(equation_index=None, cache_dir='./data/', num=2000, train_percentage=0.5, random_state=1234, shuffle=False, *args, **kwargs)

Loads training and testing datasets for a specified equation.

Parameters:

Name Type Description Default
equation_index int

The index of the equation to load. If None, the default equation_index is used.

None
cache_dir

The directory for caching data (accepted but not used by this method).

'./data/'
num

The number of data samples to generate.

2000
train_percentage

The fraction of the generated samples (a value between 0 and 1) to use for training.

0.5
random_state

The random seed passed to the train/test split for reproducibility.

1234
shuffle

Whether to shuffle the data before splitting.

False

Returns:

Type Description
dict

A dictionary containing training and testing data loaders, and the equation string.

Raises:

Type Description
ValueError

If the equation_index is invalid or out of range.

Source code in tinybig/data/function_dataloader.py
def load(self, equation_index: int = None, cache_dir='./data/', num=2000,
         train_percentage=0.5, random_state=1234, shuffle=False, *args, **kwargs):
    """
    Build train/test data loaders from samples generated for one equation.

    Parameters
    ----------
    equation_index: int, default = None
        Which entry of `function_list` to use. If None, `self.equation_index` is used.
    cache_dir: str, default = './data/'
        The directory for caching data. Accepted but not used by this method.
    num: int, default = 2000
        The number of samples requested from `generate_data`.
    train_percentage: float, default = 0.5
        The fraction of samples placed in the training split; the training
        size is `int(train_percentage * len(X))`.
    random_state: int, default = 1234
        The random seed forwarded to `train_test_split`.
    shuffle: bool, default = False
        Whether `train_test_split` shuffles the samples before splitting.

    Returns
    -------
    dict
        A dictionary with the keys 'train_loader', 'test_loader' and 'str_equation'.

    Raises
    ------
    ValueError
        If the resolved `equation_index` is not exactly an `int` or lies
        outside the bounds of `function_list`.
    """
    if equation_index is None:
        equation_index = self.equation_index

    equation_count = len(self.function_list)
    # Exact type check: bool and other int look-alikes are rejected on purpose.
    index_is_valid = type(equation_index) is int and equation_index in range(0, equation_count)
    if not index_is_valid:
        raise ValueError('The equation_index needs to be an integer from 0 to {}, '
                         'its current value {} is out of range...'.format(equation_count-1, equation_index))

    equation, str_equation = self.load_equation(index=equation_index)

    X, y = self.generate_data(
        formula=equation['equ_formula'],
        variables=equation['equ_variables'],
        num=num,
        normalize_X=False,
        normalize_y=False,
    )

    train_count = int(train_percentage*len(X))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=train_count,
        random_state=random_state, shuffle=shuffle
    )

    # Targets get a trailing dimension of size 1 before being wrapped in datasets.
    train_dataset = dataset(X_train, torch.unsqueeze(y_train, 1))
    test_dataset = dataset(X_test, torch.unsqueeze(y_test, 1))

    # Only the training loader reshuffles its samples.
    return {
        'train_loader': DataLoader(train_dataset, batch_size=self.train_batch_size, shuffle=True),
        'test_loader': DataLoader(test_dataset, batch_size=self.test_batch_size, shuffle=False),
        'str_equation': str_equation,
    }

load_all_equations()

Loads and processes all equations in the function_list.

Returns:

Type Description
dict

A dictionary where each key is an equation index, and the value is a tuple containing the processed equation dictionary and the original string representation.

Source code in tinybig/data/function_dataloader.py
def load_all_equations(self):
    """
    Process every entry of `function_list` via `load_equation`.

    Returns
    -------
    dict
        Maps each position in `function_list` to whatever `load_equation`
        returns for that position (the processed equation dictionary paired
        with the original string representation).
    """
    return {
        position: self.load_equation(index=position)
        for position in range(len(self.function_list))
    }

load_equation(index=0)

Loads and processes a specific equation from the function list.

Parameters:

Name Type Description Default
index int

The index of the equation to load. If None, the default equation_index is used.

0

Returns:

Type Description
tuple

A tuple containing the processed equation dictionary and the original string representation.

Raises:

Type Description
AssertionError

If the index is out of range of the function_list.

Source code in tinybig/data/function_dataloader.py
def load_equation(self, index: int = 0):
    """
    Loads and processes a specific equation from the function list.

    Each record is a comma-separated string laid out as
    ``file_name,number,output,formula,variable_num`` followed by one
    ``name,low,high`` triple per variable.

    Parameters
    ----------
    index: int, default = 0
        The index of the equation to load. If None, the default `equation_index` is used.
        If `equation_index` is None as well, all equations are loaded instead.

    Returns
    -------
    tuple
        A tuple containing the processed equation dictionary and the original string representation.
        (When both `index` and `equation_index` are None, the dictionary produced by
        `load_all_equations` is returned instead.)

    Raises
    ------
    AssertionError
        If the index is out of range of the `function_list`.
    ValueError
        If a numeric field of the record cannot be converted by `int` / `float`.
    IndexError
        If the record has fewer comma-separated fields than its variable count requires.
    """
    index = index if index is not None else self.equation_index

    if index is None:
        return self.load_all_equations()

    # Raised explicitly instead of through an `assert` statement so that the
    # range check is still performed when Python runs with -O.
    if index not in range(0, len(self.function_list)):
        raise AssertionError(
            'index {} is out of range for a function_list of length {}'.format(index, len(self.function_list)))

    str_equation = self.function_list[index]
    equation_contents = str_equation.split(',')
    processed_equation = {
        'equ_file_name': equation_contents[0],
        'equ_number': int(equation_contents[1]),
        'equ_output': equation_contents[2],
        'equ_formula': equation_contents[3],
        'equ_variable_num': int(equation_contents[4]),
        'equ_variables': {},
    }
    # Variables are stored as (name, low, high) triples starting at field 5.
    for var_index in range(0, processed_equation['equ_variable_num']):
        offset = 5 + var_index * 3
        var_name = equation_contents[offset]
        if var_name is None or var_name == '':
            # An empty name ends the variable list early.
            break
        processed_equation['equ_variables'][var_index] = {
            'var_name': var_name,
            'var_low': float(equation_contents[offset + 1]),
            'var_high': float(equation_contents[offset + 2])
        }
    return processed_equation, str_equation