graph_dataloader

Bases: dataloader

A dataloader class for graph-based datasets.

This class extends the base dataloader class to handle graph data, including nodes, links, and associated features.

Attributes:

    data_profile (dict): The data profile containing metadata and download links for the graph dataset.
    graph (graph): The loaded graph structure.

Methods:

    __init__: Initializes the graph dataloader.
    download_data: Downloads the graph dataset files.
    load_raw: Loads the raw graph data from files.
    save_graph: Saves the graph structure to a file.
    load_graph: Loads the graph structure from a file.
    get_graph: Retrieves the loaded graph structure.
    get_adj: Retrieves the adjacency matrix of the graph.
    load: Loads the dataset in either transductive or inductive mode.
    get_train_test_idx: Abstract method to generate train and test indices for the dataset.

Source code in tinybig/data/graph_dataloader.py
class graph_dataloader(dataloader):
    """
    A dataloader class for graph-based datasets.

    This class extends the base `dataloader` class to handle graph data, including nodes, links, and associated features.

    Attributes
    ----------
    data_profile: dict
        The data profile containing metadata and download links for the graph dataset.
    graph: graph_class
        The loaded graph structure.

    Methods
    -------
    __init__(data_profile: dict = None, ...)
        Initializes the graph dataloader.
    download_data(data_profile: dict, cache_dir: str = None, file_name: str = None)
        Downloads the graph dataset files.
    load_raw(cache_dir: str, device: str = 'cpu', normalization: bool = True, ...)
        Loads the raw graph data from files.
    save_graph(complete_path: str, graph: graph_class = None)
        Saves the graph structure to a file.
    load_graph(complete_path: str)
        Loads the graph structure from a file.
    get_graph()
        Retrieves the loaded graph structure.
    get_adj(graph: graph_class = None)
        Retrieves the adjacency matrix of the graph.
    load(mode: str = 'transductive', ...)
        Loads the dataset, either in transductive or inductive mode.
    get_train_test_idx(X: torch.Tensor = None, y: torch.Tensor = None, ...)
        Abstract method to generate train and test indices for the dataset.
    """

    def __init__(self, data_profile: dict = None, name: str = 'graph_data', train_batch_size: int = 64, test_batch_size: int = 64):
        """
        Initializes the graph dataloader.

        Parameters
        ----------
        data_profile: dict, optional
            Metadata and download links for the graph dataset.
        name: str, default = 'graph_data'
            The name of the dataloader instance.
        train_batch_size: int, default = 64
            Batch size for the training dataset.
        test_batch_size: int, default = 64
            Batch size for the testing dataset.

        Returns
        -------
        None
        """
        super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size)

        self.data_profile = data_profile
        self.graph = None

    @staticmethod
    def download_data(data_profile: dict, cache_dir: str = None, file_name: str = None):
        """
        Downloads the graph dataset files.

        Parameters
        ----------
        data_profile: dict
            Metadata and download links for the graph dataset.
        cache_dir: str, optional
            Directory to store the downloaded files. Defaults to './data/'.
        file_name: str, optional
            Specific file name to download. If None, all files in the `data_profile` are downloaded.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `data_profile` is None or doesn't contain the 'url' key.
        """
        if data_profile is None:
            raise ValueError('The data profile must be provided.')

        if cache_dir is None:
            cache_dir = './data/'

        if data_profile is None or 'url' not in data_profile:
            raise ValueError('data_profile must not be None and should contain "url" key...')

        if file_name is None:
            for file_name in data_profile['url']:
                download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))
        else:
            assert file_name in data_profile['url']
            download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))


    def load_raw(self, cache_dir: str, device: str = 'cpu', normalization: bool = True, normalization_mode: str = 'row'):
        """
        Loads the raw graph data from files.

        Parameters
        ----------
        cache_dir: str
            Directory containing the graph data files.
        device: str, default = 'cpu'
            Device to store the data.
        normalization: bool, default = True
            Whether to normalize the node features.
        normalization_mode: str, default = 'row'
            Mode of normalization ('row' or 'column').

        Returns
        -------
        tuple
            The graph structure, node features (X), and labels (y).

        Raises
        ------
        FileNotFoundError
            If the required files are not found in the cache directory.
        """
        if not check_file_existence("{}/node".format(cache_dir)):
            self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='node')
        if not check_file_existence("{}/link".format(cache_dir)):
            self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='link')

        idx_features_labels = np.genfromtxt("{}/node".format(cache_dir), dtype=np.dtype(str))
        X = torch.tensor(sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32).todense())
        y = dataloader.encode_str_labels(labels=idx_features_labels[:, -1], one_hot=False)

        if normalization:
            X = degree_based_normalize_matrix(mx=X, mode=normalization_mode)

        nodes = np.array(idx_features_labels[:, 0], dtype=np.int32).tolist()
        links = np.genfromtxt("{}/link".format(cache_dir), dtype=np.int32).tolist()
        graph = graph_class(
            nodes=nodes, links=links, directed=True, device=device
        )
        return graph, X, y

    def save_graph(self, complete_path: str, graph: graph_class = None):
        """
        Saves the graph structure to a file.

        Parameters
        ----------
        complete_path: str
            The file path to save the graph structure.
        graph: graph_class, optional
            The graph structure to save. If None, the internal graph is used.

        Returns
        -------
        str
            The path to the saved graph file.

        Raises
        ------
        ValueError
            If no graph structure is loaded or the path is not provided.
        """
        graph = graph if graph is not None else self.graph
        if graph is None:
            raise ValueError('The graph structure has not been loaded yet...')
        if complete_path is None:
            raise ValueError('The cache complete_path has not been set yet...')
        return graph.save(complete_path=complete_path)

    def load_graph(self, complete_path: str):
        """
        Loads the graph structure from a file.

        Parameters
        ----------
        complete_path: str
            The file path to load the graph structure from.

        Returns
        -------
        graph_class
            The loaded graph structure.

        Raises
        ------
        ValueError
            If the file path is not provided.
        """
        if complete_path is None:
            raise ValueError('The cache complete_path has not been set yet...')
        self.graph = graph_class.load(complete_path=complete_path)
        return self.graph

    def get_graph(self):
        """
        Retrieves the loaded graph structure.

        Returns
        -------
        graph_class
            The loaded graph structure, or None if no graph is loaded.
        """
        return self.graph

    def get_adj(self, graph: graph_class = None):
        """
        Retrieves the adjacency matrix of the graph.

        Parameters
        ----------
        graph: graph_class, optional
            The graph structure to use. If None, the internal graph is used.

        Returns
        -------
        torch.Tensor
            The adjacency matrix of the graph.

        Raises
        ------
        ValueError
            If no graph structure is loaded.
        """
        graph = graph if graph is not None else self.graph
        if graph is None:
            raise ValueError('The graph structure has not been loaded yet...')
        return graph.to_matrix(
            normalization=True,
            normalization_mode='row',
        )

    def load(self, mode: str = 'transductive', cache_dir: str = None, device: str = 'cpu',
             train_percentage: float = 0.5, random_state: int = 1234, shuffle: bool = False, *args, **kwargs):
        """
        Loads the dataset in either transductive or inductive mode.

        Parameters
        ----------
        mode: str, default = 'transductive'
            Mode of loading the dataset ('transductive' or 'inductive').
        cache_dir: str, optional
            Directory containing the graph data files. Defaults to './data/{name}'.
        device: str, default = 'cpu'
            Device to store the data.
        train_percentage: float, default = 0.5
            Percentage of data to use for training in inductive mode.
        random_state: int, default = 1234
            Seed for random number generation.
        shuffle: bool, default = False
            Whether to shuffle the data.

        Returns
        -------
        dict
            A dictionary containing train/test loaders and graph structure.

        Raises
        ------
        ValueError
            If required files are not found or the graph is not properly loaded.
        """
        cache_dir = cache_dir if cache_dir is not None else "./data/{}".format(self.name)
        self.graph, X, y = self.load_raw(cache_dir=cache_dir, device=device)

        if mode == 'transductive':
            warnings.warn("For transductive settings, the train, test, and val partition will not follow the provided parameters (e.g., train percentage, batch size, etc.)...")
            train_idx, test_idx = self.get_train_test_idx(X=X, y=y)
            complete_dataset = dataset(X, y)
            complete_dataloader = DataLoader(dataset=complete_dataset, batch_size=len(X), shuffle=False)
            return {'train_idx': train_idx, 'test_idx': test_idx, 'train_loader': complete_dataloader, 'test_loader': complete_dataloader, 'graph_structure': self.graph}
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                train_size=int(train_percentage * len(X)),
                random_state=random_state, shuffle=shuffle
            )
            train_dataset = dataset(X_train, y_train)
            test_dataset = dataset(X_test, y_test)
            if self.train_batch_size >= 1:
                train_loader = DataLoader(dataset=train_dataset, batch_size=self.train_batch_size, shuffle=True)
            else:
                train_loader = DataLoader(dataset=train_dataset, batch_size=len(X_train), shuffle=True)
            if self.test_batch_size >= 1:
                test_loader = DataLoader(dataset=test_dataset, batch_size=self.test_batch_size, shuffle=False)
            else:
                test_loader = DataLoader(dataset=test_dataset, batch_size=len(X_test), shuffle=False)
            return {'train_loader': train_loader, 'test_loader': test_loader, 'graph_structure': self.graph}

    @abstractmethod
    def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
        """
        Abstract method to generate train and test indices for the dataset.

        Parameters
        ----------
        X: torch.Tensor, optional
            Node features.
        y: torch.Tensor, optional
            Labels.

        Returns
        -------
        tuple
            Train and test indices.
        """
        pass
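
As a rough usage sketch: graph_dataloader is abstract (get_train_test_idx must be implemented), so it is normally used through a concrete subclass. In the sketch below, the subclass name, the data_profile URLs, and the cache directory are illustrative placeholders, and the import path is assumed from the source location shown above.

# Illustrative only: a minimal concrete subclass with placeholder URLs and a
# naive index split; none of these names are part of the tinybig library itself.
import torch
from tinybig.data.graph_dataloader import graph_dataloader

class my_graph_dataloader(graph_dataloader):
    def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
        # naive split for illustration: first half of the nodes train, second half test
        n = X.shape[0]
        return torch.arange(0, n // 2), torch.arange(n // 2, n)

data_profile = {
    'url': {
        'node': 'https://example.com/placeholder/node',  # placeholder download link
        'link': 'https://example.com/placeholder/link',  # placeholder download link
    },
}

loader = my_graph_dataloader(data_profile=data_profile, name='my_graph_data')
result = loader.load(mode='transductive', cache_dir='./data/my_graph_data')
print(result.keys())  # train_idx, test_idx, train_loader, test_loader, graph_structure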

__init__(data_profile=None, name='graph_data', train_batch_size=64, test_batch_size=64)

Initializes the graph dataloader.

Parameters:

    data_profile (dict, default = None): Metadata and download links for the graph dataset.
    name (str, default = 'graph_data'): The name of the dataloader instance.
    train_batch_size (int, default = 64): Batch size for the training dataset.
    test_batch_size (int, default = 64): Batch size for the testing dataset.

Returns:

    None
Source code in tinybig/data/graph_dataloader.py
def __init__(self, data_profile: dict = None, name: str = 'graph_data', train_batch_size: int = 64, test_batch_size: int = 64):
    """
    Initializes the graph dataloader.

    Parameters
    ----------
    data_profile: dict, optional
        Metadata and download links for the graph dataset.
    name: str, default = 'graph_data'
        The name of the dataloader instance.
    train_batch_size: int, default = 64
        Batch size for the training dataset.
    test_batch_size: int, default = 64
        Batch size for the testing dataset.

    Returns
    -------
    None
    """
    super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size)

    self.data_profile = data_profile
    self.graph = None
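
A hedged sketch of the constructor call, reusing the illustrative my_graph_dataloader subclass from above; only the 'url' mapping of data_profile is read by this class, and the URLs are placeholders.

data_profile = {
    'url': {
        'node': 'https://example.com/placeholder/node',  # file with node ids, features, and labels
        'link': 'https://example.com/placeholder/link',  # file with the edge list
    },
}

loader = my_graph_dataloader(
    data_profile=data_profile,
    name='my_graph_data',   # also determines the default cache directory ./data/my_graph_data
    train_batch_size=64,
    test_batch_size=64,
)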

download_data(data_profile, cache_dir=None, file_name=None) staticmethod

Downloads the graph dataset files.

Parameters:

    data_profile (dict, required): Metadata and download links for the graph dataset.
    cache_dir (str, default = None): Directory to store the downloaded files. Defaults to './data/'.
    file_name (str, default = None): Specific file name to download. If None, all files in the data_profile are downloaded.

Returns:

    None

Raises:

    ValueError: If data_profile is None or doesn't contain the 'url' key.

Source code in tinybig/data/graph_dataloader.py
@staticmethod
def download_data(data_profile: dict, cache_dir: str = None, file_name: str = None):
    """
    Downloads the graph dataset files.

    Parameters
    ----------
    data_profile: dict
        Metadata and download links for the graph dataset.
    cache_dir: str, optional
        Directory to store the downloaded files. Defaults to './data/'.
    file_name: str, optional
        Specific file name to download. If None, all files in the `data_profile` are downloaded.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `data_profile` is None or doesn't contain the 'url' key.
    """
    if data_profile is None:
        raise ValueError('The data profile must be provided.')

    if cache_dir is None:
        cache_dir = './data/'

    if data_profile is None or 'url' not in data_profile:
        raise ValueError('data_profile must not be None and should contain "url" key...')

    if file_name is None:
        for file_name in data_profile['url']:
            download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))
    else:
        assert file_name in data_profile['url']
        download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))
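
Since download_data is a static method, it can be called without instantiating a loader. A sketch with placeholder URLs (the helper used internally is download_file_from_github, which suggests the links are expected to point at GitHub-hosted files):

from tinybig.data.graph_dataloader import graph_dataloader

data_profile = {
    'url': {
        'node': 'https://example.com/placeholder/node',  # placeholder link
        'link': 'https://example.com/placeholder/link',  # placeholder link
    },
}

# download every file listed under 'url' into the cache directory
graph_dataloader.download_data(data_profile=data_profile, cache_dir='./data/my_graph_data')

# or fetch a single named file; the name must be a key of data_profile['url']
graph_dataloader.download_data(data_profile=data_profile, cache_dir='./data/my_graph_data', file_name='link')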

get_adj(graph=None)

Retrieves the adjacency matrix of the graph.

Parameters:

    graph (graph, default = None): The graph structure to use. If None, the internal graph is used.

Returns:

    Tensor: The adjacency matrix of the graph.

Raises:

    ValueError: If no graph structure is loaded.

Source code in tinybig/data/graph_dataloader.py
def get_adj(self, graph: graph_class = None):
    """
    Retrieves the adjacency matrix of the graph.

    Parameters
    ----------
    graph: graph_class, optional
        The graph structure to use. If None, the internal graph is used.

    Returns
    -------
    torch.Tensor
        The adjacency matrix of the graph.

    Raises
    ------
    ValueError
        If no graph structure is loaded.
    """
    graph = graph if graph is not None else self.graph
    if graph is None:
        raise ValueError('The graph structure has not been loaded yet...')
    return graph.to_matrix(
        normalization=True,
        normalization_mode='row',
    )
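
A brief sketch continuing the illustrative loader from above; the returned matrix is the row-normalized adjacency produced by graph.to_matrix.

loader.load(mode='transductive', cache_dir='./data/my_graph_data')  # populates loader.graph
adj = loader.get_adj()        # row-normalized adjacency matrix of the internal graph
graph = loader.get_graph()    # the underlying graph object itself (None before loading)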

get_graph()

Retrieves the loaded graph structure.

Returns:

    graph: The loaded graph structure, or None if no graph is loaded.

Source code in tinybig/data/graph_dataloader.py
def get_graph(self):
    """
    Retrieves the loaded graph structure.

    Returns
    -------
    graph_class
        The loaded graph structure, or None if no graph is loaded.
    """
    return self.graph

get_train_test_idx(X=None, y=None, *args, **kwargs) abstractmethod

Abstract method to generate train and test indices for the dataset.

Parameters:

    X (Tensor, default = None): Node features.
    y (Tensor, default = None): Labels.

Returns:

    tuple: Train and test indices.

Source code in tinybig/data/graph_dataloader.py
@abstractmethod
def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
    """
    Abstract method to generate train and test indices for the dataset.

    Parameters
    ----------
    X: torch.Tensor, optional
        Node features.
    y: torch.Tensor, optional
        Labels.

    Returns
    -------
    tuple
        Train and test indices.
    """
    pass
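
Every concrete subclass has to implement this method; the random split below is purely illustrative and not taken from the library.

import torch
from tinybig.data.graph_dataloader import graph_dataloader

class random_split_graph_dataloader(graph_dataloader):
    def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
        # assign roughly half of the node indices to train and the rest to test, at random
        perm = torch.randperm(X.shape[0])
        split = X.shape[0] // 2
        return perm[:split], perm[split:]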

load(mode='transductive', cache_dir=None, device='cpu', train_percentage=0.5, random_state=1234, shuffle=False, *args, **kwargs)

Loads the dataset in either transductive or inductive mode.

Parameters:

    mode (str, default = 'transductive'): Mode of loading the dataset ('transductive' or 'inductive').
    cache_dir (str, default = None): Directory containing the graph data files. Defaults to './data/{name}'.
    device (str, default = 'cpu'): Device to store the data.
    train_percentage (float, default = 0.5): Percentage of data to use for training in inductive mode.
    random_state (int, default = 1234): Seed for random number generation.
    shuffle (bool, default = False): Whether to shuffle the data.

Returns:

    dict: A dictionary containing train/test loaders and graph structure.

Raises:

    ValueError: If required files are not found or the graph is not properly loaded.

Source code in tinybig/data/graph_dataloader.py
def load(self, mode: str = 'transductive', cache_dir: str = None, device: str = 'cpu',
         train_percentage: float = 0.5, random_state: int = 1234, shuffle: bool = False, *args, **kwargs):
    """
    Loads the dataset in either transductive or inductive mode.

    Parameters
    ----------
    mode: str, default = 'transductive'
        Mode of loading the dataset ('transductive' or 'inductive').
    cache_dir: str, optional
        Directory containing the graph data files. Defaults to './data/{name}'.
    device: str, default = 'cpu'
        Device to store the data.
    train_percentage: float, default = 0.5
        Percentage of data to use for training in inductive mode.
    random_state: int, default = 1234
        Seed for random number generation.
    shuffle: bool, default = False
        Whether to shuffle the data.

    Returns
    -------
    dict
        A dictionary containing train/test loaders and graph structure.

    Raises
    ------
    ValueError
        If required files are not found or the graph is not properly loaded.
    """
    cache_dir = cache_dir if cache_dir is not None else "./data/{}".format(self.name)
    self.graph, X, y = self.load_raw(cache_dir=cache_dir, device=device)

    if mode == 'transductive':
        warnings.warn("For transductive settings, the train, test, and val partition will not follow the provided parameters (e.g., train percentage, batch size, etc.)...")
        train_idx, test_idx = self.get_train_test_idx(X=X, y=y)
        complete_dataset = dataset(X, y)
        complete_dataloader = DataLoader(dataset=complete_dataset, batch_size=len(X), shuffle=False)
        return {'train_idx': train_idx, 'test_idx': test_idx, 'train_loader': complete_dataloader, 'test_loader': complete_dataloader, 'graph_structure': self.graph}
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            train_size=int(train_percentage * len(X)),
            random_state=random_state, shuffle=shuffle
        )
        train_dataset = dataset(X_train, y_train)
        test_dataset = dataset(X_test, y_test)
        if self.train_batch_size >= 1:
            train_loader = DataLoader(dataset=train_dataset, batch_size=self.train_batch_size, shuffle=True)
        else:
            train_loader = DataLoader(dataset=train_dataset, batch_size=len(X_train), shuffle=True)
        if self.test_batch_size >= 1:
            test_loader = DataLoader(dataset=test_dataset, batch_size=self.test_batch_size, shuffle=False)
        else:
            test_loader = DataLoader(dataset=test_dataset, batch_size=len(X_test), shuffle=False)
        return {'train_loader': train_loader, 'test_loader': test_loader, 'graph_structure': self.graph}
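
An inductive-mode sketch using the illustrative subclass from earlier; in this mode only the loaders and the graph structure are returned, and the split follows train_percentage. The loop assumes the wrapped dataset yields (features, labels) pairs.

result = loader.load(
    mode='inductive',
    cache_dir='./data/my_graph_data',
    train_percentage=0.8,
    random_state=42,
    shuffle=True,
)

for X_batch, y_batch in result['train_loader']:   # mini-batches of node features and labels
    pass  # feed each batch to a model here

adj = loader.get_adj(graph=result['graph_structure'])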

load_graph(complete_path)

Loads the graph structure from a file.

Parameters:

    complete_path (str, required): The file path to load the graph structure from.

Returns:

    graph: The loaded graph structure.

Raises:

    ValueError: If the file path is not provided.

Source code in tinybig/data/graph_dataloader.py
def load_graph(self, complete_path: str):
    """
    Loads the graph structure from a file.

    Parameters
    ----------
    complete_path: str
        The file path to load the graph structure from.

    Returns
    -------
    graph_class
        The loaded graph structure.

    Raises
    ------
    ValueError
        If the file path is not provided.
    """
    if complete_path is None:
        raise ValueError('The cache complete_path has not been set yet...')
    self.graph = graph_class.load(complete_path=complete_path)
    return self.graph
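
A short sketch, assuming a graph was previously cached with save_graph at the hypothetical path below.

graph = loader.load_graph(complete_path='./data/my_graph_data/graph_cache')  # hypothetical cache file
adj = loader.get_adj(graph=graph)  # the restored graph is also stored on loader.graph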

load_raw(cache_dir, device='cpu', normalization=True, normalization_mode='row')

Loads the raw graph data from files.

Parameters:

    cache_dir (str, required): Directory containing the graph data files.
    device (str, default = 'cpu'): Device to store the data.
    normalization (bool, default = True): Whether to normalize the node features.
    normalization_mode (str, default = 'row'): Mode of normalization ('row' or 'column').

Returns:

    tuple: The graph structure, node features (X), and labels (y).

Raises:

    FileNotFoundError: If the required files are not found in the cache directory.

Source code in tinybig/data/graph_dataloader.py
def load_raw(self, cache_dir: str, device: str = 'cpu', normalization: bool = True, normalization_mode: str = 'row'):
    """
    Loads the raw graph data from files.

    Parameters
    ----------
    cache_dir: str
        Directory containing the graph data files.
    device: str, default = 'cpu'
        Device to store the data.
    normalization: bool, default = True
        Whether to normalize the node features.
    normalization_mode: str, default = 'row'
        Mode of normalization ('row' or 'column').

    Returns
    -------
    tuple
        The graph structure, node features (X), and labels (y).

    Raises
    ------
    FileNotFoundError
        If the required files are not found in the cache directory.
    """
    if not check_file_existence("{}/node".format(cache_dir)):
        self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='node')
    if not check_file_existence("{}/link".format(cache_dir)):
        self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='link')

    idx_features_labels = np.genfromtxt("{}/node".format(cache_dir), dtype=np.dtype(str))
    X = torch.tensor(sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32).todense())
    y = dataloader.encode_str_labels(labels=idx_features_labels[:, -1], one_hot=False)

    if normalization:
        X = degree_based_normalize_matrix(mx=X, mode=normalization_mode)

    nodes = np.array(idx_features_labels[:, 0], dtype=np.int32).tolist()
    links = np.genfromtxt("{}/link".format(cache_dir), dtype=np.int32).tolist()
    graph = graph_class(
        nodes=nodes, links=links, directed=True, device=device
    )
    return graph, X, y
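
A sketch of calling load_raw directly when the tensors are needed without any DataLoader wrapping; the path and the loader instance follow the illustrative setup above.

graph, X, y = loader.load_raw(
    cache_dir='./data/my_graph_data',
    device='cpu',
    normalization=True,        # row-normalize the node features
    normalization_mode='row',
)
print(X.shape, y.shape)        # node feature matrix and label vector
adj = loader.get_adj(graph=graph)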

save_graph(complete_path, graph=None)

Saves the graph structure to a file.

Parameters:

    complete_path (str, required): The file path to save the graph structure.
    graph (graph, default = None): The graph structure to save. If None, the internal graph is used.

Returns:

    str: The path to the saved graph file.

Raises:

    ValueError: If no graph structure is loaded or the path is not provided.

Source code in tinybig/data/graph_dataloader.py
def save_graph(self, complete_path: str, graph: graph_class = None):
    """
    Saves the graph structure to a file.

    Parameters
    ----------
    complete_path: str
        The file path to save the graph structure.
    graph: graph_class, optional
        The graph structure to save. If None, the internal graph is used.

    Returns
    -------
    str
        The path to the saved graph file.

    Raises
    ------
    ValueError
        If no graph structure is loaded or the path is not provided.
    """
    graph = graph if graph is not None else self.graph
    if graph is None:
        raise ValueError('The graph structure has not been loaded yet...')
    if complete_path is None:
        raise ValueError('The cache complete_path has not been set yet...')
    return graph.save(complete_path=complete_path)
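
A closing sketch pairing save_graph with load_graph; the cache path is illustrative, and the internal graph must already be populated (e.g., by a prior load call).

saved_path = loader.save_graph(complete_path='./data/my_graph_data/graph_cache')  # persist the internal graph
restored = loader.load_graph(complete_path=saved_path)                            # restore it later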