graph_dataloader

Bases: dataloader

A dataloader class for graph-based datasets.

This class extends the base dataloader class to handle graph data, including nodes, links, and associated features.

Attributes:

    data_profile (dict): The data profile containing metadata and download links for the graph dataset.
    graph (graph): The loaded graph structure.

Methods:

    __init__: Initializes the graph dataloader.
    download_data: Downloads the graph dataset files.
    load_raw: Loads the raw graph data from files.
    save_graph: Saves the graph structure to a file.
    load_graph: Loads the graph structure from a file.
    get_graph: Retrieves the loaded graph structure.
    get_adj: Retrieves the adjacency matrix of the graph.
    load: Loads the dataset in either transductive or inductive mode.
    get_train_test_idx: Abstract method to generate train and test indices for the dataset.

Source code in tinybig/data/graph_dataloader.py
class graph_dataloader(dataloader):
    """
    A dataloader class for graph-based datasets.

    This class extends the base `dataloader` class to handle graph data, including nodes, links, and associated features.

    Attributes
    ----------
    data_profile: dict
        The data profile containing metadata and download links for the graph dataset.
    graph: graph_class
        The loaded graph structure.

    Methods
    -------
    __init__(data_profile: dict = None, ...)
        Initializes the graph dataloader.
    download_data(data_profile: dict, cache_dir: str = None, file_name: str = None)
        Downloads the graph dataset files.
    load_raw(cache_dir: str, device: str = 'cpu', normalization: bool = True, ...)
        Loads the raw graph data from files.
    save_graph(complete_path: str, graph: graph_class = None)
        Saves the graph structure to a file.
    load_graph(complete_path: str)
        Loads the graph structure from a file.
    get_graph()
        Retrieves the loaded graph structure.
    get_adj(graph: graph_class = None)
        Retrieves the adjacency matrix of the graph.
    load(mode: str = 'transductive', ...)
        Loads the dataset, either in transductive or inductive mode.
    get_train_test_idx(X: torch.Tensor = None, y: torch.Tensor = None, ...)
        Abstract method to generate train and test indices for the dataset.
    """

    def __init__(self, data_profile: dict = None, name: str = 'graph_data', train_batch_size: int = 64, test_batch_size: int = 64):
        """
        Initializes the graph dataloader.

        Parameters
        ----------
        data_profile: dict, optional
            Metadata and download links for the graph dataset.
        name: str, default = 'graph_data'
            The name of the dataloader instance.
        train_batch_size: int, default = 64
            Batch size for the training dataset.
        test_batch_size: int, default = 64
            Batch size for the testing dataset.

        Returns
        -------
        None
        """
        super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size)

        self.data_profile = data_profile
        self.graph = None

    @staticmethod
    def download_data(data_profile: dict, cache_dir: str = None, file_name: str = None):
        """
        Downloads the graph dataset files.

        Parameters
        ----------
        data_profile: dict
            Metadata and download links for the graph dataset.
        cache_dir: str, optional
            Directory to store the downloaded files. Defaults to './data/'.
        file_name: str, optional
            Specific file name to download. If None, all files in the `data_profile` are downloaded.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `data_profile` is None or doesn't contain the 'url' key.
        """
        if data_profile is None:
            raise ValueError('The data profile must be provided.')

        if cache_dir is None:
            cache_dir = './data/'

        if data_profile is None or 'url' not in data_profile:
            raise ValueError('data_profile must not be None and should contain "url" key...')

        if file_name is None:
            for file_name in data_profile['url']:
                download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))
        else:
            assert file_name in data_profile['url']
            download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))


    def load_raw(self, cache_dir: str, device: str = 'cpu', normalization: bool = True, normalization_mode: str = 'row'):
        """
        Loads the raw graph data from files.

        Parameters
        ----------
        cache_dir: str
            Directory containing the graph data files.
        device: str, default = 'cpu'
            Device to store the data.
        normalization: bool, default = True
            Whether to normalize the node features.
        normalization_mode: str, default = 'row'
            Mode of normalization ('row' or 'column').

        Returns
        -------
        tuple
            The graph structure, node features (X), and labels (y).

        Raises
        ------
        FileNotFoundError
            If the required files are not found in the cache directory.
        """
        if not check_file_existence("{}/node".format(cache_dir)):
            self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='node')
        if not check_file_existence("{}/link".format(cache_dir)):
            self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='link')

        idx_features_labels = np.genfromtxt("{}/node".format(cache_dir), dtype=np.dtype(str))
        X = torch.tensor(sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32).todense())
        y = dataloader.encode_str_labels(labels=idx_features_labels[:, -1], one_hot=False)

        if normalization:
            X = degree_based_normalize_matrix(mx=X, mode=normalization_mode)

        nodes = np.array(idx_features_labels[:, 0], dtype=np.int32).tolist()
        links = np.genfromtxt("{}/link".format(cache_dir), dtype=np.int32).tolist()
        graph = graph_class(
            nodes=nodes, links=links, directed=True, device=device
        )
        return graph, X, y

    def save_graph(self, complete_path: str, graph: graph_class = None):
        """
        Saves the graph structure to a file.

        Parameters
        ----------
        complete_path: str
            The file path to save the graph structure.
        graph: graph_class, optional
            The graph structure to save. If None, the internal graph is used.

        Returns
        -------
        str
            The path to the saved graph file.

        Raises
        ------
        ValueError
            If no graph structure is loaded or the path is not provided.
        """
        graph = graph if graph is not None else self.graph
        if graph is None:
            raise ValueError('The graph structure has not been loaded yet...')
        if complete_path is None:
            raise ValueError('The cache complete_path has not been set yet...')
        return graph.save(complete_path=complete_path)

    def load_graph(self, complete_path: str):
        """
        Loads the graph structure from a file.

        Parameters
        ----------
        complete_path: str
            The file path to load the graph structure from.

        Returns
        -------
        graph_class
            The loaded graph structure.

        Raises
        ------
        ValueError
            If the file path is not provided.
        """
        if complete_path is None:
            raise ValueError('The cache complete_path has not been set yet...')
        self.graph = graph_class.load(complete_path=complete_path)
        return self.graph

    def get_graph(self):
        """
        Retrieves the loaded graph structure.

        Returns
        -------
        graph_class
            The loaded graph structure, or None if no graph is loaded.
        """
        return self.graph

    def get_adj(self, graph: graph_class = None):
        """
        Retrieves the adjacency matrix of the graph.

        Parameters
        ----------
        graph: graph_class, optional
            The graph structure to use. If None, the internal graph is used.

        Returns
        -------
        torch.Tensor
            The adjacency matrix of the graph.

        Raises
        ------
        ValueError
            If no graph structure is loaded.
        """
        graph = graph if graph is not None else self.graph
        if graph is None:
            raise ValueError('The graph structure has not been loaded yet...')
        return graph.to_matrix(
            normalization=True,
            normalization_mode='row',
        )

    def load(self, mode: str = 'transductive', cache_dir: str = None, device: str = 'cpu',
             train_percentage: float = 0.5, random_state: int = 1234, shuffle: bool = False, *args, **kwargs):
        """
        Loads the dataset in either transductive or inductive mode.

        Parameters
        ----------
        mode: str, default = 'transductive'
            Mode of loading the dataset ('transductive' or 'inductive').
        cache_dir: str, optional
            Directory containing the graph data files. Defaults to './data/{name}'.
        device: str, default = 'cpu'
            Device to store the data.
        train_percentage: float, default = 0.5
            Percentage of data to use for training in inductive mode.
        random_state: int, default = 1234
            Seed for random number generation.
        shuffle: bool, default = False
            Whether to shuffle the data.

        Returns
        -------
        dict
            A dictionary containing train/test loaders and graph structure.

        Raises
        ------
        ValueError
            If required files are not found or the graph is not properly loaded.
        """
        cache_dir = cache_dir if cache_dir is not None else "./data/{}".format(self.name)
        self.graph, X, y = self.load_raw(cache_dir=cache_dir, device=device)

        if mode == 'transductive':
            warnings.warn("For transductive settings, the train, test, and val partition will not follow the provided parameters (e.g., train percentage, batch size, etc.)...")
            train_idx, test_idx = self.get_train_test_idx(X=X, y=y)
            complete_dataset = dataset(X, y)
            complete_dataloader = DataLoader(dataset=complete_dataset, batch_size=len(X), shuffle=False)
            return {'train_idx': train_idx, 'test_idx': test_idx, 'train_loader': complete_dataloader, 'test_loader': complete_dataloader, 'graph_structure': self.graph}
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                train_size=int(train_percentage * len(X)),
                random_state=random_state, shuffle=shuffle
            )
            train_dataset = dataset(X_train, y_train)
            test_dataset = dataset(X_test, y_test)
            if self.train_batch_size >= 1:
                train_loader = DataLoader(dataset=train_dataset, batch_size=self.train_batch_size, shuffle=True)
            else:
                train_loader = DataLoader(dataset=train_dataset, batch_size=len(X_train), shuffle=True)
            if self.test_batch_size >= 1:
                test_loader = DataLoader(dataset=test_dataset, batch_size=self.test_batch_size, shuffle=False)
            else:
                test_loader = DataLoader(dataset=test_dataset, batch_size=len(X_test), shuffle=False)
            return {'train_loader': train_loader, 'test_loader': test_loader, 'graph_structure': self.graph}

    @abstractmethod
    def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
        """
        Abstract method to generate train and test indices for the dataset.

        Parameters
        ----------
        X: torch.Tensor, optional
            Node features.
        y: torch.Tensor, optional
            Labels.

        Returns
        -------
        tuple
            Train and test indices.
        """
        pass
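
As a rough usage sketch: graph_dataloader is abstract (get_train_test_idx must be implemented), so it is normally used through a concrete subclass. In the sketch below, the subclass name, the data_profile URLs, and the cache directory are illustrative placeholders, and the import path is assumed from the source location shown above.

# Illustrative only: a minimal concrete subclass with placeholder URLs and a
# naive index split; none of these names are part of the tinybig library itself.
import torch
from tinybig.data.graph_dataloader import graph_dataloader

class my_graph_dataloader(graph_dataloader):
    def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
        # naive split for illustration: first half of the nodes train, second half test
        n = X.shape[0]
        return torch.arange(0, n // 2), torch.arange(n // 2, n)

data_profile = {
    'url': {
        'node': 'https://example.com/placeholder/node',  # placeholder download link
        'link': 'https://example.com/placeholder/link',  # placeholder download link
    },
}

loader = my_graph_dataloader(data_profile=data_profile, name='my_graph_data')
result = loader.load(mode='transductive', cache_dir='./data/my_graph_data')
print(result.keys())  # train_idx, test_idx, train_loader, test_loader, graph_structure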

__init__(data_profile=None, name='graph_data', train_batch_size=64, test_batch_size=64)

Initializes the graph dataloader.

Parameters:

    data_profile (dict, default = None): Metadata and download links for the graph dataset.
    name (str, default = 'graph_data'): The name of the dataloader instance.
    train_batch_size (int, default = 64): Batch size for the training dataset.
    test_batch_size (int, default = 64): Batch size for the testing dataset.

Returns:

    None
Source code in tinybig/data/graph_dataloader.py
def __init__(self, data_profile: dict = None, name: str = 'graph_data', train_batch_size: int = 64, test_batch_size: int = 64):
    """
    Initializes the graph dataloader.

    Parameters
    ----------
    data_profile: dict, optional
        Metadata and download links for the graph dataset.
    name: str, default = 'graph_data'
        The name of the dataloader instance.
    train_batch_size: int, default = 64
        Batch size for the training dataset.
    test_batch_size: int, default = 64
        Batch size for the testing dataset.

    Returns
    -------
    None
    """
    super().__init__(name=name, train_batch_size=train_batch_size, test_batch_size=test_batch_size)

    self.data_profile = data_profile
    self.graph = None
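
A hedged sketch of the constructor call, reusing the illustrative my_graph_dataloader subclass from above; only the 'url' mapping of data_profile is read by this class, and the URLs are placeholders.

data_profile = {
    'url': {
        'node': 'https://example.com/placeholder/node',  # file with node ids, features, and labels
        'link': 'https://example.com/placeholder/link',  # file with the edge list
    },
}

loader = my_graph_dataloader(
    data_profile=data_profile,
    name='my_graph_data',   # also determines the default cache directory ./data/my_graph_data
    train_batch_size=64,
    test_batch_size=64,
)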

download_data(data_profile, cache_dir=None, file_name=None) staticmethod

Downloads the graph dataset files.

Parameters:

    data_profile (dict, required): Metadata and download links for the graph dataset.
    cache_dir (str, default = None): Directory to store the downloaded files. Defaults to './data/'.
    file_name (str, default = None): Specific file name to download. If None, all files in the data_profile are downloaded.

Returns:

    None

Raises:

    ValueError: If data_profile is None or doesn't contain the 'url' key.

Source code in tinybig/data/graph_dataloader.py
@staticmethod
def download_data(data_profile: dict, cache_dir: str = None, file_name: str = None):
    """
    Downloads the graph dataset files.

    Parameters
    ----------
    data_profile: dict
        Metadata and download links for the graph dataset.
    cache_dir: str, optional
        Directory to store the downloaded files. Defaults to './data/'.
    file_name: str, optional
        Specific file name to download. If None, all files in the `data_profile` are downloaded.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `data_profile` is None or doesn't contain the 'url' key.
    """
    if data_profile is None:
        raise ValueError('The data profile must be provided.')

    if cache_dir is None:
        cache_dir = './data/'

    if data_profile is None or 'url' not in data_profile:
        raise ValueError('data_profile must not be None and should contain "url" key...')

    if file_name is None:
        for file_name in data_profile['url']:
            download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))
    else:
        assert file_name in data_profile['url']
        download_file_from_github(url_link=data_profile['url'][file_name], destination_path="{}/{}".format(cache_dir, file_name))
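
Since download_data is a static method, it can be called without instantiating a loader. A sketch with placeholder URLs (the helper used internally is download_file_from_github, which suggests the links are expected to point at GitHub-hosted files):

from tinybig.data.graph_dataloader import graph_dataloader

data_profile = {
    'url': {
        'node': 'https://example.com/placeholder/node',  # placeholder link
        'link': 'https://example.com/placeholder/link',  # placeholder link
    },
}

# download every file listed under 'url' into the cache directory
graph_dataloader.download_data(data_profile=data_profile, cache_dir='./data/my_graph_data')

# or fetch a single named file; the name must be a key of data_profile['url']
graph_dataloader.download_data(data_profile=data_profile, cache_dir='./data/my_graph_data', file_name='link')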

get_adj(graph=None)

Retrieves the adjacency matrix of the graph.

Parameters:

    graph (graph, default = None): The graph structure to use. If None, the internal graph is used.

Returns:

    Tensor: The adjacency matrix of the graph.

Raises:

    ValueError: If no graph structure is loaded.

Source code in tinybig/data/graph_dataloader.py
def get_adj(self, graph: graph_class = None):
    """
    Retrieves the adjacency matrix of the graph.

    Parameters
    ----------
    graph: graph_class, optional
        The graph structure to use. If None, the internal graph is used.

    Returns
    -------
    torch.Tensor
        The adjacency matrix of the graph.

    Raises
    ------
    ValueError
        If no graph structure is loaded.
    """
    graph = graph if graph is not None else self.graph
    if graph is None:
        raise ValueError('The graph structure has not been loaded yet...')
    return graph.to_matrix(
        normalization=True,
        normalization_mode='row',
    )
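
A brief sketch continuing the illustrative loader from above; the returned matrix is the row-normalized adjacency produced by graph.to_matrix.

loader.load(mode='transductive', cache_dir='./data/my_graph_data')  # populates loader.graph
adj = loader.get_adj()        # row-normalized adjacency matrix of the internal graph
graph = loader.get_graph()    # the underlying graph object itself (None before loading)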

get_graph()

Retrieves the loaded graph structure.

Returns:

    graph: The loaded graph structure, or None if no graph is loaded.

Source code in tinybig/data/graph_dataloader.py
def get_graph(self):
    """
    Retrieves the loaded graph structure.

    Returns
    -------
    graph_class
        The loaded graph structure, or None if no graph is loaded.
    """
    return self.graph

get_train_test_idx(X=None, y=None, *args, **kwargs) abstractmethod

Abstract method to generate train and test indices for the dataset.

Parameters:

    X (Tensor, default = None): Node features.
    y (Tensor, default = None): Labels.

Returns:

    tuple: Train and test indices.

Source code in tinybig/data/graph_dataloader.py
@abstractmethod
def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
    """
    Abstract method to generate train and test indices for the dataset.

    Parameters
    ----------
    X: torch.Tensor, optional
        Node features.
    y: torch.Tensor, optional
        Labels.

    Returns
    -------
    tuple
        Train and test indices.
    """
    pass
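
Every concrete subclass has to implement this method; the random split below is purely illustrative and not taken from the library.

import torch
from tinybig.data.graph_dataloader import graph_dataloader

class random_split_graph_dataloader(graph_dataloader):
    def get_train_test_idx(self, X: torch.Tensor = None, y: torch.Tensor = None, *args, **kwargs):
        # assign roughly half of the node indices to train and the rest to test, at random
        perm = torch.randperm(X.shape[0])
        split = X.shape[0] // 2
        return perm[:split], perm[split:]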

load(mode='transductive', cache_dir=None, device='cpu', train_percentage=0.5, random_state=1234, shuffle=False, *args, **kwargs)

Loads the dataset in either transductive or inductive mode.

Parameters:

    mode (str, default = 'transductive'): Mode of loading the dataset ('transductive' or 'inductive').
    cache_dir (str, default = None): Directory containing the graph data files. Defaults to './data/{name}'.
    device (str, default = 'cpu'): Device to store the data.
    train_percentage (float, default = 0.5): Percentage of data to use for training in inductive mode.
    random_state (int, default = 1234): Seed for random number generation.
    shuffle (bool, default = False): Whether to shuffle the data.

Returns:

    dict: A dictionary containing train/test loaders and graph structure.

Raises:

    ValueError: If required files are not found or the graph is not properly loaded.

Source code in tinybig/data/graph_dataloader.py
def load(self, mode: str = 'transductive', cache_dir: str = None, device: str = 'cpu',
         train_percentage: float = 0.5, random_state: int = 1234, shuffle: bool = False, *args, **kwargs):
    """
    Loads the dataset in either transductive or inductive mode.

    Parameters
    ----------
    mode: str, default = 'transductive'
        Mode of loading the dataset ('transductive' or 'inductive').
    cache_dir: str, optional
        Directory containing the graph data files. Defaults to './data/{name}'.
    device: str, default = 'cpu'
        Device to store the data.
    train_percentage: float, default = 0.5
        Percentage of data to use for training in inductive mode.
    random_state: int, default = 1234
        Seed for random number generation.
    shuffle: bool, default = False
        Whether to shuffle the data.

    Returns
    -------
    dict
        A dictionary containing train/test loaders and graph structure.

    Raises
    ------
    ValueError
        If required files are not found or the graph is not properly loaded.
    """
    cache_dir = cache_dir if cache_dir is not None else "./data/{}".format(self.name)
    self.graph, X, y = self.load_raw(cache_dir=cache_dir, device=device)

    if mode == 'transductive':
        warnings.warn("For transductive settings, the train, test, and val partition will not follow the provided parameters (e.g., train percentage, batch size, etc.)...")
        train_idx, test_idx = self.get_train_test_idx(X=X, y=y)
        complete_dataset = dataset(X, y)
        complete_dataloader = DataLoader(dataset=complete_dataset, batch_size=len(X), shuffle=False)
        return {'train_idx': train_idx, 'test_idx': test_idx, 'train_loader': complete_dataloader, 'test_loader': complete_dataloader, 'graph_structure': self.graph}
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            train_size=int(train_percentage * len(X)),
            random_state=random_state, shuffle=shuffle
        )
        train_dataset = dataset(X_train, y_train)
        test_dataset = dataset(X_test, y_test)
        if self.train_batch_size >= 1:
            train_loader = DataLoader(dataset=train_dataset, batch_size=self.train_batch_size, shuffle=True)
        else:
            train_loader = DataLoader(dataset=train_dataset, batch_size=len(X_train), shuffle=True)
        if self.test_batch_size >= 1:
            test_loader = DataLoader(dataset=test_dataset, batch_size=self.test_batch_size, shuffle=False)
        else:
            test_loader = DataLoader(dataset=test_dataset, batch_size=len(X_test), shuffle=False)
        return {'train_loader': train_loader, 'test_loader': test_loader, 'graph_structure': self.graph}
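
An inductive-mode sketch using the illustrative subclass from earlier; in this mode only the loaders and the graph structure are returned, and the split follows train_percentage. The loop assumes the wrapped dataset yields (features, labels) pairs.

result = loader.load(
    mode='inductive',
    cache_dir='./data/my_graph_data',
    train_percentage=0.8,
    random_state=42,
    shuffle=True,
)

for X_batch, y_batch in result['train_loader']:   # mini-batches of node features and labels
    pass  # feed each batch to a model here

adj = loader.get_adj(graph=result['graph_structure'])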

load_graph(complete_path)

Loads the graph structure from a file.

Parameters:

    complete_path (str, required): The file path to load the graph structure from.

Returns:

    graph: The loaded graph structure.

Raises:

    ValueError: If the file path is not provided.

Source code in tinybig/data/graph_dataloader.py
def load_graph(self, complete_path: str):
    """
    Loads the graph structure from a file.

    Parameters
    ----------
    complete_path: str
        The file path to load the graph structure from.

    Returns
    -------
    graph_class
        The loaded graph structure.

    Raises
    ------
    ValueError
        If the file path is not provided.
    """
    if complete_path is None:
        raise ValueError('The cache complete_path has not been set yet...')
    self.graph = graph_class.load(complete_path=complete_path)
    return self.graph
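
A short sketch, assuming a graph was previously cached with save_graph at the hypothetical path below.

graph = loader.load_graph(complete_path='./data/my_graph_data/graph_cache')  # hypothetical cache file
adj = loader.get_adj(graph=graph)  # the restored graph is also stored on loader.graph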

load_raw(cache_dir, device='cpu', normalization=True, normalization_mode='row')

Loads the raw graph data from files.

Parameters:

    cache_dir (str, required): Directory containing the graph data files.
    device (str, default = 'cpu'): Device to store the data.
    normalization (bool, default = True): Whether to normalize the node features.
    normalization_mode (str, default = 'row'): Mode of normalization ('row' or 'column').

Returns:

    tuple: The graph structure, node features (X), and labels (y).

Raises:

    FileNotFoundError: If the required files are not found in the cache directory.

Source code in tinybig/data/graph_dataloader.py
def load_raw(self, cache_dir: str, device: str = 'cpu', normalization: bool = True, normalization_mode: str = 'row'):
    """
    Loads the raw graph data from files.

    Parameters
    ----------
    cache_dir: str
        Directory containing the graph data files.
    device: str, default = 'cpu'
        Device to store the data.
    normalization: bool, default = True
        Whether to normalize the node features.
    normalization_mode: str, default = 'row'
        Mode of normalization ('row' or 'column').

    Returns
    -------
    tuple
        The graph structure, node features (X), and labels (y).

    Raises
    ------
    FileNotFoundError
        If the required files are not found in the cache directory.
    """
    if not check_file_existence("{}/node".format(cache_dir)):
        self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='node')
    if not check_file_existence("{}/link".format(cache_dir)):
        self.download_data(data_profile=self.data_profile, cache_dir=cache_dir, file_name='link')

    idx_features_labels = np.genfromtxt("{}/node".format(cache_dir), dtype=np.dtype(str))
    X = torch.tensor(sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32).todense())
    y = dataloader.encode_str_labels(labels=idx_features_labels[:, -1], one_hot=False)

    if normalization:
        X = degree_based_normalize_matrix(mx=X, mode=normalization_mode)

    nodes = np.array(idx_features_labels[:, 0], dtype=np.int32).tolist()
    links = np.genfromtxt("{}/link".format(cache_dir), dtype=np.int32).tolist()
    graph = graph_class(
        nodes=nodes, links=links, directed=True, device=device
    )
    return graph, X, y
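
A sketch of calling load_raw directly when the tensors are needed without any DataLoader wrapping; the path and the loader instance follow the illustrative setup above.

graph, X, y = loader.load_raw(
    cache_dir='./data/my_graph_data',
    device='cpu',
    normalization=True,        # row-normalize the node features
    normalization_mode='row',
)
print(X.shape, y.shape)        # node feature matrix and label vector
adj = loader.get_adj(graph=graph)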

save_graph(complete_path, graph=None)

Saves the graph structure to a file.

Parameters:

    complete_path (str, required): The file path to save the graph structure.
    graph (graph, default = None): The graph structure to save. If None, the internal graph is used.

Returns:

    str: The path to the saved graph file.

Raises:

    ValueError: If no graph structure is loaded or the path is not provided.

Source code in tinybig/data/graph_dataloader.py
def save_graph(self, complete_path: str, graph: graph_class = None):
    """
    Saves the graph structure to a file.

    Parameters
    ----------
    complete_path: str
        The file path to save the graph structure.
    graph: graph_class, optional
        The graph structure to save. If None, the internal graph is used.

    Returns
    -------
    str
        The path to the saved graph file.

    Raises
    ------
    ValueError
        If no graph structure is loaded or the path is not provided.
    """
    graph = graph if graph is not None else self.graph
    if graph is None:
        raise ValueError('The graph structure has not been loaded yet...')
    if complete_path is None:
        raise ValueError('The cache complete_path has not been set yet...')
    return graph.save(complete_path=complete_path)
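
A closing sketch pairing save_graph with load_graph; the cache path is illustrative, and the internal graph must already be populated (e.g., by a prior load call).

saved_path = loader.save_graph(complete_path='./data/my_graph_data/graph_cache')  # persist the internal graph
restored = loader.load_graph(complete_path=saved_path)                            # restore it later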