import os
import random
import numpy as np
import scipy.sparse as sp
import torch
from ..dataset import URLs, GRB_SUPPORTED_DATASETS
from ..utils import download
class Dataset(object):
    r"""

    Description
    -----------
    Class that loads GRB datasets for evaluating adversarial robustness.

    The files ``adj.npz``, ``features.npz``, ``labels.npz`` and ``index.npz``
    are read from ``data_dir``. A file that does not exist locally is first
    downloaded from ``URLs[name][<file name>]``.

    Parameters
    ----------
    name : str
        Name of dataset, supported datasets:
        ``["grb-cora", "grb-citeseer", "grb-aminer", "grb-reddit", "grb-flickr"]``.
    data_dir : str, optional
        Directory for dataset. If not provided, ``data/<name>`` next to this
        module is used. If ``name`` does not occur in the given path, ``name``
        is appended as a sub-directory. The directory is created if missing.
    mode : str, optional
        Difficulty determined according to the average degree of test nodes.
        Choose from ``["easy", "medium", "hard", "full"]``. Default: ``"easy"``.
        ``"full"`` (and any unrecognized value) uses the entire test set.
    feat_norm : str, optional
        Feature normalization, forwarded to ``feat_normalize``.
        ``None`` disables normalization. Default: ``"arctan"``.
    verbose : bool, optional
        Whether to display logs. Default: ``True``.
    custom : bool, optional
        If ``True``, ``name`` is not checked against the supported GRB
        datasets. Default: ``False``.

    Attributes
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    features : torch.FloatTensor
        Features in form of ``N * D`` torch float tensor.
    labels : torch.LongTensor
        Labels in form of ``N * L``. L=1 for multi-class classification, otherwise for multi-label classification.
    num_nodes : int
        Number of nodes ``N``.
    num_edges : int
        Number of stored adjacency entries divided by two.
    num_features : int
        Dimension of features ``D``.
    num_classes : int
        Number of classes ``L``.
    num_train : int
        Number of train nodes.
    num_val : int
        Number of validation nodes.
    num_test : int
        Number of test nodes.
    mode : str
        Mode of dataset, as passed to the constructor.
    index_train : np.ndarray
        Index of train nodes.
    index_val : np.ndarray
        Index of validation nodes.
    index_test : np.ndarray
        Index of test nodes.
    train_mask : torch.Tensor
        Mask of train nodes, torch bool tensor of length ``N``.
    val_mask : torch.Tensor
        Mask of validation nodes, torch bool tensor of length ``N``.
    test_mask : torch.Tensor
        Mask of test nodes, torch bool tensor of length ``N``.

    Raises
    ------
    KeyError
        If ``index.npz`` does not contain the split required by ``mode``
        (or the train/validation split).

    Example
    -------
    >>> import grb
    >>> from grb.dataset import Dataset
    >>> dataset = Dataset(name='grb-cora', mode='easy', feat_norm="arctan")

    """

    # Key of the test split inside ``index.npz`` for every known mode.
    _TEST_INDEX_KEYS = {
        "easy": "index_test_easy",
        "medium": "index_test_medium",
        "hard": "index_test_hard",
        "full": "index_test",
    }

    def __init__(self, name, data_dir=None, mode="easy", feat_norm="arctan", verbose=True, custom=False):
        if not custom and name not in GRB_SUPPORTED_DATASETS:
            print("{} dataset not supported.".format(name))
            exit(1)

        # Resolve and create the data directory.
        if data_dir is None:
            data_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data", name)
        if name not in data_dir:
            data_dir = os.path.join(data_dir, name)
        os.makedirs(data_dir, exist_ok=True)

        # Load adj
        adj = sp.load_npz(self._fetch(name, data_dir, "adj.npz"))

        # Load features (the archive is closed as soon as the array is read)
        with np.load(self._fetch(name, data_dir, "features.npz")) as archive:
            features = archive.get("data")
        if feat_norm is not None:
            features = feat_normalize(features, norm=feat_norm)

        # Load labels
        with np.load(self._fetch(name, data_dir, "labels.npz")) as archive:
            labels = archive.get("data")

        self.name = name
        self.adj = adj
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)
        self.num_nodes = features.shape[0]
        self.num_edges = adj.getnnz() // 2
        self.num_features = features.shape[1]
        self.mode = mode
        if len(labels.shape) == 1:
            self.num_classes = int(labels.max() + 1)
        else:
            self.num_classes = labels.shape[-1]

        # Load index. Unknown modes fall back to the entire test set.
        test_key = self._TEST_INDEX_KEYS.get(mode, "index_test")
        with np.load(self._fetch(name, data_dir, "index.npz")) as archive:
            index_train = self._read_index(archive, "index_train")
            index_val = self._read_index(archive, "index_val")
            index_test = self._read_index(archive, test_key)

        self.index_train = index_train
        self.train_mask = self._build_mask(index_train)
        self.index_val = index_val
        self.val_mask = self._build_mask(index_val)
        self.index_test = index_test
        self.test_mask = self._build_mask(index_test)

        self.num_train = int(torch.sum(self.train_mask))
        self.num_val = int(torch.sum(self.val_mask))
        self.num_test = int(torch.sum(self.test_mask))

        if verbose:
            print("Dataset \'{}\' loaded.".format(name))
            print(" Number of nodes: {}".format(self.num_nodes))
            print(" Number of edges: {}".format(self.num_edges))
            print(" Number of features: {}".format(self.num_features))
            print(" Number of classes: {}".format(self.num_classes))
            print(" Number of train samples: {}".format(self.num_train))
            print(" Number of val samples: {}".format(self.num_val))
            print(" Number of test samples: {}".format(self.num_test))
            print(" Dataset mode: {}".format(self.mode))
            print(" Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))

    @staticmethod
    def _fetch(name, data_dir, file_name):
        """Return the local path of ``file_name``, downloading it first if it is missing."""
        path = os.path.join(data_dir, file_name)
        if not os.path.exists(path):
            download(url=URLs[name][file_name], save_path=path)
        return path

    @staticmethod
    def _read_index(archive, key):
        """Read the index array ``key`` from an opened ``index.npz`` archive."""
        index = archive.get(key)
        if index is None:
            # Indexing a mask with ``None`` would silently mark every node,
            # so a missing split is reported instead.
            raise KeyError("'{}' not found in index file.".format(key))
        return index

    def _build_mask(self, index):
        """Return a bool tensor of length ``num_nodes`` that is True at ``index``."""
        mask = torch.zeros(self.num_nodes, dtype=torch.bool)
        mask[index] = True
        return mask
class CogDLDataset(object):
    def __init__(self, name, data_dir=None, mode='origin', verbose=True):
        r"""

        Description
        -----------
        Class that loads `CogDL datasets <https://github.com/THUDM/cogdl/tree/master/cogdl/datasets>`__
        for GRB evaluation.

        Names listed in ``self.COGDL_GRAPH_CLASSIFICATION_DATASETS`` are loaded
        as graph classification datasets (``graphs``, ``labels`` and a random
        train/val/test split over graphs). Every other name is loaded as a
        single-graph node classification dataset (``adj``, ``features``,
        ``labels`` and the masks shipped with the dataset).

        Parameters
        ----------
        name : str
            Name of dataset.
        data_dir : str, optional
            Directory for dataset. If provided, the dataset is built from this
            path, otherwise it is built from ``name`` alone.
        mode : str, optional
            Choose from ``["origin", "lcc"]``. ``lcc`` is to extract the largest
            connected component. ``"original"`` is accepted as an alias of
            ``"origin"``. Only used for node classification datasets.
            Default: ``origin``.
        verbose : bool, optional
            Whether to display logs. Default: ``True``.

        Raises
        ------
        ValueError
            If ``mode`` is not supported for a node classification dataset.

        """
        from cogdl.datasets import build_dataset_from_name, build_dataset_from_path

        try:
            if data_dir:
                dataset = build_dataset_from_path(data_path=data_dir, dataset=name)
            else:
                dataset = build_dataset_from_name(name)
        except AssertionError:
            print("Dataset '{}' is not supported.".format(name))
            exit(1)

        self.name = name
        if name in self.COGDL_GRAPH_CLASSIFICATION_DATASETS:
            self._init_graph_classification(dataset, verbose)
        else:
            self._init_node_classification(dataset, mode, verbose)

    def _init_graph_classification(self, dataset, verbose):
        """Populate the attributes of a graph classification dataset."""
        if dataset[0].x is None:
            # Graphs without node features get features derived from node degree.
            from cogdl.tasks.graph_classification import node_degree_as_feature
            dataset = node_degree_as_feature(dataset)

        self.graphs = dataset.data
        self.labels = dataset.y
        self.num_graphs = len(self.graphs)
        self.index_train, self.index_val, self.index_test = self.graph_splitting(self.num_graphs)
        self.num_nodes_max = max(graph.num_nodes for graph in self.graphs)
        self.num_edges_max = max(graph.num_edges for graph in self.graphs)
        self.num_train = len(self.index_train)
        self.num_val = len(self.index_val)
        self.num_test = len(self.index_test)
        self.num_features = dataset.num_features
        self.num_classes = dataset.num_classes

        if verbose:
            print("Dataset \'{}\' loaded.".format(self.name))
            print(" Number of graphs: {}".format(self.num_graphs))
            print(" Number of nodes (maximum): {}".format(self.num_nodes_max))
            print(" Number of edges (maximum): {}".format(self.num_edges_max))
            print(" Number of features: {}".format(self.num_features))
            print(" Number of classes: {}".format(self.num_classes))
            print(" Number of train samples: {}".format(self.num_train))
            print(" Number of val samples: {}".format(self.num_val))
            print(" Number of test samples: {}".format(self.num_test))

    def _init_node_classification(self, dataset, mode, verbose):
        """Populate the attributes of a node classification dataset."""
        if mode == 'original':
            mode = 'origin'
        if mode not in ('origin', 'lcc'):
            raise ValueError("Mode '{}' is not supported, choose from ['origin', 'lcc'].".format(mode))

        data = dataset.data
        edge_index = data.edge_index
        # Unweighted graphs get a weight of one on every edge.
        attr = data.edge_attr if data.edge_attr is not None else torch.ones(edge_index[0].shape[0])
        self.adj = self.build_adj(attr, edge_index, adj_type='csr')

        if mode == 'origin':
            self.features = data.x
            self.labels = data.y
            self.train_mask = data.train_mask
            self.val_mask = data.val_mask
            self.test_mask = data.test_mask
            self.num_nodes = data.num_nodes
            self.num_edges = data.num_edges // 2
        else:
            # Get largest connected component
            import networkx as nx

            # networkx 3 renamed the scipy conversion helpers.
            from_sparse = getattr(nx, "from_scipy_sparse_matrix", None) or nx.from_scipy_sparse_array
            to_sparse = getattr(nx, "to_scipy_sparse_matrix", None) or nx.to_scipy_sparse_array

            graph_nx = from_sparse(self.adj)
            # ``connected_components`` yields components in no particular
            # order, so the largest one has to be selected explicitly. Sorting
            # fixes one node order shared by the adjacency matrix and the
            # per-node tensors below.
            lcc_nodes = sorted(max(nx.connected_components(graph_nx), key=len))
            subgraph = graph_nx.subgraph(lcc_nodes)
            self.adj = sp.coo_matrix(to_sparse(subgraph, nodelist=lcc_nodes, format='coo'))
            self.features = data.x[lcc_nodes]
            self.labels = data.y[lcc_nodes]
            self.train_mask = data.train_mask[lcc_nodes]
            self.val_mask = data.val_mask[lcc_nodes]
            self.test_mask = data.test_mask[lcc_nodes]
            self.num_nodes = subgraph.number_of_nodes()
            # The networkx graph is undirected: every edge is counted once.
            self.num_edges = subgraph.number_of_edges()

        self.num_train = int(torch.sum(self.train_mask))
        self.num_val = int(torch.sum(self.val_mask))
        self.num_test = int(torch.sum(self.test_mask))
        self.num_features = data.num_features
        self.num_classes = data.num_classes

        if verbose:
            print("Dataset \'{}\' loaded.".format(self.name))
            print(" Number of nodes: {}".format(self.num_nodes))
            print(" Number of edges: {}".format(self.num_edges))
            print(" Number of features: {}".format(self.num_features))
            print(" Number of classes: {}".format(self.num_classes))
            print(" Number of train samples: {}".format(self.num_train))
            print(" Number of val samples: {}".format(self.num_val))
            print(" Number of test samples: {}".format(self.num_test))
            print(" Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))

    @property
    def COGDL_GRAPH_CLASSIFICATION_DATASETS(self):
        """Names that are loaded as graph classification datasets."""
        return {"mutag", "imdb-b", "imdb-m", "collab", "reddit-b"}

    @staticmethod
    def build_adj(attr, edge_index, adj_type='csr'):
        r"""

        Description
        -----------
        Build a scipy sparse adjacency matrix from an edge list.

        Parameters
        ----------
        attr : torch.Tensor or array-like
            One value per edge.
        edge_index : torch.Tensor, tuple of torch.Tensor, or array-like
            Row indices and column indices of the edges (two sequences).
        adj_type : str, optional
            Sparse format, ``"csr"`` or ``"coo"``. Default: ``"csr"``.

        Returns
        -------
        adj : scipy.sparse.csr_matrix or scipy.sparse.coo_matrix
            Adjacency matrix. Its shape is inferred by scipy from the indices.

        Raises
        ------
        ValueError
            If ``adj_type`` is neither ``"csr"`` nor ``"coo"``.

        """
        if isinstance(attr, torch.Tensor):
            attr = attr.numpy()
        if isinstance(edge_index, torch.Tensor):
            edge_index = edge_index.numpy()
        elif isinstance(edge_index, tuple):
            edge_index = [edge_index[0].numpy(), edge_index[1].numpy()]
        row, col = edge_index

        if adj_type == 'csr':
            return sp.csr_matrix((attr, (row, col)))
        if adj_type == 'coo':
            return sp.coo_matrix((attr, (row, col)))
        raise ValueError("adj_type '{}' is not supported, choose from ['csr', 'coo'].".format(adj_type))

    @staticmethod
    def graph_splitting(num_graphs, train_ratio=0.8, val_ratio=0.1):
        r"""

        Description
        -----------
        Randomly split graph indices into train, validation and test sets.
        The shuffle uses the global ``random`` module state.

        Parameters
        ----------
        num_graphs : int
            Number of graphs.
        train_ratio : float, optional
            Ratio of train graphs. Default: ``0.8``.
        val_ratio : float, optional
            Ratio of validation graphs. Default: ``0.1``.

        Returns
        -------
        train_index : list of int
        val_index : list of int
        test_index : list of int
            All graphs that are neither train nor validation graphs.

        """
        assert train_ratio + val_ratio <= 1.0
        train_size = int(num_graphs * train_ratio)
        val_size = int(num_graphs * val_ratio)
        index = list(range(num_graphs))
        random.shuffle(index)
        # Slice from the front: negative offsets break when the test set is
        # empty, because ``index[-0:]`` is the whole list.
        val_end = train_size + val_size
        train_index = index[:train_size]
        val_index = index[train_size:val_end]
        test_index = index[val_end:]
        return train_index, val_index, test_index
class OGBDataset(object):
    def __init__(self, name, data_dir=None, verbose=True):
        r"""

        Description
        -----------
        Class that loads `OGB datasets <https://ogb.stanford.edu/docs/dataset_overview/>`__
        for GRB evaluation.

        Names in ``self.OGB_NODE_CLASSIFICATION_DATASETS`` are loaded through
        ``DglNodePropPredDataset``, names in
        ``self.OGB_GRAPH_CLASSIFICATION_DATASETS`` through
        ``GraphPropPredDataset``. For any other name only ``self.name`` is set.

        Parameters
        ----------
        name : str
            Name of dataset, e.g. ``"ogbn-arxiv"``. Underscores are treated as
            hyphens, so ``"ogbn_arxiv"`` selects the same dataset.
        data_dir : str, optional
            Root directory handed to the OGB dataset class.
        verbose : bool, optional
            Whether to display logs. Default: ``True``.

        """
        self.name = name
        # The dataset tables and all comparisons below use the hyphenated
        # spelling; normalize once so both spellings select the same branch.
        key = name.replace("_", "-")

        if key in self.OGB_NODE_CLASSIFICATION_DATASETS:
            from ogb.nodeproppred import DglNodePropPredDataset

            dataset = DglNodePropPredDataset(name=key, root=data_dir)
            split_idx = dataset.get_idx_split()
            train_idx, val_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
            graph, labels = dataset[0]
            # NOTE(review): for ogbn-arxiv the adjacency is taken before the
            # reverse edges are added below, while ``num_edges`` is counted
            # afterwards -- confirm whether ``adj`` is meant to be symmetric.
            self.adj = graph.adj(scipy_fmt="csr")

            if key in ["ogbn-arxiv", "ogbn-products"]:
                self.features = graph.ndata['feat']
                self.labels = labels.squeeze()
                self.num_nodes = graph.num_nodes()
                if key == "ogbn-arxiv":
                    # Add every edge in the opposite direction as well.
                    srcs, dsts = graph.all_edges()
                    graph.add_edges(dsts, srcs)
                self.num_edges = graph.num_edges() // 2
                self.num_features = self.features.shape[1]
                self.num_classes = dataset.num_classes
            elif key in ["ogbn-proteins"]:
                # NOTE(review): features are read from edge data here
                # (``edata``), unlike the branch above -- confirm intended.
                self.features = graph.edata['feat']
                self.labels = labels.squeeze()
                self.num_nodes = graph.num_nodes()
                self.num_edges = graph.num_edges() // 2
                self.num_features = self.features.shape[1]
                self.num_classes = dataset.num_classes
                self.num_tasks = dataset.num_tasks

            self.train_mask = self._build_mask(train_idx)
            self.val_mask = self._build_mask(val_idx)
            self.test_mask = self._build_mask(test_idx)
            self.num_train = int(torch.sum(self.train_mask))
            self.num_val = int(torch.sum(self.val_mask))
            self.num_test = int(torch.sum(self.test_mask))

            if verbose:
                print("Dataset \'{}\' loaded.".format(name))
                print(" Number of nodes: {}".format(self.num_nodes))
                print(" Number of edges: {}".format(self.num_edges))
                print(" Number of features: {}".format(self.num_features))
                print(" Number of classes: {}".format(self.num_classes))
                if key in ["ogbn-proteins"]:
                    print(" Number of tasks: {}".format(self.num_tasks))
                print(" Number of train samples: {}".format(self.num_train))
                print(" Number of val samples: {}".format(self.num_val))
                print(" Number of test samples: {}".format(self.num_test))
                print(" Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))

        elif key in self.OGB_GRAPH_CLASSIFICATION_DATASETS:
            from ogb.graphproppred import GraphPropPredDataset

            dataset = GraphPropPredDataset(name=key, root=data_dir)
            split_idx = dataset.get_idx_split()
            train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
            if key in ["ogbg-code2"]:
                self.dataset = dataset
            self.index_train = train_idx
            self.index_val = valid_idx
            self.index_test = test_idx
            self.num_train = len(train_idx)
            self.num_val = len(valid_idx)
            self.num_test = len(test_idx)

            if verbose:
                print("Dataset \'{}\' loaded.".format(name))
                print(" Number of graphs: {}".format(len(dataset)))
                print(" Number of train samples: {}".format(self.num_train))
                print(" Number of val samples: {}".format(self.num_val))
                print(" Number of test samples: {}".format(self.num_test))

    def _build_mask(self, index):
        """Return a bool tensor of length ``num_nodes`` that is True at ``index``."""
        mask = torch.zeros(self.num_nodes, dtype=torch.bool)
        mask[index] = True
        return mask

    @property
    def OGB_NODE_CLASSIFICATION_DATASETS(self):
        """Names (hyphenated) that are loaded as node classification datasets."""
        return {"ogbn-arxiv", "ogbn-products", "ogbn-proteins"}

    @property
    def OGB_GRAPH_CLASSIFICATION_DATASETS(self):
        """Names (hyphenated) that are loaded as graph classification datasets."""
        return {"ogbg-code2"}
class CustomDataset(object):
    r"""

    Description
    -----------
    Class that helps to build customized dataset for GRB evaluation.

    Any mask that is not provided is generated with ``splitting``, the default
    GRB splitting scheme based on node degree.

    Parameters
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    features : torch.FloatTensor
        Features in form of ``N * D``. Non-tensor input is converted to a
        torch float tensor.
    labels : torch.LongTensor
        Labels in form of ``N * L``. L=1 for multi-class classification, otherwise for multi-label classification.
        Non-tensor input is converted to a torch long tensor.
    train_mask : torch.Tensor, optional
        Mask of train nodes, bool tensor of length ``N``. Default: ``None``.
        If is ``None``, generated by default splitting scheme.
    val_mask : torch.Tensor, optional
        Mask of validation nodes, bool tensor of length ``N``. Default: ``None``.
        If is ``None``, generated by default splitting scheme.
    test_mask : torch.Tensor, optional
        Mask of test nodes, bool tensor of length ``N``. Default: ``None``.
        If is ``None``, generated by default splitting scheme according to ``mode``.
    name : str, optional
        Name of dataset.
    data_dir : str, optional
        Directory used when ``save`` is ``True``. Default: ``"./data"``.
    mode : str, optional
        Mode of dataset. One of ``["easy", "medium", "hard", "full"]``. Default: ``full``.
        Any other value uses the entire generated test set.
    feat_norm : str, optional
        Feature normalization, forwarded to ``feat_normalize``.
        Default: ``None``.
    save : bool, optional
        Whether to save data as ``adj.npz``, ``index.npz``, ``features.npz``
        and ``labels.npz`` in ``data_dir``. Default: ``False``.
    verbose : bool, optional
        Whether to display logs. Default: ``True``.
    seed : int, optional
        Random seed handed to the splitting scheme. Default: ``42``.

    Attributes
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    features : torch.FloatTensor
        Features in form of ``N * D`` torch float tensor.
    labels : torch.LongTensor
        Labels in form of ``N * L``. L=1 for multi-class classification, otherwise for multi-label classification.
    num_nodes : int
        Number of nodes ``N``.
    num_edges : int
        Number of stored adjacency entries divided by two.
    num_features : int
        Dimension of features ``D``.
    num_classes : int
        Number of classes ``L``.
    num_train : int
        Number of train nodes.
    num_val : int
        Number of validation nodes.
    num_test : int
        Number of test nodes.
    mode : str
        Mode of dataset, as passed to the constructor.
    index : dict
        Result of the splitting scheme. Only set when at least one mask had
        to be generated.
    train_mask : torch.Tensor
        Mask of train nodes, torch bool tensor.
    val_mask : torch.Tensor
        Mask of validation nodes, torch bool tensor.
    test_mask : torch.Tensor
        Mask of test nodes, torch bool tensor.

    """

    # Key of the generated test split for every known mode.
    _TEST_INDEX_KEYS = {
        "easy": "index_test_easy",
        "medium": "index_test_medium",
        "hard": "index_test_hard",
        "full": "index_test",
    }

    def __init__(self, adj, features, labels, train_mask=None, val_mask=None, test_mask=None,
                 name=None, data_dir=None, mode='full', feat_norm=None, save=False, verbose=True, seed=42):
        self.name = name
        self.adj = adj
        self.num_nodes = features.shape[0]
        self.num_edges = adj.getnnz() // 2
        self.num_features = features.shape[1]
        self.mode = mode

        if not isinstance(features, torch.Tensor):
            features = torch.FloatTensor(features)
        elif features.type() != 'torch.FloatTensor':
            features = features.float()
        if feat_norm is not None:
            features = feat_normalize(features, norm=feat_norm)
        self.features = features

        if not isinstance(labels, torch.Tensor):
            labels = torch.LongTensor(labels)
        elif labels.type() != 'torch.LongTensor':
            labels = labels.long()
        self.labels = labels

        # The splitting scheme only runs when some mask has to be generated.
        index = None
        if (train_mask is None) or (val_mask is None) or (test_mask is None):
            index = splitting(adj, seed=seed)
            self.index = index

        if train_mask is None:
            train_mask = self._mask_from_index(index.get("index_train"))
        else:
            train_mask = self._as_bool_tensor(train_mask)
        self.train_mask = train_mask

        if val_mask is None:
            val_mask = self._mask_from_index(index.get("index_val"))
        else:
            val_mask = self._as_bool_tensor(val_mask)
        self.val_mask = val_mask

        if test_mask is None:
            test_key = self._TEST_INDEX_KEYS.get(mode, "index_test")
            test_mask = self._mask_from_index(index.get(test_key))
        else:
            test_mask = self._as_bool_tensor(test_mask)
        self.test_mask = test_mask

        self.num_train = int(torch.sum(self.train_mask))
        self.num_val = int(torch.sum(self.val_mask))
        self.num_test = int(torch.sum(self.test_mask))

        if len(labels.shape) == 1:
            self.num_classes = int(labels.max() + 1)
        else:
            self.num_classes = labels.shape[-1]

        if save:
            if data_dir is None:
                data_dir = "./data"
            os.makedirs(data_dir, exist_ok=True)
            if index is None:
                # All masks were supplied by the caller, so no split was
                # generated; store the node indices the masks select.
                index = {
                    "index_train": np.flatnonzero(self.train_mask.cpu().numpy()),
                    "index_val": np.flatnonzero(self.val_mask.cpu().numpy()),
                    "index_test": np.flatnonzero(self.test_mask.cpu().numpy()),
                }
            sp.save_npz(os.path.join(data_dir, "adj.npz"), adj.tocsr())
            np.savez_compressed(os.path.join(data_dir, "index.npz"), **index)
            np.savez_compressed(os.path.join(data_dir, "features.npz"),
                                data=features.detach().cpu().numpy())
            np.savez_compressed(os.path.join(data_dir, "labels.npz"),
                                data=labels.cpu().numpy())
            print(" Saved in {}.".format(data_dir))

        if verbose:
            print("Custom Dataset \'{}\' loaded.".format(name))
            print(" Number of nodes: {}".format(self.num_nodes))
            print(" Number of edges: {}".format(self.num_edges))
            print(" Number of features: {}".format(self.num_features))
            print(" Number of classes: {}".format(self.num_classes))
            print(" Number of train samples: {}".format(self.num_train))
            print(" Number of val samples: {}".format(self.num_val))
            print(" Number of test samples: {}".format(self.num_test))
            print(" Dataset mode: {}".format(self.mode))
            print(" Feature range [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))

    def _mask_from_index(self, index):
        """Return a bool tensor of length ``num_nodes`` that is True at ``index``."""
        mask = torch.zeros(self.num_nodes, dtype=torch.bool)
        mask[index] = True
        return mask

    @staticmethod
    def _as_bool_tensor(mask):
        """Convert a user-provided mask to a torch bool tensor."""
        if not isinstance(mask, torch.Tensor):
            return torch.BoolTensor(mask)
        if mask.type() != 'torch.BoolTensor':
            return mask.bool()
        return mask
def feat_normalize(features, norm=None, lim_min=-1.0, lim_max=1.0):
    r"""

    Description
    -----------
    Feature normalization function.

    Statistics (min, max, mean, std) are computed over all entries of
    ``features``, not per column. The input is not modified in place.

    Parameters
    ----------
    features : torch.FloatTensor or np.ndarray
        Features in form of ``N * D``.
    norm : str, optional
        Type of normalization. Choose from ``["linearize", "arctan", "tanh", "standardize"]``.
        ``"linearize"`` rescales linearly to ``[lim_min, lim_max]``;
        ``"arctan"`` and ``"tanh"`` standardize and then squash to ``(-1, 1)``;
        ``"standardize"`` subtracts the mean and divides by the standard deviation.
        Any other value (including ``None``) returns ``features`` unchanged.
        Default: ``None``.
    lim_min : float
        Minimum limit of feature value, only used by ``"linearize"``. Default: ``-1.0``.
    lim_max : float
        Maximum limit of feature value, only used by ``"linearize"``. Default: ``1.0``.

    Returns
    -------
    features : torch.FloatTensor or np.ndarray
        Normalized features in form of ``N * D``, same container type as the input.

    """
    # Torch tensors are processed with torch ops so that they never have to be
    # converted to NumPy (which is not possible for e.g. CUDA tensors).
    if isinstance(features, torch.Tensor):
        arctan, tanh = torch.atan, torch.tanh
    else:
        arctan, tanh = np.arctan, np.tanh

    if norm == "linearize":
        feat_min = features.min()
        k = (lim_max - lim_min) / (features.max() - feat_min)
        return lim_min + k * (features - feat_min)

    if norm in ("arctan", "tanh", "standardize"):
        standardized = (features - features.mean()) / features.std()
        if norm == "arctan":
            # arctan maps to (-pi/2, pi/2); rescale to (-1, 1).
            return 2 * arctan(standardized) / np.pi
        if norm == "tanh":
            return tanh(standardized)
        return standardized

    return features
def splitting(adj,
              range_min=(0.0, 0.05),
              range_max=(0.95, 1.0),
              range_easy=(0.05, 0.35),
              range_medium=(0.35, 0.65),
              range_hard=(0.65, 0.95),
              ratio_train=0.6,
              ratio_val=0.1,
              ratio_test=0.1,
              seed=42):
    r"""

    Description
    -----------
    GRB splitting scheme designed for adversarial robustness evaluation.

    Nodes are ranked by degree (number of stored entries per adjacency row).
    From each of the ``easy``, ``medium`` and ``hard`` degree ranges,
    ``int(N * ratio_test)`` nodes are sampled without replacement as test
    nodes. ``int(N * ratio_train)`` of the remaining nodes are sampled as train
    nodes and all nodes left over become validation nodes.

    The global NumPy random state is re-seeded with ``seed`` before every
    sampling step, and progress is printed to stdout.

    Parameters
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    range_min : tuple of float, optional
        Range of nodes with minimum degrees to be ignored. Value in percentage.
        Only the upper bound is used, and only for the printed statistics.
        Default: ``(0.0, 0.05)``.
    range_max : tuple of float, optional
        Range of nodes with maximum degrees to be ignored. Value in percentage.
        Only the lower bound is used, and only for the printed statistics.
        Default: ``(0.95, 1.0)``.
    range_easy : tuple of float, optional
        Range of nodes for ``easy`` difficulty. Value in percentage.
        Default: ``(0.05, 0.35)``.
    range_medium : tuple of float, optional
        Range of nodes for ``medium`` difficulty. Value in percentage.
        Default: ``(0.35, 0.65)``.
    range_hard : tuple of float, optional
        Range of nodes for ``hard`` difficulty. Value in percentage.
        Default: ``(0.65, 0.95)``.
    ratio_train : float, optional
        Ratio of train nodes. Default: ``0.6``.
    ratio_val : float, optional
        Currently unused: the validation set consists of every node that is
        neither a test nor a train node. Default: ``0.1``.
    ratio_test : float, optional
        Ratio of test nodes sampled per difficulty. Default: ``0.1``.
    seed : int, optional
        Random seed. Default: ``42``.

    Returns
    -------
    index : dict
        Dictionary containing ``{"index_train", "index_val", "index_test",
        "index_test_easy", "index_test_medium", "index_test_hard"}``, each a
        sorted ``np.ndarray`` of node indices.

    """

    def a_not_in_b(a, b):
        # Elements of ``a`` that do not occur in ``b``, in the order of ``a``.
        # The order matters: it determines what ``np.random.choice`` draws.
        a = np.asarray(a)
        return a[~np.isin(a, b)]

    num_nodes = adj.shape[0]
    degs = adj.getnnz(axis=1)
    print("GRB data splitting...")
    print(" Average degree of all nodes: {:.4f}".format(np.mean(degs)))

    # Node indices ordered by ascending degree.
    degs_index = np.argsort(degs)
    num_ranked = len(degs_index)
    num_test_per_level = int(num_nodes * ratio_test)

    ind_min = int(num_ranked * range_min[1])
    ind_max = int(num_ranked * range_max[0])
    print(" Average degree of 5% nodes with small degree: {:.4f}".format(
        np.mean(degs[degs_index[:ind_min]])))
    print(" Average degree of 5% nodes with large degree: {:.4f}".format(
        np.mean(degs[degs_index[ind_max:]])))

    def sample_level(level, level_range):
        # Sample the test nodes of one difficulty level from its degree range.
        start = int(num_ranked * level_range[0])
        stop = int(num_ranked * level_range[1])
        candidates = degs_index[start:stop]
        print(" Average degree of 30% nodes ({}): {:.4f}".format(
            level, np.mean(degs[candidates])))
        np.random.seed(seed)
        sample = np.random.choice(candidates, num_test_per_level, replace=False)
        print(" Randomly sampled {} nodes".format(sample.shape[0]))
        return sample

    ind_easy_sample = sample_level("easy", range_easy)
    ind_medium_sample = sample_level("medium", range_medium)
    ind_hard_sample = sample_level("hard", range_hard)
    ind_test = np.concatenate([ind_easy_sample,
                               ind_medium_sample,
                               ind_hard_sample])

    # Sampling nodes for training and validation
    ind_rest = a_not_in_b(degs_index, ind_test)
    np.random.seed(seed)
    ind_train = np.random.choice(ind_rest, int(num_nodes * ratio_train), replace=False)
    ind_val = a_not_in_b(ind_rest, ind_train)
    print(" Number of training/validation nodes: {}/{}".format(len(ind_train), len(ind_val)))

    # The three sets partition the nodes exactly when their sizes add up.
    if len(ind_train) + len(ind_val) + len(ind_test) == num_nodes:
        print(" No duplicate.")
    else:
        print(" Find duplicates.")

    index = {"index_train": np.sort(ind_train),
             "index_val": np.sort(ind_val),
             "index_test": np.sort(ind_test),
             "index_test_easy": np.sort(ind_easy_sample),
             "index_test_medium": np.sort(ind_medium_sample),
             "index_test_hard": np.sort(ind_hard_sample)}
    return index