Source code for grb.dataset.dataset

import os
import random

import numpy as np
import scipy.sparse as sp
import torch

from ..dataset import URLs, GRB_SUPPORTED_DATASETS
from ..utils import download


class Dataset(object):
    r"""
    Description
    -----------
    Class that loads GRB datasets for evaluating adversarial robustness.

    Parameters
    ----------
    name : str
        Name of dataset, supported datasets: ``["grb-cora", "grb-citeseer",
        "grb-aminer", "grb-reddit", "grb-flickr"]``.
    data_dir : str, optional
        Directory for dataset. If not provided, default is ``"./data/"``.
    mode : str, optional
        Difficulty determined according to the average degree of test nodes.
        Choose from ``["easy", "medium", "hard", "full"]``, where ``"full"``
        uses the entire test set. Default: ``"easy"``.
    feat_norm : str, optional
        Feature normalization that transforms all features to range [-1, 1].
        Choose from ``["linearize", "arctan", "tanh", "standardize"]``.
        Default: ``"arctan"``.
    verbose : bool, optional
        Whether to display logs. Default: ``True``.
    custom : bool, optional
        Whether the dataset is a custom one; if ``True``, the check against
        the list of supported datasets is skipped. Default: ``False``.

    Attributes
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    features : torch.FloatTensor
        Features in form of ``N * D`` torch float tensor.
    labels : torch.LongTensor
        Labels in form of ``N * L``. L=1 for multi-class classification,
        otherwise for multi-label classification.
    num_nodes : int
        Number of nodes ``N``.
    num_edges : int
        Number of edges.
    num_features : int
        Dimension of features ``D``.
    num_classes : int
        Number of classes ``L``.
    num_train : int
        Number of train nodes.
    num_val : int
        Number of validation nodes.
    num_test : int
        Number of test nodes.
    mode : str
        Mode of dataset. One of ``["easy", "medium", "hard", "full"]``.
    index_train : np.ndarray
        Index of train nodes.
    index_val : np.ndarray
        Index of validation nodes.
    index_test : np.ndarray
        Index of test nodes.
    train_mask : torch.Tensor
        Mask of train nodes in form of ``N * 1`` torch bool tensor.
    val_mask : torch.Tensor
        Mask of validation nodes in form of ``N * 1`` torch bool tensor.
    test_mask : torch.Tensor
        Mask of test nodes in form of ``N * 1`` torch bool tensor.

    Example
    -------
    >>> import grb
    >>> from grb.dataset import Dataset
    >>> dataset = Dataset(name='grb-cora', mode='easy', feat_norm="arctan")
    """

    def __init__(self, name, data_dir=None, mode="easy",
                 feat_norm="arctan", verbose=True, custom=False):
        if not custom:
            if name not in GRB_SUPPORTED_DATASETS:
                print("{} dataset not supported.".format(name))
                exit(1)

        # Create data dir
        if data_dir is None:
            data_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data", name)
        if name not in data_dir:
            data_dir = os.path.join(data_dir, name)
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        # Load adj
        adj_name = "adj.npz"
        if not os.path.exists(os.path.join(data_dir, adj_name)):
            download(url=URLs[name][adj_name],
                     save_path=os.path.join(data_dir, adj_name))
        adj = sp.load_npz(os.path.join(data_dir, adj_name))

        # Load features
        features_name = "features.npz"
        if not os.path.exists(os.path.join(data_dir, features_name)):
            download(url=URLs[name][features_name],
                     save_path=os.path.join(data_dir, features_name))
        features = np.load(os.path.join(data_dir, features_name)).get("data")
        if feat_norm is not None:
            features = feat_normalize(features, norm=feat_norm)

        # Load labels
        labels_name = "labels.npz"
        if not os.path.exists(os.path.join(data_dir, labels_name)):
            download(url=URLs[name][labels_name],
                     save_path=os.path.join(data_dir, labels_name))
        labels = np.load(os.path.join(data_dir, labels_name)).get("data")

        self.name = name
        self.adj = adj
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)
        self.num_nodes = features.shape[0]
        self.num_edges = adj.getnnz() // 2
        self.num_features = features.shape[1]
        self.mode = mode
        if len(labels.shape) == 1:
            self.num_classes = int(labels.max() + 1)
        else:
            self.num_classes = labels.shape[-1]

        # Load index
        index_name = "index.npz"
        if not os.path.exists(os.path.join(data_dir, index_name)):
            download(url=URLs[name][index_name],
                     save_path=os.path.join(data_dir, index_name))
        index = np.load(os.path.join(data_dir, index_name))

        index_train = index.get("index_train")
        train_mask = torch.zeros(self.num_nodes, dtype=bool)
        train_mask[index_train] = True
        self.index_train = index_train
        self.train_mask = train_mask

        index_val = index.get("index_val")
        val_mask = torch.zeros(self.num_nodes, dtype=bool)
        val_mask[index_val] = True
        self.index_val = index_val
        self.val_mask = val_mask

        if mode == "easy":
            index_test = index.get("index_test_easy")
        elif mode == "medium":
            index_test = index.get("index_test_medium")
        elif mode == "hard":
            index_test = index.get("index_test_hard")
        else:  # "full" or any other value falls back to the entire test set
            index_test = index.get("index_test")
        test_mask = torch.zeros(self.num_nodes, dtype=bool)
        test_mask[index_test] = True
        self.index_test = index_test
        self.test_mask = test_mask

        self.num_train = int(torch.sum(self.train_mask))
        self.num_val = int(torch.sum(self.val_mask))
        self.num_test = int(torch.sum(self.test_mask))

        if verbose:
            print("Dataset '{}' loaded.".format(name))
            print("    Number of nodes: {}".format(self.num_nodes))
            print("    Number of edges: {}".format(self.num_edges))
            print("    Number of features: {}".format(self.num_features))
            print("    Number of classes: {}".format(self.num_classes))
            print("    Number of train samples: {}".format(self.num_train))
            print("    Number of val samples: {}".format(self.num_val))
            print("    Number of test samples: {}".format(self.num_test))
            print("    Dataset mode: {}".format(self.mode))
            print("    Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))
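
# A minimal usage sketch for the class above; the helper name is illustrative.
# Assumes the "grb-cora" files can be downloaded to the default data dir on
# first use.
def _sketch_load_grb_dataset():
    dataset = Dataset(name="grb-cora", mode="easy", feat_norm="arctan")
    adj = dataset.adj                                # N x N scipy CSR matrix
    features = dataset.features                      # N x D torch.FloatTensor
    train_features = features[dataset.train_mask]    # rows of the train split
    test_labels = dataset.labels[dataset.test_mask]  # labels of the "easy" test split
    return adj, train_features, test_labels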
class CogDLDataset(object):
    r"""
    Description
    -----------
    Class that loads `CogDL datasets <https://github.com/THUDM/cogdl/tree/master/cogdl/datasets>`__
    for GRB evaluation.

    Parameters
    ----------
    name : str
        Name of dataset; graph classification datasets are listed in
        ``self.COGDL_GRAPH_CLASSIFICATION_DATASETS``, other names are loaded
        as node classification datasets.
    data_dir : str, optional
        Directory for dataset. If not provided, default is ``"./data/"``.
    mode : str, optional
        Choose from ``["origin", "lcc"]``. ``"lcc"`` extracts the largest
        connected component. Default: ``"origin"``.
    verbose : bool, optional
        Whether to display logs. Default: ``True``.
    """

    def __init__(self, name, data_dir=None, mode='origin', verbose=True):
        from cogdl.datasets import build_dataset_from_name, build_dataset_from_path

        try:
            if data_dir:
                dataset = build_dataset_from_path(data_path=data_dir, dataset=name)
            else:
                dataset = build_dataset_from_name(name)
        except AssertionError:
            print("Dataset '{}' is not supported.".format(name))
            exit(1)
        self.name = name

        if name in self.COGDL_GRAPH_CLASSIFICATION_DATASETS:
            if dataset[0].x is None:
                # Some graph classification datasets have no node features;
                # use node degrees as features instead.
                from cogdl.tasks.graph_classification import node_degree_as_feature
                dataset = node_degree_as_feature(dataset)
            self.graphs = dataset.data
            self.labels = dataset.y
            self.num_graphs = len(self.graphs)
            self.index_train, self.index_val, self.index_test = self.graph_splitting(self.num_graphs)
            self.num_nodes_max = max([graph.num_nodes for graph in self.graphs])
            self.num_edges_max = max([graph.num_edges for graph in self.graphs])
            self.num_train = len(self.index_train)
            self.num_val = len(self.index_val)
            self.num_test = len(self.index_test)
            self.num_features = dataset.num_features
            self.num_classes = dataset.num_classes

            if verbose:
                print("Dataset '{}' loaded.".format(name))
                print("    Number of graphs: {}".format(self.num_graphs))
                print("    Number of nodes (maximum): {}".format(self.num_nodes_max))
                print("    Number of edges (maximum): {}".format(self.num_edges_max))
                print("    Number of features: {}".format(self.num_features))
                print("    Number of classes: {}".format(self.num_classes))
                print("    Number of train samples: {}".format(self.num_train))
                print("    Number of val samples: {}".format(self.num_val))
                print("    Number of test samples: {}".format(self.num_test))
        else:
            graph = dataset.data
            edge_index = graph.edge_index
            attr = graph.edge_attr if graph.edge_attr is not None else torch.ones(edge_index[0].shape[0])
            self.adj = self.build_adj(attr, edge_index, adj_type='csr')
            if mode == 'origin':
                self.features = dataset.data.x
                self.labels = dataset.data.y
                self.train_mask = dataset.data.train_mask
                self.val_mask = dataset.data.val_mask
                self.test_mask = dataset.data.test_mask
                self.num_train = int(torch.sum(self.train_mask))
                self.num_val = int(torch.sum(self.val_mask))
                self.num_test = int(torch.sum(self.test_mask))
                self.num_nodes = dataset.data.num_nodes
                self.num_edges = dataset.data.num_edges // 2
                self.num_features = dataset.data.num_features
                self.num_classes = dataset.data.num_classes
            elif mode == 'lcc':
                # Extract the largest connected component
                import networkx as nx

                graph_nx = nx.from_scipy_sparse_matrix(self.adj)
                components = nx.connected_components(graph_nx)
                lcc_nodes = list(next(components))
                subgraph = graph_nx.subgraph(lcc_nodes)

                self.adj = nx.to_scipy_sparse_matrix(subgraph, format='coo')
                self.features = dataset.data.x[lcc_nodes]
                self.labels = dataset.data.y[lcc_nodes]
                self.train_mask = dataset.data.train_mask[lcc_nodes]
                self.val_mask = dataset.data.val_mask[lcc_nodes]
                self.test_mask = dataset.data.test_mask[lcc_nodes]
                self.num_train = int(torch.sum(self.train_mask))
                self.num_val = int(torch.sum(self.val_mask))
                self.num_test = int(torch.sum(self.test_mask))
                self.num_nodes = subgraph.number_of_nodes()
                # networkx counts each undirected edge once, so no halving here
                self.num_edges = subgraph.number_of_edges()
                self.num_features = dataset.data.num_features
                self.num_classes = dataset.data.num_classes

            if verbose:
                print("Dataset '{}' loaded.".format(name))
                print("    Number of nodes: {}".format(self.num_nodes))
                print("    Number of edges: {}".format(self.num_edges))
                print("    Number of features: {}".format(self.num_features))
                print("    Number of classes: {}".format(self.num_classes))
                print("    Number of train samples: {}".format(self.num_train))
                print("    Number of val samples: {}".format(self.num_val))
                print("    Number of test samples: {}".format(self.num_test))
                print("    Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))

    @property
    def COGDL_GRAPH_CLASSIFICATION_DATASETS(self):
        return {"mutag", "imdb-b", "imdb-m", "collab", "reddit-b"}
    @staticmethod
    def build_adj(attr, edge_index, adj_type='csr'):
        if type(attr) == torch.Tensor:
            attr = attr.numpy()
        if type(edge_index) == torch.Tensor:
            edge_index = edge_index.numpy()
        if type(edge_index) == tuple:
            edge_index = [edge_index[0].numpy(), edge_index[1].numpy()]
        if adj_type == 'csr':
            adj = sp.csr_matrix((attr, edge_index))
        elif adj_type == 'coo':
            adj = sp.coo_matrix((attr, edge_index))

        return adj
    @staticmethod
    def graph_splitting(num_graphs, train_ratio=0.8, val_ratio=0.1):
        assert train_ratio + val_ratio <= 1.0
        train_size = int(num_graphs * train_ratio)
        val_size = int(num_graphs * val_ratio)
        test_size = num_graphs - train_size - val_size
        index = list(range(num_graphs))
        random.shuffle(index)
        train_index = index[:train_size]
        val_index = index[train_size:-test_size]
        test_index = index[-test_size:]

        return train_index, val_index, test_index
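
# A short sketch of the two static helpers above, on a hypothetical toy graph;
# the edge list and sizes are made up for illustration.
def _sketch_build_adj_and_split():
    # Three undirected edges stored in both directions as a 2 x E tensor.
    edge_index = torch.tensor([[0, 1, 1, 2, 2, 0],
                               [1, 0, 2, 1, 0, 2]])
    attr = torch.ones(edge_index.shape[1])
    adj = CogDLDataset.build_adj(attr, edge_index, adj_type="csr")  # 3 x 3 CSR
    # Split 100 graphs into 80/10/10 train/val/test index lists.
    index_train, index_val, index_test = CogDLDataset.graph_splitting(100)
    return adj, index_train, index_val, index_test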
class OGBDataset(object):
    r"""
    Description
    -----------
    Class that loads `OGB datasets <https://ogb.stanford.edu/docs/dataset_overview/>`__
    for GRB evaluation.

    Parameters
    ----------
    name : str
        Name of dataset.
    data_dir : str, optional
        Directory for dataset. If not provided, default is ``"./data/"``.
    verbose : bool, optional
        Whether to display logs. Default: ``True``.
    """

    def __init__(self, name, data_dir=None, verbose=True):
        self.name = name
        if name in self.OGB_NODE_CLASSIFICATION_DATASETS:
            from ogb.nodeproppred import DglNodePropPredDataset

            dataset = DglNodePropPredDataset(name=name, root=data_dir)
            split_idx = dataset.get_idx_split()
            train_idx, val_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
            graph, labels = dataset[0]
            self.adj = graph.adj(scipy_fmt="csr")
            if name in ["ogbn-arxiv", "ogbn-products"]:
                self.features = graph.ndata['feat']
                self.labels = labels.squeeze()
                self.num_nodes = graph.num_nodes()
                if name == "ogbn-arxiv":
                    # ogbn-arxiv is stored as a directed graph; add reverse
                    # edges to make it undirected.
                    srcs, dsts = graph.all_edges()
                    graph.add_edges(dsts, srcs)
                self.num_edges = graph.num_edges() // 2
                self.num_features = self.features.shape[1]
                self.num_classes = dataset.num_classes
            elif name in ["ogbn-proteins"]:
                self.features = graph.edata['feat']
                self.labels = labels.squeeze()
                self.num_nodes = graph.num_nodes()
                self.num_edges = graph.num_edges() // 2
                self.num_features = self.features.shape[1]
                self.num_classes = dataset.num_classes
                self.num_tasks = dataset.num_tasks

            train_mask = torch.zeros(self.num_nodes, dtype=bool)
            train_mask[train_idx] = True
            self.train_mask = train_mask
            val_mask = torch.zeros(self.num_nodes, dtype=bool)
            val_mask[val_idx] = True
            self.val_mask = val_mask
            test_mask = torch.zeros(self.num_nodes, dtype=bool)
            test_mask[test_idx] = True
            self.test_mask = test_mask
            self.num_train = int(torch.sum(self.train_mask))
            self.num_val = int(torch.sum(self.val_mask))
            self.num_test = int(torch.sum(self.test_mask))

            if verbose:
                print("Dataset '{}' loaded.".format(name))
                print("    Number of nodes: {}".format(self.num_nodes))
                print("    Number of edges: {}".format(self.num_edges))
                print("    Number of features: {}".format(self.num_features))
                print("    Number of classes: {}".format(self.num_classes))
                if name in ["ogbn-proteins"]:
                    print("    Number of tasks: {}".format(self.num_tasks))
                print("    Number of train samples: {}".format(self.num_train))
                print("    Number of val samples: {}".format(self.num_val))
                print("    Number of test samples: {}".format(self.num_test))
                print("    Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))
        elif name in self.OGB_GRAPH_CLASSIFICATION_DATASETS:
            from ogb.graphproppred import GraphPropPredDataset

            dataset = GraphPropPredDataset(name=name, root=data_dir)
            split_idx = dataset.get_idx_split()
            train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
            if name in ["ogbg-code2"]:
                self.dataset = dataset
                self.index_train = train_idx
                self.index_val = valid_idx
                self.index_test = test_idx
                self.num_train = len(train_idx)
                self.num_val = len(valid_idx)
                self.num_test = len(test_idx)

            if verbose:
                print("Dataset '{}' loaded.".format(name))
                print("    Number of graphs: {}".format(len(dataset)))
                print("    Number of train samples: {}".format(self.num_train))
                print("    Number of val samples: {}".format(self.num_val))
                print("    Number of test samples: {}".format(self.num_test))

    @property
    def OGB_NODE_CLASSIFICATION_DATASETS(self):
        return {"ogbn-arxiv", "ogbn-products", "ogbn-proteins"}

    @property
    def OGB_GRAPH_CLASSIFICATION_DATASETS(self):
        return {"ogbg-code2"}
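
# A minimal usage sketch, assuming the ogb and dgl packages are installed and
# that ogbn-arxiv can be downloaded under ./data/ on first use.
def _sketch_load_ogb_dataset():
    dataset = OGBDataset(name="ogbn-arxiv", data_dir="./data/")
    test_features = dataset.features[dataset.test_mask]  # features of OGB's test split
    return dataset.adj, test_features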
class CustomDataset(object):
    r"""
    Description
    -----------
    Class that helps to build a customized dataset for GRB evaluation.

    Parameters
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    features : torch.FloatTensor
        Features in form of ``N * D`` torch float tensor.
    labels : torch.LongTensor
        Labels in form of ``N * L``. L=1 for multi-class classification,
        otherwise for multi-label classification.
    train_mask : torch.Tensor, optional
        Mask of train nodes in form of ``N * 1`` torch bool tensor.
        If ``None``, generated by the default splitting scheme. Default: ``None``.
    val_mask : torch.Tensor, optional
        Mask of validation nodes in form of ``N * 1`` torch bool tensor.
        If ``None``, generated by the default splitting scheme. Default: ``None``.
    test_mask : torch.Tensor, optional
        Mask of test nodes in form of ``N * 1`` torch bool tensor.
        If ``None``, generated by the default splitting scheme. Default: ``None``.
    name : str, optional
        Name of dataset.
    data_dir : str, optional
        Directory of dataset.
    mode : str, optional
        Mode of dataset. One of ``["easy", "medium", "hard", "full"]``.
        Default: ``"full"``.
    feat_norm : str, optional
        Feature normalization that transforms all features to range [-1, 1].
        Choose from ``["linearize", "arctan", "tanh", "standardize"]``.
        Default: ``None``.
    save : bool, optional
        Whether to save data as files.
    verbose : bool, optional
        Whether to display logs. Default: ``True``.
    seed : int, optional
        Random seed for the splitting scheme. Default: ``42``.

    Attributes
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    features : torch.FloatTensor
        Features in form of ``N * D`` torch float tensor.
    labels : torch.LongTensor
        Labels in form of ``N * L``. L=1 for multi-class classification,
        otherwise for multi-label classification.
    num_nodes : int
        Number of nodes ``N``.
    num_edges : int
        Number of edges.
    num_features : int
        Dimension of features ``D``.
    num_classes : int
        Number of classes ``L``.
    num_train : int
        Number of train nodes.
    num_val : int
        Number of validation nodes.
    num_test : int
        Number of test nodes.
    mode : str
        Mode of dataset. One of ``["easy", "medium", "hard", "full"]``.
    index_train : np.ndarray
        Index of train nodes.
    index_val : np.ndarray
        Index of validation nodes.
    index_test : np.ndarray
        Index of test nodes.
    train_mask : torch.Tensor
        Mask of train nodes in form of ``N * 1`` torch bool tensor.
    val_mask : torch.Tensor
        Mask of validation nodes in form of ``N * 1`` torch bool tensor.
    test_mask : torch.Tensor
        Mask of test nodes in form of ``N * 1`` torch bool tensor.
    """

    def __init__(self, adj, features, labels, train_mask=None, val_mask=None,
                 test_mask=None, name=None, data_dir=None, mode='full',
                 feat_norm=None, save=False, verbose=True, seed=42):
        self.name = name
        self.adj = adj
        self.num_nodes = features.shape[0]
        self.num_edges = adj.getnnz() // 2
        self.num_features = features.shape[1]
        self.mode = mode

        if type(features) != torch.Tensor:
            features = torch.FloatTensor(features)
        elif features.type() != 'torch.FloatTensor':
            features = features.float()
        if feat_norm is not None:
            features = feat_normalize(features, norm=feat_norm)
        self.features = features

        if type(labels) != torch.Tensor:
            labels = torch.LongTensor(labels)
        elif labels.type() != 'torch.LongTensor':
            labels = labels.long()
        self.labels = labels

        if (train_mask is None) or (val_mask is None) or (test_mask is None):
            index = splitting(adj, seed=seed)
            self.index = index

        if train_mask is None:
            index_train = index.get("index_train")
            train_mask = torch.zeros(self.num_nodes, dtype=bool)
            train_mask[index_train] = True
        else:
            if type(train_mask) != torch.Tensor:
                train_mask = torch.BoolTensor(train_mask)
            elif train_mask.type() != 'torch.BoolTensor':
                train_mask = train_mask.bool()
        self.train_mask = train_mask

        if val_mask is None:
            index_val = index.get("index_val")
            val_mask = torch.zeros(self.num_nodes, dtype=bool)
            val_mask[index_val] = True
        else:
            if type(val_mask) != torch.Tensor:
                val_mask = torch.BoolTensor(val_mask)
            elif val_mask.type() != 'torch.BoolTensor':
                val_mask = val_mask.bool()
        self.val_mask = val_mask

        if test_mask is None:
            if mode == "easy":
                index_test = index.get("index_test_easy")
            elif mode == "medium":
                index_test = index.get("index_test_medium")
            elif mode == "hard":
                index_test = index.get("index_test_hard")
            else:  # "full" or any other value falls back to the entire test set
                index_test = index.get("index_test")
            test_mask = torch.zeros(self.num_nodes, dtype=bool)
            test_mask[index_test] = True
        else:
            if type(test_mask) != torch.Tensor:
                test_mask = torch.BoolTensor(test_mask)
            elif test_mask.type() != 'torch.BoolTensor':
                test_mask = test_mask.bool()
        self.test_mask = test_mask

        self.num_train = int(torch.sum(self.train_mask))
        self.num_val = int(torch.sum(self.val_mask))
        self.num_test = int(torch.sum(self.test_mask))

        if len(labels.shape) == 1:
            self.num_classes = int(labels.max() + 1)
        else:
            self.num_classes = labels.shape[-1]

        if save:
            if data_dir is None:
                data_dir = "./data"
            if not os.path.exists(data_dir):
                os.makedirs(data_dir)
            sp.save_npz(os.path.join(data_dir, "adj.npz"), adj.tocsr())
            np.savez_compressed(os.path.join(data_dir, "index.npz"), **index)
            np.savez_compressed(os.path.join(data_dir, "features.npz"), data=features)
            np.savez_compressed(os.path.join(data_dir, "labels.npz"), data=labels)
            print("    Saved in {}.".format(data_dir))

        if verbose:
            print("Custom Dataset '{}' loaded.".format(name))
            print("    Number of nodes: {}".format(self.num_nodes))
            print("    Number of edges: {}".format(self.num_edges))
            print("    Number of features: {}".format(self.num_features))
            print("    Number of classes: {}".format(self.num_classes))
            print("    Number of train samples: {}".format(self.num_train))
            print("    Number of val samples: {}".format(self.num_val))
            print("    Number of test samples: {}".format(self.num_test))
            print("    Dataset mode: {}".format(self.mode))
            print("    Feature range: [{:.4f}, {:.4f}]".format(self.features.min(), self.features.max()))
def feat_normalize(features, norm=None, lim_min=-1.0, lim_max=1.0):
    r"""
    Description
    -----------
    Feature normalization function.

    Parameters
    ----------
    features : torch.FloatTensor
        Features in form of ``N * D`` torch float tensor.
    norm : str, optional
        Type of normalization. Choose from ``["linearize", "arctan", "tanh",
        "standardize"]``. Default: ``None``.
    lim_min : float
        Minimum limit of feature value (used by ``"linearize"``). Default: ``-1.0``.
    lim_max : float
        Maximum limit of feature value (used by ``"linearize"``). Default: ``1.0``.

    Returns
    -------
    features : torch.FloatTensor
        Normalized features in form of ``N * D`` torch float tensor.
    """
    if norm == "linearize":
        k = (lim_max - lim_min) / (features.max() - features.min())
        features = lim_min + k * (features - features.min())
    elif norm == "arctan":
        features = (features - features.mean()) / features.std()
        features = 2 * np.arctan(features) / np.pi
    elif norm == "tanh":
        features = (features - features.mean()) / features.std()
        features = np.tanh(features)
    elif norm == "standardize":
        features = (features - features.mean()) / features.std()

    return features
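
# A quick sanity-check sketch: "arctan" first standardizes the features, then
# maps them through 2*arctan(x)/pi, so the result lies strictly inside (-1, 1).
def _sketch_feat_normalize():
    features = torch.randn(5, 4) * 10.0
    normed = feat_normalize(features, norm="arctan")
    assert normed.min() > -1.0 and normed.max() < 1.0
    return normed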
def splitting(adj,
              range_min=(0.0, 0.05),
              range_max=(0.95, 1.0),
              range_easy=(0.05, 0.35),
              range_medium=(0.35, 0.65),
              range_hard=(0.65, 0.95),
              ratio_train=0.6,
              ratio_val=0.1,
              ratio_test=0.1,
              seed=42):
    r"""
    Description
    -----------
    GRB splitting scheme designed for adversarial robustness evaluation.

    Parameters
    ----------
    adj : scipy.sparse.csr.csr_matrix
        Adjacency matrix in form of ``N * N`` sparse matrix.
    range_min : tuple of float, optional
        Range of nodes with minimum degrees to be ignored. Value in percentage.
        Default: ``(0.0, 0.05)``.
    range_max : tuple of float, optional
        Range of nodes with maximum degrees to be ignored. Value in percentage.
        Default: ``(0.95, 1.0)``.
    range_easy : tuple of float, optional
        Range of nodes for ``easy`` difficulty. Value in percentage.
        Default: ``(0.05, 0.35)``.
    range_medium : tuple of float, optional
        Range of nodes for ``medium`` difficulty. Value in percentage.
        Default: ``(0.35, 0.65)``.
    range_hard : tuple of float, optional
        Range of nodes for ``hard`` difficulty. Value in percentage.
        Default: ``(0.65, 0.95)``.
    ratio_train : float, optional
        Ratio of train nodes. Default: ``0.6``.
    ratio_val : float, optional
        Ratio of validation nodes. Note that the validation set is taken as
        the remainder after sampling train and test nodes. Default: ``0.1``.
    ratio_test : float, optional
        Ratio of test nodes (per difficulty). Default: ``0.1``.
    seed : int, optional
        Random seed. Default: ``42``.

    Returns
    -------
    index : dict
        Dictionary containing ``{"index_train", "index_val", "index_test",
        "index_test_easy", "index_test_medium", "index_test_hard"}``.
    """

    def a_not_in_b(a, b):
        # Return the elements of ``a`` that are not contained in ``b``.
        c = []
        for i in a:
            if i not in b:
                c.append(i)

        return np.array(c)

    num_nodes = adj.shape[0]
    degs = adj.getnnz(axis=1)
    print("GRB data splitting...")
    print("    Average degree of all nodes: {:.4f}".format(np.mean(degs)))

    degs_index = np.argsort(degs)
    ind_min = int(len(degs_index) * range_min[1])
    ind_max = int(len(degs_index) * range_max[0])
    print("    Average degree of 5% nodes with small degree: {:.4f}".format(
        np.mean(degs[degs_index[:ind_min]])))
    print("    Average degree of 5% nodes with large degree: {:.4f}".format(
        np.mean(degs[degs_index[ind_max:]])))

    # Sampling 'easy' test nodes
    ind_easy_min = int(len(degs_index) * range_easy[0])
    ind_easy_max = int(len(degs_index) * range_easy[1])
    print("    Average degree of 30% nodes (easy): {:.4f}".format(
        np.mean(degs[degs_index[ind_easy_min:ind_easy_max]])))

    np.random.seed(seed)
    ind_easy_sample = np.random.choice(degs_index[ind_easy_min:ind_easy_max],
                                       int(num_nodes * ratio_test), replace=False)
    print("    Randomly sampled {} nodes".format(ind_easy_sample.shape[0]))

    # Sampling 'medium' test nodes
    ind_medium_min = int(len(degs_index) * range_medium[0])
    ind_medium_max = int(len(degs_index) * range_medium[1])
    print("    Average degree of 30% nodes (medium): {:.4f}".format(
        np.mean(degs[degs_index[ind_medium_min:ind_medium_max]])))

    np.random.seed(seed)
    ind_medium_sample = np.random.choice(degs_index[ind_medium_min:ind_medium_max],
                                         int(num_nodes * ratio_test), replace=False)
    print("    Randomly sampled {} nodes".format(ind_medium_sample.shape[0]))

    # Sampling 'hard' test nodes
    ind_hard_min = int(len(degs_index) * range_hard[0])
    ind_hard_max = int(len(degs_index) * range_hard[1])
    print("    Average degree of 30% nodes (hard): {:.4f}".format(
        np.mean(degs[degs_index[ind_hard_min:ind_hard_max]])))

    np.random.seed(seed)
    ind_hard_sample = np.random.choice(degs_index[ind_hard_min:ind_hard_max],
                                       int(num_nodes * ratio_test), replace=False)
    print("    Randomly sampled {} nodes".format(ind_hard_sample.shape[0]))

    ind_test = np.concatenate([ind_easy_sample, ind_medium_sample, ind_hard_sample])

    # Sampling nodes for training and validation
    ind_rest = a_not_in_b(degs_index, ind_test)
    np.random.seed(seed)
    ind_train = np.random.choice(ind_rest, int(num_nodes * ratio_train), replace=False)
    ind_val = a_not_in_b(ind_rest, ind_train)
    print("    Number of training/validation nodes: {}/{}".format(len(ind_train), len(ind_val)))

    if len(ind_train) + len(ind_val) + len(ind_test) == num_nodes:
        print("    No duplicate.")
    else:
        print("    Found duplicates.")

    index = {"index_train"      : np.sort(ind_train),
             "index_val"        : np.sort(ind_val),
             "index_test"       : np.sort(ind_test),
             "index_test_easy"  : np.sort(ind_easy_sample),
             "index_test_medium": np.sort(ind_medium_sample),
             "index_test_hard"  : np.sort(ind_hard_sample)}

    return index
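
# A usage sketch of the splitting scheme on a hypothetical random graph of
# 1000 nodes: 60% train, 10% per test difficulty (30% test in total), and the
# remaining 10% as validation.
def _sketch_splitting():
    adj = sp.random(1000, 1000, density=0.01, format="csr")
    index = splitting(adj, seed=42)
    assert len(index["index_train"]) == 600       # int(1000 * ratio_train)
    assert len(index["index_test_easy"]) == 100   # int(1000 * ratio_test)
    return index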