import logging
import numpy as np
import torch
from tqdm import tqdm
import autogl
from ._basic import BaseFeatureGenerator
from .._feature_engineer_registry import FeatureEngineerUniversalRegistry
_LOGGER = logging.getLogger("FE")
class _Graphlet:
def __init__(self, data, sample_error=0.1, sample_confidence=0.1):
self._data = data
self._init()
self._sample_error = sample_error
self._sample_confidence = sample_confidence
self._dw = int(
np.ceil(
0.5 * (self._sample_error ** -2) * np.log(2 / self._sample_confidence)
)
)
_LOGGER.info(
"sample error {} , confidence {},num {}".format(
self._sample_error, self._sample_confidence, self._dw
)
)
def _init(self):
self._edges = list(self._data.edge_index)
self._edges = [self._edges[0], self._edges[1]]
self._num_nodes = self._data.x.shape[0]
self._num_edges = len(self._edges[0])
self._neighbours = [[] for _ in range(self._num_nodes)]
for i in range(len(self._edges[0])):
u, v = self._edges[0][i], self._edges[1][i]
self._neighbours[u].append(v)
_LOGGER.info("nodes {} , edges {}".format(self._num_nodes, self._num_edges))
# sorting
self._node_degrees = np.array([len(x) for x in self._neighbours])
self._nodes = np.argsort(self._node_degrees)
for i in self._nodes:
self._neighbours[i] = [
x
for _, x in sorted(
zip(self._node_degrees[self._neighbours[i]], self._neighbours[i]),
reverse=True,
)
]
self._neighbours = [np.array(x) for x in self._neighbours]
def _get_gdv(self, v, u):
if self._node_degrees[v] >= self._node_degrees[u]:
pass
else:
u, v = v, u
Sv, Su, Te = set(), set(), set()
sigma1, sigma2 = 0, 0
nb = self._neighbours
N = self._num_nodes
M = self._num_edges
phi = np.zeros(self._num_nodes, dtype=int)
c1, c2, c3, c4 = 1, 2, 3, 4
x = np.zeros(16, dtype=int)
# p1
for w in nb[v]:
if w != u:
Sv.add(w)
phi[w] = c1
# p2
for w in nb[u]:
if w != v:
if phi[w] == c1:
Te.add(w)
phi[w] = c3
Sv.remove(w)
else:
Su.add(w)
phi[w] = c2
# p3
for w in Te:
for r in nb[w]:
if phi[r] == c3:
x[5] += 1
phi[w] = c4
sigma2 = sigma2 + len(nb[w]) - 2
# p4
for w in Su:
for r in nb[w]:
if phi[r] == c1:
x[8] += 1
if phi[r] == c2:
x[7] += 1
if phi[r] == c4:
sigma1 += 1
phi[w] = 0
sigma2 = sigma2 + len(nb[w]) - 1
# p5
for w in Sv:
for r in nb[w]:
if phi[r] == c1:
x[7] += 1
if phi[r] == c4:
sigma1 += 1
phi[w] = 0
sigma2 = sigma2 + len(nb[w]) - 1
lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
# 3-graphlet
x[1] = lte
x[2] = du + dv - 2 - 2 * x[1]
x[3] = N - x[2] - x[1] - 2
x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
# 4 connected graphlets
x[6] = x[1] * (x[1] - 1) / 2 - x[5]
x[10] = lsv * lsu - x[8]
x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
# 4 disconnected graphlets
t1 = N - (lte + lsu + lsv + 2)
x[11] = x[1] * t1
x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
x[13] = (lsu + lsv) * t1
x[14] = t1 * (t1 - 1) / 2 - x[12]
x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])
return x
def _get_gdv_sample(self, v, u):
if self._node_degrees[v] >= self._node_degrees[u]:
pass
else:
u, v = v, u
Sv = set()
sigma1, sigma2 = 0, 0
nb = self._neighbours
N = self._num_nodes
M = self._num_edges
phi = np.zeros(self._num_nodes, dtype=int)
c1, c2, c3, c4 = 1, 2, 3, 4
x = np.zeros(16)
dw = self._dw
# p1
Sv = set(nb[v][nb[v] != u])
phi[list(Sv)] = c1
# p2
p2w = nb[u][nb[u] != c1]
p2w1 = p2w[phi[p2w] == c1]
p2w2 = p2w[phi[p2w] != c1]
Te = p2w1
phi[p2w1] = c3
Sv -= set(list(p2w1))
Su = p2w2
phi[p2w2] = c2
# p3
for w in Te:
if dw >= len(nb[w]):
region = nb[w]
inc = 1
else:
region = np.random.choice(nb[w], dw, replace=False)
inc = self._node_degrees[w] / dw
phir = phi[region]
x[5] += inc * np.sum(phir == c3)
phi[w] = c4
sigma2 = sigma2 + len(nb[w]) - 2
# p4
for w in Su:
if dw >= len(nb[w]):
region = nb[w]
inc = 1
else:
region = np.random.choice(nb[w], dw, replace=False)
inc = self._node_degrees[w] / dw
phir = phi[region]
x[8] += inc * np.sum(phir == c1)
x[7] += inc * np.sum(phir == c2)
sigma1 += inc * np.sum(phir == c4)
phi[w] = 0
sigma2 = sigma2 + len(nb[w]) - 1
# p5
for w in Sv:
if dw >= len(nb[w]):
region = nb[w]
inc = 1
else:
region = np.random.choice(nb[w], dw, replace=False)
inc = self._node_degrees[w] / dw
phir = phi[region]
x[7] += inc * np.sum(phir == c1)
sigma1 += inc * np.sum(phir == c4)
phi[w] = 0
sigma2 = sigma2 + len(nb[w]) - 1
lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
# 3-graphlet
x[1] = lte
x[2] = du + dv - 2 - 2 * x[1]
x[3] = N - x[2] - x[1] - 2
x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
# 4 connected graphlets
x[6] = x[1] * (x[1] - 1) / 2 - x[5]
x[10] = lsv * lsu - x[8]
x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
# 4 disconnected graphlets
t1 = N - (lte + lsu + lsv + 2)
x[11] = x[1] * t1
x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
x[13] = (lsu + lsv) * t1
x[14] = t1 * (t1 - 1) / 2 - x[12]
x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])
return x
def get_gdvs(self, sample=True):
res = np.zeros((self._num_nodes, 15))
for u in tqdm(range(self._num_nodes)):
vs = self._neighbours[u]
if len(vs) != 0:
gdvs = []
for v in tqdm(vs, disable=len(vs) < 100):
if sample:
gdvs.append(self._get_gdv_sample(u, v))
else:
gdvs.append(self._get_gdv(u, v))
res[u, :] = np.mean(gdvs, axis=0)[1:]
return res
[docs]@FeatureEngineerUniversalRegistry.register_feature_engineer("graph" + "let")
class GraphletGenerator(BaseFeatureGenerator):
r"""generate local graphlet numbers as features. The implementation refers to [#]_ .
References
----------
.. [#] Ahmed, N. K., Willke, T. L., & Rossi, R. A. (2016).
Estimation of local subgraph counts. Proceedings - 2016 IEEE International Conference on Big Data, Big Data 2016, 586–595.
https://doi.org/10.1109/BigData.2016.7840651
"""
def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor:
result: np.ndarray = _Graphlet(data).get_gdvs()
return torch.from_numpy(result)