Source code for autogl.module.feature._generators._graphlet

import logging
import numpy as np
import torch
from tqdm import tqdm
import autogl
from ._basic import BaseFeatureGenerator
from .._feature_engineer_registry import FeatureEngineerUniversalRegistry

_LOGGER = logging.getLogger("FE")


class _Graphlet:
    def __init__(self, data, sample_error=0.1, sample_confidence=0.1):
        self._data = data
        self._init()

        self._sample_error = sample_error
        self._sample_confidence = sample_confidence
        self._dw = int(
            np.ceil(
                0.5 * (self._sample_error ** -2) * np.log(2 / self._sample_confidence)
            )
        )
        _LOGGER.info(
            "sample error {} , confidence {},num {}".format(
                self._sample_error, self._sample_confidence, self._dw
            )
        )

    def _init(self):
        self._edges = list(self._data.edge_index)
        self._edges = [self._edges[0], self._edges[1]]
        self._num_nodes = self._data.x.shape[0]
        self._num_edges = len(self._edges[0])
        self._neighbours = [[] for _ in range(self._num_nodes)]
        for i in range(len(self._edges[0])):
            u, v = self._edges[0][i], self._edges[1][i]
            self._neighbours[u].append(v)

        _LOGGER.info("nodes {} , edges {}".format(self._num_nodes, self._num_edges))

        # sorting
        self._node_degrees = np.array([len(x) for x in self._neighbours])
        self._nodes = np.argsort(self._node_degrees)
        for i in self._nodes:
            self._neighbours[i] = [
                x
                for _, x in sorted(
                    zip(self._node_degrees[self._neighbours[i]], self._neighbours[i]),
                    reverse=True,
                )
            ]
        self._neighbours = [np.array(x) for x in self._neighbours]

    def _get_gdv(self, v, u):
        if self._node_degrees[v] >= self._node_degrees[u]:
            pass
        else:
            u, v = v, u
        Sv, Su, Te = set(), set(), set()
        sigma1, sigma2 = 0, 0
        nb = self._neighbours
        N = self._num_nodes
        M = self._num_edges
        phi = np.zeros(self._num_nodes, dtype=int)
        c1, c2, c3, c4 = 1, 2, 3, 4
        x = np.zeros(16, dtype=int)
        # p1
        for w in nb[v]:
            if w != u:
                Sv.add(w)
                phi[w] = c1
        # p2
        for w in nb[u]:
            if w != v:
                if phi[w] == c1:
                    Te.add(w)
                    phi[w] = c3
                    Sv.remove(w)
                else:
                    Su.add(w)
                    phi[w] = c2
        # p3
        for w in Te:
            for r in nb[w]:
                if phi[r] == c3:
                    x[5] += 1
            phi[w] = c4
            sigma2 = sigma2 + len(nb[w]) - 2
        # p4
        for w in Su:
            for r in nb[w]:
                if phi[r] == c1:
                    x[8] += 1
                if phi[r] == c2:
                    x[7] += 1
                if phi[r] == c4:
                    sigma1 += 1
            phi[w] = 0
            sigma2 = sigma2 + len(nb[w]) - 1
        # p5
        for w in Sv:
            for r in nb[w]:
                if phi[r] == c1:
                    x[7] += 1
                if phi[r] == c4:
                    sigma1 += 1
            phi[w] = 0
            sigma2 = sigma2 + len(nb[w]) - 1

        lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
        # 3-graphlet
        x[1] = lte
        x[2] = du + dv - 2 - 2 * x[1]
        x[3] = N - x[2] - x[1] - 2
        x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
        # 4 connected graphlets
        x[6] = x[1] * (x[1] - 1) / 2 - x[5]
        x[10] = lsv * lsu - x[8]
        x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
        # 4 disconnected graphlets
        t1 = N - (lte + lsu + lsv + 2)
        x[11] = x[1] * t1
        x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
        x[13] = (lsu + lsv) * t1
        x[14] = t1 * (t1 - 1) / 2 - x[12]
        x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])

        return x

    def _get_gdv_sample(self, v, u):
        if self._node_degrees[v] >= self._node_degrees[u]:
            pass
        else:
            u, v = v, u
        Sv = set()
        sigma1, sigma2 = 0, 0
        nb = self._neighbours
        N = self._num_nodes
        M = self._num_edges
        phi = np.zeros(self._num_nodes, dtype=int)
        c1, c2, c3, c4 = 1, 2, 3, 4
        x = np.zeros(16)
        dw = self._dw

        # p1
        Sv = set(nb[v][nb[v] != u])
        phi[list(Sv)] = c1
        # p2
        p2w = nb[u][nb[u] != c1]
        p2w1 = p2w[phi[p2w] == c1]
        p2w2 = p2w[phi[p2w] != c1]
        Te = p2w1
        phi[p2w1] = c3
        Sv -= set(list(p2w1))
        Su = p2w2
        phi[p2w2] = c2
        # p3
        for w in Te:
            if dw >= len(nb[w]):
                region = nb[w]
                inc = 1
            else:
                region = np.random.choice(nb[w], dw, replace=False)
                inc = self._node_degrees[w] / dw
            phir = phi[region]
            x[5] += inc * np.sum(phir == c3)
            phi[w] = c4
            sigma2 = sigma2 + len(nb[w]) - 2
        # p4
        for w in Su:
            if dw >= len(nb[w]):
                region = nb[w]
                inc = 1
            else:
                region = np.random.choice(nb[w], dw, replace=False)
                inc = self._node_degrees[w] / dw
            phir = phi[region]
            x[8] += inc * np.sum(phir == c1)
            x[7] += inc * np.sum(phir == c2)
            sigma1 += inc * np.sum(phir == c4)
            phi[w] = 0
            sigma2 = sigma2 + len(nb[w]) - 1
        # p5
        for w in Sv:
            if dw >= len(nb[w]):
                region = nb[w]
                inc = 1
            else:
                region = np.random.choice(nb[w], dw, replace=False)
                inc = self._node_degrees[w] / dw
            phir = phi[region]
            x[7] += inc * np.sum(phir == c1)
            sigma1 += inc * np.sum(phir == c4)
            phi[w] = 0
            sigma2 = sigma2 + len(nb[w]) - 1

        lsv, lsu, lte, du, dv = len(Sv), len(Su), len(Te), len(nb[u]), len(nb[v])
        # 3-graphlet
        x[1] = lte
        x[2] = du + dv - 2 - 2 * x[1]
        x[3] = N - x[2] - x[1] - 2
        x[4] = N * (N - 1) * (N - 2) / 6 - (x[1] + x[2] + x[3])
        # 4 connected graphlets
        x[6] = x[1] * (x[1] - 1) / 2 - x[5]
        x[10] = lsv * lsu - x[8]
        x[9] = lsv * (lsv - 1) / 2 + lsu * (lsu - 1) / 2 - x[7]
        # 4 disconnected graphlets
        t1 = N - (lte + lsu + lsv + 2)
        x[11] = x[1] * t1
        x[12] = M - (du + dv - 1) - (sigma2 - sigma1 - x[5] - x[8] - x[7])
        x[13] = (lsu + lsv) * t1
        x[14] = t1 * (t1 - 1) / 2 - x[12]
        x[15] = N * (N - 1) * (N - 2) * (N - 3) / 24 - np.sum(x[5:15])

        return x

    def get_gdvs(self, sample=True):
        res = np.zeros((self._num_nodes, 15))
        for u in tqdm(range(self._num_nodes)):
            vs = self._neighbours[u]
            if len(vs) != 0:
                gdvs = []
                for v in tqdm(vs, disable=len(vs) < 100):
                    if sample:
                        gdvs.append(self._get_gdv_sample(u, v))
                    else:
                        gdvs.append(self._get_gdv(u, v))
                res[u, :] = np.mean(gdvs, axis=0)[1:]
        return res


[docs]@FeatureEngineerUniversalRegistry.register_feature_engineer("graph" + "let") class GraphletGenerator(BaseFeatureGenerator): r"""generate local graphlet numbers as features. The implementation refers to [#]_ . References ---------- .. [#] Ahmed, N. K., Willke, T. L., & Rossi, R. A. (2016). Estimation of local subgraph counts. Proceedings - 2016 IEEE International Conference on Big Data, Big Data 2016, 586–595. https://doi.org/10.1109/BigData.2016.7840651 """ def _extract_nodes_feature(self, data: autogl.data.Data) -> torch.Tensor: result: np.ndarray = _Graphlet(data).get_gdvs() return torch.from_numpy(result)