Christian McDaniel
Data Scientist & Software Engineer,
NCR Innovation Lab
*(figures: Word Embedding, Molecular Structure, Traffic Routes, Neuronal Networks)*

+ Great success with computer vision-based applications

Where we're headed: the graph convolutional network (GCN) layer and the two-layer model built from it.

$Z = \tilde{D}^{\frac{-1}{2}}\tilde{A}\tilde{D}^{\frac{-1}{2}}X\Theta$

$H^{(l+1)} = \sigma(\tilde{D}^{\frac{-1}{2}}\tilde{A}\tilde{D}^{\frac{-1}{2}}H^{(l)}W^{(l)})$

$Z = f(X,A) = \textrm{softmax}(\hat{A} \textrm{ReLU}(\hat{A}XW^{(0)})W^{(1)})$




Spectral graph convolutions start from the eigendecomposition of the normalized graph Laplacian:

$L = I - D^{-1/2} A D^{-1/2} = U \Lambda U^{T}$

The graph Fourier transform of a signal $x$ projects it onto the Laplacian's eigenvectors:

$\hat{x} = U^{T}x \in \mathbb{R}^{n}$

A spectral convolution with a filter $g_{\Theta}$ is a pointwise product in this Fourier domain:

$g_{\Theta} \ast x = U ((U^{T}g_{\Theta}) \odot (U^{T}x)) = g_{\Theta}(U \Lambda U^{T})x = U\hat{G}U^{T}x$

To avoid the costly eigendecomposition, the filter is approximated by a truncated expansion in Chebyshev polynomials $T_{k}$ of the rescaled eigenvalues $\tilde{\Lambda} = \frac{2}{\lambda_{max}}\Lambda - I$:

$g_{\Theta}^{\prime} \approx \sum^{K}_{k=0}\theta^{\prime}_{k}T_{k}(\tilde{\Lambda})$

$g_{\Theta} \ast x \approx \sum^{K}_{k=0}\theta^{\prime}_{k}T_{k}(\tilde{L})x$
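A minimal sketch of this truncated filter (my own illustration, not code from the paper or the pygcn repo; the function and variable names are hypothetical), using the Chebyshev recurrence $T_{k}(\tilde{L})x = 2\tilde{L}\,T_{k-1}(\tilde{L})x - T_{k-2}(\tilde{L})x$:

```python
import torch

def chebyshev_filter(x, L_tilde, thetas):
    """Approximate g_Theta * x as sum_k theta'_k T_k(L_tilde) x.

    x       : (n,) node signal
    L_tilde : (n, n) rescaled Laplacian, 2 L / lambda_max - I
    thetas  : list of K+1 Chebyshev coefficients theta'_k
    """
    # T_0(L~) x = x,  T_1(L~) x = L~ x
    t_prev, t_curr = x, L_tilde @ x
    out = thetas[0] * t_prev
    if len(thetas) > 1:
        out = out + thetas[1] * t_curr
    # Recurrence: T_k(L~) x = 2 L~ T_{k-1}(L~) x - T_{k-2}(L~) x
    for theta_k in thetas[2:]:
        t_prev, t_curr = t_curr, 2 * (L_tilde @ t_curr) - t_prev
        out = out + theta_k * t_curr
    return out
```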
1) In moving toward multi-layer networks, we can remove the explicit Chebyshev parameterization by limiting the layer-wise convolution operation to $K=1$
2) When calculating the rescaled $\tilde{\Lambda}$ and $\tilde{L}$, we can approximate $\lambda_{max} \approx 2$, expecting the neural network parameters to adapt accordingly during training.
$\tilde{L} = \frac{2}{\lambda_{max}}L - I$, where $L = I - D^{-1/2} A D^{-1/2} \rightarrow \tilde{L} \approx L - I = -D^{\frac{-1}{2}}AD^{\frac{-1}{2}}$
$g_{\Theta} \ast x \approx \theta^{\prime}_{0}x + \theta^{\prime}_{1}(L - I)x = \theta^{\prime}_{0}x - \theta^{\prime}_{1}D^{\frac{-1}{2}}AD^{\frac{-1}{2}}x$
3) We can further constrain the problem to learning a single parameter (per dimension in $x$), $\theta = \theta^{\prime}_{0} = -\theta^{\prime}_{1}$, giving
$g_{\Theta} \ast x \approx \theta(I + D^{\frac{-1}{2}}AD^{\frac{-1}{2}})x$
4) The previous steps leave the operator $I + D^{\frac{-1}{2}}AD^{\frac{-1}{2}}$ with eigenvalues in $[0,2]$, potentially leading to numerical instabilities and exploding/vanishing gradients when the operation is applied repeatedly, so we apply the renormalization trick:
$I + D^{\frac{-1}{2}}AD^{\frac{-1}{2}} \rightarrow \tilde{D}^{\frac{-1}{2}}\tilde{A}\tilde{D}^{\frac{-1}{2}}$, with $\tilde{A} = A + I$ and $\tilde{D}_{ii} = \sum_{j}\tilde{A}_{ij}$
$Z = \tilde{D}^{\frac{-1}{2}}\tilde{A}\tilde{D}^{\frac{-1}{2}}X\Theta$
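A quick sanity-check sketch of the renormalization trick and the resulting layer, $Z = \hat{A}X\Theta$ (toy tensors and names of my choosing, not the author's code):

```python
import torch

def normalize_adjacency(A):
    """Renormalization trick: A_hat = D~^{-1/2} (A + I) D~^{-1/2}."""
    A_tilde = A + torch.eye(A.size(0))       # add self-loops: A~ = A + I
    d_tilde = A_tilde.sum(dim=1)             # D~_ii = sum_j A~_ij
    d_inv_sqrt = d_tilde.pow(-0.5)
    # Scale rows then columns by D~^{-1/2}
    return d_inv_sqrt.unsqueeze(1) * A_tilde * d_inv_sqrt.unsqueeze(0)

# Toy example: 3 nodes, 2 input features, 4 output features
A = torch.tensor([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
X = torch.randn(3, 2)
Theta = torch.randn(2, 4)
Z = normalize_adjacency(A) @ X @ Theta       # Z = A_hat X Theta, shape (3, 4)
```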

Stacking this operation with a nonlinearity $\sigma$ gives the layer-wise propagation rule, and the two-layer model used for semi-supervised node classification:

$H^{(l+1)} = \sigma(\tilde{D}^{\frac{-1}{2}}\tilde{A}\tilde{D}^{\frac{-1}{2}}H^{(l)}W^{(l)})$

$Z = f(X,A) = \textrm{softmax}(\hat{A} \textrm{ReLU}(\hat{A}XW^{(0)})W^{(1)})$, where $\hat{A} = \tilde{D}^{\frac{-1}{2}}\tilde{A}\tilde{D}^{\frac{-1}{2}}$
""" Github: tkipf/pygcn """
import math
import torch
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)
    def forward(self, input, adj):
        # support = X W  (dense feature transform)
        support = torch.mm(input, self.weight)
        # output = A_hat X W  (sparse matmul with the normalized adjacency)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output
```
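A small usage sketch for the layer above (toy shapes of my choosing; I'm assuming `adj` has already been normalized to $\hat{A}$ and converted to a sparse tensor):

```python
import torch

layer = GraphConvolution(in_features=1433, out_features=16)
x   = torch.randn(2708, 1433)        # node features (Cora-sized, for illustration)
adj = torch.eye(2708).to_sparse()    # stand-in for the normalized, sparse A_hat
out = layer(x, adj)                  # shape: (2708, 16)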
""" Github: tkipf/pygcn """
import torch.nn as nn
import torch.nn.functional as F
from layers import GraphConvolution
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dropout = dropout
    def forward(self, x, adj):
        # Layer 1: A_hat X W^(0), followed by ReLU and dropout
        x = F.relu(self.gc1(x, adj))
        x = F.dropout(x, self.dropout, training=self.training)
        # Layer 2: A_hat H^(1) W^(1), then log-softmax over classes
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)
```
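A minimal training sketch in the spirit of pygcn's `train.py` (the `features`, `adj`, `labels`, and `idx_train` tensors are assumed to come from the repo's data-loading utilities; hyperparameters are illustrative):

```python
import torch.nn.functional as F
import torch.optim as optim

model = GCN(nfeat=features.shape[1], nhid=16,
            nclass=labels.max().item() + 1, dropout=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)                         # log-probabilities per node
    loss = F.nll_loss(output[idx_train], labels[idx_train])
    loss.backward()
    optimizer.step()
```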
```python
import pandas as pd
edges    = pd.read_csv('/Users/cm185255/Documents/pygcn/data/cora/cora.cites',sep='\t',header=None,names=["cited paper ID","citing paper ID"])
features = pd.read_csv('/Users/cm185255/Documents/pygcn/data/cora/cora.content',sep='\t',header=None)
print('Table for the Edges (citations between papers)')
print(edges.head())
print('\nTable for the Features (word counts for each paper; last column = class)')
print(features.head())
```
...
```
Table for the Edges (citations between papers)
   cited paper ID  citing paper ID
0              35             1033
1              35           103482
2              35           103515
3              35          1050679
4              35          1103960
Table for the Features (word counts for each paper; last column = class)
      0     1     2     3     ...  1431  1432  1433                    1434
0    31336     0     0     0  ...     0     0     0         Neural_Networks
1  1061127     0     0     0  ...     0     0     0           Rule_Learning
2  1106406     0     0     0  ...     0     0     0  Reinforcement_Learning
3    13195     0     0     0  ...     0     0     0  Reinforcement_Learning
4    37879     0     0     0  ...     0     0     0   Probabilistic_Methods
[5 rows x 1435 columns]
```
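From these two tables, a rough sketch of how the GCN's inputs could be assembled (a simplified stand-in for pygcn's `load_data`; the variable names are mine, and normalization of `A` is omitted, see the renormalization step above):

```python
import numpy as np
import torch

# Map paper IDs (column 0 of the feature table) to row indices
ids = features[0].values
id_to_idx = {pid: i for i, pid in enumerate(ids)}

# Bag-of-words features and integer class labels (last column)
X = torch.FloatTensor(features.iloc[:, 1:-1].values)
classes = sorted(features.iloc[:, -1].unique())
y = torch.LongTensor(features.iloc[:, -1].map({c: i for i, c in enumerate(classes)}).values)

# Binary adjacency matrix from the citation edges, made symmetric
n = len(ids)
A = np.zeros((n, n), dtype=np.float32)
for cited, citing in edges.values:
    i, j = id_to_idx[cited], id_to_idx[citing]
    A[i, j] = A[j, i] = 1.0
A = torch.FloatTensor(A)
```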

At each layer, the features at each node are transformed:
$\pmb{h} = \{h_{1}, h_{2}, ..., h_{N}\}, h_{i} \in \mathbb{R}^{F} \rightarrow \pmb{h}^{\prime} = \{h_{1}^{\prime}, h_{2}^{\prime}, ..., h_{N}^{\prime}\}, h_{i}^{\prime} \in \mathbb{R}^{F^{\prime}}$
A shared weight matrix $\pmb{W}$ is learned for all nodes, and attention coefficients are calculated to denote the importance of node $j$'s features to node $i$:
$e_{ij} = a(\pmb{W}h_{i},\pmb{W}h_{j})$
Where $a(\cdot)$ is an attention mechanism; in our case, a single GCN layer.
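For reference, a sketch of the standard single-layer feedforward attention mechanism from the original graph attention network formulation (not the GCN-based mechanism used here; names and shapes are illustrative):

```python
import torch
import torch.nn.functional as F

def attention_coefficients(h, W, a):
    """alpha_ij = softmax_j( LeakyReLU( a^T [W h_i || W h_j] ) ).

    h : (N, F)   node features
    W : (F, F')  shared weight matrix
    a : (2F',)   attention vector
    """
    Wh = h @ W                                   # (N, F')
    N = Wh.size(0)
    # Build [W h_i || W h_j] for every pair (i, j)
    pairs = torch.cat([Wh.unsqueeze(1).expand(N, N, -1),
                       Wh.unsqueeze(0).expand(N, N, -1)], dim=-1)
    e = F.leaky_relu(pairs @ a)                  # raw scores e_ij, shape (N, N)
    # In a full GAT layer, e would be masked to each node's neighborhood first
    return F.softmax(e, dim=1)                   # normalize over neighbors j
```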
*(figures: Anatomical MRI, Diffusion MRI)*

Fin!
Questions?