Numerical examples for "A bit-by-bit guide to the equations governing differentiable neural computers"

Auxiliary functions

# Auxiliary functions:

def _render_table (j=0,u=0,gamma=0,prod=0,a=0,state="row"):
    """Incrementally build the HTML table with the allocation-weighting steps.

    The markup is accumulated in the function attribute ``_render_table.s``.
    The function is meant to be called once with ``state="start"``, then once
    per table row with ``state="row"`` and finally with ``state="end"``.

    Parameters (only read when ``state="row"``):
        j: index of the memory location shown in the row.
        u: usage of that location.
        gamma: position of the location in the free list.
        prod: product of the usages of the locations preceding it in the
            free list.
        a: resulting allocation weighting for the location.
        state: ``"start"`` resets the buffer and writes the table header,
            ``"row"`` appends one row, ``"end"`` closes the table.

    Returns:
        The complete markup when ``state="end"``; ``None`` otherwise.

    Raises:
        AttributeError: if ``"row"`` or ``"end"`` is used before the first
            ``"start"`` call, because the buffer does not exist yet.
    """
    if state=="start":
        # The six header cells match the six <col> widths declared below.
        _render_table.s= ''.join(["$$\\newcommand{\\b}[1]{\\boldsymbol{#1}}$$",
            "<table style='width:60%'><colgroup><col style='width:5%'>"
            "<col style='width:10%'><col style='width:25%'>"
            "<col style='width:20%'><col style='width:30%'>",
            "<col style='width:10%'></colgroup>",
            "<thead><tr style='background-color: lightgrey'>",
            "<th>$j$</th>",
            "<th>$\\b{u}_t[j]$</th>",
            "<th>$\\gamma : \\b{\\phi}_t[\\gamma] = j$</th>",
            "<th>$1-\\b{u}_t[j]$</th>",
            # "\\prod" instead of "\prod": same text, without the invalid
            # escape sequence warning.
            "<th>$\\prod_{i=1}^{\\gamma-1} \\b{u}_t[\\b{\\phi}_t[i]]$</th>",
            "<th>$\\b{a}_t[j]$</th>",
            "</tr></thead><tbody>"])
    elif state=="end":
        _render_table.s+= "</tbody></table>"
        return _render_table.s
    elif state=="row":
        # NOTE(review): the cell layout was reconstructed from the column
        # headers; confirm the number formatting against the published guide.
        cells= [j, u, gamma, 1-u, prod, a]
        _render_table.s+= ''.join(["<tr>",
            ''.join("<td>"+str(cell)+"</td>" for cell in cells),
            "</tr>"])
def _matrix_to_latex (A):
    """Return the LaTeX ``bmatrix`` source for the two-dimensional iterable A.

    Every entry is converted with ``str``; entries of a row are separated by
    `` & `` and rows by a LaTeX line break followed by a newline.
    """
    rows= (" & ".join(str(entry) for entry in row) for row in A)
    return "\\begin{bmatrix}" + " \\\\\n".join(rows) + "\\end{bmatrix}"

Example of a dualistic read operation

$$ \newcommand{\b}[1]{\boldsymbol{#1}} \newcommand{\r}[1]{\mathrm{#1}} \newcommand{\c}[1]{\mathcal{#1}} $$

A position in memory can be read with the operation $\b{M}^\top_t \b{w}^\r{r}_t$.

import numpy as np
from IPython.display import display, Markdown

# Memory at time t (one row per memory location):
M= np.array([[-0.5,  0.01, 3.1],
             [ 0.2,  0.6,  1.2],
             [ 0,    0,    0  ],
             [-0.1, -0.05, 0  ]])
# Read weighting as a column vector; the single 1 selects the second location:
w= np.array([0,1,0,0]).reshape(-1,1)

# Operation to retrieve memory contents using the read weighting:
r= M.T @ w

display(Markdown("$$\\b{M}_t="+_matrix_to_latex(M)+"\\quad \\b{w}^\\r{r}_t="+_matrix_to_latex(w)+"$$"))
display(Markdown("Vector retrieved from memory:"))
# Show the read vector announced by the previous line.
display(Markdown("$$\\b{r}_t="+_matrix_to_latex(r)+"$$"))


$$\b{M}_t=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ 0.2 & 0.6 & 1.2 \\ 0.0 & 0.0 & 0.0 \\ -0.1 & -0.05 & 0.0\end{bmatrix}\quad \b{w}^\r{r}_t=\begin{bmatrix}0 \\ 1 \\ 0 \\ 0\end{bmatrix}$$

Vector retrieved from memory:

$$\b{r}_t=\begin{bmatrix}0.2 \\ 0.6 \\ 1.2\end{bmatrix}$$

Example of a non-dualistic read operation

When the read weighting is spread over several positions, the same operation $\b{M}^\top_t \b{w}^\r{r}_t$ retrieves a weighted combination of the corresponding memory rows.

# Memory at time t (one row per memory location):
M= np.array([[-0.5,  0.01, 3.1],
             [ 0.2,  0.6,  1.2],
             [ 0,    0,    0  ],
             [-0.1, -0.05, 0  ]])
# Read weighting as a column vector; the weight is shared by three locations:
w= np.array([0,0.8,0.1,0.1]).reshape(-1,1)

# Operation to retrieve memory contents using the read weighting:
r= M.T @ w

display(Markdown("$$\\b{M}_t="+_matrix_to_latex(M)+"\\quad \\b{w}^\\r{r}_t="+_matrix_to_latex(w)+"$$"))
display(Markdown("Vector retrieved from memory:"))
# Show the read vector announced by the previous line.
display(Markdown("$$\\b{r}_t="+_matrix_to_latex(r)+"$$"))


$$\b{M}_t=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ 0.2 & 0.6 & 1.2 \\ 0.0 & 0.0 & 0.0 \\ -0.1 & -0.05 & 0.0\end{bmatrix}\quad \b{w}^\r{r}_t=\begin{bmatrix}0.0 \\ 0.8 \\ 0.1 \\ 0.1\end{bmatrix}$$

Vector retrieved from memory:

$$\b{r}_t=\begin{bmatrix}0.15 \\ 0.475 \\ 0.96\end{bmatrix}$$

Example of a dualistic write operation

The equation that determines the new content of the memory is:

$$ \b{M}_t = \b{M}_{t-1} \circ \left( \b{E} - \b{w}^\r{w}_t \b{e}^\top_t \right) + \b{w}^\r{w}_t \b{v}^\top_t $$
import numpy as np

# Memory at time t-1:
M1= np.array([[-0.5,  0.01, 3.1],
              [ 0.2,  0.6,  1.2],
              [ 0,    0,    0  ],
              [-0.1, -0.05, 0]])
# Write weighting (column vector); only the second location is written:
w= np.array([0,1,0,0]).reshape(-1,1)
# Write vector:
v= np.array([-1.5,-1.3,-1.1]).reshape(-1,1)
# Erase vector (all ones: the previous content is fully erased):
e= np.array([1,1,1]).reshape(-1,1)
# Matrix of ones with the same shape as the memory:
E= np.ones((4,3))

# Operation to obtain the memory at time t
# (`*` is the element-wise product, `@` the matrix product):
M= M1 * (E - w @ e.T) + w @ v.T

display(Markdown("$$\\b{M}_{t-1}="+_matrix_to_latex(M1)+"\\quad \\b{w}^\\r{w}_t="+_matrix_to_latex(w)+
                 "\\quad \\b{v}_t="+_matrix_to_latex(v)+"\\quad \\b{e}_t="+_matrix_to_latex(e)+
                 "\\quad \\b{E}_t="+_matrix_to_latex(E)+"$$"))
display(Markdown("New memory matrix:"))
# Show the updated memory announced by the previous line.
display(Markdown("$$\\b{M}_t="+_matrix_to_latex(M)+"$$"))


$$\b{M}_{t-1}=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ 0.2 & 0.6 & 1.2 \\ 0.0 & 0.0 & 0.0 \\ -0.1 & -0.05 & 0.0\end{bmatrix}\quad \b{w}^\r{w}_t=\begin{bmatrix}0 \\ 1 \\ 0 \\ 0\end{bmatrix}\quad \b{v}_t=\begin{bmatrix}-1.5 \\ -1.3 \\ -1.1\end{bmatrix}\quad \b{e}_t=\begin{bmatrix}1 \\ 1 \\ 1\end{bmatrix}\quad \b{E}_t=\begin{bmatrix}1.0 & 1.0 & 1.0 \\ 1.0 & 1.0 & 1.0 \\ 1.0 & 1.0 & 1.0 \\ 1.0 & 1.0 & 1.0\end{bmatrix}$$

New memory matrix:

$$\b{M}_t=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ -1.5 & -1.3 & -1.1 \\ 0.0 & 0.0 & 0.0 \\ -0.1 & -0.05 & 0.0\end{bmatrix}$$

Example of a non-dualistic write operation

The equation that determines the new content of the memory is:

$$ \b{M}_t = \b{M}_{t-1} \circ \left( \b{E} - \b{w}^\r{w}_t \b{e}^\top_t \right) + \b{w}^\r{w}_t \b{v}^\top_t $$
import numpy as np

# Memory at time t-1:
M1= np.array([[-0.5,  0.01, 3.1],
              [ 0.2,  0.6,  1.2],
              [ 0,    0,    0  ],
              [-0.1, -0.05, 0]])
# Write weighting (column vector); the write is shared by three locations:
w= np.array([0,0.8,0.1,0.1]).reshape(-1,1)
# Write vector:
v= np.array([-1.5,-1.3,-1.1]).reshape(-1,1)
# Erase vector (a different erase strength for each memory column):
e= np.array([1,0.5,0]).reshape(-1,1)
# Matrix of ones with the same shape as the memory:
E= np.ones((4,3))

# Operation to obtain the memory at time t
# (`*` is the element-wise product, `@` the matrix product):
M= M1 * (E - w @ e.T) + w @ v.T

display(Markdown("$$\\b{M}_{t-1}="+_matrix_to_latex(M1)+"\\quad \\b{w}^\\r{w}_t="+_matrix_to_latex(w)+
                 "\\quad \\b{v}_t="+_matrix_to_latex(v)+"\\quad \\b{e}_t="+_matrix_to_latex(e)+
                 "\\quad \\b{E}_t="+_matrix_to_latex(E)+"$$"))
display(Markdown("New memory matrix:"))
# Show the updated memory announced by the previous line.
display(Markdown("$$\\b{M}_t="+_matrix_to_latex(M)+"$$"))


$$\b{M}_{t-1}=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ 0.2 & 0.6 & 1.2 \\ 0.0 & 0.0 & 0.0 \\ -0.1 & -0.05 & 0.0\end{bmatrix}\quad \b{w}^\r{w}_t=\begin{bmatrix}0.0 \\ 0.8 \\ 0.1 \\ 0.1\end{bmatrix}\quad \b{v}_t=\begin{bmatrix}-1.5 \\ -1.3 \\ -1.1\end{bmatrix}\quad \b{e}_t=\begin{bmatrix}1.0 \\ 0.5 \\ 0.0\end{bmatrix}\quad \b{E}_t=\begin{bmatrix}1.0 & 1.0 & 1.0 \\ 1.0 & 1.0 & 1.0 \\ 1.0 & 1.0 & 1.0 \\ 1.0 & 1.0 & 1.0\end{bmatrix}$$

New memory matrix:

$$\b{M}_t=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ -1.16 & -0.68 & 0.32 \\ -0.15 & -0.13 & -0.11 \\ -0.24 & -0.1775 & -0.11\end{bmatrix}$$

Content-based addressing

The $i$-th element of $\c{C}$ is obtained as follows:

$$ \c{C}(\b{M},\b{k},\beta)[i] = \frac{ \r{exp}\left(\c{D}(\b{k},\b{M}[i,\cdot])\right)^\beta }{ \sum_{j=1}^N \r{exp}\left(\c{D}(\b{k},\b{M}[j,\cdot])\right)^\beta } $$

The cosine similarity $\c{D}$ is defined as:

$$ \c{D}(\b{u},\b{v}) = \frac{\b{u} \cdot \b{v}}{\lVert\b{u}\rVert \, \lVert\b{v}\rVert} $$
def cosine_sim(u,v):
    """Return the cosine similarity between the vectors u and v.

    This is the dot product of u and v divided by the product of their
    Euclidean norms. If either vector has zero norm the division is 0/0 and
    the result is not a finite number.
    """
    return np.dot(u,v) / (np.sqrt(np.dot(u,u)) * np.sqrt(np.dot(v,v)))

def softmax(x,beta):
    """Return the softmax of x scaled by beta, normalised along axis 0."""
    weights= np.exp(x*beta)
    total= np.sum(np.exp(x*beta), axis=0)
    return weights / total

# Memory:
M= np.array([[-0.5,  0.01, 3.1],
             [ 0.2,  0.6,  1.2]])
# Lookup vector (represented as a row vector):
k= np.array([0.3,0.5,1])
# Cosine similarities between the lookup vector and every memory row:
D= np.apply_along_axis(cosine_sim, 1, M, k)

display(Markdown("$$\\b{M}="+_matrix_to_latex(M)
                 +"\\quad \\b{k}="+_matrix_to_latex(k.reshape(-1,1))+"$$"))
display(Markdown("Cosine similarities:"))
for index, item in enumerate(D):
    # NOTE(review): the original loop body is missing from this file; the
    # label below (row index starting at one) is an assumption to confirm.
    display(Markdown("$$\\c{D}(\\b{k},\\b{M}["+str(index+1)+",\\cdot])="+str(item)+"$$"))

# Content-based weighting for two key strengths; the larger one
# concentrates the weighting on the most similar memory row:
for beta in (1, 10):
    C= softmax(D,beta)
    display(Markdown("Content-based weighting (with $\\beta$="+str(beta)+"):"))
    display(Markdown("$$\\quad\\c{C}(\\b{M},\\b{k},\\beta="+str(beta)+")="
                     +_matrix_to_latex(C.reshape(-1,1))+"$$"))


$$\b{M}=\begin{bmatrix}-0.5 & 0.01 & 3.1 \\ 0.2 & 0.6 & 1.2\end{bmatrix}\quad \b{k}=\begin{bmatrix}0.3 \\ 0.5 \\ 1.0\end{bmatrix}$$

Cosine similarities:


Content-based weighting (with $\beta$=1):

$$\quad\c{C}(\b{M},\b{k},\beta=1)=\begin{bmatrix}0.454987593255 \\ 0.545012406745\end{bmatrix}$$

Content-based weighting (with $\beta$=10):

$$\quad\c{C}(\b{M},\b{k},\beta=10)=\begin{bmatrix}0.141196927269 \\ 0.858803072731\end{bmatrix}$$

Computation of the allocation weighting vector

Generation of the tables included in the guide for the computation of $\b{a}_t$. Note that indexes start at one.

def _generate_allocation_weighting_vector (u):
    """Compute the allocation weighting for the usage vector u.

    For every location j the weighting is ``(1-u[j])`` times the product of
    the usages of all locations that precede j in the free list (the
    locations sorted by ascending usage).

    Parameters:
        u: sequence with the usage of every memory location.

    Returns:
        A tuple ``(a, s)`` where ``a`` is the list of allocation weightings
        (same order as ``u``) and ``s`` is the markup returned by
        ``_render_table`` with one row of intermediate values per location.
    """
    # Free list (with indexes starting at 0):
    phi0= sorted(range(len(u)), key=lambda k: u[k])
    # Position of every location inside the free list (avoids a linear
    # search per location):
    rank= {location: position for position, location in enumerate(phi0)}
    # prefix[g] is the product of the usages of the first g free-list
    # entries, multiplied in free-list order; prefix[0] is the empty product.
    prefix= [1]
    for location in phi0:
        prefix.append(prefix[-1]*u[location])

    a= [None]*len(u)

    # The table buffer has to be opened before rows are added or closed.
    _render_table(state="start")
    for j in range(0,len(phi0)):
        gamma= rank[j]
        prod= prefix[gamma]
        a[j]= (1-u[j])*prod
        # Indexes are shown starting at one.
        _render_table(j=j+1,u=u[j],gamma=gamma+1,prod=prod,a=a[j])

    s= _render_table(state="end")
    return (a,s)

from IPython.display import display, Markdown

# Usage vector:
u= [1,0,0.8,0.4]
(a,s)= _generate_allocation_weighting_vector (u)
display(Markdown("#### First example"))
display(Markdown("Usage vector and resulting allocation vector:"))
display(Markdown("$$\\b{u}_t="+_matrix_to_latex(np.array(u).reshape(-1,1))
                 +"\\quad \\b{a}_t="+_matrix_to_latex(np.array(a).reshape(-1,1))+"$$"))
display(Markdown("Table with intermediate computations:"))
# Show the table announced by the previous line.
display(Markdown(s))

# Another usage vector:
u= [0.4,0.6,0.2,0.5]
(a,s)= _generate_allocation_weighting_vector (u)
display(Markdown("#### Second example"))
display(Markdown("Another usage vector and resulting allocation vector:"))
display(Markdown("$$\\b{u}_t="+_matrix_to_latex(np.array(u).reshape(-1,1))
                 +"\\quad \\b{a}_t="+_matrix_to_latex(np.array(a).reshape(-1,1))+"$$"))
display(Markdown("Table with intermediate computations:"))
# Show the table announced by the previous line.
display(Markdown(s))

First example

Usage vector and resulting allocation vector:

$$\b{u}_t=\begin{bmatrix}1.0 \\ 0.0 \\ 0.8 \\ 0.4\end{bmatrix}\quad \b{a}_t=\begin{bmatrix}0.0 \\ 1.0 \\ 0.0 \\ 0.0\end{bmatrix}$$

Table with intermediate computations:

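| $j$ | $\b{u}_t[j]$ | $\gamma : \b{\phi}_t[\gamma] = j$ | $1-\b{u}_t[j]$ | $\prod_{i=1}^{\gamma-1} \b{u}_t[\b{\phi}_t[i]]$ | $\b{a}_t[j]$ |
|---|---|---|---|---|---|
| 1 | 1 | 4 | 0 | 0 | 0 |
| 2 | 0 | 1 | 1 | 1 | 1 |
| 3 | 0.8 | 3 | 0.2 | 0 | 0 |
| 4 | 0.4 | 2 | 0.6 | 0 | 0 |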

Second example

Another usage vector and resulting allocation vector:

$$\b{u}_t=\begin{bmatrix}0.4 \\ 0.6 \\ 0.2 \\ 0.5\end{bmatrix}\quad \b{a}_t=\begin{bmatrix}0.12 \\ 0.016 \\ 0.8 \\ 0.04\end{bmatrix}$$

Table with intermediate computations:

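| $j$ | $\b{u}_t[j]$ | $\gamma : \b{\phi}_t[\gamma] = j$ | $1-\b{u}_t[j]$ | $\prod_{i=1}^{\gamma-1} \b{u}_t[\b{\phi}_t[i]]$ | $\b{a}_t[j]$ |
|---|---|---|---|---|---|
| 1 | 0.4 | 2 | 0.6 | 0.2 | 0.12 |
| 2 | 0.6 | 4 | 0.4 | 0.04 | 0.016 |
| 3 | 0.2 | 1 | 0.8 | 1 | 0.8 |
| 4 | 0.5 | 3 | 0.5 | 0.08 | 0.04 |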
