diff --git a/labml_nn/transformers/fast_weights/__init__.py b/labml_nn/transformers/fast_weights/__init__.py
index 195358c3..68d6411d 100644
--- a/labml_nn/transformers/fast_weights/__init__.py
+++ b/labml_nn/transformers/fast_weights/__init__.py
@@ -19,125 +19,126 @@

## Fast weights

Consider a sequence of inputs $\big\{x^{(i)}\big\}^L_{i=1}$ of length $L$
where each step is a vector of size $d_{in}$; i.e. $x \in \mathbb{R}^{d_{in}}$.
The fast weight model generates a weight matrix at each step to produce output
$\big\{y^{(i)}\big\}^L_{i=1}$, $y \in \mathbb{R}^{d_{out}}$:

\begin{align}
a^{(i)}, b^{(i)} &= \textcolor{orange}{W_a} x^{(i)}, \textcolor{orange}{W_b} x^{(i)} \\
\textcolor{cyan}{W^{(i)}} &= \sigma \Big( \textcolor{cyan}{W^{(i-1)}} + a^{(i)} \otimes b^{(i)} \Big) \\
y^{(i)} &= \textcolor{cyan}{W^{(i)}} x^{(i)}
\end{align}

$\otimes$ is the outer product ($a \otimes b = a b^\top$), where elements of the two vectors are multiplied with each other
to give a matrix.
$\sigma$ is an activation function.
$\textcolor{orange}{W_a}$ and $\textcolor{orange}{W_b}$ are trainable weights (parameters).
$\textcolor{cyan}{W^{(i)}}$ are the fast weights that are generated at each step.
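
Here is a minimal sketch of this recurrence in PyTorch (an illustration with
made-up sizes; taking $\sigma = \tanh$ is an assumption, since the activation
is left unspecified above):

```python
import torch

d_in, d_out, L = 4, 4, 10
W_a = torch.randn(d_out, d_in)  # slow (trainable) weights that generate the update
W_b = torch.randn(d_in, d_in)
W = torch.zeros(d_out, d_in)    # fast weights, rewritten at every step
ys = []
for x in torch.randn(L, d_in):
    a, b = W_a @ x, W_b @ x                # a, b from the current input
    W = torch.tanh(W + torch.outer(a, b))  # W = σ(W + a ⊗ b)
    ys.append(W @ x)                       # y = W x
ys = torch.stack(ys)
```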

## Linear self-attention

Original transformer self-attention is (omitting $\frac{1}{\sqrt{d_k}}$ for clarity):

\begin{align}
y^{(i)} &= \Big[v^{(1)}, v^{(2)}, ..., v^{(i)}\Big] \text{softmax}
 \bigg(
    \Big[k^{(1)}, k^{(2)}, ..., k^{(i)}\Big] ^\top
    q^{(i)}
 \bigg) \\
 &= \sum^i_{j=1} \frac
 { v^{(j)} \kappa(k^{(j)}, q^{(i)}) }
 { \sum^i_{j'=1} \kappa(k^{(j')}, q^{(i)}) }
\end{align}

where $\kappa(k, q) = \text{exp}(k \cdot q)$.

The idea behind linearizing self-attention is to replace the softmax
kernel $\kappa$ with a different kernel $\kappa '$ so that we can calculate the
denominator of the self-attention function faster:

$$\kappa '(k, q) = \textcolor{lightgreen}{\phi(k)}^\top \textcolor{lightgreen}{\phi(q)}$$

This gives

\begin{align}
y^{(i)} &= \frac
 {\Big( \sum^i_{j=1} v^{(j)} \otimes \textcolor{lightgreen}{\phi(k^{(j)})} \Big)
  \textcolor{lightgreen}{\phi(q^{(i)})} }
 { \Big( \sum^i_{j'=1}
   \textcolor{lightgreen}{\phi(k^{(j')})} \Big)
   \cdot
   \textcolor{lightgreen}{\phi(q^{(i)})} }
\end{align}

With $\textcolor{cyan}{W^{(i)}} = \sum^i_{j=1} v^{(j)} \otimes \textcolor{lightgreen}{\phi(k^{(j)})}$ and
$z^{(i)} = \sum^i_{j=1} \textcolor{lightgreen}{\phi(k^{(j)})}$, we can calculate them efficiently:

\begin{align}
\textcolor{cyan}{W^{(i)}} &= \textcolor{cyan}{W^{(i-1)}} + v^{(i)} \otimes \textcolor{lightgreen}{\phi(k^{(i)})} \\
z^{(i)} &= z^{(i-1)} + \textcolor{lightgreen}{\phi(k^{(i)})} \\
y^{(i)} &= \frac{1}{z^{(i)} \cdot \textcolor{lightgreen}{\phi(q^{(i)})}}
    \textcolor{cyan}{W^{(i)}} \textcolor{lightgreen}{\phi(q^{(i)})}
\end{align}

This is quite similar to fast weights.
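
A rough sketch of this recurrence (assuming the simple feature map
$\phi(x) = \text{elu}(x) + 1$ of Katharopoulos et al., which is *not* the
projection this paper proposes — that is the DPFP module below):

```python
import torch

def phi(x: torch.Tensor) -> torch.Tensor:
    # A common linear-attention feature map; the paper replaces this with DPFP
    return torch.nn.functional.elu(x) + 1.

d_k, d_v, L = 8, 8, 10
q, k, v = torch.randn(L, d_k), torch.randn(L, d_k), torch.randn(L, d_v)
W = torch.zeros(d_v, d_k)  # running sum of v ⊗ φ(k)
z = torch.zeros(d_k)       # running sum of φ(k)
ys = []
for i in range(L):
    W = W + torch.outer(v[i], phi(k[i]))          # W = W + v ⊗ φ(k)
    z = z + phi(k[i])                             # z = z + φ(k)
    ys.append((W @ phi(q[i])) / (z @ phi(q[i])))  # y = (W φ(q)) / (z · φ(q))
ys = torch.stack(ys)
```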

The paper introduces a new linear attention projection function $\textcolor{lightgreen}{\phi}$,
a new update rule for $\textcolor{cyan}{W^{(i)}} = f(\textcolor{cyan}{W^{(i-1)}})$, and changes the normalization
$\frac{1}{z^{(i)} \cdot \textcolor{lightgreen}{\phi(q^{(i)})}}$.

Here are [the training code](experiment.html) and a notebook for training a fast weights
 transformer on the Tiny Shakespeare dataset.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/fast_weights/experiment.ipynb)
"""

import torch
from torch import nn

from labml_helpers.module import Module
from labml_nn.transformers.feed_forward import FeedForward
from labml_nn.transformers.mha import PrepareForMultiHeadAttention
from labml_nn.utils import clone_module_list


class DPFP(Module):
    """
    ## Deterministic Parameter Free Projection (DPFP)

    This is the new projection function $\textcolor{lightgreen}{\phi}$ introduced in the paper.
    DPFP projects $k$ of dimensionality $d_{key}$ to dimensionality $d_{dot} = 2 d_{key} \nu$,
    where $\nu \in \\{1, 2, ..., 2 d_{key} - 1 \\}$ is a hyper-parameter.

    $$\textcolor{lightgreen}{\phi_{2 d_{key} (i - 1) + j}(k)}
     = \text{ReLU}\Big(\big[k, -k\big]\Big)_{j}
                        \text{ReLU}\Big(\big[k, -k\big]\Big)_{i + j}$$

    where $\big[k, -k\big]$ is the concatenation of $k$ and $-k$ to give a vector of
    size $2 d_{key}$, $i \in \\{1, 2, ..., \nu \\}$, and $j \in \\{1, 2, ..., 2 d_{key}\\}$.
    $x_i$ is the $i$-th element of vector $x$ and is rolled around if
    $i$ is larger than the number of elements in $x$.

    Basically, it creates a new vector by multiplying elements of $[k, -k]$ shifted by $i$.

    This produces projections that are sparse (only a few elements of $\phi$ are non-zero) and
    orthogonal ($\textcolor{lightgreen}{\phi(k^{(i)})} \cdot \textcolor{lightgreen}{\phi(k^{(j)})}
     \approx 0$ for most $i, j$,
    unless $k^{(i)}$ and $k^{(j)}$ are very similar).
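
    For example, take $d_{key} = 2$, $\nu = 1$, and a made-up key $k = (1, -2)$
    (an illustrative calculation, not from the paper):

    \begin{align}
    \big[k, -k\big] &= (1, -2, -1, 2) \\
    \text{ReLU}\Big(\big[k, -k\big]\Big) &= (1, 0, 0, 2) \\
    \textcolor{lightgreen}{\phi(k)} &= (1, 0, 0, 2) \odot (2, 1, 0, 0) = (2, 0, 0, 0)
    \end{align}

    where the second factor is the ReLU output rolled around by $i = 1$,
    and only one of the four elements ends up non-zero.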
    ### Normalization

    The paper introduces a simple normalization for $\textcolor{lightgreen}{\phi}$,

    $$\textcolor{lightgreen}{\phi '(k)} =
     \frac{\textcolor{lightgreen}{\phi(k)}}{\sum^{d_{dot}}_{j=1} \textcolor{lightgreen}{\phi(k)_j}}$$

    *Check the paper for derivation.*
    """

    def __init__(self, nu: int = 1, eps: float = 1e-6):
        """
        * `nu` is the hyper-parameter $\nu$.
        * `eps` is the small value used to make sure there is no division-by-zero when normalizing.
        """
        super().__init__()
        self.nu = nu
        self.relu = nn.ReLU()
        self.eps = eps

    def forward(self, k: torch.Tensor):
        # Get $\textcolor{lightgreen}{\phi(k)}$
        k = self.dpfp(k)
        # Normalize by $\sum^{d_{dot}}_{j=1} \textcolor{lightgreen}{\phi(k)_j}$
        return k / (torch.sum(k, dim=-1, keepdim=True) + self.eps)

    def dpfp(self, k: torch.Tensor):
        """
        $$\textcolor{lightgreen}{\phi(k)}$$
        """
        # $x = \text{ReLU}\Big(\big[k, -k\big]\Big)$
        x = self.relu(torch.cat([k, -k], dim=-1))
        # Shift and roll by $i \in \\{1, 2, ..., \nu \\}$,
        # to get $$x'_{i,j} = \text{ReLU}\Big(\big[k, -k\big]\Big)_{i+j}$$
        x_rolled = [x.roll(shifts=i, dims=-1) for i in range(1, self.nu + 1)]
        # Concatenate to get
        # $$x'_{2 d_{key} (i - 1) + j} = \text{ReLU}\Big(\big[k, -k\big]\Big)_{i+j}$$
        x_rolled = torch.cat(x_rolled, dim=-1)
        # Concatenate copies of $x$
        x_repeat = torch.cat([x] * self.nu, dim=-1)

        # Multiply them,
        # $$\textcolor{lightgreen}{\phi_{2 d_{key} (i - 1) + j}(k)}
        # = \text{ReLU}\Big(\big[k, -k\big]\Big)_{j}
        # \text{ReLU}\Big(\big[k, -k\big]\Big)_{i + j}$$
        return x_repeat * x_rolled
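

# A quick shape check for `DPFP` (an illustrative sketch with arbitrary sizes,
# not part of the paper's code). A key of size `d_key` maps to size
# `2 * d_key * nu`, and the normalized projection sums to (approximately) $1$
# along the last dimension.
def _dpfp_example():
    phi = DPFP(nu=3)
    k = torch.randn(8, 16)  # a batch of 8 keys with `d_key = 16`
    phi_k = phi(k)
    assert phi_k.shape == (8, 2 * 16 * 3)
    assert torch.allclose(phi_k.sum(dim=-1), torch.ones(8))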


class FastWeightsAttention(Module):
    """
    ## Fast Weights Attention

    The paper introduces a new update rule for calculating $\textcolor{cyan}{W^{(i)}}$.
    The model first retrieves the current value
    $\bar{v}^{(i)}$ paired with the key $k^{(i)}$.
    It then stores a combination $v^{(i)}_{new}$
    of the retrieved value $\bar{v}^{(i)}$ and the input $v^{(i)}$,
    removing the old value $\bar{v}^{(i)}$ before writing the new one.

    \begin{align}
    k^{(i)}, v^{(i)}, q^{(i)} &=
        \textcolor{orange}{W_k} x^{(i)}, \textcolor{orange}{W_v} x^{(i)}, \textcolor{orange}{W_q} x^{(i)} \\
    \bar{v}^{(i)} &= \textcolor{cyan}{W^{(i-1)}} \textcolor{lightgreen}{\phi'(k^{(i)})} \\
    \beta^{(i)} &= \sigma \Big(\textcolor{orange}{W_\beta} x^{(i)} \Big) \\
    v^{(i)}_{new} &= \beta^{(i)} v^{(i)} + \Big(1 - \beta^{(i)} \Big) \bar{v}^{(i)} \\
    \textcolor{cyan}{W^{(i)}}
        &= \textcolor{cyan}{W^{(i-1)}}
         - \bar{v}^{(i)} \otimes \textcolor{lightgreen}{\phi'(k^{(i)})}
         + v^{(i)}_{new} \otimes \textcolor{lightgreen}{\phi'(k^{(i)})} \\
        &= \textcolor{cyan}{W^{(i-1)}} +
           \beta^{(i)} \Big( v^{(i)} - \bar{v}^{(i)} \Big) \otimes \textcolor{lightgreen}{\phi'(k^{(i)})} \\
    y^{(i)} &= \textcolor{cyan}{W^{(i)}} \textcolor{lightgreen}{\phi'(q^{(i)})}
    \end{align}

    where $\textcolor{orange}{W_\beta}$ is a trainable parameter and $\sigma$ is the sigmoid function.

    Note that we don't need the normalization term $z$ because $\textcolor{lightgreen}{\phi'}$ is normalized.
    """

    def __init__(self, heads: int, d_model: int, dropout_prob: float, phi: DPFP):
        super().__init__()

        # Number of features per head $d_k$
        self.d_k = d_model // heads
        # Number of heads
        self.heads = heads

        # These transform the `query`, `key` and `value` vectors for multi-headed attention.
        self.query = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
        self.key = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)
        self.value = PrepareForMultiHeadAttention(d_model, heads, self.d_k, bias=False)

        # Interpolation weight function $\sigma \Big(\textcolor{orange}{W_\beta} x^{(i)} \Big)$ for each head
        self.interpolation_weight = nn.Sequential(
            PrepareForMultiHeadAttention(d_model, heads, 1, bias=False),
            nn.Sigmoid()
        )

        # $\textcolor{lightgreen}{\phi'}$
        self.phi = phi

        # Output layer
        self.output = nn.Linear(d_model, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x: torch.Tensor):
        # Get the number of steps $L$
        seq_len = x.shape[0]
        # $\textcolor{lightgreen}{\phi'(q^{(i)})}$ for all steps and heads
        query = self.phi(self.query(x))
        # $\textcolor{lightgreen}{\phi'(k^{(i)})}$ for all steps and heads
        key = self.phi(self.key(x))
        # $v^{(i)}$ for all steps and heads
        value = self.value(x)
        # $\beta^{(i)}$ for all steps and heads
        beta = self.interpolation_weight(x)

        # $\textcolor{cyan}{W^{(0)}}$
        weights = key.new_zeros((key.shape[1], key.shape[2], value.shape[3], key.shape[3]))
        # List to store outputs $y^{(i)}$
        outputs = []

        # Iterate through the steps
        for i in range(seq_len):
            # $$\bar{v}^{(i)} = \textcolor{cyan}{W^{(i-1)}} \textcolor{lightgreen}{\phi'(k^{(i)})}$$
            value_existing = torch.einsum('bhvk,bhk->bhv', weights, key[i])

            # $$\textcolor{cyan}{W^{(i)}}
            # = \textcolor{cyan}{W^{(i-1)}} +
            # \beta^{(i)} \Big( v^{(i)} - \bar{v}^{(i)} \Big) \otimes \textcolor{lightgreen}{\phi'(k^{(i)})}$$
            weights = weights + torch.einsum('bhv,bhk->bhvk', beta[i] * (value[i] - value_existing), key[i])

            # $$y^{(i)} = \textcolor{cyan}{W^{(i)}} \textcolor{lightgreen}{\phi'(q^{(i)})}$$
            y = torch.einsum('bhvk,bhk->bhv', weights, query[i])

            # Merge multiple heads and append to `outputs`
            outputs.append(y.reshape(y.shape[0], -1))

        # Stack outputs at each step into a single tensor
        x = torch.stack(outputs)

        # Output layer
        return self.output(x)
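

# A minimal usage sketch for `FastWeightsAttention` (sizes and hyper-parameters
# below are assumptions for illustration). The input and the output are both of
# shape `[seq_len, batch_size, d_model]`.
def _fast_weights_attention_example():
    attn = FastWeightsAttention(heads=4, d_model=128, dropout_prob=0.1, phi=DPFP(nu=1))
    x = torch.randn(10, 2, 128)  # 10 steps, batch of 2
    assert attn(x).shape == x.shape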


class FastWeightsAttentionTransformerLayer(Module):
    """
    This is a general transformer layer that combines self-attention and a feed-forward network.
    """
    def __init__(self, *,
                 d_model: int,
                 attn: FastWeightsAttention,
                 feed_forward: FeedForward,
                 dropout_prob: float):
        super().__init__()
        # Transformer size $d_{model}$
        self.size = d_model
        # Fast weights attention module
        self.attn = attn
        # Feed-forward network
        self.feed_forward = feed_forward
        # Dropout layer
        self.dropout = nn.Dropout(dropout_prob)

        # Normalization layers
        self.norm_self_attn = nn.LayerNorm([d_model])
        self.norm_ff = nn.LayerNorm([d_model])

    def forward(self, x: torch.Tensor):
        # Calculate fast weights self attention
        attn = self.attn(x)
        # Add the self attention results
        x = x + self.dropout(attn)

        # Normalize for feed-forward
        z = self.norm_ff(x)
        # Pass through the feed-forward network
        ff = self.feed_forward(z)
        # Add the feed-forward results back
        x = x + self.dropout(ff)

        #
        return x


class FastWeightsAttentionTransformer(Module):
    """
    This is a general transformer module with multiple transformer layers
    """
    def __init__(self, layer: FastWeightsAttentionTransformerLayer, n_layers: int):
        super().__init__()
        # Make copies of the transformer layer
        self.layers = clone_module_list(layer, n_layers)
        # Final normalization layer
        self.norm = nn.LayerNorm([layer.size])

    def forward(self, x: torch.Tensor):
        for i, layer in enumerate(self.layers):
            # Get layer output
            x = layer(x)

        # Normalize the output
        return self.norm(x)
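

# Putting the pieces together: a sketch of building the full transformer.
# The sizes and the `FeedForward` arguments here are illustrative assumptions,
# not values from the paper; see [the training code](experiment.html) for a
# real configuration.
def _transformer_example():
    d_model = 128
    layer = FastWeightsAttentionTransformerLayer(
        d_model=d_model,
        attn=FastWeightsAttention(heads=4, d_model=d_model, dropout_prob=0.1, phi=DPFP(nu=1)),
        feed_forward=FeedForward(d_model, d_ff=4 * d_model, dropout=0.1),
        dropout_prob=0.1)
    model = FastWeightsAttentionTransformer(layer, n_layers=2)
    x = torch.randn(10, 2, d_model)  # `[seq_len, batch_size, d_model]`
    assert model(x).shape == x.shape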