Section 06

The code: scaled dot-product attention in NumPy

Attention Is All You Need 2017

6. The code — scaled dot-product attention in NumPy

🟡 First-year college. Basic Python and NumPy needed. Runs free on Google Colab — no GPU required.

We implement the complete scaled dot-product attention operation from scratch, then show a minimal multi-head attention wrapper.

import numpy as np

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Compute Attention(Q, K, V) = softmax(Q·Kᵀ / √dₖ) · V

    Works on a single sequence (2-D inputs) and on inputs with any number
    of leading batch/head dimensions, e.g. (batch, heads, seq_len, d_k).

    Q: (..., n_q, d_k)  — query matrix
    K: (..., n_k, d_k)  — key matrix
    V: (..., n_k, d_v)  — value matrix
    mask: optional, broadcastable to (..., n_q, n_k) — truthy = mask out
          (for decoder). Boolean and 0/1 integer arrays are both accepted.

    Returns (output, attn_weights):
        output:       (..., n_q, d_v)
        attn_weights: (..., n_q, n_k) — each row sums to 1
    """
    d_k = Q.shape[-1]                      # key dimension (e.g. 64)

    # Step 1: Q·Kᵀ — all pairwise dot products in one matrix multiply.
    # Swap only the last two axes of K: a plain K.T would reverse *every*
    # axis and break as soon as a batch or head dimension is present.
    scores = Q @ np.swapaxes(K, -1, -2)    # shape: (..., n_q, n_k)

    # Step 2: Scale to prevent softmax saturation
    scores = scores / np.sqrt(d_k)         # divide by √dₖ

    # Step 3: Apply mask (for decoder — prevent attending to future positions)
    if mask is not None:
        # Coerce to bool first: indexing with a 0/1 *integer* array would be
        # interpreted as row indices, not as a mask. np.where also lets one
        # (n_q, n_k) mask broadcast across all batch/head dimensions.
        blocked = np.asarray(mask, dtype=bool)
        scores = np.where(blocked, -1e9, scores)  # large negative → softmax ≈ 0

    # Step 4: Softmax row-wise → attention weights (each row sums to 1)
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing.
    scores_shifted = scores - scores.max(axis=-1, keepdims=True)
    exp_scores     = np.exp(scores_shifted)
    attn_weights   = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

    # Step 5: Weighted sum of values
    output = attn_weights @ V              # shape: (..., n_q, d_v)

    return output, attn_weights

# ── Demo: "The chai is hot" (4 tokens, d_model=8, d_k=4) ─────────────────────
np.random.seed(42)                         # fixed seed → same numbers on every run

words = ["The", "chai", "is", "hot"]
seq_len = 4                                # number of tokens
d_model = 8                                # embedding width
d_k = 4                                    # query/key width

# Stand-in for "word embeddings + positional encoding": plain random vectors
X = np.random.randn(seq_len, d_model)      # (4, 8)

# Projection matrices — learned in a real model, random in this demo.
# Drawn in the order W_Q, W_K, W_V; each is (8, 4), and d_v = d_k here.
W_Q, W_K, W_V = (np.random.randn(d_model, d_k) for _ in range(3))

# Project the same input three ways; each result is (4, 4)
Q, K, V = (X @ W for W in (W_Q, W_K, W_V))

output, attn = scaled_dot_product_attention(Q, K, V)

print("Attention weight matrix (rows = query word, cols = key word):")
print(np.array(words))
for word, weights in zip(words, attn):
    print(f"  {word:5s}: {np.round(weights, 3)}")
print(f"\nOutput shape: {output.shape}")   # should be (4, 4)

# ── Visualise ─────────────────────────────────────────────────────────────────
import matplotlib.pyplot as plt

# Take the grid size from the matrix itself rather than hard-coding 4, so the
# plot keeps working when the sentence (and therefore attn) changes size.
n_queries, n_keys = attn.shape

fig, ax = plt.subplots(figsize=(5, 4))
im = ax.imshow(attn, cmap="YlOrRd", vmin=0, vmax=1)   # weights lie in [0, 1]
ax.set_xticks(range(n_keys))
ax.set_xticklabels(words)                 # keys (columns)
ax.set_yticks(range(n_queries))
ax.set_yticklabels(words)                 # queries (rows)

# Write each weight inside its cell; imshow puts column j on x and row i on y
for i in range(n_queries):
    for j in range(n_keys):
        ax.text(j, i, f"{attn[i,j]:.2f}", ha="center", va="center", fontsize=9)

ax.set_title("Self-Attention Weights")
ax.set_xlabel("Key (what each word offers)")
ax.set_ylabel("Query (what each word looks for)")
plt.colorbar(im, ax=ax, shrink=0.8)
plt.tight_layout()
plt.savefig("transformer_attention.png", dpi=120)
plt.show()

What to look for in the output:

  • Each row of the attention matrix sums to 1.0 — it is a probability distribution over the 4 source positions
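  • In a trained model, “chai” and “hot” would typically attend to each other with high weight, since adjective-noun relationships are semantically close. In this demo the embeddings and projection matrices are random, so which words attend to which is arbitrary — look at the shape of the result, not its meaning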
  • The diagonal (each word attending to itself) is often non-trivial — a trained self-attention layer learns to “copy” a token's own representation when useful (here the diagonal values simply reflect the random weights)

Add a causal mask for the decoder:

# Causal mask: position i cannot attend to positions j > i
# Used in decoder self-attention during training
positions = np.arange(seq_len)
# Compare every key index j (columns) with every query index i (rows)
mask = positions[None, :] > positions[:, None]
# mask[i,j] = True when j > i → scores set to -1e9 → softmax ≈ 0

print("\nCausal mask (True = blocked):")
print(mask)

output_masked, attn_masked = scaled_dot_product_attention(Q, K, V, mask=mask)
print("\nAttention weights with causal mask:")
for word, weights in zip(words, attn_masked):
    print(f"  {word:5s}: {np.round(weights, 3)}")
# "chai" (pos 1) can only see "The" (pos 0) and itself — not "is" or "hot"

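Try this: Change d_k = 4 to d_k = 1, then to d_k = 64, re-running each time. The raw dot products Q·Kᵀ grow with d_k (their spread scales roughly as √d_k), but because the code divides by np.sqrt(d_k), the scaled scores stay in a similar range and the attention weights do not become systematically sharper as d_k grows. To see what the scaling buys you, temporarily remove the / np.sqrt(d_k) line and repeat the experiment: smaller d_k now gives lower scores and a flatter softmax, while d_k = 64 gives much larger scores and a far more peaked, saturated softmax. The code’s / np.sqrt(d_k) is what keeps them well-behaved.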