Below are complete Python implementations that run on Google Colab. We’ll implement RMSNorm and SwiGLU from scratch, show how to load and run LLaMA (or Mistral-7B as an alternative), and assemble a simplified LLaMA block.
Code 1: Implementing RMSNorm from Scratch
import torch
import torch.nn as nn
import math
# Implement RMSNorm (Root Mean Square Normalization)
class RMSNorm(nn.Module):
    """RMSNorm (Root Mean Square Normalization): simpler and faster than LayerNorm.

    Each vector along the last dimension is divided by its root mean square,
    ``sqrt(mean(x^2) + eps)``, and then multiplied element-wise by a learnable
    scale. Unlike LayerNorm, no mean is subtracted and there is no bias term.

    Args:
        dim: Size of the last dimension of the inputs (length of ``gamma``).
        eps: Small constant added under the square root for numerical stability.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.eps = eps  # Small constant for numerical stability
        self.gamma = nn.Parameter(torch.ones(dim))  # Learnable scale, initialised to 1

    def forward(self, x):
        """Normalize ``x`` over its last dimension and apply the learned scale.

        Args:
            x: Floating-point tensor whose last dimension has size ``dim``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Half-precision inputs are widened to float32 for the statistics:
        # in float16 the square overflows once |x| exceeds ~255 and the
        # default eps rounds to zero, which turns large or all-zero inputs
        # into 0s / NaNs. Other dtypes are used as-is.
        low_precision = x.dtype in (torch.float16, torch.bfloat16)
        x_wide = x.float() if low_precision else x
        # Compute RMS: sqrt(mean(x^2) + eps)
        rms = torch.sqrt(torch.mean(x_wide**2, dim=-1, keepdim=True) + self.eps)
        # Cast back first so the final multiply promotes exactly as it would
        # without the widening step.
        normalized = (x_wide / rms).to(x.dtype)
        return normalized * self.gamma
# Side-by-side comparison of RMSNorm and LayerNorm on one 4-dim vector
x = torch.tensor([[2.0, -1.0, 3.0, 0.0]])  # Test input
rms_norm = RMSNorm(4)
layer_norm = nn.LayerNorm(4)
print("Input:", x)
print("RMSNorm output:", rms_norm(x))
print("LayerNorm output:", layer_norm(x))
print("\nKey difference: RMSNorm is simpler (no mean subtraction)")
# Recompute the RMS by hand (without eps) as a sanity check on the module
rms_manual = x.pow(2).mean(dim=-1, keepdim=True).sqrt()
print(f"Manual RMS: {rms_manual.item():.4f}")
print("Expected: sqrt((4+1+9+0)/4) = sqrt(3.5) = 1.8708")
Expected Output:
Input: tensor([[2., -1., 3., 0.]])
RMSNorm output: tensor([[1.0690, -0.5345, 1.6035, 0.0000]], grad_fn=<MulBackward0>)
LayerNorm output: tensor([[ 0.6325, -1.2649, 1.2649, -0.6325]], grad_fn=<NativeLayerNormBackward0>)
Key difference: RMSNorm is simpler (no mean subtraction)
Manual RMS: 1.8708
Expected: sqrt((4+1+9+0)/4) = sqrt(3.5) = 1.8708
Code 2: Implementing SwiGLU
import torch
import torch.nn as nn
class SwiGLU(nn.Module):
    """SwiGLU: Swish-gated Linear Unit.

    A single linear layer produces two ``output_dim``-wide halves. The first
    half is passed through Swish/SiLU (``a * sigmoid(a)``) and multiplied
    element-wise by the second half.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Size of the last dimension of the output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # One fused projection, 2x wide because it feeds both branches
        self.proj = nn.Linear(input_dim, 2 * output_dim)
        self.output_dim = output_dim

    def forward(self, x):
        """Apply the gated unit to ``x``.

        Args:
            x: Tensor of shape ``(..., input_dim)``.

        Returns:
            Tensor of shape ``(..., output_dim)``.
        """
        # Split the (..., 2*output_dim) projection into its two halves:
        # the first is activated, the second stays linear.
        activated, linear = self.proj(x).chunk(2, dim=-1)
        # SiLU is PyTorch's built-in Swish: a * sigmoid(a)
        return nn.functional.silu(activated) * linear
# Smoke-test SwiGLU: map a single 4-dim vector to 8 dims
x = torch.tensor([[1.5, -0.5, 2.0, 0.3]])
swiglu = SwiGLU(input_dim=4, output_dim=8)
output = swiglu(x)
# Report the shapes first, then the values
for label, value in (
    ("Input shape:", x.shape),
    ("Output shape:", output.shape),
    ("SwiGLU output:", output),
):
    print(label, value)
print("\nKey: SwiGLU uses gating (element-wise multiply) for selectivity")
Expected Output:
Input shape: torch.Size([1, 4])
Output shape: torch.Size([1, 8])
SwiGLU output: tensor([...], grad_fn=<MulBackward0>)
Key: SwiGLU uses gating (element-wise multiply) for selectivity
Code 3: Loading and Running LLaMA
# Install dependencies (first time only). 8-bit loading additionally needs
# the accelerate and bitsandbytes packages.
# !pip install transformers torch accelerate bitsandbytes
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

print("Loading LLaMA-2 tokenizer...")
# LLaMA-2 7B is the smallest LLaMA-2 checkpoint on Hugging Face; loading it
# with 8-bit weights keeps it within a Colab GPU's memory.
model_name = "meta-llama/Llama-2-7b-hf"
# Note: You may need to accept the license on Hugging Face first
# Go to: https://huggingface.co/meta-llama/Llama-2-7b-hf
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("✓ Tokenizer loaded")
    # Load model in 8-bit quantization to save memory. The quantization
    # settings go through BitsAndBytesConfig; passing load_in_8bit directly
    # to from_pretrained is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
    )
    print("✓ Model loaded (8-bit quantized)")
    # Generate text
    prompt = "The most important breakthrough in AI is"
    # The tokenizer returns CPU tensors; move them to wherever device_map
    # placed the model so generate() does not hit a device mismatch.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs.input_ids
    print(f"\nPrompt: {prompt}")
    print("Generating...")
    outputs = model.generate(
        input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,  # counts prompt tokens as well as generated ones
        temperature=0.7,
        do_sample=True,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Output: {generated_text}")
except ImportError as e:
    # Missing accelerate/bitsandbytes is a setup problem, not a license one
    print(f"Note: {e}")
    print("\n8-bit loading needs extra packages: pip install accelerate bitsandbytes")
except Exception as e:
    # Notebook-level boundary: report the problem instead of crashing the cell
    print(f"Note: {e}")
    print("\nLLaMA requires accepting the license on Hugging Face Hub first.")
    print("Alternative: Use an Apache-2.0 licensed model like Mistral-7B (Paper 18)")
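Code 4: Using Mistral-7B (Openly Licensed Alternative)
If LLaMA licensing is an issue, Mistral-7B is released under the permissive Apache 2.0 license. It is about the same size as LLaMA-2 7B, so it runs on Colab with the same 8-bit quantization: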
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

print("Loading Mistral-7B tokenizer...")
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 8-bit weights (needs the accelerate and bitsandbytes packages). The
# quantization settings go through BitsAndBytesConfig; passing load_in_8bit
# directly to from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
)
print("✓ Model loaded")
# Generate text
prompt = "Let's solve a math problem: "
# The tokenizer returns CPU tensors; move them to wherever device_map placed
# the model so generate() does not hit a device mismatch.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs.input_ids
outputs = model.generate(
    input_ids,
    attention_mask=inputs.attention_mask,
    max_length=150,  # counts prompt tokens as well as generated ones
    do_sample=True,  # without this, temperature and top_p are ignored (greedy decoding)
    temperature=0.7,
    top_p=0.9,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated: {generated_text}")
Code 5: Understanding LLaMA’s Architecture
import torch
import torch.nn as nn
# Simplified LLaMA block (what happens inside each transformer layer)
class SimpleLLaMABlock(nn.Module):
    """Simplified LLaMA-style transformer block (what happens inside each layer).

    Structure: pre-norm self-attention with a residual connection, followed by
    a pre-norm SwiGLU feed-forward with a residual connection. Uses the
    RMSNorm and SwiGLU classes defined earlier in this file. Attention is
    plain ``nn.MultiheadAttention`` called without an attention mask.

    Args:
        dim: Model width (size of the last dimension of the input).
        num_heads: Number of attention heads.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        self.rms_norm1 = RMSNorm(dim)  # Pre-norm before attention
        self.attention = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.rms_norm2 = RMSNorm(dim)  # Pre-norm before the feed-forward
        self.swiglu = SwiGLU(dim, dim * 4)  # FFN with SwiGLU, expands to 4*dim
        # SwiGLU returns 4*dim features, so project back down to dim;
        # otherwise the residual add in forward() fails on mismatched shapes.
        self.down_proj = nn.Linear(dim * 4, dim)

    def forward(self, x):
        """Run the block on ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, dim)`` (attention is batch-first).

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Pre-norm + Attention + Residual
        attn_in = self.rms_norm1(x)
        attn_out, _ = self.attention(attn_in, attn_in, attn_in)
        x = x + attn_out
        # Pre-norm + SwiGLU FFN (expand, gate, project back) + Residual
        ffn_in = self.rms_norm2(x)
        x = x + self.down_proj(self.swiglu(ffn_in))
        return x
# Smoke-test the block on random data
dim, num_heads = 64, 8
block = SimpleLLaMABlock(dim, num_heads)
x = torch.randn(1, 10, dim)  # Batch: 1, Seq: 10, Dim: 64
output = block(x)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print("✓ LLaMA block processed successfully")
How to Run on Google Colab
- Go to Google Colab
- Create a new notebook
- Copy each code block above into separate cells
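- Run cells in order (Code 1 → Code 2 → Code 3 or 4 → Code 5). Code 5 reuses the RMSNorm and SwiGLU classes from Codes 1 and 2, so those cells must be run first.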
- For Code 3 (LLaMA), you’ll need to:
- Accept the license at https://huggingface.co/meta-llama/Llama-2-7b-hf
- Authenticate with:
!huggingface-cli login
(the leading ! runs the command as a shell command from a Colab cell)
Colab memory tips:
- Use 8-bit quantization (shown in code; requires the accelerate and bitsandbytes packages) to fit models in Colab’s 16GB GPU
- For full precision, you may need a Colab Pro account
- Mistral-7B (Code 4) is about the same size as LLaMA-2 7B, so it needs the same 8-bit quantization; its advantage is the permissive Apache 2.0 license
Key Observations from the Code
- RMSNorm is simpler: No mean subtraction, just RMS + scale
- SwiGLU uses gating: Element-wise multiply allows selective feature use
- LLaMA layers stack: Many blocks of Attention + SwiGLU FFN
- Pre-normalization: RMSNorm before operations, not after (more stable)
- Residual connections: Every layer preserves the original signal via x = x + output
These architectural choices, combined with training on more tokens, helped LLaMA be far more parameter-efficient than GPT-3: LLaMA-13B outperforms the much larger GPT-3 (175B) on most benchmarks.