Implementation and Simulation of an RNN

A character-level language model trained on the Tiny Shakespeare text

Step 1: Import Libraries & Load Dataset
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.grid": True,
    "lines.linewidth": 2,
    "font.size": 11
})

# Download Tiny Shakespeare dataset
!curl -L -o tiny_shakespeare.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open("tiny_shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()

print("Dataset loaded successfully")
Step 2: Dataset Preprocessing
n = len(text)

train_text = text[:int(0.9*n)]
val_text   = text[int(0.9*n):int(0.95*n)]
test_text  = text[int(0.95*n):]

print("Train:", len(train_text))
print("Val  :", len(val_text))
print("Test :", len(test_text))

# Create character vocabulary and mappings
chars = sorted(list(set(text)))
vocab_size = len(chars)

char2idx = {ch:i for i,ch in enumerate(chars)}
idx2char = {i:ch for i,ch in enumerate(chars)}

def text_to_ints(s):
    return np.array([char2idx[c] for c in s], dtype=np.int32)

train_ids = text_to_ints(train_text)
val_ids   = text_to_ints(val_text)
test_ids  = text_to_ints(test_text)

print("Vocabulary size:", vocab_size)
Step 3: Hyperparameters Setup

# Training hyperparameters
epochs = 15
learning_rate = 0.01

# Model and batching hyperparameters
embed_size = 128       # dimension of the character embeddings
batch_size = 64        # sequences processed in parallel
seq_length = 100       # characters per training subsequence
num_layers = 2         # stacked RNN layers
hidden_size = 256      # hidden state dimension

def get_batches(data, batch_size, seq_length):
    # Trim the data so it splits evenly into batch_size rows,
    # then reshape so each row is one long contiguous character stream.
    n_batches = len(data) // (batch_size * seq_length)
    data = data[:n_batches * batch_size * seq_length]

    x = data.reshape(batch_size, -1)
    # Targets are the inputs shifted left by one character.
    # np.roll wraps each row's first character around to its end, so the
    # final target in every row is off by one; with rows this long the
    # effect on training is negligible.
    y = np.roll(x, -1, axis=1)

    return x, y
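
A small shape check, assuming the splits created in Step 2, makes the batching scheme concrete: each of the batch_size rows is a contiguous stream of characters, and the targets are the same stream shifted by one position.

# Optional check of the batch layout
xb, yb = get_batches(train_ids, batch_size, seq_length)
print("x shape:", xb.shape)   # (batch_size, n_batches * seq_length)
print("y shape:", yb.shape)
print("first input chars :", "".join(idx2char[int(i)] for i in xb[0, :20]))
print("first target chars:", "".join(idx2char[int(i)] for i in yb[0, :20]))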
Step 4: Defining the RNN Model
class CharRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embed(x)
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        return out, hidden

model = CharRNN(vocab_size, embed_size, hidden_size, num_layers)
print(model)
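
To confirm the tensor shapes before training, this optional snippet pushes a small dummy batch through the untrained model. The logits should come out as (batch, seq, vocab_size) and the hidden state as (num_layers, batch, hidden_size).

# Optional shape check with a dummy batch
dummy = torch.randint(0, vocab_size, (4, 10))   # 4 sequences of 10 characters
logits, h = model(dummy, None)                  # hidden=None starts from zeros
print("logits:", logits.shape)                  # torch.Size([4, 10, vocab_size])
print("hidden:", h.shape)                       # torch.Size([num_layers, 4, hidden_size])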
Step 5: Define Training Function
def train_model(model, train_ids, val_ids):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_losses, val_losses = [], []

    for epoch in range(epochs):
        # -------- Training --------
        model.train()
        x, y = get_batches(train_ids, batch_size, seq_length)
        hidden = None
        total_loss = 0
        num_batches = 0

        for i in range(0, x.shape[1], seq_length):
            inputs = torch.tensor(
                x[:, i:i+seq_length], dtype=torch.long
            )
            targets = torch.tensor(
                y[:, i:i+seq_length], dtype=torch.long
            )

            optimizer.zero_grad()

            # Detach the hidden state so gradients stop at the chunk
            # boundary (truncated backpropagation through time)
            if hidden is not None:
                hidden = hidden.detach()

            outputs, hidden = model(inputs, hidden)

            loss = criterion(
                outputs.view(-1, vocab_size),
                targets.view(-1)
            )

            loss.backward()
            # Clip gradients to guard against exploding gradients in the RNN
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        train_losses.append(total_loss / num_batches)

        # -------- Validation --------
        model.eval()
        with torch.no_grad():
            vx, vy = get_batches(val_ids, batch_size, seq_length)
            vhidden = None
            vloss = 0
            v_batches = 0

            for i in range(0, vx.shape[1], seq_length):
                vin = torch.tensor(
                    vx[:, i:i+seq_length], dtype=torch.long
                )
                vtar = torch.tensor(
                    vy[:, i:i+seq_length], dtype=torch.long
                )

                if vhidden is not None:
                    vhidden = vhidden.detach()

                out, vhidden = model(vin, vhidden)
                vloss += criterion(
                    out.view(-1, vocab_size),
                    vtar.view(-1)
                ).item()
                v_batches += 1

        val_losses.append(vloss / v_batches)

        print(
            f"Epoch {epoch+1}/{epochs} | "
            f"Train Loss: {train_losses[-1]:.4f} | "
            f"Val Loss: {val_losses[-1]:.4f}"
        )

    return train_losses, val_losses
Step 6: Run Training
train_losses, val_losses = train_model(model, train_ids, val_ids)
Step 7: Plot Training Loss
epochs_range = range(1, epochs + 1)

plt.figure()
plt.plot(epochs_range, train_losses, label="Train Loss", marker="o")
plt.plot(epochs_range, val_losses, label="Validation Loss", marker="o")

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss")
plt.legend()
plt.show()
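
The test split created in Step 2 is not used above. The sketch below (the evaluate helper is an addition, not part of the original steps) reuses get_batches to report a held-out test loss and its perplexity, i.e. the exponential of the mean cross-entropy, mirroring the validation loop.

# Optional: evaluate on the held-out test split
def evaluate(model, ids):
    criterion = nn.CrossEntropyLoss()
    x, y = get_batches(ids, batch_size, seq_length)
    hidden = None
    total, batches = 0.0, 0

    model.eval()
    with torch.no_grad():
        for i in range(0, x.shape[1], seq_length):
            inputs  = torch.tensor(x[:, i:i+seq_length], dtype=torch.long)
            targets = torch.tensor(y[:, i:i+seq_length], dtype=torch.long)
            out, hidden = model(inputs, hidden)
            total += criterion(out.view(-1, vocab_size), targets.view(-1)).item()
            batches += 1

    return total / batches

test_loss = evaluate(model, test_ids)
print(f"Test loss: {test_loss:.4f} | Perplexity: {np.exp(test_loss):.2f}")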
Step 8: Text Generation
def generate_text(model, start_text, length=500):
    model.eval()
    hidden = None
    generated = start_text

    # No gradients are needed for generation
    with torch.no_grad():
        # Prime the RNN's hidden state with the starting text
        for ch in start_text[:-1]:
            x = torch.tensor([[char2idx[ch]]])
            _, hidden = model(x, hidden)

        # Start generation from the last character of the prompt
        inp = torch.tensor([[char2idx[start_text[-1]]]])

        for _ in range(length):
            out, hidden = model(inp, hidden)
            # Sample the next character from the predicted distribution
            probs = torch.softmax(out[:, -1, :], dim=-1)
            char_idx = torch.multinomial(probs, 1).item()

            generated += idx2char[char_idx]
            inp = torch.tensor([[char_idx]])

    return generated

print(generate_text(model, "ROMEO: HELLO! "))
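
Plain softmax sampling can produce fairly noisy text. A common variation, shown below as a sketch rather than part of the original model, is to divide the logits by a temperature before the softmax: values below 1.0 make the output more conservative, values above 1.0 more random.

# Optional: generation with a sampling temperature
def generate_with_temperature(model, start_text, length=500, temperature=0.8):
    model.eval()
    hidden = None
    generated = start_text

    with torch.no_grad():
        # Prime the hidden state with the prompt
        for ch in start_text[:-1]:
            _, hidden = model(torch.tensor([[char2idx[ch]]]), hidden)

        inp = torch.tensor([[char2idx[start_text[-1]]]])
        for _ in range(length):
            out, hidden = model(inp, hidden)
            # Scale the logits by the temperature before normalizing
            probs = torch.softmax(out[:, -1, :] / temperature, dim=-1)
            char_idx = torch.multinomial(probs, 1).item()
            generated += idx2char[char_idx]
            inp = torch.tensor([[char_idx]])

    return generated

print(generate_with_temperature(model, "ROMEO: ", temperature=0.7))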