NanoMind/model.py at main · Pro1943/NanoMind

83 lines (70 loc) · 2.79 KB
import torch
import torch.nn as nn
from torch.nn import functional as F
from config import BLOCK_SIZE, N_EMBED, N_HEAD, N_LAYER, DROPOUT, DEVICE
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key   = nn.Linear(N_EMBED, head_size, bias=False)
        self.query = nn.Linear(N_EMBED, head_size, bias=False)
        self.value = nn.Linear(N_EMBED, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))
        self.dropout = nn.Dropout(DROPOUT)
    def forward(self, x):
        B, T, C = x.shape
        k   = self.key(x)
        q   = self.query(x)
        att = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
        att = att.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        return att @ self.value(x)
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads   = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj    = nn.Linear(N_EMBED, N_EMBED)
        self.dropout = nn.Dropout(DROPOUT)
    def forward(self, x):
        return self.dropout(self.proj(torch.cat([h(x) for h in self.heads], dim=-1)))
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(N_EMBED, 4 * N_EMBED),
            nn.GELU(),
            nn.Linear(4 * N_EMBED, N_EMBED),
            nn.Dropout(DROPOUT),
    def forward(self, x): return self.net(x)
class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.sa  = MultiHeadAttention(N_HEAD, N_EMBED // N_HEAD)
        self.ff  = FeedForward()
        self.ln1 = nn.LayerNorm(N_EMBED)
        self.ln2 = nn.LayerNorm(N_EMBED)
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x
class ChatGPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, N_EMBED)
        self.pos_embed   = nn.Embedding(BLOCK_SIZE, N_EMBED)
        self.blocks      = nn.Sequential(*[Block() for _ in range(N_LAYER)])
        self.ln_f        = nn.LayerNorm(N_EMBED)
        self.lm_head     = nn.Linear(N_EMBED, vocab_size)
        self.lm_head.weight = self.token_embed.weight
    def forward(self, idx, targets=None):
        B, T = idx.shape
        x = self.token_embed(idx) + self.pos_embed(torch.arange(T, device=DEVICE))
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.view(B*T, C), targets.view(B*T))
        return logits, loss
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

model.py

Latest commit

History

model.py

File metadata and controls