diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 43972549203c..a46a3238d040 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -1,24 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "01_NeMo_Models.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ASnx4b5jXsil" }, + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -45,9 +33,7 @@ "\n", "## Grab the config we'll use in this example\n", "!mkdir configs" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -174,17 +160,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "piLOgwOPX1FS" }, + "outputs": [], "source": [ "import torch\n", "import nemo\n", "from nemo.core import NeuralModule\n", "from nemo.core import typecheck" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -208,29 +194,29 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bseLiNoqqQrE" }, + "outputs": [], "source": [ "class MyEmptyModule(NeuralModule):\n", "\n", " def forward(self):\n", " print(\"Neural Module ~ hello world!\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "j4Q36L5urdOQ" }, + "outputs": [], "source": [ "x = MyEmptyModule()\n", "x()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -261,33 +247,33 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZvC57bbxwXxN" }, + "outputs": [], "source": [ "# Case 1:\n", "embedding = torch.nn.Embedding(num_embeddings=10, embedding_dim=30)\n", "x = torch.randint(high=10, size=(1, 5))\n", "print(\"x :\", x)\n", "print(\"embedding(x) :\", embedding(x).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "sMaqhMBgxe2C" }, + "outputs": [], "source": [ "# Case 2\n", "lstm = torch.nn.LSTM(1, 30, batch_first=True)\n", "x = torch.randn(1, 5, 1)\n", "print(\"x :\", x)\n", "print(\"lstm(x) :\", lstm(x)[0].shape) # Let's take all timestep outputs of the LSTM" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -340,21 +326,23 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "yp0FG8NJt1Jd" }, + "outputs": [], "source": [ "from nemo.core.neural_types import NeuralType\n", "from nemo.core.neural_types import *" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "3tsgs8Fp0-WV" }, + "outputs": [], "source": [ "class EmbeddingModule(NeuralModule):\n", " def __init__(self):\n", @@ -376,9 +364,7 @@ " return {\n", " 'y': NeuralType(axes=('B', 'T', 'C'), elements_type=EmbeddedTextType())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -442,14 +428,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "boxxMniv27vi" }, + "outputs": [], "source": [ "embedding_module = EmbeddingModule()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -462,9 +448,11 @@ }, { "cell_type": "code", + 
"execution_count": null, "metadata": { "id": "SZZOOoCJ2-iV" }, + "outputs": [], "source": [ "class LSTMModule(NeuralModule):\n", " def __init__(self):\n", @@ -486,9 +474,7 @@ " return {\n", " 'y': NeuralType(axes=('B', 'T', 'C'), elements_type=EncodedRepresentation())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -506,14 +492,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6LlOJf0C8GN4" }, + "outputs": [], "source": [ "lstm_module = LSTMModule()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -527,17 +513,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "giLJlub78-Ja" }, + "outputs": [], "source": [ "# Case 1 [ERROR CELL]\n", "x1 = torch.randint(high=10, size=(1, 5))\n", "print(\"x :\", x1)\n", "print(\"embedding(x) :\", embedding_module(x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -553,16 +539,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2KUj_p6M9L-f" }, + "outputs": [], "source": [ "# Case 1\n", "print(\"x :\", x1)\n", "print(\"embedding(x) :\", embedding_module(x=x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -575,17 +561,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "FMu3B0-9-CqE" }, + "outputs": [], "source": [ "# Case 2 [ERROR CELL]\n", "x2 = torch.randn(1, 5, 1) # Input = [B=1, T=5, C=1]\n", "print(\"x :\", x2)\n", "print(\"lstm(x) :\", lstm_module(x=x2)[0].shape) # Let's take all timestep outputs of the LSTM" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -611,9 +597,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "q2u-keAM-d-B" }, + "outputs": [], "source": [ "class CorrectLSTMModule(LSTMModule): # Let's inherit the wrong class to make it easy to override\n", " @property\n", @@ -622,9 +610,7 @@ " 'y': NeuralType(axes=('B', 'T', 'C'), elements_type=EncodedRepresentation()),\n", " 'h_c': [NeuralType(axes=('D', 'B', 'C'), elements_type=EncodedRepresentation())],\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -641,20 +627,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "GyPZH-fz_dG4" }, + "outputs": [], "source": [ "lstm_module = CorrectLSTMModule()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9whH50PE_Xyx" }, + "outputs": [], "source": [ "# Case 2\n", "x2 = torch.randn(1, 5, 1)\n", @@ -663,9 +651,7 @@ "print(\"lstm(x) :\", y2.shape) # The output of the LSTM RNN\n", "print(\"hidden state (h) :\", h.shape) # The first hidden state of the LSTM RNN\n", "print(\"hidden state (c) :\", c.shape) # The second hidden state of the LSTM RNN" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -683,30 +669,30 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bGQ9XbWU_ffa" }, + "outputs": [], "source": [ "emb_out = embedding_module(x=x1)\n", "lstm_out = lstm_module(x=x2)[0]\n", "\n", "assert hasattr(emb_out, 'neural_type')\n", "assert hasattr(lstm_out, 'neural_type')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "kEpBruSOScPJ" }, + "outputs": [], "source": [ "print(\"Embedding tensor :\", emb_out.neural_type)\n", 
"print(\"LSTM tensor :\", lstm_out.neural_type)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -724,25 +710,25 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "8AU9FMtdATIm" }, + "outputs": [], "source": [ "emb_out.neural_type.compare(lstm_out.neural_type)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2cqnqAGIBCjA" }, + "outputs": [], "source": [ "emb_out.neural_type == lstm_out.neural_type" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -775,9 +761,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "AGbKB4gJEzcU" }, + "outputs": [], "source": [ "embedding_module = EmbeddingModule()\n", "x1 = torch.randint(high=10, size=(1, 5))\n", @@ -786,23 +774,21 @@ "x1.neural_type = NeuralType(('B', 'T'), Index())\n", "\n", "print(\"embedding(x) :\", embedding_module(x=x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "F0j-evylFM5j" }, + "outputs": [], "source": [ "# Attach wrong neural type [ERROR CELL]\n", "x1.neural_type = NeuralType(('B', 'T'), LabelsType())\n", "\n", "print(\"embedding(x) :\", embedding_module(x=x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -814,24 +800,40 @@ "\n", "Now that we have a somewhat firm grasp of neural type checking, let's begin porting the minGPT example code. Once again, most of the code will be a direct port from the [minGPT repository](https://github.com/karpathy/minGPT).\n", "\n", - "Here, you will notice one thing. By just changing class imports, one `@typecheck()` on forward, and adding `input_types` and `output_types` (which are also entirely optional!), we are almost entirely done with the PyTorch Lightning port!" + "Here, you will notice one thing. By just changing class imports, one `@typecheck()` on forward, and adding `input_types` and `output_types` (which are also entirely optional!), we are almost entirely done with the PyTorch Lightning port!\n", + "\n", + "**Note**: We've moved all the GPT component classes to a helper module to avoid `__main__` namespace issues with NeMo's security validation. Let's import them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from helper_files.gpt_components import (\n", + " AttentionType, SelfAttentionType, CausalSelfAttentionType,\n", + " CausalSelfAttention, Block,\n", + " GPTEmbedding, GPTTransformerEncoder, GPTDecoder\n", + ")\n" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "raFkuSRaBAE0" }, + "outputs": [], "source": [ + "# Basic imports needed for the tutorial\n", "import math\n", "from typing import List, Set, Dict, Tuple, Optional\n", "\n", "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -843,26 +845,25 @@ "\n", "Till now, we have used the Neural Types provided by the NeMo core. But we need not be restricted to the pre-defined element types !\n", "\n", - "Users have total flexibility in defining any hierarchy of element types as they please!" 
+ "Users have total flexibility in defining any hierarchy of element types as they please!\n", + "\n", + "We've defined custom element types in our helper module: `AttentionType`, `SelfAttentionType`, and `CausalSelfAttentionType` that create a hierarchy of attention-related neural types. These are imported from `helper_files.gpt_components`." ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ybhLLVyUF0mo" }, + "outputs": [], "source": [ - "class AttentionType(EncodedRepresentation):\n", - " \"\"\"Basic Attention Element Type\"\"\"\n", - "\n", - "class SelfAttentionType(AttentionType):\n", - " \"\"\"Self Attention Element Type\"\"\"\n", - "\n", - "class CausalSelfAttentionType(SelfAttentionType):\n", - " \"\"\"Causal Self Attention Element Type\"\"\"" - ], - "execution_count": null, - "outputs": [] + "# Custom element types are now imported from helper_files.gpt_components:\n", + "# - AttentionType(EncodedRepresentation): Basic Attention Element Type\n", + "# - SelfAttentionType(AttentionType): Self Attention Element Type \n", + "# - CausalSelfAttentionType(SelfAttentionType): Causal Self Attention Element Type\n", + "print(\"Custom element types imported successfully!\")" + ] }, { "cell_type": "markdown", @@ -874,81 +875,28 @@ "\n", "Neural Modules are generally top-level modules but can be used at any level of the module hierarchy.\n", "\n", - "For demonstration, we will treat an encoder comprising a block of Causal Self Attention modules as a typed Neural Module. Of course, we can also treat each Causal Self Attention layer itself as a neural module if we require it, but top-level modules are generally preferred." + "For demonstration, we will treat an encoder comprising a block of Causal Self Attention modules as a typed Neural Module. Of course, we can also treat each Causal Self Attention layer itself as a neural module if we require it, but top-level modules are generally preferred.\n", + "\n", + "The basic PyTorch modules (`CausalSelfAttention` and `Block`) are now imported from our helper module to avoid `__main__` namespace issues with NeMo's security validation." 
] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "w4oXpAL_CoDp" }, + "outputs": [], "source": [ - "class CausalSelfAttention(nn.Module):\n", - " \"\"\"\n", - " A vanilla multi-head masked self-attention layer with a projection at the end.\n", - " It is possible to use torch.nn.MultiheadAttention here but I am including an\n", - " explicit implementation here to show that there is nothing too scary here.\n", - " \"\"\"\n", - "\n", - " def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop):\n", - " super().__init__()\n", - " assert n_embd % n_head == 0\n", - " self.n_head = n_head\n", - " # key, query, value projections for all heads\n", - " self.key = nn.Linear(n_embd, n_embd)\n", - " self.query = nn.Linear(n_embd, n_embd)\n", - " self.value = nn.Linear(n_embd, n_embd)\n", - " # regularization\n", - " self.attn_drop = nn.Dropout(attn_pdrop)\n", - " self.resid_drop = nn.Dropout(resid_pdrop)\n", - " # output projection\n", - " self.proj = nn.Linear(n_embd, n_embd)\n", - " # causal mask to ensure that attention is only applied to the left in the input sequence\n", - " self.register_buffer(\"mask\", torch.tril(torch.ones(block_size, block_size))\n", - " .view(1, 1, block_size, block_size))\n", - " def forward(self, x, layer_past=None):\n", - " B, T, C = x.size()\n", - "\n", - " # calculate query, key, values for all heads in batch and move head forward to be the batch dim\n", - " k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", - " q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", - " v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", - "\n", - " # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)\n", - " att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))\n", - " att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))\n", - " att = F.softmax(att, dim=-1)\n", - " att = self.attn_drop(att)\n", - " y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)\n", - " y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side\n", - "\n", - " # output projection\n", - " y = self.resid_drop(self.proj(y))\n", - " return y\n", - " \n", - "\n", - "class Block(nn.Module):\n", - " \"\"\" an unassuming Transformer block \"\"\"\n", + "# CausalSelfAttention and Block classes are now imported from helper_files.gpt_components\n", + "# These are standard PyTorch nn.Module implementations:\n", + "# - CausalSelfAttention: A vanilla multi-head masked self-attention layer\n", + "# - Block: An unassuming Transformer block combining attention and MLP\n", "\n", - " def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop):\n", - " super().__init__()\n", - " self.ln1 = nn.LayerNorm(n_embd)\n", - " self.ln2 = nn.LayerNorm(n_embd)\n", - " self.attn = CausalSelfAttention(n_embd, block_size, n_head, attn_pdrop, resid_pdrop)\n", - " self.mlp = nn.Sequential(\n", - " nn.Linear(n_embd, 4 * n_embd),\n", - " nn.GELU(),\n", - " nn.Linear(4 * n_embd, n_embd),\n", - " nn.Dropout(resid_pdrop),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = x + self.attn(self.ln1(x))\n", - " x = x + self.mlp(self.ln2(x))\n", - " return x" - ], - "execution_count": null, - "outputs": [] + "print(\"Basic PyTorch modules imported successfully!\")\n", + "print(f\"CausalSelfAttention: {CausalSelfAttention}\")\n", + "print(f\"Block: {Block}\")" + ] }, { 
"cell_type": "markdown", @@ -980,16 +928,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "0TsfmCYthMux" }, + "outputs": [], "source": [ "import lightning.pytorch as ptl\n", "from nemo.core import ModelPT\n", "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1005,9 +953,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "98x9-Fh-HVwj" }, + "outputs": [], "source": [ "class PTLGPT(ptl.LightningModule):\n", " def __init__(self,\n", @@ -1077,9 +1027,7 @@ " elif isinstance(module, nn.LayerNorm):\n", " module.bias.data.zero_()\n", " module.weight.data.fill_(1.0)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1093,14 +1041,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "rrXIBzg4wutC" }, + "outputs": [], "source": [ "m = PTLGPT(vocab_size=100, block_size=32, n_layer=1, n_embd=32, n_head=4)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1147,48 +1095,26 @@ "source": [ "### Refactoring the Embedding module\n", "\n", - "Let's first refactor out the embedding module from the above implementation" + "Let's first refactor out the embedding module from the above implementation. The `GPTEmbedding` class is now imported from our helper module." ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "uYwMyjqK05RL" }, + "outputs": [], "source": [ - "class GPTEmbedding(NeuralModule):\n", - " def __init__(self, vocab_size: int, n_embd: int, block_size: int, embd_pdrop: float = 0.0):\n", - " super().__init__()\n", - "\n", - " # input embedding stem: drop(content + position)\n", - " self.tok_emb = nn.Embedding(vocab_size, n_embd)\n", - " self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))\n", - " self.drop = nn.Dropout(embd_pdrop)\n", - "\n", - " @typecheck()\n", - " def forward(self, idx):\n", - " b, t = idx.size()\n", - " \n", - " # forward the GPT model\n", - " token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector\n", - " position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector\n", - " x = self.drop(token_embeddings + position_embeddings)\n", - " return x\n", - "\n", - " @property\n", - " def input_types(self):\n", - " return {\n", - " 'idx': NeuralType(('B', 'T'), Index())\n", - " }\n", + "# GPTEmbedding NeuralModule is now imported from helper_files.gpt_components\n", + "# It implements token and positional embeddings with dropout\n", + "print(f\"GPTEmbedding imported: {GPTEmbedding}\")\n", "\n", - " @property\n", - " def output_types(self):\n", - " return {\n", - " 'embeddings': NeuralType(('B', 'T', 'C'), EmbeddedTextType())\n", - " }" - ], - "execution_count": null, - "outputs": [] + "# Example instantiation (with dummy parameters for demonstration)\n", + "dummy_embedding = GPTEmbedding(vocab_size=100, n_embd=32, block_size=128)\n", + "print(f\"Input types: {dummy_embedding.input_types}\")\n", + "print(f\"Output types: {dummy_embedding.output_types}\")" + ] }, { "cell_type": "markdown", @@ -1198,7 +1124,7 @@ "source": [ "### Refactoring the Encoder\n", "\n", - "Next, let's refactor the GPT Encoder - which is implemented as a multi layer Transformer (Decoder) network.\n", + "Next, let's refactor the GPT Encoder - which is implemented as a multi layer Transformer (Decoder) network. 
The `GPTTransformerEncoder` class is now imported from our helper module.\n", "\n", "------\n", "It can be noted that we refer to the GPT \"Encoder\" module - but it is constructed by using Transformer \"Decoder\" blocks.\n", @@ -1217,35 +1143,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "1QeQnQ_G2PwH" }, + "outputs": [], "source": [ - "class GPTTransformerEncoder(NeuralModule):\n", - " def __init__(self, n_embd: int, block_size: int, n_head: int, n_layer: int, attn_pdrop: float = 0.0, resid_pdrop: float = 0.0):\n", - " super().__init__()\n", - "\n", - " self.blocks = nn.Sequential(*[Block(n_embd, block_size, n_head, attn_pdrop, resid_pdrop) \n", - " for _ in range(n_layer)])\n", - " \n", - " @typecheck()\n", - " def forward(self, embed):\n", - " return self.blocks(embed)\n", - "\n", - " @property\n", - " def input_types(self):\n", - " return {\n", - " 'embed': NeuralType(('B', 'T', 'C'), EmbeddedTextType())\n", - " }\n", + "# GPTTransformerEncoder NeuralModule is now imported from helper_files.gpt_components\n", + "# It implements a sequence of transformer blocks for encoding\n", + "print(f\"GPTTransformerEncoder imported: {GPTTransformerEncoder}\")\n", "\n", - " @property\n", - " def output_types(self):\n", - " return {\n", - " 'encoding': NeuralType(('B', 'T', 'C'), CausalSelfAttentionType())\n", - " }" - ], - "execution_count": null, - "outputs": [] + "# Example instantiation (with dummy parameters for demonstration)\n", + "dummy_encoder = GPTTransformerEncoder(n_embd=32, block_size=128, n_head=4, n_layer=1)\n", + "print(f\"Input types: {dummy_encoder.input_types}\")\n", + "print(f\"Output types: {dummy_encoder.output_types}\")" + ] }, { "cell_type": "markdown", @@ -1255,7 +1167,7 @@ "source": [ "### Refactoring the Decoder\n", "\n", - "Finally, let's refactor the Decoder - the small one-layer feed-forward network to decode the answer.\n", + "Finally, let's refactor the Decoder - the small one-layer feed-forward network to decode the answer. 
The `GPTDecoder` class is now imported from our helper module.\n", "\n", "-------\n", "\n", @@ -1268,36 +1180,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "VCPUu0EWQIBX" }, + "outputs": [], "source": [ - "class GPTDecoder(NeuralModule):\n", - " def __init__(self, n_embd: int, vocab_size: int):\n", - " super().__init__()\n", - " self.ln_f = nn.LayerNorm(n_embd)\n", - " self.head = nn.Linear(n_embd, vocab_size, bias=False) # no need for extra bias due to one in ln_f\n", + "# GPTDecoder NeuralModule is now imported from helper_files.gpt_components\n", + "# It implements layer normalization followed by a linear layer to produce logits\n", + "print(f\"GPTDecoder imported: {GPTDecoder}\")\n", "\n", - " @typecheck()\n", - " def forward(self, encoding):\n", - " x = self.ln_f(encoding)\n", - " logits = self.head(x)\n", - " return logits\n", - "\n", - " @property\n", - " def input_types(self):\n", - " return {\n", - " 'encoding': NeuralType(('B', 'T', 'C'), EncodedRepresentation())\n", - " }\n", - " \n", - " @property\n", - " def output_types(self):\n", - " return {\n", - " 'logits': NeuralType(('B', 'T', 'C'), LogitsType())\n", - " }\n" - ], - "execution_count": null, - "outputs": [] + "# Example instantiation (with dummy parameters for demonstration)\n", + "dummy_decoder = GPTDecoder(n_embd=32, vocab_size=100)\n", + "print(f\"Input types: {dummy_decoder.input_types}\") \n", + "print(f\"Output types: {dummy_decoder.output_types}\")\n" + ] }, { "cell_type": "markdown", @@ -1314,9 +1211,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZQlmtYU6iDwi" }, + "outputs": [], "source": [ "class AbstractNeMoGPT(ModelPT):\n", " def __init__(self, cfg: OmegaConf, trainer: ptl.Trainer = None):\n", @@ -1375,9 +1274,7 @@ " return {\n", " 'logits': NeuralType(('B', 'T', 'C'), LogitsType())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1396,9 +1293,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "uygo0BEYjKuj" }, + "outputs": [], "source": [ "# model definition args (required)\n", "# ================================\n", @@ -1413,9 +1312,7 @@ "# embd_pdrop: float = 0.1, # \\in [0,1]: amount of dropout on input embeddings\n", "# resid_pdrop: float = 0.1, # \\in [0,1]: amount of dropout in each residual connection\n", "# attn_pdrop: float = 0.1, # \\in [0,1]: amount of dropout on the attention matrix" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1431,27 +1328,27 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "XqLSZq7Soo2j" }, + "outputs": [], "source": [ "from omegaconf import MISSING" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "JTH-1vu8TO7o" }, + "outputs": [], "source": [ "# Let's create a utility for building the class path\n", "def get_class_path(cls):\n", " return f'{cls.__module__}.{cls.__name__}'" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1466,9 +1363,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZCvLdOlMVLy_" }, + "outputs": [], "source": [ "common_config = OmegaConf.create({\n", " 'vocab_size': MISSING,\n", @@ -1477,9 +1376,7 @@ " 'n_embd': MISSING,\n", " 'n_head': MISSING,\n", "})" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1510,9 +1407,11 @@ }, { "cell_type": "code", + "execution_count": null, 
"metadata": { "id": "ntsxQKH0pDac" }, + "outputs": [], "source": [ "embedding_config = OmegaConf.create({\n", " '_target_': get_class_path(GPTEmbedding),\n", @@ -1538,9 +1437,7 @@ " 'n_embd': '${model.n_embd}',\n", " 'vocab_size': '${model.vocab_size}'\n", "})" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1591,9 +1488,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "c8hvNeB_aDgi" }, + "outputs": [], "source": [ "model_config = OmegaConf.create({\n", " 'model': common_config\n", @@ -1603,9 +1502,7 @@ "model_config.model.embedding = embedding_config\n", "model_config.model.encoder = encoder_config\n", "model_config.model.decoder = decoder_config" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1619,14 +1516,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2SyKNgp9pG0N" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(model_config))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1642,20 +1539,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "0X4C76JyOAnN" }, + "outputs": [], "source": [ "import copy" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ugxA0TPtbHVZ" }, + "outputs": [], "source": [ "temp_config = copy.deepcopy(model_config)\n", "temp_config.model.vocab_size = 10\n", @@ -1666,9 +1565,7 @@ "\n", "temp_config = OmegaConf.create(OmegaConf.to_container(temp_config, resolve=True))\n", "print(OmegaConf.to_yaml(temp_config))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1682,21 +1579,23 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "IIIVi2IfpsJ4" }, + "outputs": [], "source": [ "# Let's work on a copy of the model config and update it before we send it into the Model.\n", "cfg = copy.deepcopy(model_config)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OllBhswPqQXq" }, + "outputs": [], "source": [ "# Let's set the values of the config (for some plausible small model)\n", "cfg.model.vocab_size = 100\n", @@ -1704,32 +1603,30 @@ "cfg.model.n_layer = 1\n", "cfg.model.n_embd = 32\n", "cfg.model.n_head = 4" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QJm2LnTqqcIM" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(cfg))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "E7tpB8BcqeBO" }, + "outputs": [], "source": [ "# Try to create a model with this config [ERROR CELL]\n", "m = AbstractNeMoGPT(cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1759,20 +1656,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Vcwi1lO7t7Sm" }, + "outputs": [], "source": [ "from nemo.core.classes.common import PretrainedModelInfo" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ckCxyVLYqrz0" }, + "outputs": [], "source": [ "class BasicNeMoGPT(AbstractNeMoGPT):\n", "\n", @@ -1788,9 +1687,7 @@ " \n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = None" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", 
@@ -1804,14 +1701,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "G8iYQSC5vptU" }, + "outputs": [], "source": [ "m = BasicNeMoGPT(cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1836,9 +1733,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QU3oQAVovxRg" }, + "outputs": [], "source": [ "class BasicNeMoGPTWithSteps(BasicNeMoGPT):\n", "\n", @@ -1868,20 +1767,18 @@ " def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0):\n", " test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()\n", " return {'test_loss': test_loss_mean}" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2Ki3kRxag511" }, + "outputs": [], "source": [ "m = BasicNeMoGPTWithSteps(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1932,9 +1829,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "FgXkZQiVjnOv" }, + "outputs": [], "source": [ "class BasicNeMoGPTWithOptim(BasicNeMoGPTWithSteps):\n", "\n", @@ -1983,20 +1882,18 @@ " ]\n", " optimizer = torch.optim.AdamW(optim_groups, lr=self.cfg.optim.lr, betas=self.cfg.optim.betas)\n", " return optimizer\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "kARDwthakEQk" }, + "outputs": [], "source": [ "m = BasicNeMoGPTWithOptim(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2010,9 +1907,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "5K7zh9Cn2s2u" }, + "outputs": [], "source": [ "OmegaConf.set_struct(cfg.model, False)\n", "\n", @@ -2025,9 +1924,7 @@ "cfg.model.optim = optim_config\n", "\n", "OmegaConf.set_struct(cfg.model, True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2066,22 +1963,24 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "E-fswFkig9t4" }, + "outputs": [], "source": [ "from nemo.core import Dataset\n", "from torch.utils import data\n", "from torch.utils.data.dataloader import DataLoader" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "-Z8XuPeClGNm" }, + "outputs": [], "source": [ "class TinyShakespeareDataset(Dataset):\n", "\n", @@ -2136,9 +2035,7 @@ " 'input': NeuralType(('B', 'T'), Index()),\n", " 'target': NeuralType(('B', 'T'), LabelsType())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2168,50 +2065,50 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "VwsdXtVzo--t" }, + "outputs": [], "source": [ "import os" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QvKcDCvIl9-A" }, + "outputs": [], "source": [ "if not os.path.exists('tiny-shakespeare.txt'):\n", " !wget https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ynCwqDu6vK8P" }, + "outputs": [], "source": [ "!head -n 5 tiny-shakespeare.txt" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bfRL4t9_oS4C" }, + "outputs": [], "source": [ 
"train_dataset = TinyShakespeareDataset('tiny-shakespeare.txt', cfg.model.block_size, crop=(0, int(1e6)))\n", "val_dataset = TinyShakespeareDataset('tiny-shakespeare.txt', cfg.model.block_size, crop=(int(1e6), int(50e3)), override_vocab=train_dataset.vocab)\n", "test_dataset = TinyShakespeareDataset('tiny-shakespeare.txt', cfg.model.block_size, crop=(int(1.05e6), int(100e3)), override_vocab=train_dataset.vocab)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2230,9 +2127,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SVSfIk_-rMSg" }, + "outputs": [], "source": [ "class NeMoGPT(BasicNeMoGPTWithOptim):\n", "\n", @@ -2270,9 +2169,7 @@ " \n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = self._setup_data_loader(test_data_config)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2287,9 +2184,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "C6zcTqJixOOL" }, + "outputs": [], "source": [ "OmegaConf.set_struct(cfg.model, False)\n", "\n", @@ -2298,15 +2197,15 @@ "cfg.model.vocab_size = train_dataset.vocab_size\n", "\n", "OmegaConf.set_struct(cfg.model, True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "zlvThf7BysyT" }, + "outputs": [], "source": [ "train_ds = OmegaConf.create({\n", " 'data_path': '${model.data_path}',\n", @@ -2331,15 +2230,15 @@ " 'batch_size': 4,\n", " 'shuffle': False,\n", "})" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QVVzR6WKyMT5" }, + "outputs": [], "source": [ "# Attach to the model config\n", "OmegaConf.set_struct(cfg.model, False)\n", @@ -2349,33 +2248,31 @@ "cfg.model.test_ds = test_ds\n", "\n", "OmegaConf.set_struct(cfg.model, True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "nd_9_mxS0ET-" }, + "outputs": [], "source": [ "# Let's see the config now !\n", "print(OmegaConf.to_yaml(cfg))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "dlwSQENU0JxA" }, + "outputs": [], "source": [ "# Let's try creating a model now !\n", "model = NeMoGPT(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2410,9 +2307,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "johk6Z0e0WEm" }, + "outputs": [], "source": [ "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", @@ -2420,20 +2319,18 @@ " accelerator = 'cpu'\n", "\n", "trainer = ptl.Trainer(devices=1, accelerator=accelerator, limit_test_batches=1.0)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "oqeeofEr1S8e" }, + "outputs": [], "source": [ "trainer.test(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2450,48 +2347,48 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "DksG_-7G1Vbe" }, + "outputs": [], "source": [ "model.save_to('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "JhjoFdCnBWVh" }, + "outputs": [], "source": [ "!ls -d -- *.nemo" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + 
"execution_count": null, "metadata": { "id": "567txSF0BYXN" }, + "outputs": [], "source": [ "temp_model = NeMoGPT.restore_from('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "YvnfG0kxBfTt" }, + "outputs": [], "source": [ "# [ERROR CELL]\n", "temp_model.setup_test_data(temp_model.cfg.test_ds)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2510,9 +2407,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_Atyoc4NBjEV" }, + "outputs": [], "source": [ "class NeMoGPTv2(NeMoGPT):\n", " \n", @@ -2552,61 +2451,61 @@ " self.vocab = vocab\n", "\n", " self._test_dl = self._setup_data_loader(test_data_config)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "mn09jsRZDusN" }, + "outputs": [], "source": [ "# Let's try creating a model now !\n", "model = NeMoGPTv2(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "sQPIPySDD1K0" }, + "outputs": [], "source": [ "# Now let's try to save and restore !\n", "model.save_to('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "0YwCJ4xaJ3bU" }, + "outputs": [], "source": [ "temp_model = NeMoGPTv2.restore_from('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "tcxwDIIWKKCQ" }, + "outputs": [], "source": [ "temp_model.setup_multiple_test_data(temp_model.cfg.test_ds)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "j3Olm6ZTKRbO" }, + "outputs": [], "source": [ "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", @@ -2614,20 +2513,18 @@ " accelerator = 'cpu'\n", "\n", "trainer = ptl.Trainer(devices=1, accelerator=accelerator, limit_test_batches =1.0)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_QE2SngCKV2p" }, + "outputs": [], "source": [ "trainer.test(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2641,14 +2538,26 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZjCV5u3_OO7a" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "01_NeMo_Models.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/helper_files/__init__.py b/tutorials/helper_files/__init__.py new file mode 100644 index 000000000000..1e1142df0443 --- /dev/null +++ b/tutorials/helper_files/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper files for NeMo tutorials.""" diff --git a/tutorials/helper_files/gpt_components.py b/tutorials/helper_files/gpt_components.py new file mode 100644 index 000000000000..234ff45c1bec --- /dev/null +++ b/tutorials/helper_files/gpt_components.py @@ -0,0 +1,187 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +GPT components for the NeMo Models tutorial. +This module contains the neural network components used in the tutorial 01_NeMo_Models.ipynb +""" + +import math +from typing import Optional + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from nemo.core import NeuralModule, typecheck +from nemo.core.neural_types import EmbeddedTextType, EncodedRepresentation, Index, LogitsType, NeuralType +from nemo.core.neural_types.elements import * + + +# Custom Element Types +class AttentionType(EncodedRepresentation): + """Basic Attention Element Type""" + + +class SelfAttentionType(AttentionType): + """Self Attention Element Type""" + + +class CausalSelfAttentionType(SelfAttentionType): + """Causal Self Attention Element Type""" + + +# Neural Network Modules (not NeMo neural modules) +class CausalSelfAttention(nn.Module): + """ + A vanilla multi-head masked self-attention layer with a projection at the end. + It is possible to use torch.nn.MultiheadAttention here but I am including an + explicit implementation here to show that there is nothing too scary here. 
+ """ + + def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop): + super().__init__() + assert n_embd % n_head == 0 + self.n_head = n_head + # key, query, value projections for all heads + self.key = nn.Linear(n_embd, n_embd) + self.query = nn.Linear(n_embd, n_embd) + self.value = nn.Linear(n_embd, n_embd) + # regularization + self.attn_drop = nn.Dropout(attn_pdrop) + self.resid_drop = nn.Dropout(resid_pdrop) + # output projection + self.proj = nn.Linear(n_embd, n_embd) + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)) + + def forward(self, x, layer_past=None): + B, T, C = x.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + + # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.attn_drop(att) + y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) + y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side + + # output projection + y = self.resid_drop(self.proj(y)) + return y + + +class Block(nn.Module): + """an unassuming Transformer block""" + + def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop): + super().__init__() + self.ln1 = nn.LayerNorm(n_embd) + self.ln2 = nn.LayerNorm(n_embd) + self.attn = CausalSelfAttention(n_embd, block_size, n_head, attn_pdrop, resid_pdrop) + self.mlp = nn.Sequential( + nn.Linear(n_embd, 4 * n_embd), + nn.GELU(), + nn.Linear(4 * n_embd, n_embd), + nn.Dropout(resid_pdrop), + ) + + def forward(self, x): + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + +# NeMo Neural Modules +class GPTEmbedding(NeuralModule): + def __init__(self, vocab_size: int, n_embd: int, block_size: int, embd_pdrop: float = 0.0): + super().__init__() + + # input embedding stem: drop(content + position) + self.tok_emb = nn.Embedding(vocab_size, n_embd) + self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd)) + self.drop = nn.Dropout(embd_pdrop) + + @typecheck() + def forward(self, idx): + b, t = idx.size() + + # forward the GPT model + token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector + position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector + x = self.drop(token_embeddings + position_embeddings) + return x + + @property + def input_types(self): + return {'idx': NeuralType(('B', 'T'), Index())} + + @property + def output_types(self): + return {'embeddings': NeuralType(('B', 'T', 'C'), EmbeddedTextType())} + + +class GPTTransformerEncoder(NeuralModule): + def __init__( + self, + n_embd: int, + block_size: int, + n_head: int, + n_layer: int, + attn_pdrop: float = 0.0, + resid_pdrop: float = 0.0, + ): + super().__init__() + + self.blocks = nn.Sequential( + *[Block(n_embd, block_size, n_head, attn_pdrop, resid_pdrop) for _ in range(n_layer)] + ) + + @typecheck() + def forward(self, embed): + return self.blocks(embed) 
+ + @property + def input_types(self): + return {'embed': NeuralType(('B', 'T', 'C'), EmbeddedTextType())} + + @property + def output_types(self): + return {'encoding': NeuralType(('B', 'T', 'C'), CausalSelfAttentionType())} + + +class GPTDecoder(NeuralModule): + def __init__(self, n_embd: int, vocab_size: int): + super().__init__() + self.ln_f = nn.LayerNorm(n_embd) + self.head = nn.Linear(n_embd, vocab_size, bias=False) # no need for extra bias due to one in ln_f + + @typecheck() + def forward(self, encoding): + x = self.ln_f(encoding) + logits = self.head(x) + return logits + + @property + def input_types(self): + return {'encoding': NeuralType(('B', 'T', 'C'), EncodedRepresentation())} + + @property + def output_types(self): + return {'logits': NeuralType(('B', 'T', 'C'), LogitsType())}
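

# --- Minimal usage sketch (illustrative only, not exercised by the notebook) ---
# A hedged smoke test showing how the three NeuralModules above are meant to
# compose in the tutorial: GPTEmbedding -> GPTTransformerEncoder -> GPTDecoder.
# Calls use keyword arguments, as required by @typecheck(), with argument names
# taken from each module's declared input_types. The hyperparameter values are
# arbitrary and chosen only to keep the check small.
if __name__ == "__main__":
    vocab_size, block_size, n_embd, n_head, n_layer = 100, 32, 32, 4, 1

    embedding = GPTEmbedding(vocab_size=vocab_size, n_embd=n_embd, block_size=block_size)
    encoder = GPTTransformerEncoder(n_embd=n_embd, block_size=block_size, n_head=n_head, n_layer=n_layer)
    decoder = GPTDecoder(n_embd=n_embd, vocab_size=vocab_size)

    idx = torch.randint(high=vocab_size, size=(2, block_size))  # (B, T) token indices
    embeddings = embedding(idx=idx)        # (B, T, C), typed as EmbeddedTextType
    encoding = encoder(embed=embeddings)   # (B, T, C), typed as CausalSelfAttentionType
    logits = decoder(encoding=encoding)    # (B, T, vocab_size), typed as LogitsType

    print("logits shape:", logits.shape)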