diff --git a/tutorials/01_NeMo_Models.ipynb b/tutorials/01_NeMo_Models.ipynb index 43972549203c..a46a3238d040 100644 --- a/tutorials/01_NeMo_Models.ipynb +++ b/tutorials/01_NeMo_Models.ipynb @@ -1,24 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "01_NeMo_Models.ipynb", - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ASnx4b5jXsil" }, + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -45,9 +33,7 @@ "\n", "## Grab the config we'll use in this example\n", "!mkdir configs" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -174,17 +160,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "piLOgwOPX1FS" }, + "outputs": [], "source": [ "import torch\n", "import nemo\n", "from nemo.core import NeuralModule\n", "from nemo.core import typecheck" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -208,29 +194,29 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bseLiNoqqQrE" }, + "outputs": [], "source": [ "class MyEmptyModule(NeuralModule):\n", "\n", " def forward(self):\n", " print(\"Neural Module ~ hello world!\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "j4Q36L5urdOQ" }, + "outputs": [], "source": [ "x = MyEmptyModule()\n", "x()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -261,33 +247,33 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZvC57bbxwXxN" }, + "outputs": [], "source": [ "# Case 1:\n", "embedding = torch.nn.Embedding(num_embeddings=10, embedding_dim=30)\n", "x = torch.randint(high=10, size=(1, 5))\n", "print(\"x :\", x)\n", "print(\"embedding(x) :\", embedding(x).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "sMaqhMBgxe2C" }, + "outputs": [], "source": [ "# Case 2\n", "lstm = torch.nn.LSTM(1, 30, batch_first=True)\n", "x = torch.randn(1, 5, 1)\n", "print(\"x :\", x)\n", "print(\"lstm(x) :\", lstm(x)[0].shape) # Let's take all timestep outputs of the LSTM" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -340,21 +326,23 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "yp0FG8NJt1Jd" }, + "outputs": [], "source": [ "from nemo.core.neural_types import NeuralType\n", "from nemo.core.neural_types import *" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "3tsgs8Fp0-WV" }, + "outputs": [], "source": [ "class EmbeddingModule(NeuralModule):\n", " def __init__(self):\n", @@ -376,9 +364,7 @@ " return {\n", " 'y': NeuralType(axes=('B', 'T', 'C'), elements_type=EmbeddedTextType())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -442,14 +428,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "boxxMniv27vi" }, + "outputs": [], "source": [ "embedding_module = EmbeddingModule()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -462,9 +448,11 @@ }, { "cell_type": "code", + 
"execution_count": null, "metadata": { "id": "SZZOOoCJ2-iV" }, + "outputs": [], "source": [ "class LSTMModule(NeuralModule):\n", " def __init__(self):\n", @@ -486,9 +474,7 @@ " return {\n", " 'y': NeuralType(axes=('B', 'T', 'C'), elements_type=EncodedRepresentation())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -506,14 +492,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6LlOJf0C8GN4" }, + "outputs": [], "source": [ "lstm_module = LSTMModule()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -527,17 +513,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "giLJlub78-Ja" }, + "outputs": [], "source": [ "# Case 1 [ERROR CELL]\n", "x1 = torch.randint(high=10, size=(1, 5))\n", "print(\"x :\", x1)\n", "print(\"embedding(x) :\", embedding_module(x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -553,16 +539,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2KUj_p6M9L-f" }, + "outputs": [], "source": [ "# Case 1\n", "print(\"x :\", x1)\n", "print(\"embedding(x) :\", embedding_module(x=x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -575,17 +561,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "FMu3B0-9-CqE" }, + "outputs": [], "source": [ "# Case 2 [ERROR CELL]\n", "x2 = torch.randn(1, 5, 1) # Input = [B=1, T=5, C=1]\n", "print(\"x :\", x2)\n", "print(\"lstm(x) :\", lstm_module(x=x2)[0].shape) # Let's take all timestep outputs of the LSTM" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -611,9 +597,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "q2u-keAM-d-B" }, + "outputs": [], "source": [ "class CorrectLSTMModule(LSTMModule): # Let's inherit the wrong class to make it easy to override\n", " @property\n", @@ -622,9 +610,7 @@ " 'y': NeuralType(axes=('B', 'T', 'C'), elements_type=EncodedRepresentation()),\n", " 'h_c': [NeuralType(axes=('D', 'B', 'C'), elements_type=EncodedRepresentation())],\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -641,20 +627,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "GyPZH-fz_dG4" }, + "outputs": [], "source": [ "lstm_module = CorrectLSTMModule()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9whH50PE_Xyx" }, + "outputs": [], "source": [ "# Case 2\n", "x2 = torch.randn(1, 5, 1)\n", @@ -663,9 +651,7 @@ "print(\"lstm(x) :\", y2.shape) # The output of the LSTM RNN\n", "print(\"hidden state (h) :\", h.shape) # The first hidden state of the LSTM RNN\n", "print(\"hidden state (c) :\", c.shape) # The second hidden state of the LSTM RNN" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -683,30 +669,30 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bGQ9XbWU_ffa" }, + "outputs": [], "source": [ "emb_out = embedding_module(x=x1)\n", "lstm_out = lstm_module(x=x2)[0]\n", "\n", "assert hasattr(emb_out, 'neural_type')\n", "assert hasattr(lstm_out, 'neural_type')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "kEpBruSOScPJ" }, + "outputs": [], "source": [ "print(\"Embedding tensor :\", emb_out.neural_type)\n", 
"print(\"LSTM tensor :\", lstm_out.neural_type)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -724,25 +710,25 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "8AU9FMtdATIm" }, + "outputs": [], "source": [ "emb_out.neural_type.compare(lstm_out.neural_type)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2cqnqAGIBCjA" }, + "outputs": [], "source": [ "emb_out.neural_type == lstm_out.neural_type" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -775,9 +761,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "AGbKB4gJEzcU" }, + "outputs": [], "source": [ "embedding_module = EmbeddingModule()\n", "x1 = torch.randint(high=10, size=(1, 5))\n", @@ -786,23 +774,21 @@ "x1.neural_type = NeuralType(('B', 'T'), Index())\n", "\n", "print(\"embedding(x) :\", embedding_module(x=x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "F0j-evylFM5j" }, + "outputs": [], "source": [ "# Attach wrong neural type [ERROR CELL]\n", "x1.neural_type = NeuralType(('B', 'T'), LabelsType())\n", "\n", "print(\"embedding(x) :\", embedding_module(x=x1).shape)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -814,24 +800,40 @@ "\n", "Now that we have a somewhat firm grasp of neural type checking, let's begin porting the minGPT example code. Once again, most of the code will be a direct port from the [minGPT repository](https://github.com/karpathy/minGPT).\n", "\n", - "Here, you will notice one thing. By just changing class imports, one `@typecheck()` on forward, and adding `input_types` and `output_types` (which are also entirely optional!), we are almost entirely done with the PyTorch Lightning port!" + "Here, you will notice one thing. By just changing class imports, one `@typecheck()` on forward, and adding `input_types` and `output_types` (which are also entirely optional!), we are almost entirely done with the PyTorch Lightning port!\n", + "\n", + "**Note**: We've moved all the GPT component classes to a helper module to avoid `__main__` namespace issues with NeMo's security validation. Let's import them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from helper_files.gpt_components import (\n", + " AttentionType, SelfAttentionType, CausalSelfAttentionType,\n", + " CausalSelfAttention, Block,\n", + " GPTEmbedding, GPTTransformerEncoder, GPTDecoder\n", + ")\n" ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "raFkuSRaBAE0" }, + "outputs": [], "source": [ + "# Basic imports needed for the tutorial\n", "import math\n", "from typing import List, Set, Dict, Tuple, Optional\n", "\n", "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -843,26 +845,25 @@ "\n", "Till now, we have used the Neural Types provided by the NeMo core. But we need not be restricted to the pre-defined element types !\n", "\n", - "Users have total flexibility in defining any hierarchy of element types as they please!" 
+ "Users have total flexibility in defining any hierarchy of element types as they please!\n", + "\n", + "We've defined custom element types in our helper module: `AttentionType`, `SelfAttentionType`, and `CausalSelfAttentionType` that create a hierarchy of attention-related neural types. These are imported from `helper_files.gpt_components`." ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ybhLLVyUF0mo" }, + "outputs": [], "source": [ - "class AttentionType(EncodedRepresentation):\n", - " \"\"\"Basic Attention Element Type\"\"\"\n", - "\n", - "class SelfAttentionType(AttentionType):\n", - " \"\"\"Self Attention Element Type\"\"\"\n", - "\n", - "class CausalSelfAttentionType(SelfAttentionType):\n", - " \"\"\"Causal Self Attention Element Type\"\"\"" - ], - "execution_count": null, - "outputs": [] + "# Custom element types are now imported from helper_files.gpt_components:\n", + "# - AttentionType(EncodedRepresentation): Basic Attention Element Type\n", + "# - SelfAttentionType(AttentionType): Self Attention Element Type \n", + "# - CausalSelfAttentionType(SelfAttentionType): Causal Self Attention Element Type\n", + "print(\"Custom element types imported successfully!\")" + ] }, { "cell_type": "markdown", @@ -874,81 +875,28 @@ "\n", "Neural Modules are generally top-level modules but can be used at any level of the module hierarchy.\n", "\n", - "For demonstration, we will treat an encoder comprising a block of Causal Self Attention modules as a typed Neural Module. Of course, we can also treat each Causal Self Attention layer itself as a neural module if we require it, but top-level modules are generally preferred." + "For demonstration, we will treat an encoder comprising a block of Causal Self Attention modules as a typed Neural Module. Of course, we can also treat each Causal Self Attention layer itself as a neural module if we require it, but top-level modules are generally preferred.\n", + "\n", + "The basic PyTorch modules (`CausalSelfAttention` and `Block`) are now imported from our helper module to avoid `__main__` namespace issues with NeMo's security validation." 
] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "w4oXpAL_CoDp" }, + "outputs": [], "source": [ - "class CausalSelfAttention(nn.Module):\n", - " \"\"\"\n", - " A vanilla multi-head masked self-attention layer with a projection at the end.\n", - " It is possible to use torch.nn.MultiheadAttention here but I am including an\n", - " explicit implementation here to show that there is nothing too scary here.\n", - " \"\"\"\n", - "\n", - " def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop):\n", - " super().__init__()\n", - " assert n_embd % n_head == 0\n", - " self.n_head = n_head\n", - " # key, query, value projections for all heads\n", - " self.key = nn.Linear(n_embd, n_embd)\n", - " self.query = nn.Linear(n_embd, n_embd)\n", - " self.value = nn.Linear(n_embd, n_embd)\n", - " # regularization\n", - " self.attn_drop = nn.Dropout(attn_pdrop)\n", - " self.resid_drop = nn.Dropout(resid_pdrop)\n", - " # output projection\n", - " self.proj = nn.Linear(n_embd, n_embd)\n", - " # causal mask to ensure that attention is only applied to the left in the input sequence\n", - " self.register_buffer(\"mask\", torch.tril(torch.ones(block_size, block_size))\n", - " .view(1, 1, block_size, block_size))\n", - " def forward(self, x, layer_past=None):\n", - " B, T, C = x.size()\n", - "\n", - " # calculate query, key, values for all heads in batch and move head forward to be the batch dim\n", - " k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", - " q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", - " v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)\n", - "\n", - " # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)\n", - " att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))\n", - " att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))\n", - " att = F.softmax(att, dim=-1)\n", - " att = self.attn_drop(att)\n", - " y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)\n", - " y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side\n", - "\n", - " # output projection\n", - " y = self.resid_drop(self.proj(y))\n", - " return y\n", - " \n", - "\n", - "class Block(nn.Module):\n", - " \"\"\" an unassuming Transformer block \"\"\"\n", + "# CausalSelfAttention and Block classes are now imported from helper_files.gpt_components\n", + "# These are standard PyTorch nn.Module implementations:\n", + "# - CausalSelfAttention: A vanilla multi-head masked self-attention layer\n", + "# - Block: An unassuming Transformer block combining attention and MLP\n", "\n", - " def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop):\n", - " super().__init__()\n", - " self.ln1 = nn.LayerNorm(n_embd)\n", - " self.ln2 = nn.LayerNorm(n_embd)\n", - " self.attn = CausalSelfAttention(n_embd, block_size, n_head, attn_pdrop, resid_pdrop)\n", - " self.mlp = nn.Sequential(\n", - " nn.Linear(n_embd, 4 * n_embd),\n", - " nn.GELU(),\n", - " nn.Linear(4 * n_embd, n_embd),\n", - " nn.Dropout(resid_pdrop),\n", - " )\n", - "\n", - " def forward(self, x):\n", - " x = x + self.attn(self.ln1(x))\n", - " x = x + self.mlp(self.ln2(x))\n", - " return x" - ], - "execution_count": null, - "outputs": [] + "print(\"Basic PyTorch modules imported successfully!\")\n", + "print(f\"CausalSelfAttention: {CausalSelfAttention}\")\n", + "print(f\"Block: {Block}\")" + ] }, { 
"cell_type": "markdown", @@ -980,16 +928,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "0TsfmCYthMux" }, + "outputs": [], "source": [ "import lightning.pytorch as ptl\n", "from nemo.core import ModelPT\n", "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1005,9 +953,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "98x9-Fh-HVwj" }, + "outputs": [], "source": [ "class PTLGPT(ptl.LightningModule):\n", " def __init__(self,\n", @@ -1077,9 +1027,7 @@ " elif isinstance(module, nn.LayerNorm):\n", " module.bias.data.zero_()\n", " module.weight.data.fill_(1.0)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1093,14 +1041,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "rrXIBzg4wutC" }, + "outputs": [], "source": [ "m = PTLGPT(vocab_size=100, block_size=32, n_layer=1, n_embd=32, n_head=4)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1147,48 +1095,26 @@ "source": [ "### Refactoring the Embedding module\n", "\n", - "Let's first refactor out the embedding module from the above implementation" + "Let's first refactor out the embedding module from the above implementation. The `GPTEmbedding` class is now imported from our helper module." ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "uYwMyjqK05RL" }, + "outputs": [], "source": [ - "class GPTEmbedding(NeuralModule):\n", - " def __init__(self, vocab_size: int, n_embd: int, block_size: int, embd_pdrop: float = 0.0):\n", - " super().__init__()\n", - "\n", - " # input embedding stem: drop(content + position)\n", - " self.tok_emb = nn.Embedding(vocab_size, n_embd)\n", - " self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))\n", - " self.drop = nn.Dropout(embd_pdrop)\n", - "\n", - " @typecheck()\n", - " def forward(self, idx):\n", - " b, t = idx.size()\n", - " \n", - " # forward the GPT model\n", - " token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector\n", - " position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector\n", - " x = self.drop(token_embeddings + position_embeddings)\n", - " return x\n", - "\n", - " @property\n", - " def input_types(self):\n", - " return {\n", - " 'idx': NeuralType(('B', 'T'), Index())\n", - " }\n", + "# GPTEmbedding NeuralModule is now imported from helper_files.gpt_components\n", + "# It implements token and positional embeddings with dropout\n", + "print(f\"GPTEmbedding imported: {GPTEmbedding}\")\n", "\n", - " @property\n", - " def output_types(self):\n", - " return {\n", - " 'embeddings': NeuralType(('B', 'T', 'C'), EmbeddedTextType())\n", - " }" - ], - "execution_count": null, - "outputs": [] + "# Example instantiation (with dummy parameters for demonstration)\n", + "dummy_embedding = GPTEmbedding(vocab_size=100, n_embd=32, block_size=128)\n", + "print(f\"Input types: {dummy_embedding.input_types}\")\n", + "print(f\"Output types: {dummy_embedding.output_types}\")" + ] }, { "cell_type": "markdown", @@ -1198,7 +1124,7 @@ "source": [ "### Refactoring the Encoder\n", "\n", - "Next, let's refactor the GPT Encoder - which is implemented as a multi layer Transformer (Decoder) network.\n", + "Next, let's refactor the GPT Encoder - which is implemented as a multi layer Transformer (Decoder) network. 
The `GPTTransformerEncoder` class is now imported from our helper module.\n", "\n", "------\n", "It can be noted that we refer to the GPT \"Encoder\" module - but it is constructed by using Transformer \"Decoder\" blocks.\n", @@ -1217,35 +1143,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "1QeQnQ_G2PwH" }, + "outputs": [], "source": [ - "class GPTTransformerEncoder(NeuralModule):\n", - " def __init__(self, n_embd: int, block_size: int, n_head: int, n_layer: int, attn_pdrop: float = 0.0, resid_pdrop: float = 0.0):\n", - " super().__init__()\n", - "\n", - " self.blocks = nn.Sequential(*[Block(n_embd, block_size, n_head, attn_pdrop, resid_pdrop) \n", - " for _ in range(n_layer)])\n", - " \n", - " @typecheck()\n", - " def forward(self, embed):\n", - " return self.blocks(embed)\n", - "\n", - " @property\n", - " def input_types(self):\n", - " return {\n", - " 'embed': NeuralType(('B', 'T', 'C'), EmbeddedTextType())\n", - " }\n", + "# GPTTransformerEncoder NeuralModule is now imported from helper_files.gpt_components\n", + "# It implements a sequence of transformer blocks for encoding\n", + "print(f\"GPTTransformerEncoder imported: {GPTTransformerEncoder}\")\n", "\n", - " @property\n", - " def output_types(self):\n", - " return {\n", - " 'encoding': NeuralType(('B', 'T', 'C'), CausalSelfAttentionType())\n", - " }" - ], - "execution_count": null, - "outputs": [] + "# Example instantiation (with dummy parameters for demonstration)\n", + "dummy_encoder = GPTTransformerEncoder(n_embd=32, block_size=128, n_head=4, n_layer=1)\n", + "print(f\"Input types: {dummy_encoder.input_types}\")\n", + "print(f\"Output types: {dummy_encoder.output_types}\")" + ] }, { "cell_type": "markdown", @@ -1255,7 +1167,7 @@ "source": [ "### Refactoring the Decoder\n", "\n", - "Finally, let's refactor the Decoder - the small one-layer feed-forward network to decode the answer.\n", + "Finally, let's refactor the Decoder - the small one-layer feed-forward network to decode the answer. 
The `GPTDecoder` class is now imported from our helper module.\n", "\n", "-------\n", "\n", @@ -1268,36 +1180,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "VCPUu0EWQIBX" }, + "outputs": [], "source": [ - "class GPTDecoder(NeuralModule):\n", - " def __init__(self, n_embd: int, vocab_size: int):\n", - " super().__init__()\n", - " self.ln_f = nn.LayerNorm(n_embd)\n", - " self.head = nn.Linear(n_embd, vocab_size, bias=False) # no need for extra bias due to one in ln_f\n", + "# GPTDecoder NeuralModule is now imported from helper_files.gpt_components\n", + "# It implements layer normalization followed by a linear layer to produce logits\n", + "print(f\"GPTDecoder imported: {GPTDecoder}\")\n", "\n", - " @typecheck()\n", - " def forward(self, encoding):\n", - " x = self.ln_f(encoding)\n", - " logits = self.head(x)\n", - " return logits\n", - "\n", - " @property\n", - " def input_types(self):\n", - " return {\n", - " 'encoding': NeuralType(('B', 'T', 'C'), EncodedRepresentation())\n", - " }\n", - " \n", - " @property\n", - " def output_types(self):\n", - " return {\n", - " 'logits': NeuralType(('B', 'T', 'C'), LogitsType())\n", - " }\n" - ], - "execution_count": null, - "outputs": [] + "# Example instantiation (with dummy parameters for demonstration)\n", + "dummy_decoder = GPTDecoder(n_embd=32, vocab_size=100)\n", + "print(f\"Input types: {dummy_decoder.input_types}\") \n", + "print(f\"Output types: {dummy_decoder.output_types}\")\n" + ] }, { "cell_type": "markdown", @@ -1314,9 +1211,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZQlmtYU6iDwi" }, + "outputs": [], "source": [ "class AbstractNeMoGPT(ModelPT):\n", " def __init__(self, cfg: OmegaConf, trainer: ptl.Trainer = None):\n", @@ -1375,9 +1274,7 @@ " return {\n", " 'logits': NeuralType(('B', 'T', 'C'), LogitsType())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1396,9 +1293,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "uygo0BEYjKuj" }, + "outputs": [], "source": [ "# model definition args (required)\n", "# ================================\n", @@ -1413,9 +1312,7 @@ "# embd_pdrop: float = 0.1, # \\in [0,1]: amount of dropout on input embeddings\n", "# resid_pdrop: float = 0.1, # \\in [0,1]: amount of dropout in each residual connection\n", "# attn_pdrop: float = 0.1, # \\in [0,1]: amount of dropout on the attention matrix" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1431,27 +1328,27 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "XqLSZq7Soo2j" }, + "outputs": [], "source": [ "from omegaconf import MISSING" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "JTH-1vu8TO7o" }, + "outputs": [], "source": [ "# Let's create a utility for building the class path\n", "def get_class_path(cls):\n", " return f'{cls.__module__}.{cls.__name__}'" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1466,9 +1363,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZCvLdOlMVLy_" }, + "outputs": [], "source": [ "common_config = OmegaConf.create({\n", " 'vocab_size': MISSING,\n", @@ -1477,9 +1376,7 @@ " 'n_embd': MISSING,\n", " 'n_head': MISSING,\n", "})" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1510,9 +1407,11 @@ }, { "cell_type": "code", + "execution_count": null, 
"metadata": { "id": "ntsxQKH0pDac" }, + "outputs": [], "source": [ "embedding_config = OmegaConf.create({\n", " '_target_': get_class_path(GPTEmbedding),\n", @@ -1538,9 +1437,7 @@ " 'n_embd': '${model.n_embd}',\n", " 'vocab_size': '${model.vocab_size}'\n", "})" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1591,9 +1488,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "c8hvNeB_aDgi" }, + "outputs": [], "source": [ "model_config = OmegaConf.create({\n", " 'model': common_config\n", @@ -1603,9 +1502,7 @@ "model_config.model.embedding = embedding_config\n", "model_config.model.encoder = encoder_config\n", "model_config.model.decoder = decoder_config" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1619,14 +1516,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2SyKNgp9pG0N" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(model_config))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1642,20 +1539,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "0X4C76JyOAnN" }, + "outputs": [], "source": [ "import copy" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ugxA0TPtbHVZ" }, + "outputs": [], "source": [ "temp_config = copy.deepcopy(model_config)\n", "temp_config.model.vocab_size = 10\n", @@ -1666,9 +1565,7 @@ "\n", "temp_config = OmegaConf.create(OmegaConf.to_container(temp_config, resolve=True))\n", "print(OmegaConf.to_yaml(temp_config))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1682,21 +1579,23 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "IIIVi2IfpsJ4" }, + "outputs": [], "source": [ "# Let's work on a copy of the model config and update it before we send it into the Model.\n", "cfg = copy.deepcopy(model_config)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OllBhswPqQXq" }, + "outputs": [], "source": [ "# Let's set the values of the config (for some plausible small model)\n", "cfg.model.vocab_size = 100\n", @@ -1704,32 +1603,30 @@ "cfg.model.n_layer = 1\n", "cfg.model.n_embd = 32\n", "cfg.model.n_head = 4" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QJm2LnTqqcIM" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(cfg))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "E7tpB8BcqeBO" }, + "outputs": [], "source": [ "# Try to create a model with this config [ERROR CELL]\n", "m = AbstractNeMoGPT(cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1759,20 +1656,22 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Vcwi1lO7t7Sm" }, + "outputs": [], "source": [ "from nemo.core.classes.common import PretrainedModelInfo" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ckCxyVLYqrz0" }, + "outputs": [], "source": [ "class BasicNeMoGPT(AbstractNeMoGPT):\n", "\n", @@ -1788,9 +1687,7 @@ " \n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = None" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", 
@@ -1804,14 +1701,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "G8iYQSC5vptU" }, + "outputs": [], "source": [ "m = BasicNeMoGPT(cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1836,9 +1733,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QU3oQAVovxRg" }, + "outputs": [], "source": [ "class BasicNeMoGPTWithSteps(BasicNeMoGPT):\n", "\n", @@ -1868,20 +1767,18 @@ " def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0):\n", " test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()\n", " return {'test_loss': test_loss_mean}" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "2Ki3kRxag511" }, + "outputs": [], "source": [ "m = BasicNeMoGPTWithSteps(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1932,9 +1829,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "FgXkZQiVjnOv" }, + "outputs": [], "source": [ "class BasicNeMoGPTWithOptim(BasicNeMoGPTWithSteps):\n", "\n", @@ -1983,20 +1882,18 @@ " ]\n", " optimizer = torch.optim.AdamW(optim_groups, lr=self.cfg.optim.lr, betas=self.cfg.optim.betas)\n", " return optimizer\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "kARDwthakEQk" }, + "outputs": [], "source": [ "m = BasicNeMoGPTWithOptim(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2010,9 +1907,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "5K7zh9Cn2s2u" }, + "outputs": [], "source": [ "OmegaConf.set_struct(cfg.model, False)\n", "\n", @@ -2025,9 +1924,7 @@ "cfg.model.optim = optim_config\n", "\n", "OmegaConf.set_struct(cfg.model, True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2066,22 +1963,24 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "E-fswFkig9t4" }, + "outputs": [], "source": [ "from nemo.core import Dataset\n", "from torch.utils import data\n", "from torch.utils.data.dataloader import DataLoader" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "-Z8XuPeClGNm" }, + "outputs": [], "source": [ "class TinyShakespeareDataset(Dataset):\n", "\n", @@ -2136,9 +2035,7 @@ " 'input': NeuralType(('B', 'T'), Index()),\n", " 'target': NeuralType(('B', 'T'), LabelsType())\n", " }" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2168,50 +2065,50 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "VwsdXtVzo--t" }, + "outputs": [], "source": [ "import os" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QvKcDCvIl9-A" }, + "outputs": [], "source": [ "if not os.path.exists('tiny-shakespeare.txt'):\n", " !wget https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ynCwqDu6vK8P" }, + "outputs": [], "source": [ "!head -n 5 tiny-shakespeare.txt" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "bfRL4t9_oS4C" }, + "outputs": [], "source": [ 
"train_dataset = TinyShakespeareDataset('tiny-shakespeare.txt', cfg.model.block_size, crop=(0, int(1e6)))\n", "val_dataset = TinyShakespeareDataset('tiny-shakespeare.txt', cfg.model.block_size, crop=(int(1e6), int(50e3)), override_vocab=train_dataset.vocab)\n", "test_dataset = TinyShakespeareDataset('tiny-shakespeare.txt', cfg.model.block_size, crop=(int(1.05e6), int(100e3)), override_vocab=train_dataset.vocab)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2230,9 +2127,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SVSfIk_-rMSg" }, + "outputs": [], "source": [ "class NeMoGPT(BasicNeMoGPTWithOptim):\n", "\n", @@ -2270,9 +2169,7 @@ " \n", " def setup_test_data(self, test_data_config: OmegaConf):\n", " self._test_dl = self._setup_data_loader(test_data_config)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2287,9 +2184,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "C6zcTqJixOOL" }, + "outputs": [], "source": [ "OmegaConf.set_struct(cfg.model, False)\n", "\n", @@ -2298,15 +2197,15 @@ "cfg.model.vocab_size = train_dataset.vocab_size\n", "\n", "OmegaConf.set_struct(cfg.model, True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "zlvThf7BysyT" }, + "outputs": [], "source": [ "train_ds = OmegaConf.create({\n", " 'data_path': '${model.data_path}',\n", @@ -2331,15 +2230,15 @@ " 'batch_size': 4,\n", " 'shuffle': False,\n", "})" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QVVzR6WKyMT5" }, + "outputs": [], "source": [ "# Attach to the model config\n", "OmegaConf.set_struct(cfg.model, False)\n", @@ -2349,33 +2248,31 @@ "cfg.model.test_ds = test_ds\n", "\n", "OmegaConf.set_struct(cfg.model, True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "nd_9_mxS0ET-" }, + "outputs": [], "source": [ "# Let's see the config now !\n", "print(OmegaConf.to_yaml(cfg))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "dlwSQENU0JxA" }, + "outputs": [], "source": [ "# Let's try creating a model now !\n", "model = NeMoGPT(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2410,9 +2307,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "johk6Z0e0WEm" }, + "outputs": [], "source": [ "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", @@ -2420,20 +2319,18 @@ " accelerator = 'cpu'\n", "\n", "trainer = ptl.Trainer(devices=1, accelerator=accelerator, limit_test_batches=1.0)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "oqeeofEr1S8e" }, + "outputs": [], "source": [ "trainer.test(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2450,48 +2347,48 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "DksG_-7G1Vbe" }, + "outputs": [], "source": [ "model.save_to('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "JhjoFdCnBWVh" }, + "outputs": [], "source": [ "!ls -d -- *.nemo" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + 
"execution_count": null, "metadata": { "id": "567txSF0BYXN" }, + "outputs": [], "source": [ "temp_model = NeMoGPT.restore_from('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "YvnfG0kxBfTt" }, + "outputs": [], "source": [ "# [ERROR CELL]\n", "temp_model.setup_test_data(temp_model.cfg.test_ds)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2510,9 +2407,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_Atyoc4NBjEV" }, + "outputs": [], "source": [ "class NeMoGPTv2(NeMoGPT):\n", " \n", @@ -2552,61 +2451,61 @@ " self.vocab = vocab\n", "\n", " self._test_dl = self._setup_data_loader(test_data_config)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "mn09jsRZDusN" }, + "outputs": [], "source": [ "# Let's try creating a model now !\n", "model = NeMoGPTv2(cfg=cfg.model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "sQPIPySDD1K0" }, + "outputs": [], "source": [ "# Now let's try to save and restore !\n", "model.save_to('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "0YwCJ4xaJ3bU" }, + "outputs": [], "source": [ "temp_model = NeMoGPTv2.restore_from('gpt_model.nemo')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "tcxwDIIWKKCQ" }, + "outputs": [], "source": [ "temp_model.setup_multiple_test_data(temp_model.cfg.test_ds)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "j3Olm6ZTKRbO" }, + "outputs": [], "source": [ "if torch.cuda.is_available():\n", " accelerator = 'gpu'\n", @@ -2614,20 +2513,18 @@ " accelerator = 'cpu'\n", "\n", "trainer = ptl.Trainer(devices=1, accelerator=accelerator, limit_test_batches =1.0)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "_QE2SngCKV2p" }, + "outputs": [], "source": [ "trainer.test(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -2641,14 +2538,26 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ZjCV5u3_OO7a" }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "01_NeMo_Models.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/helper_files/__init__.py b/tutorials/helper_files/__init__.py new file mode 100644 index 000000000000..1e1142df0443 --- /dev/null +++ b/tutorials/helper_files/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper files for NeMo tutorials.""" diff --git a/tutorials/helper_files/gpt_components.py b/tutorials/helper_files/gpt_components.py new file mode 100644 index 000000000000..234ff45c1bec --- /dev/null +++ b/tutorials/helper_files/gpt_components.py @@ -0,0 +1,187 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +GPT components for the NeMo Models tutorial. +This module contains the neural network components used in the tutorial 01_NeMo_Models.ipynb +""" + +import math +from typing import Optional + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from nemo.core import NeuralModule, typecheck +from nemo.core.neural_types import EmbeddedTextType, EncodedRepresentation, Index, LogitsType, NeuralType +from nemo.core.neural_types.elements import * + + +# Custom Element Types +class AttentionType(EncodedRepresentation): + """Basic Attention Element Type""" + + +class SelfAttentionType(AttentionType): + """Self Attention Element Type""" + + +class CausalSelfAttentionType(SelfAttentionType): + """Causal Self Attention Element Type""" + + +# Neural Network Modules (not NeMo neural modules) +class CausalSelfAttention(nn.Module): + """ + A vanilla multi-head masked self-attention layer with a projection at the end. + It is possible to use torch.nn.MultiheadAttention here but I am including an + explicit implementation here to show that there is nothing too scary here. 
+ """ + + def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop): + super().__init__() + assert n_embd % n_head == 0 + self.n_head = n_head + # key, query, value projections for all heads + self.key = nn.Linear(n_embd, n_embd) + self.query = nn.Linear(n_embd, n_embd) + self.value = nn.Linear(n_embd, n_embd) + # regularization + self.attn_drop = nn.Dropout(attn_pdrop) + self.resid_drop = nn.Dropout(resid_pdrop) + # output projection + self.proj = nn.Linear(n_embd, n_embd) + # causal mask to ensure that attention is only applied to the left in the input sequence + self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)) + + def forward(self, x, layer_past=None): + B, T, C = x.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) + + # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.attn_drop(att) + y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs) + y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side + + # output projection + y = self.resid_drop(self.proj(y)) + return y + + +class Block(nn.Module): + """an unassuming Transformer block""" + + def __init__(self, n_embd, block_size, n_head, attn_pdrop, resid_pdrop): + super().__init__() + self.ln1 = nn.LayerNorm(n_embd) + self.ln2 = nn.LayerNorm(n_embd) + self.attn = CausalSelfAttention(n_embd, block_size, n_head, attn_pdrop, resid_pdrop) + self.mlp = nn.Sequential( + nn.Linear(n_embd, 4 * n_embd), + nn.GELU(), + nn.Linear(4 * n_embd, n_embd), + nn.Dropout(resid_pdrop), + ) + + def forward(self, x): + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + +# NeMo Neural Modules +class GPTEmbedding(NeuralModule): + def __init__(self, vocab_size: int, n_embd: int, block_size: int, embd_pdrop: float = 0.0): + super().__init__() + + # input embedding stem: drop(content + position) + self.tok_emb = nn.Embedding(vocab_size, n_embd) + self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd)) + self.drop = nn.Dropout(embd_pdrop) + + @typecheck() + def forward(self, idx): + b, t = idx.size() + + # forward the GPT model + token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector + position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector + x = self.drop(token_embeddings + position_embeddings) + return x + + @property + def input_types(self): + return {'idx': NeuralType(('B', 'T'), Index())} + + @property + def output_types(self): + return {'embeddings': NeuralType(('B', 'T', 'C'), EmbeddedTextType())} + + +class GPTTransformerEncoder(NeuralModule): + def __init__( + self, + n_embd: int, + block_size: int, + n_head: int, + n_layer: int, + attn_pdrop: float = 0.0, + resid_pdrop: float = 0.0, + ): + super().__init__() + + self.blocks = nn.Sequential( + *[Block(n_embd, block_size, n_head, attn_pdrop, resid_pdrop) for _ in range(n_layer)] + ) + + @typecheck() + def forward(self, embed): + return self.blocks(embed) 
+ + @property + def input_types(self): + return {'embed': NeuralType(('B', 'T', 'C'), EmbeddedTextType())} + + @property + def output_types(self): + return {'encoding': NeuralType(('B', 'T', 'C'), CausalSelfAttentionType())} + + +class GPTDecoder(NeuralModule): + def __init__(self, n_embd: int, vocab_size: int): + super().__init__() + self.ln_f = nn.LayerNorm(n_embd) + self.head = nn.Linear(n_embd, vocab_size, bias=False) # no need for extra bias due to one in ln_f + + @typecheck() + def forward(self, encoding): + x = self.ln_f(encoding) + logits = self.head(x) + return logits + + @property + def input_types(self): + return {'encoding': NeuralType(('B', 'T', 'C'), EncodedRepresentation())} + + @property + def output_types(self): + return {'logits': NeuralType(('B', 'T', 'C'), LogitsType())}
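

# --- Minimal usage sketch (illustrative only, not exercised by the notebook) ---
# A hedged smoke test showing how the three NeuralModules above are meant to
# compose in the tutorial: GPTEmbedding -> GPTTransformerEncoder -> GPTDecoder.
# Calls use keyword arguments, as required by @typecheck(), with argument names
# taken from each module's declared input_types. The hyperparameter values are
# arbitrary and chosen only to keep the check small.
if __name__ == "__main__":
    vocab_size, block_size, n_embd, n_head, n_layer = 100, 32, 32, 4, 1

    embedding = GPTEmbedding(vocab_size=vocab_size, n_embd=n_embd, block_size=block_size)
    encoder = GPTTransformerEncoder(n_embd=n_embd, block_size=block_size, n_head=n_head, n_layer=n_layer)
    decoder = GPTDecoder(n_embd=n_embd, vocab_size=vocab_size)

    idx = torch.randint(high=vocab_size, size=(2, block_size))  # (B, T) token indices
    embeddings = embedding(idx=idx)        # (B, T, C), typed as EmbeddedTextType
    encoding = encoder(embed=embeddings)   # (B, T, C), typed as CausalSelfAttentionType
    logits = decoder(encoding=encoding)    # (B, T, vocab_size), typed as LogitsType

    print("logits shape:", logits.shape)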