|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# NLP : Building The Model\n", |
| 8 | + "\n", |
| 9 | + "<br>\n", |
| 10 | + "# Candidates from String Edits\n", |
| 11 | + "Create a list of candidate strings by applying an edit operation\n", |
| 12 | + "<br>\n", |
| 13 | + "### Imports and Data" |
| 14 | + ] |
| 15 | + }, |
| 16 | + { |
| 17 | + "cell_type": "code", |
| 18 | + "execution_count": 1, |
| 19 | + "metadata": {}, |
| 20 | + "outputs": [], |
| 21 | + "source": [ |
| 22 | + "# data\n", |
| 23 | + "word = 'dearz' # 🦌" |
| 24 | + ] |
| 25 | + }, |
| 26 | + { |
| 27 | + "cell_type": "markdown", |
| 28 | + "metadata": {}, |
| 29 | + "source": [ |
| 30 | + "### Splits\n", |
| 31 | + "Find all the ways you can split a word into 2 parts !" |
| 32 | + ] |
| 33 | + }, |
| 34 | + { |
| 35 | + "cell_type": "code", |
| 36 | + "execution_count": 2, |
| 37 | + "metadata": {}, |
| 38 | + "outputs": [ |
| 39 | + { |
| 40 | + "name": "stdout", |
| 41 | + "output_type": "stream", |
| 42 | + "text": [ |
| 43 | + "['', 'dearz']\n", |
| 44 | + "['d', 'earz']\n", |
| 45 | + "['de', 'arz']\n", |
| 46 | + "['dea', 'rz']\n", |
| 47 | + "['dear', 'z']\n", |
| 48 | + "['dearz', '']\n" |
| 49 | + ] |
| 50 | + } |
| 51 | + ], |
| 52 | + "source": [ |
| 53 | + "# splits with a loop\n", |
| 54 | + "splits_a = []\n", |
| 55 | + "for i in range(len(word)+1):\n", |
| 56 | + " splits_a.append([word[:i],word[i:]])\n", |
| 57 | + "\n", |
| 58 | + "for i in splits_a:\n", |
| 59 | + " print(i)" |
| 60 | + ] |
| 61 | + }, |
| 62 | + { |
| 63 | + "cell_type": "code", |
| 64 | + "execution_count": 3, |
| 65 | + "metadata": {}, |
| 66 | + "outputs": [ |
| 67 | + { |
| 68 | + "name": "stdout", |
| 69 | + "output_type": "stream", |
| 70 | + "text": [ |
| 71 | + "('', 'dearz')\n", |
| 72 | + "('d', 'earz')\n", |
| 73 | + "('de', 'arz')\n", |
| 74 | + "('dea', 'rz')\n", |
| 75 | + "('dear', 'z')\n", |
| 76 | + "('dearz', '')\n" |
| 77 | + ] |
| 78 | + } |
| 79 | + ], |
| 80 | + "source": [ |
| 81 | + "# same splits, done using a list comprehension\n", |
| 82 | + "splits_b = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n", |
| 83 | + "\n", |
| 84 | + "for i in splits_b:\n", |
| 85 | + " print(i)" |
| 86 | + ] |
| 87 | + }, |
| 88 | + { |
| 89 | + "cell_type": "markdown", |
| 90 | + "metadata": {}, |
| 91 | + "source": [ |
| 92 | + "### Delete Edit\n", |
| 93 | + "Delete a letter from each string in the `splits` list.\n", |
| 94 | + "<br>\n", |
| 95 | + "What this does is effectively delete each possible letter from the original word being edited. " |
| 96 | + ] |
| 97 | + }, |
| 98 | + { |
| 99 | + "cell_type": "code", |
| 100 | + "execution_count": 4, |
| 101 | + "metadata": {}, |
| 102 | + "outputs": [ |
| 103 | + { |
| 104 | + "name": "stdout", |
| 105 | + "output_type": "stream", |
| 106 | + "text": [ |
| 107 | + "word : dearz\n", |
| 108 | + "earz <-- delete d\n", |
| 109 | + "darz <-- delete e\n", |
| 110 | + "derz <-- delete a\n", |
| 111 | + "deaz <-- delete r\n", |
| 112 | + "dear <-- delete z\n" |
| 113 | + ] |
| 114 | + } |
| 115 | + ], |
| 116 | + "source": [ |
| 117 | + "# deletes with a loop\n", |
| 118 | + "splits = splits_a\n", |
| 119 | + "deletes = []\n", |
| 120 | + "\n", |
| 121 | + "print('word : ', word)\n", |
| 122 | + "for L,R in splits:\n", |
| 123 | + " if R:\n", |
| 124 | + " print(L + R[1:], ' <-- delete ', R[0])" |
| 125 | + ] |
| 126 | + }, |
| 127 | + { |
| 128 | + "cell_type": "markdown", |
| 129 | + "metadata": {}, |
| 130 | + "source": [ |
| 131 | + "It's worth taking a closer look at how this is executing a 'delete'.\n", |
| 132 | + "<br>\n", |
| 133 | + "Taking the first item from the `splits` list :" |
| 134 | + ] |
| 135 | + }, |
| 136 | + { |
| 137 | + "cell_type": "code", |
| 138 | + "execution_count": 5, |
| 139 | + "metadata": {}, |
| 140 | + "outputs": [ |
| 141 | + { |
| 142 | + "name": "stdout", |
| 143 | + "output_type": "stream", |
| 144 | + "text": [ |
| 145 | + "word : dearz\n", |
| 146 | + "first item from the splits list : ['', 'dearz']\n", |
| 147 | + "L : \n", |
| 148 | + "R : dearz\n", |
| 149 | + "*** now implicit delete by excluding the leading letter ***\n", |
| 150 | + "L + R[1:] : earz <-- delete d\n" |
| 151 | + ] |
| 152 | + } |
| 153 | + ], |
| 154 | + "source": [ |
| 155 | + "# breaking it down\n", |
| 156 | + "print('word : ', word)\n", |
| 157 | + "one_split = splits[0]\n", |
| 158 | + "print('first item from the splits list : ', one_split)\n", |
| 159 | + "L = one_split[0]\n", |
| 160 | + "R = one_split[1]\n", |
| 161 | + "print('L : ', L)\n", |
| 162 | + "print('R : ', R)\n", |
| 163 | + "print('*** now implicit delete by excluding the leading letter ***')\n", |
| 164 | + "print('L + R[1:] : ',L + R[1:], ' <-- delete ', R[0])" |
| 165 | + ] |
| 166 | + }, |
| 167 | + { |
| 168 | + "cell_type": "markdown", |
| 169 | + "metadata": {}, |
| 170 | + "source": [ |
| 171 | + "So the end result transforms **'dearz'** to **'earz'** by deleting the first character.\n", |
| 172 | + "<br>\n", |
| 173 | + "And you use a **loop** (code block above) or a **list comprehension** (code block below) to do\n", |
| 174 | + "<br>\n", |
| 175 | + "this for the entire `splits` list." |
| 176 | + ] |
| 177 | + }, |
| 178 | + { |
| 179 | + "cell_type": "code", |
| 180 | + "execution_count": 6, |
| 181 | + "metadata": {}, |
| 182 | + "outputs": [ |
| 183 | + { |
| 184 | + "name": "stdout", |
| 185 | + "output_type": "stream", |
| 186 | + "text": [ |
| 187 | + "['earz', 'darz', 'derz', 'deaz', 'dear']\n", |
| 188 | + "*** which is the same as ***\n", |
| 189 | + "earz\n", |
| 190 | + "darz\n", |
| 191 | + "derz\n", |
| 192 | + "deaz\n", |
| 193 | + "dear\n" |
| 194 | + ] |
| 195 | + } |
| 196 | + ], |
| 197 | + "source": [ |
| 198 | + "# deletes with a list comprehension\n", |
| 199 | + "splits = splits_a\n", |
| 200 | + "deletes = [L + R[1:] for L, R in splits if R]\n", |
| 201 | + "\n", |
| 202 | + "print(deletes)\n", |
| 203 | + "print('*** which is the same as ***')\n", |
| 204 | + "for i in deletes:\n", |
| 205 | + " print(i)" |
| 206 | + ] |
| 207 | + }, |
| 208 | + { |
| 209 | + "cell_type": "markdown", |
| 210 | + "metadata": {}, |
| 211 | + "source": [ |
| 212 | + "### Ungraded Exercise\n", |
| 213 | + "You now have a list of ***candidate strings*** created after performing a **delete** edit.\n", |
| 214 | + "<br>\n", |
| 215 | + "Next step will be to filter this list for ***candidate words*** found in a vocabulary.\n", |
| 216 | + "<br>\n", |
| 217 | + "Given the example vocab below, can you think of a way to create a list of candidate words ? \n", |
| 218 | + "<br>\n", |
| 219 | + "Remember, you already have a list of candidate strings, some of which are certainly not actual words you might find in your vocabulary !\n", |
| 220 | + "<br>\n", |
| 221 | + "<br>\n", |
| 222 | + "So from the above list **earz, darz, derz, deaz, dear**. \n", |
| 223 | + "<br>\n", |
| 224 | + "You're really only interested in **dear**." |
| 225 | + ] |
| 226 | + }, |
| 227 | + { |
| 228 | + "cell_type": "code", |
| 229 | + "execution_count": 7, |
| 230 | + "metadata": {}, |
| 231 | + "outputs": [ |
| 232 | + { |
| 233 | + "name": "stdout", |
| 234 | + "output_type": "stream", |
| 235 | + "text": [ |
| 236 | + "vocab : ['dean', 'deer', 'dear', 'fries', 'and', 'coke']\n", |
| 237 | + "edits : ['earz', 'darz', 'derz', 'deaz', 'dear']\n", |
| 238 | + "candidate words : {'dear'}\n" |
| 239 | + ] |
| 240 | + } |
| 241 | + ], |
| 242 | + "source": [ |
| 243 | + "vocab = ['dean','deer','dear','fries','and','coke']\n", |
| 244 | + "edits = list(deletes)  # candidate strings produced by the delete edit\n", |
| 245 | + "\n", |
| 246 | + "print('vocab : ', vocab)\n", |
| 247 | + "print('edits : ', edits)\n", |
| 248 | + "\n", |
| 249 | + "# Keep only the candidate strings that are real words: intersecting the\n", |
| 250 | + "# vocabulary with the edits drops every string not found in `vocab`.\n", |
| 251 | + "\n", |
| 252 | + "### START CODE HERE ###\n", |
| 253 | + "# hint: 'set.intersection'\n", |
| 254 | + "candidates = set(vocab).intersection(edits)\n", |
| 255 | + "### END CODE HERE ###\n", |
| 256 | + "\n", |
| 257 | + "print('candidate words : ', candidates)\n", |
| 258 | + "\n", |
| 259 | + "assert candidates == {'dear'}  # 'dear' is the only delete-edit found in vocab" |
| 260 | + ] |
| 261 | + }, |
| 262 | + { |
| 263 | + "cell_type": "markdown", |
| 264 | + "metadata": {}, |
| 265 | + "source": [ |
| 266 | + "Expected Outcome:\n", |
| 267 | + "\n", |
| 268 | + "vocab : ['dean', 'deer', 'dear', 'fries', 'and', 'coke']\n", |
| 269 | + "<br>\n", |
| 270 | + "edits : ['earz', 'darz', 'derz', 'deaz', 'dear']\n", |
| 271 | + "<br>\n", |
| 272 | + "candidate words : {'dear'}" |
| 273 | + ] |
| 274 | + }, |
| 275 | + { |
| 276 | + "cell_type": "markdown", |
| 277 | + "metadata": {}, |
| 278 | + "source": [ |
| 279 | + "### Summary\n", |
| 280 | + "You've unpacked an integral part of the assignment by breaking down **splits** and **edits**, specifically looking at **deletes** here.\n", |
| 281 | + "<br>\n", |
| 282 | + "Implementation of the other edit types (insert, replace, switch) follows a similar methodology and should now feel somewhat familiar when you see them.\n", |
| 283 | + "<br>\n", |
| 284 | + "This bit of the code isn't as intuitive as other sections, so well done!\n", |
| 285 | + "<br>\n", |
| 286 | + "You should now feel confident facing some of the more technical parts of the assignment at the end of the week." |
| 287 | + ] |
| 288 | + } |
| 289 | + ], |
| 290 | + "metadata": { |
| 291 | + "kernelspec": { |
| 292 | + "display_name": "Python 3", |
| 293 | + "language": "python", |
| 294 | + "name": "python3" |
| 295 | + }, |
| 296 | + "language_info": { |
| 297 | + "codemirror_mode": { |
| 298 | + "name": "ipython", |
| 299 | + "version": 3 |
| 300 | + }, |
| 301 | + "file_extension": ".py", |
| 302 | + "mimetype": "text/x-python", |
| 303 | + "name": "python", |
| 304 | + "nbconvert_exporter": "python", |
| 305 | + "pygments_lexer": "ipython3", |
| 306 | + "version": "3.7.1" |
| 307 | + } |
| 308 | + }, |
| 309 | + "nbformat": 4, |
| 310 | + "nbformat_minor": 2 |
| 311 | +} |
0 commit comments