xyssz1234
diff --git a/‎10. TopicModels/LDA_tutorial.ipynb
Lines changed: 229 additions & 0 deletions b/‎10. TopicModels/LDA_tutorial.ipynb
Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,229 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![](attachment:image.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![](attachment:image.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Gibbs sampler for LDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# words\n",
+    "import numpy as np\n",
+    "W = np.array([0, 1, 2, 3, 4])\n",
+    "\n",
+    "# D := document words\n",
+    "X = np.array([\n",
+    "    [0, 0, 1, 2, 2],\n",
+    "    [0, 0, 1, 1, 1],\n",
+    "    [0, 1, 2, 2, 2],\n",
+    "    [4, 4, 4, 4, 4],\n",
+    "    [3, 3, 4, 4, 4],\n",
+    "    [3, 4, 4, 4, 4]\n",
+    "])\n",
+    "\n",
+    "N_D = X.shape[0]  # num of docs\n",
+    "N_V = W.shape[0]  # num of words\n",
+    "N_K = 2  # num of topics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "# Dirichlet priors\n",
+    "alpha = 1\n",
+    "gamma = 1\n",
+    "\n",
+    "# Z := word topic assignment\n",
+    "Z = np.zeros(shape=[N_D, N_V])\n",
+    "\n",
+    "for i in range(N_D):\n",
+    "    for l in range(N_V):\n",
+    "        Z[i, l] = np.random.randint(N_K)  # randomly assign word's topic\n",
+    "\n",
+    "# Pi := document topic distribution\n",
+    "theta = np.zeros([N_D, N_K])\n",
+    "\n",
+    "for i in range(N_D):\n",
+    "    theta[i] = np.random.dirichlet(alpha*np.ones(N_K))\n",
+    "\n",
+    "# phi := word topic distribution\n",
+    "phi = np.zeros([N_K, N_V])\n",
+    "\n",
+    "for k in range(N_K):\n",
+    "    phi[k] = np.random.dirichlet(gamma*np.ones(N_V))\n",
+    "\n",
+    "for it in range(1000):\n",
+    "    # Sample from full conditional of Z\n",
+    "    # ---------------------------------\n",
+    "    for i in range(N_D):\n",
+    "        for v in range(N_V):\n",
+    "            # Calculate params for Z\n",
+    "            p_iv = np.exp(np.log(theta[i]) + np.log(phi[:, X[i, v]]))\n",
+    "            p_iv /= np.sum(p_iv)\n",
+    "\n",
+    "            # Resample word topic assignment Z\n",
+    "            Z[i, v] = np.random.multinomial(1, p_iv).argmax()\n",
+    "\n",
+    "    # Sample from full conditional of \\theta\n",
+    "    # ----------------------------------\n",
+    "    for i in range(N_D):\n",
+    "        m = np.zeros(N_K)\n",
+    "\n",
+    "        # Gather sufficient statistics\n",
+    "        for k in range(N_K):\n",
+    "            m[k] = np.sum(Z[i] == k)\n",
+    "\n",
+    "        # Resample doc topic dist.\n",
+    "        theta[i, :] = np.random.dirichlet(alpha + m)\n",
+    "\n",
+    "    # Sample from full conditional of \\phi\n",
+    "    # ---------------------------------\n",
+    "    for k in range(N_K):\n",
+    "        n = np.zeros(N_V)\n",
+    "\n",
+    "        # Gather sufficient statistics\n",
+    "        for v in range(N_V):\n",
+    "            for i in range(N_D):\n",
+    "                for l in range(N_V):\n",
+    "                    n[v] += (X[i, l] == v) and (Z[i, l] == k)\n",
+    "\n",
+    "        # Resample word topic dist.\n",
+    "        phi[k, :] = np.random.dirichlet(gamma + n)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[[0.33941922 0.66058078]\n",
+      " [0.12011984 0.87988016]\n",
+      " [0.02318953 0.97681047]\n",
+      " [0.80301107 0.19698893]\n",
+      " [0.94780545 0.05219455]\n",
+      " [0.84677554 0.15322446]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print (theta)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Mini-homework - Collapsed Gibbs Sampler for LDA\n",
+    "自己重新推导一下Collapsed Gibbs Sampler，并实现一下LDA类并利用上面给定的YELP数据集来测试一下，并跟sklearn的结果比较一下。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "IndentationError",
+     "evalue": "expected an indented block (<ipython-input-7-fe62b0541667>, line 7)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-7-fe62b0541667>\"\u001b[0;36m, line \u001b[0;32m7\u001b[0m\n\u001b[0;31m    def fit(self, X, y=None):\u001b[0m\n\u001b[0m      ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n"
+     ]
+    }
+   ],
+   "source": [
+    "class LDA:\n",
+    "    \"\"\"Latent Dirichlet allocation using collapsed Gibbs sampling\"\"\"\n",
+    "    \n",
+    "    def __init__():\n",
+    "        \n",
+    "\n",
+    "    def fit(self, X, y=None):\n",
+    "        \"\"\"Fit the model with X.\"\"\"\n",
+    "        \n",
+    "\n",
+    "    def fit_transform(self, X, y=None):\n",
+    "        \n",
+    "\n",
+    "    def transform(self, X, max_iter=20, tol=1e-16):\n",
+    "        \n",
+    "       \n",
+    "    def loglikelihood(self):\n",
+    "        \"\"\"Calculate complete log likelihood, log p(w,z)\n",
+    "        Formula used is log p(w,z) = log p(w|z) + log p(z)\n",
+    "        \"\"\"\n",
+    "    def perplexity(self):\n",
+    "        \"\"\"Calculate the perplexity\"\"\"\n",
+    "        "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}