fix: update the llm+kmeans notebook with recent change (#236)

ashleyxuu · Genesis929 · commit 78b70e2da13a · 2023-12-12T21:53:33.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 313682530 🦕
diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb
@@ -366,18 +366,6 @@
         "predicted_embeddings.head() "
       ]
     },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "4H_etYfsEOFP"
-      },
-      "outputs": [],
-      "source": [
-        "# Join the complaints with their embeddings in the same DataFrame\n",
-        "combined_df = downsampled_issues_df.join(predicted_embeddings)"
-      ]
-    },
     {
       "attachments": {},
       "cell_type": "markdown",
@@ -426,30 +414,19 @@
       "outputs": [],
       "source": [
         "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n",
-        "cluster_model.fit(combined_df[[\"text_embedding\"]])\n",
-        "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n",
+        "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n",
+        "clustered_result = cluster_model.predict(predicted_embeddings)\n",
         "# Notice the CENTROID_ID column, which is the ID number of the group that\n",
         "# each complaint belongs to.\n",
         "clustered_result.head(n=5)"
       ]
     },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Join the group number to the complaints and their text embeddings\n",
-        "combined_clustered_result = combined_df.join(clustered_result)\n",
-        "combined_clustered_result.head(n=5) "
-      ]
-    },
     {
       "attachments": {},
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to."
+        "Our dataframe combined_clustered_result now has three complaint columns: the content, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to."
       ]
     },
     {
@@ -480,14 +457,14 @@
       "source": [
         "# Using bigframes, with syntax identical to pandas,\n",
         "# filter out the first and second groups\n",
-        "cluster_1_result = combined_clustered_result[\n",
-        "    combined_clustered_result[\"CENTROID_ID\"] == 1\n",
-        "][[\"consumer_complaint_narrative\"]]\n",
+        "cluster_1_result = clustered_result[\n",
+        "    clustered_result[\"CENTROID_ID\"] == 1\n",
+        "][[\"content\"]]\n",
         "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n",
         "\n",
-        "cluster_2_result = combined_clustered_result[\n",
-        "    combined_clustered_result[\"CENTROID_ID\"] == 2\n",
-        "][[\"consumer_complaint_narrative\"]]\n",
+        "cluster_2_result = clustered_result[\n",
+        "    clustered_result[\"CENTROID_ID\"] == 2\n",
+        "][[\"content\"]]\n",
         "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()"
       ]
     },
@@ -503,15 +480,15 @@
         "prompt1 = 'comment list 1:\\n'\n",
         "for i in range(5):\n",
         "    prompt1 += str(i + 1) + '. ' + \\\n",
-        "        cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n",
+        "        cluster_1_result_pandas[\"content\"].iloc[i] + '\\n'\n",
         "\n",
         "prompt2 = 'comment list 2:\\n'\n",
         "for i in range(5):\n",
         "    prompt2 += str(i + 1) + '. ' + \\\n",
-        "        cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n",
+        "        cluster_2_result_pandas[\"content\"].iloc[i] + '\\n'\n",
         "\n",
         "print(prompt1)\n",
-        "print(prompt2)\n"
+        "print(prompt2)"
       ]
     },
     {