|
366 | 366 | "predicted_embeddings.head() "
|
367 | 367 | ]
|
368 | 368 | },
|
369 |
| - { |
370 |
| - "cell_type": "code", |
371 |
| - "execution_count": null, |
372 |
| - "metadata": { |
373 |
| - "id": "4H_etYfsEOFP" |
374 |
| - }, |
375 |
| - "outputs": [], |
376 |
| - "source": [ |
377 |
| - "# Join the complaints with their embeddings in the same DataFrame\n", |
378 |
| - "combined_df = downsampled_issues_df.join(predicted_embeddings)" |
379 |
| - ] |
380 |
| - }, |
381 | 369 | {
|
382 | 370 | "attachments": {},
|
383 | 371 | "cell_type": "markdown",
|
|
426 | 414 | "outputs": [],
|
427 | 415 | "source": [
|
428 | 416 | "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n",
|
429 |
| - "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", |
430 |
| - "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", |
| 417 | + "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", |
| 418 | + "clustered_result = cluster_model.predict(predicted_embeddings)\n", |
431 | 419 | "# Notice the CENTROID_ID column, which is the ID number of the group that\n",
|
432 | 420 | "# each complaint belongs to.\n",
|
433 | 421 | "clustered_result.head(n=5)"
|
434 | 422 | ]
|
435 | 423 | },
|
436 |
| - { |
437 |
| - "cell_type": "code", |
438 |
| - "execution_count": null, |
439 |
| - "metadata": {}, |
440 |
| - "outputs": [], |
441 |
| - "source": [ |
442 |
| - "# Join the group number to the complaints and their text embeddings\n", |
443 |
| - "combined_clustered_result = combined_df.join(clustered_result)\n", |
444 |
| - "combined_clustered_result.head(n=5) " |
445 |
| - ] |
446 |
| - }, |
447 | 424 | {
|
448 | 425 | "attachments": {},
|
449 | 426 | "cell_type": "markdown",
|
450 | 427 | "metadata": {},
|
451 | 428 | "source": [
|
452 |
| - "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." |
| 429 | + "Our dataframe clustered_result now has three complaint columns: the content, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." |
453 | 430 | ]
|
454 | 431 | },
|
455 | 432 | {
|
|
480 | 457 | "source": [
|
481 | 458 | "# Using bigframes, with syntax identical to pandas,\n",
|
482 | 459 | "# filter out the first and second groups\n",
|
483 |
| - "cluster_1_result = combined_clustered_result[\n", |
484 |
| - " combined_clustered_result[\"CENTROID_ID\"] == 1\n", |
485 |
| - "][[\"consumer_complaint_narrative\"]]\n", |
| 460 | + "cluster_1_result = clustered_result[\n", |
| 461 | + " clustered_result[\"CENTROID_ID\"] == 1\n", |
| 462 | + "][[\"content\"]]\n", |
486 | 463 | "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n",
|
487 | 464 | "\n",
|
488 |
| - "cluster_2_result = combined_clustered_result[\n", |
489 |
| - " combined_clustered_result[\"CENTROID_ID\"] == 2\n", |
490 |
| - "][[\"consumer_complaint_narrative\"]]\n", |
| 465 | + "cluster_2_result = clustered_result[\n", |
| 466 | + " clustered_result[\"CENTROID_ID\"] == 2\n", |
| 467 | + "][[\"content\"]]\n", |
491 | 468 | "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()"
|
492 | 469 | ]
|
493 | 470 | },
|
|
503 | 480 | "prompt1 = 'comment list 1:\\n'\n",
|
504 | 481 | "for i in range(5):\n",
|
505 | 482 | " prompt1 += str(i + 1) + '. ' + \\\n",
|
506 |
| - " cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", |
| 483 | + " cluster_1_result_pandas[\"content\"].iloc[i] + '\\n'\n", |
507 | 484 | "\n",
|
508 | 485 | "prompt2 = 'comment list 2:\\n'\n",
|
509 | 486 | "for i in range(5):\n",
|
510 | 487 | " prompt2 += str(i + 1) + '. ' + \\\n",
|
511 |
| - " cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", |
| 488 | + " cluster_2_result_pandas[\"content\"].iloc[i] + '\\n'\n", |
512 | 489 | "\n",
|
513 | 490 | "print(prompt1)\n",
|
514 |
| - "print(prompt2)\n" |
| 491 | + "print(prompt2)" |
515 | 492 | ]
|
516 | 493 | },
|
517 | 494 | {
|
|
0 commit comments