New method aiEmbeddings

nshiab · nshiab · commit 88ccc934f592 · 2025-05-12T16:38:04.000-04:00
diff --git a/deno.json b/deno.json
@@ -21,7 +21,7 @@
   "nodeModulesDir": "auto",
   "imports": {
     "@duckdb/node-api": "npm:@duckdb/node-api@1.2.2-alpha.18",
-    "@nshiab/journalism": "jsr:@nshiab/journalism@^1.28.2",
+    "@nshiab/journalism": "jsr:@nshiab/journalism@1.28.5",
     "@observablehq/plot": "npm:@observablehq/plot@0.6.17",
     "@std/assert": "jsr:@std/assert@1.0.12"
   },
diff --git a/deno.lock b/deno.lock
diff --git a/src/class/SimpleTable.ts b/src/class/SimpleTable.ts
@@ -95,6 +95,7 @@ import stringifyDates from "../helpers/stringifyDates.ts";
 import stringifyDatesInvert from "../helpers/stringifyDatesInvert.ts";
 import aiRowByRow from "../methods/aiRowByRow.ts";
 import aiQuery from "../methods/aiQuery.ts";
+import aiEmbeddings from "../methods/aiEmbeddings.ts";
 
 /**
  * SimpleTable is a class representing a table in a SimpleDB. It can handle tabular and geospatial data. To create one, it's best to instantiate a SimpleDB first.
@@ -627,6 +628,74 @@ export default class SimpleTable extends Simple {
     await aiRowByRow(this, column, newColumn, prompt, options);
   }
 
+  /**
+   * Generates embeddings for a specified column and stores the results in a new column.
+   *
+   * This method currently supports Google Gemini, Vertex AI, and local models running with Ollama. It retrieves credentials and the model from environment variables (`AI_KEY`, `AI_PROJECT`, `AI_LOCATION`, `AI_EMBEDDINGS_MODEL`) or accepts them as options. Options take precedence over environment variables.
+   *
+   * To run local models with Ollama, set the `OLLAMA` environment variable to `true` and start Ollama on your machine. Make sure to install the model you want and set the `AI_EMBEDDINGS_MODEL` environment variable to the model name.
+   *
+   * To avoid exceeding rate limits, you can use the `rateLimitPerMinute` option to automatically add a delay between requests to comply with the rate limit.
+   *
+   * If you have a business or professional account with high rate limits, you can set the `concurrent` option to process multiple requests concurrently and speed up the process.
+   *
+   * The `cache` option allows you to cache the results of each request locally, saving resources and time. The data is cached in the local hidden folder `.journalism-cache` (because this method uses the `getEmbedding` function from the [journalism library](https://github.com/nshiab/journalism)). Don't forget to add `.journalism-cache` to your `.gitignore` file!
+   *
+   * This method won't work if your table contains geometries.
+   *
+   * @example
+   * Basic usage with cache, rate limit, and verbose logging
+   * ```ts
+   * // New table with column "food".
+   * await table.loadArray([
+   *   { food: "pizza" },
+   *   { food: "sushi" },
+   *   { food: "burger" },
+   *   { food: "pasta" },
+   *   { food: "salad" },
+   *   { food: "tacos" }
+   * ]);
+   *
+   * // Ask the AI to generate embeddings in a new column "embeddings".
+   * await table.aiEmbeddings("food", "embeddings", {
+   *   // Cache the results locally
+   *   cache: true,
+   *   // Avoid exceeding a rate limit by waiting between requests
+   *   rateLimitPerMinute: 15,
+   *   // Log details
+   *   verbose: true,
+   * });
+   * ```
+   *
+   * @param column - The column to be used as input for the embeddings.
+   * @param newColumn - The name of the new column where the embeddings will be stored.
+   * @param options - Configuration options for the AI request.
+   *   @param options.concurrent - The number of concurrent requests to send. Defaults to 1.
+   *   @param options.cache - If true, the results will be cached locally. Defaults to false.
+   *   @param options.rateLimitPerMinute - The rate limit for the AI requests in requests per minute. If necessary, the method will wait between requests. Defaults to no limit.
+   *   @param options.model - The model to use. Defaults to the `AI_EMBEDDINGS_MODEL` environment variable.
+   *   @param options.apiKey - The API key. Defaults to the `AI_KEY` environment variable.
+   *   @param options.vertex - Whether to use Vertex AI. Defaults to `false`. If `AI_PROJECT` and `AI_LOCATION` are set in the environment, it will automatically switch to true.
+   *   @param options.project - The Google Cloud project ID. Defaults to the `AI_PROJECT` environment variable.
+   *   @param options.location - The Google Cloud location. Defaults to the `AI_LOCATION` environment variable.
+   *   @param options.ollama - Whether to use Ollama. Defaults to the `OLLAMA` environment variable.
+   *   @param options.verbose - Whether to log additional information. Defaults to `false`.
+   */
+  async aiEmbeddings(column: string, newColumn: string, options: {
+    concurrent?: number;
+    cache?: boolean;
+    model?: string;
+    apiKey?: string;
+    vertex?: boolean;
+    project?: string;
+    location?: string;
+    ollama?: boolean;
+    verbose?: boolean;
+    rateLimitPerMinute?: number;
+  } = {}) {
+    await aiEmbeddings(this, column, newColumn, options);
+  }
+
   /**
    * Generates and executes a SQL query based on a prompt. Additional instructions are automatically added before and after your prompt, such as the column types. To see the full prompt, set the `verbose` option to true.
    *
diff --git a/src/helpers/convertForJS.ts b/src/helpers/convertForJS.ts
@@ -29,6 +29,10 @@ export default function convertForJS(rows: {
         for (const row of rows) {
           row[key] = row[key] === null ? null : "<Geometry>";
         }
+      } else if (types[key].includes("FLOAT[")) {
+        for (const row of rows) {
+          row[key] = row[key] === null ? null : `<${types[key]}>`;
+        }
       }
     }
   }
diff --git a/src/helpers/parseDuckDBType.ts b/src/helpers/parseDuckDBType.ts
@@ -1,8 +1,10 @@
 import {
+  ARRAY,
   BIGINT,
   BOOLEAN,
   DATE,
   DOUBLE,
+  FLOAT,
   INTEGER,
   TIME,
   TIMESTAMP,
@@ -29,6 +31,10 @@ export default function parseDuckDBType(type: string) {
     return TIME;
   } else if (type === "BOOLEAN") {
     return BOOLEAN;
+  } else if (type.includes("FLOAT[")) {
+    // For embeddings
+    const size = type.replace("FLOAT[", "").replace("]", "");
+    return ARRAY(FLOAT, parseInt(size));
   } else {
     throw new Error(`Type ${type} not supported.`);
   }
diff --git a/src/methods/aiEmbeddings.ts b/src/methods/aiEmbeddings.ts
@@ -0,0 +1,83 @@
+import { formatNumber, getEmbedding, sleep } from "@nshiab/journalism";
+import type { SimpleTable } from "../index.ts";
+
+/**
+ * Computes an embedding for the value of `column` in every row and stores it
+ * in `newColumn`, updating the table through `simpleTable.updateWithJS`.
+ *
+ * Rows are processed in consecutive batches of `options.concurrent` rows
+ * (1 by default). The requests of a batch are sent together with
+ * `getEmbedding` and awaited with `Promise.all`; each result is written back
+ * to the row that produced it.
+ *
+ * @param simpleTable - The table to update.
+ * @param column - The column holding the text to embed. Every value must be a string.
+ * @param newColumn - The column that receives the embeddings.
+ * @param options - Passed as-is to `getEmbedding`. `concurrent`, `verbose` and `rateLimitPerMinute` are also used here to drive batching, logging and throttling.
+ * @throws If `options.concurrent` is not a positive integer, or if a value of `column` is not a string.
+ */
+export default async function aiEmbeddings(
+  simpleTable: SimpleTable,
+  column: string,
+  newColumn: string,
+  options: {
+    concurrent?: number;
+    cache?: boolean;
+    model?: string;
+    apiKey?: string;
+    vertex?: boolean;
+    project?: string;
+    location?: string;
+    ollama?: boolean;
+    verbose?: boolean;
+    rateLimitPerMinute?: number;
+  } = {},
+) {
+  const concurrent = options.concurrent ?? 1;
+  // A fractional or non-positive batch size would silently skip rows.
+  if (!Number.isInteger(concurrent) || concurrent < 1) {
+    throw new Error(
+      `The option "concurrent" must be a positive integer. Found ${concurrent} instead.`,
+    );
+  }
+
+  await simpleTable.updateWithJS(async (rows) => {
+    if (options.verbose) {
+      console.log("\naiEmbeddings()");
+    }
+
+    for (
+      let batchStart = 0;
+      batchStart < rows.length;
+      batchStart += concurrent
+    ) {
+      const batchEnd = Math.min(batchStart + concurrent, rows.length);
+
+      // Validate the whole batch before sending anything, so a bad value
+      // does not leave already-started requests dangling.
+      const texts: string[] = [];
+      for (let i = batchStart; i < batchEnd; i++) {
+        if (options.verbose) {
+          console.log(
+            `\nProcessing row ${i + 1} of ${rows.length}... (${
+              formatNumber(
+                (i + 1) / rows.length * 100,
+                {
+                  significantDigits: 3,
+                  suffix: "%",
+                },
+              )
+            })`,
+          );
+        }
+
+        const text = rows[i][column];
+        if (typeof text !== "string") {
+          throw new Error(
+            `The column "${column}" must be a string. Found ${text} instead.`,
+          );
+        }
+        texts.push(text);
+      }
+
+      const requests = texts.map((text) => getEmbedding(text, options));
+
+      const start = new Date();
+      const newValues = await Promise.all(requests);
+      for (let j = 0; j < newValues.length; j++) {
+        // Results come back in request order, so result j belongs to the
+        // j-th row of the batch (not to an offset from the batch's last row).
+        // The cast is needed because the row value type does not model arrays.
+        rows[batchStart + j][newColumn] = newValues[j] as unknown as number;
+      }
+      const end = new Date();
+
+      const duration = end.getTime() - start.getTime();
+      // If the duration is less than 10ms per request, the data most likely
+      // comes from the cache and we don't need to wait. No wait is needed
+      // after the last batch either.
+      if (
+        typeof options.rateLimitPerMinute === "number" &&
+        duration > 10 * requests.length && batchEnd < rows.length
+      ) {
+        const delay = Math.round(
+          (60 / (options.rateLimitPerMinute / concurrent)) * 1000,
+        );
+        await sleep(delay, { start, log: options.verbose });
+      }
+    }
+
+    return rows;
+  });
+}
diff --git a/src/methods/loadArray.ts b/src/methods/loadArray.ts
@@ -1,4 +1,5 @@
 import {
+  arrayValue,
   type DuckDBConnection,
   DuckDBDataChunk,
   DuckDBTimestampValue,
@@ -43,6 +44,13 @@ export default async function loadArray(
             );
           }
         }
+      } else if (Array.isArray(arrayOfObjects[0][key])) {
+        types[i] = `FLOAT[${arrayOfObjects[0][key].length}]`;
+
+        for (let j = 0; j < arrayOfObjects.length; j++) {
+          const d = arrayOfObjects[j][key];
+          dataForChunk[j][i] = arrayValue(d as number[]);
+        }
       } else {
         throw new Error(`Type object not supported.`);
       }
diff --git a/test/unit/methods/aiEmbeddings.test.ts b/test/unit/methods/aiEmbeddings.test.ts

-Original file line number
+Diff line change
   "nodeModulesDir": "auto",
   "imports": {
    "@duckdb/node-api": "npm:@duckdb/node-api@1.2.2-alpha.18",
 -    "@nshiab/journalism": "jsr:@nshiab/journalism@^1.28.2",
 +    "@nshiab/journalism": "jsr:@nshiab/journalism@1.28.5",
    "@observablehq/plot": "npm:@observablehq/plot@0.6.17",
    "@std/assert": "jsr:@std/assert@1.0.12"
   },
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,10 @@ export default function convertForJS(rows: {`
`29`	`29`	`for (const row of rows) {`
`30`	`30`	`row[key] = row[key] === null ? null : "<Geometry>";`
`31`	`31`	`}`
	`32`	`+ } else if (types[key].includes("FLOAT[")) {`
	`33`	`+ for (const row of rows) {`
	`34`	+ row[key] = row[key] === null ? null : `<${types[key]}>`;
	`35`	`+ }`
`32`	`36`	`}`
`33`	`37`	`}`
`34`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`import {`
	`2`	`+ arrayValue,`
`2`	`3`	`type DuckDBConnection,`
`3`	`4`	`DuckDBDataChunk,`
`4`	`5`	`DuckDBTimestampValue,`
`@@ -43,6 +44,13 @@ export default async function loadArray(`
`43`	`44`	`);`
`44`	`45`	`}`
`45`	`46`	`}`
	`47`	`+ } else if (Array.isArray(arrayOfObjects[0][key])) {`
	`48`	+ types[i] = `FLOAT[${arrayOfObjects[0][key].length}]`;
	`49`	`+`
	`50`	`+ for (let j = 0; j < arrayOfObjects.length; j++) {`
	`51`	`+ const d = arrayOfObjects[j][key];`
	`52`	`+ dataForChunk[j][i] = arrayValue(d as number[]);`
	`53`	`+ }`
`46`	`54`	`} else {`
`47`	`55`	throw new Error(`Type object not supported.`);
`48`	`56`	`}`