community[minor]: feat: BaiduQianfan embeddings (#4926)

zandko · jacoblee93 · web-flow · commit a2dfb465a983 · 2024-04-08T17:30:13.000-07:00
* feat: BaiduQianfan embeddings

* docs: Update instructions for configuring BAIDU API and Secret keys as env variables

* refactor: rename BaiduQianFanEmbeddings to BaiduQianfanEmbeddings for naming consistency

* Add entrypoint

---------

Co-authored-by: jacoblee93 &lt;jacoblee93@gmail.com&gt;
diff --git a/docs/core_docs/docs/integrations/text_embedding/baidu_qianfan.mdx b/docs/core_docs/docs/integrations/text_embedding/baidu_qianfan.mdx
@@ -0,0 +1,32 @@
+---
+sidebar_class_name: node-only
+---
+
+# Baidu Qianfan
+
+The `BaiduQianfanEmbeddings` class uses the Baidu Qianfan API to generate embeddings for a given text.
+
+## Setup
+
+Official Website: https://cloud.baidu.com/doc/WENXINWORKSHOP/s/alj562vvu
+
+An API key is required to use this embedding model. You can get one by registering at https://cloud.baidu.com/doc/WENXINWORKSHOP/s/alj562vvu.
+
+Please set the acquired API key as an environment variable named BAIDU_API_KEY, and set your secret key as an environment variable named BAIDU_SECRET_KEY.
+
+Then, you'll need to install the [`@langchain/community`](https://www.npmjs.com/package/@langchain/community) package:
+
+import IntegrationInstallTooltip from "@mdx_components/integration_install_tooltip.mdx";
+
+<IntegrationInstallTooltip></IntegrationInstallTooltip>
+
+```bash npm2yarn
+npm install @langchain/community
+```
+
+## Usage
+
+import CodeBlock from "@theme/CodeBlock";
+import BaiduQianFanExample from "@examples/embeddings/baidu_qianfan.ts";
+
+<CodeBlock language="typescript">{BaiduQianFanExample}</CodeBlock>
diff --git a/examples/src/embeddings/baidu_qianfan.ts b/examples/src/embeddings/baidu_qianfan.ts
@@ -0,0 +1,7 @@
+import { BaiduQianfanEmbeddings } from "@langchain/community/embeddings/baidu_qianfan";
+
+const embeddings = new BaiduQianfanEmbeddings();
+const res = await embeddings.embedQuery(
+  "What would be a good company name a company that makes colorful socks?"
+);
+console.log({ res });
diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore
@@ -118,6 +118,10 @@ embeddings/alibaba_tongyi.cjs
 embeddings/alibaba_tongyi.js
 embeddings/alibaba_tongyi.d.ts
 embeddings/alibaba_tongyi.d.cts
+embeddings/baidu_qianfan.cjs
+embeddings/baidu_qianfan.js
+embeddings/baidu_qianfan.d.ts
+embeddings/baidu_qianfan.d.cts
 embeddings/bedrock.cjs
 embeddings/bedrock.js
 embeddings/bedrock.d.ts
diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js
@@ -59,6 +59,7 @@ export const config = {
     "agents/toolkits/connery": "agents/toolkits/connery/index",
     // embeddings
     "embeddings/alibaba_tongyi": "embeddings/alibaba_tongyi",
+    "embeddings/baidu_qianfan": "embeddings/baidu_qianfan",
     "embeddings/bedrock": "embeddings/bedrock",
     "embeddings/cloudflare_workersai": "embeddings/cloudflare_workersai",
     "embeddings/cohere": "embeddings/cohere",
diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json
@@ -822,6 +822,15 @@
       "import": "./embeddings/alibaba_tongyi.js",
       "require": "./embeddings/alibaba_tongyi.cjs"
     },
+    "./embeddings/baidu_qianfan": {
+      "types": {
+        "import": "./embeddings/baidu_qianfan.d.ts",
+        "require": "./embeddings/baidu_qianfan.d.cts",
+        "default": "./embeddings/baidu_qianfan.d.ts"
+      },
+      "import": "./embeddings/baidu_qianfan.js",
+      "require": "./embeddings/baidu_qianfan.cjs"
+    },
     "./embeddings/bedrock": {
       "types": {
         "import": "./embeddings/bedrock.d.ts",
@@ -2359,6 +2368,10 @@
     "embeddings/alibaba_tongyi.js",
     "embeddings/alibaba_tongyi.d.ts",
     "embeddings/alibaba_tongyi.d.cts",
+    "embeddings/baidu_qianfan.cjs",
+    "embeddings/baidu_qianfan.js",
+    "embeddings/baidu_qianfan.d.ts",
+    "embeddings/baidu_qianfan.d.cts",
     "embeddings/bedrock.cjs",
     "embeddings/bedrock.js",
     "embeddings/bedrock.d.ts",
diff --git a/libs/langchain-community/src/embeddings/baidu_qianfan.ts b/libs/langchain-community/src/embeddings/baidu_qianfan.ts
@@ -0,0 +1,238 @@
+import { Embeddings, type EmbeddingsParams } from "@langchain/core/embeddings";
+import { chunkArray } from "@langchain/core/utils/chunk_array";
+import { getEnvironmentVariable } from "@langchain/core/utils/env";
+
+export interface BaiduQianfanEmbeddingsParams extends EmbeddingsParams {
+  /** Model name to use */
+  modelName: "embedding-v1" | "bge_large_zh" | "bge-large-en" | "tao-8k";
+
+  /**
+   * Timeout to use when making requests to BaiduQianfan.
+   */
+  timeout?: number;
+
+  /**
+   * The maximum number of characters allowed for embedding in a single request varies by model:
+   * - Embedding-V1 model: up to 1000 characters
+   * - bge-large-zh model: up to 2000 characters
+   * - bge-large-en model: up to 2000 characters
+   * - tao-8k model: up to 28000 characters
+   *
+   * Note: These limits are model-specific and should be adhered to for optimal performance.
+   */
+  batchSize?: number;
+
+  /**
+   * Whether to strip new lines from the input text.
+   */
+  stripNewLines?: boolean;
+}
+
+interface EmbeddingCreateParams {
+  input: string[];
+}
+
+interface EmbeddingResponse {
+  data: { object: "embedding"; index: number; embedding: number[] }[];
+
+  usage: {
+    prompt_tokens: number;
+    total_tokens: number;
+  };
+
+  id: string;
+}
+
+interface EmbeddingErrorResponse {
+  error_code: number | string;
+  error_msg: string;
+}
+
+export class BaiduQianfanEmbeddings
+  extends Embeddings
+  implements BaiduQianfanEmbeddingsParams
+{
+  modelName: BaiduQianfanEmbeddingsParams["modelName"] = "embedding-v1";
+
+  batchSize = 16;
+
+  stripNewLines = true;
+
+  baiduApiKey: string;
+
+  baiduSecretKey: string;
+
+  accessToken: string;
+
+  constructor(
+    fields?: Partial<BaiduQianfanEmbeddingsParams> & {
+      verbose?: boolean;
+      baiduApiKey?: string;
+      baiduSecretKey?: string;
+    }
+  ) {
+    const fieldsWithDefaults = { maxConcurrency: 2, ...fields };
+    super(fieldsWithDefaults);
+
+    const baiduApiKey =
+      fieldsWithDefaults?.baiduApiKey ??
+      getEnvironmentVariable("BAIDU_API_KEY");
+
+    const baiduSecretKey =
+      fieldsWithDefaults?.baiduSecretKey ??
+      getEnvironmentVariable("BAIDU_SECRET_KEY");
+
+    if (!baiduApiKey) {
+      throw new Error("Baidu API key not found");
+    }
+
+    if (!baiduSecretKey) {
+      throw new Error("Baidu Secret key not found");
+    }
+
+    this.baiduApiKey = baiduApiKey;
+    this.baiduSecretKey = baiduSecretKey;
+
+    this.modelName = fieldsWithDefaults?.modelName ?? this.modelName;
+
+    if (this.modelName === "tao-8k") {
+      if (fieldsWithDefaults?.batchSize && fieldsWithDefaults.batchSize !== 1) {
+        throw new Error(
+          "tao-8k model supports only a batchSize of 1. Please adjust your batchSize accordingly"
+        );
+      }
+      this.batchSize = 1;
+    } else {
+      this.batchSize = fieldsWithDefaults?.batchSize ?? this.batchSize;
+    }
+
+    this.stripNewLines =
+      fieldsWithDefaults?.stripNewLines ?? this.stripNewLines;
+  }
+
+  /**
+   * Method to generate embeddings for an array of documents. Splits the
+   * documents into batches and makes requests to the BaiduQianFan API to generate
+   * embeddings.
+   * @param texts Array of documents to generate embeddings for.
+   * @returns Promise that resolves to a 2D array of embeddings for each document.
+   */
+  async embedDocuments(texts: string[]): Promise<number[][]> {
+    const batches = chunkArray(
+      this.stripNewLines ? texts.map((t) => t.replace(/\n/g, " ")) : texts,
+      this.batchSize
+    );
+
+    const batchRequests = batches.map((batch) => {
+      const params = this.getParams(batch);
+
+      return this.embeddingWithRetry(params);
+    });
+
+    const batchResponses = await Promise.all(batchRequests);
+
+    const embeddings: number[][] = [];
+
+    for (let i = 0; i < batchResponses.length; i += 1) {
+      const batch = batches[i];
+      const batchResponse = batchResponses[i] || [];
+      for (let j = 0; j < batch.length; j += 1) {
+        embeddings.push(batchResponse[j]);
+      }
+    }
+
+    return embeddings;
+  }
+
+  /**
+   * Method to generate an embedding for a single document. Calls the
+   * embeddingWithRetry method with the document as the input.
+   * @param text Document to generate an embedding for.
+   * @returns Promise that resolves to an embedding for the document.
+   */
+  async embedQuery(text: string): Promise<number[]> {
+    const params = this.getParams([
+      this.stripNewLines ? text.replace(/\n/g, " ") : text,
+    ]);
+
+    const embeddings = (await this.embeddingWithRetry(params)) || [[]];
+    return embeddings[0];
+  }
+
+  /**
+   * Method to generate an embedding params.
+   * @param texts Array of documents to generate embeddings for.
+   * @returns an embedding params.
+   */
+  private getParams(
+    texts: EmbeddingCreateParams["input"]
+  ): EmbeddingCreateParams {
+    return {
+      input: texts,
+    };
+  }
+
+  /**
+   * Private method to make a request to the BaiduAI API to generate
+   * embeddings. Handles the retry logic and returns the response from the
+   * API.
+   * @param request Request to send to the BaiduAI API.
+   * @returns Promise that resolves to the response from the API.
+   */
+  private async embeddingWithRetry(body: EmbeddingCreateParams) {
+    if (!this.accessToken) {
+      this.accessToken = await this.getAccessToken();
+    }
+
+    return fetch(
+      `https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings/${this.modelName}?access_token=${this.accessToken}`,
+      {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+        },
+        body: JSON.stringify(body),
+      }
+    ).then(async (response) => {
+      const embeddingData: EmbeddingResponse | EmbeddingErrorResponse =
+        await response.json();
+
+      if ("error_code" in embeddingData && embeddingData.error_code) {
+        throw new Error(
+          `${embeddingData.error_code}: ${embeddingData.error_msg}`
+        );
+      }
+
+      return (embeddingData as EmbeddingResponse).data.map(
+        ({ embedding }) => embedding
+      );
+    });
+  }
+
+  /**
+   * Method that retrieves the access token for making requests to the Baidu
+   * API.
+   * @returns The access token for making requests to the Baidu API.
+   */
+  private async getAccessToken() {
+    const url = `https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=${this.baiduApiKey}&client_secret=${this.baiduSecretKey}`;
+    const response = await fetch(url, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        Accept: "application/json",
+      },
+    });
+    if (!response.ok) {
+      const text = await response.text();
+      const error = new Error(
+        `Baidu get access token failed with status code ${response.status}, response: ${text}`
+      );
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      (error as any).response = response;
+      throw error;
+    }
+    const json = await response.json();
+    return json.access_token;
+  }
+}
diff --git a/libs/langchain-community/src/embeddings/tests/baidu_qianfan.int.test.ts b/libs/langchain-community/src/embeddings/tests/baidu_qianfan.int.test.ts
@@ -0,0 +1,34 @@
+import { test, expect } from "@jest/globals";
+import { BaiduQianfanEmbeddings } from "../baidu_qianfan.js";
+
+test.skip("Test BaiduQianfanEmbeddings.embedQuery", async () => {
+  const embeddings = new BaiduQianfanEmbeddings();
+  const res = await embeddings.embedQuery("Hello world");
+  expect(typeof res[0]).toBe("number");
+});
+
+test.skip("Test BaiduQianfanEmbeddings.embedDocuments", async () => {
+  const embeddings = new BaiduQianfanEmbeddings();
+  const res = await embeddings.embedDocuments(["Hello world", "Bye bye"]);
+  expect(res).toHaveLength(2);
+  expect(typeof res[0][0]).toBe("number");
+  expect(typeof res[1][0]).toBe("number");
+});
+
+test.skip("Test BaiduQianfanEmbeddings concurrency", async () => {
+  const embeddings = new BaiduQianfanEmbeddings({
+    batchSize: 1,
+  });
+  const res = await embeddings.embedDocuments([
+    "Hello world",
+    "Bye bye",
+    "Hello world",
+    "Bye bye",
+    "Hello world",
+    "Bye bye",
+  ]);
+  expect(res).toHaveLength(6);
+  expect(res.find((embedding) => typeof embedding[0] !== "number")).toBe(
+    undefined
+  );
+});