core[minor],google-common[minor]: Add support for generic objects in prompts, gemini audio/video docs #5043
Changes from 14 commits
@@ -0,0 +1,56 @@

# Audio/Video Structured Extraction

Google's Gemini API supports audio and video input, along with function calling. We can pair these API features to extract structured data from audio or video input.

In the following examples, we'll demonstrate how to read MP3 and MP4 files, send them to the Gemini API, and receive structured output in response.

## Setup

These examples use the Gemini API, so you'll need Google Vertex AI credentials (a credentials file, or a stringified credentials file if you're in a web environment):

```bash
GOOGLE_APPLICATION_CREDENTIALS="credentials.json"
```

Next, install the `@langchain/google-vertexai` and `@langchain/core` packages:

import IntegrationInstallTooltip from "@mdx_components/integration_install_tooltip.mdx";

<IntegrationInstallTooltip></IntegrationInstallTooltip>

```bash npm2yarn
npm install @langchain/google-vertexai @langchain/core
```

## Video

This example uses a [LangChain YouTube video on datasets and testing in LangSmith](https://www.youtube.com/watch?v=N9hjO-Uy1Vo), sped up to 1.5x. The video is converted to `base64` and sent to Gemini with a prompt asking for a structured list of tasks you can do to improve your knowledge of datasets and testing in LangSmith.
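Reading a local media file and base64-encoding it (as the example files in this PR do with `fs.readFileSync`) can be sketched as below; the temp file here is just a stand-in for a real MP3/MP4:

```typescript
import * as fs from "fs";
import * as os from "os";
import * as path from "path";

// Read a local file and return its contents as a base64 string.
// Gemini's inline media parts expect base64-encoded data.
function fileToBase64(filePath: string): string {
  return fs.readFileSync(filePath, "base64");
}

// Quick demonstration with a throwaway file (stand-in for an MP4):
const tmpFile = path.join(os.tmpdir(), "demo-media.bin");
fs.writeFileSync(tmpFile, "hello");
const encoded = fileToBase64(tmpFile);
console.log(encoded); // "aGVsbG8=" (base64 of "hello")
```

Note that inlining large files as base64 grows the request payload by roughly a third, so this approach is best suited to short clips.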
We create a tool for this using Zod and pass it to the model via the `withStructuredOutput` method.

import CodeBlock from "@theme/CodeBlock";

import VideoExample from "@examples/use_cases/media/video.ts";

<CodeBlock language="typescript">{VideoExample}</CodeBlock>

## Audio

The next example loads an audio (MP3) file containing Mozart's Requiem in D minor and prompts Gemini to return an array of strings, each naming an instrument heard in the piece.

Here, we'll again use the `withStructuredOutput` method to get structured output from the model.

import AudioExample from "@examples/use_cases/media/audio.ts";

<CodeBlock language="typescript">{AudioExample}</CodeBlock>

A quick search shows the piece was composed for the following instruments:

```txt
The Requiem is scored for 2 basset horns in F, 2 bassoons, 2 trumpets in D, 3 trombones (alto, tenor, and bass),
timpani (2 drums), violins, viola, and basso continuo (cello, double bass, and organ).
```

Gemini did pretty well here! Even though music isn't its primary focus, it identified several of the instruments used in the piece and didn't hallucinate any!
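One way to make this kind of comparison concrete is a small set-intersection check between the model's predicted instrument list and the reference instrumentation; this sketch uses illustrative subsets of both lists rather than the full output:

```typescript
// Reference instrumentation (from the scoring above) vs. a predicted list.
const reference = new Set([
  "basset horn", "bassoon", "trumpet", "trombone", "timpani",
  "violin", "viola", "cello", "double bass", "organ",
]);
const predicted = ["violin", "viola", "cello", "double bass", "trumpet", "timpani"];

// Instruments the model got right vs. possible hallucinations.
const matched = predicted.filter((p) => reference.has(p));
const unmatched = predicted.filter((p) => !reference.has(p));
console.log(matched.length, unmatched.length); // 6 0
```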
> **Review comment:** Remove

> **Review comment:** Can we remove? It slows down git.
@@ -0,0 +1,67 @@

```typescript
import {
  ChatPromptTemplate,
  MessagesPlaceholder,
} from "@langchain/core/prompts";
import { ChatVertexAI } from "@langchain/google-vertexai";
import { HumanMessage } from "@langchain/core/messages";
import fs from "fs";
import { z } from "zod";

// Read a local file and return its contents as a base64-encoded string.
function fileToBase64(filePath: string): string {
  return fs.readFileSync(filePath, "base64");
}

const mozartMp3File = "Mozart_Requiem_D_minor.mp3";
const mozartInBase64 = fileToBase64(mozartMp3File);

// Zod schema describing the structured output we want back.
const tool = z.object({
  instruments: z
    .array(z.string())
    .describe("A list of instruments found in the audio."),
});

const model = new ChatVertexAI({
  model: "gemini-1.5-pro-preview-0409",
  temperature: 0,
}).withStructuredOutput(tool, {
  name: "instruments_list_tool",
});

const prompt = ChatPromptTemplate.fromMessages([
  new MessagesPlaceholder("audio"),
]);

const chain = prompt.pipe(model);
const response = await chain.invoke({
  audio: new HumanMessage({
    content: [
      {
        type: "media",
        mimeType: "audio/mp3",
        data: mozartInBase64,
      },
      {
        type: "text",
        text: `The following audio is a song by Mozart. Respond with a list of instruments you hear in the song.

Rules:
Use the "instruments_list_tool" to return a list of instruments.`,
      },
    ],
  }),
});

console.log("response", response);
/*
response {
  instruments: [
    'violin',   'viola',
    'cello',    'double bass',
    'flute',    'oboe',
    'clarinet', 'bassoon',
    'horn',     'trumpet',
    'timpani'
  ]
}
*/
```
@@ -0,0 +1,64 @@

```typescript
import {
  ChatPromptTemplate,
  MessagesPlaceholder,
} from "@langchain/core/prompts";
import { ChatVertexAI } from "@langchain/google-vertexai";
import { HumanMessage } from "@langchain/core/messages";
import fs from "fs";
import { z } from "zod";

// Read a local file and return its contents as a base64-encoded string.
function fileToBase64(filePath: string): string {
  return fs.readFileSync(filePath, "base64");
}

const lanceLsEvalsVideo = "lance_ls_eval_video.mp4";
const lanceInBase64 = fileToBase64(lanceLsEvalsVideo);

// Zod schema describing the structured output we want back.
const tool = z.object({
  tasks: z.array(z.string()).describe("A list of tasks."),
});

const model = new ChatVertexAI({
  model: "gemini-1.5-pro-preview-0409",
  temperature: 0,
}).withStructuredOutput(tool, {
  name: "tasks_list_tool",
});

const prompt = ChatPromptTemplate.fromMessages([
  new MessagesPlaceholder("video"),
]);

const chain = prompt.pipe(model);
const response = await chain.invoke({
  video: new HumanMessage({
    content: [
      {
        type: "media",
        mimeType: "video/mp4",
        data: lanceInBase64,
      },
      {
        type: "text",
        text: `The following video is an overview of how to build datasets in LangSmith.
Given the following video, come up with three tasks I should do to further improve my knowledge around using datasets in LangSmith.
Only reference features that were outlined or described in the video.

Rules:
Use the "tasks_list_tool" to return a list of tasks.
Your tasks should be tailored for an engineer who is looking to improve their knowledge around using datasets and evaluations, specifically with LangSmith.`,
      },
    ],
  }),
});

console.log("response", response);
/*
response {
  tasks: [
    'Explore the LangSmith SDK documentation for in-depth understanding of dataset creation, manipulation, and versioning functionalities.',
    'Experiment with different dataset types like Key-Value, Chat, and LLM to understand their structures and use cases.',
    'Try uploading a CSV file containing question-answer pairs to LangSmith and create a new dataset from it.'
  ]
}
*/
```
@@ -32,6 +32,18 @@ import type {

```diff
 } from "../types.js";
 import { GoogleAISafetyError } from "./safety.js";

+const extractMimeType = (
+  str: string
+): { mimeType: string; data: string } | null => {
+  if (str.startsWith("data:")) {
+    return {
+      mimeType: str.split(":")[1].split(";")[0],
+      data: str.split(",")[1],
+    };
+  }
+  return null;
+};
```
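The helper above only handles `data:` URLs of the shape `data:<mime-type>;base64,<payload>`, returning `null` for everything else. A standalone sketch of the same parsing logic, with illustrative inputs:

```typescript
// Parse a data URL like "data:audio/mp3;base64,SGVsbG8=" into its
// MIME type and base64 payload; return null for non-data URLs.
const extractMimeType = (
  str: string
): { mimeType: string; data: string } | null => {
  if (str.startsWith("data:")) {
    return {
      mimeType: str.split(":")[1].split(";")[0],
      data: str.split(",")[1],
    };
  }
  return null;
};

const parsed = extractMimeType("data:audio/mp3;base64,SGVsbG8=");
console.log(parsed); // { mimeType: 'audio/mp3', data: 'SGVsbG8=' }
console.log(extractMimeType("https://example.com/a.mp3")); // null
```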
```diff
 function messageContentText(
   content: MessageContentText
 ): GeminiPartText | null {
```

@@ -51,17 +63,14 @@ function messageContentImageUrl(

```diff
     typeof content.image_url === "string"
       ? content.image_url
       : content.image_url.url;

   if (!url) {
     throw new Error("Missing Image URL");
   }

-  if (url.startsWith("data:")) {
+  const mimeTypeAndData = extractMimeType(url);
+  if (mimeTypeAndData) {
     return {
-      inlineData: {
-        mimeType: url.split(":")[1].split(";")[0],
-        data: url.split(",")[1],
-      },
+      inlineData: mimeTypeAndData,
     };
   } else {
     // FIXME - need some way to get mime type
```

@@ -74,6 +83,29 @@ function messageContentImageUrl(

```diff
   }
 }

+function messageContentMedia(
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  content: Record<string, any>
+): GeminiPartInlineData | GeminiPartFileData {
+  if ("mimeType" in content && "data" in content) {
+    return {
+      inlineData: {
+        mimeType: content.mimeType,
+        data: content.data,
+      },
+    };
+  } else if ("mimeType" in content && "fileUri" in content) {
+    return {
+      fileData: {
+        mimeType: content.mimeType,
+        fileUri: content.fileUri,
+      },
+    };
+  }
+
+  throw new Error("Invalid media content");
+}
```

> **Review comment (pondering out loud):** This would then turn the … (And it also means that if we add more sophisticated file handling later, we only have to change it in one place.)
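The branching in `messageContentMedia` maps a media content block to either an inline-data part (`mimeType` + `data`) or a file-reference part (`mimeType` + `fileUri`). A minimal standalone sketch of that mapping, using simplified local types in place of the real `GeminiPartInlineData`/`GeminiPartFileData` from `../types.js`:

```typescript
// Simplified stand-ins for the Gemini part types used in the PR.
type InlinePart = { inlineData: { mimeType: string; data: string } };
type FilePart = { fileData: { mimeType: string; fileUri: string } };

// Mirror of the PR's branching: inline base64 data first, then file URIs.
function mediaToPart(content: Record<string, any>): InlinePart | FilePart {
  if ("mimeType" in content && "data" in content) {
    return { inlineData: { mimeType: content.mimeType, data: content.data } };
  }
  if ("mimeType" in content && "fileUri" in content) {
    return { fileData: { mimeType: content.mimeType, fileUri: content.fileUri } };
  }
  throw new Error("Invalid media content");
}

const inline = mediaToPart({ mimeType: "audio/mp3", data: "SGVsbG8=" });
const file = mediaToPart({ mimeType: "video/mp4", fileUri: "gs://bucket/video.mp4" });
console.log(inline, file);
```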
```diff
 export function messageContentToParts(content: MessageContent): GeminiPart[] {
   // Convert a string to a text type MessageContent if needed
   const messageContent: MessageContent =
```

@@ -101,6 +133,8 @@ export function messageContentToParts(content: MessageContent): GeminiPart[] {

```diff
       return messageContentImageUrl(content as MessageContentImageUrl);
     }
     break;
+  case "media":
+    return messageContentMedia(content);
   default:
     throw new Error(
       `Unsupported type received while converting message to message parts`
```

> **Review comment:** I wish I had seen https://github.com/langchain-ai/langchainjs/blame/fc2f9de2910a6728cf9c24f9146b55ba48d3790f/langchain-core/src/messages/index.ts#L56C69-L56C69 when it went in! To be honest, I'm a little anxious about defining `MessageContent` types with magic string values rather than real types, even if they're fundamentally `Record` types. It makes it a lot harder for other implementations to use consistent naming.
> **Review comment:** Love these examples! Perhaps add a note that you don't need to use structured output with audio and video, but it always helps to understand what the results can be.