diff --git a/docs/core_docs/.gitignore b/docs/core_docs/.gitignore
index 924a6920834f..11ed6749ec06 100644
--- a/docs/core_docs/.gitignore
+++ b/docs/core_docs/.gitignore
@@ -53,10 +53,6 @@ docs/use_cases/question_answering/citations.md
docs/use_cases/question_answering/citations.mdx
docs/use_cases/question_answering/chat_history.md
docs/use_cases/question_answering/chat_history.mdx
-docs/use_cases/query_analysis/quickstart.md
-docs/use_cases/query_analysis/quickstart.mdx
-docs/use_cases/query_analysis/index.md
-docs/use_cases/query_analysis/index.mdx
docs/use_cases/graph/semantic.md
docs/use_cases/graph/semantic.mdx
docs/use_cases/graph/quickstart.md
@@ -75,6 +71,18 @@ docs/use_cases/extraction/index.md
docs/use_cases/extraction/index.mdx
docs/use_cases/extraction/guidelines.md
docs/use_cases/extraction/guidelines.mdx
+docs/use_cases/query_analysis/quickstart.md
+docs/use_cases/query_analysis/quickstart.mdx
+docs/use_cases/query_analysis/index.md
+docs/use_cases/query_analysis/index.mdx
+docs/use_cases/extraction/how_to/parse.md
+docs/use_cases/extraction/how_to/parse.mdx
+docs/use_cases/extraction/how_to/handle_long_text.md
+docs/use_cases/extraction/how_to/handle_long_text.mdx
+docs/use_cases/extraction/how_to/handle_files.md
+docs/use_cases/extraction/how_to/handle_files.mdx
+docs/use_cases/extraction/how_to/examples.md
+docs/use_cases/extraction/how_to/examples.mdx
docs/use_cases/query_analysis/how_to/no_queries.md
docs/use_cases/query_analysis/how_to/no_queries.mdx
docs/use_cases/query_analysis/how_to/multiple_retrievers.md
@@ -99,14 +107,6 @@ docs/use_cases/query_analysis/techniques/expansion.md
docs/use_cases/query_analysis/techniques/expansion.mdx
docs/use_cases/query_analysis/techniques/decomposition.md
docs/use_cases/query_analysis/techniques/decomposition.mdx
-docs/use_cases/extraction/how_to/parse.md
-docs/use_cases/extraction/how_to/parse.mdx
-docs/use_cases/extraction/how_to/handle_long_text.md
-docs/use_cases/extraction/how_to/handle_long_text.mdx
-docs/use_cases/extraction/how_to/handle_files.md
-docs/use_cases/extraction/how_to/handle_files.mdx
-docs/use_cases/extraction/how_to/examples.md
-docs/use_cases/extraction/how_to/examples.mdx
docs/modules/model_io/output_parsers/custom.md
docs/modules/model_io/output_parsers/custom.mdx
docs/modules/model_io/chat/function_calling.md
diff --git a/docs/core_docs/docs/use_cases/media.mdx b/docs/core_docs/docs/use_cases/media.mdx
new file mode 100644
index 000000000000..261c23d3cbf8
--- /dev/null
+++ b/docs/core_docs/docs/use_cases/media.mdx
@@ -0,0 +1,56 @@
+# Audio/Video Structured Extraction
+
+Google's Gemini API supports audio and video input, along with function calling.
+By pairing these two features, we can extract structured data from audio or video files.
+
+In the following examples, we'll demonstrate how to read and send MP3 and MP4 files to the Gemini API, and receive structured output as a response.
+
+## Setup
+
+These examples use the Gemini API, so you'll need a Google Vertex AI credentials file (or the stringified contents of one, if running in a web environment):
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS="credentials.json"
+```
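+
+If you're in a web environment, you can instead supply the stringified contents of the credentials file through an environment variable. As a sketch, the `@langchain/google-vertexai-web` package reads the `GOOGLE_VERTEX_AI_WEB_CREDENTIALS` variable (check that package's docs for your exact setup):
+
+```bash
+GOOGLE_VERTEX_AI_WEB_CREDENTIALS={"type":"service_account","project_id":"YOUR_PROJECT_ID",...}
+```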
+
+Next, install the `@langchain/google-vertexai` and `@langchain/core` packages:
+
+import IntegrationInstallTooltip from "@mdx_components/integration_install_tooltip.mdx";
+
+<IntegrationInstallTooltip></IntegrationInstallTooltip>
+
+```bash npm2yarn
+npm install @langchain/google-vertexai @langchain/core
+```
+
+## Video
+
+This example uses a [LangChain YouTube video on datasets and testing in LangSmith](https://www.youtube.com/watch?v=N9hjO-Uy1Vo), sped up to 1.5x.
+The video is converted to `base64` and sent to Gemini with a prompt asking for a structured list of tasks I can do to improve my knowledge of datasets and testing in LangSmith.
+
+We define a tool schema for this using Zod and bind it to the model via the `withStructuredOutput` method.
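+
+At a high level, the message we send pairs a `media` content block (the base64 data plus its MIME type) with a `text` block holding the prompt. Here's a minimal sketch of that shape, using a hypothetical local `video.mp4` (the full runnable example follows):
+
+```typescript
+import fs from "fs";
+import { HumanMessage } from "@langchain/core/messages";
+
+// Read and base64-encode the file (here a hypothetical local "video.mp4").
+const videoInBase64 = fs.readFileSync("video.mp4", "base64");
+
+// A "media" block carries the base64-encoded bytes and their MIME type;
+// the "text" block carries the actual prompt.
+const message = new HumanMessage({
+  content: [
+    { type: "media", mimeType: "video/mp4", data: videoInBase64 },
+    { type: "text", text: "Describe what happens in this video." },
+  ],
+});
+```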
+
+import CodeBlock from "@theme/CodeBlock";
+
+import VideoExample from "@examples/use_cases/media/video.ts";
+
+<CodeBlock language="typescript">{VideoExample}</CodeBlock>
+
+## Audio
+
+The next example loads an MP3 audio file of Mozart's Requiem in D minor and prompts Gemini to return a single array of strings, each naming an instrument heard in the piece.
+
+Here, we'll again use the `withStructuredOutput` method so the model's response comes back already parsed into our schema.
+
+import AudioExample from "@examples/use_cases/media/audio.ts";
+
+<CodeBlock language="typescript">{AudioExample}</CodeBlock>
+
+From a quick Google search, we see the piece is scored for the following instruments:
+
+```txt
+The Requiem is scored for 2 basset horns in F, 2 bassoons, 2 trumpets in D, 3 trombones (alto, tenor, and bass),
+timpani (2 drums), violins, viola, and basso continuo (cello, double bass, and organ).
+```
+
+Gemini did pretty well here! Even though music isn't its primary focus, it correctly identified seven of the instruments in the scoring (the full string section plus bassoon, trumpet, and timpani), with only a few extras, like flute and oboe, that don't appear in it.
diff --git a/examples/src/use_cases/media/audio.ts b/examples/src/use_cases/media/audio.ts
new file mode 100644
index 000000000000..cca82b428e29
--- /dev/null
+++ b/examples/src/use_cases/media/audio.ts
@@ -0,0 +1,67 @@
+import {
+ ChatPromptTemplate,
+ MessagesPlaceholder,
+} from "@langchain/core/prompts";
+import { ChatVertexAI } from "@langchain/google-vertexai";
+import { HumanMessage } from "@langchain/core/messages";
+import fs from "fs";
+import { z } from "zod";
+
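+// Read a local file and return its contents as a base64-encoded string.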
+function fileToBase64(filePath: string): string {
+ return fs.readFileSync(filePath, "base64");
+}
+
+const mozartMp3File = "Mozart_Requiem_D_minor.mp3";
+const mozartInBase64 = fileToBase64(mozartMp3File);
+
+const tool = z.object({
+ instruments: z
+ .array(z.string())
+ .describe("A list of instruments found in the audio."),
+});
+
+const model = new ChatVertexAI({
+ model: "gemini-1.5-pro-preview-0409",
+ temperature: 0,
+}).withStructuredOutput(tool, {
+ name: "instruments_list_tool",
+});
+
+const prompt = ChatPromptTemplate.fromMessages([
+ new MessagesPlaceholder("audio"),
+]);
+
+const chain = prompt.pipe(model);
+const response = await chain.invoke({
+ audio: new HumanMessage({
+ content: [
+ {
+ type: "media",
+ mimeType: "audio/mp3",
+ data: mozartInBase64,
+ },
+ {
+ type: "text",
+ text: `The following audio is a song by Mozart. Respond with a list of instruments you hear in the song.
+
+Rules:
+Use the "instruments_list_tool" to return a list of instruments.`,
+ },
+ ],
+ }),
+});
+
+console.log("response", response);
+/*
+response {
+ instruments: [
+ 'violin', 'viola',
+ 'cello', 'double bass',
+ 'flute', 'oboe',
+ 'clarinet', 'bassoon',
+ 'horn', 'trumpet',
+ 'timpani'
+ ]
+}
+*/
diff --git a/examples/src/use_cases/media/video.ts b/examples/src/use_cases/media/video.ts
new file mode 100644
index 000000000000..275cb5e8a2d1
--- /dev/null
+++ b/examples/src/use_cases/media/video.ts
@@ -0,0 +1,64 @@
+import {
+ ChatPromptTemplate,
+ MessagesPlaceholder,
+} from "@langchain/core/prompts";
+import { ChatVertexAI } from "@langchain/google-vertexai";
+import { HumanMessage } from "@langchain/core/messages";
+import fs from "fs";
+import { z } from "zod";
+
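+// Read a local file and return its contents as a base64-encoded string.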
+function fileToBase64(filePath: string): string {
+ return fs.readFileSync(filePath, "base64");
+}
+
+const lanceLsEvalsVideo = "lance_ls_eval_video.mp4";
+const lanceInBase64 = fileToBase64(lanceLsEvalsVideo);
+
+const tool = z.object({
+ tasks: z.array(z.string()).describe("A list of tasks."),
+});
+
+const model = new ChatVertexAI({
+ model: "gemini-1.5-pro-preview-0409",
+ temperature: 0,
+}).withStructuredOutput(tool, {
+ name: "tasks_list_tool",
+});
+
+const prompt = ChatPromptTemplate.fromMessages([
+ new MessagesPlaceholder("video"),
+]);
+
+const chain = prompt.pipe(model);
+const response = await chain.invoke({
+ video: new HumanMessage({
+ content: [
+ {
+ type: "media",
+ mimeType: "video/mp4",
+ data: lanceInBase64,
+ },
+ {
+ type: "text",
+ text: `The following video is an overview of how to build datasets in LangSmith.
+Given the following video, come up with three tasks I should do to further improve my knowledge around using datasets in LangSmith.
+Only reference features that were outlined or described in the video.
+
+Rules:
+Use the "tasks_list_tool" to return a list of tasks.
+Your tasks should be tailored for an engineer who is looking to improve their knowledge around using datasets and evaluations, specifically with LangSmith.`,
+ },
+ ],
+ }),
+});
+
+console.log("response", response);
+/*
+response {
+ tasks: [
+ 'Explore the LangSmith SDK documentation for in-depth understanding of dataset creation, manipulation, and versioning functionalities.',
+ 'Experiment with different dataset types like Key-Value, Chat, and LLM to understand their structures and use cases.',
+ 'Try uploading a CSV file containing question-answer pairs to LangSmith and create a new dataset from it.'
+ ]
+}
+*/
diff --git a/langchain-core/src/prompts/chat.ts b/langchain-core/src/prompts/chat.ts
index 45defd2ce07d..948d88eb12ed 100644
--- a/langchain-core/src/prompts/chat.ts
+++ b/langchain-core/src/prompts/chat.ts
@@ -706,7 +706,13 @@ function _coerceMessagePromptTemplateLike(
const message = coerceMessageLikeToMessage(messagePromptTemplateLike);
let templateData:
| string
- | (string | _TextTemplateParam | _ImageTemplateParam)[];
+ | (
+ | string
+ | _TextTemplateParam
+ | _ImageTemplateParam
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      | Record<string, any>
+ )[];
if (typeof message.content === "string") {
templateData = message.content;
@@ -718,7 +724,7 @@ function _coerceMessagePromptTemplateLike(
} else if ("image_url" in item) {
return { image_url: item.image_url };
} else {
- throw new Error("Invalid message content");
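+        // Pass through any other content blocks (e.g. "media") untouched so
+        // downstream model integrations can handle them.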
+ return item;
}
});
}
diff --git a/libs/langchain-google-common/src/utils/common.ts b/libs/langchain-google-common/src/utils/common.ts
index 512535987234..f242de83215c 100644
--- a/libs/langchain-google-common/src/utils/common.ts
+++ b/libs/langchain-google-common/src/utils/common.ts
@@ -24,6 +24,7 @@ export function copyAIModelParamsInto(
const model = options?.model ?? params?.model ?? target.model;
ret.modelName =
model ?? options?.modelName ?? params?.modelName ?? target.modelName;
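+  // Also populate the newer `model` field so `model` and `modelName` stay in sync.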
+ ret.model = model;
ret.temperature =
options?.temperature ?? params?.temperature ?? target.temperature;
ret.maxOutputTokens =
diff --git a/libs/langchain-google-common/src/utils/gemini.ts b/libs/langchain-google-common/src/utils/gemini.ts
index e15bdde098d1..8e504408a832 100644
--- a/libs/langchain-google-common/src/utils/gemini.ts
+++ b/libs/langchain-google-common/src/utils/gemini.ts
@@ -35,6 +35,18 @@ import type {
} from "../types.js";
import { GoogleAISafetyError } from "./safety.js";
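+// Parse a data URL (e.g. "data:audio/mp3;base64,<data>") into its MIME type
+// and base64 payload, or return null if the string is not a data URL.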
+const extractMimeType = (
+ str: string
+): { mimeType: string; data: string } | null => {
+ if (str.startsWith("data:")) {
+ return {
+ mimeType: str.split(":")[1].split(";")[0],
+ data: str.split(",")[1],
+ };
+ }
+ return null;
+};
+
function messageContentText(
content: MessageContentText
): GeminiPartText | null {
@@ -54,17 +66,14 @@ function messageContentImageUrl(
typeof content.image_url === "string"
? content.image_url
: content.image_url.url;
-
if (!url) {
throw new Error("Missing Image URL");
}
- if (url.startsWith("data:")) {
+  const mimeTypeAndData = extractMimeType(url);
+  if (mimeTypeAndData) {
return {
- inlineData: {
- mimeType: url.split(":")[1].split(";")[0],
- data: url.split(",")[1],
- },
+      inlineData: mimeTypeAndData,
};
} else {
// FIXME - need some way to get mime type
@@ -77,6 +86,29 @@ function messageContentImageUrl(
}
}
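+// Convert a "media" message content block into a Gemini inlineData part
+// (inline base64 data) or fileData part (a file URI reference).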
+function messageContentMedia(
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  content: Record<string, any>
+): GeminiPartInlineData | GeminiPartFileData {
+ if ("mimeType" in content && "data" in content) {
+ return {
+ inlineData: {
+ mimeType: content.mimeType,
+ data: content.data,
+ },
+ };
+ } else if ("mimeType" in content && "fileUri" in content) {
+ return {
+ fileData: {
+ mimeType: content.mimeType,
+ fileUri: content.fileUri,
+ },
+ };
+ }
+
+ throw new Error("Invalid media content");
+}
+
export function messageContentToParts(content: MessageContent): GeminiPart[] {
// Convert a string to a text type MessageContent if needed
const messageContent: MessageContent =
@@ -104,6 +136,8 @@ export function messageContentToParts(content: MessageContent): GeminiPart[] {
return messageContentImageUrl(content as MessageContentImageUrl);
}
break;
+ case "media":
+ return messageContentMedia(content);
default:
throw new Error(
`Unsupported type received while converting message to message parts`
diff --git a/libs/langchain-google-gauth/package.json b/libs/langchain-google-gauth/package.json
index cef43ab3f6a9..4fa126edb474 100644
--- a/libs/langchain-google-gauth/package.json
+++ b/libs/langchain-google-gauth/package.json
@@ -65,7 +65,8 @@
"release-it": "^15.10.1",
"rollup": "^4.5.2",
"ts-jest": "^29.1.0",
- "typescript": "<5.2.0"
+ "typescript": "<5.2.0",
+ "zod": "^3.22.4"
},
"publishConfig": {
"access": "public"
diff --git a/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts b/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts
new file mode 100644
index 000000000000..8cc04b4c61c2
--- /dev/null
+++ b/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts
@@ -0,0 +1,178 @@
+import fs from "fs";
+import {
+ ChatPromptTemplate,
+ MessagesPlaceholder,
+} from "@langchain/core/prompts";
+import { HumanMessage } from "@langchain/core/messages";
+import { z } from "zod";
+import { ChatGoogle } from "../chat_models.js";
+
+function fileToBase64(filePath: string): string {
+  return fs.readFileSync(filePath, "base64");
+}
+
+test.skip("Gemini can understand audio", async () => {
+ const model = new ChatGoogle({
+ model: "gemini-1.5-pro-preview-0409",
+ temperature: 0,
+ });
+
+ const audioPath = "../../examples/Mozart_Requiem_D_minor.mp3";
+ const audioMimeType = "audio/mp3";
+ const audioBase64 = fileToBase64(audioPath);
+
+ const prompt = ChatPromptTemplate.fromMessages([
+ new MessagesPlaceholder("audio"),
+ ]);
+
+ const chain = prompt.pipe(model);
+ const response = await chain.invoke({
+ audio: new HumanMessage({
+ content: [
+ {
+ type: "media",
+ mimeType: audioMimeType,
+ data: audioBase64,
+ },
+ {
+ type: "text",
+ text: "Do you know this song? If so, who is the composer and can you give me a brief overview of the tone/tempo?",
+ },
+ ],
+ }),
+ });
+
+ expect(typeof response.content).toBe("string");
+ expect((response.content as string).length).toBeGreaterThan(15);
+});
+
+test.skip("Gemini can understand video", async () => {
+ const model = new ChatGoogle({
+ model: "gemini-1.5-pro-preview-0409",
+ temperature: 0,
+ });
+
+ const videoPath = "../../examples/lance_ls_eval_video.mp4";
+ const videoMimeType = "video/mp4";
+ const videoBase64 = fileToBase64(videoPath);
+
+ const prompt = ChatPromptTemplate.fromMessages([
+ new MessagesPlaceholder("video"),
+ ]);
+
+ const chain = prompt.pipe(model);
+ const response = await chain.invoke({
+ video: new HumanMessage({
+ content: [
+ {
+ type: "media",
+ mimeType: videoMimeType,
+ data: videoBase64,
+ },
+ {
+ type: "text",
+ text: "Summarize the video in a few sentences.",
+ },
+ ],
+ }),
+ });
+
+ expect(typeof response.content).toBe("string");
+ expect((response.content as string).length).toBeGreaterThan(15);
+});
+
+test.skip("Gemini can use tools with audio", async () => {
+ const audioPath = "../../examples/Mozart_Requiem_D_minor.mp3";
+ const audioMimeType = "audio/mp3";
+ const audioBase64 = fileToBase64(audioPath);
+
+ const tool = z.object({
+ instruments: z
+ .array(z.string())
+ .describe("A list of instruments found in the audio."),
+ });
+
+ const model = new ChatGoogle({
+ model: "gemini-1.5-pro-preview-0409",
+ temperature: 0,
+ }).withStructuredOutput(tool, {
+ name: "instruments_list_tool",
+ });
+
+ const prompt = ChatPromptTemplate.fromMessages([
+ new MessagesPlaceholder("audio"),
+ ]);
+
+ const chain = prompt.pipe(model);
+ const response = await chain.invoke({
+ audio: new HumanMessage({
+ content: [
+ {
+ type: "media",
+ mimeType: audioMimeType,
+ data: audioBase64,
+ },
+ {
+ type: "text",
+ text: `The following audio is a song by Mozart. Respond with a list of instruments you hear in the song.
+
+ Rules:
+      Use the "instruments_list_tool" to return a list of instruments.`,
+ },
+ ],
+ }),
+ });
+
+ expect(response.instruments).toBeTruthy();
+ expect(response.instruments.length).toBeGreaterThan(0);
+});
+
+test.skip("Gemini can use tools with video", async () => {
+ const videoPath = "../../examples/lance_ls_eval_video.mp4";
+ const videoMimeType = "video/mp4";
+ const videoBase64 = fileToBase64(videoPath);
+
+ const tool = z.object({
+ tasks: z.array(z.string()).describe("A list of tasks."),
+ });
+
+ const model = new ChatGoogle({
+ model: "gemini-1.5-pro-preview-0409",
+ temperature: 0,
+ }).withStructuredOutput(tool, {
+ name: "tasks_list_tool",
+ });
+
+ const prompt = ChatPromptTemplate.fromMessages([
+ new MessagesPlaceholder("video"),
+ ]);
+
+ const chain = prompt.pipe(model);
+ const response = await chain.invoke({
+ video: new HumanMessage({
+ content: [
+ {
+ type: "media",
+ mimeType: videoMimeType,
+ data: videoBase64,
+ },
+ {
+ type: "text",
+ text: `The following video is an overview of how to build datasets in LangSmith.
+ Given the following video, come up with three tasks I should do to further improve my knowledge around using datasets in LangSmith.
+ Only reference features that were outlined or described in the video.
+
+ Rules:
+ Use the "tasks_list_tool" to return a list of tasks.
+ Your tasks should be tailored for an engineer who is looking to improve their knowledge around using datasets and evaluations, specifically with LangSmith.`,
+ },
+ ],
+ }),
+ });
+
+ expect(response.tasks).toBeTruthy();
+ expect(response.tasks.length).toBeGreaterThanOrEqual(3);
+});
diff --git a/yarn.lock b/yarn.lock
index 819d35b18d47..ca2c1b40c297 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9486,6 +9486,7 @@ __metadata:
rollup: ^4.5.2
ts-jest: ^29.1.0
typescript: <5.2.0
+ zod: ^3.22.4
languageName: unknown
linkType: soft