diff --git a/docs/core_docs/.gitignore b/docs/core_docs/.gitignore
index 924a6920834f..11ed6749ec06 100644
--- a/docs/core_docs/.gitignore
+++ b/docs/core_docs/.gitignore
@@ -53,10 +53,6 @@ docs/use_cases/question_answering/citations.md
 docs/use_cases/question_answering/citations.mdx
 docs/use_cases/question_answering/chat_history.md
 docs/use_cases/question_answering/chat_history.mdx
-docs/use_cases/query_analysis/quickstart.md
-docs/use_cases/query_analysis/quickstart.mdx
-docs/use_cases/query_analysis/index.md
-docs/use_cases/query_analysis/index.mdx
 docs/use_cases/graph/semantic.md
 docs/use_cases/graph/semantic.mdx
 docs/use_cases/graph/quickstart.md
@@ -75,6 +71,18 @@ docs/use_cases/extraction/index.md
 docs/use_cases/extraction/index.mdx
 docs/use_cases/extraction/guidelines.md
 docs/use_cases/extraction/guidelines.mdx
+docs/use_cases/query_analysis/quickstart.md
+docs/use_cases/query_analysis/quickstart.mdx
+docs/use_cases/query_analysis/index.md
+docs/use_cases/query_analysis/index.mdx
+docs/use_cases/extraction/how_to/parse.md
+docs/use_cases/extraction/how_to/parse.mdx
+docs/use_cases/extraction/how_to/handle_long_text.md
+docs/use_cases/extraction/how_to/handle_long_text.mdx
+docs/use_cases/extraction/how_to/handle_files.md
+docs/use_cases/extraction/how_to/handle_files.mdx
+docs/use_cases/extraction/how_to/examples.md
+docs/use_cases/extraction/how_to/examples.mdx
 docs/use_cases/query_analysis/how_to/no_queries.md
 docs/use_cases/query_analysis/how_to/no_queries.mdx
 docs/use_cases/query_analysis/how_to/multiple_retrievers.md
@@ -99,14 +107,6 @@ docs/use_cases/query_analysis/techniques/expansion.md
 docs/use_cases/query_analysis/techniques/expansion.mdx
 docs/use_cases/query_analysis/techniques/decomposition.md
 docs/use_cases/query_analysis/techniques/decomposition.mdx
-docs/use_cases/extraction/how_to/parse.md
-docs/use_cases/extraction/how_to/parse.mdx
-docs/use_cases/extraction/how_to/handle_long_text.md
-docs/use_cases/extraction/how_to/handle_long_text.mdx
-docs/use_cases/extraction/how_to/handle_files.md
-docs/use_cases/extraction/how_to/handle_files.mdx
-docs/use_cases/extraction/how_to/examples.md
-docs/use_cases/extraction/how_to/examples.mdx
 docs/modules/model_io/output_parsers/custom.md
 docs/modules/model_io/output_parsers/custom.mdx
 docs/modules/model_io/chat/function_calling.md
diff --git a/docs/core_docs/docs/use_cases/media.mdx b/docs/core_docs/docs/use_cases/media.mdx
new file mode 100644
index 000000000000..261c23d3cbf8
--- /dev/null
+++ b/docs/core_docs/docs/use_cases/media.mdx
@@ -0,0 +1,56 @@
+# Audio/Video Structured Extraction
+
+Google's Gemini API offers support for audio and video input, along with function calling.
+By pairing these two features, we can extract structured data from audio or video input.
+
+In the following examples, we'll demonstrate how to read MP3 and MP4 files, send them to the Gemini API, and receive structured output in response.
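+
+Both examples below rely on the same message shape: a `HumanMessage` whose content pairs a `media` block (a MIME type plus base64-encoded data) with a `text` prompt. Here's a minimal sketch of that shape (the file name is just a placeholder):
+
+```typescript
+import { HumanMessage } from "@langchain/core/messages";
+import fs from "fs";
+
+// Placeholder file name; substitute any local MP3 or MP4 file.
+const data = fs.readFileSync("some_local_file.mp3", "base64");
+
+const message = new HumanMessage({
+  content: [
+    // The raw media, inlined as base64 alongside its MIME type.
+    { type: "media", mimeType: "audio/mp3", data },
+    // The instruction for the model.
+    { type: "text", text: "Describe what you hear." },
+  ],
+});
+```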
+
+## Setup
+
+These examples use the Gemini API, so you'll need a Google Vertex AI credentials file (or a stringified credentials file if you're in a web environment):
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS="credentials.json"
+```
+
+Next, install the `@langchain/google-vertexai` and `@langchain/core` packages:
+
+import IntegrationInstallTooltip from "@mdx_components/integration_install_tooltip.mdx";
+
+<IntegrationInstallTooltip></IntegrationInstallTooltip>
+
+```bash npm2yarn
+npm install @langchain/google-vertexai @langchain/core
+```
+
+## Video
+
+This example uses a [LangChain YouTube video on datasets and testing in LangSmith](https://www.youtube.com/watch?v=N9hjO-Uy1Vo), sped up to 1.5x speed.
+The video is converted to `base64` and sent to Gemini with a prompt asking for a structured list of tasks I can do to improve my knowledge of datasets and testing in LangSmith.
+
+We create a new tool for this using Zod, and pass it to the model via the `withStructuredOutput` method.
+
+import CodeBlock from "@theme/CodeBlock";
+
+import VideoExample from "@examples/use_cases/media/video.ts";
+
+<CodeBlock language="typescript">{VideoExample}</CodeBlock>
+
+## Audio
+
+The next example loads an audio (MP3) file containing Mozart's Requiem in D Minor, and prompts Gemini to return a single array of strings, with each string being an instrument heard in the piece.
+
+Here, we'll also use the `withStructuredOutput` method to get structured output from the model.
+
+import AudioExample from "@examples/use_cases/media/audio.ts";
+
+<CodeBlock language="typescript">{AudioExample}</CodeBlock>
+
+From a quick Google search, we can see the piece is scored for the following instruments:
+
+```txt
+The Requiem is scored for 2 basset horns in F, 2 bassoons, 2 trumpets in D, 3 trombones (alto, tenor, and bass),
+timpani (2 drums), violins, viola, and basso continuo (cello, double bass, and organ).
+```
+
+Gemini did pretty well here! Even though music isn't its primary focus, it was able to identify a number of the instruments used in the piece, and didn't hallucinate any!
diff --git a/examples/src/use_cases/media/audio.ts b/examples/src/use_cases/media/audio.ts
new file mode 100644
index 000000000000..cca82b428e29
--- /dev/null
+++ b/examples/src/use_cases/media/audio.ts
@@ -0,0 +1,67 @@
+import {
+  ChatPromptTemplate,
+  MessagesPlaceholder,
+} from "@langchain/core/prompts";
+import { ChatVertexAI } from "@langchain/google-vertexai";
+import { HumanMessage } from "@langchain/core/messages";
+import fs from "fs";
+import { z } from "zod";
+
+function fileToBase64(filePath: string): string {
+  return fs.readFileSync(filePath, "base64");
+}
+
+const mozartMp3File = "Mozart_Requiem_D_minor.mp3";
+const mozartInBase64 = fileToBase64(mozartMp3File);
+
+const tool = z.object({
+  instruments: z
+    .array(z.string())
+    .describe("A list of instruments found in the audio."),
+});
+
+const model = new ChatVertexAI({
+  model: "gemini-1.5-pro-preview-0409",
+  temperature: 0,
+}).withStructuredOutput(tool, {
+  name: "instruments_list_tool",
+});
+
+const prompt = ChatPromptTemplate.fromMessages([
+  new MessagesPlaceholder("audio"),
+]);
+
+const chain = prompt.pipe(model);
+const response = await chain.invoke({
+  audio: new HumanMessage({
+    content: [
+      {
+        type: "media",
+        mimeType: "audio/mp3",
+        data: mozartInBase64,
+      },
+      {
+        type: "text",
+        text: `The following audio is a song by Mozart. Respond with a list of instruments you hear in the song.
+
+Rules:
+Use the "instruments_list_tool" to return a list of instruments.`,
+      },
+    ],
+  }),
+});
+
+console.log("response", response);
+/*
+response {
+  instruments: [
+    'violin',   'viola',
+    'cello',    'double bass',
+    'flute',    'oboe',
+    'clarinet', 'bassoon',
+    'horn',     'trumpet',
+    'timpani'
+  ]
+}
+*/
diff --git a/examples/src/use_cases/media/video.ts b/examples/src/use_cases/media/video.ts
new file mode 100644
index 000000000000..275cb5e8a2d1
--- /dev/null
+++ b/examples/src/use_cases/media/video.ts
@@ -0,0 +1,64 @@
+import {
+  ChatPromptTemplate,
+  MessagesPlaceholder,
+} from "@langchain/core/prompts";
+import { ChatVertexAI } from "@langchain/google-vertexai";
+import { HumanMessage } from "@langchain/core/messages";
+import fs from "fs";
+import { z } from "zod";
+
+function fileToBase64(filePath: string): string {
+  return fs.readFileSync(filePath, "base64");
+}
+
+const lanceLsEvalsVideo = "lance_ls_eval_video.mp4";
+const lanceInBase64 = fileToBase64(lanceLsEvalsVideo);
+
+const tool = z.object({
+  tasks: z.array(z.string()).describe("A list of tasks."),
+});
+
+const model = new ChatVertexAI({
+  model: "gemini-1.5-pro-preview-0409",
+  temperature: 0,
+}).withStructuredOutput(tool, {
+  name: "tasks_list_tool",
+});
+
+const prompt = ChatPromptTemplate.fromMessages([
+  new MessagesPlaceholder("video"),
+]);
+
+const chain = prompt.pipe(model);
+const response = await chain.invoke({
+  video: new HumanMessage({
+    content: [
+      {
+        type: "media",
+        mimeType: "video/mp4",
+        data: lanceInBase64,
+      },
+      {
+        type: "text",
+        text: `The following video is an overview of how to build datasets in LangSmith.
+Given the following video, come up with three tasks I should do to further improve my knowledge around using datasets in LangSmith.
+Only reference features that were outlined or described in the video.
+
+Rules:
+Use the "tasks_list_tool" to return a list of tasks.
+Your tasks should be tailored for an engineer who is looking to improve their knowledge around using datasets and evaluations, specifically with LangSmith.`,
+      },
+    ],
+  }),
+});
+
+console.log("response", response);
+/*
+response {
+  tasks: [
+    'Explore the LangSmith SDK documentation for in-depth understanding of dataset creation, manipulation, and versioning functionalities.',
+    'Experiment with different dataset types like Key-Value, Chat, and LLM to understand their structures and use cases.',
+    'Try uploading a CSV file containing question-answer pairs to LangSmith and create a new dataset from it.'
+  ]
+}
+*/
diff --git a/langchain-core/src/prompts/chat.ts b/langchain-core/src/prompts/chat.ts
index 45defd2ce07d..948d88eb12ed 100644
--- a/langchain-core/src/prompts/chat.ts
+++ b/langchain-core/src/prompts/chat.ts
@@ -706,7 +706,13 @@ function _coerceMessagePromptTemplateLike(
   const message = coerceMessageLikeToMessage(messagePromptTemplateLike);
   let templateData:
     | string
-    | (string | _TextTemplateParam | _ImageTemplateParam)[];
+    | (
+        | string
+        | _TextTemplateParam
+        | _ImageTemplateParam
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        | Record<string, any>
+      )[];
 
   if (typeof message.content === "string") {
     templateData = message.content;
@@ -718,7 +724,7 @@ function _coerceMessagePromptTemplateLike(
       } else if ("image_url" in item) {
         return { image_url: item.image_url };
       } else {
-        throw new Error("Invalid message content");
+        return item;
       }
     });
   }
diff --git a/libs/langchain-google-common/src/utils/common.ts b/libs/langchain-google-common/src/utils/common.ts
index 512535987234..f242de83215c 100644
--- a/libs/langchain-google-common/src/utils/common.ts
+++ b/libs/langchain-google-common/src/utils/common.ts
@@ -24,6 +24,7 @@ export function copyAIModelParamsInto(
   const model = options?.model ?? params?.model ?? target.model;
   ret.modelName =
     model ?? options?.modelName ?? params?.modelName ?? target.modelName;
+  ret.model = model;
   ret.temperature =
     options?.temperature ?? params?.temperature ?? target.temperature;
   ret.maxOutputTokens =
diff --git a/libs/langchain-google-common/src/utils/gemini.ts b/libs/langchain-google-common/src/utils/gemini.ts
index e15bdde098d1..8e504408a832 100644
--- a/libs/langchain-google-common/src/utils/gemini.ts
+++ b/libs/langchain-google-common/src/utils/gemini.ts
@@ -35,6 +35,18 @@ import type {
 } from "../types.js";
 import { GoogleAISafetyError } from "./safety.js";
 
+const extractMimeType = (
+  str: string
+): { mimeType: string; data: string } | null => {
+  if (str.startsWith("data:")) {
+    return {
+      mimeType: str.split(":")[1].split(";")[0],
+      data: str.split(",")[1],
+    };
+  }
+  return null;
+};
+
 function messageContentText(
   content: MessageContentText
 ): GeminiPartText | null {
@@ -54,17 +66,14 @@ function messageContentImageUrl(
     typeof content.image_url === "string"
       ? content.image_url
       : content.image_url.url;
-
   if (!url) {
     throw new Error("Missing Image URL");
   }
-
-  if (url.startsWith("data:")) {
+  const mimeTypeAndData = extractMimeType(url);
+  if (mimeTypeAndData) {
     return {
-      inlineData: {
-        mimeType: url.split(":")[1].split(";")[0],
-        data: url.split(",")[1],
-      },
+      inlineData: mimeTypeAndData,
     };
   } else {
     // FIXME - need some way to get mime type
@@ -77,6 +86,29 @@ function messageContentImageUrl(
   }
 }
 
+function messageContentMedia(
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  content: Record<string, any>
+): GeminiPartInlineData | GeminiPartFileData {
+  if ("mimeType" in content && "data" in content) {
+    return {
+      inlineData: {
+        mimeType: content.mimeType,
+        data: content.data,
+      },
+    };
+  } else if ("mimeType" in content && "fileUri" in content) {
+    return {
+      fileData: {
+        mimeType: content.mimeType,
+        fileUri: content.fileUri,
+      },
+    };
+  }
+
+  throw new Error("Invalid media content");
+}
+
 export function messageContentToParts(content: MessageContent): GeminiPart[] {
   // Convert a string to a text type MessageContent if needed
   const messageContent: MessageContent =
@@ -104,6 +136,8 @@ export function messageContentToParts(content: MessageContent): GeminiPart[] {
         return messageContentImageUrl(content as MessageContentImageUrl);
       }
       break;
+    case "media":
+      return messageContentMedia(content);
     default:
       throw new Error(
         `Unsupported type received while converting message to message parts`
diff --git a/libs/langchain-google-gauth/package.json b/libs/langchain-google-gauth/package.json
index cef43ab3f6a9..4fa126edb474 100644
--- a/libs/langchain-google-gauth/package.json
+++ b/libs/langchain-google-gauth/package.json
@@ -65,7 +65,8 @@
     "release-it": "^15.10.1",
     "rollup": "^4.5.2",
     "ts-jest": "^29.1.0",
-    "typescript": "<5.2.0"
+    "typescript": "<5.2.0",
+    "zod": "^3.22.4"
   },
   "publishConfig": {
     "access": "public"
diff --git a/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts b/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts
new file mode 100644
index 000000000000..8cc04b4c61c2
--- /dev/null
+++ b/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts
@@ -0,0 +1,178 @@
+import fs from "fs";
+import {
+  ChatPromptTemplate,
+  MessagesPlaceholder,
+} from "@langchain/core/prompts";
+import { HumanMessage } from "@langchain/core/messages";
+import { z } from "zod";
+import { ChatGoogle } from "../chat_models.js";
+
+function fileToBase64(filePath: string): string {
+  const fileData = fs.readFileSync(filePath);
+  const base64String = Buffer.from(fileData).toString("base64");
+  return base64String;
+}
+
+test.skip("Gemini can understand audio", async () => {
+  const model = new ChatGoogle({
+    model: "gemini-1.5-pro-preview-0409",
+    temperature: 0,
+  });
+
+  const audioPath = "../../examples/Mozart_Requiem_D_minor.mp3";
+  const audioMimeType = "audio/mp3";
+  const audioBase64 = fileToBase64(audioPath);
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    new MessagesPlaceholder("audio"),
+  ]);
+
+  const chain = prompt.pipe(model);
+  const response = await chain.invoke({
+    audio: new HumanMessage({
+      content: [
+        {
+          type: "media",
+          mimeType: audioMimeType,
+          data: audioBase64,
+        },
+        {
+          type: "text",
+          text: "Do you know this song? If so, who is the composer and can you give me a brief overview of the tone/tempo?",
+        },
+      ],
+    }),
+  });
+
+  expect(typeof response.content).toBe("string");
+  expect((response.content as string).length).toBeGreaterThan(15);
+});
+
+test.skip("Gemini can understand video", async () => {
+  const model = new ChatGoogle({
+    model: "gemini-1.5-pro-preview-0409",
+    temperature: 0,
+  });
+
+  const videoPath = "../../examples/lance_ls_eval_video.mp4";
+  const videoMimeType = "video/mp4";
+  const videoBase64 = fileToBase64(videoPath);
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    new MessagesPlaceholder("video"),
+  ]);
+
+  const chain = prompt.pipe(model);
+  const response = await chain.invoke({
+    video: new HumanMessage({
+      content: [
+        {
+          type: "media",
+          mimeType: videoMimeType,
+          data: videoBase64,
+        },
+        {
+          type: "text",
+          text: "Summarize the video in a few sentences.",
+        },
+      ],
+    }),
+  });
+
+  expect(typeof response.content).toBe("string");
+  expect((response.content as string).length).toBeGreaterThan(15);
+});
+
+test.skip("Gemini can use tools with audio", async () => {
+  const audioPath = "../../examples/Mozart_Requiem_D_minor.mp3";
+  const audioMimeType = "audio/mp3";
+  const audioBase64 = fileToBase64(audioPath);
+
+  const tool = z.object({
+    instruments: z
+      .array(z.string())
+      .describe("A list of instruments found in the audio."),
+  });
+
+  const model = new ChatGoogle({
+    model: "gemini-1.5-pro-preview-0409",
+    temperature: 0,
+  }).withStructuredOutput(tool, {
+    name: "instruments_list_tool",
+  });
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    new MessagesPlaceholder("audio"),
+  ]);
+
+  const chain = prompt.pipe(model);
+  const response = await chain.invoke({
+    audio: new HumanMessage({
+      content: [
+        {
+          type: "media",
+          mimeType: audioMimeType,
+          data: audioBase64,
+        },
+        {
+          type: "text",
+          text: `The following audio is a song by Mozart. Respond with a list of instruments you hear in the song.
+
+          Rules:
+          Use the "instruments_list_tool" to return a list of instruments.`,
+        },
+      ],
+    }),
+  });
+
+  expect(response.instruments).toBeTruthy();
+  expect(response.instruments.length).toBeGreaterThan(0);
+});
+
+test.skip("Gemini can use tools with video", async () => {
+  const videoPath = "../../examples/lance_ls_eval_video.mp4";
+  const videoMimeType = "video/mp4";
+  const videoBase64 = fileToBase64(videoPath);
+
+  const tool = z.object({
+    tasks: z.array(z.string()).describe("A list of tasks."),
+  });
+
+  const model = new ChatGoogle({
+    model: "gemini-1.5-pro-preview-0409",
+    temperature: 0,
+  }).withStructuredOutput(tool, {
+    name: "tasks_list_tool",
+  });
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    new MessagesPlaceholder("video"),
+  ]);
+
+  const chain = prompt.pipe(model);
+  const response = await chain.invoke({
+    video: new HumanMessage({
+      content: [
+        {
+          type: "media",
+          mimeType: videoMimeType,
+          data: videoBase64,
+        },
+        {
+          type: "text",
+          text: `The following video is an overview of how to build datasets in LangSmith.
+          Given the following video, come up with three tasks I should do to further improve my knowledge around using datasets in LangSmith.
+          Only reference features that were outlined or described in the video.
+
+          Rules:
+          Use the "tasks_list_tool" to return a list of tasks.
+          Your tasks should be tailored for an engineer who is looking to improve their knowledge around using datasets and evaluations, specifically with LangSmith.`,
+        },
+      ],
+    }),
+  });
+
+  expect(response.tasks).toBeTruthy();
+  expect(response.tasks.length).toBeGreaterThanOrEqual(3);
+});
diff --git a/yarn.lock b/yarn.lock
index 819d35b18d47..ca2c1b40c297 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9486,6 +9486,7 @@ __metadata:
     rollup: ^4.5.2
     ts-jest: ^29.1.0
     typescript: <5.2.0
+    zod: ^3.22.4
   languageName: unknown
   linkType: soft
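All of the examples and tests above inline media as base64 through the `inlineData` branch of the new `messageContentMedia` handler. That handler also accepts a `fileUri` in place of `data`, mapping onto Gemini's `fileData` part, which avoids embedding large payloads in the request body. A hedged sketch of that variant follows; the `gs://` bucket path is a placeholder, and this code path isn't exercised by the examples or tests in this diff:

```typescript
import {
  ChatPromptTemplate,
  MessagesPlaceholder,
} from "@langchain/core/prompts";
import { ChatVertexAI } from "@langchain/google-vertexai";
import { HumanMessage } from "@langchain/core/messages";

const model = new ChatVertexAI({
  model: "gemini-1.5-pro-preview-0409",
  temperature: 0,
});

const prompt = ChatPromptTemplate.fromMessages([
  new MessagesPlaceholder("audio"),
]);

const chain = prompt.pipe(model);
const response = await chain.invoke({
  audio: new HumanMessage({
    content: [
      {
        // Supplying "fileUri" instead of "data" selects the fileData
        // branch added in messageContentMedia; the bucket path below
        // is a placeholder.
        type: "media",
        mimeType: "audio/mp3",
        fileUri: "gs://my-bucket/Mozart_Requiem_D_minor.mp3",
      },
      {
        type: "text",
        text: "Summarize this recording in one sentence.",
      },
    ],
  }),
});

console.log(response.content);
```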