diff --git a/deno.json b/deno.json
index f419933e4dec..7ce0f98dc68d 100644
--- a/deno.json
+++ b/deno.json
@@ -27,6 +27,7 @@
     "uuid": "npm:/uuid",
     "youtubei.js": "npm:/youtubei.js",
     "youtube-transcript": "npm:/youtube-transcript",
-    "neo4j-driver": "npm:/neo4j-driver"
+    "neo4j-driver": "npm:/neo4j-driver",
+    "axios": "npm:/axios"
   }
 }
\ No newline at end of file
diff --git a/docs/core_docs/docs/concepts.mdx b/docs/core_docs/docs/concepts.mdx
index 387e40d3a1d2..206cb642f857 100644
--- a/docs/core_docs/docs/concepts.mdx
+++ b/docs/core_docs/docs/concepts.mdx
@@ -515,6 +515,35 @@ In order to solve that we built LangGraph to be this flexible, highly-controllable
 
 If you are still using AgentExecutor, do not fear: we still have a guide on [how to use AgentExecutor](/docs/how_to/agent_executor). It is recommended, however, that you start to transition to [LangGraph](https://github.com/langchain-ai/langgraphjs).
 
+### Multimodal
+
+Some models are multimodal, accepting images, audio, and even video as inputs. These are still less common, meaning model providers haven't standardized on the "best" way to define the API.
+Multimodal **outputs** are even less common. As such, we've kept our multimodal abstractions fairly lightweight and plan to further solidify the multimodal APIs and interaction patterns as the field matures.
+
+In LangChain, most chat models that support multimodal inputs also accept those values in OpenAI's content blocks format.
+So far this is restricted to image inputs. For models like Gemini which support video and other bytes input, the APIs also support the native, model-specific representations.
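+
+For example, here is a minimal sketch of the content blocks format with `@langchain/openai` (the model name and image URL below are placeholders):
+
+```typescript
+import { HumanMessage } from "@langchain/core/messages";
+import { ChatOpenAI } from "@langchain/openai";
+
+const model = new ChatOpenAI({ model: "gpt-4o" });
+
+// One message mixing a text block with an OpenAI-style image_url block;
+// both hosted URLs and base64 data URIs are accepted as the url value.
+const message = new HumanMessage({
+  content: [
+    { type: "text", text: "Describe the image provided." },
+    { type: "image_url", image_url: { url: "https://example.com/photo.jpg" } },
+  ],
+});
+
+const response = await model.invoke([message]);
+console.log(response.content);
+```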
\n", + "\n", + "In this example we will ask a model to describe an image.\n", + "\n", + ":::info Prerequisites\n", + "\n", + "This guide assumes familiarity with the following concepts:\n", + "\n", + "- [Chat models](/docs/concepts/#chat-models)\n", + "- [LangChain Tools](/docs/concepts/#tools)\n", + "\n", + ":::\n", + "\n", + "```{=mdx}\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\"\n", + "\n", + "\n", + " axios @langchain/core @langchain/openai\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0d9fd81a-b7f0-445a-8e3d-cfc2d31fdd59", + "metadata": {}, + "outputs": [], + "source": [ + "import axios from \"axios\";\n", + "\n", + "const imageUrl = \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\";\n", + "const axiosRes = await axios.get(imageUrl, { responseType: \"arraybuffer\" });\n", + "const base64 = btoa(\n", + " new Uint8Array(axiosRes.data).reduce(\n", + " (data, byte) => data + String.fromCharCode(byte),\n", + " ''\n", + " )\n", + ");" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2671f995", + "metadata": {}, + "outputs": [], + "source": [ + "import { ChatPromptTemplate } from \"@langchain/core/prompts\";\n", + "import { ChatOpenAI } from \"@langchain/openai\";\n", + "\n", + "const model = new ChatOpenAI({ model: \"gpt-4o\" })" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ee35e4f", + "metadata": {}, + "outputs": [], + "source": [ + "const prompt = ChatPromptTemplate.fromMessages(\n", + " [\n", + " [\"system\", \"Describe the image provided\"],\n", + " [\n", + " \"user\",\n", + " [{ type: \"image_url\", image_url: \"data:image/jpeg;base64,{base64}\" }],\n", + " ]\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "089f75c2", + "metadata": {}, + "outputs": [], + "source": [ + "const chain = prompt.pipe(model);" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "02744b06", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The image depicts a scenic outdoor landscape featuring a wooden boardwalk path extending forward through a large field of green grass and vegetation. On either side of the path, the grass is lush and vibrant, with a variety of bushes and low shrubs visible as well. The sky overhead is expansive and mostly clear, adorned with soft, wispy clouds, illuminated by the light giving a warm and serene ambiance. In the distant background, there are clusters of trees and additional foliage, suggesting a natural and tranquil setting, ideal for a peaceful walk or nature exploration.\n" + ] + } + ], + "source": [ + "const response = await chain.invoke({ base64 })\n", + "console.log(response.content)" + ] + }, + { + "cell_type": "markdown", + "id": "e9b9ebf6", + "metadata": {}, + "source": [ + "We can also pass in multiple images." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "02190ee3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "const prompt = ChatPromptTemplate.fromMessages(\n",
+    "  [\n",
+    "    [\"system\", \"Compare the two pictures provided\"],\n",
+    "    [\n",
+    "      \"user\",\n",
+    "      [\n",
+    "        {\n",
+    "          type: \"image_url\",\n",
+    "          image_url: \"data:image/jpeg;base64,{imageData1}\",\n",
+    "        },\n",
+    "        {\n",
+    "          type: \"image_url\",\n",
+    "          image_url: \"data:image/jpeg;base64,{imageData2}\",\n",
+    "        },\n",
+    "      ],\n",
+    "    ],\n",
+    "  ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "42af057b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "const chain = prompt.pipe(model);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "513abe00",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The two images provided are identical. Both show a wooden boardwalk path extending into a grassy field under a blue sky with scattered clouds. The scenery includes green shrubs and trees in the background, with a bright and clear sky above.\n"
+     ]
+    }
+   ],
+   "source": [
+    "const response = await chain.invoke({ imageData1: base64, imageData2: base64 })\n",
+    "console.log(response.content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Deno",
+   "language": "typescript",
+   "name": "deno"
+  },
+  "language_info": {
+   "file_extension": ".ts",
+   "mimetype": "text/x.typescript",
+   "name": "typescript",
+   "nb_converter": "script",
+   "pygments_lexer": "typescript",
+   "version": "5.3.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/core_docs/vercel.json b/docs/core_docs/vercel.json
index aebe341be564..818afaff2e0e 100644
--- a/docs/core_docs/vercel.json
+++ b/docs/core_docs/vercel.json
@@ -36,6 +36,10 @@
     {
       "source": "/docs/:path(.*/?)*",
       "destination": "/v0.1/docs/:path*"
+    },
+    {
+      "source": "/docs/how_to/tool_calls_multi_modal(/?)",
+      "destination": "/docs/how_to/multimodal_inputs/"
     }
   ]
 }