Skip to content

Commit e20e0e4

Browse files
committed
got audio working
1 parent 7676f8f commit e20e0e4

File tree

8 files changed

+356
-23
lines changed

8 files changed

+356
-23
lines changed

langchain-core/src/messages/index.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ export type MessageContentImageUrl = {
5252
/** MIME types accepted for audio message content. */
type AudioMimeType =
  | "audio/mpeg"
  | "audio/mp3"
  | "audio/wav";
5353

5454
export type MessageContentAudio = {
55-
type: "audio";
56-
data: {
57-
type: AudioMimeType,
58-
base64: string;
55+
type: "audio_url";
56+
audio_url: string | {
57+
format: AudioMimeType,
58+
url: string;
5959
}
6060
};
6161

langchain-core/src/prompt_values.ts

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,68 @@ export class ImagePromptValue extends BasePromptValue {
175175
];
176176
}
177177
}
178+
179+
/**
 * Describes one piece of audio: where to find it (or its inline data) and,
 * optionally, which MIME type it has.
 */
export type AudioContent = {
  /** MIME type of the audio data, when it is known. */
  format?: "audio/mpeg" | "audio/mp3" | "audio/wav";

  /** A URL pointing at the audio, or the base64 encoded audio data itself. */
  url: string;
};
186+
187+
/** Keyed form of the constructor argument accepted by AudioPromptValue. */
export interface AudioPromptValueFields {
  /** The audio payload the prompt value wraps. */
  audioUrl: AudioContent;
}
190+
191+
/**
 * Prompt value that wraps a single piece of audio. It extends
 * BasePromptValue and keeps the audio as an AudioContent object.
 */
export class AudioPromptValue extends BasePromptValue {
  lc_namespace = ["langchain_core", "prompt_values"];

  lc_serializable = true;

  static lc_name() {
    return "AudioPromptValue";
  }

  /** The wrapped audio: its URL (or base64 data) and optional format. */
  audioUrl: AudioContent;

  /** @ignore */
  value: string;

  constructor(fields: AudioPromptValueFields);

  constructor(fields: AudioContent);

  /**
   * Accepts either a bare AudioContent or an object carrying it under the
   * `audioUrl` key; the bare form is wrapped so both are handled alike.
   */
  constructor(fields: AudioContent | AudioPromptValueFields) {
    const normalized: AudioPromptValueFields =
      "audioUrl" in fields ? fields : { audioUrl: fields };

    super(normalized);
    this.audioUrl = normalized.audioUrl;
  }

  /** Returns the `url` of the wrapped audio. */
  toString() {
    return this.audioUrl.url;
  }

  /**
   * Wraps the audio in a single HumanMessage whose content is one
   * `audio_url` part carrying the format and url.
   */
  toChatMessages() {
    const { format, url } = this.audioUrl;
    const audioPart = {
      type: "audio_url",
      audio_url: { format, url },
    };
    return [new HumanMessage({ content: [audioPart] })];
  }
}

langchain-core/src/prompts/audio.ts

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
import { MessageContent } from "../messages/index.js";
2+
import { AudioPromptValue, StringPromptValue, AudioContent } from "../prompt_values.js";
3+
import type { InputValues, PartialValues } from "../utils/types/index.js";
4+
import {
5+
BasePromptTemplate,
6+
BasePromptTemplateInput,
7+
TypedPromptInputValues,
8+
} from "./base.js";
9+
import { TemplateFormat, checkValidTemplate } from "./template.js";
10+
11+
/**
 * Constructor options for an {@link AudioPromptTemplate}.
 * @augments BasePromptTemplateInput
 */
export interface AudioPromptTemplateInput<
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  RunInput extends InputValues = any,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  PartialVariableName extends string = any
> extends BasePromptTemplateInput<RunInput, PartialVariableName> {
  /**
   * The prompt template, given as an object of template fields.
   */
  template: Record<string, unknown>;

  /**
   * How placeholders in the template are written. Options are 'f-string'.
   *
   * @defaultValue 'f-string'
   */
  templateFormat?: TemplateFormat;

  /**
   * Whether the template should be validated when the prompt template is
   * constructed.
   *
   * @defaultValue `true`
   */
  validateTemplate?: boolean;
}
40+
41+
/**
 * An audio prompt template for a multimodal model.
 *
 * The template is an object whose string values may contain `{variable}`
 * placeholders. Formatting fills the placeholders and produces an
 * AudioContent (`url` plus `format`).
 */
// NOTE(review): the declared output type below is StringPromptValue, yet
// formatPromptValue resolves to an AudioPromptValue — confirm whether the
// type argument should be AudioPromptValue instead.
export class AudioPromptTemplate<
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  RunInput extends InputValues = any,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  PartialVariableName extends string = any
> extends BasePromptTemplate<RunInput, StringPromptValue, PartialVariableName> {
  static lc_name() {
    return "AudioPromptTemplate";
  }

  lc_namespace = ["langchain_core", "prompts", "audio"];

  /** Template fields; string values may contain `{variable}` placeholders. */
  template: Record<string, unknown>;

  templateFormat: TemplateFormat = "f-string";

  validateTemplate = true;

  /**
   * @param input Template, input/partial variables and validation options.
   *   When `validateTemplate` is not disabled, the template is checked
   *   against the input variables plus the names of any partial variables.
   */
  constructor(input: AudioPromptTemplateInput<RunInput, PartialVariableName>) {
    super(input);
    this.template = input.template;
    this.templateFormat = input.templateFormat ?? this.templateFormat;
    this.validateTemplate = input.validateTemplate ?? this.validateTemplate;

    if (this.validateTemplate) {
      let totalInputVariables: string[] = this.inputVariables;
      if (this.partialVariables) {
        totalInputVariables = totalInputVariables.concat(
          Object.keys(this.partialVariables)
        );
      }
      // The template is wrapped as a one-element `audio_url` content array
      // before being handed to the shared validator.
      checkValidTemplate(
        [
          { type: "audio_url", audio_url: this.template },
        ] as unknown as MessageContent,
        this.templateFormat,
        totalInputVariables
      );
    }
  }

  _getPromptType(): "prompt" {
    return "prompt";
  }

  /**
   * Partially applies values to the prompt template.
   * @param values The values to be partially applied to the prompt template.
   * @returns A new instance of AudioPromptTemplate with the partially applied values.
   */
  async partial<NewPartialVariableName extends string>(
    values: PartialValues<NewPartialVariableName>
  ) {
    // Variables that now have a partial value are no longer required inputs.
    const newInputVariables = this.inputVariables.filter(
      (iv) => !(iv in values)
    ) as Exclude<Extract<keyof RunInput, string>, NewPartialVariableName>[];
    const newPartialVariables = {
      ...(this.partialVariables ?? {}),
      ...values,
    } as PartialValues<PartialVariableName | NewPartialVariableName>;
    const promptDict = {
      ...this,
      inputVariables: newInputVariables,
      partialVariables: newPartialVariables,
    };
    return new AudioPromptTemplate<
      InputValues<
        Exclude<Extract<keyof RunInput, string>, NewPartialVariableName>
      >
    >(promptDict);
  }

  /**
   * Formats the prompt template with the provided values.
   *
   * Every string field of the template has its `{name}` placeholders
   * replaced by `values[name]` when that value is a string or a number;
   * other placeholders are left as written. `values.format` and
   * `values.url`, when truthy, take precedence over the template's fields.
   *
   * @param values The values to be used to format the prompt template.
   * @returns A promise that resolves to the formatted audio content
   *   (`url` and `format`).
   * @throws Error if no format is available, if no URL is available, or if
   *   the resolved URL is not a string.
   */
  async format<FormatOutput = AudioContent>(
    values: TypedPromptInputValues<RunInput>
  ): Promise<FormatOutput> {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const formatted: Record<string, any> = {};
    for (const [key, value] of Object.entries(this.template)) {
      if (typeof value === "string") {
        formatted[key] = value.replace(/{([^{}]*)}/g, (match, group) => {
          const replacement = values[group];
          return typeof replacement === "string" ||
            typeof replacement === "number"
            ? String(replacement)
            : match;
        });
      } else {
        formatted[key] = value;
      }
    }
    const format = values.format || formatted.format;
    const url = values.url || formatted.url;
    if (!format) {
      throw new Error("Must provide an audio format type.");
    }
    // Without these checks an AudioContent with a missing or non-string
    // `url` could be returned, contradicting its declared shape.
    if (!url) {
      throw new Error("Must provide an audio URL.");
    }
    if (typeof url !== "string") {
      throw new Error("The audio URL must be a string.");
    }

    const output: AudioContent = { url, format };
    return output as FormatOutput;
  }

  /**
   * Formats the prompt given the input values and returns a formatted
   * prompt value.
   * @param values The input values to format the prompt.
   * @returns A Promise that resolves to an AudioPromptValue wrapping the
   *   formatted audio content.
   */
  async formatPromptValue(
    values: TypedPromptInputValues<RunInput>
  ): Promise<AudioPromptValue> {
    const formattedPrompt = await this.format(values);
    return new AudioPromptValue(formattedPrompt);
  }
}

langchain-core/src/prompts/chat.ts

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import {
2828
import { PromptTemplate, type ParamsFromFString } from "./prompt.js";
2929
import { ImagePromptTemplate } from "./image.js";
3030
import { parseFString } from "./template.js";
31+
import { AudioPromptTemplate } from "./audio.js";
3132

3233
/**
3334
* Abstract class that serves as a base for creating message prompt
@@ -344,6 +345,11 @@ interface _ImageTemplateParam {
344345
image_url?: string | Record<string, any>;
345346
}
346347

348+
/**
 * An audio entry of a message template array: `audio_url` is either a
 * template string or a template object.
 */
interface _AudioTemplateParam {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  audio_url?: string | Record<string, any>;
}
352+
347353
type MessageClass =
348354
| typeof HumanMessage
349355
| typeof AIMessage
@@ -381,6 +387,10 @@ class _StringImageMessagePromptTemplate<
381387
| MessageStringPromptTemplateFields<
382388
InputValues<Extract<keyof RunInput, string>>
383389
>
390+
| AudioPromptTemplate<
391+
InputValues<Extract<keyof RunInput, string>>,
392+
string
393+
>
384394
>;
385395

386396
protected messageClass?: MessageClass;
@@ -455,14 +465,14 @@ class _StringImageMessagePromptTemplate<
455465
}
456466

457467
static fromTemplate(
458-
template: string | Array<string | _TextTemplateParam | _ImageTemplateParam>,
468+
template: string | Array<string | _TextTemplateParam | _ImageTemplateParam | _AudioTemplateParam>,
459469
additionalOptions?: Record<string, unknown>
460470
) {
461471
if (typeof template === "string") {
462472
return new this(PromptTemplate.fromTemplate(template));
463473
}
464474
const prompt: Array<
465-
PromptTemplate<InputValues> | ImagePromptTemplate<InputValues>
475+
PromptTemplate<InputValues> | ImagePromptTemplate<InputValues> | AudioPromptTemplate<InputValues>
466476
> = [];
467477
for (const item of template) {
468478
if (
@@ -519,6 +529,50 @@ class _StringImageMessagePromptTemplate<
519529
throw new Error("Invalid image template");
520530
}
521531
prompt.push(imgTemplateObject);
532+
} else if (typeof item === "object" && "audio_url" in item) {
533+
const castItem = item as _AudioTemplateParam;
534+
let audioTemplate = castItem.audio_url ?? "";
535+
let audioTemplateObject: AudioPromptTemplate<InputValues>;
536+
let inputVariables: string[] = [];
537+
if (typeof audioTemplate === "string") {
538+
const parsedTemplate = parseFString(audioTemplate);
539+
const variables = parsedTemplate.flatMap((item) =>
540+
item.type === "variable" ? [item.name] : []
541+
);
542+
543+
if ((variables?.length ?? 0) > 0) {
544+
if (variables.length > 1) {
545+
throw new Error(
546+
`Only one format variable allowed per audio template.\nGot: ${variables}\nFrom: ${audioTemplate}`
547+
);
548+
}
549+
inputVariables = [variables[0]];
550+
} else {
551+
inputVariables = [];
552+
}
553+
554+
audioTemplate = { url: audioTemplate };
555+
audioTemplateObject = new AudioPromptTemplate<InputValues>({
556+
template: audioTemplate,
557+
inputVariables,
558+
});
559+
} else if (typeof audioTemplate === "object") {
560+
if ("url" in audioTemplate) {
561+
const parsedTemplate = parseFString(audioTemplate.url);
562+
inputVariables = parsedTemplate.flatMap((item) =>
563+
item.type === "variable" ? [item.name] : []
564+
);
565+
} else {
566+
inputVariables = [];
567+
}
568+
audioTemplateObject = new AudioPromptTemplate<InputValues>({
569+
template: audioTemplate,
570+
inputVariables,
571+
});
572+
} else {
573+
throw new Error("Invalid audio template");
574+
}
575+
prompt.push(audioTemplateObject);
522576
}
523577
}
524578
return new this({ prompt, additionalOptions });
@@ -559,6 +613,12 @@ class _StringImageMessagePromptTemplate<
559613
inputs as TypedPromptInputValues<RunInput>
560614
);
561615
content.push({ type: "image_url", image_url: formatted });
616+
// eslint-disable-next-line no-instanceof/no-instanceof
617+
} else if (prompt instanceof AudioPromptTemplate) {
618+
const formatted = await prompt.format(
619+
inputs as TypedPromptInputValues<RunInput>
620+
);
621+
content.push({ type: "audio_url", audio_url: formatted });
562622
}
563623
}
564624

@@ -706,7 +766,7 @@ function _coerceMessagePromptTemplateLike(
706766
const message = coerceMessageLikeToMessage(messagePromptTemplateLike);
707767
let templateData:
708768
| string
709-
| (string | _TextTemplateParam | _ImageTemplateParam)[];
769+
| (string | _TextTemplateParam | _ImageTemplateParam | _AudioTemplateParam)[];
710770

711771
if (typeof message.content === "string") {
712772
templateData = message.content;
@@ -717,8 +777,8 @@ function _coerceMessagePromptTemplateLike(
717777
return { text: item.text };
718778
} else if ("image_url" in item) {
719779
return { image_url: item.image_url };
720-
} else if ("data" in item) {
721-
return { data: item.data };
780+
} else if ("audio_url" in item) {
781+
return { audio_url: item.audio_url };
722782
} else {
723783
throw new Error("Invalid message content");
724784
}
@@ -887,6 +947,7 @@ export class ChatPromptTemplate<
887947
): Promise<BaseMessage[]> {
888948
const allValues = await this.mergePartialAndUserVariables(values);
889949
let resultMessages: BaseMessage[] = [];
950+
console.log("FORMAT MESSAGES", this.promptMessages)
890951

891952
for (const promptMessage of this.promptMessages) {
892953
// eslint-disable-next-line no-instanceof/no-instanceof

0 commit comments

Comments
 (0)