Tee the primary stream when stream usage is enabled, so usage can be extracted and included in `_meta` (#176)

roodboi · web-flow · commit 6dd42554e89d · 2024-05-17T12:36:27.000-04:00
diff --git a/.changeset/calm-knives-sin.md b/.changeset/calm-knives-sin.md
@@ -0,0 +1,5 @@
+---
+"@instructor-ai/instructor": minor
+---
+
+Add the ability to include token usage in streamed results: when `stream_options` is present, the completion stream is teed so usage can be read and attached to `_meta`.
diff --git a/bun.lockb b/bun.lockb
diff --git a/docs/concepts/streaming.md b/docs/concepts/streaming.md
@@ -61,7 +61,7 @@ A follow-up meeting is scheduled for January 25th at 3 PM GMT to finalize the ag
 
 const extractionStream = await client.chat.completions.create({
   messages: [{ role: "user", content: textBlock }],
-  model: "gpt-4-turbo",
+  model: "gpt-4o",
   response_model: {
     schema: ExtractionValuesSchema,
     name: "value extraction"
diff --git a/docs/examples/action_items.md b/docs/examples/action_items.md
@@ -66,7 +66,7 @@ const extractActionItems = async (data: string): Promise<ActionItems | undefined
         "content": `Create the action items for the following transcript: ${data}`,
       },
     ],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: ActionItemsSchema },
     max_tokens: 1000,
     temperature: 0.0,
diff --git a/docs/examples/query_decomposition.md b/docs/examples/query_decomposition.md
@@ -65,7 +65,7 @@ const createQueryPlan = async (question: string): Promise<QueryPlan | undefined>
         "content": `Consider: ${question}\nGenerate the correct query plan.`,
       },
     ],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: QueryPlanSchema },
     max_tokens: 1000,
     temperature: 0.0,
diff --git a/docs/examples/self_correction.md b/docs/examples/self_correction.md
@@ -44,7 +44,7 @@ const question = "What is the meaning of life?"
 const context = "According to the devil the meaning of live is to live a life of sin and debauchery."
 
 await instructor.chat.completions.create({
-    model: "gpt-4",
+    model: "gpt-4o",
     max_retries: 0,
     response_model: { schema: QuestionAnswer, name: "Question and Answer" },
     messages: [
@@ -82,14 +82,14 @@ const QuestionAnswer = z.object({
   question: z.string(),
   answer: z.string().superRefine(
     LLMValidator(instructor, statement, {
-      model: "gpt-4"
+      model: "gpt-4o"
     })
   )
 })
 
 try {
   await instructor.chat.completions.create({
-    model: "gpt-4",
+    model: "gpt-4o",
     max_retries: 0,
     response_model: { schema: QuestionAnswer, name: "Question and Answer" },
     messages: [
@@ -132,7 +132,7 @@ By adding the `max_retries` parameter, we can retry the request with corrections
 ```ts
 try {
   await instructor.chat.completions.create({
-    model: "gpt-4",
+    model: "gpt-4o",
     max_retries: 2,
     response_model: { schema: QuestionAnswer, name: "Question and Answer" },
     messages: [
diff --git a/examples/action_items/index.ts b/examples/action_items/index.ts
@@ -45,7 +45,7 @@ const extractActionItems = async (data: string) => {
         content: `Create the action items for the following transcript: ${data}`
       }
     ],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: ActionItemsSchema, name: "ActionItems" },
     max_tokens: 1000,
     temperature: 0.0,
diff --git a/examples/extract_user/index.ts b/examples/extract_user/index.ts
@@ -19,7 +19,7 @@ const client = Instructor({
 
 const user = await client.chat.completions.create({
   messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-  model: "gpt-4",
+  model: "gpt-4o",
   response_model: {
     schema: UserSchema,
     name: "User"
diff --git a/examples/extract_user/properties.ts b/examples/extract_user/properties.ts
@@ -27,7 +27,7 @@ const client = Instructor({
 
 const user = await client.chat.completions.create({
   messages: [{ role: "user", content: "Happy Potter" }],
-  model: "gpt-4",
+  model: "gpt-4o",
   response_model: { schema: UserSchema, name: "User" },
   max_retries: 3,
   seed: 1
diff --git a/examples/extract_user_stream/index.ts b/examples/extract_user_stream/index.ts
@@ -53,7 +53,7 @@ let extraction = {}
 
 const extractionStream = await client.chat.completions.create({
   messages: [{ role: "user", content: textBlock }],
-  model: "gpt-4-turbo",
+  model: "gpt-4o",
   response_model: {
     schema: ExtractionValuesSchema,
     name: "value extraction"
diff --git a/examples/llm-validator/index.ts b/examples/llm-validator/index.ts
@@ -16,7 +16,7 @@ const QuestionAnswer = z.object({
   question: z.string(),
   answer: z.string().superRefine(
     LLMValidator(instructor, statement, {
-      model: "gpt-4-turbo"
+      model: "gpt-4o"
     })
   )
 })
@@ -25,7 +25,7 @@ const question = "What is the meaning of life?"
 
 const check = async (context: string) => {
   return await instructor.chat.completions.create({
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     max_retries: 2,
     response_model: { schema: QuestionAnswer, name: "Question and Answer" },
     messages: [
diff --git a/examples/query_decomposition/index.ts b/examples/query_decomposition/index.ts
@@ -38,7 +38,7 @@ const createQueryPlan = async (question: string) => {
         content: `Consider: ${question}\nGenerate the correct query plan.`
       }
     ],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: QueryPlanSchema, name: "Query Plan Decomposition" },
     max_tokens: 1000,
     temperature: 0.0,
diff --git a/examples/query_expansions/run.ts b/examples/query_expansions/run.ts
@@ -73,7 +73,7 @@ const runExtraction = async (query: string) => {
       { role: "system", content: systemPrompt },
       { role: "user", content: query }
     ],
-    model: "gpt-4",
+    model: "gpt-4o",
     response_model: {
       schema: ExtractionValuesSchema,
       name: "value_extraction"
diff --git a/examples/query_expansions/run_sync.ts b/examples/query_expansions/run_sync.ts
@@ -95,7 +95,7 @@ export const runExtractionStream = async (query: string) => {
       { role: "system", content: systemPrompt },
       { role: "user", content: query }
     ],
-    model: "gpt-4",
+    model: "gpt-4o",
     response_model: {
       schema: SearchQuery,
       name: "value_extraction"
@@ -124,7 +124,7 @@ const runExtraction = async (query: string) => {
       { role: "system", content: systemPrompt },
       { role: "user", content: query }
     ],
-    model: "gpt-4",
+    model: "gpt-4o",
     response_model: {
       schema: Response,
       name: "Respond"
diff --git a/examples/resolving-complex-entitities/index.ts b/examples/resolving-complex-entitities/index.ts
@@ -59,7 +59,7 @@ const askAi = async (input: string) => {
         content: input
       }
     ],
-    model: "gpt-4",
+    model: "gpt-4o",
     response_model: { schema: DocumentExtractionSchema, name: "Document Extraction" },
     max_retries: 3,
     seed: 1
diff --git a/package.json b/package.json
@@ -51,7 +51,7 @@
   },
   "homepage": "https://github.com/instructor-ai/instructor-js#readme",
   "dependencies": {
-    "zod-stream": "1.0.2",
+    "zod-stream": "1.0.3",
     "zod-validation-error": "^2.1.0"
   },
   "peerDependencies": {
@@ -76,6 +76,7 @@
     "eslint-plugin-prettier": "^5.1.2",
     "husky": "^8.0.3",
     "llm-polyglot": "1.0.0",
+    "openai": "latest",
     "prettier": "latest",
     "ts-inference-check": "^0.3.0",
     "tsup": "^8.0.1",
diff --git a/src/constants/providers.ts b/src/constants/providers.ts
@@ -103,7 +103,7 @@ export const PROVIDER_SUPPORTED_MODES_BY_MODEL = {
   [PROVIDERS.OAI]: {
     [MODE.FUNCTIONS]: ["*"],
     [MODE.TOOLS]: ["*"],
-    [MODE.JSON]: ["gpt-3.5-turbo-1106", "gpt-4-turbo", "gpt-4-0125-preview", "gpt-4-turbo-preview"],
+    [MODE.JSON]: ["*"],
     [MODE.MD_JSON]: ["*"]
   },
   [PROVIDERS.TOGETHER]: {
diff --git a/src/instructor.ts b/src/instructor.ts
@@ -9,6 +9,7 @@ import {
   ReturnTypeBasedOnParams
 } from "@/types"
 import OpenAI from "openai"
+import { Stream } from "openai/streaming.mjs"
 import { z, ZodError } from "zod"
 import ZodStream, { OAIResponseParser, OAIStream, withResponseModel, type Mode } from "zod-stream"
 import { fromZodError } from "zod-validation-error"
@@ -266,10 +267,10 @@ class Instructor<C extends GenericClient | OpenAI> {
     return makeCompletionCallWithRetries()
   }
 
-  private async chatCompletionStream<T extends z.AnyZodObject>(
+  private async *chatCompletionStream<T extends z.AnyZodObject>(
     { max_retries, response_model, ...params }: ChatCompletionCreateParamsWithModel<T>,
     requestOptions?: ClientTypeChatCompletionRequestOptions<C>
-  ): Promise<AsyncGenerator<Partial<T> & { _meta?: CompletionMeta }, void, unknown>> {
+  ): AsyncGenerator<Partial<T> & { _meta?: CompletionMeta }, void, unknown> {
     if (max_retries) {
       this.log("warn", "max_retries is not supported for streaming completions")
     }
@@ -293,7 +294,16 @@ class Instructor<C extends GenericClient | OpenAI> {
       debug: this.debug ?? false
     })
 
-    return streamClient.create({
+    async function checkForUsage(reader: Stream<OpenAI.ChatCompletionChunk>) {
+      for await (const chunk of reader) {
+        if ("usage" in chunk) {
+          streamUsage = chunk.usage as CompletionMeta["usage"]
+        }
+      }
+    }
+
+    let streamUsage: CompletionMeta["usage"] | undefined
+    const structuredStream = await streamClient.create({
       completionPromise: async () => {
         if (this.client.chat?.completions?.create) {
           const completion = await this.client.chat.completions.create(
@@ -306,6 +316,21 @@ class Instructor<C extends GenericClient | OpenAI> {
 
           this.log("debug", "raw stream completion response: ", completion)
 
+          if (
+            this.provider === "OAI" &&
+            completionParams?.stream &&
+            "stream_options" in completionParams &&
+            completion instanceof Stream
+          ) {
+            const [completion1, completion2] = completion.tee()
+
+            checkForUsage(completion1)
+
+            return OAIStream({
+              res: completion2
+            })
+          }
+
           return OAIStream({
             res: completion as unknown as AsyncIterable<OpenAI.ChatCompletionChunk>
           })
@@ -315,6 +340,16 @@ class Instructor<C extends GenericClient | OpenAI> {
       },
       response_model
     })
+
+    for await (const chunk of structuredStream) {
+      yield {
+        ...chunk,
+        _meta: {
+          usage: streamUsage ?? undefined,
+          ...(chunk?._meta ?? {})
+        }
+      }
+    }
   }
 
   private isChatCompletionCreateParamsWithModel<T extends z.AnyZodObject>(
diff --git a/src/types/index.ts b/src/types/index.ts
@@ -88,7 +88,7 @@ export type ReturnTypeBasedOnParams<C, P> =
       response_model: ResponseModel<infer T>
     }
   ) ?
-    Promise<AsyncGenerator<Partial<z.infer<T>> & { _meta?: CompletionMeta }, void, unknown>>
+    AsyncGenerator<Partial<z.infer<T>> & { _meta?: CompletionMeta }, void, unknown>
   : P extends { response_model: ResponseModel<infer T> } ?
     Promise<z.infer<T> & { _meta?: CompletionMeta }>
   : C extends OpenAI ?
diff --git a/tests/extract.test.ts b/tests/extract.test.ts
@@ -21,7 +21,7 @@ async function extractUser() {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: UserSchema, name: "User" },
     seed: 1
   })
@@ -49,7 +49,7 @@ async function extractUserValidated() {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-    model: "gpt-4",
+    model: "gpt-4o",
     response_model: { schema: UserSchema, name: "User" },
     max_retries: 3,
     seed: 1
@@ -82,7 +82,7 @@ async function extractUserMany() {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Jason is 30 years old, Sarah is 12" }],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: UsersSchema, name: "Users" },
     max_retries: 3,
     seed: 1
diff --git a/tests/functions.test.ts b/tests/functions.test.ts
@@ -21,7 +21,7 @@ async function extractUser() {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: UserSchema, name: "User" },
     seed: 1
   })
@@ -52,7 +52,7 @@ async function extractUserValidated() {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: UserSchema, name: "User" },
     max_retries: 3,
     seed: 1
@@ -85,7 +85,7 @@ async function extractUserMany() {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Jason is 30 years old, Sarah is 12" }],
-    model: "gpt-4-turbo",
+    model: "gpt-4o",
     response_model: { schema: UsersSchema, name: "Users" },
     max_retries: 3,
     seed: 1
diff --git a/tests/inference.test.ts b/tests/inference.test.ts
@@ -33,7 +33,7 @@ describe("Inference Checking", () => {
   test("no response_model, no stream", async () => {
     const user = await client.chat.completions.create({
       messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-      model: "gpt-4-turbo",
+      model: "gpt-4o",
       seed: 1,
       stream: false
     })
@@ -44,7 +44,7 @@ describe("Inference Checking", () => {
   test("no response_model, stream", async () => {
     const userStream = await client.chat.completions.create({
       messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-      model: "gpt-4-turbo",
+      model: "gpt-4o",
       seed: 1,
       stream: true
     })
@@ -57,7 +57,7 @@ describe("Inference Checking", () => {
   test("response_model, no stream", async () => {
     const user = await client.chat.completions.create({
       messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-      model: "gpt-4-turbo",
+      model: "gpt-4o",
       response_model: { schema: UserSchema, name: "User" },
       seed: 1,
       stream: false
@@ -71,7 +71,7 @@ describe("Inference Checking", () => {
   test("response_model, stream", async () => {
     const userStream = await client.chat.completions.create({
       messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-      model: "gpt-4-turbo",
+      model: "gpt-4o",
       response_model: { schema: UserSchema, name: "User" },
       seed: 1,
       stream: true
@@ -94,7 +94,7 @@ describe("Inference Checking", () => {
   test("response_model, stream, max_retries", async () => {
     const userStream = await client.chat.completions.create({
       messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-      model: "gpt-4-turbo",
+      model: "gpt-4o",
       response_model: { schema: UserSchema, name: "User" },
       seed: 1,
       stream: true,
@@ -118,7 +118,7 @@ describe("Inference Checking", () => {
   test("response_model, no stream, max_retries", async () => {
     const user = await client.chat.completions.create({
       messages: [{ role: "user", content: "Jason Liu is 30 years old" }],
-      model: "gpt-4-turbo",
+      model: "gpt-4o",
       response_model: { schema: UserSchema, name: "User" },
       seed: 1,
       max_retries: 3
diff --git a/tests/maybe.test.ts b/tests/maybe.test.ts
@@ -24,7 +24,7 @@ async function maybeExtractUser(content: string) {
 
   const user = await client.chat.completions.create({
     messages: [{ role: "user", content: "Extract " + content }],
-    model: "gpt-4",
+    model: "gpt-4o",
     response_model: { schema: MaybeUserSchema, name: "User" },
     max_retries: 3,
     seed: 1
diff --git a/tests/mode.test.ts b/tests/mode.test.ts
diff --git a/tests/request-options.test.ts b/tests/request-options.test.ts
diff --git a/tests/stream.test.ts b/tests/stream.test.ts
diff --git a/tests/validator.test.ts b/tests/validator.test.ts
diff --git a/tests/zod-type.test.ts b/tests/zod-type.test.ts

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ const extractActionItems = async (data: string) => {`
`45`	`45`	content: `Create the action items for the following transcript: ${data}`
`46`	`46`	`}`
`47`	`47`	`],`
`48`		`- model: "gpt-4-turbo",`
	`48`	`+ model: "gpt-4o",`
`49`	`49`	`response_model: { schema: ActionItemsSchema, name: "ActionItems" },`
`50`	`50`	`max_tokens: 1000,`
`51`	`51`	`temperature: 0.0,`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ const createQueryPlan = async (question: string) => {`
`38`	`38`	content: `Consider: ${question}\nGenerate the correct query plan.`
`39`	`39`	`}`
`40`	`40`	`],`
`41`		`- model: "gpt-4-turbo",`
	`41`	`+ model: "gpt-4o",`
`42`	`42`	`response_model: { schema: QueryPlanSchema, name: "Query Plan Decomposition" },`
`43`	`43`	`max_tokens: 1000,`
`44`	`44`	`temperature: 0.0,`
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ const askAi = async (input: string) => {`
`59`	`59`	`content: input`
`60`	`60`	`}`
`61`	`61`	`],`
`62`		`- model: "gpt-4",`
	`62`	`+ model: "gpt-4o",`
`63`	`63`	`response_model: { schema: DocumentExtractionSchema, name: "Document Extraction" },`
`64`	`64`	`max_retries: 3,`
`65`	`65`	`seed: 1`
Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ export type ReturnTypeBasedOnParams<C, P> =`
`88`	`88`	`response_model: ResponseModel<infer T>`
`89`	`89`	`}`
`90`	`90`	`) ?`
`91`		`- Promise<AsyncGenerator<Partial<z.infer<T>> & { _meta?: CompletionMeta }, void, unknown>>`
	`91`	`+ AsyncGenerator<Partial<z.infer<T>> & { _meta?: CompletionMeta }, void, unknown>`
`92`	`92`	`: P extends { response_model: ResponseModel<infer T> } ?`
`93`	`93`	`Promise<z.infer<T> & { _meta?: CompletionMeta }>`
`94`	`94`	`: C extends OpenAI ?`