Skip to content

Commit 030607b

Browse files
Unstructured in memory loader (#5726)
* feat: add in-memory option
* feat: add in memory community
* fix: object destructure fix
* Fix lint
* Rename args and add to docstring
* Export types

---------

Co-authored-by: andrewdoro <[email protected]>
1 parent a550a7f commit 030607b

File tree

5 files changed

+138
-27
lines changed

5 files changed

+138
-27
lines changed

langchain/src/document_loaders/fs/unstructured.ts

+36-10
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,11 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
126126
unknown?: UnknownHandling;
127127
};
128128

129+
type UnstructuredMemoryLoaderOptions = {
130+
buffer: Buffer;
131+
fileName: string;
132+
};
133+
129134
/**
130135
* @deprecated - Import from "@langchain/community/document_loaders/fs/unstructured" instead. This entrypoint will be removed in 0.3.0.
131136
*
@@ -139,6 +144,10 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
139144
export class UnstructuredLoader extends BaseDocumentLoader {
140145
public filePath: string;
141146

147+
private buffer?: Buffer;
148+
149+
private fileName?: string;
150+
142151
private apiUrl = "https://api.unstructured.io/general/v0/general";
143152

144153
private apiKey?: string;
@@ -175,19 +184,30 @@ export class UnstructuredLoader extends BaseDocumentLoader {
175184
private maxCharacters?: number;
176185

177186
constructor(
178-
filePathOrLegacyApiUrl: string,
187+
filePathOrLegacyApiUrlOrMemoryBuffer:
188+
| string
189+
| UnstructuredMemoryLoaderOptions,
179190
optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {}
180191
) {
181192
super();
182193

183194
// Temporary shim to avoid breaking existing users
184195
// Remove when API keys are enforced by Unstructured and existing code will break anyway
185196
const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
186-
if (isLegacySyntax) {
197+
const isMemorySyntax =
198+
typeof filePathOrLegacyApiUrlOrMemoryBuffer === "object";
199+
200+
if (isMemorySyntax) {
201+
this.buffer = filePathOrLegacyApiUrlOrMemoryBuffer.buffer;
202+
this.fileName = filePathOrLegacyApiUrlOrMemoryBuffer.fileName;
203+
} else if (isLegacySyntax) {
187204
this.filePath = optionsOrLegacyFilePath;
188-
this.apiUrl = filePathOrLegacyApiUrl;
205+
this.apiUrl = filePathOrLegacyApiUrlOrMemoryBuffer;
189206
} else {
190-
this.filePath = filePathOrLegacyApiUrl;
207+
this.filePath = filePathOrLegacyApiUrlOrMemoryBuffer;
208+
}
209+
210+
if (!isLegacySyntax) {
191211
const options = optionsOrLegacyFilePath;
192212
this.apiKey = options.apiKey;
193213
this.apiUrl = options.apiUrl ?? this.apiUrl;
@@ -209,14 +229,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
209229
}
210230

211231
async _partition() {
212-
const { readFile, basename } = await this.imports();
232+
let { buffer } = this;
233+
let { fileName } = this;
234+
235+
if (!buffer) {
236+
const { readFile, basename } = await this.imports();
213237

214-
const buffer = await readFile(this.filePath);
215-
const fileName = basename(this.filePath);
238+
buffer = await readFile(this.filePath);
239+
fileName = basename(this.filePath);
240+
241+
// I'm aware this reads the file into memory first, but we have lots of work
242+
// to do on then consuming Documents in a streaming fashion anyway, so not
243+
// worried about this for now.
244+
}
216245

217-
// I'm aware this reads the file into memory first, but we have lots of work
218-
// to do on then consuming Documents in a streaming fashion anyway, so not
219-
// worried about this for now.
220246
const formData = new FormData();
221247
formData.append("files", new Blob([buffer]), fileName);
222248
formData.append("strategy", this.strategy);

langchain/src/document_loaders/tests/unstructured.int.test.ts

+29
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import * as url from "node:url";
55
import * as path from "node:path";
6+
import { readFile } from "node:fs/promises";
67
import { test, expect } from "@jest/globals";
78
import {
89
UnstructuredDirectoryLoader,
@@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => {
2930
}
3031
});
3132

33+
test.skip("Test Unstructured base loader with buffer", async () => {
34+
const filePath = path.resolve(
35+
path.dirname(url.fileURLToPath(import.meta.url)),
36+
"./example_data/example.txt"
37+
);
38+
39+
const options = {
40+
apiKey: process.env.UNSTRUCTURED_API_KEY!,
41+
};
42+
43+
const buffer = await readFile(filePath);
44+
const fileName = "example.txt";
45+
46+
const loader = new UnstructuredLoader(
47+
{
48+
buffer,
49+
fileName,
50+
},
51+
options
52+
);
53+
const docs = await loader.load();
54+
55+
expect(docs.length).toBe(3);
56+
for (const doc of docs) {
57+
expect(typeof doc.pageContent).toBe("string");
58+
}
59+
});
60+
3261
test.skip("Test Unstructured base loader with fast strategy", async () => {
3362
const filePath = path.resolve(
3463
path.dirname(url.fileURLToPath(import.meta.url)),

libs/langchain-community/.eslintrc.cjs

+1
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ module.exports = {
6464
"prefer-rest-params": 0,
6565
"new-cap": ["error", { properties: false, capIsNew: false }],
6666
"arrow-body-style": 0,
67+
"prefer-destructuring": 0
6768
},
6869
overrides: [
6970
{

libs/langchain-community/src/document_loaders/fs/unstructured.ts

+43-17
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
} from "langchain/document_loaders/fs/directory";
1111
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
1212

13-
const UNSTRUCTURED_API_FILETYPES = [
13+
export const UNSTRUCTURED_API_FILETYPES = [
1414
".txt",
1515
".text",
1616
".pdf",
@@ -94,7 +94,7 @@ export type SkipInferTableTypes =
9494
/**
9595
* Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title
9696
*/
97-
type ChunkingStrategy = "None" | "by_title";
97+
export type ChunkingStrategy = "None" | "by_title";
9898

9999
export type UnstructuredLoaderOptions = {
100100
apiKey?: string;
@@ -115,22 +115,34 @@ export type UnstructuredLoaderOptions = {
115115
maxCharacters?: number;
116116
};
117117

118-
type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
118+
export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
119119
recursive?: boolean;
120120
unknown?: UnknownHandling;
121121
};
122122

123+
export type UnstructuredMemoryLoaderOptions = {
124+
buffer: Buffer;
125+
fileName: string;
126+
};
127+
123128
/**
124129
* A document loader that uses the Unstructured API to load unstructured
125130
* documents. It supports both the new syntax with options object and the
126131
* legacy syntax for backward compatibility. The load() method sends a
127132
* partitioning request to the Unstructured API and retrieves the
128133
* partitioned elements. It creates a Document instance for each element
129134
* and returns an array of Document instances.
135+
*
136+
* It accepts either a filepath or an object containing a buffer and a filename
137+
* as input.
130138
*/
131139
export class UnstructuredLoader extends BaseDocumentLoader {
132140
public filePath: string;
133141

142+
private buffer?: Buffer;
143+
144+
private fileName?: string;
145+
134146
private apiUrl = "https://api.unstructured.io/general/v0/general";
135147

136148
private apiKey?: string;
@@ -167,20 +179,28 @@ export class UnstructuredLoader extends BaseDocumentLoader {
167179
private maxCharacters?: number;
168180

169181
constructor(
170-
filePathOrLegacyApiUrl: string,
171-
optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {}
182+
filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions,
183+
unstructuredOptions: UnstructuredLoaderOptions | string = {}
172184
) {
173185
super();
174186

175187
// Temporary shim to avoid breaking existing users
176188
// Remove when API keys are enforced by Unstructured and existing code will break anyway
177-
const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
178-
if (isLegacySyntax) {
179-
this.filePath = optionsOrLegacyFilePath;
180-
this.apiUrl = filePathOrLegacyApiUrl;
189+
const isLegacySyntax = typeof unstructuredOptions === "string";
190+
const isMemorySyntax = typeof filepathOrBufferOptions === "object";
191+
192+
if (isMemorySyntax) {
193+
this.buffer = filepathOrBufferOptions.buffer;
194+
this.fileName = filepathOrBufferOptions.fileName;
195+
} else if (isLegacySyntax) {
196+
this.filePath = unstructuredOptions;
197+
this.apiUrl = filepathOrBufferOptions;
181198
} else {
182-
this.filePath = filePathOrLegacyApiUrl;
183-
const options = optionsOrLegacyFilePath;
199+
this.filePath = filepathOrBufferOptions;
200+
}
201+
202+
if (!isLegacySyntax) {
203+
const options = unstructuredOptions;
184204
this.apiKey =
185205
options.apiKey ?? getEnvironmentVariable("UNSTRUCTURED_API_KEY");
186206
this.apiUrl =
@@ -205,14 +225,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
205225
}
206226

207227
async _partition() {
208-
const { readFile, basename } = await this.imports();
228+
let buffer = this.buffer;
229+
let fileName = this.fileName;
230+
231+
if (!buffer) {
232+
const { readFile, basename } = await this.imports();
209233

210-
const buffer = await readFile(this.filePath);
211-
const fileName = basename(this.filePath);
234+
buffer = await readFile(this.filePath);
235+
fileName = basename(this.filePath);
236+
237+
// I'm aware this reads the file into memory first, but we have lots of work
238+
// to do on then consuming Documents in a streaming fashion anyway, so not
239+
// worried about this for now.
240+
}
212241

213-
// I'm aware this reads the file into memory first, but we have lots of work
214-
// to do on then consuming Documents in a streaming fashion anyway, so not
215-
// worried about this for now.
216242
const formData = new FormData();
217243
formData.append("files", new Blob([buffer]), fileName);
218244
formData.append("strategy", this.strategy);

libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts

+29
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import * as url from "node:url";
55
import * as path from "node:path";
6+
import { readFile } from "node:fs/promises";
67
import { test, expect } from "@jest/globals";
78
import {
89
UnstructuredDirectoryLoader,
@@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => {
2930
}
3031
});
3132

33+
test.skip("Test Unstructured base loader with buffer", async () => {
34+
const filePath = path.resolve(
35+
path.dirname(url.fileURLToPath(import.meta.url)),
36+
"./example_data/example.txt"
37+
);
38+
39+
const options = {
40+
apiKey: process.env.UNSTRUCTURED_API_KEY!,
41+
};
42+
43+
const buffer = await readFile(filePath);
44+
const fileName = "example.txt";
45+
46+
const loader = new UnstructuredLoader(
47+
{
48+
buffer,
49+
fileName,
50+
},
51+
options
52+
);
53+
const docs = await loader.load();
54+
55+
expect(docs.length).toBe(3);
56+
for (const doc of docs) {
57+
expect(typeof doc.pageContent).toBe("string");
58+
}
59+
});
60+
3261
test.skip("Test Unstructured base loader with fast strategy", async () => {
3362
const filePath = path.resolve(
3463
path.dirname(url.fileURLToPath(import.meta.url)),

0 commit comments

Comments
 (0)