@@ -10,7 +10,7 @@ import {
10
10
} from "langchain/document_loaders/fs/directory" ;
11
11
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base" ;
12
12
13
- const UNSTRUCTURED_API_FILETYPES = [
13
+ export const UNSTRUCTURED_API_FILETYPES = [
14
14
".txt" ,
15
15
".text" ,
16
16
".pdf" ,
@@ -94,7 +94,7 @@ export type SkipInferTableTypes =
94
94
/**
95
95
* Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title
96
96
*/
97
- type ChunkingStrategy = "None" | "by_title" ;
97
+ export type ChunkingStrategy = "None" | "by_title" ;
98
98
99
99
export type UnstructuredLoaderOptions = {
100
100
apiKey ?: string ;
@@ -115,22 +115,34 @@ export type UnstructuredLoaderOptions = {
115
115
maxCharacters ?: number ;
116
116
} ;
117
117
118
- type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
118
+ export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
119
119
recursive ?: boolean ;
120
120
unknown ?: UnknownHandling ;
121
121
} ;
122
122
123
+ export type UnstructuredMemoryLoaderOptions = {
124
+ buffer : Buffer ;
125
+ fileName : string ;
126
+ } ;
127
+
123
128
/**
124
129
* A document loader that uses the Unstructured API to load unstructured
125
130
* documents. It supports both the new syntax with options object and the
126
131
* legacy syntax for backward compatibility. The load() method sends a
127
132
* partitioning request to the Unstructured API and retrieves the
128
133
* partitioned elements. It creates a Document instance for each element
129
134
* and returns an array of Document instances.
135
+ *
136
+ * It accepts either a filepath or an object containing a buffer and a filename
137
+ * as input.
130
138
*/
131
139
export class UnstructuredLoader extends BaseDocumentLoader {
132
140
public filePath : string ;
133
141
142
+ private buffer ?: Buffer ;
143
+
144
+ private fileName ?: string ;
145
+
134
146
private apiUrl = "https://api.unstructured.io/general/v0/general" ;
135
147
136
148
private apiKey ?: string ;
@@ -167,20 +179,28 @@ export class UnstructuredLoader extends BaseDocumentLoader {
167
179
private maxCharacters ?: number ;
168
180
169
181
constructor (
170
- filePathOrLegacyApiUrl : string ,
171
- optionsOrLegacyFilePath : UnstructuredLoaderOptions | string = { }
182
+ filepathOrBufferOptions : string | UnstructuredMemoryLoaderOptions ,
183
+ unstructuredOptions : UnstructuredLoaderOptions | string = { }
172
184
) {
173
185
super ( ) ;
174
186
175
187
// Temporary shim to avoid breaking existing users
176
188
// Remove when API keys are enforced by Unstructured and existing code will break anyway
177
- const isLegacySyntax = typeof optionsOrLegacyFilePath === "string" ;
178
- if ( isLegacySyntax ) {
179
- this . filePath = optionsOrLegacyFilePath ;
180
- this . apiUrl = filePathOrLegacyApiUrl ;
189
+ const isLegacySyntax = typeof unstructuredOptions === "string" ;
190
+ const isMemorySyntax = typeof filepathOrBufferOptions === "object" ;
191
+
192
+ if ( isMemorySyntax ) {
193
+ this . buffer = filepathOrBufferOptions . buffer ;
194
+ this . fileName = filepathOrBufferOptions . fileName ;
195
+ } else if ( isLegacySyntax ) {
196
+ this . filePath = unstructuredOptions ;
197
+ this . apiUrl = filepathOrBufferOptions ;
181
198
} else {
182
- this . filePath = filePathOrLegacyApiUrl ;
183
- const options = optionsOrLegacyFilePath ;
199
+ this . filePath = filepathOrBufferOptions ;
200
+ }
201
+
202
+ if ( ! isLegacySyntax ) {
203
+ const options = unstructuredOptions ;
184
204
this . apiKey =
185
205
options . apiKey ?? getEnvironmentVariable ( "UNSTRUCTURED_API_KEY" ) ;
186
206
this . apiUrl =
@@ -205,14 +225,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
205
225
}
206
226
207
227
async _partition ( ) {
208
- const { readFile, basename } = await this . imports ( ) ;
228
+ let buffer = this . buffer ;
229
+ let fileName = this . fileName ;
230
+
231
+ if ( ! buffer ) {
232
+ const { readFile, basename } = await this . imports ( ) ;
209
233
210
- const buffer = await readFile ( this . filePath ) ;
211
- const fileName = basename ( this . filePath ) ;
234
+ buffer = await readFile ( this . filePath ) ;
235
+ fileName = basename ( this . filePath ) ;
236
+
237
+ // I'm aware this reads the file into memory first, but we have lots of work
238
+ // to do on then consuming Documents in a streaming fashion anyway, so not
239
+ // worried about this for now.
240
+ }
212
241
213
- // I'm aware this reads the file into memory first, but we have lots of work
214
- // to do on then consuming Documents in a streaming fashion anyway, so not
215
- // worried about this for now.
216
242
const formData = new FormData ( ) ;
217
243
formData . append ( "files" , new Blob ( [ buffer ] ) , fileName ) ;
218
244
formData . append ( "strategy" , this . strategy ) ;
0 commit comments