Skip to content

Commit 34d0e43

Browse files
slaplante-raftScott LaplanteHenryHengZJ
authored
S3 Directory Document Loading Component (#2818)
* Add new S3Directory component * Add Additional Metadata and Omit Metadata Keys parameters * Update S3Directory.ts add placeholder for prefix --------- Co-authored-by: Scott Laplante <[email protected]> Co-authored-by: Henry Heng <[email protected]>
1 parent 9b0b383 commit 34d0e43

File tree

2 files changed

+326
-0
lines changed

2 files changed

+326
-0
lines changed
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
import { omit } from 'lodash'
2+
import { ICommonObject, INode, INodeData, INodeOptionsValue, INodeParams } from '../../../src/Interface'
3+
import { getCredentialData, getCredentialParam } from '../../../src/utils'
4+
import { S3Client, GetObjectCommand, S3ClientConfig, ListObjectsV2Command, ListObjectsV2Output } from '@aws-sdk/client-s3'
5+
import { getRegions, MODEL_TYPE } from '../../../src/modelLoader'
6+
import { Readable } from 'node:stream'
7+
import * as fsDefault from 'node:fs'
8+
import * as path from 'node:path'
9+
import * as os from 'node:os'
10+
11+
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory'
12+
import { JSONLoader } from 'langchain/document_loaders/fs/json'
13+
import { CSVLoader } from 'langchain/document_loaders/fs/csv'
14+
import { PDFLoader } from 'langchain/document_loaders/fs/pdf'
15+
import { DocxLoader } from 'langchain/document_loaders/fs/docx'
16+
import { TextLoader } from 'langchain/document_loaders/fs/text'
17+
import { TextSplitter } from 'langchain/text_splitter'
18+
19+
class S3_DocumentLoaders implements INode {
20+
label: string
21+
name: string
22+
version: number
23+
description: string
24+
type: string
25+
icon: string
26+
category: string
27+
baseClasses: string[]
28+
credential: INodeParams
29+
inputs?: INodeParams[]
30+
31+
constructor() {
32+
this.label = 'S3 Directory'
33+
this.name = 's3Directory'
34+
this.version = 3.0
35+
this.type = 'Document'
36+
this.icon = 's3.svg'
37+
this.category = 'Document Loaders'
38+
this.description = 'Load Data from S3 Buckets'
39+
this.baseClasses = [this.type]
40+
this.credential = {
41+
label: 'Credential',
42+
name: 'credential',
43+
type: 'credential',
44+
credentialNames: ['awsApi'],
45+
optional: true
46+
}
47+
this.inputs = [
48+
{
49+
label: 'Text Splitter',
50+
name: 'textSplitter',
51+
type: 'TextSplitter',
52+
optional: true
53+
},
54+
{
55+
label: 'Bucket',
56+
name: 'bucketName',
57+
type: 'string'
58+
},
59+
{
60+
label: 'Region',
61+
name: 'region',
62+
type: 'asyncOptions',
63+
loadMethod: 'listRegions',
64+
default: 'us-east-1'
65+
},
66+
{
67+
label: 'Server URL',
68+
name: 'serverUrl',
69+
description:
70+
'The fully qualified endpoint of the webservice. This is only for using a custom endpoint (for example, when using a local version of S3).',
71+
type: 'string',
72+
optional: true
73+
},
74+
{
75+
label: 'Prefix',
76+
name: 'prefix',
77+
type: 'string',
78+
description: 'Limits the response to keys that begin with the specified prefix',
79+
placeholder: 'TestFolder/Something',
80+
optional: true
81+
},
82+
{
83+
label: 'Pdf Usage',
84+
name: 'pdfUsage',
85+
type: 'options',
86+
options: [
87+
{
88+
label: 'One document per page',
89+
name: 'perPage'
90+
},
91+
{
92+
label: 'One document per file',
93+
name: 'perFile'
94+
}
95+
],
96+
default: 'perPage',
97+
optional: true,
98+
additionalParams: true
99+
},
100+
{
101+
label: 'Additional Metadata',
102+
name: 'metadata',
103+
type: 'json',
104+
description: 'Additional metadata to be added to the extracted documents',
105+
optional: true,
106+
additionalParams: true
107+
},
108+
{
109+
label: 'Omit Metadata Keys',
110+
name: 'omitMetadataKeys',
111+
type: 'string',
112+
rows: 4,
113+
description:
114+
'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field',
115+
placeholder: 'key1, key2, key3.nestedKey1',
116+
optional: true,
117+
additionalParams: true
118+
}
119+
]
120+
}
121+
122+
loadMethods = {
123+
async listRegions(): Promise<INodeOptionsValue[]> {
124+
return await getRegions(MODEL_TYPE.CHAT, 'awsChatBedrock')
125+
}
126+
}
127+
128+
async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
129+
const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
130+
const bucketName = nodeData.inputs?.bucketName as string
131+
const prefix = nodeData.inputs?.prefix as string
132+
const region = nodeData.inputs?.region as string
133+
const serverUrl = nodeData.inputs?.serverUrl as string
134+
const pdfUsage = nodeData.inputs?.pdfUsage
135+
const metadata = nodeData.inputs?.metadata
136+
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
137+
138+
let omitMetadataKeys: string[] = []
139+
if (_omitMetadataKeys) {
140+
omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim())
141+
}
142+
143+
let credentials: S3ClientConfig['credentials'] | undefined
144+
145+
if (nodeData.credential) {
146+
const credentialData = await getCredentialData(nodeData.credential, options)
147+
const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData)
148+
const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData)
149+
150+
if (accessKeyId && secretAccessKey) {
151+
credentials = {
152+
accessKeyId,
153+
secretAccessKey
154+
}
155+
}
156+
}
157+
158+
let s3Config: S3ClientConfig = {
159+
region: region,
160+
credentials: credentials
161+
}
162+
163+
if (serverUrl) {
164+
s3Config = {
165+
region: region,
166+
credentials: credentials,
167+
endpoint: serverUrl,
168+
forcePathStyle: true
169+
}
170+
}
171+
172+
const tempDir = fsDefault.mkdtempSync(path.join(os.tmpdir(), 's3fileloader-'))
173+
174+
try {
175+
const s3Client = new S3Client(s3Config)
176+
177+
const listObjectsOutput: ListObjectsV2Output = await s3Client.send(
178+
new ListObjectsV2Command({
179+
Bucket: bucketName,
180+
Prefix: prefix
181+
})
182+
)
183+
184+
const keys: string[] = (listObjectsOutput?.Contents ?? []).filter((item) => item.Key && item.ETag).map((item) => item.Key!)
185+
186+
await Promise.all(
187+
keys.map(async (key) => {
188+
const filePath = path.join(tempDir, key)
189+
try {
190+
const response = await s3Client.send(
191+
new GetObjectCommand({
192+
Bucket: bucketName,
193+
Key: key
194+
})
195+
)
196+
197+
const objectData = await new Promise<Buffer>((resolve, reject) => {
198+
const chunks: Buffer[] = []
199+
200+
if (response.Body instanceof Readable) {
201+
response.Body.on('data', (chunk: Buffer) => chunks.push(chunk))
202+
response.Body.on('end', () => resolve(Buffer.concat(chunks)))
203+
response.Body.on('error', reject)
204+
} else {
205+
reject(new Error('Response body is not a readable stream.'))
206+
}
207+
})
208+
209+
// create the directory if it doesnt already exist
210+
fsDefault.mkdirSync(path.dirname(filePath), { recursive: true })
211+
212+
// write the file to the directory
213+
fsDefault.writeFileSync(filePath, objectData)
214+
} catch (e: any) {
215+
throw new Error(`Failed to download file ${key} from S3 bucket ${bucketName}: ${e.message}`)
216+
}
217+
})
218+
)
219+
220+
const loader = new DirectoryLoader(
221+
tempDir,
222+
{
223+
'.json': (path) => new JSONLoader(path),
224+
'.txt': (path) => new TextLoader(path),
225+
'.csv': (path) => new CSVLoader(path),
226+
'.docx': (path) => new DocxLoader(path),
227+
'.pdf': (path) =>
228+
pdfUsage === 'perFile'
229+
? // @ts-ignore
230+
new PDFLoader(path, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') })
231+
: // @ts-ignore
232+
new PDFLoader(path, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }),
233+
'.aspx': (path) => new TextLoader(path),
234+
'.asp': (path) => new TextLoader(path),
235+
'.cpp': (path) => new TextLoader(path), // C++
236+
'.c': (path) => new TextLoader(path),
237+
'.cs': (path) => new TextLoader(path),
238+
'.css': (path) => new TextLoader(path),
239+
'.go': (path) => new TextLoader(path), // Go
240+
'.h': (path) => new TextLoader(path), // C++ Header files
241+
'.kt': (path) => new TextLoader(path), // Kotlin
242+
'.java': (path) => new TextLoader(path), // Java
243+
'.js': (path) => new TextLoader(path), // JavaScript
244+
'.less': (path) => new TextLoader(path), // Less files
245+
'.ts': (path) => new TextLoader(path), // TypeScript
246+
'.php': (path) => new TextLoader(path), // PHP
247+
'.proto': (path) => new TextLoader(path), // Protocol Buffers
248+
'.python': (path) => new TextLoader(path), // Python
249+
'.py': (path) => new TextLoader(path), // Python
250+
'.rst': (path) => new TextLoader(path), // reStructuredText
251+
'.ruby': (path) => new TextLoader(path), // Ruby
252+
'.rb': (path) => new TextLoader(path), // Ruby
253+
'.rs': (path) => new TextLoader(path), // Rust
254+
'.scala': (path) => new TextLoader(path), // Scala
255+
'.sc': (path) => new TextLoader(path), // Scala
256+
'.scss': (path) => new TextLoader(path), // Sass
257+
'.sol': (path) => new TextLoader(path), // Solidity
258+
'.sql': (path) => new TextLoader(path), //SQL
259+
'.swift': (path) => new TextLoader(path), // Swift
260+
'.markdown': (path) => new TextLoader(path), // Markdown
261+
'.md': (path) => new TextLoader(path), // Markdown
262+
'.tex': (path) => new TextLoader(path), // LaTeX
263+
'.ltx': (path) => new TextLoader(path), // LaTeX
264+
'.html': (path) => new TextLoader(path), // HTML
265+
'.vb': (path) => new TextLoader(path), // Visual Basic
266+
'.xml': (path) => new TextLoader(path) // XML
267+
},
268+
true
269+
)
270+
271+
let docs = []
272+
273+
if (textSplitter) {
274+
docs = await loader.loadAndSplit(textSplitter)
275+
} else {
276+
docs = await loader.load()
277+
}
278+
279+
if (metadata) {
280+
const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
281+
docs = docs.map((doc) => ({
282+
...doc,
283+
metadata:
284+
_omitMetadataKeys === '*'
285+
? {
286+
...parsedMetadata
287+
}
288+
: omit(
289+
{
290+
...doc.metadata,
291+
...parsedMetadata
292+
},
293+
omitMetadataKeys
294+
)
295+
}))
296+
} else {
297+
docs = docs.map((doc) => ({
298+
...doc,
299+
metadata:
300+
_omitMetadataKeys === '*'
301+
? {}
302+
: omit(
303+
{
304+
...doc.metadata
305+
},
306+
omitMetadataKeys
307+
)
308+
}))
309+
}
310+
311+
// remove the temp directory before returning docs
312+
fsDefault.rmSync(tempDir, { recursive: true })
313+
314+
return docs
315+
} catch (e: any) {
316+
fsDefault.rmSync(tempDir, { recursive: true })
317+
throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`)
318+
}
319+
}
320+
}
321+
module.exports = { nodeClass: S3_DocumentLoaders }
Lines changed: 5 additions & 0 deletions
Loading

0 commit comments

Comments
 (0)