|
| 1 | +import { omit } from 'lodash' |
| 2 | +import { ICommonObject, INode, INodeData, INodeOptionsValue, INodeParams } from '../../../src/Interface' |
| 3 | +import { getCredentialData, getCredentialParam } from '../../../src/utils' |
| 4 | +import { S3Client, GetObjectCommand, S3ClientConfig, ListObjectsV2Command, ListObjectsV2Output } from '@aws-sdk/client-s3' |
| 5 | +import { getRegions, MODEL_TYPE } from '../../../src/modelLoader' |
| 6 | +import { Readable } from 'node:stream' |
| 7 | +import * as fsDefault from 'node:fs' |
| 8 | +import * as path from 'node:path' |
| 9 | +import * as os from 'node:os' |
| 10 | + |
| 11 | +import { DirectoryLoader } from 'langchain/document_loaders/fs/directory' |
| 12 | +import { JSONLoader } from 'langchain/document_loaders/fs/json' |
| 13 | +import { CSVLoader } from 'langchain/document_loaders/fs/csv' |
| 14 | +import { PDFLoader } from 'langchain/document_loaders/fs/pdf' |
| 15 | +import { DocxLoader } from 'langchain/document_loaders/fs/docx' |
| 16 | +import { TextLoader } from 'langchain/document_loaders/fs/text' |
| 17 | +import { TextSplitter } from 'langchain/text_splitter' |
| 18 | + |
| 19 | +class S3_DocumentLoaders implements INode { |
| 20 | + label: string |
| 21 | + name: string |
| 22 | + version: number |
| 23 | + description: string |
| 24 | + type: string |
| 25 | + icon: string |
| 26 | + category: string |
| 27 | + baseClasses: string[] |
| 28 | + credential: INodeParams |
| 29 | + inputs?: INodeParams[] |
| 30 | + |
| 31 | + constructor() { |
| 32 | + this.label = 'S3 Directory' |
| 33 | + this.name = 's3Directory' |
| 34 | + this.version = 3.0 |
| 35 | + this.type = 'Document' |
| 36 | + this.icon = 's3.svg' |
| 37 | + this.category = 'Document Loaders' |
| 38 | + this.description = 'Load Data from S3 Buckets' |
| 39 | + this.baseClasses = [this.type] |
| 40 | + this.credential = { |
| 41 | + label: 'Credential', |
| 42 | + name: 'credential', |
| 43 | + type: 'credential', |
| 44 | + credentialNames: ['awsApi'], |
| 45 | + optional: true |
| 46 | + } |
| 47 | + this.inputs = [ |
| 48 | + { |
| 49 | + label: 'Text Splitter', |
| 50 | + name: 'textSplitter', |
| 51 | + type: 'TextSplitter', |
| 52 | + optional: true |
| 53 | + }, |
| 54 | + { |
| 55 | + label: 'Bucket', |
| 56 | + name: 'bucketName', |
| 57 | + type: 'string' |
| 58 | + }, |
| 59 | + { |
| 60 | + label: 'Region', |
| 61 | + name: 'region', |
| 62 | + type: 'asyncOptions', |
| 63 | + loadMethod: 'listRegions', |
| 64 | + default: 'us-east-1' |
| 65 | + }, |
| 66 | + { |
| 67 | + label: 'Server URL', |
| 68 | + name: 'serverUrl', |
| 69 | + description: |
| 70 | + 'The fully qualified endpoint of the webservice. This is only for using a custom endpoint (for example, when using a local version of S3).', |
| 71 | + type: 'string', |
| 72 | + optional: true |
| 73 | + }, |
| 74 | + { |
| 75 | + label: 'Prefix', |
| 76 | + name: 'prefix', |
| 77 | + type: 'string', |
| 78 | + description: 'Limits the response to keys that begin with the specified prefix', |
| 79 | + placeholder: 'TestFolder/Something', |
| 80 | + optional: true |
| 81 | + }, |
| 82 | + { |
| 83 | + label: 'Pdf Usage', |
| 84 | + name: 'pdfUsage', |
| 85 | + type: 'options', |
| 86 | + options: [ |
| 87 | + { |
| 88 | + label: 'One document per page', |
| 89 | + name: 'perPage' |
| 90 | + }, |
| 91 | + { |
| 92 | + label: 'One document per file', |
| 93 | + name: 'perFile' |
| 94 | + } |
| 95 | + ], |
| 96 | + default: 'perPage', |
| 97 | + optional: true, |
| 98 | + additionalParams: true |
| 99 | + }, |
| 100 | + { |
| 101 | + label: 'Additional Metadata', |
| 102 | + name: 'metadata', |
| 103 | + type: 'json', |
| 104 | + description: 'Additional metadata to be added to the extracted documents', |
| 105 | + optional: true, |
| 106 | + additionalParams: true |
| 107 | + }, |
| 108 | + { |
| 109 | + label: 'Omit Metadata Keys', |
| 110 | + name: 'omitMetadataKeys', |
| 111 | + type: 'string', |
| 112 | + rows: 4, |
| 113 | + description: |
| 114 | + 'Each document loader comes with a default set of metadata keys that are extracted from the document. You can use this field to omit some of the default metadata keys. The value should be a list of keys, seperated by comma. Use * to omit all metadata keys execept the ones you specify in the Additional Metadata field', |
| 115 | + placeholder: 'key1, key2, key3.nestedKey1', |
| 116 | + optional: true, |
| 117 | + additionalParams: true |
| 118 | + } |
| 119 | + ] |
| 120 | + } |
| 121 | + |
| 122 | + loadMethods = { |
| 123 | + async listRegions(): Promise<INodeOptionsValue[]> { |
| 124 | + return await getRegions(MODEL_TYPE.CHAT, 'awsChatBedrock') |
| 125 | + } |
| 126 | + } |
| 127 | + |
| 128 | + async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> { |
| 129 | + const textSplitter = nodeData.inputs?.textSplitter as TextSplitter |
| 130 | + const bucketName = nodeData.inputs?.bucketName as string |
| 131 | + const prefix = nodeData.inputs?.prefix as string |
| 132 | + const region = nodeData.inputs?.region as string |
| 133 | + const serverUrl = nodeData.inputs?.serverUrl as string |
| 134 | + const pdfUsage = nodeData.inputs?.pdfUsage |
| 135 | + const metadata = nodeData.inputs?.metadata |
| 136 | + const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string |
| 137 | + |
| 138 | + let omitMetadataKeys: string[] = [] |
| 139 | + if (_omitMetadataKeys) { |
| 140 | + omitMetadataKeys = _omitMetadataKeys.split(',').map((key) => key.trim()) |
| 141 | + } |
| 142 | + |
| 143 | + let credentials: S3ClientConfig['credentials'] | undefined |
| 144 | + |
| 145 | + if (nodeData.credential) { |
| 146 | + const credentialData = await getCredentialData(nodeData.credential, options) |
| 147 | + const accessKeyId = getCredentialParam('awsKey', credentialData, nodeData) |
| 148 | + const secretAccessKey = getCredentialParam('awsSecret', credentialData, nodeData) |
| 149 | + |
| 150 | + if (accessKeyId && secretAccessKey) { |
| 151 | + credentials = { |
| 152 | + accessKeyId, |
| 153 | + secretAccessKey |
| 154 | + } |
| 155 | + } |
| 156 | + } |
| 157 | + |
| 158 | + let s3Config: S3ClientConfig = { |
| 159 | + region: region, |
| 160 | + credentials: credentials |
| 161 | + } |
| 162 | + |
| 163 | + if (serverUrl) { |
| 164 | + s3Config = { |
| 165 | + region: region, |
| 166 | + credentials: credentials, |
| 167 | + endpoint: serverUrl, |
| 168 | + forcePathStyle: true |
| 169 | + } |
| 170 | + } |
| 171 | + |
| 172 | + const tempDir = fsDefault.mkdtempSync(path.join(os.tmpdir(), 's3fileloader-')) |
| 173 | + |
| 174 | + try { |
| 175 | + const s3Client = new S3Client(s3Config) |
| 176 | + |
| 177 | + const listObjectsOutput: ListObjectsV2Output = await s3Client.send( |
| 178 | + new ListObjectsV2Command({ |
| 179 | + Bucket: bucketName, |
| 180 | + Prefix: prefix |
| 181 | + }) |
| 182 | + ) |
| 183 | + |
| 184 | + const keys: string[] = (listObjectsOutput?.Contents ?? []).filter((item) => item.Key && item.ETag).map((item) => item.Key!) |
| 185 | + |
| 186 | + await Promise.all( |
| 187 | + keys.map(async (key) => { |
| 188 | + const filePath = path.join(tempDir, key) |
| 189 | + try { |
| 190 | + const response = await s3Client.send( |
| 191 | + new GetObjectCommand({ |
| 192 | + Bucket: bucketName, |
| 193 | + Key: key |
| 194 | + }) |
| 195 | + ) |
| 196 | + |
| 197 | + const objectData = await new Promise<Buffer>((resolve, reject) => { |
| 198 | + const chunks: Buffer[] = [] |
| 199 | + |
| 200 | + if (response.Body instanceof Readable) { |
| 201 | + response.Body.on('data', (chunk: Buffer) => chunks.push(chunk)) |
| 202 | + response.Body.on('end', () => resolve(Buffer.concat(chunks))) |
| 203 | + response.Body.on('error', reject) |
| 204 | + } else { |
| 205 | + reject(new Error('Response body is not a readable stream.')) |
| 206 | + } |
| 207 | + }) |
| 208 | + |
| 209 | + // create the directory if it doesnt already exist |
| 210 | + fsDefault.mkdirSync(path.dirname(filePath), { recursive: true }) |
| 211 | + |
| 212 | + // write the file to the directory |
| 213 | + fsDefault.writeFileSync(filePath, objectData) |
| 214 | + } catch (e: any) { |
| 215 | + throw new Error(`Failed to download file ${key} from S3 bucket ${bucketName}: ${e.message}`) |
| 216 | + } |
| 217 | + }) |
| 218 | + ) |
| 219 | + |
| 220 | + const loader = new DirectoryLoader( |
| 221 | + tempDir, |
| 222 | + { |
| 223 | + '.json': (path) => new JSONLoader(path), |
| 224 | + '.txt': (path) => new TextLoader(path), |
| 225 | + '.csv': (path) => new CSVLoader(path), |
| 226 | + '.docx': (path) => new DocxLoader(path), |
| 227 | + '.pdf': (path) => |
| 228 | + pdfUsage === 'perFile' |
| 229 | + ? // @ts-ignore |
| 230 | + new PDFLoader(path, { splitPages: false, pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }) |
| 231 | + : // @ts-ignore |
| 232 | + new PDFLoader(path, { pdfjs: () => import('pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js') }), |
| 233 | + '.aspx': (path) => new TextLoader(path), |
| 234 | + '.asp': (path) => new TextLoader(path), |
| 235 | + '.cpp': (path) => new TextLoader(path), // C++ |
| 236 | + '.c': (path) => new TextLoader(path), |
| 237 | + '.cs': (path) => new TextLoader(path), |
| 238 | + '.css': (path) => new TextLoader(path), |
| 239 | + '.go': (path) => new TextLoader(path), // Go |
| 240 | + '.h': (path) => new TextLoader(path), // C++ Header files |
| 241 | + '.kt': (path) => new TextLoader(path), // Kotlin |
| 242 | + '.java': (path) => new TextLoader(path), // Java |
| 243 | + '.js': (path) => new TextLoader(path), // JavaScript |
| 244 | + '.less': (path) => new TextLoader(path), // Less files |
| 245 | + '.ts': (path) => new TextLoader(path), // TypeScript |
| 246 | + '.php': (path) => new TextLoader(path), // PHP |
| 247 | + '.proto': (path) => new TextLoader(path), // Protocol Buffers |
| 248 | + '.python': (path) => new TextLoader(path), // Python |
| 249 | + '.py': (path) => new TextLoader(path), // Python |
| 250 | + '.rst': (path) => new TextLoader(path), // reStructuredText |
| 251 | + '.ruby': (path) => new TextLoader(path), // Ruby |
| 252 | + '.rb': (path) => new TextLoader(path), // Ruby |
| 253 | + '.rs': (path) => new TextLoader(path), // Rust |
| 254 | + '.scala': (path) => new TextLoader(path), // Scala |
| 255 | + '.sc': (path) => new TextLoader(path), // Scala |
| 256 | + '.scss': (path) => new TextLoader(path), // Sass |
| 257 | + '.sol': (path) => new TextLoader(path), // Solidity |
| 258 | + '.sql': (path) => new TextLoader(path), //SQL |
| 259 | + '.swift': (path) => new TextLoader(path), // Swift |
| 260 | + '.markdown': (path) => new TextLoader(path), // Markdown |
| 261 | + '.md': (path) => new TextLoader(path), // Markdown |
| 262 | + '.tex': (path) => new TextLoader(path), // LaTeX |
| 263 | + '.ltx': (path) => new TextLoader(path), // LaTeX |
| 264 | + '.html': (path) => new TextLoader(path), // HTML |
| 265 | + '.vb': (path) => new TextLoader(path), // Visual Basic |
| 266 | + '.xml': (path) => new TextLoader(path) // XML |
| 267 | + }, |
| 268 | + true |
| 269 | + ) |
| 270 | + |
| 271 | + let docs = [] |
| 272 | + |
| 273 | + if (textSplitter) { |
| 274 | + docs = await loader.loadAndSplit(textSplitter) |
| 275 | + } else { |
| 276 | + docs = await loader.load() |
| 277 | + } |
| 278 | + |
| 279 | + if (metadata) { |
| 280 | + const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata) |
| 281 | + docs = docs.map((doc) => ({ |
| 282 | + ...doc, |
| 283 | + metadata: |
| 284 | + _omitMetadataKeys === '*' |
| 285 | + ? { |
| 286 | + ...parsedMetadata |
| 287 | + } |
| 288 | + : omit( |
| 289 | + { |
| 290 | + ...doc.metadata, |
| 291 | + ...parsedMetadata |
| 292 | + }, |
| 293 | + omitMetadataKeys |
| 294 | + ) |
| 295 | + })) |
| 296 | + } else { |
| 297 | + docs = docs.map((doc) => ({ |
| 298 | + ...doc, |
| 299 | + metadata: |
| 300 | + _omitMetadataKeys === '*' |
| 301 | + ? {} |
| 302 | + : omit( |
| 303 | + { |
| 304 | + ...doc.metadata |
| 305 | + }, |
| 306 | + omitMetadataKeys |
| 307 | + ) |
| 308 | + })) |
| 309 | + } |
| 310 | + |
| 311 | + // remove the temp directory before returning docs |
| 312 | + fsDefault.rmSync(tempDir, { recursive: true }) |
| 313 | + |
| 314 | + return docs |
| 315 | + } catch (e: any) { |
| 316 | + fsDefault.rmSync(tempDir, { recursive: true }) |
| 317 | + throw new Error(`Failed to load data from bucket ${bucketName}: ${e.message}`) |
| 318 | + } |
| 319 | + } |
| 320 | +} |
| 321 | +module.exports = { nodeClass: S3_DocumentLoaders } |
0 commit comments