@@ -229,7 +229,7 @@ def chunk_document_multimodal(self,
229
229
doc_filepath : str ) -> \
230
230
List [object ]:
231
231
"""
232
- Process a pdf document into multimodal chunks (b64 and text) for embeddings
232
+ Process a document file into multimodal chunks (b64 and text) for embeddings
233
233
234
234
Args:
235
235
doc_name: file name of document
@@ -242,14 +242,14 @@ def chunk_document_multimodal(self,
242
242
"""
243
243
Logger .info (f"generating index data for { doc_name } " )
244
244
245
- # Confirm that this is a PDF
245
+ # Confirm that this is a valid file type
246
+ allowed_image_types = ["png" , "jpeg" , "jpg" , "bmp" , "gif" ]
246
247
try :
247
248
doc_extension = doc_name .split ("." )[- 1 ]
248
249
doc_extension = doc_extension .lower ()
249
- if doc_extension != "pdf" :
250
- raise ValueError (f"File { doc_name } must be a PDF" )
250
+ if doc_extension != "pdf" and doc_extension not in allowed_image_types :
251
+ raise ValueError (f"{ doc_name } must be a PDF, PNG, JPG, BMP, or GIF " )
251
252
# TODO: Insert elif statements to check for additional types of
252
- # multimodal docs, such as images (PNG, JPG, BMP, GIF, TIFF, etc),
253
253
# videos (AVI, MP4, MOV, etc), and audio (MP3, WAV, etc)
254
254
except Exception as e :
255
255
Logger .error (f"error reading doc { doc_name } : { e } " )
@@ -279,6 +279,7 @@ def chunk_document_multimodal(self,
279
279
chunk_bucket_folder = f"{ bucket_folder } /{ chunk_bucket_folder } "
280
280
281
281
# If doc is a PDF, convert it to an array of PNGs for each page
282
+ allowed_image_types = ["png" , "jpg" , "jpeg" , "bmp" , "gif" ]
282
283
if doc_extension == "pdf" :
283
284
284
285
with tempfile .TemporaryDirectory () as path :
@@ -290,27 +291,16 @@ def chunk_document_multimodal(self,
290
291
num_pages = len (reader .pages )
291
292
Logger .info (f"Reading pdf doc { doc_name } with { num_pages } pages" )
292
293
for i in range (num_pages ):
293
- # Create a pdf file for the page and chunk into text chunks
294
+ # Create a pdf file for the page and chunk into contextual_text
294
295
pdf_doc = self .create_pdf_page (reader .pages [i ], doc_filepath , i )
295
- #chunk_document returns 2 outputs, text_chunks and embed_chunks.
296
- #Each element of text_chunks has the same info as its corresponding
297
- #element in embed_chunks, but is padded with adjacent sentences
298
- #before and after. Use the 2nd output here (embed_chunks).
299
- _ , embed_chunks = self .chunk_document (pdf_doc ["filename" ],
300
- doc_url , pdf_doc ["filepath" ])
301
- contextual_text = [string .strip () for string in embed_chunks ]
302
- contextual_text = " " .join (contextual_text )
303
- #TODO: Consider all characters in my_contextual_text,
304
- #not just the first 1024
305
- contextual_text = contextual_text [0 :1023 ]
296
+ contextual_text = self .extract_contextual_text (pdf_doc ["filename" ],
297
+ pdf_doc ["filepath" ], doc_url )
306
298
307
299
# Take PNG version of page and convert to b64
308
300
png_doc_filepath = \
309
301
".png" .join (pdf_doc ["filepath" ].rsplit (".pdf" , 1 ))
310
302
png_array [i ].save (png_doc_filepath , format = "png" )
311
- with open (png_doc_filepath , "rb" ) as f :
312
- png_bytes = f .read ()
313
- png_b64 = b64encode (png_bytes ).decode ("utf-8" )
303
+ png_b64 = self .extract_b64 (png_doc_filepath )
314
304
315
305
# Upload to Google Cloud Bucket and return gs URL
316
306
png_url = gcs_helper .upload_to_gcs (self .storage_client ,
@@ -329,9 +319,26 @@ def chunk_document_multimodal(self,
329
319
"text" : contextual_text
330
320
}
331
321
doc_chunks .append (chunk_obj )
322
+ elif doc_extension in allowed_image_types :
323
+ # TODO: Convert image file into something text readable (pdf, html, ext)
324
+ # So that we can extract text chunks
325
+
326
+ # Get text associated with the document
327
+ contextual_text = self .extract_contextual_text (doc_name ,
328
+ doc_filepath , doc_url )
329
+
330
+ # Get b64 for the document
331
+ image_b64 = self .extract_b64 (doc_filepath )
332
+
333
+ # Push chunk object into chunk array
334
+ chunk_obj = {
335
+ "image" : image_b64 ,
336
+ "image_url" : doc_url ,
337
+ "text" : contextual_text
338
+ }
339
+ doc_chunks .append (chunk_obj )
332
340
333
341
# TODO: Insert elif statements to chunk additional types of
334
- # multimodal docs, such as images (PNG, JPG, BMP, GIF, TIFF, etc),
335
342
# videos (AVI, MP4, MOV, etc), and audio (MP3, WAV, etc)
336
343
# - For images, set "image" and "text" fields of chunk_obj
337
344
# - For video and audio, set "timestamp_start" and "timestamp_stop"
@@ -344,6 +351,52 @@ def chunk_document_multimodal(self,
344
351
# Return array of page data
345
352
return doc_chunks
346
353
354
def extract_contextual_text(self, doc_name: str, doc_filepath: str,
                            doc_url: str) -> str:
    """
    Extract the contextual text for a multimodal document.

    Args:
        doc_name: file name of the doc we are reading the data from
        doc_filepath: string filepath of the doc we are reading the data from
        doc_url: the url of the doc we are reading the data from
    Returns:
        str containing the contextual_text of a multimodal doc, or None
        when chunk_document produces no text chunks
    """
    # chunk_document returns 2 outputs, text_chunks and embed_chunks.
    # Each element of text_chunks has the same info as its corresponding
    # element in embed_chunks, but is padded with adjacent sentences
    # before and after. Use the 2nd output here.
    _, contextual_text = self.chunk_document(doc_name,
                                             doc_url, doc_filepath)

    # Format text if not None; otherwise pass the None through unchanged
    if contextual_text is not None:
        contextual_text = " ".join(chunk.strip() for chunk in contextual_text)

        # TODO: Consider all characters of the contextual text,
        # not just the first 1024.
        # NOTE(fix): the original slice [0:1023] kept only 1023 characters,
        # one short of the 1024 the comment intends.
        contextual_text = contextual_text[:1024]

    return contextual_text
384
+
385
def extract_b64(self, doc_filepath: str) -> str:
    """
    Extract the base64 encoding of a multimodal document.

    Args:
        doc_filepath: string filepath of the doc we are reading the data from
    Returns:
        str containing b64 of the doc
    """
    # Read the raw bytes of the doc and encode them as a UTF-8 b64 string
    with open(doc_filepath, "rb") as doc_file:
        raw_bytes = doc_file.read()
    return b64encode(raw_bytes).decode("utf-8")
399
+
347
400
@classmethod
348
401
def text_to_sentence_list (cls , text : str ) -> List [str ]:
349
402
"""
0 commit comments