Commit ed2cf2a

Merge pull request #43 from apigeek3000/627-chunk-all-images
627: add image types to chunker
2 parents: 0acdbb0 + 56808a0 · commit ed2cf2a

2 files changed: +78 −25 lines

components/llm_service/notebooks/Chunking_Test.ipynb (+4 −4)

@@ -59,10 +59,10 @@
     "slide_chunks = data_source.chunk_document_multi(doc_name, index_doc_url, doc_filepath)\n",
     "for slide in slide_chunks:\n",
     "    print(f\"image_url: {slide['image_url']}\")\n",
-    "    if slide[\"image_b64\"]:\n",
-    "        print(f\"image_b64: {True}\")\n",
-    "    if slide[\"text_chunks\"]:\n",
-    "        print(f\"text_chunks: {True}\")\n",
+    "    if slide[\"image\"]:\n",
+    "        print(f\"image: {True}\")\n",
+    "    if slide[\"text\"]:\n",
+    "        print(f\"text: {True}\")\n",
     "    # image_bytes = base64.b64decode(slide[\"image_b64\"])\n",
     "    # with open(\"image.png\", \"wb\") as f:\n",
     "    #     f.write(image_bytes)\n",

components/llm_service/src/services/query/data_source.py (+74 −21)
@@ -229,7 +229,7 @@ def chunk_document_multimodal(self,
                                 doc_filepath: str) -> \
                                 List[object]:
     """
-    Process a pdf document into multimodal chunks (b64 and text) for embeddings
+    Process a file document into multimodal chunks (b64 and text) for embeddings
 
     Args:
       doc_name: file name of document
@@ -242,14 +242,14 @@ def chunk_document_multimodal(self,
     """
     Logger.info(f"generating index data for {doc_name}")
 
-    # Confirm that this is a PDF
+    # Confirm that this is a valid file type
+    allowed_image_types = ["png", "jpeg", "jpg", "bmp", "gif"]
     try:
       doc_extension = doc_name.split(".")[-1]
       doc_extension = doc_extension.lower()
-      if doc_extension != "pdf":
-        raise ValueError(f"File {doc_name} must be a PDF")
+      if doc_extension != "pdf" and doc_extension not in allowed_image_types:
+        raise ValueError(f"{doc_name} must be a PDF, PNG, JPG, BMP, or GIF")
       # TODO: Insert elif statements to check for additional types of
-      # multimodal docs, such as images (PNG, JPG, BMP, GIF, TIFF, etc),
       # videos (AVI, MP4, MOV, etc), and audio (MP3, WAV, etc)
     except Exception as e:
       Logger.error(f"error reading doc {doc_name}: {e}")
@@ -279,6 +279,7 @@ def chunk_document_multimodal(self,
     chunk_bucket_folder = f"{bucket_folder}/{chunk_bucket_folder}"
 
     # If doc is a PDF, convert it to an array of PNGs for each page
+    allowed_image_types = ["png", "jpg", "jpeg", "bmp", "gif"]
     if doc_extension == "pdf":
 
       with tempfile.TemporaryDirectory() as path:
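
Aside: `allowed_image_types` is now spelled out twice, here and in the validation block above, with the same members in a different order. A single module-level constant would keep the two lists from drifting, e.g.:

```python
# Hypothetical module-level constant that both the validation block and
# the chunking branch could share instead of separate inline lists
ALLOWED_IMAGE_TYPES = ("png", "jpg", "jpeg", "bmp", "gif")
```
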
@@ -290,27 +291,16 @@ def chunk_document_multimodal(self,
         num_pages = len(reader.pages)
         Logger.info(f"Reading pdf doc {doc_name} with {num_pages} pages")
         for i in range(num_pages):
-          # Create a pdf file for the page and chunk into text chunks
+          # Create a pdf file for the page and chunk into contextual_text
           pdf_doc = self.create_pdf_page(reader.pages[i], doc_filepath, i)
-          #chunk_document returns 2 outputs, text_chunks and embed_chunks.
-          #Each element of text_chunks has the same info as its corresponding
-          #element in embed_chunks, but is padded with adjacent sentences
-          #before and after. Use the 2nd output here (embed_chunks).
-          _, embed_chunks = self.chunk_document(pdf_doc["filename"],
-                                                doc_url, pdf_doc["filepath"])
-          contextual_text = [string.strip() for string in embed_chunks]
-          contextual_text = " ".join(contextual_text)
-          #TODO: Consider all characters in my_contextual_text,
-          #not just the first 1024
-          contextual_text = contextual_text[0:1023]
+          contextual_text = self.extract_contextual_text(pdf_doc["filename"],
+                                        pdf_doc["filepath"], doc_url)
 
           # Take PNG version of page and convert to b64
           png_doc_filepath = \
             ".png".join(pdf_doc["filepath"].rsplit(".pdf", 1))
           png_array[i].save(png_doc_filepath, format="png")
-          with open(png_doc_filepath, "rb") as f:
-            png_bytes = f.read()
-          png_b64 = b64encode(png_bytes).decode("utf-8")
+          png_b64 = self.extract_b64(png_doc_filepath)
 
           # Upload to Google Cloud Bucket and return gs URL
           png_url = gcs_helper.upload_to_gcs(self.storage_client,
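
For orientation, the per-page flow the context lines above wrap (render page to PNG, save, base64-encode) looks roughly like this on its own. The rendering library is an assumption: the diff only indexes a prebuilt `png_array` of PIL-style images, which matches `pdf2image` output, and `pdf_pages_to_b64` is a hypothetical name.

```python
import tempfile
from base64 import b64encode

from pdf2image import convert_from_path  # assumed source of png_array

def pdf_pages_to_b64(doc_filepath: str) -> list:
  """Render each page of a PDF to a PNG and return base64 strings."""
  pages_b64 = []
  with tempfile.TemporaryDirectory() as path:
    # One PIL image per page, written under the temp dir
    png_array = convert_from_path(doc_filepath, output_folder=path)
    for i, page_image in enumerate(png_array):
      png_path = f"{path}/page_{i}.png"
      page_image.save(png_path, format="png")
      with open(png_path, "rb") as f:
        pages_b64.append(b64encode(f.read()).decode("utf-8"))
  return pages_b64
```
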
@@ -329,9 +319,26 @@ def chunk_document_multimodal(self,
             "text": contextual_text
           }
           doc_chunks.append(chunk_obj)
+    elif doc_extension in allowed_image_types:
+      # TODO: Convert image file into something text readable (pdf, html, etc)
+      # so that we can extract text chunks
+
+      # Get text associated with the document
+      contextual_text = self.extract_contextual_text(doc_name,
+                                        doc_filepath, doc_url)
+
+      # Get b64 for the document
+      image_b64 = self.extract_b64(doc_filepath)
+
+      # Push chunk object into chunk array
+      chunk_obj = {
+        "image": image_b64,
+        "image_url": doc_url,
+        "text": contextual_text
+      }
+      doc_chunks.append(chunk_obj)
 
     # TODO: Insert elif statements to chunk additional types of
-    # multimodal docs, such as images (PNG, JPG, BMP, GIF, TIFF, etc),
     # videos (AVI, MP4, MOV, etc), and audio (MP3, WAV, etc)
     # - For images, set "image" and "text" fields of chunk_obj
     # - For video and audio, set "timestamp_start" and "timestamp_stop"
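
With the new branch, PDF pages and standalone images emit chunks of the same shape, so downstream consumers (like the notebook cell above) can treat both uniformly. A sketch of that shape; the field names come from the diff, while the `TypedDict` wrapper is illustrative:

```python
from typing import Optional, TypedDict

class MultimodalChunk(TypedDict):
  image: str            # base64-encoded image bytes (PNG/JPG/BMP/GIF)
  image_url: str        # gs:// URL of the uploaded image
  text: Optional[str]   # contextual text, truncated to ~1 KB, or None
```
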
@@ -344,6 +351,52 @@ def chunk_document_multimodal(self,
     # Return array of page data
     return doc_chunks
 
+  def extract_contextual_text(self, doc_name: str, doc_filepath: str,
+                              doc_url: str) -> str:
+    """
+    Extract the contextual text for a multimodal document
+
+    Args:
+      doc_name: The name of the doc we are reading the data from
+      doc_filepath: string filepath of the doc we are reading the data from
+      doc_url: The url of the doc we are reading the data from
+    Returns:
+      str containing the contextual_text of a multimodal doc
+    """
+    # chunk_document returns 2 outputs, text_chunks and contextual_text.
+    # Each element of text_chunks has the same info as its corresponding
+    # element in contextual_text, but is padded with adjacent sentences
+    # before and after. Use the 2nd output here (contextual_text).
+    _, contextual_text = self.chunk_document(doc_name,
+                                             doc_url, doc_filepath)
+
+    # Format text if not None
+    if contextual_text is not None:
+      contextual_text = [string.strip() for string in contextual_text]
+      contextual_text = " ".join(contextual_text)
+
+      # TODO: Consider all characters in contextual_text,
+      # not just the first 1024
+      contextual_text = contextual_text[0:1023]
+
+    return contextual_text
+
+  def extract_b64(self, doc_filepath: str) -> str:
+    """
+    Extract the b64 of a multimodal document
+
+    Args:
+      doc_filepath: string filepath of the doc we are reading the data from
+    Returns:
+      str containing the b64 of the doc
+    """
+    # Take the doc and convert it to b64
+    with open(doc_filepath, "rb") as f:
+      doc_bytes = f.read()
+    doc_b64 = b64encode(doc_bytes).decode("utf-8")
+    return doc_b64
+
   @classmethod
   def text_to_sentence_list(cls, text: str) -> List[str]:
     """
