@@ -15,12 +15,11 @@
 Query Data Sources
 """
 import json
+import hashlib
 import traceback
 import os
 import re
 import tempfile
-from time import time
-from random import randint
 from urllib.parse import unquote
 from copy import copy
 from base64 import b64encode
@@ -46,6 +45,10 @@
 # number of sentences included before and after the current
 # sentence when creating chunks (chunks have overlapping text)
 CHUNK_SENTENCE_PADDING = 1
+# string added to the front of a folder to identify it as a folder created by
+# genie to store extracted files during document parsing that should not
+# itself be parsed
+GENIE_FOLDER_MARKER = "_genie_"
 
 class DataSourceFile():
     """ object storing meta data about a data source file """
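The marker reduces "is this genie output?" to a plain prefix test on the path inside the bucket, which is exactly how the chunk_document_multimodal hunk below uses it. A minimal sketch of that contract (the helper name is_genie_path and the example paths are illustrative, not part of this PR):

GENIE_FOLDER_MARKER = "_genie_"

def is_genie_path(filepath_in_bucket: str) -> bool:
    # anything genie writes during parsing lands under the marker folder,
    # e.g. "_genie_/<file-hash>/page_1.png", so a prefix check is enough
    return filepath_in_bucket.startswith(GENIE_FOLDER_MARKER)

assert is_genie_path("_genie_/3a7bd3e2/page_1.png")       # genie-created chunk
assert not is_genie_path("reports/quarterly_report.pdf")  # user document

Note the check only matches the marker at the root of the bucket path, which matches how the chunk folder is built later in this diff.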
@@ -242,8 +245,8 @@ def chunk_document(self, doc_name: str, doc_url: str,
     def chunk_document_multimodal(self,
                                   doc_name: str,
                                   doc_url: str,
-                                  doc_filepath: str) -> \
-                                      List[object]:
+                                  doc_filepath: str
+                                  ) -> list[object]:
         """
         Process a file document into multimodal chunks (b64 and text) for embeddings
 
@@ -275,24 +278,26 @@ def chunk_document_multimodal(self,
 
         # Get bucket name & the doc file path within bucket
         if doc_url.startswith("https://storage.googleapis.com/"):
-            bucket_parts = unquote(\
+            bucket_parts = unquote(
                 doc_url.split("https://storage.googleapis.com/")[1]).split("/")
         elif doc_url.startswith("gs://"):
             bucket_parts = unquote(doc_url.split("gs://")[1]).split("/")
         else:
             raise ValueError(f"Invalid Doc URL: {doc_url}")
 
         bucket_name = bucket_parts[0]
-        bucket_folder = "/".join(bucket_parts[1:-1]) \
-            if len(bucket_parts) > 2 else None
+        filepath_in_bucket = "/".join(bucket_parts[1:])
 
-        # Determine bucket folder to store all chunk docs created
-        # Add time-in-ms_randint to ensure that that folders are unique
-        chunk_ext_i = bucket_parts[-1].rfind(".")
-        chunk_bucket_folder = bucket_parts[-1][:chunk_ext_i]+"_"+\
-            str(round(time() * 1000))+"_"+str(randint(1000,9999))
-        if bucket_folder:
-            chunk_bucket_folder = f"{bucket_folder}/{chunk_bucket_folder}"
+        if filepath_in_bucket.startswith(GENIE_FOLDER_MARKER):
+            # this file was created by genie as a chunk of another
+            # file and should not be processed
+            return []
+
+        # Determine bucket folder for document chunks that require storage
+        # The folder is marked as a genie folder and uses a hash of the
+        # document
+        chunk_bucket_folder = (f"{GENIE_FOLDER_MARKER}/"
+                               f"{get_file_hash(doc_filepath)}")
 
         # If doc is a PDF, convert it to an array of PNGs for each page
         allowed_image_types = ["png", "jpg", "jpeg", "bmp", "gif"]
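A worked example of the new parsing and folder naming, using a hypothetical URL and a truncated made-up hash (neither appears in the PR):

from urllib.parse import unquote

GENIE_FOLDER_MARKER = "_genie_"

doc_url = "gs://my-bucket/reports/Q3%20summary.pdf"  # hypothetical
bucket_parts = unquote(doc_url.split("gs://")[1]).split("/")
bucket_name = bucket_parts[0]                    # "my-bucket"
filepath_in_bucket = "/".join(bucket_parts[1:])  # "reports/Q3 summary.pdf"
# the chunk output folder becomes e.g. "_genie_/9f86d081..." (sha256 of the
# local file) instead of "Q3 summary_<time-ms>_<randint>"

Two effects of the change: chunk folders are deterministic, so re-processing the same document maps to the same hash-named folder rather than piling up timestamped copies, and the marker prefix lets chunk_document_multimodal skip files the pipeline itself wrote.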
@@ -523,3 +528,21 @@ def create_pdf_page(page: PageObject, doc_filepath: str,
         "filename": page_pdf_filename,
         "filepath": page_pdf_filepath
     }
+
+def get_file_hash(filepath: str) -> str:
+    """
+    Calculates the sha256 hash of a file.
+    This would probably be better in utils/file_helper.py, but that causes a
+    circular import loop, so it lives in the file where it is used for now.
+    This can be replaced with hashlib.file_digest on Python 3.11 or greater.
+    Taken from stackoverflow.com/questions/69339582
+    Takes a path to the file.
+    Returns the hash of the file as a hexadecimal string.
+    """
+    h = hashlib.sha256()
+    with open(filepath, "rb") as f:
+        data = f.read(2048)
+        while data != b"":
+            h.update(data)
+            data = f.read(2048)
+    return h.hexdigest()
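As the docstring itself points out, on Python 3.11+ the manual read loop can be replaced with hashlib.file_digest. A sketch of that drop-in version, keeping the same name and contract as the helper above:

import hashlib

def get_file_hash(filepath: str) -> str:
    """Return the sha256 of a file as a hex string (Python 3.11+)."""
    with open(filepath, "rb") as f:
        # file_digest performs the chunked reading internally
        return hashlib.file_digest(f, "sha256").hexdigest()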