
Commit ee4270f
Merge pull request #60 from RobertKoehlmoos/654-document-chunking-improvement
654 document chunking improvement
2 parents 0d49965 + 801f293

File tree

2 files changed: +65, -14

components/llm_service/src/services/query/data_source.py (+37, -14)
@@ -15,12 +15,11 @@
 Query Data Sources
 """
 import json
+import hashlib
 import traceback
 import os
 import re
 import tempfile
-from time import time
-from random import randint
 from urllib.parse import unquote
 from copy import copy
 from base64 import b64encode
@@ -46,6 +45,10 @@
 # number of sentences included before and after the current
 # sentence when creating chunks (chunks have overlapping text)
 CHUNK_SENTENCE_PADDING = 1
+# string added to the front of a folder to identify it as a folder created by
+# genie to store extracted files during document parsing that should not
+# itself be parsed
+GENIE_FOLDER_MARKER = "_genie_"
 
 class DataSourceFile():
   """ object storing meta data about a data source file """
@@ -242,8 +245,8 @@ def chunk_document(self, doc_name: str, doc_url: str,
   def chunk_document_multimodal(self,
                                 doc_name: str,
                                 doc_url: str,
-                                doc_filepath: str) -> \
-      List[object]:
+                                doc_filepath: str
+                                ) -> list[object]:
     """
     Process a file document into multimodal chunks (b64 and text) for embeddings
 
@@ -275,24 +278,26 @@ def chunk_document_multimodal(self,
 
     # Get bucket name & the doc file path within bucket
     if doc_url.startswith("https://storage.googleapis.com/"):
-      bucket_parts = unquote(\
+      bucket_parts = unquote(
         doc_url.split("https://storage.googleapis.com/")[1]).split("/")
     elif doc_url.startswith("gs://"):
       bucket_parts = unquote(doc_url.split("gs://")[1]).split("/")
     else:
       raise ValueError(f"Invalid Doc URL: {doc_url}")
 
     bucket_name = bucket_parts[0]
-    bucket_folder = "/".join(bucket_parts[1:-1]) \
-        if len(bucket_parts) > 2 else None
+    filepath_in_bucket = "/".join(bucket_parts[1:])
 
-    # Determine bucket folder to store all chunk docs created
-    # Add time-in-ms_randint to ensure that that folders are unique
-    chunk_ext_i = bucket_parts[-1].rfind(".")
-    chunk_bucket_folder = bucket_parts[-1][:chunk_ext_i]+"_"+\
-        str(round(time() * 1000))+"_"+str(randint(1000,9999))
-    if bucket_folder:
-      chunk_bucket_folder = f"{bucket_folder}/{chunk_bucket_folder}"
+    if filepath_in_bucket.startswith(GENIE_FOLDER_MARKER):
+      # if this is true this file was created by genie as a chunk of another
+      # file and should not be processed
+      return []
+
+    # Determine bucket folder for document chunks that require storage
+    # The folder is marked as a genie folder and uses a hash of the
+    # document
+    chunk_bucket_folder = (f"{GENIE_FOLDER_MARKER}/"
+                           f"{get_file_hash(doc_filepath)}")
 
     # If doc is a PDF, convert it to an array of PNGs for each page
     allowed_image_types = ["png", "jpg", "jpeg", "bmp", "gif"]
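One effect worth noting: the old scheme appended time-in-ms plus a random integer, so every ingestion run produced a fresh folder, while the hash-based name is deterministic, so re-processing the same document reuses the same chunk folder. A minimal standalone sketch of the new flow, assuming a gs:// URL (chunk_folder_for is a hypothetical helper, not in the commit):

from typing import Optional
from urllib.parse import unquote

GENIE_FOLDER_MARKER = "_genie_"

def chunk_folder_for(doc_url: str, doc_hash: str) -> Optional[str]:
  # Split "gs://bucket/path/to/doc.pdf" into bucket name and in-bucket path
  bucket_parts = unquote(doc_url.split("gs://")[1]).split("/")
  filepath_in_bucket = "/".join(bucket_parts[1:])
  # Files Genie itself created are skipped rather than chunked again
  if filepath_in_bucket.startswith(GENIE_FOLDER_MARKER):
    return None
  # Deterministic: the same document hash always maps to the same folder
  return f"{GENIE_FOLDER_MARKER}/{doc_hash}"

# chunk_folder_for("gs://my-bucket/docs/report.pdf", "ab12cd")
#   -> "_genie_/ab12cd"
# chunk_folder_for("gs://my-bucket/_genie_/ab12cd/page_1.pdf", "ffee00")
#   -> None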
@@ -523,3 +528,21 @@ def create_pdf_page(page: PageObject, doc_filepath: str,
     "filename": page_pdf_filename,
     "filepath": page_pdf_filepath
   }
+
+def get_file_hash(filepath: str) -> str:
+  """
+  Calculates the sha256 hash of a file.
+  This would probably be better in utils/file_helper.py, but that causes a
+  circular import loop, so it lives in the file where it is used for now.
+  It can be replaced with hashlib.file_digest on Python 3.11 or greater.
+  Taken from stackoverflow.com/questions/69339582
+  Takes a path to the file.
+  Returns the hash of the file as a hexadecimal string.
+  """
+  h = hashlib.sha256()
+  with open(filepath, "rb") as f:
+    data = f.read(2048)
+    while data != b"":
+      h.update(data)
+      data = f.read(2048)
+  return h.hexdigest()
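For reference, the hashlib.file_digest replacement mentioned in the docstring would look roughly like this on Python 3.11+ (a sketch, not part of the commit):

import hashlib

def get_file_hash(filepath: str) -> str:
  # hashlib.file_digest (Python 3.11+) does the chunked reading internally
  with open(filepath, "rb") as f:
    return hashlib.file_digest(f, "sha256").hexdigest()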
New file: unit tests for data_source (+28)
@@ -0,0 +1,28 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Unit tests for Data Source
+"""
+import tempfile
+
+import services.query.data_source
+
+def test_get_file_hash():
+  # hashing a freshly created (and therefore empty) temporary file;
+  # this is the well-known sha256 digest of empty input
+  correct_hash = (
+      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855")
+  with tempfile.NamedTemporaryFile() as f:
+    file_hash = services.query.data_source.get_file_hash(f.name)
+  assert file_hash == correct_hash
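The test above only covers empty input. A companion test for non-empty input could write real bytes, flush them so they reach disk, and compute the expected digest with hashlib rather than hard-coding it (hypothetical test, not part of the commit):

import hashlib
import tempfile

import services.query.data_source

def test_get_file_hash_with_content():
  data = b"hello world!"
  with tempfile.NamedTemporaryFile() as f:
    f.write(data)
    f.flush()  # ensure the bytes actually reach the file on disk
    file_hash = services.query.data_source.get_file_hash(f.name)
  # compare against hashlib computed over the same bytes
  assert file_hash == hashlib.sha256(data).hexdigest()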
