
Commit ee4270f
Merge pull request #60 from RobertKoehlmoos/654-document-chunking-improvement
654 document chunking improvement
2 parents 0d49965 + 801f293

File tree

2 files changed: +65, -14

components/llm_service/src/services/query/data_source.py (+37, -14)
@@ -15,12 +15,11 @@
 Query Data Sources
 """
 import json
+import hashlib
 import traceback
 import os
 import re
 import tempfile
-from time import time
-from random import randint
 from urllib.parse import unquote
 from copy import copy
 from base64 import b64encode
@@ -46,6 +45,10 @@
 # number of sentences included before and after the current
 # sentence when creating chunks (chunks have overlapping text)
 CHUNK_SENTENCE_PADDING = 1
+# string added to the front of a folder to identify it as a folder created by
+# genie to store extracted files during document parsing that should not
+# itself be parsed
+GENIE_FOLDER_MARKER = "_genie_"
 
 class DataSourceFile():
   """ object storing meta data about a data source file """
@@ -242,8 +245,8 @@ def chunk_document(self, doc_name: str, doc_url: str,
   def chunk_document_multimodal(self,
                                 doc_name: str,
                                 doc_url: str,
-                                doc_filepath: str) -> \
-      List[object]:
+                                doc_filepath: str
+                                ) -> list[object]:
     """
     Process a file document into multimodal chunks (b64 and text) for embeddings
 
@@ -275,24 +278,26 @@ def chunk_document_multimodal(self,
 
     # Get bucket name & the doc file path within bucket
     if doc_url.startswith("https://storage.googleapis.com/"):
-      bucket_parts = unquote(\
+      bucket_parts = unquote(
         doc_url.split("https://storage.googleapis.com/")[1]).split("/")
     elif doc_url.startswith("gs://"):
       bucket_parts = unquote(doc_url.split("gs://")[1]).split("/")
     else:
       raise ValueError(f"Invalid Doc URL: {doc_url}")
 
     bucket_name = bucket_parts[0]
-    bucket_folder = "/".join(bucket_parts[1:-1]) \
-        if len(bucket_parts) > 2 else None
+    filepath_in_bucket = "/".join(bucket_parts[1:])
 
-    # Determine bucket folder to store all chunk docs created
-    # Add time-in-ms_randint to ensure that that folders are unique
-    chunk_ext_i = bucket_parts[-1].rfind(".")
-    chunk_bucket_folder = bucket_parts[-1][:chunk_ext_i]+"_"+\
-        str(round(time() * 1000))+"_"+str(randint(1000,9999))
-    if bucket_folder:
-      chunk_bucket_folder = f"{bucket_folder}/{chunk_bucket_folder}"
+    if filepath_in_bucket.startswith(GENIE_FOLDER_MARKER):
+      # if this is true this file was created by genie as a chunk of another
+      # file and should not be processed
+      return []
+
+    # Determine bucket folder for document chunks that require storage
+    # The folder is marked as a genie folder and uses a hash of the
+    # document
+    chunk_bucket_folder = (f"{GENIE_FOLDER_MARKER}/"
+                           f"{get_file_hash(doc_filepath)}")
 
     # If doc is a PDF, convert it to an array of PNGs for each page
     allowed_image_types = ["png", "jpg", "jpeg", "bmp", "gif"]
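One effect worth noting: the old scheme appended time-in-ms plus a random integer, so every ingestion run produced a fresh folder, while the hash-based name is deterministic, so re-processing the same document reuses the same chunk folder. A minimal standalone sketch of the new flow, assuming a gs:// URL (chunk_folder_for is a hypothetical helper, not in the commit):

from typing import Optional
from urllib.parse import unquote

GENIE_FOLDER_MARKER = "_genie_"

def chunk_folder_for(doc_url: str, doc_hash: str) -> Optional[str]:
  # Split "gs://bucket/path/to/doc.pdf" into bucket name and in-bucket path
  bucket_parts = unquote(doc_url.split("gs://")[1]).split("/")
  filepath_in_bucket = "/".join(bucket_parts[1:])
  # Files Genie itself created are skipped rather than chunked again
  if filepath_in_bucket.startswith(GENIE_FOLDER_MARKER):
    return None
  # Deterministic: the same document hash always maps to the same folder
  return f"{GENIE_FOLDER_MARKER}/{doc_hash}"

# chunk_folder_for("gs://my-bucket/docs/report.pdf", "ab12cd")
#   -> "_genie_/ab12cd"
# chunk_folder_for("gs://my-bucket/_genie_/ab12cd/page_1.pdf", "ffee00")
#   -> None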
@@ -523,3 +528,21 @@ def create_pdf_page(page: PageObject, doc_filepath: str,
     "filename": page_pdf_filename,
     "filepath": page_pdf_filepath
   }
+
+def get_file_hash(filepath: str) -> str:
+  """
+  Calculates the sha256 hash of a file.
+  This would probably be better in utils/file_helper.py, but that causes a
+  circular import loop, so it lives in the file where it is used for now.
+  It can be replaced with hashlib.file_digest on Python 3.11 or greater.
+  Taken from stackoverflow.com/questions/69339582
+  Takes a path to the file.
+  Returns the hash of the file as a hexadecimal string.
+  """
+  h = hashlib.sha256()
+  with open(filepath, "rb") as f:
+    data = f.read(2048)
+    while data != b"":
+      h.update(data)
+      data = f.read(2048)
+  return h.hexdigest()
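For reference, the hashlib.file_digest replacement mentioned in the docstring would look roughly like this on Python 3.11+ (a sketch, not part of the commit):

import hashlib

def get_file_hash(filepath: str) -> str:
  # hashlib.file_digest (Python 3.11+) does the chunked reading internally
  with open(filepath, "rb") as f:
    return hashlib.file_digest(f, "sha256").hexdigest()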
New file: unit tests for data_source (+28)
@@ -0,0 +1,28 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Unit tests for Data Source
+"""
+import tempfile
+
+import services.query.data_source
+
+def test_get_file_hash():
+  # hashing a freshly created (and therefore empty) temporary file;
+  # this is the well-known sha256 digest of empty input
+  correct_hash = (
+      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855")
+  with tempfile.NamedTemporaryFile() as f:
+    file_hash = services.query.data_source.get_file_hash(f.name)
+  assert file_hash == correct_hash
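The test above only covers empty input. A companion test for non-empty input could write real bytes, flush them so they reach disk, and compute the expected digest with hashlib rather than hard-coding it (hypothetical test, not part of the commit):

import hashlib
import tempfile

import services.query.data_source

def test_get_file_hash_with_content():
  data = b"hello world!"
  with tempfile.NamedTemporaryFile() as f:
    f.write(data)
    f.flush()  # ensure the bytes actually reach the file on disk
    file_hash = services.query.data_source.get_file_hash(f.name)
  # compare against hashlib computed over the same bytes
  assert file_hash == hashlib.sha256(data).hexdigest()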
