Skip to content

Commit 808a0c7

Browse files
authored
Add batching to SearchClient ingestion (#12)
1 parent 0c53dd6 commit 808a0c7

File tree

1 file changed

+21
-12
lines changed

1 file changed

+21
-12
lines changed

no-ocr-api/np_ocr/search.py

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,8 @@ def __init__(self, storage_dir: str, vector_size: int, base_url: str, token: st
7070
self.vector_size = vector_size
7171
self.colpali_client = ColPaliClient(base_url, token)
7272

73-
def ingest(self, case_name: str, dataset, user_id: str):
73+
def ingest(self, case_name: str, dataset, user_id: str, batch_size: int = 50):
74+
"""Ingest a dataset of images into LanceDB in batches."""
7475
logger.info("start ingest")
7576
start_time = time.time()
7677

@@ -85,27 +86,35 @@ def ingest(self, case_name: str, dataset, user_id: str):
8586
lance_client = lancedb.connect(f"{self.storage_dir}/{user_id}/{case_name}")
8687
tbl = lance_client.create_table(case_name, schema=schema)
8788

88-
# TODO: ingest in batches
89-
9089
with tqdm(total=len(dataset), desc="Indexing Progress") as pbar:
90+
batch = []
9191
for i in range(len(dataset)):
9292
image = dataset[i]["image"]
9393
response = self.colpali_client.process_pil_image(image)
9494
image_embedding = response["embedding"]
9595

96-
data = {
97-
"index": dataset[i]["index"],
98-
"pdf_name": dataset[i]["pdf_name"],
99-
"pdf_page": dataset[i]["pdf_page"],
100-
"vector": image_embedding,
101-
}
96+
batch.append(
97+
{
98+
"index": dataset[i]["index"],
99+
"pdf_name": dataset[i]["pdf_name"],
100+
"pdf_page": dataset[i]["pdf_page"],
101+
"vector": image_embedding,
102+
}
103+
)
104+
105+
if len(batch) >= batch_size:
106+
try:
107+
tbl.add(batch)
108+
except Exception as e:
109+
logger.error(f"Error during upsert: {e}")
110+
batch = []
111+
pbar.update(1)
102112

113+
if batch:
103114
try:
104-
tbl.add([data])
115+
tbl.add(batch)
105116
except Exception as e:
106117
logger.error(f"Error during upsert: {e}")
107-
continue
108-
pbar.update(1)
109118

110119
tbl.create_index(metric="cosine")
111120

0 commit comments

Comments
 (0)