Skip to content

Commit 1eb493b

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: Update v1beta1 sdk to support llmparser in import file functions
PiperOrigin-RevId: 708932562
1 parent 2224c83 commit 1eb493b

File tree

6 files changed

+160
-2
lines changed

6 files changed

+160
-2
lines changed

tests/unit/vertex_rag/test_rag_constants_preview.py

+24
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
Filter,
2424
HybridSearch,
2525
LayoutParserConfig,
26+
LlmParserConfig,
2627
LlmRanker,
2728
Pinecone,
2829
RagCorpus,
@@ -623,6 +624,12 @@
623624
max_parsing_requests_per_min=100,
624625
)
625626

627+
TEST_LLM_PARSER_CONFIG = LlmParserConfig(
628+
model_name="gemini-1.5-pro-002",
629+
max_parsing_requests_per_min=500,
630+
custom_parsing_prompt="test-custom-parsing-prompt",
631+
)
632+
626633
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
627634
rag_file_transformation_config=TEST_RAG_FILE_TRANSFORMATION_CONFIG,
628635
share_point_sources=GapicSharePointSources(
@@ -677,6 +684,23 @@
677684
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
678685
)
679686

687+
TEST_IMPORT_FILES_CONFIG_LLM_PARSER = ImportRagFilesConfig(
688+
TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER
689+
)
690+
691+
TEST_IMPORT_FILES_CONFIG_LLM_PARSER.rag_file_parsing_config = RagFileParsingConfig(
692+
llm_parser=RagFileParsingConfig.LlmParser(
693+
model_name="gemini-1.5-pro-002",
694+
max_parsing_requests_per_min=500,
695+
custom_parsing_prompt="test-custom-parsing-prompt",
696+
)
697+
)
698+
699+
TEST_IMPORT_REQUEST_LLM_PARSER = ImportRagFilesRequest(
700+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
701+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LLM_PARSER,
702+
)
703+
680704
# Retrieval
681705
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
682706
TEST_CONTEXTS = RagContexts(

tests/unit/vertex_rag/test_rag_data_preview.py

+64
Original file line numberDiff line numberDiff line change
@@ -1156,6 +1156,70 @@ async def test_advanced_pdf_parsing_and_layout_parser_both_set_error_async(self)
11561156
"passed in at a time"
11571157
)
11581158

1159+
def test_prepare_import_files_request_llm_parser(self):
1160+
request = prepare_import_files_request(
1161+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1162+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1163+
transformation_config=create_transformation_config(),
1164+
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
1165+
)
1166+
import_files_request_eq(
1167+
request,
1168+
test_rag_constants_preview.TEST_IMPORT_REQUEST_LLM_PARSER,
1169+
)
1170+
1171+
def test_advanced_pdf_parsing_and_llm_parser_both_set_error(self):
1172+
with pytest.raises(ValueError) as e:
1173+
rag.import_files(
1174+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1175+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1176+
transformation_config=create_transformation_config(),
1177+
use_advanced_pdf_parsing=True,
1178+
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
1179+
)
1180+
e.match(
1181+
"Only one of use_advanced_pdf_parsing or llm_parser may be "
1182+
"passed in at a time"
1183+
)
1184+
1185+
def test_layout_parser_and_llm_parser_both_set_error(self):
1186+
with pytest.raises(ValueError) as e:
1187+
rag.import_files(
1188+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1189+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1190+
transformation_config=create_transformation_config(),
1191+
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
1192+
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
1193+
)
1194+
e.match("Only one of layout_parser or llm_parser may be passed in at a time")
1195+
1196+
@pytest.mark.asyncio
1197+
async def test_advanced_pdf_parsing_and_llm_parser_both_set_error_async(self):
1198+
with pytest.raises(ValueError) as e:
1199+
await rag.import_files_async(
1200+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1201+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1202+
transformation_config=create_transformation_config(),
1203+
use_advanced_pdf_parsing=True,
1204+
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
1205+
)
1206+
e.match(
1207+
"Only one of use_advanced_pdf_parsing or llm_parser may be "
1208+
"passed in at a time"
1209+
)
1210+
1211+
@pytest.mark.asyncio
1212+
async def test_layout_parser_and_llm_parser_both_set_error_async(self):
1213+
with pytest.raises(ValueError) as e:
1214+
await rag.import_files_async(
1215+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1216+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1217+
transformation_config=create_transformation_config(),
1218+
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
1219+
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
1220+
)
1221+
e.match("Only one of layout_parser or llm_parser may be passed in at a time")
1222+
11591223
def test_set_embedding_model_config_set_both_error(self):
11601224
embedding_model_config = rag.EmbeddingModelConfig(
11611225
publisher_model="whatever",

vertexai/preview/rag/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
JiraQuery,
4242
JiraSource,
4343
LayoutParserConfig,
44+
LlmParserConfig,
4445
LlmRanker,
4546
Pinecone,
4647
RagCorpus,
@@ -72,6 +73,7 @@
7273
"JiraQuery",
7374
"JiraSource",
7475
"LayoutParserConfig",
76+
"LlmParserConfig",
7577
"LlmRanker",
7678
"Pinecone",
7779
"RagCorpus",

vertexai/preview/rag/rag_data.py

+31-2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
EmbeddingModelConfig,
4747
JiraSource,
4848
LayoutParserConfig,
49+
LlmParserConfig,
4950
Pinecone,
5051
RagCorpus,
5152
RagFile,
@@ -475,6 +476,7 @@ def import_files(
475476
use_advanced_pdf_parsing: Optional[bool] = False,
476477
partial_failures_sink: Optional[str] = None,
477478
layout_parser: Optional[LayoutParserConfig] = None,
479+
llm_parser: Optional[LlmParserConfig] = None,
478480
) -> ImportRagFilesResponse:
479481
"""
480482
Import files to an existing RagCorpus, wait until completion.
@@ -592,7 +594,10 @@ def import_files(
592594
to the table.
593595
layout_parser: Configuration for the Document AI Layout Parser Processor
594596
to use for document parsing. Optional.
595-
If not None,`use_advanced_pdf_parsing` must be False.
597+
If not None, the other parser configs must be None.
598+
llm_parser: Configuration for the LLM Parser to use for document parsing.
599+
Optional.
600+
If not None, the other parser configs must be None.
596601
Returns:
597602
ImportRagFilesResponse.
598603
"""
@@ -605,6 +610,15 @@ def import_files(
605610
"Only one of use_advanced_pdf_parsing or layout_parser may be "
606611
"passed in at a time"
607612
)
613+
if use_advanced_pdf_parsing and llm_parser is not None:
614+
raise ValueError(
615+
"Only one of use_advanced_pdf_parsing or llm_parser may be "
616+
"passed in at a time"
617+
)
618+
if layout_parser is not None and llm_parser is not None:
619+
raise ValueError(
620+
"Only one of layout_parser or llm_parser may be passed in at a time"
621+
)
608622
corpus_name = _gapic_utils.get_corpus_name(corpus_name)
609623
request = _gapic_utils.prepare_import_files_request(
610624
corpus_name=corpus_name,
@@ -617,6 +631,7 @@ def import_files(
617631
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
618632
partial_failures_sink=partial_failures_sink,
619633
layout_parser=layout_parser,
634+
llm_parser=llm_parser,
620635
)
621636
client = _gapic_utils.create_rag_data_service_client()
622637
try:
@@ -638,6 +653,7 @@ async def import_files_async(
638653
use_advanced_pdf_parsing: Optional[bool] = False,
639654
partial_failures_sink: Optional[str] = None,
640655
layout_parser: Optional[LayoutParserConfig] = None,
656+
llm_parser: Optional[LlmParserConfig] = None,
641657
) -> operation_async.AsyncOperation:
642658
"""
643659
Import files to an existing RagCorpus asynchronously.
@@ -755,7 +771,10 @@ async def import_files_async(
755771
to the table.
756772
layout_parser: Configuration for the Document AI Layout Parser Processor
757773
to use for document parsing. Optional.
758-
If not None,`use_advanced_pdf_parsing` must be False.
774+
If not None, the other parser configs must be None.
775+
llm_parser: Configuration for the LLM Parser to use for document parsing.
776+
Optional.
777+
If not None, the other parser configs must be None.
759778
Returns:
760779
operation_async.AsyncOperation.
761780
"""
@@ -768,6 +787,15 @@ async def import_files_async(
768787
"Only one of use_advanced_pdf_parsing or layout_parser may be "
769788
"passed in at a time"
770789
)
790+
if use_advanced_pdf_parsing and llm_parser is not None:
791+
raise ValueError(
792+
"Only one of use_advanced_pdf_parsing or llm_parser may be "
793+
"passed in at a time"
794+
)
795+
if layout_parser is not None and llm_parser is not None:
796+
raise ValueError(
797+
"Only one of layout_parser or llm_parser may be passed in at a time"
798+
)
771799
corpus_name = _gapic_utils.get_corpus_name(corpus_name)
772800
request = _gapic_utils.prepare_import_files_request(
773801
corpus_name=corpus_name,
@@ -780,6 +808,7 @@ async def import_files_async(
780808
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
781809
partial_failures_sink=partial_failures_sink,
782810
layout_parser=layout_parser,
811+
llm_parser=llm_parser,
783812
)
784813
async_client = _gapic_utils.create_rag_data_service_async_client()
785814
try:

vertexai/preview/rag/utils/_gapic_utils.py

+15
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
EmbeddingModelConfig,
4444
VertexPredictionEndpoint,
4545
LayoutParserConfig,
46+
LlmParserConfig,
4647
Pinecone,
4748
RagCorpus,
4849
RagFile,
@@ -450,6 +451,7 @@ def prepare_import_files_request(
450451
use_advanced_pdf_parsing: bool = False,
451452
partial_failures_sink: Optional[str] = None,
452453
layout_parser: Optional[LayoutParserConfig] = None,
454+
llm_parser: Optional[LlmParserConfig] = None,
453455
) -> ImportRagFilesRequest:
454456
if len(corpus_name.split("/")) != 6:
455457
raise ValueError(
@@ -479,6 +481,19 @@ def prepare_import_files_request(
479481
processor_name=layout_parser.processor_name,
480482
max_parsing_requests_per_min=layout_parser.max_parsing_requests_per_min,
481483
)
484+
if llm_parser is not None:
485+
rag_file_parsing_config.llm_parser = RagFileParsingConfig.LlmParser(
486+
model_name=llm_parser.model_name
487+
)
488+
if llm_parser.max_parsing_requests_per_min is not None:
489+
rag_file_parsing_config.llm_parser.max_parsing_requests_per_min = (
490+
llm_parser.max_parsing_requests_per_min
491+
)
492+
if llm_parser.custom_parsing_prompt is not None:
493+
rag_file_parsing_config.llm_parser.custom_parsing_prompt = (
494+
llm_parser.custom_parsing_prompt
495+
)
496+
482497
local_chunk_size = chunk_size
483498
local_chunk_overlap = chunk_overlap
484499
if transformation_config and transformation_config.chunking_config:

vertexai/preview/rag/utils/resources.py

+24
Original file line numberDiff line numberDiff line change
@@ -500,3 +500,27 @@ class LayoutParserConfig:
500500

501501
processor_name: str
502502
max_parsing_requests_per_min: Optional[int] = None
503+
504+
505+
@dataclasses.dataclass
506+
class LlmParserConfig:
507+
"""Configuration for the LLM Parser used for document parsing.
508+
509+
Attributes:
510+
model_name (str):
511+
The full resource name of a Vertex AI model. Format:
512+
- `projects/{project_id}/locations/{location}/publishers/google/models/{model_id}`
513+
- `projects/{project_id}/locations/{location}/models/{model_id}`
514+
max_parsing_requests_per_min (int):
515+
The maximum number of requests the job is allowed to make to the
516+
Vertex AI model per minute. Consult
517+
https://cloud.google.com/vertex-ai/generative-ai/docs/quotas and
518+
the Quota page for your project to set an appropriate value here.
519+
If unspecified, a default value of 120 QPM will be used.
520+
custom_parsing_prompt (str):
521+
A custom prompt to use for parsing.
522+
"""
523+
524+
model_name: str
525+
max_parsing_requests_per_min: Optional[int] = None
526+
custom_parsing_prompt: Optional[str] = None

0 commit comments

Comments
 (0)