Skip to content

Commit 565c800

Browse files
ilai-deutel and copybara-github
authored and committed
feat: add support for Document AI Layout Parser as a RAG import option
PiperOrigin-RevId: 702358388
1 parent f02692d commit 565c800

File tree

6 files changed

+180
-0
lines changed

6 files changed

+180
-0
lines changed

tests/unit/vertex_rag/test_rag_constants_preview.py

+43
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
EmbeddingModelConfig,
2323
Filter,
2424
HybridSearch,
25+
LayoutParserConfig,
2526
LlmRanker,
2627
Pinecone,
2728
RagCorpus,
@@ -612,6 +613,16 @@
612613
],
613614
)
614615

616+
TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG = LayoutParserConfig(
617+
processor_name="projects/test-project/locations/us/processors/abc123",
618+
max_parsing_requests_per_min=100,
619+
)
620+
621+
TEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH_CONFIG = LayoutParserConfig(
622+
processor_name="projects/test-project/locations/us/processors/abc123/processorVersions/pretrained-layout-parser-v0.0-2020-01-0",
623+
max_parsing_requests_per_min=100,
624+
)
625+
615626
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
616627
rag_file_transformation_config=TEST_RAG_FILE_TRANSFORMATION_CONFIG,
617628
share_point_sources=GapicSharePointSources(
@@ -634,6 +645,38 @@
634645
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
635646
)
636647

648+
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH = ImportRagFilesConfig(
649+
TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER
650+
)
651+
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH.rag_file_parsing_config = (
652+
RagFileParsingConfig(
653+
layout_parser=RagFileParsingConfig.LayoutParser(
654+
processor_name="projects/test-project/locations/us/processors/abc123",
655+
max_parsing_requests_per_min=100,
656+
)
657+
)
658+
)
659+
660+
TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH = ImportRagFilesRequest(
661+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
662+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH,
663+
)
664+
665+
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH = (
666+
ImportRagFilesConfig(TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER)
667+
)
668+
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH.rag_file_parsing_config = RagFileParsingConfig(
669+
layout_parser=RagFileParsingConfig.LayoutParser(
670+
processor_name="projects/test-project/locations/us/processors/abc123/processorVersions/pretrained-layout-parser-v0.0-2020-01-0",
671+
max_parsing_requests_per_min=100,
672+
)
673+
)
674+
675+
TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH = ImportRagFilesRequest(
676+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
677+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
678+
)
679+
637680
# Retrieval
638681
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
639682
TEST_CONTEXTS = RagContexts(

tests/unit/vertex_rag/test_rag_data_preview.py

+68
Original file line number | Diff line number | Diff line change
@@ -1088,6 +1088,74 @@ def test_prepare_import_files_request_sharepoint_source_no_folders(self):
10881088
test_rag_constants_preview.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS,
10891089
)
10901090

1091+
def test_prepare_import_files_request_valid_layout_parser_with_processor_path(self):
1092+
request = prepare_import_files_request(
1093+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1094+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1095+
transformation_config=create_transformation_config(),
1096+
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
1097+
)
1098+
import_files_request_eq(
1099+
request,
1100+
test_rag_constants_preview.TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH,
1101+
)
1102+
1103+
def test_prepare_import_files_request_valid_layout_parser_with_processor_version_path(
1104+
self,
1105+
):
1106+
request = prepare_import_files_request(
1107+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1108+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1109+
transformation_config=create_transformation_config(),
1110+
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH_CONFIG,
1111+
)
1112+
import_files_request_eq(
1113+
request,
1114+
test_rag_constants_preview.TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
1115+
)
1116+
1117+
def test_prepare_import_files_request_invalid_layout_parser_name(self):
1118+
layout_parser = rag.LayoutParserConfig(
1119+
processor_name="projects/test-project/locations/us/processorTypes/LAYOUT_PARSER",
1120+
)
1121+
with pytest.raises(ValueError) as e:
1122+
prepare_import_files_request(
1123+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1124+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1125+
transformation_config=create_transformation_config(),
1126+
layout_parser=layout_parser,
1127+
)
1128+
e.match("processor_name must be of the format")
1129+
1130+
def test_advanced_pdf_parsing_and_layout_parser_both_set_error(self):
1131+
with pytest.raises(ValueError) as e:
1132+
rag.import_files(
1133+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1134+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1135+
transformation_config=create_transformation_config(),
1136+
use_advanced_pdf_parsing=True,
1137+
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
1138+
)
1139+
e.match(
1140+
"Only one of use_advanced_pdf_parsing or layout_parser may be "
1141+
"passed in at a time"
1142+
)
1143+
1144+
@pytest.mark.asyncio
1145+
async def test_advanced_pdf_parsing_and_layout_parser_both_set_error_async(self):
1146+
with pytest.raises(ValueError) as e:
1147+
await rag.import_files_async(
1148+
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
1149+
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
1150+
transformation_config=create_transformation_config(),
1151+
use_advanced_pdf_parsing=True,
1152+
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
1153+
)
1154+
e.match(
1155+
"Only one of use_advanced_pdf_parsing or layout_parser may be "
1156+
"passed in at a time"
1157+
)
1158+
10911159
def test_set_embedding_model_config_set_both_error(self):
10921160
embedding_model_config = rag.EmbeddingModelConfig(
10931161
publisher_model="whatever",

vertexai/preview/rag/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@
4040
HybridSearch,
4141
JiraQuery,
4242
JiraSource,
43+
LayoutParserConfig,
4344
LlmRanker,
4445
Pinecone,
4546
RagCorpus,
@@ -70,6 +71,7 @@
7071
"HybridSearch",
7172
"JiraQuery",
7273
"JiraSource",
74+
"LayoutParserConfig",
7375
"LlmRanker",
7476
"Pinecone",
7577
"RagCorpus",

vertexai/preview/rag/rag_data.py

+21
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@
4545
from vertexai.preview.rag.utils.resources import (
4646
EmbeddingModelConfig,
4747
JiraSource,
48+
LayoutParserConfig,
4849
Pinecone,
4950
RagCorpus,
5051
RagFile,
@@ -466,6 +467,7 @@ def import_files(
466467
max_embedding_requests_per_min: int = 1000,
467468
use_advanced_pdf_parsing: Optional[bool] = False,
468469
partial_failures_sink: Optional[str] = None,
470+
layout_parser: Optional[LayoutParserConfig] = None,
469471
) -> ImportRagFilesResponse:
470472
"""
471473
Import files to an existing RagCorpus, wait until completion.
@@ -581,13 +583,21 @@ def import_files(
581583
exist - if it does not exist, it will be created. If it does exist,
582584
the schema will be checked and the partial failures will be appended
583585
to the table.
586+
layout_parser: Configuration for the Document AI Layout Parser Processor
587+
to use for document parsing. Optional.
588+
If not None, `use_advanced_pdf_parsing` must be False.
584589
Returns:
585590
ImportRagFilesResponse.
586591
"""
587592
if source is not None and paths is not None:
588593
raise ValueError("Only one of source or paths must be passed in at a time")
589594
if source is None and paths is None:
590595
raise ValueError("One of source or paths must be passed in")
596+
if use_advanced_pdf_parsing and layout_parser is not None:
597+
raise ValueError(
598+
"Only one of use_advanced_pdf_parsing or layout_parser may be "
599+
"passed in at a time"
600+
)
591601
corpus_name = _gapic_utils.get_corpus_name(corpus_name)
592602
request = _gapic_utils.prepare_import_files_request(
593603
corpus_name=corpus_name,
@@ -599,6 +609,7 @@ def import_files(
599609
max_embedding_requests_per_min=max_embedding_requests_per_min,
600610
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
601611
partial_failures_sink=partial_failures_sink,
612+
layout_parser=layout_parser,
602613
)
603614
client = _gapic_utils.create_rag_data_service_client()
604615
try:
@@ -619,6 +630,7 @@ async def import_files_async(
619630
max_embedding_requests_per_min: int = 1000,
620631
use_advanced_pdf_parsing: Optional[bool] = False,
621632
partial_failures_sink: Optional[str] = None,
633+
layout_parser: Optional[LayoutParserConfig] = None,
622634
) -> operation_async.AsyncOperation:
623635
"""
624636
Import files to an existing RagCorpus asynchronously.
@@ -734,13 +746,21 @@ async def import_files_async(
734746
exist - if it does not exist, it will be created. If it does exist,
735747
the schema will be checked and the partial failures will be appended
736748
to the table.
749+
layout_parser: Configuration for the Document AI Layout Parser Processor
750+
to use for document parsing. Optional.
751+
If not None, `use_advanced_pdf_parsing` must be False.
737752
Returns:
738753
operation_async.AsyncOperation.
739754
"""
740755
if source is not None and paths is not None:
741756
raise ValueError("Only one of source or paths must be passed in at a time")
742757
if source is None and paths is None:
743758
raise ValueError("One of source or paths must be passed in")
759+
if use_advanced_pdf_parsing and layout_parser is not None:
760+
raise ValueError(
761+
"Only one of use_advanced_pdf_parsing or layout_parser may be "
762+
"passed in at a time"
763+
)
744764
corpus_name = _gapic_utils.get_corpus_name(corpus_name)
745765
request = _gapic_utils.prepare_import_files_request(
746766
corpus_name=corpus_name,
@@ -752,6 +772,7 @@ async def import_files_async(
752772
max_embedding_requests_per_min=max_embedding_requests_per_min,
753773
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
754774
partial_failures_sink=partial_failures_sink,
775+
layout_parser=layout_parser,
755776
)
756777
async_client = _gapic_utils.create_rag_data_service_async_client()
757778
try:

vertexai/preview/rag/utils/_gapic_utils.py

+23
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
from vertexai.preview.rag.utils.resources import (
4343
EmbeddingModelConfig,
4444
VertexPredictionEndpoint,
45+
LayoutParserConfig,
4546
Pinecone,
4647
RagCorpus,
4748
RagFile,
@@ -60,6 +61,9 @@
6061

6162

6263
_VALID_RESOURCE_NAME_REGEX = "[a-z][a-zA-Z0-9._-]{0,127}"
64+
_VALID_DOCUMENT_AI_PROCESSOR_NAME_REGEX = (
65+
r"projects/[^/]+/locations/[^/]+/processors/[^/]+(?:/processorVersions/[^/]+)?"
66+
)
6367

6468

6569
def create_rag_data_service_client():
@@ -445,6 +449,7 @@ def prepare_import_files_request(
445449
max_embedding_requests_per_min: int = 1000,
446450
use_advanced_pdf_parsing: bool = False,
447451
partial_failures_sink: Optional[str] = None,
452+
layout_parser: Optional[LayoutParserConfig] = None,
448453
) -> ImportRagFilesRequest:
449454
if len(corpus_name.split("/")) != 6:
450455
raise ValueError(
@@ -456,6 +461,24 @@ def prepare_import_files_request(
456461
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
457462
),
458463
)
464+
if layout_parser is not None:
465+
if (
466+
re.fullmatch(
467+
_VALID_DOCUMENT_AI_PROCESSOR_NAME_REGEX, layout_parser.processor_name
468+
)
469+
is None
470+
):
471+
raise ValueError(
472+
"processor_name must be of the format "
473+
"`projects/{project_id}/locations/{location}/processors/{processor_id}`"
474+
"or "
475+
"`projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`, "
476+
f"got {layout_parser.processor_name!r}"
477+
)
478+
rag_file_parsing_config.layout_parser = RagFileParsingConfig.LayoutParser(
479+
processor_name=layout_parser.processor_name,
480+
max_parsing_requests_per_min=layout_parser.max_parsing_requests_per_min,
481+
)
459482
local_chunk_size = chunk_size
460483
local_chunk_overlap = chunk_overlap
461484
if transformation_config and transformation_config.chunking_config:

vertexai/preview/rag/utils/resources.py

+23
Original file line number | Diff line number | Diff line change
@@ -477,3 +477,26 @@ class TransformationConfig:
477477
"""
478478

479479
chunking_config: Optional[ChunkingConfig] = None
480+
481+
482+
@dataclasses.dataclass
483+
class LayoutParserConfig:
484+
"""Configuration for the Document AI Layout Parser Processor.
485+
486+
Attributes:
487+
processor_name (str):
488+
The full resource name of a Document AI processor or processor
489+
version. The processor must have type `LAYOUT_PARSER_PROCESSOR`.
490+
Format:
491+
- `projects/{project_id}/locations/{location}/processors/{processor_id}`
492+
- `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
493+
max_parsing_requests_per_min (int):
494+
The maximum number of requests the job is allowed to make to the
495+
Document AI processor per minute. Consult
496+
https://cloud.google.com/document-ai/quotas and the Quota page for
497+
your project to set an appropriate value here. If unspecified, a
498+
default value of 120 QPM will be used.
499+
"""
500+
501+
processor_name: str
502+
max_parsing_requests_per_min: Optional[int] = None

0 commit comments

Comments
 (0)