Skip to content

Commit 1837390

Browse files
ilai-deutel authored and copybara-github committed
feat: add support for Document AI Layout Parser in RAG v1
PiperOrigin-RevId: 735821016
1 parent e425ded commit 1837390

File tree

6 files changed

+165
-0
lines changed

6 files changed

+165
-0
lines changed

tests/unit/vertex_rag/test_rag_constants.py

+44
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from vertexai.rag import (
2222
Filter,
23+
LayoutParserConfig,
2324
Pinecone,
2425
RagCorpus,
2526
RagFile,
@@ -40,6 +41,7 @@
4041
from google.cloud.aiplatform_v1 import (
4142
GoogleDriveSource,
4243
RagFileChunkingConfig,
44+
RagFileParsingConfig,
4345
RagFileTransformationConfig,
4446
ImportRagFilesConfig,
4547
ImportRagFilesRequest,
@@ -462,6 +464,16 @@
462464
],
463465
)
464466

467+
# Layout parser fixture addressed by a bare processor resource name.
TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG = LayoutParserConfig(
    max_parsing_requests_per_min=100,
    processor_name="projects/test-project/locations/us/processors/abc123",
)

# Layout parser fixture addressed by a specific processor version.
TEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH_CONFIG = LayoutParserConfig(
    max_parsing_requests_per_min=100,
    processor_name="projects/test-project/locations/us/processors/abc123/processorVersions/pretrained-layout-parser-v0.0-2020-01-0",
)
476+
465477
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
466478
rag_file_transformation_config=TEST_RAG_FILE_TRANSFORMATION_CONFIG,
467479
share_point_sources=GapicSharePointSources(
@@ -484,6 +496,38 @@
484496
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
485497
)
486498

499+
# Expected import config for the processor-path case: a copy of the Drive
# folder config with a layout-parser parsing config set on top of it.
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH = ImportRagFilesConfig(
    TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER
)
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH.rag_file_parsing_config = RagFileParsingConfig(
    layout_parser=RagFileParsingConfig.LayoutParser(
        max_parsing_requests_per_min=100,
        processor_name="projects/test-project/locations/us/processors/abc123",
    )
)

# Expected request wrapping the config above for the test corpus.
TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH = ImportRagFilesRequest(
    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH,
    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
)
515+
516+
# Expected import config for the processor-version case: a copy of the Drive
# folder config with a layout-parser parsing config set on top of it.
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH = ImportRagFilesConfig(
    TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER
)
TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH.rag_file_parsing_config = RagFileParsingConfig(
    layout_parser=RagFileParsingConfig.LayoutParser(
        max_parsing_requests_per_min=100,
        processor_name="projects/test-project/locations/us/processors/abc123/processorVersions/pretrained-layout-parser-v0.0-2020-01-0",
    )
)

# Expected request wrapping the config above for the test corpus.
TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH = ImportRagFilesRequest(
    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
)
530+
487531
# Retrieval
488532
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
489533
TEST_CONTEXTS = RagContexts(

tests/unit/vertex_rag/test_rag_data.py

+39
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,45 @@ def test_prepare_import_files_request_sharepoint_source_no_folders(self):
740740
test_rag_constants.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS,
741741
)
742742

743+
def test_prepare_import_files_request_valid_layout_parser_with_processor_path(self):
    """A layout parser given as a processor path produces the expected request."""
    expected_request = (
        test_rag_constants.TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH
    )
    layout_parser = test_rag_constants.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG

    actual_request = prepare_import_files_request(
        corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
        paths=[test_rag_constants.TEST_DRIVE_FOLDER],
        transformation_config=create_transformation_config(),
        parser=layout_parser,
    )

    import_files_request_eq(actual_request, expected_request)
754+
755+
def test_prepare_import_files_request_valid_layout_parser_with_processor_version_path(
    self,
):
    """A layout parser given as a processor-version path produces the expected request."""
    expected_request = (
        test_rag_constants.TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH
    )
    layout_parser = (
        test_rag_constants.TEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH_CONFIG
    )

    actual_request = prepare_import_files_request(
        corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
        paths=[test_rag_constants.TEST_DRIVE_FOLDER],
        transformation_config=create_transformation_config(),
        parser=layout_parser,
    )

    import_files_request_eq(actual_request, expected_request)
768+
769+
def test_prepare_import_files_request_invalid_layout_parser_name(self):
    """A processor-type path is not a valid processor name and raises ValueError."""
    # Built outside the `raises` block so only the request preparation is
    # expected to raise.
    bad_parser = rag.LayoutParserConfig(
        processor_name="projects/test-project/locations/us/processorTypes/LAYOUT_PARSER",
    )
    request_kwargs = dict(
        corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
        paths=[test_rag_constants.TEST_DRIVE_FOLDER],
        transformation_config=create_transformation_config(),
        parser=bad_parser,
    )

    with pytest.raises(ValueError) as exc_info:
        prepare_import_files_request(**request_kwargs)

    exc_info.match("processor_name must be of the format")
743782
def test_set_embedding_model_config_set_both_error(self):
744783
embedding_model_config = rag.RagEmbeddingModelConfig(
745784
vertex_prediction_endpoint=rag.VertexPredictionEndpoint(

vertexai/rag/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
Filter,
4343
JiraQuery,
4444
JiraSource,
45+
LayoutParserConfig,
4546
Pinecone,
4647
RagCorpus,
4748
RagEmbeddingModelConfig,
@@ -65,6 +66,7 @@
6566
"Filter",
6667
"JiraQuery",
6768
"JiraSource",
69+
"LayoutParserConfig",
6870
"Pinecone",
6971
"RagCorpus",
7072
"RagEmbeddingModelConfig",

vertexai/rag/rag_data.py

+33
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
)
4545
from vertexai.rag.utils.resources import (
4646
JiraSource,
47+
LayoutParserConfig,
4748
RagCorpus,
4849
RagFile,
4950
RagVectorDbConfig,
@@ -395,6 +396,7 @@ def import_files(
395396
timeout: int = 600,
396397
max_embedding_requests_per_min: int = 1000,
397398
partial_failures_sink: Optional[str] = None,
399+
parser: Optional[LayoutParserConfig] = None,
398400
) -> ImportRagFilesResponse:
399401
"""
400402
Import files to an existing RagCorpus, wait until completion.
@@ -473,6 +475,17 @@ def import_files(
473475
# Return the number of imported RagFiles after completion.
474476
print(response.imported_rag_files_count)
475477
478+
# Document AI Layout Parser example.
479+
parser = LayoutParserConfig(
480+
processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
481+
max_parsing_requests_per_min=120,
482+
)
483+
response = rag.import_files(
484+
corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
485+
paths=paths,
486+
parser=parser,
487+
)
488+
476489
```
477490
Args:
478491
corpus_name: The name of the RagCorpus resource into which to import files.
@@ -504,6 +517,9 @@ def import_files(
504517
exist - if it does not exist, it will be created. If it does exist,
505518
the schema will be checked and the partial failures will be appended
506519
to the table.
520+
parser: Document parser to use. Should be either None (default parser),
521+
or a LayoutParserConfig (to parse documents using a Document AI
522+
Layout Parser processor).
507523
Returns:
508524
ImportRagFilesResponse.
509525
"""
@@ -519,6 +535,7 @@ def import_files(
519535
transformation_config=transformation_config,
520536
max_embedding_requests_per_min=max_embedding_requests_per_min,
521537
partial_failures_sink=partial_failures_sink,
538+
parser=parser,
522539
)
523540
client = _gapic_utils.create_rag_data_service_client()
524541
try:
@@ -536,6 +553,7 @@ async def import_files_async(
536553
transformation_config: Optional[TransformationConfig] = None,
537554
max_embedding_requests_per_min: int = 1000,
538555
partial_failures_sink: Optional[str] = None,
556+
parser: Optional[LayoutParserConfig] = None,
539557
) -> operation_async.AsyncOperation:
540558
"""
541559
Import files to an existing RagCorpus asynchronously.
@@ -612,6 +630,17 @@ async def import_files_async(
612630
share_point_sources=[sharepoint_query],
613631
)
614632
633+
# Document AI Layout Parser example.
634+
parser = LayoutParserConfig(
635+
processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
636+
max_parsing_requests_per_min=120,
637+
)
638+
response = await rag.import_files_async(
639+
corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
640+
paths=paths,
641+
parser=parser,
642+
)
643+
615644
# Get the result.
616645
await response.result()
617646
@@ -645,6 +674,9 @@ async def import_files_async(
645674
exist - if it does not exist, it will be created. If it does exist,
646675
the schema will be checked and the partial failures will be appended
647676
to the table.
677+
parser: Document parser to use. Should be either None (default parser),
678+
or a LayoutParserConfig (to parse documents using a Document AI
679+
Layout Parser processor).
648680
Returns:
649681
operation_async.AsyncOperation.
650682
"""
@@ -660,6 +692,7 @@ async def import_files_async(
660692
transformation_config=transformation_config,
661693
max_embedding_requests_per_min=max_embedding_requests_per_min,
662694
partial_failures_sink=partial_failures_sink,
695+
parser=parser,
663696
)
664697
async_client = _gapic_utils.create_rag_data_service_async_client()
665698
try:

vertexai/rag/utils/_gapic_utils.py

+25
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
ImportRagFilesConfig,
2424
ImportRagFilesRequest,
2525
RagFileChunkingConfig,
26+
RagFileParsingConfig,
2627
RagFileTransformationConfig,
2728
RagCorpus as GapicRagCorpus,
2829
RagFile as GapicRagFile,
@@ -38,6 +39,7 @@
3839
VertexRagClientWithOverride,
3940
)
4041
from vertexai.rag.utils.resources import (
42+
LayoutParserConfig,
4143
Pinecone,
4244
RagCorpus,
4345
RagEmbeddingModelConfig,
@@ -54,6 +56,9 @@
5456

5557

5658
_VALID_RESOURCE_NAME_REGEX = "[a-z][a-zA-Z0-9._-]{0,127}"
59+
# Pattern for a Document AI processor resource name, optionally followed by a
# `/processorVersions/{id}` segment. Intended for use with `re.fullmatch`.
_VALID_DOCUMENT_AI_PROCESSOR_NAME_REGEX = (
    r"projects/[^/]+/locations/[^/]+/processors/[^/]+"
    r"(?:/processorVersions/[^/]+)?"
)
5762

5863

5964
def create_rag_data_service_client():
@@ -356,12 +361,31 @@ def prepare_import_files_request(
356361
transformation_config: Optional[TransformationConfig] = None,
357362
max_embedding_requests_per_min: int = 1000,
358363
partial_failures_sink: Optional[str] = None,
364+
parser: Optional[LayoutParserConfig] = None,
359365
) -> ImportRagFilesRequest:
360366
if len(corpus_name.split("/")) != 6:
361367
raise ValueError(
362368
"corpus_name must be of the format `projects/{project}/locations/{location}/ragCorpora/{rag_corpus}`"
363369
)
364370

371+
# Start from an empty parsing config; a layout parser is attached only when
# the caller supplied one.
rag_file_parsing_config = RagFileParsingConfig()
if parser is not None:
    # Reject anything that is not a full processor or processor-version
    # resource name.
    if (
        re.fullmatch(_VALID_DOCUMENT_AI_PROCESSOR_NAME_REGEX, parser.processor_name)
        is None
    ):
        # Adjacent literals are concatenated verbatim, so each fragment
        # carries its own trailing space to keep the message readable.
        raise ValueError(
            "processor_name must be of the format "
            "`projects/{project_id}/locations/{location}/processors/{processor_id}` "
            "or "
            "`projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`, "
            f"got {parser.processor_name!r}"
        )
    rag_file_parsing_config.layout_parser = RagFileParsingConfig.LayoutParser(
        processor_name=parser.processor_name,
        max_parsing_requests_per_min=parser.max_parsing_requests_per_min,
    )
388+
365389
chunk_size = 1024
366390
chunk_overlap = 200
367391
if transformation_config and transformation_config.chunking_config:
@@ -379,6 +403,7 @@ def prepare_import_files_request(
379403

380404
import_rag_files_config = ImportRagFilesConfig(
381405
rag_file_transformation_config=rag_file_transformation_config,
406+
rag_file_parsing_config=rag_file_parsing_config,
382407
max_embedding_requests_per_min=max_embedding_requests_per_min,
383408
)
384409

vertexai/rag/utils/resources.py

+22
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,25 @@ class TransformationConfig:
367367
"""
368368

369369
chunking_config: Optional[ChunkingConfig] = None
370+
371+
372+
@dataclasses.dataclass
class LayoutParserConfig:
    """Settings for parsing documents with a Document AI Layout Parser processor.

    Attributes:
        processor_name: Full resource name of the Document AI processor, or
            of one of its processor versions. The processor must be of type
            `LAYOUT_PARSER_PROCESSOR`. Accepted formats:
            - `projects/{project_id}/locations/{location}/processors/{processor_id}`
            - `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
        max_parsing_requests_per_min: Upper bound on the number of requests
            per minute the job may send to the Document AI processor. See
            https://cloud.google.com/document-ai/quotas and your project's
            Quota page to choose a suitable value. When left unspecified, a
            default of 120 QPM is used.
    """

    processor_name: str
    max_parsing_requests_per_min: Optional[int] = None

0 commit comments

Comments
 (0)