feat: add support for Document AI Layout Parser in RAG v1

ilai-deutel · copybara-github · commit 183739080612 · 2025-03-11T11:13:17.000-07:00
PiperOrigin-RevId: 735821016
diff --git a/tests/unit/vertex_rag/test_rag_constants.py b/tests/unit/vertex_rag/test_rag_constants.py
@@ -20,6 +20,7 @@
 
 from vertexai.rag import (
     Filter,
+    LayoutParserConfig,
     Pinecone,
     RagCorpus,
     RagFile,
@@ -40,6 +41,7 @@
 from google.cloud.aiplatform_v1 import (
     GoogleDriveSource,
     RagFileChunkingConfig,
+    RagFileParsingConfig,
     RagFileTransformationConfig,
     ImportRagFilesConfig,
     ImportRagFilesRequest,
@@ -462,6 +464,16 @@
     ],
 )
 
+TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG = LayoutParserConfig(
+    processor_name="projects/test-project/locations/us/processors/abc123",
+    max_parsing_requests_per_min=100,
+)
+
+TEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH_CONFIG = LayoutParserConfig(
+    processor_name="projects/test-project/locations/us/processors/abc123/processorVersions/pretrained-layout-parser-v0.0-2020-01-0",
+    max_parsing_requests_per_min=100,
+)
+
 TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
     rag_file_transformation_config=TEST_RAG_FILE_TRANSFORMATION_CONFIG,
     share_point_sources=GapicSharePointSources(
@@ -484,6 +496,38 @@
     import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
 )
 
+TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH = ImportRagFilesConfig(
+    TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER
+)
+TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH.rag_file_parsing_config = (
+    RagFileParsingConfig(
+        layout_parser=RagFileParsingConfig.LayoutParser(
+            processor_name="projects/test-project/locations/us/processors/abc123",
+            max_parsing_requests_per_min=100,
+        )
+    )
+)
+
+TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH = ImportRagFilesRequest(
+    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
+    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_PATH,
+)
+
+TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH = (
+    ImportRagFilesConfig(TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER)
+)
+TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH.rag_file_parsing_config = RagFileParsingConfig(
+    layout_parser=RagFileParsingConfig.LayoutParser(
+        processor_name="projects/test-project/locations/us/processors/abc123/processorVersions/pretrained-layout-parser-v0.0-2020-01-0",
+        max_parsing_requests_per_min=100,
+    )
+)
+
+TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH = ImportRagFilesRequest(
+    parent=TEST_RAG_CORPUS_RESOURCE_NAME,
+    import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
+)
+
 # Retrieval
 TEST_QUERY_TEXT = "What happen to the fox and the dog?"
 TEST_CONTEXTS = RagContexts(
diff --git a/tests/unit/vertex_rag/test_rag_data.py b/tests/unit/vertex_rag/test_rag_data.py
@@ -740,6 +740,45 @@ def test_prepare_import_files_request_sharepoint_source_no_folders(self):
             test_rag_constants.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS,
         )
 
+    def test_prepare_import_files_request_valid_layout_parser_with_processor_path(self):
+        request = prepare_import_files_request(
+            corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+            paths=[test_rag_constants.TEST_DRIVE_FOLDER],
+            transformation_config=create_transformation_config(),
+            parser=test_rag_constants.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
+        )
+        import_files_request_eq(
+            request,
+            test_rag_constants.TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH,
+        )
+
+    def test_prepare_import_files_request_valid_layout_parser_with_processor_version_path(
+        self,
+    ):
+        request = prepare_import_files_request(
+            corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+            paths=[test_rag_constants.TEST_DRIVE_FOLDER],
+            transformation_config=create_transformation_config(),
+            parser=test_rag_constants.TEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH_CONFIG,
+        )
+        import_files_request_eq(
+            request,
+            test_rag_constants.TEST_IMPORT_REQUEST_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
+        )
+
+    def test_prepare_import_files_request_invalid_layout_parser_name(self):
+        layout_parser = rag.LayoutParserConfig(
+            processor_name="projects/test-project/locations/us/processorTypes/LAYOUT_PARSER",
+        )
+        with pytest.raises(ValueError) as e:
+            prepare_import_files_request(
+                corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+                paths=[test_rag_constants.TEST_DRIVE_FOLDER],
+                transformation_config=create_transformation_config(),
+                parser=layout_parser,
+            )
+        e.match("processor_name must be of the format")
+
     def test_set_embedding_model_config_set_both_error(self):
         embedding_model_config = rag.RagEmbeddingModelConfig(
             vertex_prediction_endpoint=rag.VertexPredictionEndpoint(
diff --git a/vertexai/rag/__init__.py b/vertexai/rag/__init__.py
@@ -42,6 +42,7 @@
     Filter,
     JiraQuery,
     JiraSource,
+    LayoutParserConfig,
     Pinecone,
     RagCorpus,
     RagEmbeddingModelConfig,
@@ -65,6 +66,7 @@
     "Filter",
     "JiraQuery",
     "JiraSource",
+    "LayoutParserConfig",
     "Pinecone",
     "RagCorpus",
     "RagEmbeddingModelConfig",
diff --git a/vertexai/rag/rag_data.py b/vertexai/rag/rag_data.py
@@ -44,6 +44,7 @@
 )
 from vertexai.rag.utils.resources import (
     JiraSource,
+    LayoutParserConfig,
     RagCorpus,
     RagFile,
     RagVectorDbConfig,
@@ -395,6 +396,7 @@ def import_files(
     timeout: int = 600,
     max_embedding_requests_per_min: int = 1000,
     partial_failures_sink: Optional[str] = None,
+    parser: Optional[LayoutParserConfig] = None,
 ) -> ImportRagFilesResponse:
     """
     Import files to an existing RagCorpus, wait until completion.
@@ -473,6 +475,17 @@ def import_files(
     # Return the number of imported RagFiles after completion.
     print(response.imported_rag_files_count)
 
+    # Document AI Layout Parser example.
+    parser = rag.LayoutParserConfig(
+        processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
+        max_parsing_requests_per_min=120,
+    )
+    response = rag.import_files(
+        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
+        paths=paths,
+        parser=parser,
+    )
+
     ```
     Args:
         corpus_name: The name of the RagCorpus resource into which to import files.
@@ -504,6 +517,9 @@ def import_files(
             exist - if it does not exist, it will be created. If it does exist,
             the schema will be checked and the partial failures will be appended
             to the table.
+        parser: Document parser to use. Should be either None (default parser),
+            or a LayoutParserConfig (to parse documents using a Document AI
+            Layout Parser processor).
     Returns:
         ImportRagFilesResponse.
     """
@@ -519,6 +535,7 @@ def import_files(
         transformation_config=transformation_config,
         max_embedding_requests_per_min=max_embedding_requests_per_min,
         partial_failures_sink=partial_failures_sink,
+        parser=parser,
     )
     client = _gapic_utils.create_rag_data_service_client()
     try:
@@ -536,6 +553,7 @@ async def import_files_async(
     transformation_config: Optional[TransformationConfig] = None,
     max_embedding_requests_per_min: int = 1000,
     partial_failures_sink: Optional[str] = None,
+    parser: Optional[LayoutParserConfig] = None,
 ) -> operation_async.AsyncOperation:
     """
     Import files to an existing RagCorpus asynchronously.
@@ -612,6 +630,17 @@ async def import_files_async(
         share_point_sources=[sharepoint_query],
     )
 
+    # Document AI Layout Parser example.
+    parser = rag.LayoutParserConfig(
+        processor_name="projects/my-project/locations/us-central1/processors/my-processor-id",
+        max_parsing_requests_per_min=120,
+    )
+    response = await rag.import_files_async(
+        corpus_name="projects/my-project/locations/us-central1/ragCorpora/my-corpus-1",
+        paths=paths,
+        parser=parser,
+    )
+
     # Get the result.
     await response.result()
 
@@ -645,6 +674,9 @@ async def import_files_async(
             exist - if it does not exist, it will be created. If it does exist,
             the schema will be checked and the partial failures will be appended
             to the table.
+        parser: Document parser to use. Should be either None (default parser),
+            or a LayoutParserConfig (to parse documents using a Document AI
+            Layout Parser processor).
     Returns:
         operation_async.AsyncOperation.
     """
@@ -660,6 +692,7 @@ async def import_files_async(
         transformation_config=transformation_config,
         max_embedding_requests_per_min=max_embedding_requests_per_min,
         partial_failures_sink=partial_failures_sink,
+        parser=parser,
     )
     async_client = _gapic_utils.create_rag_data_service_async_client()
     try:
diff --git a/vertexai/rag/utils/_gapic_utils.py b/vertexai/rag/utils/_gapic_utils.py
@@ -23,6 +23,7 @@
     ImportRagFilesConfig,
     ImportRagFilesRequest,
     RagFileChunkingConfig,
+    RagFileParsingConfig,
     RagFileTransformationConfig,
     RagCorpus as GapicRagCorpus,
     RagFile as GapicRagFile,
@@ -38,6 +39,7 @@
     VertexRagClientWithOverride,
 )
 from vertexai.rag.utils.resources import (
+    LayoutParserConfig,
     Pinecone,
     RagCorpus,
     RagEmbeddingModelConfig,
@@ -54,6 +56,9 @@
 
 
 _VALID_RESOURCE_NAME_REGEX = "[a-z][a-zA-Z0-9._-]{0,127}"
+_VALID_DOCUMENT_AI_PROCESSOR_NAME_REGEX = (
+    r"projects/[^/]+/locations/[^/]+/processors/[^/]+(?:/processorVersions/[^/]+)?"
+)
 
 
 def create_rag_data_service_client():
@@ -356,12 +361,31 @@ def prepare_import_files_request(
     transformation_config: Optional[TransformationConfig] = None,
     max_embedding_requests_per_min: int = 1000,
     partial_failures_sink: Optional[str] = None,
+    parser: Optional[LayoutParserConfig] = None,
 ) -> ImportRagFilesRequest:
     if len(corpus_name.split("/")) != 6:
         raise ValueError(
             "corpus_name must be of the format `projects/{project}/locations/{location}/ragCorpora/{rag_corpus}`"
         )
 
+    rag_file_parsing_config = RagFileParsingConfig()
+    if parser is not None:
+        if (
+            re.fullmatch(_VALID_DOCUMENT_AI_PROCESSOR_NAME_REGEX, parser.processor_name)
+            is None
+        ):
+            raise ValueError(
+                "processor_name must be of the format "
+                "`projects/{project_id}/locations/{location}/processors/{processor_id}`"
+                " or "
+                "`projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`, "
+                f"got {parser.processor_name!r}"
+            )
+        rag_file_parsing_config.layout_parser = RagFileParsingConfig.LayoutParser(
+            processor_name=parser.processor_name,
+            max_parsing_requests_per_min=parser.max_parsing_requests_per_min,
+        )
+
     chunk_size = 1024
     chunk_overlap = 200
     if transformation_config and transformation_config.chunking_config:
@@ -379,6 +403,7 @@ def prepare_import_files_request(
 
     import_rag_files_config = ImportRagFilesConfig(
         rag_file_transformation_config=rag_file_transformation_config,
+        rag_file_parsing_config=rag_file_parsing_config,
         max_embedding_requests_per_min=max_embedding_requests_per_min,
     )
 
diff --git a/vertexai/rag/utils/resources.py b/vertexai/rag/utils/resources.py
@@ -367,3 +367,25 @@ class TransformationConfig:
     """
 
     chunking_config: Optional[ChunkingConfig] = None
+
+
+@dataclasses.dataclass
+class LayoutParserConfig:
+    """Configuration for the Document AI Layout Parser Processor.
+
+    Attributes:
+        processor_name: The full resource name of a Document AI processor or
+            processor version. The processor must have type
+            `LAYOUT_PARSER_PROCESSOR`.
+            Format must be one of the following:
+            -  `projects/{project_id}/locations/{location}/processors/{processor_id}`
+            -  `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
+        max_parsing_requests_per_min: The maximum number of requests the job is
+            allowed to make to the Document AI processor per minute. Consult
+            https://cloud.google.com/document-ai/quotas and the Quota page for
+            your project to set an appropriate value here. If unspecified, a
+            default value of 120 QPM will be used.
+    """
+
+    processor_name: str
+    max_parsing_requests_per_min: Optional[int] = None