
Commit 184cca5

vertex-sdk-bot authored and copybara-github committed

fix: Propagating import result sink correctly in the vertexai sdk.

PiperOrigin-RevId: 741317609

1 parent a0b6919

File tree: 4 files changed, +99 -4 lines changed

tests/unit/vertex_rag/test_rag_constants.py (+2)

@@ -209,6 +209,8 @@
         ),
     ),
 )
+TEST_IMPORT_RESULT_GCS_SINK = "gs://test-bucket/test-object.ndjson"
+TEST_IMPORT_RESULT_BIGQUERY_SINK = "bq://test-project.test_dataset.test_table"
 # GCS
 TEST_IMPORT_FILES_CONFIG_GCS = ImportRagFilesConfig(
     rag_file_transformation_config=TEST_RAG_FILE_TRANSFORMATION_CONFIG,

tests/unit/vertex_rag/test_rag_data.py (+54)

@@ -276,6 +276,14 @@ def import_files_request_eq(returned_request, expected_request):
         returned_request.import_rag_files_config.rag_file_transformation_config
         == expected_request.import_rag_files_config.rag_file_transformation_config
     )
+    assert (
+        returned_request.import_rag_files_config.import_result_gcs_sink
+        == expected_request.import_rag_files_config.import_result_gcs_sink
+    )
+    assert (
+        returned_request.import_rag_files_config.import_result_bigquery_sink
+        == expected_request.import_rag_files_config.import_result_bigquery_sink
+    )
 
 
 @pytest.mark.usefixtures("google_auth_mock")

@@ -517,6 +525,26 @@ def test_import_files(self, import_files_mock):
 
         assert response.imported_rag_files_count == 2
 
+    def test_import_files_with_import_result_gcs_sink(self, import_files_mock):
+        response = rag.import_files(
+            corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+            paths=[test_rag_constants.TEST_GCS_PATH],
+            import_result_sink=test_rag_constants.TEST_IMPORT_RESULT_GCS_SINK,
+        )
+        import_files_mock.assert_called_once()
+
+        assert response.imported_rag_files_count == 2
+
+    def test_import_files_with_import_result_bigquery_sink(self, import_files_mock):
+        response = rag.import_files(
+            corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+            paths=[test_rag_constants.TEST_GCS_PATH],
+            import_result_sink=test_rag_constants.TEST_IMPORT_RESULT_BIGQUERY_SINK,
+        )
+        import_files_mock.assert_called_once()
+
+        assert response.imported_rag_files_count == 2
+
     @pytest.mark.usefixtures("rag_data_client_mock_exception")
     def test_import_files_failure(self):
         with pytest.raises(RuntimeError) as e:

@@ -536,6 +564,32 @@ async def test_import_files_async(self, import_files_async_mock):
 
         assert response.result().imported_rag_files_count == 2
 
+    @pytest.mark.asyncio
+    async def test_import_files_with_import_result_gcs_sink_async(
+        self, import_files_async_mock
+    ):
+        response = await rag.import_files_async(
+            corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+            paths=[test_rag_constants.TEST_GCS_PATH],
+            import_result_sink=test_rag_constants.TEST_IMPORT_RESULT_GCS_SINK,
+        )
+        import_files_async_mock.assert_called_once()
+
+        assert response.result().imported_rag_files_count == 2
+
+    @pytest.mark.asyncio
+    async def test_import_files_with_import_result_bigquery_sink_async(
+        self, import_files_async_mock
+    ):
+        response = await rag.import_files_async(
+            corpus_name=test_rag_constants.TEST_RAG_CORPUS_RESOURCE_NAME,
+            paths=[test_rag_constants.TEST_GCS_PATH],
+            import_result_sink=test_rag_constants.TEST_IMPORT_RESULT_BIGQUERY_SINK,
+        )
+        import_files_async_mock.assert_called_once()
+
+        assert response.result().imported_rag_files_count == 2
+
     @pytest.mark.asyncio
     @pytest.mark.usefixtures("rag_data_async_client_mock_exception")
     async def test_import_files_async_failure(self):
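The new tests drive the `import_result_sink` keyword through the public `rag.import_files` and `rag.import_files_async` entry points against mocked clients. Outside the fixtures, the same async call pattern would look roughly like the sketch below; this is a minimal illustration, and the project, location, corpus, bucket, and path values are hypothetical placeholders, not values from this commit. Note that `await operation.result()` follows the real `operation_async.AsyncOperation` API, whereas the tests call `.result()` synchronously on a mock.

import asyncio

import vertexai
from vertexai import rag

# Hypothetical project/location/corpus values for illustration only.
vertexai.init(project="my-project", location="us-central1")
CORPUS = "projects/my-project/locations/us-central1/ragCorpora/123"

async def run_import() -> None:
    # Start the import; per-file outcomes are written to the NDJSON
    # object named by import_result_sink.
    operation = await rag.import_files_async(
        corpus_name=CORPUS,
        paths=["gs://my-bucket/docs/"],
        import_result_sink="gs://my-bucket/results/import.ndjson",
    )
    # Wait for the long-running operation and read the response,
    # mirroring the imported_rag_files_count check in the tests.
    response = await operation.result()
    print(response.imported_rag_files_count)

asyncio.run(run_import())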

vertexai/rag/rag_data.py (+26 -4)

@@ -395,6 +395,7 @@ def import_files(
     transformation_config: Optional[TransformationConfig] = None,
     timeout: int = 600,
     max_embedding_requests_per_min: int = 1000,
+    import_result_sink: Optional[str] = None,
     partial_failures_sink: Optional[str] = None,
     parser: Optional[LayoutParserConfig] = None,
 ) -> ImportRagFilesResponse:

@@ -509,8 +510,17 @@ def import_files(
             here. If unspecified, a default value of 1,000
             QPM would be used.
         timeout: Default is 600 seconds.
-        partial_failures_sink: Either a GCS path to store partial failures or a
-            BigQuery table to store partial failures. The format is
+        import_result_sink: Either a GCS path to store import results or a
+            BigQuery table to store import results. The format is
+            "gs://my-bucket/my/object.ndjson" for GCS or
+            "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
+            object cannot be used. However, the BigQuery table may or may not
+            exist - if it does not exist, it will be created. If it does exist,
+            the schema will be checked and the import results will be appended
+            to the table.
+        partial_failures_sink: Deprecated. Prefer to use `import_result_sink`.
+            Either a GCS path to store partial failures or a BigQuery table to
+            store partial failures. The format is
             "gs://my-bucket/my/object.ndjson" for GCS or
             "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
             object cannot be used. However, the BigQuery table may or may not

@@ -534,6 +544,7 @@ def import_files(
         source=source,
         transformation_config=transformation_config,
         max_embedding_requests_per_min=max_embedding_requests_per_min,
+        import_result_sink=import_result_sink,
         partial_failures_sink=partial_failures_sink,
         parser=parser,
     )

@@ -552,6 +563,7 @@ async def import_files_async(
     source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
     transformation_config: Optional[TransformationConfig] = None,
     max_embedding_requests_per_min: int = 1000,
+    import_result_sink: Optional[str] = None,
     partial_failures_sink: Optional[str] = None,
     parser: Optional[LayoutParserConfig] = None,
 ) -> operation_async.AsyncOperation:

@@ -666,8 +678,17 @@ async def import_files_async(
             page on the project to set an appropriate value
             here. If unspecified, a default value of 1,000
             QPM would be used.
-        partial_failures_sink: Either a GCS path to store partial failures or a
-            BigQuery table to store partial failures. The format is
+        import_result_sink: Either a GCS path to store import results or a
+            BigQuery table to store import results. The format is
+            "gs://my-bucket/my/object.ndjson" for GCS or
+            "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
+            object cannot be used. However, the BigQuery table may or may not
+            exist - if it does not exist, it will be created. If it does exist,
+            the schema will be checked and the import results will be appended
+            to the table.
+        partial_failures_sink: Deprecated. Prefer to use `import_result_sink`.
+            Either a GCS path to store partial failures or a BigQuery table to
+            store partial failures. The format is
             "gs://my-bucket/my/object.ndjson" for GCS or
             "bq://my-project.my-dataset.my-table" for BigQuery. An existing GCS
             object cannot be used. However, the BigQuery table may or may not

@@ -691,6 +712,7 @@ async def import_files_async(
         source=source,
         transformation_config=transformation_config,
         max_embedding_requests_per_min=max_embedding_requests_per_min,
+        import_result_sink=import_result_sink,
         partial_failures_sink=partial_failures_sink,
         parser=parser,
     )
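As the updated docstrings state, `import_result_sink` accepts either a GCS NDJSON object that must not already exist, or a BigQuery table that is created if missing and appended to otherwise; `partial_failures_sink` remains accepted but deprecated. A minimal synchronous sketch of both formats follows, with hypothetical corpus, bucket, dataset, and table names:

from vertexai import rag

# Hypothetical corpus resource name for illustration only.
CORPUS = "projects/my-project/locations/us-central1/ragCorpora/123"

# GCS sink: the target object must not already exist.
response = rag.import_files(
    corpus_name=CORPUS,
    paths=["gs://my-bucket/docs/"],
    import_result_sink="gs://my-bucket/results/import.ndjson",
)

# BigQuery sink: the table is created if absent, appended to if present.
response = rag.import_files(
    corpus_name=CORPUS,
    paths=["gs://my-bucket/docs/"],
    import_result_sink="bq://my-project.my_dataset.import_results",
)
print(response.imported_rag_files_count)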

vertexai/rag/utils/_gapic_utils.py (+17)

@@ -360,6 +360,7 @@ def prepare_import_files_request(
     source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
     transformation_config: Optional[TransformationConfig] = None,
     max_embedding_requests_per_min: int = 1000,
+    import_result_sink: Optional[str] = None,
     partial_failures_sink: Optional[str] = None,
     parser: Optional[LayoutParserConfig] = None,
 ) -> ImportRagFilesRequest:

@@ -407,6 +408,22 @@
         max_embedding_requests_per_min=max_embedding_requests_per_min,
     )
 
+    import_result_sink = import_result_sink or partial_failures_sink
+
+    if import_result_sink is not None:
+        if import_result_sink.startswith("gs://"):
+            import_rag_files_config.partial_failure_gcs_sink.output_uri_prefix = (
+                import_result_sink
+            )
+        elif import_result_sink.startswith("bq://"):
+            import_rag_files_config.partial_failure_bigquery_sink.output_uri = (
+                import_result_sink
+            )
+        else:
+            raise ValueError(
+                "import_result_sink must be a GCS path or a BigQuery table."
+            )
+
     if source is not None:
         gapic_source = convert_source_for_rag_import(source)
         if isinstance(gapic_source, GapicSlackSource):
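The propagation fix itself is the dispatch above: the new keyword falls back to the deprecated `partial_failures_sink`, and the URI scheme then decides which sink field of the request config is populated. The same logic in isolation, as a simplified sketch in which plain dataclasses stand in for the generated `ImportRagFilesConfig` proto messages:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class GcsSink:
    output_uri_prefix: str = ""

@dataclass
class BigQuerySink:
    output_uri: str = ""

@dataclass
class ImportConfig:
    # Stand-ins for the generated ImportRagFilesConfig sink messages.
    partial_failure_gcs_sink: GcsSink = field(default_factory=GcsSink)
    partial_failure_bigquery_sink: BigQuerySink = field(default_factory=BigQuerySink)

def route_result_sink(
    config: ImportConfig,
    import_result_sink: Optional[str],
    partial_failures_sink: Optional[str] = None,
) -> None:
    # The new keyword wins; the deprecated one is kept as a fallback.
    sink = import_result_sink or partial_failures_sink
    if sink is None:
        return
    if sink.startswith("gs://"):
        config.partial_failure_gcs_sink.output_uri_prefix = sink
    elif sink.startswith("bq://"):
        config.partial_failure_bigquery_sink.output_uri = sink
    else:
        raise ValueError("import_result_sink must be a GCS path or a BigQuery table.")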
