Skip to content

Commit f89df1f

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: add support for SharePoint as an ImportRagFiles source.
PiperOrigin-RevId: 677936135
1 parent b456ce3 commit f89df1f

File tree

6 files changed

+297
-5
lines changed

6 files changed

+297
-5
lines changed

tests/unit/vertex_rag/test_rag_constants.py

+119
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@
2424
RagCorpus,
2525
RagFile,
2626
RagResource,
27+
SharePointSource,
28+
SharePointSources,
2729
SlackChannelsSource,
2830
SlackChannel,
2931
JiraSource,
@@ -42,6 +44,7 @@
4244
JiraSource as GapicJiraSource,
4345
RagCorpus as GapicRagCorpus,
4446
RagFile as GapicRagFile,
47+
SharePointSources as GapicSharePointSources,
4548
SlackSource as GapicSlackSource,
4649
RagContexts,
4750
RetrieveContextsResponse,
@@ -390,6 +393,122 @@
390393
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_JIRA_SOURCE,
391394
)
392395

396+
# SharePoint sources
397+
TEST_SHARE_POINT_SOURCE = SharePointSources(
398+
share_point_sources=[
399+
SharePointSource(
400+
sharepoint_folder_path="test-sharepoint-folder-path",
401+
drive_name="test-drive-name",
402+
client_id="test-client-id",
403+
client_secret="test-client-secret",
404+
tenant_id="test-tenant-id",
405+
sharepoint_site_name="test-sharepoint-site-name",
406+
)
407+
],
408+
)
409+
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE = ImportRagFilesConfig(
410+
rag_file_chunking_config=RagFileChunkingConfig(
411+
chunk_size=TEST_CHUNK_SIZE,
412+
chunk_overlap=TEST_CHUNK_OVERLAP,
413+
),
414+
share_point_sources=GapicSharePointSources(
415+
share_point_sources=[
416+
GapicSharePointSources.SharePointSource(
417+
sharepoint_folder_path="test-sharepoint-folder-path",
418+
drive_name="test-drive-name",
419+
client_id="test-client-id",
420+
client_secret=api_auth.ApiAuth.ApiKeyConfig(
421+
api_key_secret_version="test-client-secret"
422+
),
423+
tenant_id="test-tenant-id",
424+
sharepoint_site_name="test-sharepoint-site-name",
425+
)
426+
]
427+
),
428+
)
429+
430+
TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE = ImportRagFilesRequest(
431+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
432+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
433+
)
434+
435+
TEST_SHARE_POINT_SOURCE_2_DRIVES = SharePointSources(
436+
share_point_sources=[
437+
SharePointSource(
438+
sharepoint_folder_path="test-sharepoint-folder-path",
439+
drive_name="test-drive-name",
440+
drive_id="test-drive-id",
441+
client_id="test-client-id",
442+
client_secret="test-client-secret",
443+
tenant_id="test-tenant-id",
444+
sharepoint_site_name="test-sharepoint-site-name",
445+
)
446+
],
447+
)
448+
449+
TEST_SHARE_POINT_SOURCE_NO_DRIVES = SharePointSources(
450+
share_point_sources=[
451+
SharePointSource(
452+
sharepoint_folder_path="test-sharepoint-folder-path",
453+
client_id="test-client-id",
454+
client_secret="test-client-secret",
455+
tenant_id="test-tenant-id",
456+
sharepoint_site_name="test-sharepoint-site-name",
457+
)
458+
],
459+
)
460+
461+
TEST_SHARE_POINT_SOURCE_2_FOLDERS = SharePointSources(
462+
share_point_sources=[
463+
SharePointSource(
464+
sharepoint_folder_path="test-sharepoint-folder-path",
465+
sharepoint_folder_id="test-sharepoint-folder-id",
466+
drive_name="test-drive-name",
467+
client_id="test-client-id",
468+
client_secret="test-client-secret",
469+
tenant_id="test-tenant-id",
470+
sharepoint_site_name="test-sharepoint-site-name",
471+
)
472+
],
473+
)
474+
475+
TEST_SHARE_POINT_SOURCE_NO_FOLDERS = SharePointSources(
476+
share_point_sources=[
477+
SharePointSource(
478+
drive_name="test-drive-name",
479+
client_id="test-client-id",
480+
client_secret="test-client-secret",
481+
tenant_id="test-tenant-id",
482+
sharepoint_site_name="test-sharepoint-site-name",
483+
)
484+
],
485+
)
486+
487+
TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
488+
rag_file_chunking_config=RagFileChunkingConfig(
489+
chunk_size=TEST_CHUNK_SIZE,
490+
chunk_overlap=TEST_CHUNK_OVERLAP,
491+
),
492+
share_point_sources=GapicSharePointSources(
493+
share_point_sources=[
494+
GapicSharePointSources.SharePointSource(
495+
drive_name="test-drive-name",
496+
client_id="test-client-id",
497+
client_secret=api_auth.ApiAuth.ApiKeyConfig(
498+
api_key_secret_version="test-client-secret"
499+
),
500+
tenant_id="test-tenant-id",
501+
sharepoint_site_name="test-sharepoint-site-name",
502+
)
503+
]
504+
),
505+
)
506+
507+
TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesRequest(
508+
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
509+
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE,
510+
)
511+
393512
# Retrieval
394513
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
395514
TEST_CONTEXTS = RagContexts(

tests/unit/vertex_rag/test_rag_data.py

+50
Original file line number | Diff line number | Diff line change
@@ -563,6 +563,56 @@ def test_prepare_import_files_request_jira_source(self):
563563
)
564564
import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_JIRA_SOURCE)
565565

566+
def test_prepare_import_files_request_sharepoint_source(self):
567+
request = prepare_import_files_request(
568+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
569+
source=tc.TEST_SHARE_POINT_SOURCE,
570+
chunk_size=tc.TEST_CHUNK_SIZE,
571+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
572+
)
573+
import_files_request_eq(request, tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE)
574+
575+
def test_prepare_import_files_request_sharepoint_source_2_drives(self):
576+
with pytest.raises(ValueError) as e:
577+
prepare_import_files_request(
578+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
579+
source=tc.TEST_SHARE_POINT_SOURCE_2_DRIVES,
580+
chunk_size=tc.TEST_CHUNK_SIZE,
581+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
582+
)
583+
e.match("drive_name and drive_id cannot both be set.")
584+
585+
def test_prepare_import_files_request_sharepoint_source_2_folders(self):
586+
with pytest.raises(ValueError) as e:
587+
prepare_import_files_request(
588+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
589+
source=tc.TEST_SHARE_POINT_SOURCE_2_FOLDERS,
590+
chunk_size=tc.TEST_CHUNK_SIZE,
591+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
592+
)
593+
e.match("sharepoint_folder_path and sharepoint_folder_id cannot both be set.")
594+
595+
def test_prepare_import_files_request_sharepoint_source_no_drives(self):
596+
with pytest.raises(ValueError) as e:
597+
prepare_import_files_request(
598+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
599+
source=tc.TEST_SHARE_POINT_SOURCE_NO_DRIVES,
600+
chunk_size=tc.TEST_CHUNK_SIZE,
601+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
602+
)
603+
e.match("Either drive_name and drive_id must be set.")
604+
605+
def test_prepare_import_files_request_sharepoint_source_no_folders(self):
606+
request = prepare_import_files_request(
607+
corpus_name=tc.TEST_RAG_CORPUS_RESOURCE_NAME,
608+
source=tc.TEST_SHARE_POINT_SOURCE_NO_FOLDERS,
609+
chunk_size=tc.TEST_CHUNK_SIZE,
610+
chunk_overlap=tc.TEST_CHUNK_OVERLAP,
611+
)
612+
import_files_request_eq(
613+
request, tc.TEST_IMPORT_REQUEST_SHARE_POINT_SOURCE_NO_FOLDERS
614+
)
615+
566616
def test_set_embedding_model_config_set_both_error(self):
567617
embedding_model_config = rag.EmbeddingModelConfig(
568618
publisher_model="whatever",

vertexai/preview/rag/__init__.py

+4
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,8 @@
4444
RagCorpus,
4545
RagFile,
4646
RagResource,
47+
SharePointSource,
48+
SharePointSources,
4749
SlackChannel,
4850
SlackChannelsSource,
4951
VertexFeatureStore,
@@ -61,6 +63,8 @@
6163
"RagFile",
6264
"RagResource",
6365
"Retrieval",
66+
"SharePointSource",
67+
"SharePointSources",
6468
"SlackChannel",
6569
"SlackChannelsSource",
6670
"VertexFeatureStore",

vertexai/preview/rag/rag_data.py

+29-2
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@
4848
Pinecone,
4949
RagCorpus,
5050
RagFile,
51+
SharePointSources,
5152
SlackChannelsSource,
5253
VertexFeatureStore,
5354
VertexVectorSearch,
@@ -290,7 +291,7 @@ def upload_file(
290291
def import_files(
291292
corpus_name: str,
292293
paths: Optional[Sequence[str]] = None,
293-
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
294+
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
294295
chunk_size: int = 1024,
295296
chunk_overlap: int = 200,
296297
timeout: int = 600,
@@ -354,6 +355,19 @@ def import_files(
354355
chunk_overlap=100,
355356
)
356357
358+
# SharePoint Example.
359+
sharepoint_query = rag.SharePointSource(
360+
sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
361+
sharepoint_site_name="my-sharepoint-site.com",
362+
client_id="my-client-id",
363+
client_secret="my-client-secret",
364+
tenant_id="my-tenant-id",
365+
drive_id="my-drive-id",
366+
)
367+
source = rag.SharePointSources(
368+
share_point_sources=[sharepoint_query],
369+
)
370+
357371
# Return the number of imported RagFiles after completion.
358372
print(response.imported_rag_files_count)
359373
@@ -420,7 +434,7 @@ def import_files(
420434
async def import_files_async(
421435
corpus_name: str,
422436
paths: Optional[Sequence[str]] = None,
423-
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
437+
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
424438
chunk_size: int = 1024,
425439
chunk_overlap: int = 200,
426440
max_embedding_requests_per_min: int = 1000,
@@ -484,6 +498,19 @@ async def import_files_async(
484498
chunk_overlap=100,
485499
)
486500
501+
# SharePoint Example.
502+
sharepoint_query = rag.SharePointSource(
503+
sharepoint_folder_path="https://my-sharepoint-site.com/my-folder",
504+
sharepoint_site_name="my-sharepoint-site.com",
505+
client_id="my-client-id",
506+
client_secret="my-client-secret",
507+
tenant_id="my-tenant-id",
508+
drive_id="my-drive-id",
509+
)
510+
source = rag.SharePointSources(
511+
share_point_sources=[sharepoint_query],
512+
)
513+
487514
# Get the result.
488515
await response.result()
489516

vertexai/preview/rag/utils/_gapic_utils.py

+50-3
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
RagFileParsingConfig,
2727
RagCorpus as GapicRagCorpus,
2828
RagFile as GapicRagFile,
29+
SharePointSources as GapicSharePointSources,
2930
SlackSource as GapicSlackSource,
3031
JiraSource as GapicJiraSource,
3132
RagVectorDbConfig,
@@ -41,6 +42,7 @@
4142
Pinecone,
4243
RagCorpus,
4344
RagFile,
45+
SharePointSources,
4446
SlackChannelsSource,
4547
JiraSource,
4648
VertexFeatureStore,
@@ -222,7 +224,7 @@ def convert_path_to_resource_id(
222224

223225

224226
def convert_source_for_rag_import(
225-
source: Union[SlackChannelsSource, JiraSource]
227+
source: Union[SlackChannelsSource, JiraSource, SharePointSources]
226228
) -> Union[GapicSlackSource, GapicJiraSource]:
227229
"""Converts a SlackChannelsSource or JiraSource to a GapicSlackSource or GapicJiraSource."""
228230
if isinstance(source, SlackChannelsSource):
@@ -269,14 +271,57 @@ def convert_source_for_rag_import(
269271
return GapicJiraSource(
270272
jira_queries=result_source_queries,
271273
)
274+
elif isinstance(source, SharePointSources):
275+
result_source_share_point_sources = []
276+
for share_point_source in source.share_point_sources:
277+
sharepoint_folder_path = share_point_source.sharepoint_folder_path
278+
sharepoint_folder_id = share_point_source.sharepoint_folder_id
279+
drive_name = share_point_source.drive_name
280+
drive_id = share_point_source.drive_id
281+
client_id = share_point_source.client_id
282+
client_secret = share_point_source.client_secret
283+
tenant_id = share_point_source.tenant_id
284+
sharepoint_site_name = share_point_source.sharepoint_site_name
285+
result_share_point_source = GapicSharePointSources.SharePointSource(
286+
client_id=client_id,
287+
client_secret=api_auth.ApiAuth.ApiKeyConfig(
288+
api_key_secret_version=client_secret
289+
),
290+
tenant_id=tenant_id,
291+
sharepoint_site_name=sharepoint_site_name,
292+
)
293+
if sharepoint_folder_path is not None and sharepoint_folder_id is not None:
294+
raise ValueError(
295+
"sharepoint_folder_path and sharepoint_folder_id cannot both be set."
296+
)
297+
elif sharepoint_folder_path is not None:
298+
result_share_point_source.sharepoint_folder_path = (
299+
sharepoint_folder_path
300+
)
301+
elif sharepoint_folder_id is not None:
302+
result_share_point_source.sharepoint_folder_id = sharepoint_folder_id
303+
if drive_name is not None and drive_id is not None:
304+
raise ValueError("drive_name and drive_id cannot both be set.")
305+
elif drive_name is not None:
306+
result_share_point_source.drive_name = drive_name
307+
elif drive_id is not None:
308+
result_share_point_source.drive_id = drive_id
309+
else:
310+
raise ValueError("Either drive_name and drive_id must be set.")
311+
result_source_share_point_sources.append(result_share_point_source)
312+
return GapicSharePointSources(
313+
share_point_sources=result_source_share_point_sources,
314+
)
272315
else:
273-
raise TypeError("source must be a SlackChannelsSource or JiraSource.")
316+
raise TypeError(
317+
"source must be a SlackChannelsSource or JiraSource or SharePointSources."
318+
)
274319

275320

276321
def prepare_import_files_request(
277322
corpus_name: str,
278323
paths: Optional[Sequence[str]] = None,
279-
source: Optional[Union[SlackChannelsSource, JiraSource]] = None,
324+
source: Optional[Union[SlackChannelsSource, JiraSource, SharePointSources]] = None,
280325
chunk_size: int = 1024,
281326
chunk_overlap: int = 200,
282327
max_embedding_requests_per_min: int = 1000,
@@ -307,6 +352,8 @@ def prepare_import_files_request(
307352
import_rag_files_config.slack_source = gapic_source
308353
if isinstance(gapic_source, GapicJiraSource):
309354
import_rag_files_config.jira_source = gapic_source
355+
if isinstance(gapic_source, GapicSharePointSources):
356+
import_rag_files_config.share_point_sources = gapic_source
310357
else:
311358
uris = []
312359
resource_ids = []

0 commit comments

Comments
 (0)