Skip to content

Commit f7685ce

Browse files
eavanvalkenburg, moonbox3
authored and committed
Python: restructured data folder and multiple improvements to vector stores (microsoft#11302)
### Motivation and Context <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information: 1. Why is this change required? 2. What problem does it solve? 3. What scenario does it contribute to? 4. If it fixes an open issue, please link to the issue here. --> This PR: - restructures the data folder, flattening the hierarchy - adds overloads to the Get, Upsert, Delete methods, to allow batch operations as well, and marks the batch versions as deprecated, closes microsoft#11301 - restructures the inheritance of VectorStoreRecordCollection, which now has a base that does record handling, which is shared with the VectorSearchBase for the serialization. The search method mixins now subclass the VectorSearchBase so that there is one less direct parent; a full Collection will now inherit from VectorStoreRecordCollection and zero or more VectorSearch mixins. This is done to clean things up and in preparation for adding HybridSearch. - also adds a convenience method to each search mixin to create a VectorStoreTextSearch directly, allowing folks to import one less concept, closes microsoft#11111 ### Description <!-- Describe your changes, the overall approach, the underlying design. These notes will help understanding how your code works. Thanks! --> Most of the code has been moved out of many files and folders, into a few files which make the purpose of, and collaboration between, the classes easier to understand. 
This is a breaking change: If you have implemented your own VectorStoreRecordCollection, make sure to: - Add `VectorStoreRecordCollection[TKey, TModel]` as superclass to the implementation - Remove `VectorSearchBase` as superclass from implementation - Set the SearchMixins to `[TKey, TModel]` instead of just `[TModel]` ### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [ ] I didn't break anyone 😄 --------- Co-authored-by: Evan Mattson <[email protected]>
1 parent ff6c059 commit f7685ce

File tree

99 files changed

+2079
-2440
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+2079
-2440
lines changed

python/samples/concepts/caching/semantic_caching.py

+10-13
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,22 @@
77
from typing import Annotated
88
from uuid import uuid4
99

10+
from semantic_kernel import Kernel
1011
from semantic_kernel.connectors.ai.embedding_generator_base import EmbeddingGeneratorBase
11-
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
12-
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
12+
from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion, OpenAITextEmbedding
1313
from semantic_kernel.connectors.memory.in_memory.in_memory_store import InMemoryVectorStore
14-
from semantic_kernel.data.record_definition import vectorstoremodel
15-
from semantic_kernel.data.record_definition.vector_store_record_fields import (
14+
from semantic_kernel.data import (
15+
VectorizedSearchMixin,
16+
VectorSearchOptions,
17+
VectorStore,
18+
VectorStoreRecordCollection,
1619
VectorStoreRecordDataField,
1720
VectorStoreRecordKeyField,
1821
VectorStoreRecordVectorField,
22+
vectorstoremodel,
1923
)
20-
from semantic_kernel.data.vector_search.vector_search_options import VectorSearchOptions
21-
from semantic_kernel.data.vector_search.vectorized_search import VectorizedSearchMixin
22-
from semantic_kernel.data.vector_storage.vector_store import VectorStore
23-
from semantic_kernel.data.vector_storage.vector_store_record_collection import VectorStoreRecordCollection
24-
from semantic_kernel.filters.filter_types import FilterTypes
25-
from semantic_kernel.filters.functions.function_invocation_context import FunctionInvocationContext
26-
from semantic_kernel.filters.prompts.prompt_render_context import PromptRenderContext
27-
from semantic_kernel.functions.function_result import FunctionResult
28-
from semantic_kernel.kernel import Kernel
24+
from semantic_kernel.filters import FilterTypes, FunctionInvocationContext, PromptRenderContext
25+
from semantic_kernel.functions import FunctionResult
2926

3027
COLLECTION_NAME = "llm_responses"
3128
RECORD_ID_KEY = "cache_record_id"

python/samples/concepts/chat_history/store_chat_history_in_cosmosdb.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,18 @@
66

77
from samples.concepts.setup.chat_completion_services import Services, get_chat_completion_service_and_request_settings
88
from semantic_kernel import Kernel
9-
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior
9+
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
1010
from semantic_kernel.connectors.memory.azure_cosmos_db.azure_cosmos_db_no_sql_store import AzureCosmosDBNoSQLStore
11-
from semantic_kernel.contents import ChatHistory
12-
from semantic_kernel.contents.chat_message_content import ChatMessageContent
11+
from semantic_kernel.contents import ChatHistory, ChatMessageContent
1312
from semantic_kernel.core_plugins.math_plugin import MathPlugin
1413
from semantic_kernel.core_plugins.time_plugin import TimePlugin
15-
from semantic_kernel.data.record_definition.vector_store_model_decorator import vectorstoremodel
16-
from semantic_kernel.data.record_definition.vector_store_record_fields import (
14+
from semantic_kernel.data import (
15+
VectorStore,
16+
VectorStoreRecordCollection,
1717
VectorStoreRecordDataField,
1818
VectorStoreRecordKeyField,
19+
vectorstoremodel,
1920
)
20-
from semantic_kernel.data.vector_storage.vector_store import VectorStore
21-
from semantic_kernel.data.vector_storage.vector_store_record_collection import VectorStoreRecordCollection
2221

2322
"""
2423
This sample demonstrates how to build a conversational chatbot

python/samples/concepts/memory/azure_ai_search_hotel_samples/step_1_interact_with_the_collection.py

+12-13
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
import asyncio
44

5+
from step_0_data_model import HotelSampleClass
6+
7+
from semantic_kernel import Kernel
8+
from semantic_kernel.connectors.ai.open_ai import OpenAITextEmbedding
9+
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
10+
511
###
612
# The data model used for this sample is based on the hotel data model from the Azure AI Search samples.
713
# When deploying a new index in Azure AI Search using the import wizard you can choose to deploy the 'hotel-samples'
@@ -13,24 +19,19 @@
1319
# This sample assumes the index is deployed, the vector fields can be empty.
1420
# If the vector fields are empty, change the first_run parameter to True to add the vectors.
1521
###
16-
from step_0_data_model import HotelSampleClass
17-
18-
from semantic_kernel import Kernel
19-
from semantic_kernel.connectors.ai.open_ai import OpenAITextEmbedding
20-
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
2122
from semantic_kernel.data import (
2223
VectorSearchOptions,
23-
VectorStoreRecordUtils,
2424
)
25+
from semantic_kernel.data.vector_search import add_vector_to_records
2526

2627
first_run = False
2728

2829
# Note: you may need to update this `collection_name` depending upon how your index is named.
2930
COLLECTION_NAME = "hotels-sample-index"
3031

3132

32-
async def add_vectors(collection: AzureAISearchCollection, vectorizer: VectorStoreRecordUtils):
33-
"""This is a simple function that uses the VectorStoreRecordUtils to add vectors to the records in the collection.
33+
async def add_vectors(collection: AzureAISearchCollection, kernel: Kernel):
34+
"""This is a simple function that uses the add_vector_to_records function to add vectors.
3435
3536
It first uses the search_client within the collection to get a list of ids,
3637
and then uses the upsert to add the vectors to the records.
@@ -42,7 +43,7 @@ async def add_vectors(collection: AzureAISearchCollection, vectorizer: VectorSto
4243
if hotels is not None and isinstance(hotels, list):
4344
for hotel in hotels:
4445
if not hotel.description_vector or not hotel.description_fr_vector:
45-
hotel = await vectorizer.add_vector_to_records(hotel, HotelSampleClass)
46+
hotel = await add_vector_to_records(kernel, hotel, HotelSampleClass)
4647
await collection.upsert(hotel)
4748

4849

@@ -52,10 +53,8 @@ async def main(query: str, first_run: bool = False):
5253
# Add the OpenAI text embedding service
5354
embeddings = OpenAITextEmbedding(service_id="embedding", ai_model_id="text-embedding-3-small")
5455
kernel.add_service(embeddings)
55-
# Create the VectorStoreRecordUtils object
56-
vectorizer = VectorStoreRecordUtils(kernel)
5756
# Create the Azure AI Search collection
58-
collection = AzureAISearchCollection[HotelSampleClass](
57+
collection = AzureAISearchCollection[str, HotelSampleClass](
5958
collection_name=COLLECTION_NAME, data_model_type=HotelSampleClass
6059
)
6160
# Check if the collection exists.
@@ -71,7 +70,7 @@ async def main(query: str, first_run: bool = False):
7170

7271
# If it is the first run and there are no vectors, add them.
7372
if first_run:
74-
await add_vectors(collection, vectorizer)
73+
await add_vectors(collection, kernel)
7574

7675
# Search using just text, by default this will search all the searchable text fields in the index.
7776
results = await collection.text_search(search_text=query)

python/samples/concepts/memory/azure_ai_search_hotel_samples/step_2_use_as_a_plugin.py

+21-21
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,21 @@
22

33

44
import asyncio
5-
from collections.abc import Coroutine
5+
from collections.abc import Awaitable, Callable
66
from typing import Any
77

8+
from step_0_data_model import HotelSampleClass
9+
10+
from semantic_kernel import Kernel
11+
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
12+
from semantic_kernel.connectors.ai.open_ai import (
13+
OpenAIChatCompletion,
14+
OpenAIChatPromptExecutionSettings,
15+
OpenAITextEmbedding,
16+
)
17+
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
18+
from semantic_kernel.contents import ChatHistory
19+
820
###
921
# The data model used for this sample is based on the hotel data model from the Azure AI Search samples.
1022
# When deploying a new index in Azure AI Search using the import wizard you can choose to deploy the 'hotel-samples'
@@ -16,26 +28,12 @@
1628
# This sample assumes the index is deployed, and the vectors have been filled.
1729
# Use the step_1_interact_with_the_collection.py sample, with `first_run = True` to fill the vectors.
1830
###
19-
from step_0_data_model import HotelSampleClass
20-
21-
from semantic_kernel import Kernel
22-
from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior
23-
from semantic_kernel.connectors.ai.open_ai import (
24-
OpenAIChatCompletion,
25-
OpenAIChatPromptExecutionSettings,
26-
OpenAITextEmbedding,
27-
)
28-
from semantic_kernel.connectors.memory.azure_ai_search import AzureAISearchCollection
29-
from semantic_kernel.contents import ChatHistory
3031
from semantic_kernel.data import (
3132
VectorSearchFilter,
3233
VectorSearchOptions,
33-
VectorStoreRecordUtils,
3434
)
35-
from semantic_kernel.data.search_options import SearchOptions
36-
from semantic_kernel.data.text_search.vector_store_text_search import VectorStoreTextSearch
37-
from semantic_kernel.filters.filter_types import FilterTypes
38-
from semantic_kernel.filters.functions.function_invocation_context import FunctionInvocationContext
35+
from semantic_kernel.data.text_search import SearchOptions
36+
from semantic_kernel.filters import FilterTypes, FunctionInvocationContext
3937
from semantic_kernel.functions import (
4038
KernelArguments,
4139
KernelParameterMetadata,
@@ -50,15 +48,15 @@
5048
kernel.add_service(OpenAIChatCompletion(service_id=service_id))
5149
embeddings = OpenAITextEmbedding(service_id="embedding", ai_model_id="text-embedding-3-small")
5250
kernel.add_service(embeddings)
53-
vectorizer = VectorStoreRecordUtils(kernel)
5451

5552
# Create a Text Search object, with an Azure AI Search collection.
5653
# using the `from_vector_text_search` method means that this plugin will only use text search.
5754
# You can also choose to use the `from_vectorized_search` method to use vector search.
5855
# Or the `from_vectorizable_text_search` method if the collection is setup to vectorize incoming texts.
59-
text_search = VectorStoreTextSearch.from_vector_text_search(
60-
AzureAISearchCollection[HotelSampleClass](collection_name=COLLECTION_NAME, data_model_type=HotelSampleClass)
56+
collection = AzureAISearchCollection[str, HotelSampleClass](
57+
collection_name=COLLECTION_NAME, data_model_type=HotelSampleClass
6158
)
59+
text_search = collection.create_text_search_from_vector_text_search()
6260

6361

6462
# Before we create the plugin, we want to create a function that will help the plugin work the way we want it to.
@@ -195,7 +193,9 @@ def update_options_search(
195193
# This allows us to see what parameters are being passed to the plugin.
196194
# And this gives us a way to debug the search experience and if necessary tweak the parameters and descriptions.
197195
@kernel.filter(filter_type=FilterTypes.FUNCTION_INVOCATION)
198-
async def log_search_filter(context: FunctionInvocationContext, next: Coroutine[FunctionInvocationContext, Any, None]):
196+
async def log_search_filter(
197+
context: FunctionInvocationContext, next: Callable[[FunctionInvocationContext], Awaitable[None]]
198+
):
199199
if context.function.plugin_name == "azure_ai_search":
200200
print(f"Calling Azure AI Search ({context.function.name}) with arguments:")
201201
for arg in context.arguments:

python/samples/concepts/memory/complex_memory.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,6 @@
3131
from semantic_kernel.connectors.memory.redis import RedisHashsetCollection, RedisJsonCollection
3232
from semantic_kernel.connectors.memory.weaviate import WeaviateCollection
3333
from semantic_kernel.data import (
34-
DISTANCE_FUNCTION_DIRECTION_HELPER,
35-
DistanceFunction,
36-
IndexKind,
3734
VectorizableTextSearchMixin,
3835
VectorizedSearchMixin,
3936
VectorSearchFilter,
@@ -45,7 +42,8 @@
4542
VectorTextSearchMixin,
4643
vectorstoremodel,
4744
)
48-
from semantic_kernel.data.record_definition.vector_store_record_utils import VectorStoreRecordUtils
45+
from semantic_kernel.data.const import DISTANCE_FUNCTION_DIRECTION_HELPER, DistanceFunction, IndexKind
46+
from semantic_kernel.data.vector_search import add_vector_to_records
4947

5048
# This is a rather complex sample, showing how to use the vector store
5149
# with a number of different collections.
@@ -254,14 +252,12 @@ async def main(collection: str, use_azure_openai: bool):
254252
)
255253

256254
print_with_color("Adding records!", Colors.CBLUE)
257-
records = await VectorStoreRecordUtils(kernel).add_vector_to_records(
258-
[record1, record2, record3], data_model_type=DataModel
259-
)
255+
records = await add_vector_to_records(kernel, [record1, record2, record3], data_model_type=DataModel)
260256
records = [record1, record2, record3]
261257
keys = await record_collection.upsert_batch(records)
262258
print(f" Upserted {keys=}")
263259
print_with_color("Getting records!", Colors.CBLUE)
264-
results = await record_collection.get_batch([record1.id, record2.id, record3.id])
260+
results = await record_collection.get([record1.id, record2.id, record3.id])
265261
if results:
266262
[print_record(record=result) for result in results]
267263
else:

python/samples/concepts/memory/data_models.py

+3-18
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,6 @@ class DataModelPydantic(BaseModel):
6363
other: str | None = None
6464

6565

66-
# Data model using Pydantic BaseModels with mixed annotations (from pydantic and SK)
67-
@vectorstoremodel
68-
class DataModelPydanticComplex(BaseModel):
69-
vector: Annotated[list[float], VectorStoreRecordVectorField]
70-
key: Annotated[str, Field(default_factory=lambda: str(uuid4())), VectorStoreRecordKeyField()]
71-
content: Annotated[str, VectorStoreRecordDataField(has_embedding=True, embedding_property_name="vector")] = (
72-
"content1"
73-
)
74-
other: str | None = None
75-
76-
7766
# Data model using Python classes
7867
# This one includes a custom serialize and deserialize method
7968
@vectorstoremodel
@@ -133,25 +122,21 @@ def deserialize(cls, obj: dict[str, Any]) -> "DataModelDataclass":
133122
if __name__ == "__main__":
134123
data_item1 = DataModelDataclass(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
135124
data_item2 = DataModelPydantic(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
136-
data_item3 = DataModelPydanticComplex(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
137-
data_item4 = DataModelPython(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
125+
data_item3 = DataModelPython(content="Hello, world!", vector=[1.0, 2.0, 3.0], other=None)
138126
print("Example records:")
139127
print(f"DataClass:\n {data_item1}", end="\n\n")
140128
print(f"Pydantic:\n {data_item2}", end="\n\n")
141-
print(f"Pydantic with annotations:\n {data_item3}", end="\n\n")
142-
print(f"Python:\n {data_item4}", end="\n\n")
129+
print(f"Python:\n {data_item3}", end="\n\n")
143130

144131
print("Item definitions:")
145132
print(f"DataClass:\n {data_item1.__kernel_vectorstoremodel_definition__}", end="\n\n")
146133
print(f"Pydantic:\n {data_item2.__kernel_vectorstoremodel_definition__}", end="\n\n")
147-
print(f"Pydantic with annotations:\n {data_item3.__kernel_vectorstoremodel_definition__}", end="\n\n")
148-
print(f"Python:\n {data_item4.__kernel_vectorstoremodel_definition__}", end="\n\n")
134+
print(f"Python:\n {data_item3.__kernel_vectorstoremodel_definition__}", end="\n\n")
149135
print(f"Definition for use with Pandas:\n {data_model_definition_pandas}", end="\n\n")
150136
if (
151137
data_item1.__kernel_vectorstoremodel_definition__.fields
152138
== data_item2.__kernel_vectorstoremodel_definition__.fields
153139
== data_item3.__kernel_vectorstoremodel_definition__.fields
154-
== data_item4.__kernel_vectorstoremodel_definition__.fields
155140
== data_model_definition_pandas.fields
156141
):
157142
print("All data models are the same")

python/samples/concepts/memory/memory_with_pandas.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
VectorStoreRecordDataField,
1616
VectorStoreRecordDefinition,
1717
VectorStoreRecordKeyField,
18-
VectorStoreRecordUtils,
1918
VectorStoreRecordVectorField,
2019
)
20+
from semantic_kernel.data.vector_search import add_vector_to_records
2121

2222
model_fields = VectorStoreRecordDefinition(
2323
container_mode=True,
@@ -51,7 +51,7 @@ async def main():
5151

5252
# create the dataframe and add the embeddings
5353
df = pd.DataFrame(records)
54-
df = await VectorStoreRecordUtils(kernel).add_vector_to_records(df, None, data_model_definition=model_fields)
54+
df = await add_vector_to_records(kernel, df, None, data_model_definition=model_fields)
5555
print("Records with embeddings:")
5656
print(df.shape)
5757
print(df.head(5))

python/samples/concepts/memory/simple_memory.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,15 @@
1515
)
1616
from semantic_kernel.connectors.memory.in_memory import InMemoryVectorCollection
1717
from semantic_kernel.data import (
18-
DISTANCE_FUNCTION_DIRECTION_HELPER,
19-
DistanceFunction,
20-
IndexKind,
2118
VectorSearchFilter,
2219
VectorSearchOptions,
2320
VectorStoreRecordDataField,
2421
VectorStoreRecordKeyField,
25-
VectorStoreRecordUtils,
2622
VectorStoreRecordVectorField,
2723
vectorstoremodel,
2824
)
25+
from semantic_kernel.data.const import DISTANCE_FUNCTION_DIRECTION_HELPER, DistanceFunction, IndexKind
26+
from semantic_kernel.data.vector_search import add_vector_to_records
2927

3028
# This is the most basic example of a vector store and collection
3129
# For a more complex example, using different collection types, see "complex_memory.py"
@@ -115,9 +113,7 @@ async def main():
115113

116114
# First add vectors to the records
117115
print_with_color("Adding records!", Colors.CBLUE)
118-
records_with_embedding = await VectorStoreRecordUtils(kernel).add_vector_to_records(
119-
records, data_model_type=DataModel
120-
)
116+
records_with_embedding = await add_vector_to_records(kernel, records, data_model_type=DataModel)
121117
# Next upsert them to the store.
122118
keys = await record_collection.upsert_batch(records_with_embedding)
123119
print(f" Upserted {keys=}")

python/samples/concepts/memory/utils.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
# Copyright (c) Microsoft. All rights reserved.
22

3-
43
from typing import TypeVar
54

65
from samples.concepts.resources.utils import Colors, print_with_color
7-
from semantic_kernel.data import (
8-
VectorSearchResult,
9-
)
6+
from semantic_kernel.data import VectorSearchResult
107

118
_T = TypeVar("_T")
129

0 commit comments

Comments
 (0)