Skip to content

Commit 7141c68

Browse files
jlonge4 and anakin87 authored
* hybrid retrieval ex * Update integrations/pgvector/examples/hybrid_retrieval.py Co-authored-by: Stefano Fiorucci <[email protected]> * suggested updates * suggested updates * suggested updates --------- Co-authored-by: Stefano Fiorucci <[email protected]>
1 parent c4f1cc4 commit 7141c68

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Before running this example, ensure you have PostgreSQL installed with the pgvector extension.
2+
# For a quick setup using Docker:
3+
# docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres \
4+
#   -e POSTGRES_DB=postgres ankane/pgvector
5+
6+
# Install required packages for this example, including pgvector-haystack and other libraries needed
7+
# for Markdown conversion and embeddings generation. Use the following command:
8+
# pip install pgvector-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
9+
10+
# Download some Markdown files to index.
11+
# git clone https://github.com/anakin87/neural-search-pills
12+
13+
import glob
14+
15+
from haystack import Pipeline
16+
from haystack.components.converters import MarkdownToDocument
17+
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
18+
from haystack.components.joiners import DocumentJoiner
19+
from haystack.components.preprocessors import DocumentSplitter
20+
from haystack.components.writers import DocumentWriter
21+
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever, PgvectorKeywordRetriever
22+
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
23+
24+
# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database.
25+
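# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
# (with the Docker quick setup shown above: "postgresql://postgres:postgres@localhost:5432/postgres")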
26+
27+
# Initialize PgvectorDocumentStore
28+
# No connection string is passed here: as noted above, it is expected to come
# from the `PG_CONN_STR` environment variable.
document_store = PgvectorDocumentStore(
    table_name="haystack_test",
    # Recreate the table if it already exists, so repeated runs of this example
    # do not accumulate documents from earlier runs.
    recreate_table=True,
    # Must equal the vector size produced by the embedders used below
    # (768 presumably matches their default model; confirm if you swap models).
    embedding_dimension=768,
    search_strategy="hnsw",
    vector_function="cosine_similarity",
)
35+
36+
# Create the indexing Pipeline and index some documents
37+
# Collect the Markdown files to index; sorting makes the indexing order reproducible.
file_paths = sorted(glob.glob("neural-search-pills/pills/*.md"))
if not file_paths:
    # Fail fast with an actionable message instead of running the pipeline on an
    # empty file list and only noticing later, when queries return nothing.
    msg = (
        "No Markdown files found under 'neural-search-pills/pills/'. "
        "Clone https://github.com/anakin87/neural-search-pills into the current working directory first."
    )
    raise FileNotFoundError(msg)

# Indexing pipeline: Markdown files -> Documents -> two-sentence chunks -> embeddings -> document store.
indexing = Pipeline()
indexing.add_component("converter", MarkdownToDocument())
indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
indexing.add_component("document_embedder", SentenceTransformersDocumentEmbedder())
indexing.add_component("writer", DocumentWriter(document_store))

# Wire the stages in processing order.
indexing.connect("converter", "splitter")
indexing.connect("splitter", "document_embedder")
indexing.connect("document_embedder", "writer")

indexing.run({"converter": {"sources": file_paths}})
50+
51+
# Create the querying Pipeline and try a query
52+
# Hybrid retrieval: an embedding-based branch and a keyword-based branch both
# query the same document store, and the joiner merges their ranked results
# using reciprocal rank fusion.
querying = Pipeline()
for component_name, component in (
    ("text_embedder", SentenceTransformersTextEmbedder()),
    ("retriever", PgvectorEmbeddingRetriever(document_store=document_store, top_k=3)),
    ("keyword_retriever", PgvectorKeywordRetriever(document_store=document_store, top_k=3)),
    ("joiner", DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3)),
):
    querying.add_component(component_name, component)

# The query embedding feeds the embedding retriever; both retrievers feed the joiner.
for sender, receiver in (
    ("text_embedder", "retriever"),
    ("keyword_retriever", "joiner"),
    ("retriever", "joiner"),
):
    querying.connect(sender, receiver)

# The same query string is sent to both branches: as text to embed and as keyword query.
query = "cross-encoder"
pipeline_inputs = {
    "text_embedder": {"text": query},
    "keyword_retriever": {"query": query},
}
results = querying.run(pipeline_inputs)

# Print each fused document followed by a separator line.
fused_documents = results["joiner"]["documents"]
for doc in fused_documents:
    print(doc, "-" * 10, sep="\n")

0 commit comments

Comments
 (0)