|
| 1 | +# Before running this example, ensure you have PostgreSQL installed with the pgvector extension. |
| 2 | +# For a quick setup using Docker: |
| 3 | +# docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres |
| 4 | +# -e POSTGRES_DB=postgres ankane/pgvector |
| 5 | + |
| 6 | +# Install required packages for this example, including pgvector-haystack and other libraries needed |
| 7 | +# for Markdown conversion and embeddings generation. Use the following command: |
| 8 | +# pip install pgvector-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0" |
| 9 | + |
| 10 | +# Download some Markdown files to index. |
| 11 | +# git clone https://github.com/anakin87/neural-search-pills |
| 12 | + |
| 13 | +import glob |
| 14 | + |
| 15 | +from haystack import Pipeline |
| 16 | +from haystack.components.converters import MarkdownToDocument |
| 17 | +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder |
| 18 | +from haystack.components.joiners import DocumentJoiner |
| 19 | +from haystack.components.preprocessors import DocumentSplitter |
| 20 | +from haystack.components.writers import DocumentWriter |
| 21 | +from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever, PgvectorKeywordRetriever |
| 22 | +from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore |
| 23 | + |
| 24 | +# Set an environment variable `PG_CONN_STR` with the connection string to your PostgreSQL database. |
| 25 | +# e.g., "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME" |
| 26 | + |
| 27 | +# Initialize PgvectorDocumentStore |
| 28 | +document_store = PgvectorDocumentStore( |
| 29 | + table_name="haystack_test", |
| 30 | + embedding_dimension=768, |
| 31 | + vector_function="cosine_similarity", |
| 32 | + recreate_table=True, |
| 33 | + search_strategy="hnsw", |
| 34 | +) |
| 35 | + |
| 36 | +# Create the indexing Pipeline and index some documents |
| 37 | +file_paths = glob.glob("neural-search-pills/pills/*.md") |
| 38 | + |
| 39 | + |
| 40 | +indexing = Pipeline() |
| 41 | +indexing.add_component("converter", MarkdownToDocument()) |
| 42 | +indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2)) |
| 43 | +indexing.add_component("document_embedder", SentenceTransformersDocumentEmbedder()) |
| 44 | +indexing.add_component("writer", DocumentWriter(document_store)) |
| 45 | +indexing.connect("converter", "splitter") |
| 46 | +indexing.connect("splitter", "document_embedder") |
| 47 | +indexing.connect("document_embedder", "writer") |
| 48 | + |
| 49 | +indexing.run({"converter": {"sources": file_paths}}) |
| 50 | + |
| 51 | +# Create the querying Pipeline and try a query |
| 52 | +querying = Pipeline() |
| 53 | +querying.add_component("text_embedder", SentenceTransformersTextEmbedder()) |
| 54 | +querying.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store, top_k=3)) |
| 55 | +querying.add_component("keyword_retriever", PgvectorKeywordRetriever(document_store=document_store, top_k=3)) |
| 56 | +querying.add_component( |
| 57 | + "joiner", |
| 58 | + DocumentJoiner(join_mode="reciprocal_rank_fusion", top_k=3), |
| 59 | +) |
| 60 | +querying.connect("text_embedder", "retriever") |
| 61 | +querying.connect("keyword_retriever", "joiner") |
| 62 | +querying.connect("retriever", "joiner") |
| 63 | + |
| 64 | +query = "cross-encoder" |
| 65 | +results = querying.run({"text_embedder": {"text": query}, "keyword_retriever": {"query": query}}) |
| 66 | + |
| 67 | +for doc in results["joiner"]["documents"]: |
| 68 | + print(doc) |
| 69 | + print("-" * 10) |