|
| 1 | +import os |
| 2 | +import sys |
| 3 | +import pathlib |
| 4 | + |
| 5 | + |
| 6 | +from llama_index.embeddings.huggingface import HuggingFaceEmbedding |
| 7 | +from llama_index.core.ingestion import ( |
| 8 | + DocstoreStrategy, |
| 9 | + IngestionPipeline, |
| 10 | + IngestionCache, |
| 11 | +) |
| 12 | +from llama_index.storage.kvstore.redis import RedisKVStore as RedisCache |
| 13 | +from llama_index.storage.docstore.redis import RedisDocumentStore |
| 14 | +from llama_index.core.node_parser import SentenceSplitter |
| 15 | +from llama_index.vector_stores.redis import RedisVectorStore |
| 16 | + |
| 17 | +from redisvl.schema import IndexSchema |
| 18 | +from llama_index.core import SimpleDirectoryReader, VectorStoreIndex |
| 19 | + |
| 20 | +# Add rag_demo package to PYTHONPATH so this script can access it. |
| 21 | +sys.path.append(str(pathlib.Path(__file__).parent.parent.absolute())) |
| 22 | +from rag_demo import custom_schema, getenv_or_exit |
| 23 | + |
| 24 | + |
# Runtime configuration, read from the environment once at start-up.
# EMBEDDING_MODEL_NAME and REDIS_PORT have defaults; REDIS_HOST and INPUT_DIR
# go through rag_demo.getenv_or_exit (presumably aborts when the variable is
# unset -- confirm against rag_demo).
EMBEDDING_MODEL_NAME = os.getenv("EMBEDDING_MODEL_NAME", "BAAI/bge-small-en-v1.5")
REDIS_HOST = getenv_or_exit("REDIS_HOST")
# The port must come from REDIS_PORT, not REDIS_URL: a URL value cannot be
# parsed by int(), and a configured port would otherwise be silently ignored.
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))
INPUT_DIR = getenv_or_exit("INPUT_DIR")
| 29 | + |
# Embedding model shared by the ingestion pipeline and the index.
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)

# Redis-backed vector store using the project's custom index schema.
# The URL carries REDIS_PORT explicitly so the vector store connects to the
# same Redis instance as the cache and document store, which are both built
# from (REDIS_HOST, REDIS_PORT); without it a non-default port was ignored here.
vector_store = RedisVectorStore(
    schema=custom_schema,
    redis_url=f"redis://{REDIS_HOST}:{REDIS_PORT}",
)
| 35 | + |
# Ingestion cache layer, backed by a Redis key-value store and kept under the
# "redis_cache" collection.
_cache_kvstore = RedisCache.from_host_and_port(REDIS_HOST, REDIS_PORT)
cache = IngestionCache(cache=_cache_kvstore, collection="redis_cache")
| 41 | + |
# Document store for the pipeline, kept in Redis under its own namespace.
_docstore = RedisDocumentStore.from_host_and_port(
    REDIS_HOST,
    REDIS_PORT,
    namespace="document_store",
)

# Ingestion pipeline: documents go through SentenceSplitter and then the
# embedding model. It is wired to the vector store, the ingestion cache and
# the document store, using the UPSERTS docstore strategy.
pipeline = IngestionPipeline(
    transformations=[SentenceSplitter(), embed_model],
    docstore=_docstore,
    vector_store=vector_store,
    cache=cache,
    docstore_strategy=DocstoreStrategy.UPSERTS,
)
| 54 | + |
# Index built on top of the vector store that the pipeline writes to, using
# the same embedding model as the pipeline.
index = VectorStoreIndex.from_vector_store(
    pipeline.vector_store,
    embed_model=embed_model,
)

# Reader over the directory named by INPUT_DIR.
reader = SimpleDirectoryReader(input_dir=INPUT_DIR)
| 61 | + |
def load_data(reader: SimpleDirectoryReader):
    """Load documents from *reader* and assign each a path-derived, unique ID.

    Each document's ``id_`` is set from its ``file_path`` metadata, so the
    same file gets the same ID on every run. A reader may return several
    documents for one file (presumably e.g. one per page); giving all of them
    the bare path would make their IDs collide. The first document of a file
    therefore keeps the plain path and each later one gets a ``_part_<n>``
    suffix, which leaves IDs unchanged for single-document files.

    Args:
        reader: Object whose ``load_data()`` returns the documents to ingest.

    Returns:
        The list returned by ``reader.load_data()``, with ``id_`` set on
        every document.

    Raises:
        KeyError: If a document has no ``file_path`` metadata entry.
    """
    docs = reader.load_data()
    # Number of documents already seen per file path.
    parts_seen = {}
    for doc in docs:
        path = doc.metadata["file_path"]
        part = parts_seen.get(path, 0)
        doc.id_ = path if part == 0 else f"{path}_part_{part}"
        parts_seen[path] = part + 1
    return docs
| 67 | + |
# Load the input documents and report how many were found.
docs = load_data(reader)
print(f"Loaded {len(docs)} docs")

# Run the ingestion pipeline over the loaded documents (with a progress
# display) and report how many nodes it returned.
nodes = pipeline.run(documents=docs, show_progress=True)
print(f"Ingested {len(nodes)} Nodes")
0 commit comments