24
24
# todo: high latency between the client and the Ollama embedding server will slow down embedding a lot
25
25
from llama_index .embeddings .ollama import OllamaEmbedding
26
26
27
+ # set RAG model deploy mode
28
+ RAG_MODEL_DEPLOY = os .environ .get ("RAG_MODEL_DEPLOY" ) or "local"
29
+
27
30
def build_automerging_index (
28
31
documents ,
29
32
llm ,
30
- # embed_model="local:BAAI/bge-small-en-v1.5",
31
33
chunk_sizes = None ,
32
34
):
33
35
chunk_sizes = chunk_sizes or [2048 , 512 , 128 ]
34
- embed_model = OllamaEmbedding (
35
- model_name = "jina/jina-embeddings-v2-base-en" ,
36
- base_url = os .environ .get ("OLLAMA_BASE_URL" ), # todo: any other configs here?
37
- )
36
+
37
+ if RAG_MODEL_DEPLOY == "local" :
38
+ embed_model = "local:jinaai/jina-embeddings-v2-base-en"
39
+ else :
40
+ embed_model = OllamaEmbedding (
41
+ model_name = "jina/jina-embeddings-v2-base-en" ,
42
+ base_url = os .environ .get ("OLLAMA_BASE_URL" ), # todo: any other configs here?
43
+ )
44
+
38
45
node_parser = HierarchicalNodeParser .from_defaults (chunk_sizes = chunk_sizes )
39
46
nodes = node_parser .get_nodes_from_documents (documents )
40
47
leaf_nodes = get_leaf_nodes (nodes )
@@ -59,10 +66,14 @@ def get_automerging_query_engine(
59
66
retriever = AutoMergingRetriever (
60
67
base_retriever , automerging_index .storage_context , verbose = True
61
68
)
62
- # rerank = SentenceTransformerRerank(
63
- # top_n=rerank_top_n, model="BAAI/bge-reranker-base"
64
- # )
65
- rerank = jinaai_rerank .JinaRerank (api_key = '' , top_n = rerank_top_n , model = "jina-reranker-v2" )
69
+
70
+ if RAG_MODEL_DEPLOY == "local" :
71
+ rerank = SentenceTransformerRerank (
72
+ top_n = rerank_top_n , model = "jinaai/jina-reranker-v2-base-multilingual"
73
+ )
74
+ else :
75
+ rerank = jinaai_rerank .JinaRerank (api_key = '' , top_n = rerank_top_n , model = "jina-reranker-v2" )
76
+
66
77
auto_merging_engine = RetrieverQueryEngine .from_args (
67
78
retriever , node_postprocessors = [rerank ]
68
79
)
0 commit comments