Commit 40638a1

fix: unify embedding models (#2027)

* feat: unify embedding model to nomic
* docs: add embedding dimensions mismatch
* docs: fix fern

1 parent 9027d69 · commit 40638a1

File tree

5 files changed: +24 additions, -6 deletions
fern/docs/pages/installation/troubleshooting.mdx

Lines changed: 19 additions & 1 deletion

````diff
@@ -28,4 +28,22 @@ PrivateGPT uses the `AutoTokenizer` library to tokenize input text accurately. I
    ```
 2. **Set Access Token for Gated Models:**
    If you are using a gated model, ensure the `access_token` is set as mentioned in the previous section.
-This configuration ensures that PrivateGPT can download and use the correct tokenizer for the model you are working with.
+This configuration ensures that PrivateGPT can download and use the correct tokenizer for the model you are working with.
+
+# Embedding dimensions mismatch
+If you encounter an error message like `Embedding dimensions mismatch`, it is likely caused by a mismatch between
+the embedding model's output dimension and the current vector dimension. To resolve this issue, ensure that the model and the stored vectors have the same dimension.
+
+By default, PrivateGPT uses `nomic-embed-text` embeddings, which have a vector dimension of 768.
+If you are using a different embedding model, ensure that the vector dimensions match the model's output.
+
+<Callout intent = "warning">
+In versions below 0.6.0, the default embedding model was `BAAI/bge-small-en-v1.5` in the `huggingface` setup.
+If you plan to reuse the old generated embeddings, you need to update the `settings.yaml` file to use the correct embedding model:
+```yaml
+huggingface:
+  embedding_hf_model_name: BAAI/bge-small-en-v1.5
+embedding:
+  embed_dim: 384
+```
+</Callout>
````
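The failure mode this new troubleshooting entry describes can be sketched in a few lines of Python. The `check_embed_dim` helper below is hypothetical and not part of PrivateGPT; only the dimensions 768 and 384 come from the docs above:

```python
# Hypothetical helper illustrating the "Embedding dimensions mismatch" error:
# the vector store expects vectors of a fixed size (embed_dim), so switching
# embedding models without updating embed_dim breaks ingestion and queries.

def check_embed_dim(vector: list[float], configured_dim: int) -> None:
    """Reject a vector whose length differs from the configured embed_dim."""
    if len(vector) != configured_dim:
        raise ValueError(
            f"Embedding dimensions mismatch: model produced {len(vector)} "
            f"dimensions, but the vector store expects {configured_dim}"
        )

# nomic-ai/nomic-embed-text-v1.5 outputs 768-dimensional vectors,
# while the pre-0.6.0 default BAAI/bge-small-en-v1.5 outputs 384.
nomic_vector = [0.0] * 768

check_embed_dim(nomic_vector, 768)  # dimensions agree: no error

try:
    check_embed_dim(nomic_vector, 384)  # stale embed_dim from an old settings.yaml
except ValueError as err:
    print(err)
```

This is why the callout tells users with pre-0.6.0 embeddings to pin both the old model name and `embed_dim: 384` together: changing only one of the two reproduces exactly this error.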

settings-docker.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,7 +13,7 @@ llamacpp:
   llm_hf_model_file: ${PGPT_HF_MODEL_FILE:Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf}
 
 huggingface:
-  embedding_hf_model_name: ${PGPT_EMBEDDING_HF_MODEL_NAME:BAAI/bge-small-en-v1.5}
+  embedding_hf_model_name: ${PGPT_EMBEDDING_HF_MODEL_NAME:nomic-ai/nomic-embed-text-v1.5}
 
 sagemaker:
   llm_endpoint_name: ${PGPT_SAGEMAKER_LLM_ENDPOINT_NAME:}
```
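The `${PGPT_EMBEDDING_HF_MODEL_NAME:...}` values in this file use an environment-variable-with-default syntax. A minimal sketch of how such placeholders can be resolved — this resolver is illustrative only, not PrivateGPT's actual settings loader:

```python
import os
import re

# Matches ${VAR_NAME:default} — the default may be empty, as in ${HF_TOKEN:}.
_PLACEHOLDER = re.compile(r"\$\{(\w+):([^}]*)\}")

def resolve(value: str) -> str:
    """Substitute each ${VAR:default} with the env var's value, else the default."""
    return _PLACEHOLDER.sub(
        lambda m: os.environ.get(m.group(1), m.group(2)), value
    )

# With PGPT_EMBEDDING_HF_MODEL_NAME unset, the default model name is used.
print(resolve("${PGPT_EMBEDDING_HF_MODEL_NAME:nomic-ai/nomic-embed-text-v1.5}"))
```

Under this scheme, exporting `PGPT_EMBEDDING_HF_MODEL_NAME` before starting the container overrides the default, letting the Docker setup swap models without editing the file.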

settings-local.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -18,7 +18,7 @@ embedding:
   mode: huggingface
 
 huggingface:
-  embedding_hf_model_name: BAAI/bge-small-en-v1.5
+  embedding_hf_model_name: nomic-ai/nomic-embed-text-v1.5
 
 vectorstore:
   database: qdrant
```

settings-vllm.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -12,7 +12,7 @@ embedding:
   ingest_mode: simple
 
 huggingface:
-  embedding_hf_model_name: BAAI/bge-small-en-v1.5
+  embedding_hf_model_name: nomic-ai/nomic-embed-text-v1.5
 
 openai:
   api_base: http://localhost:8000/v1
```

settings.yaml

Lines changed: 2 additions & 2 deletions

```diff
@@ -76,10 +76,10 @@ embedding:
   # Should be matching the value above in most cases
   mode: huggingface
   ingest_mode: simple
-  embed_dim: 384 # 384 is for BAAI/bge-small-en-v1.5
+  embed_dim: 768 # 768 is for nomic-ai/nomic-embed-text-v1.5
 
 huggingface:
-  embedding_hf_model_name: BAAI/bge-small-en-v1.5
+  embedding_hf_model_name: nomic-ai/nomic-embed-text-v1.5
   access_token: ${HF_TOKEN:}
 
 vectorstore:
```
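Because `embed_dim` and `embedding_hf_model_name` live in different sections of settings.yaml, they can silently drift apart — which is exactly what this commit fixes by updating both together. A small consistency check over the parsed settings; the `KNOWN_DIMS` table and the check itself are illustrative assumptions, not part of PrivateGPT:

```python
# Output dimensions of the two models mentioned in this commit; an assumed
# lookup table for illustration, not an exhaustive registry.
KNOWN_DIMS = {
    "nomic-ai/nomic-embed-text-v1.5": 768,
    "BAAI/bge-small-en-v1.5": 384,
}

# The relevant parts of settings.yaml after this commit, as a parsed dict.
settings = {
    "embedding": {"mode": "huggingface", "ingest_mode": "simple", "embed_dim": 768},
    "huggingface": {"embedding_hf_model_name": "nomic-ai/nomic-embed-text-v1.5"},
}

model = settings["huggingface"]["embedding_hf_model_name"]
configured = settings["embedding"]["embed_dim"]

# Flag the drift before it surfaces as a runtime "Embedding dimensions mismatch".
if KNOWN_DIMS.get(model, configured) != configured:
    raise SystemExit(
        f"embed_dim is {configured}, but {model} outputs {KNOWN_DIMS[model]} dimensions"
    )
print("embed_dim matches the configured embedding model")
```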
