
Commit 19221db

Merge pull request #57 from Tanzania-AI-Community/documentation
Add documentation and small code modifications for modularity
2 parents 864baa7 + 583ad5b commit 19221db

File tree: 9 files changed (+258, −67 lines)

app/config.py

+16-6
@@ -2,7 +2,7 @@
 This module sets the env configs for our WhatsApp app.
 """
 
-from typing import Optional
+from typing import Literal, Optional
 import os
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from pydantic import SecretStr, field_validator
@@ -66,22 +66,32 @@ class LLMSettings(BaseSettings):
         case_sensitive=False,
         env_nested_delimiter="__",
     )
-    # Together AI settings
+
+    # AI provider api key
     llm_api_key: Optional[SecretStr] = None
 
     # Model selection
    llm_model_options: dict = {
         "llama_405b": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
         "llama_70b": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
         "mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+        "gpt-4o": "gpt-4o",
+        "gpt-4o_mini": "gpt-4o-mini",
     }
-    llm_model_name: str = llm_model_options["llama_405b"]
 
-    # Embedding model
-    embedding_model: str = "BAAI/bge-large-en-v1.5"
+    embedder_model_options: dict = {
+        "bge-large": "BAAI/bge-large-en-v1.5", # 1024 dimensions
+        "text-embedding-3-small": "text-embedding-3-small", # 1536 dimensions
+    }
 
-    # Exercise generator model
+    """
+    XXX: FILL YOUR AI PROVIDER AND MODEL CHOICES HERE (DEFAULTS ARE PREFILLED)
+    - make sure your choice of LLM, embedder, and ai_provider are compatible
+    """
+    ai_provider: Literal["together", "openai"] = "together"
+    llm_model_name: str = llm_model_options["llama_405b"]
     exercise_generator_model: str = llm_model_options["llama_70b"]
+    embedding_model: str = embedder_model_options["bge-large"]
 
 
 def initialize_settings():
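
For illustration, a deployment that opts into the OpenAI stack would change the defaults in the XXX block roughly as sketched below. The keys and values come from the option dicts in this diff, but this is only an example, not something the commit sets; the matching vector size also has to be updated in app/database/models.py (next diff).

# Illustrative OpenAI defaults for the XXX block in LLMSettings (not part of this commit)
ai_provider: Literal["together", "openai"] = "openai"
llm_model_name: str = llm_model_options["gpt-4o"]
exercise_generator_model: str = llm_model_options["gpt-4o_mini"]
embedding_model: str = embedder_model_options["text-embedding-3-small"]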

app/database/models.py

+8-1
@@ -417,7 +417,14 @@ class Chunk(SQLModel, table=True):
     content_type: Optional[str] = Field(
         max_length=30
     ) # exercise, text, image, etc. (to define later) - maybe add index in future
-    embedding: Any = Field(sa_column=Column(Vector(1024))) # BAAI/bge-large-en-v1.5
+
+    """
+    XXX: FILL IN THE EMBEDDING LENGTH FOR YOUR EMBEDDINGS
+    - Default is set to 1024 (for bge-large vectors)
+    - Replace with 1536 for text-embedding-3-small if using OpenAI's embedder
+    """
+    embedding: Any = Field(sa_column=Column(Vector(1024)))
+
     top_level_section_index: Optional[str] = Field(max_length=10, default=None)
     top_level_section_title: Optional[str] = Field(max_length=100, default=None)
     created_at: Optional[datetime] = Field(
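
Following the XXX note above, an OpenAI-based setup would swap the column dimension for 1536. A one-line sketch of that change (illustrative only; the commit itself keeps 1024):

# 1536 dimensions for OpenAI's text-embedding-3-small, per the comment above
embedding: Any = Field(sa_column=Column(Vector(1536)))

Since this alters an existing column, it would normally also require a database migration (see the migrations notes in the architecture docs below).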

app/services/llm_service.py

-5
@@ -2,7 +2,6 @@
 import logging
 import asyncio
 from typing import List, Optional
-from openai import AsyncOpenAI
 from openai.types.chat import ChatCompletionMessageToolCall
 
 from app.database.models import Message, MessageRole, User
@@ -41,10 +40,6 @@ def is_locked(self) -> bool:
 
 class LLMClient:
     def __init__(self):
-        self.client = AsyncOpenAI(
-            base_url="https://api.together.xyz/v1",
-            api_key=llm_settings.llm_api_key.get_secret_value(),
-        )
         self.logger = logging.getLogger(__name__)
         self._processors: dict[int, MessageProcessor] = {}
 

app/utils/embedder.py

+5-3
@@ -1,10 +1,12 @@
-# This is in scripts/database for now but will be moved to app/database
 from typing import List
 from app.config import llm_settings
 from together import Together
+from openai import OpenAI
 
-client = Together(
-    api_key=llm_settings.llm_api_key.get_secret_value(),
+client = (
+    OpenAI(api_key=llm_settings.llm_api_key.get_secret_value())
+    if llm_settings.ai_provider == "openai"
+    else Together(api_key=llm_settings.llm_api_key.get_secret_value())
 )
 
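
Both SDKs expose the same embeddings.create interface, so code using this shared client stays provider-agnostic. A minimal usage sketch, assuming the configured embedding_model is compatible with the selected provider (the helper name embed_text is illustrative, not from the repository):

from typing import List

from app.config import llm_settings
from app.utils.embedder import client  # provider-aware client defined in the diff above


def embed_text(text: str) -> List[float]:
    # Together and OpenAI both return the vector at response.data[0].embedding
    response = client.embeddings.create(
        model=llm_settings.embedding_model,
        input=text,
    )
    return response.data[0].embedding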

app/utils/llm_utils.py

+7-4
@@ -12,10 +12,13 @@
 # Set up basic logging configuration
 logger = logging.getLogger(__name__)
 
-llm_client = openai.AsyncOpenAI(
-    base_url="https://api.together.xyz/v1",
-    api_key=llm_settings.llm_api_key.get_secret_value(),
-)
+if llm_settings.ai_provider == "together":
+    llm_client = openai.AsyncOpenAI(
+        base_url="https://api.together.xyz/v1",
+        api_key=llm_settings.llm_api_key.get_secret_value(),
+    )
+else:
+    llm_client = openai.AsyncOpenAI(api_key=llm_settings.llm_api_key.get_secret_value())
 
 
 def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
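
Because llm_client is an AsyncOpenAI instance in both branches (Together serves an OpenAI-compatible API at the base_url above), call sites do not need to know which provider is active. A rough usage sketch under that assumption; the ask wrapper is illustrative, not a function from the repository:

import asyncio

from app.config import llm_settings
from app.utils.llm_utils import llm_client  # provider-aware async client from the diff above


async def ask(prompt: str) -> str:
    # Same chat.completions call whether the client points at Together or OpenAI
    response = await llm_client.chat.completions.create(
        model=llm_settings.llm_model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content


# Example: print(asyncio.run(ask("Summarize chapter 3 in two sentences.")))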

docs/en/ARCHITECTURE.md

+38-3
@@ -1,4 +1,5 @@
 # Platform infrastructure
+
 <div align="center">
 
 ![Twiga Architecture](https://github.com/user-attachments/assets/33e4e394-b724-4ea4-af2a-7e75f93615aa)
@@ -7,8 +8,42 @@
 
 This diagram is an overview of the infrastructure for the first iteration of Twiga in production. We appreciate simple architectures and want to minimize the number of platforms we use, all while maintaining good performance.
 
-# Code infrastructure
-...tbd
+# Code architecture
+
+We have designed Twiga's backend for simplicity and modularity.
+
+## `app`
+
+Everything used to run the Twiga application is within the `app` folder. Requests coming from WhatsApp users (via the Meta API) are first received by the endpoints in the `app/main.py` file (the `webhooks` endpoint). Incoming WhatsApp webhook signatures are verified by the decorators in `app/security.py`, and then the `handle_request` function in `app/services/messaging_service.py` routes each request in the right direction depending on the type of request and the state of the user.
+
+All environment variables are fetched from `app/config.py`, so whenever you need them, just import the settings into your file.
+
+> [!Note]
+>
+> Don't use `dotenv`, just use our settings.
+
+The AI-related code is mainly handled in `app/services/llm_service.py`. Conveniently, if you're planning on creating any new tools, you can create them in the `app/tools/` folder. Just follow the convention we've set.
+
+We'll leave it up to you to explore the rest.
+
+> [!Warning]
+>
+> If anything here appears off it may not be up to date. Let us know 😁
+
+## `scripts`
+
+Within the `scripts` folder we keep files that developers run intermittently. Look in there if you want to populate your own version of the database with some textbook data.
+
+## `tests`
+
+> [!Note]
+>
+> We haven't written tests yet, but they're on the roadmap.
 
 # Database schema
-...tbd
+
+We're using tiangolo's [SQLModel](https://sqlmodel.tiangolo.com/) as an [ORM](https://en.wikipedia.org/wiki/Object%E2%80%93relational_mapping) to interact with the Neon Postgres database in this project. Instead of statically sharing the database schema here (it is likely to change over time), we refer you to the `app/database/models.py` file, which should contain everything you need to know about the tables used in Twiga. We also have an [entity-relationship diagram](https://drive.google.com/file/d/10dKIW6I6_d-712rt0s-7KltTWTmBjRIP/view?usp=sharing) (ERD) giving an overview of the table relations, but it is not consistently maintained and may not exactly match the current database schema.
+
+## `migrations`
+
+This folder keeps track of the database history. We use [_alembic_](https://medium.com/@kasperjuunge/how-to-get-started-with-alembic-and-sqlmodel-288700002543) migrations. Unless you want to use _alembic_ for your own copy of the database, you can ignore this folder. If you're in the core team and have access to our Neon database, it might be good to know how it works and why we use it.
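
As an aside on the settings note in the `app` section above: importing the shared settings object looks like the snippet below, mirroring how app/utils/embedder.py and app/utils/llm_utils.py already do it in this commit (the printed fields are just examples).

from app.config import llm_settings

# Read configuration through the shared pydantic settings object instead of dotenv
print(llm_settings.ai_provider)     # "together" or "openai"
print(llm_settings.llm_model_name)  # e.g. "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"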

0 commit comments
