Sync dev to main 1.0.5 #1817

Status: Closed · wants to merge 55 commits into main from dev

Changes from all commits (55):
- 65547d5 Merge branch 'dev' of github.com:janhq/cortex.cpp into dev (nguyenhoangthuan99, Nov 11, 2024)
- 81592d8 Merge branch 'dev' of github.com:janhq/cortex.cpp into dev (nguyenhoangthuan99, Nov 19, 2024)
- 4c7d4f3 Merge branch 'dev' of github.com:janhq/cortex.cpp into dev (nguyenhoangthuan99, Nov 21, 2024)
- 79ecc37 Merge branch 'dev' of github.com:janhq/cortex.cpp into dev (nguyenhoangthuan99, Nov 29, 2024)
- 192c4da chore: cleanup httplib (namchuai, Nov 14, 2024)
- 208c3f5 chore: update structured output documentation base on new cortex.llam… (nguyenhoangthuan99, Nov 29, 2024)
- 1f5631d fix: adding mutex for loaded engine map (namchuai, Nov 29, 2024)
- 9c95577 Merge pull request #1680 from janhq/j/cleanup-httplib (namchuai, Nov 29, 2024)
- 67e3554 feat: model compatibility API (#1715) (vansangpfiev, Dec 2, 2024)
- 73bbd74 fix: floating point for models endpoint (namchuai, Dec 2, 2024)
- 273c094 Merge pull request #1755 from janhq/j/fix-floating-point-models-api (namchuai, Dec 2, 2024)
- 3bdf8fa fix: get driver version and cuda version at a single command (#1754) (vansangpfiev, Dec 2, 2024)
- 9224be0 docs: fix tools should be array not object (gabrielle-ong, Dec 2, 2024)
- d5231eb Merge pull request #1749 from janhq/chore/update-structured-output-do… (gabrielle-ong, Dec 3, 2024)
- 362f8f7 Merge branch 'dev' into chore/lint-api-reference (gabrielle-ong, Dec 3, 2024)
- 5eda212 chore: refactor utils (#1760) (vansangpfiev, Dec 3, 2024)
- 7d6199d feat: add messages api (namchuai, Nov 23, 2024)
- 0528553 Merge pull request #1720 from janhq/j/add-message (namchuai, Dec 3, 2024)
- ade0aa0 Merge branch 'dev' into chore/lint-api-reference (gabrielle-ong, Dec 4, 2024)
- 4bf5e75 Merge pull request #1764 from janhq/chore/lint-api-reference (gabrielle-ong, Dec 4, 2024)
- 2b74824 fix: support ctx_len for model start cli (#1766) (vansangpfiev, Dec 4, 2024)
- 79f7679 feat: remote engine (#1666) (nguyenhoangthuan99, Dec 5, 2024)
- 4d2d236 Merge pull request #1767 from janhq/j/add-thread (namchuai, Dec 5, 2024)
- a49054c fix: deadlock when unload engine (#1769) (vansangpfiev, Dec 5, 2024)
- 61c3ee1 feat: add assistants (#1770) (namchuai, Dec 6, 2024)
- a6d9da3 fix: cortex.cpp nightly test with cortex.llamacpp (#1771) (hiento09, Dec 6, 2024)
- 97e5636 chore: add more checks and logs when load file (#1772) (vansangpfiev, Dec 6, 2024)
- 4700f8d fix: create assistant (#1773) (namchuai, Dec 6, 2024)
- e4c6a6f fix: message created at wrong value (#1774) (namchuai, Dec 8, 2024)
- 9694ec8 feat: add ssl cert configuration (#1776) (namchuai, Dec 9, 2024)
- 0b5b9aa fix: sort messages by its ulid instead of created_at (#1778) (namchuai, Dec 9, 2024)
- 6300732 chore: add backward compatible for thread (#1782) (namchuai, Dec 9, 2024)
- 0fa83b2 feat: prioritize gpus (#1768) (vansangpfiev, Dec 10, 2024)
- 43e740d Update Engine interface (#1759) (namchuai, Dec 10, 2024)
- 4a839b4 fix: stop inflight chat completion (#1765) (vansangpfiev, Dec 10, 2024)
- 2ee1e81 feat: macos 12 arm64 (#1791) (hiento09, Dec 11, 2024)
- 8dde05c feat: add files api (#1781) (namchuai, Dec 12, 2024)
- f473b0b feat: model sources (#1777) (vansangpfiev, Dec 12, 2024)
- 9f6936c chore: add files api docs (#1793) (namchuai, Dec 13, 2024)
- b390fa4 chore: add thread api docs (#1794) (namchuai, Dec 13, 2024)
- 4c39bdb chore: add messages api docs (#1795) (namchuai, Dec 13, 2024)
- a64af00 fix: load engine linux (#1790) (namchuai, Dec 13, 2024)
- 5e84fb5 fix: correct stop inferencing condition (#1796) (vansangpfiev, Dec 13, 2024)
- fc53976 fix: add support image url for jan (#1798) (namchuai, Dec 16, 2024)
- 2ef085a fix: remove sort msg by ulid (#1799) (namchuai, Dec 16, 2024)
- c7982ae fix: allow upload file with same name (#1801) (namchuai, Dec 16, 2024)
- bb97612 chore: update set default engine docs (#1800) (namchuai, Dec 16, 2024)
- 0255f3d fix: improve remote engine (#1787) (vansangpfiev, Dec 17, 2024)
- 52acbfa fix: validate GPU (#1802) (vansangpfiev, Dec 17, 2024)
- c9f15a2 fix: swagger getting configuration from config file (#1803) (namchuai, Dec 17, 2024)
- 841a8df feat: support host parameter for server (#1805) (vansangpfiev, Dec 17, 2024)
- 33be8d8 fix: check cpu info size (#1804) (vansangpfiev, Dec 18, 2024)
- 0ae0146 fix: only use dll search path if ENGINE_PATH is not set (#1808) (vansangpfiev, Dec 18, 2024)
- b0b8bec chore: correct storage dto link (#1809) (vansangpfiev, Dec 18, 2024)
- 5414e02 chore: add log for cpu instructions (#1807) (vansangpfiev, Dec 19, 2024)

Files changed:
6 changes: 3 additions & 3 deletions .github/workflows/cortex-cpp-quality-gate.yml

```diff
@@ -34,7 +34,7 @@ jobs:
             ccache-dir: ""
           - os: "mac"
             name: "arm64"
-            runs-on: "macos-silicon"
+            runs-on: "macos-selfhosted-12-arm64"
             cmake-flags: "-DCORTEX_CPP_VERSION=${{github.event.pull_request.head.sha}} -DCMAKE_BUILD_TEST=ON -DMAC_ARM64=ON -DCMAKE_TOOLCHAIN_FILE=vcpkg/scripts/buildsystems/vcpkg.cmake"
             build-deps-cmake-flags: ""
             ccache-dir: ""
@@ -124,7 +124,7 @@ jobs:
           cat ~/.cortexrc

       - name: Run e2e tests
-        if: runner.os != 'Windows' && github.event.pull_request.draft == false
+        if: github.event_name != 'schedule' && runner.os != 'Windows' && github.event.pull_request.draft == false
        run: |
          cd engine
          cp build/cortex build/cortex-nightly
@@ -138,7 +138,7 @@
           GITHUB_TOKEN: ${{ secrets.PAT_SERVICE_ACCOUNT }}

       - name: Run e2e tests
-        if: runner.os == 'Windows' && github.event.pull_request.draft == false
+        if: github.event_name != 'schedule' && runner.os == 'Windows' && github.event.pull_request.draft == false
        run: |
          cd engine
          cp build/cortex.exe build/cortex-nightly.exe
```
2 changes: 1 addition & 1 deletion .github/workflows/template-build-macos.yml

```diff
@@ -82,7 +82,7 @@ jobs:
       matrix:
         include:
           - arch: 'arm64'
-            runs-on: 'macos-silicon'
+            runs-on: 'macos-selfhosted-12-arm64'
             extra-cmake-flags: "-DMAC_ARM64=ON"

           - arch: 'amd64'
```
2 changes: 1 addition & 1 deletion docker/entrypoint.sh

```diff
@@ -7,10 +7,10 @@ echo "enableCors: true" >> /root/.cortexrc

 # Install the engine
 cortex engines install llama-cpp -s /opt/cortex.llamacpp
-cortex engines list

 # Start the cortex server
 cortex start
+cortex engines list

 # Keep the container running by tailing the log files
 tail -f /root/cortexcpp/logs/cortex.log &
```
14 changes: 9 additions & 5 deletions docs/docs/cli/models/index.mdx

```diff
@@ -120,8 +120,11 @@ For example, it returns the following:w

 | Option | Description | Required | Default value | Example |
 |---------------------------|----------------------------------------------------|----------|---------------|----------------------|
-| `-h`, `--help` | Display help for command. | No | - | `-h` |
-<!-- | `-f`, `--format <format>` | Specify output format for the models list. | No | `json` | `-f json` | -->
+| `-h`, `--help` | Display help for command. | No | - | `-h` |
+| `-e`, `--engine` | Display engines. | No | - | `--engine` |
+| `-v`, `--version` | Display version for model. | No | - | `--version` |
+| `--cpu_mode` | Display CPU mode. | No | - | `--cpu_mode` |
+| `--gpu_mode` | Display GPU mode. | No | - | `--gpu_mode` |

 ## `cortex models start`
 :::info
@@ -156,9 +159,10 @@ This command uses a `model_id` from the model that you have downloaded or availa

 | Option | Description | Required | Default value | Example |
 |---------------------------|---------------------------------------------------------------------------|----------|----------------------------------------------|------------------------|
-| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` |
-| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
-| `-h`, `--help` | Display help information for the command. | No | - | `-h` |
+| `model_id` | The identifier of the model you want to start. | Yes | `Prompt to select from the available models` | `mistral` |
+| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
+| `-h`, `--help` | Display help information for the command. | No | - | `-h` |

 ## `cortex models stop`
 :::info
```
1 change: 1 addition & 0 deletions docs/docs/cli/models/start.md

```diff
@@ -33,6 +33,7 @@ cortex models start [model_id]:[engine] [options]
 |---------------------------|----------------------------------------------------------|----------|----------------------------------------------|-------------------|
 | `model_id` | The identifier of the model you want to start. | No | `Prompt to select from the available models` | `mistral` |
 | `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
 | `-h`, `--help` | Display help information for the command. | No | - | `-h` |
```
5 changes: 3 additions & 2 deletions docs/docs/cli/run.mdx

```diff
@@ -36,7 +36,8 @@ You can use the `--verbose` flag to display more detailed output of the internal

 | Option | Description | Required | Default value | Example |
 |-----------------------------|-----------------------------------------------------------------------------|----------|----------------------------------------------|------------------------|
-| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` |
-| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `model_id` | The identifier of the model you want to chat with. | Yes | - | `mistral` |
+| `--gpus` | List of GPUs to use. | No | - | `[0,1]` |
+| `--ctx_len` | Maximum context length for inference. | No | `min(8192, max_model_context_length)` | `1024` |
 | `-h`, `--help` | Display help information for the command. | No | - | `-h` |
 <!-- | `-t`, `--thread <thread_id>` | Specify the Thread ID. Defaults to creating a new thread if none specified. | No | - | `-t jan_1717650808` | | `-c` | -->
```
235 changes: 178 additions & 57 deletions docs/docs/engines/engine-extension.mdx

This page is rewritten wholesale in a single hunk (@@ -1,89 +1,210 @@). The old stub, whose entire guide was hidden inside an HTML comment, read:

---
title: Building Engine Extensions
description: Cortex supports Engine Extensions to integrate both :ocal inference engines, and Remote APIs.
---

:::info
🚧 Cortex is currently under development, and this page is a stub for future development.
:::

<!--
import Tabs from "@theme/Tabs";
import TabItem from "@theme/TabItem";

:::warning
🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase.
:::

This document provides a step-by-step guide to adding a new engine to the Cortex codebase, similar to the `OpenAIEngineExtension`.

## Integrate a New Remote Engine

### Step 1: Create the New Engine Extension

1. Navigate to the `cortex-js/src/extensions` directory.
2. Create a new file named `<new-engine>.engine.ts` (replace `<new-engine>` with the name of your engine).
3. Implement your new engine extension class using the following template:

```typescript
class <NewEngine>EngineExtension extends OAIEngineExtension {
  apiUrl = 'https://api.<new-engine>.com/v1/chat/completions';
  name = '<new-engine>';
  productName = '<New Engine> Inference Engine';
  description = 'This extension enables <New Engine> chat completion API calls';
  version = '0.0.1';
  apiKey?: string;
}
```

:::info
Be sure to replace all placeholders with the appropriate values for your engine.
:::

### Step 2: Register the New Engine

1. Open the `extensions.module.ts` located at `cortex-js/src/extensions/`.
2. Register your new engine in the provider array using the following code:

```typescript
[
  new OpenAIEngineExtension(httpService, configUsecases, eventEmitter),
  //... other remote engines
  new <NewEngine>EngineExtension(httpService, configUsecases, eventEmitter),
]
```

## Explanation of Key Properties and Methods

| **Value** | **Description** |
|------------------------------------|--------------------------------------------------------------------------------------------------|
| `apiUrl` | This is the URL endpoint for the new engine's API. It is used to make chat completion requests. |
| `name` | This is a unique identifier for the engine. It is used internally to reference the engine. |
| `productName` | This is a human-readable name for the engine. It is used for display purposes. |
| `description` | This provides a brief description of what the engine does. It is used for documentation and display purposes. |
| `version` | This indicates the version of the engine extension. It is used for version control and display purposes. |
| `eventEmmitter.on('config.updated')` | This is an event listener that listens for configuration updates. When the configuration for the engine is updated, this listener updates the `apiKey` and the engine's status. |
| `onLoad` | This method is called when the engine extension is loaded. It retrieves the engine's configuration (such as the `apiKey`) and sets the engine's status based on whether the `apiKey` is available. |

## Advanced: Transforming Payloads and Responses

Some engines require custom transformations for the payload sent to the API and the response received from the API. This is achieved using the `transformPayload` and `transformResponse` methods. These methods allow you to modify the data structure to match the specific requirements of the engine.

### `transformPayload`

The `transformPayload` method is used to transform the data before sending it to the engine's API. This method takes the original payload and modifies it as needed.

**Example: Anthropic Engine**

In the Anthropic Engine, the `transformPayload` method extracts the system message and other messages, and includes additional parameters like `model`, `stream`, and `max_tokens`.

### `transformResponse`

The `transformResponse` method is used to transform the data received from the engine's API. This method processes the response and converts it into a format that the application can use.

**Example: Anthropic Engine**

In the Anthropic Engine, the `transformResponse` method handles both stream and non-stream responses. It processes the response data and converts it into a standardized format.
-->

The new page reads:

---
title: Adding a Third-Party Engine to Cortex
description: Cortex supports Engine Extensions to integrate both :ocal inference engines, and Remote APIs.
---

:::warning
🚧 Cortex.cpp is currently under development. Our documentation outlines the intended behavior of Cortex, which may not yet be fully implemented in the codebase.
:::

# Guide to Adding a Third-Party Engine to Cortex

## Introduction

This guide outlines the steps to integrate a custom engine with Cortex. We hope this helps developers understand the integration process.

## Implementation Steps

### 1. Implement the Engine Interface

First, create an engine that implements the `EngineI.h` interface. Here's the interface definition:

```cpp
class EngineI {
 public:
  struct RegisterLibraryOption {
    std::vector<std::filesystem::path> paths;
  };

  struct EngineLoadOption {
    // engine
    std::filesystem::path engine_path;
    std::filesystem::path cuda_path;
    bool custom_engine_path;

    // logging
    std::filesystem::path log_path;
    int max_log_lines;
    trantor::Logger::LogLevel log_level;
  };

  struct EngineUnloadOption {
    bool unload_dll;
  };

  virtual ~EngineI() {}

  virtual void RegisterLibraryPath(RegisterLibraryOption opts) = 0;

  virtual void Load(EngineLoadOption opts) = 0;

  virtual void Unload(EngineUnloadOption opts) = 0;

  // Cortex.llamacpp interface methods
  virtual void HandleChatCompletion(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

  virtual void HandleEmbedding(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

  virtual void LoadModel(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

  virtual void UnloadModel(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

  virtual void GetModelStatus(
      std::shared_ptr<Json::Value> json_body,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

  // Compatibility and model management
  virtual bool IsSupported(const std::string& f) = 0;

  virtual void GetModels(
      std::shared_ptr<Json::Value> jsonBody,
      std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

  // Logging configuration
  virtual bool SetFileLogger(int max_log_lines,
                             const std::string& log_path) = 0;
  virtual void SetLogLevel(trantor::Logger::LogLevel logLevel) = 0;
};
```
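
For orientation, here is a minimal sketch of what an implementation might look like. Everything below is illustrative: `MyEngine`, its members, and the HTTP-style status-code convention in the first callback argument are assumptions made for this sketch, not requirements documented by Cortex.

```cpp
// Minimal illustrative skeleton of a third-party engine (hypothetical).
#include <filesystem>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include <json/json.h>             // jsoncpp, used throughout the interface
#include <trantor/utils/Logger.h>  // trantor log levels

#include "EngineI.h"  // the interface shown above

class MyEngine : public EngineI {
 public:
  using Callback = std::function<void(Json::Value&&, Json::Value&&)>;

  void RegisterLibraryPath(RegisterLibraryOption opts) override {
    search_paths_ = std::move(opts.paths);  // remembered for later loading
  }

  void Load(EngineLoadOption opts) override {
    load_opts_ = std::move(opts);  // set up logging and backends here
  }

  void Unload(EngineUnloadOption /*opts*/) override {
    // Release models and other resources before the library is dropped.
  }

  void HandleChatCompletion(std::shared_ptr<Json::Value> json_body,
                            Callback&& callback) override {
    Json::Value status, result;
    status["status_code"] = 200;  // assumed HTTP-style convention
    result["text"] = "stub reply for " + (*json_body)["model"].asString();
    callback(std::move(status), std::move(result));
  }

  void HandleEmbedding(std::shared_ptr<Json::Value>, Callback&& cb) override {
    NotImplemented(std::move(cb));
  }
  void LoadModel(std::shared_ptr<Json::Value>, Callback&& cb) override {
    NotImplemented(std::move(cb));
  }
  void UnloadModel(std::shared_ptr<Json::Value>, Callback&& cb) override {
    NotImplemented(std::move(cb));
  }
  void GetModelStatus(std::shared_ptr<Json::Value>, Callback&& cb) override {
    NotImplemented(std::move(cb));
  }
  void GetModels(std::shared_ptr<Json::Value>, Callback&& cb) override {
    NotImplemented(std::move(cb));
  }

  bool IsSupported(const std::string& f) override {
    return f == "HandleChatCompletion";  // advertise implemented methods
  }

  bool SetFileLogger(int /*max_log_lines*/,
                     const std::string& /*log_path*/) override {
    return true;  // wire a real file logger here
  }
  void SetLogLevel(trantor::Logger::LogLevel /*level*/) override {}

 private:
  static void NotImplemented(Callback&& cb) {
    Json::Value status, result;
    status["status_code"] = 501;  // assumed: 501 = not implemented
    cb(std::move(status), std::move(result));
  }

  std::vector<std::filesystem::path> search_paths_;
  EngineLoadOption load_opts_;
  bool loaded_ = false;
  std::mutex mutex_;  // guards engine state; used in the later sketches
};
```

Because these methods are invoked across a shared-library boundary, a real implementation should catch everything internally and report failures through the callback's status object rather than letting exceptions escape.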

#### Lifecycle Management

##### RegisterLibraryPath

```cpp
virtual void RegisterLibraryPath(RegisterLibraryOption opts) = 0;
```

This method is called during engine initialization to set up dynamic library search paths. For example, on Linux we still have to use `LD_LIBRARY_PATH` to add CUDA dependencies to the search path.

**Parameters:**

- `opts.paths`: Vector of filesystem paths that the engine should register

**Implementation Requirements:**

- Register provided paths for dynamic library loading
- Handle invalid paths gracefully
- Thread-safe implementation
- No exceptions should escape the method
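
Continuing the hypothetical `MyEngine` sketch, a fuller replacement for its one-line `RegisterLibraryPath` that honors these requirements might look like this; whether you also need `AddDllDirectory` on Windows depends on how your library resolves its dependencies:

```cpp
#if defined(_WIN32)
#include <windows.h>  // AddDllDirectory
#endif

// Illustrative only: record valid paths under a lock, never throwing.
void MyEngine::RegisterLibraryPath(RegisterLibraryOption opts) {
  std::lock_guard<std::mutex> lock(mutex_);  // thread-safe, as required
  for (const auto& p : opts.paths) {
    std::error_code ec;
    if (!std::filesystem::exists(p, ec) || ec) {
      continue;  // invalid paths are skipped rather than thrown on
    }
#if defined(_WIN32)
    AddDllDirectory(p.wstring().c_str());  // extend the DLL search path
#endif
    search_paths_.push_back(p);  // kept for later dependency resolution
  }
}
```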

##### Load

```cpp
virtual void Load(EngineLoadOption opts) = 0;
```

Initializes the engine with the provided configuration options.

**Parameters:**

- `engine_path`: Base path for engine files
- `cuda_path`: Path to CUDA installation
- `custom_engine_path`: Flag for using custom engine location
- `log_path`: Location for log files
- `max_log_lines`: Maximum number of lines per log file
- `log_level`: Logging verbosity level

**Implementation Requirements:**

- Validate all paths before use
- Initialize engine components
- Set up logging configuration
- Handle missing dependencies gracefully
- Clean initialization state in case of failures
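
Again purely as a sketch of these requirements, a fuller `Load` for the hypothetical `MyEngine` could validate paths up front and degrade gracefully (`LOG_ERROR`/`LOG_WARN` are trantor's logging macros):

```cpp
// Illustrative replacement for the skeleton's Load: validate before use,
// fail softly, and leave no half-initialized state behind.
void MyEngine::Load(EngineLoadOption opts) {
  namespace fs = std::filesystem;
  std::error_code ec;
  if (!fs::exists(opts.engine_path, ec) || ec) {
    LOG_ERROR << "Engine path missing: " << opts.engine_path.string();
    return;  // nothing was initialized, so state stays clean
  }
  if (!opts.cuda_path.empty() && !fs::exists(opts.cuda_path, ec)) {
    LOG_WARN << "CUDA path not found; continuing without GPU support";
  }
  SetFileLogger(opts.max_log_lines, opts.log_path.string());
  SetLogLevel(opts.log_level);
  load_opts_ = std::move(opts);
  loaded_ = true;  // member flag from the skeleton above
}
```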

##### Unload

```cpp
virtual void Unload(EngineUnloadOption opts) = 0;
```

Performs cleanup and shutdown of the engine.

**Parameters:**

- `unload_dll`: Boolean flag indicating whether to unload dynamic libraries

**Implementation Requirements:**

- Clean up all allocated resources
- Close file handles and connections
- Release memory
- Ensure proper shutdown of running models
- Handle cleanup in a thread-safe manner
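
A thread-safe `Unload` in the same hypothetical sketch might look as follows; the `models_` registry and its `Stop()` method are inventions of this example, standing in for whatever model bookkeeping your engine keeps:

```cpp
// Illustrative replacement for the skeleton's Unload: stop models first,
// then release state under the same lock used by other entry points.
void MyEngine::Unload(EngineUnloadOption opts) {
  std::lock_guard<std::mutex> lock(mutex_);
  for (auto& [name, model] : models_) {  // hypothetical model registry
    model.Stop();                        // hypothetical per-model shutdown
  }
  models_.clear();
  loaded_ = false;
  if (opts.unload_dll) {
    // Cortex will drop the shared library next; nothing belonging to this
    // engine may keep running past this point.
  }
}
```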

### 2. Create a Dynamic Library

We recommend using the [dylib library](https://github.com/martin-olivier/dylib) to build your dynamic library. This library provides helpful tools for creating cross-platform dynamic libraries.
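
Cortex loads the engine from this library at runtime, so the library must export a C entry point it can resolve. The symbol names below are placeholders to show the shape (confirm the names your target Cortex version expects); the commented host-side lines show how such a symbol can be resolved with dylib:

```cpp
// engine_entry.cc -- exported C symbols for the dynamic loader.
// NOTE: create_engine/destroy_engine are placeholder names, not a
// documented Cortex contract.
#include "my_engine.h"  // hypothetical header declaring MyEngine

extern "C" {
EngineI* create_engine() {
  return new MyEngine();
}
void destroy_engine(EngineI* engine) {
  delete engine;
}
}

// Host side, using martin-olivier/dylib:
//   dylib lib("./engines/my-engine", "engine");  // finds libengine.{so,dylib,dll}
//   auto create = lib.get_function<EngineI*()>("create_engine");
//   EngineI* engine = create();
```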

### 3. Package Dependencies

Please ensure all dependencies are included with your dynamic library. This allows us to create a single, self-contained package for distribution.

### 4. Publication and Integration

#### 4.1 Publishing Your Engine (Optional)

If you wish to make your engine publicly available, you can publish it through GitHub. For reference, examine the [cortex.llamacpp releases](https://github.com/janhq/cortex.llamacpp/releases) structure:

- Each release tag should represent your version
- Include all variants within the same release
- Cortex will automatically select the most suitable variant or allow users to specify their preferred variant

#### 4.2 Integration with Cortex

Once your engine is ready, we encourage you to:

1. Notify the Cortex team about your engine for potential inclusion in our default supported engines list
2. Allow us to help test and validate your implementation

### 5. Local Testing Guide

To test your engine locally:

1. Create a directory structure following this hierarchy:

```bash
engines/
└── cortex.llamacpp/
    └── mac-arm64/
        └── v0.1.40/
            ├── libengine.dylib
            └── version.txt
```

2. Configure your engine:

   - Edit the `~/.cortexrc` file to register your engine name
   - Add your model with the appropriate engine field in `model.yaml`

3. Testing:

   - Start the engine
   - Load your model
   - Verify functionality

## Future Development

We're currently working on expanding support for additional release sources to make distribution more flexible.

## Contributing

We welcome suggestions and contributions to improve this integration process. Please feel free to submit issues or pull requests through our repository.