diff --git a/edge-data-transfer-demo-v2/.envrc b/edge-data-transfer-demo-v2/.envrc
deleted file mode 100644
index e69de29b..00000000
diff --git a/edge-data-transfer-demo-v2/Readme.md b/edge-data-transfer-demo-v2/Readme.md
deleted file mode 100644
index 4ddf6952..00000000
--- a/edge-data-transfer-demo-v2/Readme.md
+++ /dev/null
@@ -1,180 +0,0 @@
-# Edge Data Transfer Demo
-
-The backend Docker image ships with Midnight Commander preinstalled, which makes it easier to edit
-configuration files inside the container.
-
-## Requirements
-
-- [Docker](https://www.docker.com/get-started) (version 20.10 or higher)
-- A `.env` file containing the necessary environment variables (see Configuration section)
-
-## Configuration
-
-Before running the project, create a `.env` file in the root directory (where the `docker-compose.yml` file is located) with the following content:
-
-```env
-BACALHAU_API_HOST=https://
-BACALHAU_API_KEY=
-BACALHAU_API_TLS_USETLS=true
-
-COMPUTE_AUTH_TOKEN=
-COMPUTE_ORCHESTRATOR=nats://:4222
-COMPUTE_AWS_REGION=us-west-1
-```
-
-> **Note:**
-> Use the values shown in cloud.expanso.io when creating a new network.
-> Create the network, then add a node; the portal displays the configuration variables to copy into `.env`.
----
-
-# Building and Running
-
-## 1. Start the application
-
-Build the images and start the containers:
-
-```bash
-docker-compose up --build -d
-```
-
-> **Note:** Recent Docker releases bundle Compose with Docker itself, so the command is invoked without a dash: `docker compose up --build -d`.
-
-
-Once the application is running, open your browser and navigate to:
-
-```
-http://localhost:3000
-```
-
----
-
-## 2. Backend Setup
-
-### Access the backend container:
-
-```bash
-docker exec -it edge-data-transfer-demo-backend-1 /bin/bash
-```
-
-### Configure AWS credentials:
-
-Inside the container, run:
-
-```bash
-aws configure
-```
-
-Provide your AWS Access Key, Secret Access Key, region, and output format.
-
-### Fetch the latest Ubuntu AMIs:
-
-> **Note:** This step is optional; the configuration can be left as is, as long as the previously fetched AMIs still work.
-
-```bash
-uv run -s util/get_ubuntu_amis.py
-```
-
-#### View the downloaded AMIs:
-
-```bash
-cat ubuntu_amis.csv
-```
-
-Choose the appropriate AMI(s) for your region and use case.
-
----
-
-## 3. Update Configuration
-
-To create an example configuration, use the /backend/generate_example_config.sh script.
-Run these commands in the backend container:
-```bash
-chmod +x generate_example_config.sh
-./generate_example_config.sh
-```
-This overwrites the existing config.yaml_example file with the environment variables you provided.
-
-You can then change the machine type or the number of nodes to instantiate.
-Once reviewed, the example file can be used as config.yaml:
-
-```bash
-cp config.yaml_example config.yaml
-```
-
----
-
-## 4. Deploy Spot Instances
-
-To create the Spot instances:
-
-```bash
-uv run -s ./deploy_spot.py create
-```
-This step deploys the EC2 instances and creates the EFS share used in the demo.
-
-> **Note:** If anything goes wrong, check the debug.log file in the /backend folder.
-
----
-
-## 5. Verify Instance Registration
-
-In the backend container, check that the new nodes have registered correctly with Bacalhau:
-
-```bash
-bacalhau node list
-```
-
-You should see your Spot instances listed as active nodes.
-
----
-
-## 6. Verify NFS Mount
-
-SSH into one of the Spot instances.
-
-The private SSH key for the machines is located at /root/.ssh/id_rsa inside the backend container.
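If you prefer to connect from your host machine rather than from inside the backend container, the key can be copied out first. This is a minimal sketch, assuming the container name from the `docker exec` example above and a hypothetical local filename:

```bash
# Copy the private key out of the backend container
docker cp edge-data-transfer-demo-backend-1:/root/.ssh/id_rsa ./edge-demo-key
chmod 600 ./edge-demo-key

# Connect as the default node user (replace <node-public-ip> with a node's address)
ssh -i ./edge-demo-key bacalhau-runner@<node-public-ip>
```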
-
-By default, the user on the nodes is 'bacalhau-runner'. The public IP address of each node is shown in its labels when listing the Bacalhau nodes.
-
-```bash
-ssh bacalhau-runner@
-```
-Then verify that the NFS volume is mounted:
-```bash
-df -h
-```
-
-You should see `/mnt/data` listed in the output.
-
----
-
-## 7. Generate Test Data
-
-Submit a job to generate random test files. The job specs are available in the /backend/job directory:
-```bash
-bacalhau job run generate.yaml
-```
-
-> **Warning:** This job can take up to 40 minutes. After about 5 minutes you will see a timeout while tracking the job execution, but the job itself keeps running on the network.
-
----
-
-## 8. Run Metadata Generation Job
-
-Submit the main processing job to generate metadata:
-
-```bash
-bacalhau job run process_metadata.yaml
-```
-
-## 9. Cleanup
-
-After the demo, you can destroy the EC2 instances by running the following in the backend container:
-
-```bash
-uv run -s ./deploy_spot.py destroy
-```
-
-You should also delete the data from the EFS share and then delete the EFS file system itself.
diff --git a/edge-data-transfer-demo-v2/config.yaml b/edge-data-transfer-demo-v2/config.yaml
deleted file mode 100644
index 77d2cacd..00000000
--- a/edge-data-transfer-demo-v2/config.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-max_instances: 5
-username: bacalhau-runner
-public_ssh_key_path: /root/.ssh/id_rsa.pub
-compute_orchestrators:
-  - nats://6hmm25qaoocs2b.us1.cloud.expanso.io:4222
-compute_auth_token: Q3NjqKjhxEKQsiJuNYZp5K8KekWgdhmQiKPGDKpmU2Fs6EHJ5vbJPTVaMG5LwCv6
-compute_tls: true
-regions:
-  - us-west-1:
-      image: auto
-      machine_type: m6gd.medium
-      node_count: auto
diff --git a/edge-data-transfer-demo-v2/docker-compose.yml b/edge-data-transfer-demo-v2/docker-compose.yml
deleted file mode 100644
index c34923bb..00000000
--- a/edge-data-transfer-demo-v2/docker-compose.yml
+++ /dev/null
@@ -1,7 +0,0 @@
-services:
-  backend:
-    build:
-      context: ./edge-data-spots
-      dockerfile: Dockerfile
-    env_file:
-      - .env
diff --git a/edge-data-transfer-demo-v2/edge-data-spots/.gitignore b/edge-data-transfer-demo-v2/edge-data-spots/.gitignore
deleted file mode 100644
index 16a84abb..00000000
--- a/edge-data-transfer-demo-v2/edge-data-spots/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-.env
-.idea
-.pem
-config.yaml
diff --git a/edge-data-transfer-demo-v2/edge-data-spots/CLAUDE.md b/edge-data-transfer-demo-v2/edge-data-spots/CLAUDE.md
deleted file mode 100644
index e5ca9e74..00000000
--- a/edge-data-transfer-demo-v2/edge-data-spots/CLAUDE.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# Bacalhau AWS Spot Cluster Setup Guide
-
-## Commands
-
-### Build/Run/Test
-- `uv run -s util/get_available_regions.py [--show-all]` - Find regions with suitable spot instances
-- `uv run -s util/get_ubuntu_amis.py` - Get Ubuntu AMIs for available regions
-- `uv run -s util/update_config_with_regions.py` - Update config with available regions
-- `uv run -s deploy_spot.py [create|list|destroy]` - Manage AWS spot instances
-
-### Alternative (pip)
-- `python util/get_available_regions.py`
-- `python util/get_ubuntu_amis.py`
-- `python util/update_config_with_regions.py`
-- `python deploy_spot.py [create|list|destroy]`
-
-## Code Style Guidelines
-- Use f-strings for string formatting
-- Use async/await for asynchronous operations
-- Use rich library for terminal UI components
-- Wrap AWS API calls with timeouts and error handling
-- Use comprehensive logging with appropriate levels
-- Follow PEP 8 naming conventions (snake_case for functions/variables)
-- Error handling with try/except blocks and detailed error messages
-- Organize code with proper separation of
concerns diff --git a/edge-data-transfer-demo-v2/edge-data-spots/Dockerfile b/edge-data-transfer-demo-v2/edge-data-spots/Dockerfile deleted file mode 100644 index 8d1f508a..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM ubuntu:22.04 -WORKDIR /backend - -RUN apt-get update -RUN apt-get install python3 python3-pip curl unzip mc vim nano openssh-client -y -RUN pip install uv -RUN curl -sL https://get.bacalhau.org/install.sh | bash - -RUN if [ "$(uname -m)" = "aarch64" ] || [ "$(uname -m)" = "arm64" ]; then \ - curl "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \ - else \ - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \ - fi -RUN unzip awscliv2.zip -RUN ./aws/install - -COPY . . - -RUN ssh-keygen -t rsa -b 4096 -N "" -f /root/.ssh/id_rsa - -CMD ["sleep", "infinity"] diff --git a/edge-data-transfer-demo-v2/edge-data-spots/README.md b/edge-data-transfer-demo-v2/edge-data-spots/README.md deleted file mode 100644 index 82e5a5ad..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/README.md +++ /dev/null @@ -1,185 +0,0 @@ -# AWS Spot Instance Region Finder - -This repository contains scripts to help set up a Bacalhau cluster on AWS spot instances by finding the most cost-effective regions and instance types. - -## Scripts - -### 1. Region Availability Checker (`util/get_available_regions.py`) - -This script checks all AWS regions to find those that have spot instances available that meet the minimum requirements for running Docker and one small Python container: -- At least 1 vCPU -- At least 2 GiB of memory - -The script: -1. Queries all AWS regions (not just a subset) -2. Checks each region for instance types that meet the minimum requirements -3. Prioritizes smaller, cost-effective instance types (t3, t3a, t4g, t2, a1, m6g, m5, m5a families) -4. Verifies spot instance availability and pricing for suitable instance types -5. Outputs the results to: - - `available_regions.json` - Comprehensive JSON file with detailed region and instance information - - `available_regions.py` - Python importable format (for backward compatibility) -6. Displays a summary of the top 5 cheapest regions by default (with an option to show all) - -#### Command-line Options - -``` -usage: get_available_regions.py [-h] [--show-all] [--max-workers MAX_WORKERS] - -Find AWS regions with suitable spot instances for Docker and containers - -options: - -h, --help show this help message and exit - --show-all Show all available regions, not just the top 5 - --max-workers MAX_WORKERS - Maximum number of parallel workers (default: 10) -``` - -### 2. Ubuntu AMI Finder (`util/get_ubuntu_amis.py`) - -This script finds the latest Ubuntu 22.04 LTS AMI IDs for each available region: -1. Reads the list of available regions from `available_regions.json` (created by the first script) -2. Queries AWS for the latest Ubuntu 22.04 LTS AMI in each region -3. Outputs the results to `ubuntu_amis.csv` with detailed instance information including: - - Region - - AMI ID - - Instance Type - - vCPUs - - Memory (GiB) - - Spot Price ($/hr) - -### 3. Config Updater (`util/update_config_with_regions.py`) - -This script updates your Bacalhau cluster configuration with the available regions: -1. Reads the list of available regions from `available_regions.json` -2. Loads your existing `config.yaml` file -3. Adds all new regions that aren't already in your configuration -4. 
Uses recommended instance types from the region details when available -5. Creates a backup of your original configuration -6. Saves the updated configuration with all available regions - -## Workflow - -The scripts are designed to work together in sequence: - -1. First, run `get_available_regions.py` to find regions with suitable spot instances -2. Then, run `get_ubuntu_amis.py` to get the latest Ubuntu AMIs for those regions -3. Finally, run `update_config_with_regions.py` to update your Bacalhau configuration - -This approach ensures you're only looking for AMIs in regions that have suitable spot instances available, and that your configuration includes all viable regions. - -## Usage - -### Prerequisites - -1. AWS CLI configured with appropriate credentials -2. Python 3.6+ with required packages - -You can run these scripts in two ways: - -#### Option 1: Using uv (recommended) - -The scripts include dependency metadata for use with [uv](https://github.com/astral-sh/uv), which will automatically install required dependencies: - -```bash -# Install uv if you don't have it -pip install uv - -# Run scripts directly with uv -uv run -s util/get_available_regions.py -uv run -s util/get_ubuntu_amis.py -uv run -s util/update_config_with_regions.py - -# To see all available regions, not just the top 5 -uv run -s util/get_available_regions.py --show-all -``` - -#### Option 2: Using pip - -```bash -# Install dependencies manually -pip install boto3 botocore argparse pyyaml - -# Run scripts -python util/get_available_regions.py -python util/get_ubuntu_amis.py -python util/update_config_with_regions.py - -# To see all available regions, not just the top 5 -python util/get_available_regions.py --show-all -``` - -### Step 1: Find Available Regions with Smallest Suitable Instances - -```bash -uv run -s util/get_available_regions.py -``` - -This will create: -- `available_regions.json` - Comprehensive JSON file with detailed region and instance information -- `available_regions.py` - Python importable format (for backward compatibility) -- A console output showing the top 5 cheapest regions and their smallest suitable instances - -Example output: -``` -Checking 28 AWS regions for spot availability... -Looking for instances with at least 1 vCPUs and 2 GiB RAM - -Found 18 regions with suitable spot instances out of 28 total regions -Available regions saved to: available_regions.json -Python module also saved to: available_regions.py - -Top 5 cheapest regions for running Docker with a small Python container: -(Use --show-all to see all 18 available regions) -1. us-east-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0078/hr -2. us-west-2 - t3a.small - 2 vCPUs, 2.0 GiB RAM, $0.0084/hr -3. eu-west-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0091/hr -4. ap-southeast-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0094/hr -5. 
eu-central-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0098/hr -``` - -### Step 2: Get Ubuntu AMIs for Available Regions - -```bash -uv run -s util/get_ubuntu_amis.py -``` - -This will create: -- `ubuntu_amis.csv` - CSV file with region, AMI ID, and instance details - -Example CSV content: -``` -Region,AMI ID,Instance Type,vCPUs,Memory (GiB),Spot Price ($/hr) -us-east-1,ami-0c7217cdde317cfec,t3.small,2,2.0,$0.0078 -us-west-2,ami-0efcece6bed30fd98,t3a.small,2,2.0,$0.0084 -eu-west-1,ami-0694d931cee176e7d,t3.small,2,2.0,$0.0091 -``` - -### Step 3: Update Your Bacalhau Configuration - -```bash -uv run -s util/update_config_with_regions.py -``` - -This will: -- Read your existing `config.yaml` file -- Add all new regions from `available_regions.json` -- Use recommended instance types for each region -- Create a backup of your original configuration at `config.yaml.bak` -- Save the updated configuration with all available regions - -Example output: -``` -Found 30 available regions in available_regions.json -Loaded configuration from config.yaml -Adding 27 new regions to config.yaml -Created backup of original config at config.yaml.bak -Updated config.yaml with 27 new regions -Total regions in config: 30 -``` - -## Notes - -- The region availability script may take several minutes to run as it checks all AWS regions -- If `available_regions.json` is not found, the Ubuntu AMI finder will fall back to a default list of regions -- AWS credentials with EC2 describe permissions are required to run these scripts -- Spot instance pricing is dynamic and may change over time, so it's recommended to run the script periodically to get the latest pricing information diff --git a/edge-data-transfer-demo-v2/edge-data-spots/available_regions.json b/edge-data-transfer-demo-v2/edge-data-spots/available_regions.json deleted file mode 100644 index f84c861e..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/available_regions.json +++ /dev/null @@ -1,1254 +0,0 @@ -{ - "available_regions": [ - "ap-northeast-3", - "eu-north-1", - "sa-east-1", - "eu-west-3", - "ap-southeast-1", - "us-west-1", - "ap-south-1", - "ap-northeast-2", - "ap-northeast-1", - "eu-central-1", - "ap-southeast-2", - "eu-west-2", - "us-east-2", - "ca-central-1", - "us-west-2", - "eu-west-1", - "us-east-1" - ], - "region_details": { - "ap-northeast-3": { - "region": "ap-northeast-3", - "available": true, - "instances": [ - { - "instance_type": "t3.medium", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.0198 - }, - { - "instance_type": "t4g.medium", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.0298 - }, - { - "instance_type": "t3.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0413 - }, - { - "instance_type": "t2.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0485 - }, - { - "instance_type": "m5d.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0557 - }, - { - "instance_type": "m6g.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.073 - }, - { - "instance_type": "t3.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0806 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0905 - }, - { - "instance_type": "m5.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0916 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.1057 - } - ], - "cheapest_instance": { - "instance_type": "t3.medium", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.0198 - } - }, - "eu-north-1": { - 
"region": "eu-north-1", - "available": true, - "instances": [ - { - "instance_type": "c5d.large", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.0243 - }, - { - "instance_type": "m7g.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0245 - }, - { - "instance_type": "c6i.large", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.0307 - }, - { - "instance_type": "c5n.large", - "vcpus": 2, - "memory_gib": 5.2, - "spot_price": 0.0309 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0509 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0527 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2149 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.289 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.338 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.4684 - } - ], - "cheapest_instance": { - "instance_type": "c5d.large", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.0243 - } - }, - "sa-east-1": { - "region": "sa-east-1", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0457 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0703 - }, - { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.1193 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1361 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1642 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2402 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4847 - }, - { - "instance_type": "m5zn.6xlarge", - "vcpus": 24, - "memory_gib": 96.0, - "spot_price": 0.5793 - }, - { - "instance_type": "m5ad.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.6749 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.8541 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0457 - } - }, - "eu-west-3": { - "region": "eu-west-3", - "available": true, - "instances": [ - { - "instance_type": "m5ad.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0514 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.084 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0944 - }, - { - "instance_type": "t3a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1312 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1353 - }, - { - "instance_type": "t3.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1597 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1684 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.254 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3445 - }, - { - "instance_type": "m5ad.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.5308 - } - ], - 
"cheapest_instance": { - "instance_type": "m5ad.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0514 - } - }, - "ap-southeast-1": { - "region": "ap-southeast-1", - "available": true, - "instances": [ - { - "instance_type": "m5dn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.057 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0764 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.106 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1279 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.3485 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.4423 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4517 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4811 - }, - { - "instance_type": "m5dn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.5408 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.9276 - } - ], - "cheapest_instance": { - "instance_type": "m5dn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.057 - } - }, - "us-west-1": { - "region": "us-west-1", - "available": true, - "instances": [ - { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.058 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0586 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0701 - }, - { - "instance_type": "t3a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1101 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1223 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1975 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.3331 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3454 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3799 - }, - { - "instance_type": "m5ad.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4594 - } - ], - "cheapest_instance": { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.058 - } - }, - "ap-south-1": { - "region": "ap-south-1", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0621 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0865 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.0955 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1338 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.1545 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2201 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.5622 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.5912 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - 
"memory_gib": 128.0, - "spot_price": 0.9426 - }, - { - "instance_type": "m5ad.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.0168 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0621 - } - }, - "ap-northeast-2": { - "region": "ap-northeast-2", - "available": true, - "instances": [ - { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0655 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0677 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0789 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1064 - }, - { - "instance_type": "t3a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1081 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1801 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.2644 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2695 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3167 - }, - { - "instance_type": "m5ad.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4854 - } - ], - "cheapest_instance": { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0655 - } - }, - "ap-northeast-1": { - "region": "ap-northeast-1", - "available": true, - "instances": [ - { - "instance_type": "m5dn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0709 - }, - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0901 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.1224 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1479 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1926 - }, - { - "instance_type": "m5dn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.2556 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3369 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.3438 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.5805 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.8606 - } - ], - "cheapest_instance": { - "instance_type": "m5dn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.0709 - } - }, - "eu-central-1": { - "region": "eu-central-1", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0716 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.1164 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1818 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.226 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3702 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.4034 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - 
"spot_price": 0.5208 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.6029 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.9943 - }, - { - "instance_type": "m5dn.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 2.1319 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0716 - } - }, - "ap-southeast-2": { - "region": "ap-southeast-2", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0741 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.1075 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1449 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.2158 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3673 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.415 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4162 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.6925 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.8595 - }, - { - "instance_type": "m5ad.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.5639 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0741 - } - }, - "eu-west-2": { - "region": "eu-west-2", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0746 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0882 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1399 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.2428 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2923 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.5546 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.5879 - }, - { - "instance_type": "m5ad.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.6965 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.7973 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.8046 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0746 - } - }, - "us-east-2": { - "region": "us-east-2", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0766 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0819 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.0937 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1703 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.2513 - }, - { - 
"instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3794 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.5876 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.6747 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.7906 - }, - { - "instance_type": "m5dn.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.0693 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0766 - } - }, - "ca-central-1": { - "region": "ca-central-1", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0781 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0856 - }, - { - "instance_type": "t3a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1293 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1599 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1845 - }, - { - "instance_type": "m5a.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2958 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2965 - }, - { - "instance_type": "m5ad.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.4745 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.6354 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.8273 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0781 - } - }, - "us-west-2": { - "region": "us-west-2", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0826 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0832 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1103 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1676 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.2346 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.2742 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.6193 - }, - { - "instance_type": "m5a.16xlarge", - "vcpus": 64, - "memory_gib": 256.0, - "spot_price": 1.0728 - }, - { - "instance_type": "m5ad.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.1702 - }, - { - "instance_type": "m5n.16xlarge", - "vcpus": 64, - "memory_gib": 256.0, - "spot_price": 1.3766 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0826 - } - }, - "eu-west-1": { - "region": "eu-west-1", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.091 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.1126 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1821 - }, - { - "instance_type": 
"t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.187 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.3205 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3457 - }, - { - "instance_type": "m5.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.8306 - }, - { - "instance_type": "m5d.8xlarge", - "vcpus": 32, - "memory_gib": 128.0, - "spot_price": 0.9543 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.134 - }, - { - "instance_type": "m5ad.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.1971 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.091 - } - }, - "us-east-1": { - "region": "us-east-1", - "available": true, - "instances": [ - { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0928 - }, - { - "instance_type": "t2.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.0945 - }, - { - "instance_type": "m5d.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.1165 - }, - { - "instance_type": "m5a.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.1572 - }, - { - "instance_type": "m6gd.4xlarge", - "vcpus": 16, - "memory_gib": 64.0, - "spot_price": 0.3785 - }, - { - "instance_type": "m5zn.2xlarge", - "vcpus": 8, - "memory_gib": 32.0, - "spot_price": 0.4328 - }, - { - "instance_type": "m6gd.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 0.7786 - }, - { - "instance_type": "m5a.16xlarge", - "vcpus": 64, - "memory_gib": 256.0, - "spot_price": 1.0773 - }, - { - "instance_type": "m5ad.12xlarge", - "vcpus": 48, - "memory_gib": 192.0, - "spot_price": 1.3283 - }, - { - "instance_type": "m5n.16xlarge", - "vcpus": 64, - "memory_gib": 256.0, - "spot_price": 1.5341 - } - ], - "cheapest_instance": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.0928 - } - } - }, - "all_regions_checked": 17, - "available_regions_count": 17, - "min_requirements": { - "vcpu": 2, - "memory_gib": 2 - }, - "timestamp": "2025-03-25 11:29:42 UTC" -} diff --git a/edge-data-transfer-demo-v2/edge-data-spots/available_regions.py b/edge-data-transfer-demo-v2/edge-data-spots/available_regions.py deleted file mode 100644 index 3e3244eb..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/available_regions.py +++ /dev/null @@ -1,128 +0,0 @@ -# AWS regions with spot instances suitable for Docker and containers -# This file is auto-generated by get_available_regions.py - -AVAILABLE_REGIONS = [ - "ap-northeast-1", - "ap-northeast-2", - "ap-northeast-3", - "ap-south-1", - "ap-southeast-1", - "ap-southeast-2", - "ca-central-1", - "eu-central-1", - "eu-north-1", - "eu-west-1", - "eu-west-2", - "eu-west-3", - "sa-east-1", - "us-east-1", - "us-east-2", - "us-west-1", - "us-west-2", -] - -# Detailed information about each region's smallest suitable instance -REGION_DETAILS = { - "ap-northeast-3": { - "instance_type": "t3.medium", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.019800, - }, - "eu-north-1": { - "instance_type": "c5d.large", - "vcpus": 2, - "memory_gib": 4.0, - "spot_price": 0.024300, - }, - "sa-east-1": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.045700, - }, - "eu-west-3": { - "instance_type": "m5ad.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.051400, - }, - 
"ap-southeast-1": { - "instance_type": "m5dn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.057000, - }, - "us-west-1": { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.058000, - }, - "ap-south-1": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.062100, - }, - "ap-northeast-2": { - "instance_type": "m5zn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.065500, - }, - "ap-northeast-1": { - "instance_type": "m5dn.large", - "vcpus": 2, - "memory_gib": 8.0, - "spot_price": 0.070900, - }, - "eu-central-1": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.071600, - }, - "ap-southeast-2": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.074100, - }, - "eu-west-2": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.074600, - }, - "us-east-2": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.076600, - }, - "ca-central-1": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.078100, - }, - "us-west-2": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.082600, - }, - "eu-west-1": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.091000, - }, - "us-east-1": { - "instance_type": "m6gd.xlarge", - "vcpus": 4, - "memory_gib": 16.0, - "spot_price": 0.092800, - }, -} diff --git a/edge-data-transfer-demo-v2/edge-data-spots/config.yaml_example b/edge-data-transfer-demo-v2/edge-data-spots/config.yaml_example deleted file mode 100644 index 84ddf566..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/config.yaml_example +++ /dev/null @@ -1,21 +0,0 @@ -max_instances: 3 -username: bacalhau-runner -public_ssh_key_path: ~/.ssh/id_rsa.pub -private_ssh_key_path: ~/.ssh/id_rsa -compute_orchestrators: - - nats://:4222 -compute_auth_token: -compute_tls: true -regions: - - eu-central-1: - image: auto - machine_type: t2.micro - node_count: auto - - eu-west-1: - image: auto - machine_type: t2.micro - node_count: auto - - eu-west-2: - image: auto - machine_type: t2.micro - node_count: auto diff --git a/edge-data-transfer-demo-v2/edge-data-spots/deploy_spot.py b/edge-data-transfer-demo-v2/edge-data-spots/deploy_spot.py deleted file mode 100755 index 972e959c..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/deploy_spot.py +++ /dev/null @@ -1,3060 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "boto3", -# "botocore", -# "pyyaml", -# "rich", -# ] -# /// - -import argparse -import asyncio -import base64 -import hashlib -import json -import logging -import os -import subprocess -import sys -import time -from concurrent.futures import TimeoutError -from datetime import datetime, timezone - -import boto3 -import botocore -from rich.console import Console -from rich.layout import Layout -from rich.live import Live -from rich.panel import Panel -from rich.progress import ( - BarColumn, - Progress, - SpinnerColumn, - TaskProgressColumn, - TextColumn, - TimeElapsedColumn, -) -from rich.table import Table, box - -from util.config import Config -from util.scripts_provider import ScriptsProvider - -# Set up logging with a unified approach - everything will go to the console panel -# and be written to the debug.log file as a backup - -# Set up logging with a unified stream approach 
-# All logs will go to both debug.log and the Rich console panel - -# Formatter for logs - concise but informative -log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - -# Set up main logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) # Default level, will be updated based on args -# Important: Prevent propagation to root logger to avoid stderr output -logger.propagate = False - -# The file handler will be shared with the Rich console handler -file_handler = None - -# Tag to filter instances by -FILTER_TAG_NAME = "ManagedBy" -FILTER_TAG_VALUE = "SpotInstanceScript" - -# Initialize console with auto-detection of width -console = Console() - -config = Config("config.yaml") -scripts_provider = ScriptsProvider(config) - -AWS_REGIONS = config.get_regions() -TOTAL_INSTANCES = config.get_total_instances() -global_node_count = 0 -INSTANCES_PER_REGION = ( - TOTAL_INSTANCES // len(AWS_REGIONS) -) or TOTAL_INSTANCES # Evenly distribute instances if set to 'auto' in config - -MAX_NODES = ( - config.get_total_instances() -) # Global limit for total nodes across all regions -current_dir = os.path.dirname(__file__) - -SCRIPT_DIR = "instance/scripts" - -# Status tracking -all_statuses = {} # Dictionary to track all instance statuses -status_lock = asyncio.Lock() # Lock for thread-safe updates to all_statuses - -# Event for signaling the table update task to stop -table_update_event = asyncio.Event() - -# Task tracking -task_name = "TASK NAME" -task_total = 10000 -events_to_progress = [] - -# AWS API timeouts -AWS_API_TIMEOUT = 30 # seconds - - -async def update_status(status): - """Thread-safe update of instance status""" - async with status_lock: - all_statuses[status.id] = status - # Add to events queue for progress tracking - events_to_progress.append(status) - - -class InstanceStatus: - def __init__(self, region, zone, index=0, instance_id=None): - input_string = f"{region}-{zone}-{index}" - hashed_string = hashlib.sha256(input_string.encode()).hexdigest() - - self.id = hashed_string[:6] - self.region = region - self.zone = zone - self.status = "Initializing" - self.detailed_status = "Initializing" - self.start_time = time.time() - self.elapsed_time = 0 - self.instance_id = instance_id - self.public_ip = None - self.private_ip = None - self.vpc_id = None - self.spot_request_id = None # Track the spot request ID for monitoring - self.fulfilled = False # Track if the spot request was fulfilled - - if self.instance_id is not None: - self.id = self.instance_id - - def update_elapsed_time(self): - self.elapsed_time = time.time() - self.start_time - return self.elapsed_time - - def combined_status(self): - if self.detailed_status and self.detailed_status != self.status: - combined = f"{self.detailed_status}" - if len(combined) > 30: - return combined[:27] + "..." 
- return combined - return self.status - - -def format_elapsed_time(seconds): - """Format elapsed time in a human-readable format""" - if seconds < 60: - return f"{seconds:.1f}s" - elif seconds < 3600: - minutes = seconds / 60 - return f"{minutes:.1f}m" - else: - hours = seconds / 3600 - return f"{hours:.1f}h" - - -def make_progress_table(): - """Create a table showing instance status with adaptive column widths""" - # Get terminal width - width = console.width - - # Calculate column widths based on available space - id_width = 6 - region_width = min(15, max(10, int(width * 0.10))) - zone_width = min(15, max(10, int(width * 0.10))) - status_width = min(30, max(20, int(width * 0.20))) # Wider status column - elapsed_width = 8 - instance_id_width = min(20, max(10, int(width * 0.12))) - ip_width = min(15, max(10, int(width * 0.08))) - - # Create table with adaptive column widths - table = Table(show_header=True, header_style="bold magenta", expand=False) - - # Add columns with appropriate widths - table.add_column("ID", width=id_width, style="cyan", no_wrap=True) - table.add_column("Region", width=region_width, style="cyan", no_wrap=True) - table.add_column("Zone", width=zone_width, style="cyan", no_wrap=True) - table.add_column("Status", width=status_width, style="yellow", no_wrap=True) - table.add_column( - "Time", width=elapsed_width, justify="right", style="magenta", no_wrap=True - ) - table.add_column("Instance ID", width=instance_id_width, style="blue", no_wrap=True) - table.add_column("Public IP", width=ip_width, style="green", no_wrap=True) - table.add_column("Private IP", width=ip_width, style="blue", no_wrap=True) - - # Update elapsed time for all statuses - for status in all_statuses.values(): - status.update_elapsed_time() - - # Sort statuses for consistent display - sorted_statuses = sorted(all_statuses.values(), key=lambda x: (x.region, x.zone)) - - # Add rows to the table - for status in sorted_statuses: - table.add_row( - status.id, - status.region, - status.zone, - status.combined_status(), - format_elapsed_time(status.elapsed_time), - status.instance_id or "", - status.public_ip or "", - status.private_ip or "", - ) - - return table - - -def create_layout(progress, table): - """Create a responsive layout that adapts to terminal size""" - layout = Layout() - - # Calculate panel heights based on terminal height - height = console.height - progress_height = min(4, max(3, int(height * 0.1))) # 10% for progress - console_height = min(6, max(4, int(height * 0.2))) # 20% for console - - # Create progress panel - progress_panel = Panel( - progress, - title="Progress", - border_style="green", - padding=(1, 1), - ) - - # Create console panel for log messages - console_panel = Panel( - "", # Start with empty content - title="Console Output", - border_style="blue", - padding=(0, 1), - ) - - # Split layout with responsive sizing - layout.split( - Layout(progress_panel, size=progress_height), - Layout(table), # This will take the remaining space (about 70%) - Layout(console_panel, size=console_height), - ) - - return layout - - -# Configure console handler to use rich console -class RichConsoleHandler(logging.Handler): - """Unified console handler that shows log messages from debug.log in the Rich UI. - - This handler streams the debug.log content to the console panel in the Rich UI. - It also forwards log records to the file handler, creating a single logging path. 
- """ - def __init__(self, live, layout, file_handler=None): - super().__init__() - self.live = live - self.layout = layout # Store the layout directly - self.messages = ["Logs will appear here..."] # Start with a simple message - - # Use the same formatter as the file handler for consistency - self.setFormatter(log_formatter) - - # Keep reference to file handler for forwarding - self.file_handler = file_handler - - # Set the level to match the file handler if provided - if file_handler: - self.setLevel(file_handler.level) - else: - self.setLevel(logging.INFO) - - # Initialize the console panel content right away - console_panel = self.layout.children[-1].renderable - console_panel.renderable = "\n".join(self.messages) - - # Read any existing content from debug.log to show history - self._load_existing_logs() - - def _load_existing_logs(self): - """Load the last few lines from debug.log to provide context""" - try: - if os.path.exists("debug.log"): - with open("debug.log", "r") as f: - # Get the last 10 lines from the file - lines = f.readlines()[-10:] - if lines: - # Replace our waiting message with actual log content - self.messages = [line.strip() for line in lines] - - # Update the console panel right away - console_panel = self.layout.children[-1].renderable - console_panel.renderable = "\n".join(self.messages) - except Exception: - # If we can't read the log file, just continue with the default message - pass - - def emit(self, record): - """Process log records and update the console panel""" - try: - # Format the message using our formatter - msg = self.format(record) - - # If we still have the default message, clear it first - if len(self.messages) == 1 and self.messages[0] == "Logs will appear here...": - self.messages = [] - - # Add the new message - self.messages.append(msg) - - # Keep only the last 20 messages (increased from 10 for more context) - if len(self.messages) > 20: - self.messages = self.messages[-20:] - - # Update the console panel content - console_panel = self.layout.children[-1].renderable - console_panel.renderable = "\n".join(self.messages) - - # Forward to file handler if we have one and it's not already handling this record - if self.file_handler and record.levelno >= self.file_handler.level: - self.file_handler.emit(record) - - except Exception: - self.handleError(record) - - -async def update_display(live): - """Update the live display with current status information""" - logger.debug("Entering update_display function") - try: - logger.debug("Creating progress bar") - progress = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("[progress.completed]{task.completed} of {task.total}"), - expand=True, - ) - - logger.debug(f"Adding task: {task_name} with total: {task_total}") - task = progress.add_task(task_name, total=task_total) - - # Create initial layout - logger.debug("Creating table") - table = make_progress_table() - logger.debug("Creating layout") - layout = create_layout(progress, table) - - # For display updates we don't need to create a new handler - # Just update the existing one with the new layout - rich_handler = None - for h in logger.handlers: - if isinstance(h, RichConsoleHandler): - rich_handler = h - break - - if rich_handler is None: - logger.debug("No existing RichConsoleHandler found - display updates may not work") - else: - # Update the existing handler with the new layout - logger.debug("Updating existing RichConsoleHandler 
layout") - rich_handler.layout = layout - - logger.debug("Starting update loop") - while not table_update_event.is_set(): - logger.debug("Processing status updates") - async with status_lock: - events_to_progress.clear() - progress.update(task, completed=len(all_statuses), refresh=True) - - logger.debug("Creating table and layout") - table = make_progress_table() - layout = create_layout(progress, table) - - # Find and update the RichConsoleHandler with the new layout - for h in logger.handlers: - if isinstance(h, RichConsoleHandler): - h.layout = layout - break - - logger.debug("Updating live display") - live.update(layout) - - # Slightly longer sleep to reduce log volume - await asyncio.sleep(0.5) - - except Exception as e: - logger.error(f"Error updating display: {str(e)}", exc_info=True) - # Don't re-raise the exception to keep the display running - - -def get_ec2_client(region): - """Get EC2 client with proper configuration for the specified region""" - logger.debug(f"Creating EC2 client for region {region}") - try: - # Create a boto3 client with explicit timeout configuration - logger.debug(f"Configuring boto3 client with timeout={AWS_API_TIMEOUT}") - config = botocore.config.Config( - connect_timeout=AWS_API_TIMEOUT, - read_timeout=AWS_API_TIMEOUT, - retries={"max_attempts": 3, "mode": "standard"}, - ) - logger.debug("Creating boto3 client") - client = boto3.client("ec2", region_name=region, config=config) - logger.debug("Successfully created EC2 client") - return client - except Exception as e: - logger.error( - f"Error creating EC2 client for region {region}: {str(e)}", exc_info=True - ) - raise - - -async def safe_aws_call(func, *args, **kwargs): - """Execute AWS API calls with proper timeout handling""" - try: - # Set a timeout for the AWS API call - return await asyncio.wait_for( - asyncio.to_thread(func, *args, **kwargs), timeout=AWS_API_TIMEOUT - ) - except asyncio.TimeoutError: - error_msg = ( - f"AWS API call timed out after {AWS_API_TIMEOUT} seconds: {func.__name__}" - ) - logging.error(error_msg) - if "describe_instances" in func.__name__: - logging.error( - "This may be due to SSO credential issues. Please check your AWS credentials." - ) - logging.error("Try running 'aws sso login' to refresh your credentials.") - raise TimeoutError(error_msg) - except botocore.exceptions.ClientError as e: - if "ExpiredToken" in str(e) or "InvalidToken" in str(e): - logging.error( - "AWS credentials have expired. Please refresh your credentials." 
- ) - logging.error("Try running 'aws sso login' to refresh your credentials.") - raise - except Exception as e: - logging.error(f"Error in AWS API call {func.__name__}: {str(e)}") - raise - - -async def get_availability_zones(ec2): - response = await safe_aws_call( - ec2.describe_availability_zones, - Filters=[{"Name": "opt-in-status", "Values": ["opt-in-not-required"]}], - ) - return [zone["ZoneName"] for zone in response["AvailabilityZones"]][ - :1 - ] # Get 1 AZ per region - -def get_efs_client(region): - return boto3.client("efs", region_name=region) - - -async def create_spot_instances_in_region(config: Config, instances_to_create, region): - global all_statuses, events_to_progress - - ec2 = get_ec2_client(region) - region_cfg = config.get_region_config(region) - efs_client = get_efs_client(region) - - try: - - - vpc_id = await create_vpc_if_not_exists(ec2) - igw_id = await create_internet_gateway(ec2, vpc_id) - route_table_id = await create_route_table(ec2, vpc_id, igw_id) - security_group_id = await create_security_group_if_not_exists(ec2, vpc_id) - efs_mount_ip = await create_efs(ec2, efs_client, region, vpc_id) - - user_data = scripts_provider.create_cloud_init_script(efs_mount_ip=efs_mount_ip) - if not user_data: - logging.error("User data is empty. Stopping creation.") - return [], {} - - encoded_user_data = base64.b64encode(user_data.encode()).decode() - - instance_ids = [] - zones = await get_availability_zones(ec2) - for i in range(instances_to_create): - zone = zones[i % len(zones)] # Distribute instances across available zones - - subnet_id = await create_subnet(ec2, vpc_id, zone, f"10.0.{i}.0/24") - try: - await associate_route_table(ec2, route_table_id, subnet_id) - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "Resource.AlreadyAssociated": - logging.info( - f"Route table already associated in {region}-{zone}: {str(e)}" - ) - else: - logging.warning( - f"Error associating route table in {region}-{zone}: {str(e)}" - ) - - thisInstanceStatusObject = InstanceStatus(region, zone, i) - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - start_time = time.time() - launch_specification = { - "ImageId": config.get_image_for_region(region), - "InstanceType": region_cfg.get("machine_type", "t2.medium"), - "UserData": encoded_user_data, - "BlockDeviceMappings": [ - { - "DeviceName": "/dev/sda1", - "Ebs": {"DeleteOnTermination": True}, - } - ], - "NetworkInterfaces": [ - { - "DeviceIndex": 0, - "AssociatePublicIpAddress": True, - "DeleteOnTermination": True, - "SubnetId": subnet_id, - "Groups": [security_group_id], - } - ], - } - - thisInstanceStatusObject.status = "Requesting" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - logging.debug(f"Requesting spot instance in {region}-{zone}") - response = await asyncio.to_thread( - ec2.request_spot_instances, - InstanceCount=1, # Create a single instance per request - Type="one-time", - InstanceInterruptionBehavior="terminate", - LaunchSpecification=launch_specification, - TagSpecifications=[ - { - "ResourceType": "spot-instances-request", - "Tags": [ - {"Key": "Name", "Value": f"SpotInstance-{region}-{zone}"}, - {"Key": FILTER_TAG_NAME, "Value": FILTER_TAG_VALUE}, - ], - }, - ], - ) - - spot_request_ids = [ - request["SpotInstanceRequestId"] - for request in response["SpotInstanceRequests"] - ] - logging.debug(f"Spot request IDs: {spot_request_ids}") 
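            # request_spot_instances only files the request; fulfillment happens
            # asynchronously on the AWS side. The waiter/polling logic below watches
            # the request status until an InstanceId appears, treats
            # "price-too-low" and "capacity-not-available" as terminal failures,
            # and gives up after max_wait_time (10 minutes).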
- - # Store the spot request ID in the status object for tracking - if spot_request_ids: - thisInstanceStatusObject.spot_request_id = spot_request_ids[0] - - thisInstanceStatusObject.status = "Waiting for fulfillment" - - # Wait for spot instances to be fulfilled - waiter = ec2.get_waiter("spot_instance_request_fulfilled") - max_wait_time = 600 # 10 minutes timeout - start_wait_time = time.time() - - # Update instance status - thisInstanceStatusObject.status = "Waiting for fulfillment" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - # Setup polling for spot request status with timeout - async def poll_spot_request_status(): - timeout_reached = False - while not timeout_reached: - # Check if timeout reached - if time.time() - start_wait_time > max_wait_time: - logging.error(f"Timeout waiting for spot instance in {region}-{zone}") - return None - - # Check spot request status - try: - describe_response = await asyncio.to_thread( - ec2.describe_spot_instance_requests, - SpotInstanceRequestIds=spot_request_ids, - ) - - for request in describe_response["SpotInstanceRequests"]: - status_code = request["Status"]["Code"] - status_message = request["Status"].get("Message", "No message") - - # Update status object with details - thisInstanceStatusObject.detailed_status = f"{status_code}: {status_message}" - thisInstanceStatusObject.elapsed_time = time.time() - start_time - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - logging.debug(f"Status in {region}-{zone}: {status_code} - {status_message}") - - # Check for failures - if status_code in ["price-too-low", "capacity-not-available"]: - logging.error(f"Spot request failed: {status_code} - {status_message}") - return None - - # Check for success - instance ID is present - if "InstanceId" in request: - return describe_response - - except Exception as e: - logging.error(f"Error checking spot request status: {str(e)}") - - # Sleep before next poll - await asyncio.sleep(5) - - return None - - # Try to use waiter first (faster) with timeout protection - waiter_task = asyncio.create_task( - asyncio.wait_for( - asyncio.to_thread( - waiter.wait, - SpotInstanceRequestIds=spot_request_ids, - WaiterConfig={"MaxAttempts": 40, "Delay": 15}, # 40 attempts * 15 sec = 10 min max - ), - timeout=max_wait_time - ) - ) - - # Start the polling task as a backup - polling_task = asyncio.create_task(poll_spot_request_status()) - - # Wait for either task to complete - done, pending = await asyncio.wait( - [waiter_task, polling_task], - return_when=asyncio.FIRST_COMPLETED - ) - - # Cancel the pending task - for task in pending: - task.cancel() - - # Get results - describe_response = None - waiter_succeeded = False - - for task in done: - try: - if task == waiter_task: - await task # Just to get any exceptions - waiter_succeeded = True - logging.debug(f"Waiter succeeded for {region}-{zone}") - elif task == polling_task: - describe_response = await task - - except (asyncio.TimeoutError, asyncio.CancelledError): - pass - except Exception as e: - logging.error(f"Error in spot instance fulfillment: {str(e)}") - - # If waiter succeeded but we don't have response, get it now - if waiter_succeeded and not describe_response: - try: - describe_response = await asyncio.to_thread( - ec2.describe_spot_instance_requests, - SpotInstanceRequestIds=spot_request_ids, - ) - except Exception as e: - logging.error(f"Error getting spot 
request details: {str(e)}") - describe_response = None - - # Check if we got a valid response - if describe_response is None: - thisInstanceStatusObject.status = "Failed to request spot instance" - thisInstanceStatusObject.detailed_status = "Timeout or API error" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - continue # Skip to next instance - - # Get instance IDs - zone_instance_ids = [ - request["InstanceId"] - for request in describe_response.get("SpotInstanceRequests", []) - if "InstanceId" in request - ] - - if not zone_instance_ids: - thisInstanceStatusObject.status = "Failed to request spot instance" - thisInstanceStatusObject.detailed_status = "No instance ID returned" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - continue # Skip to next instance - - # Add to our overall list of instance IDs - instance_ids.extend(zone_instance_ids) - - # Process the first instance ID (we request only one per spot request) - thisInstanceStatusObject.instance_id = zone_instance_ids[0] - thisInstanceStatusObject.status = "Tagging" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - try: - # Run tagging and instance details fetching in parallel - tagging_task = asyncio.create_task( - asyncio.to_thread( - ec2.create_tags, - Resources=zone_instance_ids, - Tags=[ - {"Key": FILTER_TAG_NAME, "Value": FILTER_TAG_VALUE}, - {"Key": "Name", "Value": f"SpotInstance-{region}-{zone}"}, - {"Key": "AZ", "Value": zone}, - ], - ) - ) - - fetching_task = asyncio.create_task( - asyncio.to_thread( - ec2.describe_instances, - InstanceIds=[thisInstanceStatusObject.instance_id], - ) - ) - - # Wait for both tasks to complete with timeout - done, pending = await asyncio.wait( - [tagging_task, fetching_task], - timeout=30 - ) - - # Cancel any pending tasks that didn't complete - for task in pending: - task.cancel() - - # Process the results - instance_details = None - tagging_completed = False - - for task in done: - try: - if task == tagging_task: - await task - tagging_completed = True - elif task == fetching_task: - instance_details = await task - except Exception as e: - logging.error(f"Error in instance initialization: {str(e)}") - - # Extract IP addresses if we got instance details - if instance_details and instance_details.get("Reservations"): - instance = instance_details["Reservations"][0]["Instances"][0] - thisInstanceStatusObject.public_ip = instance.get("PublicIpAddress", "") - thisInstanceStatusObject.private_ip = instance.get("PrivateIpAddress", "") - - # Update final status - if tagging_completed: - thisInstanceStatusObject.status = "Done" - else: - thisInstanceStatusObject.status = "Tagged with warnings" - thisInstanceStatusObject.detailed_status = "Tagging may not have completed" - - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - except Exception as e: - logging.error(f"Error processing instance {thisInstanceStatusObject.instance_id}: {str(e)}") - thisInstanceStatusObject.status = "Error processing instance" - thisInstanceStatusObject.detailed_status = str(e)[:30] - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - except Exception as e: - logging.error(f"An error occurred in {region}: {str(e)}", exc_info=True) - return 
[], {} - - return instance_ids - - -async def create_efs(ec2, efs_client, region, vpc_id): - """Creates an EFS if it does not exist, ensures the mount target is available, and returns its IP.""" - try: - # Check existing EFS - response = await asyncio.to_thread(efs_client.describe_file_systems) - existing_efs = [fs for fs in response.get("FileSystems", []) if fs.get("LifeCycleState") == "available"] - - if existing_efs: - file_system_id = existing_efs[0]["FileSystemId"] - logging.info(f"Found existing EFS: {file_system_id} in {region}") - else: - response = await asyncio.to_thread(efs_client.create_file_system, PerformanceMode='generalPurpose') - file_system_id = response['FileSystemId'] - logging.info(f"Created EFS: {file_system_id} in {region}") - - # Wait for EFS to become available - for _ in range(100): - fs_desc = await asyncio.to_thread( - efs_client.describe_file_systems, FileSystemId=file_system_id - ) - fs_state = fs_desc['FileSystems'][0]['LifeCycleState'] - if fs_state == 'available': - logging.info(f"EFS {file_system_id} is available.") - break - await asyncio.sleep(6) - else: - logging.error(f"EFS {file_system_id} not available after timeout.") - return None - - # Check for existing mount target - targets_resp = await asyncio.to_thread(efs_client.describe_mount_targets, FileSystemId=file_system_id) - if targets_resp['MountTargets']: - mt = targets_resp['MountTargets'][0] - logging.info(f"Found existing mount target for EFS {file_system_id} - State: {mt['LifeCycleState']}") - else: - # Create mount target - subnets = await asyncio.to_thread(ec2.describe_subnets, Filters=[{'Name': 'vpc-id', 'Values': [vpc_id]}]) - subnet_id = subnets['Subnets'][0]['SubnetId'] - security_group_id = await create_security_group_if_not_exists(ec2, vpc_id) - - try: - await asyncio.to_thread( - efs_client.create_mount_target, - FileSystemId=file_system_id, - SubnetId=subnet_id, - SecurityGroups=[security_group_id] - ) - logging.info(f"Mount target creation started for EFS {file_system_id}") - except efs_client.exceptions.MountTargetConflict: - logging.warning(f"Mount target conflict for EFS {file_system_id}, assuming it already exists.") - await asyncio.sleep(10) - - # Re-fetch mount targets - targets_resp = await asyncio.to_thread(efs_client.describe_mount_targets, FileSystemId=file_system_id) - if not targets_resp['MountTargets']: - logging.error(f"No mount targets found for EFS {file_system_id} after creation attempt.") - return None - mt = targets_resp['MountTargets'][0] - - # Wait for mount target to be available - for _ in range(100): - mt_desc = await asyncio.to_thread( - efs_client.describe_mount_targets, FileSystemId=file_system_id - ) - mt = mt_desc['MountTargets'][0] - if mt['LifeCycleState'] == 'available': - ip = mt['IpAddress'] - logging.info(f"Mount target for EFS {file_system_id} is available. IP: {ip}") - return ip - logging.info(f"Waiting for mount target to be available... 
State: {mt['LifeCycleState']}") - await asyncio.sleep(6) - else: - logging.error(f"Mount target for EFS {file_system_id} not available after timeout.") - return None - - except Exception as e: - logging.error(f"Error creating EFS in {region}: {str(e)}") - return None - -async def create_vpc_if_not_exists(ec2): - vpcs = await asyncio.to_thread( - ec2.describe_vpcs, Filters=[{"Name": "tag:Name", "Values": ["SpotInstanceVPC"]}] - ) - if vpcs["Vpcs"]: - return vpcs["Vpcs"][0]["VpcId"] - else: - vpc = await asyncio.to_thread(ec2.create_vpc, CidrBlock="10.0.0.0/16") - vpc_id = vpc["Vpc"]["VpcId"] - await asyncio.to_thread( - ec2.create_tags, - Resources=[vpc_id], - Tags=[{"Key": "Name", "Value": "SpotInstanceVPC"}], - ) - await asyncio.to_thread( - ec2.modify_vpc_attribute, VpcId=vpc_id, EnableDnsHostnames={"Value": True} - ) - await asyncio.to_thread( - ec2.modify_vpc_attribute, VpcId=vpc_id, EnableDnsSupport={"Value": True} - ) - return vpc_id - - -async def create_subnet(ec2, vpc_id, zone, cidr_block=None): - # First, check if a subnet already exists in this zone - existing_subnets = await asyncio.to_thread( - ec2.describe_subnets, - Filters=[ - {"Name": "vpc-id", "Values": [vpc_id]}, - {"Name": "availability-zone", "Values": [zone]}, - ], - ) - - if existing_subnets["Subnets"]: - # If a subnet exists, return its ID - return existing_subnets["Subnets"][0]["SubnetId"] - - # If no subnet exists, try to create one - cidr_base_prefix = "10.0." - cidr_base_suffix = ".0/24" - for i in range(256): - try: - cidrBlock = ( - cidr_block - if cidr_block - else cidr_base_prefix + str(i) + cidr_base_suffix - ) - logging.debug(f"Creating subnet in {zone} with CIDR block {cidrBlock}") - subnet = await asyncio.to_thread( - ec2.create_subnet, - VpcId=vpc_id, - CidrBlock=cidrBlock, - AvailabilityZone=zone, - ) - return subnet["Subnet"]["SubnetId"] - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "InvalidSubnet.Conflict": - # If this CIDR is in use, try the next one - continue - else: - # If it's a different error, raise it - raise - - # If we've tried all possible CIDRs and none worked, raise an error - raise Exception(f"Unable to create subnet in {zone}. 
All CIDR blocks are in use.") - - -async def create_internet_gateway(ec2, vpc_id): - # First, check if the VPC already has an Internet Gateway attached - igws = await asyncio.to_thread( - ec2.describe_internet_gateways, - Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}], - ) - - if igws["InternetGateways"]: - # If an Internet Gateway is already attached, return its ID - return igws["InternetGateways"][0]["InternetGatewayId"] - - # If no Internet Gateway is attached, create and attach a new one - igw = await asyncio.to_thread(ec2.create_internet_gateway) - igw_id = igw["InternetGateway"]["InternetGatewayId"] - - try: - await asyncio.to_thread( - ec2.attach_internet_gateway, InternetGatewayId=igw_id, VpcId=vpc_id - ) - except botocore.exceptions.ClientError: - # If an error occurs during attachment, delete the created IGW - await asyncio.to_thread(ec2.delete_internet_gateway, InternetGatewayId=igw_id) - # Re-check for existing IGW in case one was attached concurrently - igws = await asyncio.to_thread( - ec2.describe_internet_gateways, - Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}], - ) - if igws["InternetGateways"]: - return igws["InternetGateways"][0]["InternetGatewayId"] - else: - # If still no IGW found, re-raise the original error - raise - - return igw_id - - -async def create_route_table(ec2, vpc_id, igw_id): - # Check if a route table already exists for the VPC - route_tables = await asyncio.to_thread( - ec2.describe_route_tables, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - for rt in route_tables["RouteTables"]: - for association in rt.get("Associations", []): - if association.get("Main", False): - # Found the main route table, add a route to the IGW if it doesn't exist - route_table_id = rt["RouteTableId"] - routes = rt.get("Routes", []) - if not any(route.get("GatewayId") == igw_id for route in routes): - await asyncio.to_thread( - ec2.create_route, - RouteTableId=route_table_id, - DestinationCidrBlock="0.0.0.0/0", - GatewayId=igw_id, - ) - return route_table_id - - # If no route table exists, create a new one - route_table = await asyncio.to_thread(ec2.create_route_table, VpcId=vpc_id) - route_table_id = route_table["RouteTable"]["RouteTableId"] - - # Create a route to the Internet Gateway - await asyncio.to_thread( - ec2.create_route, - RouteTableId=route_table_id, - DestinationCidrBlock="0.0.0.0/0", - GatewayId=igw_id, - ) - - # Associate the route table with the VPC (make it the main route table) - await asyncio.to_thread( - ec2.associate_route_table, - RouteTableId=route_table_id, - VpcId=vpc_id, - ) - - return route_table_id - - -async def associate_route_table(ec2, route_table_id, subnet_id): - try: - await asyncio.to_thread( - ec2.associate_route_table, RouteTableId=route_table_id, SubnetId=subnet_id - ) - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "Resource.AlreadyAssociated": - logging.debug( - f"Route table already associated in {route_table_id}-{subnet_id}: {str(e)}" - ) - else: - raise - - -async def create_security_group_if_not_exists(ec2, vpc_id): - security_groups = await asyncio.to_thread( - ec2.describe_security_groups, - Filters=[ - {"Name": "group-name", "Values": ["SpotInstanceSG"]}, - {"Name": "vpc-id", "Values": [vpc_id]}, - ], - ) - if security_groups["SecurityGroups"]: - return security_groups["SecurityGroups"][0]["GroupId"] - else: - security_group = await asyncio.to_thread( - ec2.create_security_group, - GroupName="SpotInstanceSG", - Description="Security group for Spot 
Instances", - VpcId=vpc_id, - ) - security_group_id = security_group["GroupId"] - await asyncio.to_thread( - ec2.authorize_security_group_ingress, - GroupId=security_group_id, - IpPermissions=[ - { - "IpProtocol": "tcp", - "FromPort": 22, - "ToPort": 22, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 1234, - "ToPort": 1234, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 1235, - "ToPort": 1235, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 2049, - "ToPort": 2049, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 111, - "ToPort": 111, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 9123, - "ToPort": 9123, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - ], - ) - return security_group_id - - -async def create_spot_instances(): - """Create spot instances across all configured regions. - - This is the main function for instance creation that: - 1. Distributes instances across regions based on configuration - 2. Creates the instances in parallel - 3. Waits for all instances to get their public IPs - 4. Displays final node information and continues - - The function doesn't wait for SSH or Bacalhau services to be available. - It only ensures machines have IP addresses assigned. - - Returns: - bool: True if all instances were successfully created with IPs, False otherwise - """ - global task_name, task_total - task_name = "Creating Spot Instances" - task_total = MAX_NODES - - logger.info(f"Starting spot instance creation - target: {MAX_NODES} instances") - - async def create_in_region(region): - global global_node_count - available_slots = MAX_NODES - global_node_count - region_cfg = config.get_region_config(region) - - if available_slots <= 0: - logger.warning(f"Reached maximum nodes. 
Skipping region: {region}") - return [], {} - - instances_to_create = ( - min(INSTANCES_PER_REGION, available_slots) - if region_cfg.get("node_count") == "auto" - else (min(region_cfg.get("node_count"), available_slots)) - ) - - if instances_to_create == 0: - logger.info(f"No instances to create in region {region}") - return [], {} - - logger.info(f"Creating {instances_to_create} spot instances in region: {region}") - global_node_count += instances_to_create - instance_ids = await create_spot_instances_in_region( - config, instances_to_create, region - ) - - # Log success or failure - if instance_ids: - logger.info(f"Successfully created {len(instance_ids)} instances in {region}") - else: - logger.warning(f"Failed to create any instances in {region}") - - return instance_ids - - # Process regions in batches to start machine creation sooner - # Choose a batch size that gives good parallelism without overwhelming the system - batch_size = 10 # Process 10 regions at a time - total_created = 0 - logger.info(f"Creating instances in batches of {batch_size} regions") - - # Group regions into batches - region_batches = [AWS_REGIONS[i:i+batch_size] for i in range(0, len(AWS_REGIONS), batch_size)] - - for batch_num, region_batch in enumerate(region_batches, 1): - logger.info(f"Processing batch {batch_num}/{len(region_batches)} with {len(region_batch)} regions") - - # Create instances in this batch of regions in parallel - create_tasks = [create_in_region(region) for region in region_batch] - batch_results = await asyncio.gather(*create_tasks) - - # Count created instances in this batch - batch_created = sum(len(ids) for ids in batch_results if ids) - total_created += batch_created - logger.info(f"Batch {batch_num} created {batch_created} instances") - - # Wait for public IPs for instances in this batch only - # We'll do this processing in a background task so we can continue - if batch_created > 0: - # Start getting public IPs for this batch in the background - # We don't await this - just let it run - asyncio.create_task(wait_for_batch_public_ips()) - - logger.info(f"All batches processed, created {total_created} instances across all regions") - - # Don't continue if no instances were created - if total_created == 0: - logger.warning("No instances were created - skipping IP address waiting") - return False - - # Wait for any remaining IP address assignments to complete - logger.info("Ensuring all instances have received public IP addresses...") - all_ips_received = await wait_for_public_ips() - - if all_ips_received: - logger.info("All instances have been successfully created with public IPs") - - # Display final node information in a table - but don't wait for provisioning - print_node_table() - else: - logger.warning("Some instances did not receive public IPs within the timeout") - - return all_ips_received - - -def print_node_table(): - """Display a table of all nodes showing hostname, region, zone, and IP addresses. - - This presents a clean summary of all nodes that were created during the operation, - making it easy for users to see what resources are available. - - This is a synchronous function to ensure it works outside of an async context. 
- """ - # Get sorted list of statuses for consistent display - sorted_statuses = sorted(all_statuses.values(), key=lambda x: (x.region, x.zone)) - - # Only include instances that have a public IP (successfully created) - nodes_with_ip = [s for s in sorted_statuses if s.public_ip] - - # Count pending spot requests that didn't get fulfilled - pending_spot_requests = [s for s in sorted_statuses if s.spot_request_id and not s.instance_id] - - # First create and show the successful nodes table - if nodes_with_ip: - # Create a new table specifically for the final display - table = Table(title="Bacalhau Cluster Nodes", box=box.ROUNDED, show_header=True, header_style="bold cyan") - - # Add columns with appropriate alignment and style - table.add_column("Node #", style="dim", justify="right") - table.add_column("Hostname", style="cyan") - table.add_column("Region", style="green") - table.add_column("Zone", style="blue") - table.add_column("Public IP", style="yellow") - table.add_column("Private IP", style="dim cyan") - - # Add rows for each node - for i, status in enumerate(nodes_with_ip, 1): - # Generate a hostname from region and zone - hostname = f"bacalhau-{status.region}-{status.zone.split('-')[-1]}" - - table.add_row( - str(i), - hostname, - status.region, - status.zone, - status.public_ip or "N/A", - status.private_ip or "N/A" - ) - - # Log first for debug - logger.info(f"Displaying final table with {len(nodes_with_ip)} nodes") - - # Display the table outside of the Live context - console.print() # Add some space - console.print(table) - console.print() # Add some space after - else: - logger.warning("No nodes with IP addresses to display") - console.print("[bold yellow]No nodes received IP addresses![/bold yellow]") - console.print() - - # Show a summary of successful vs. pending spot requests - console.print(f"[bold]Spot Instance Summary:[/bold]") - console.print(f"- Successfully provisioned: [green]{len(nodes_with_ip)}[/green] nodes") - console.print(f"- Pending spot requests: [yellow]{len(pending_spot_requests)}[/yellow]") - console.print(f"- Total spot requests: [blue]{len(sorted_statuses)}[/blue]") - console.print() - - # Also print a helpful message about how to connect to nodes with proper key authentication - if nodes_with_ip: - console.print("[bold green]✓[/bold green] Your Bacalhau cluster is being provisioned!") - console.print("[yellow]Machines have IP addresses but may need a few minutes to complete setup[/yellow]") - - # Get the username and private key path from config - username = config.get_username() - private_key_path = config.get_private_ssh_key_path() - - # Create the SSH command with key file if available - if private_key_path: - ssh_cmd = f"ssh -i {private_key_path} {username}@" - else: - ssh_cmd = f"ssh {username}@" - - console.print(f"[dim]To connect to any node: {ssh_cmd}[/dim]") - else: - console.print("[bold red]⚠ No instances were successfully provisioned with IP addresses.[/bold red]") - console.print("[yellow]This could be due to spot capacity issues in the selected regions.[/yellow]") - console.print("[yellow]Consider trying again, selecting different instance types, or using different regions.[/yellow]") - - console.print() - -async def wait_for_provisioning(): - """Wait for all instances to complete their provisioning process. - - This function checks SSH connectivity and whether the Bacalhau services - are running on each instance. It updates the statuses throughout the - provisioning process. 
- - Returns: - bool: True when all instances are fully provisioned - """ - global all_statuses - max_timeout = 600 # 10 minutes timeout - start_time = time.time() - poll_interval = 15 # seconds between polls - - logger.info(f"Monitoring provisioning status for all instances (timeout: {max_timeout}s)") - - # Count instances we're monitoring - instances_to_monitor = [s for s in all_statuses.values() if s.instance_id and s.public_ip] - - if not instances_to_monitor: - logger.warning("No instances to monitor for provisioning") - return False - - logger.info(f"Monitoring provisioning for {len(instances_to_monitor)} instances") - - # Initialize provisioning statuses - for status in instances_to_monitor: - status.detailed_status = "Waiting for provisioning" - # Make sure to signal for UI update - events_to_progress.append(status) - - # Track completion - while True: - # Check timeout - elapsed_time = time.time() - start_time - if elapsed_time > max_timeout: - logger.warning(f"Timeout reached after {max_timeout}s waiting for provisioning") - # Update statuses for those that didn't complete - for status in instances_to_monitor: - if status.detailed_status != "Provisioning complete": - status.detailed_status = "Provisioning timeout" - events_to_progress.append(status) - return False - - # Check all instances in parallel - async def check_instance(status): - try: - # Skip already completed instances - if status.detailed_status == "Provisioning complete": - return True - - # Update status to show we're checking - status.detailed_status = f"Checking provisioning ({int(elapsed_time)}s)" - events_to_progress.append(status) - - # Check SSH connectivity first - if not await check_ssh_connectivity(status.public_ip): - status.detailed_status = "Waiting for SSH access" - events_to_progress.append(status) - return False - - # Then check if Docker is running - if not await check_docker_running(status.public_ip): - status.detailed_status = "Waiting for Docker" - events_to_progress.append(status) - return False - - # Finally check if Bacalhau service is running - if not await check_bacalhau_service(status.public_ip): - status.detailed_status = "Waiting for Bacalhau" - events_to_progress.append(status) - return False - - # All checks passed, provisioning is complete - status.detailed_status = "Provisioning complete" - events_to_progress.append(status) - return True - - except Exception as e: - logger.error(f"Error checking instance {status.instance_id}: {str(e)}") - status.detailed_status = f"Check error: {str(e)[:20]}" - events_to_progress.append(status) - return False - - # Check all instances in parallel - check_tasks = [check_instance(status) for status in instances_to_monitor] - results = await asyncio.gather(*check_tasks) - - # Count how many are complete - complete_count = sum(1 for r in results if r) - logger.info(f"Provisioning progress: {complete_count}/{len(instances_to_monitor)} instances ready") - - # Check if all are complete - if all(results): - logger.info("All instances have completed provisioning") - - # Keep the display up for a few more seconds to show the final status - logger.info("Keeping display open for 5 more seconds to show provisioning complete") - await asyncio.sleep(5) - - return True - - # Wait before next check - await asyncio.sleep(poll_interval) - -async def check_ssh_connectivity(ip_address): - """Check if an instance is accessible via SSH. 
- - Args: - ip_address: The public IP address of the instance - - Returns: - bool: True if SSH connection succeeds, False otherwise - """ - try: - # Use socket connection to check if port 22 is open - reader, writer = await asyncio.wait_for( - asyncio.open_connection(ip_address, 22), - timeout=5.0 - ) - - # Close the connection - writer.close() - await writer.wait_closed() - - return True - except Exception: - return False - -async def check_docker_running(ip_address): - """Check if Docker is running on the instance. - - Args: - ip_address: The public IP address of the instance - - Returns: - bool: True if docker appears to be running, False otherwise - """ - # For now, we'll just check SSH since we can't easily run commands remotely - # In a production version, this would use SSH to execute 'docker ps' - return await check_ssh_connectivity(ip_address) - -async def check_bacalhau_service(ip_address): - """Check if the Bacalhau service is running on the instance. - - Args: - ip_address: The public IP address of the instance - - Returns: - bool: True if Bacalhau service appears to be running, False otherwise - """ - try: - # Try to connect to the bacalhau healthcheck port (assuming it's 1234) - reader, writer = await asyncio.wait_for( - asyncio.open_connection(ip_address, 1234), - timeout=5.0 - ) - - # Close the connection - writer.close() - await writer.wait_closed() - - return True - except Exception: - return False - -async def wait_for_batch_public_ips(): - """Wait for public IPs for instances in the most recent batch. - - This is a non-blocking function that can be called as a background task. - It identifies instances without IPs that were created in recent batches - and polls for their IP addresses. - - This allows us to start getting IPs while other machines are still creating. 
- """ - # Find instances without public IPs among the most recently created ones - # These will be instances that have an instance_id but no public_ip - pending_instances = [status for status in all_statuses.values() - if status.instance_id and not status.public_ip] - - if not pending_instances: - logger.debug("No pending instances waiting for IPs in this batch") - return - - logger.info(f"Background task: Getting public IPs for {len(pending_instances)} new instances") - - # Group instances by region for efficient API calls - instances_by_region = {} - for status in pending_instances: - if status.region not in instances_by_region: - instances_by_region[status.region] = [] - instances_by_region[status.region].append(status) - - # Set a reasonable timeout for this specific batch (shorter than the main wait) - timeout = 120 # 2 minutes timeout per batch - start_time = time.time() - poll_interval = 5 # seconds between polls - - # Poll for public IPs - while time.time() - start_time < timeout: - # Count how many still need IPs - still_pending = sum(1 for status in pending_instances if not status.public_ip) - - if still_pending == 0: - logger.info(f"Background task: All {len(pending_instances)} instances in batch received IPs") - return - - logger.debug(f"Background task: Still waiting for {still_pending} instances to get public IPs") - - # Update the IPs in parallel per region - async def update_region_ips(region, statuses): - # Skip if no instances still need IPs in this region - if all(status.public_ip for status in statuses): - return 0 - - try: - # Get EC2 client for this region - ec2 = get_ec2_client(region) - - # Get instance IDs that still need IPs - instance_ids = [status.instance_id for status in statuses if not status.public_ip] - - # Skip if no instances - if not instance_ids: - return 0 - - # Query AWS API for current instance information - response = await asyncio.to_thread( - ec2.describe_instances, - InstanceIds=instance_ids - ) - - # Process results and update statuses - updated_count = 0 - for reservation in response.get("Reservations", []): - for instance in reservation.get("Instances", []): - instance_id = instance["InstanceId"] - public_ip = instance.get("PublicIpAddress", "") - private_ip = instance.get("PrivateIpAddress", "") - - # Find the matching status - for status in statuses: - if status.instance_id == instance_id: - if public_ip and not status.public_ip: - status.public_ip = public_ip - status.detailed_status = "Public IP assigned" - updated_count += 1 - if private_ip: - status.private_ip = private_ip - # Signal for UI update - events_to_progress.append(status) - - return updated_count - - except Exception as e: - logger.error(f"Error updating IPs for region {region}: {str(e)}") - return 0 - - # Create tasks for each region - tasks = [update_region_ips(region, statuses) - for region, statuses in instances_by_region.items()] - - # Run all tasks in parallel - results = await asyncio.gather(*tasks) - - # Sum up the total updated - updated_count = sum(results) - if updated_count > 0: - logger.info(f"Background task: Received {updated_count} new public IPs") - - # Save the updates to MACHINES.json - save_machines_to_json(operation="update") - - # Wait before next poll - await asyncio.sleep(poll_interval) - - # If we get here, we hit the timeout - logger.warning(f"Background task: Timeout waiting for IPs after {timeout}s") - -async def wait_for_public_ips(): - """Wait for all instances to get their public IP addresses. 
- - This function monitors the instance statuses and waits until all have IP addresses - or until a timeout is reached. It updates the progress display throughout. - - Returns: - bool: True if all instances got IPs, False if any timed out - """ - global all_statuses - timeout = 300 # 5 minutes timeout - start_time = time.time() - poll_interval = 5 # seconds between polls - - logger.info(f"Waiting for public IP addresses (timeout: {timeout}s)") - - # Count all instances we're waiting for - both spot requests and instances without IPs - pending_spot_requests = sum(1 for status in all_statuses.values() - if status.spot_request_id and not status.instance_id) - pending_ips = sum(1 for status in all_statuses.values() - if status.instance_id and not status.public_ip) - - total_pending = pending_spot_requests + pending_ips - logger.info(f"Waiting for {total_pending} instances to complete ({pending_spot_requests} spot requests still pending, {pending_ips} awaiting IPs)") - - # Group instances by region for parallel processing - def get_instances_by_region(): - instances_by_region = {} - spot_requests_by_region = {} - - # First, organize by region - for status in all_statuses.values(): - region = status.region - if not region: - continue - - # Handle instances waiting for IP addresses - if status.instance_id and not status.public_ip: - if region not in instances_by_region: - instances_by_region[region] = [] - instances_by_region[region].append(status) - - # Handle spot requests waiting for fulfillment - elif status.spot_request_id and not status.instance_id: - if region not in spot_requests_by_region: - spot_requests_by_region[region] = [] - spot_requests_by_region[region].append(status) - - # Combine both mappings for return - combined_by_region = {} - all_regions = set(instances_by_region.keys()) | set(spot_requests_by_region.keys()) - - for region in all_regions: - combined_by_region[region] = { - "instances": instances_by_region.get(region, []), - "spot_requests": spot_requests_by_region.get(region, []) - } - - return combined_by_region - - # Track completion status - all_ips_received = False - - while True: - # Count pending spot requests and instances waiting for IPs - pending_spot_requests = sum(1 for status in all_statuses.values() - if status.spot_request_id and not status.instance_id) - pending_ips = sum(1 for status in all_statuses.values() - if status.instance_id and not status.public_ip) - - total_pending = pending_spot_requests + pending_ips - - # Check if we're done with both spot requests and IP assignment - all_complete = total_pending == 0 - - # Check for timeout - time_elapsed = time.time() - start_time - timed_out = time_elapsed > timeout - - # Exit conditions - if all_complete: - provisioned_count = sum(1 for status in all_statuses.values() if status.public_ip) - logger.info(f"All instances processed - {provisioned_count} successfully provisioned with public IPs") - all_ips_received = True - break - - if timed_out: - # Update status for all pending instances - for status in all_statuses.values(): - if status.spot_request_id and not status.instance_id: - status.detailed_status = "Spot request not fulfilled after timeout" - events_to_progress.append(status) - elif status.instance_id and not status.public_ip: - status.detailed_status = "No public IP after timeout" - events_to_progress.append(status) - - provisioned_count = sum(1 for status in all_statuses.values() if status.public_ip) - logger.warning(f"Timed out after {timeout}s - {provisioned_count} instances provisioned, 
{pending_spot_requests} spot requests pending, {pending_ips} instances missing IPs") - break - - # Get instances grouped by region - instances_by_region = get_instances_by_region() - if not instances_by_region: - # No instances need IPs, we're done - logger.info("No instances waiting for IPs") - all_ips_received = True - break - - # Log progress - pending_count = sum(len(ids) for ids in instances_by_region.values()) - logger.info(f"Still waiting for {pending_count} instances to get public IPs ({int(time_elapsed)}s elapsed)") - - # Create tasks to query each region in parallel - async def query_region_instances(region, region_data): - try: - ec2 = get_ec2_client(region) - updated_count = 0 - - # First check spot request status for any pending requests - spot_requests = region_data.get("spot_requests", []) - if spot_requests: - # Get all the spot request IDs - spot_request_ids = [sr.spot_request_id for sr in spot_requests if sr.spot_request_id] - - if spot_request_ids: - logger.debug(f"Checking {len(spot_request_ids)} spot requests in {region}") - try: - spot_response = await asyncio.to_thread( - ec2.describe_spot_instance_requests, - SpotInstanceRequestIds=spot_request_ids - ) - - # Process spot request results - for request in spot_response.get("SpotInstanceRequests", []): - request_id = request.get("SpotInstanceRequestId") - instance_id = request.get("InstanceId") - status_code = request.get("Status", {}).get("Code", "") - status_message = request.get("Status", {}).get("Message", "") - - # Find the matching status object - for status in spot_requests: - if status.spot_request_id == request_id: - # Update status with details - status.detailed_status = f"{status_code}: {status_message}" - - # If the request has an instance ID, it's fulfilled - if instance_id: - status.instance_id = instance_id - status.fulfilled = True - updated_count += 1 - - # Signal for UI update - events_to_progress.append(status) - except Exception as e: - logger.error(f"Error checking spot requests in {region}: {str(e)}") - - # Now check for IP addresses for instances - instances = region_data.get("instances", []) - if instances: - # Get all instance IDs - instance_ids = [i.instance_id for i in instances if i.instance_id] - - if instance_ids: - logger.debug(f"Checking {len(instance_ids)} instances for IPs in {region}") - try: - instance_response = await asyncio.to_thread( - ec2.describe_instances, InstanceIds=instance_ids - ) - - # Process results and update statuses - for reservation in instance_response.get("Reservations", []): - for instance in reservation.get("Instances", []): - instance_id = instance["InstanceId"] - public_ip = instance.get("PublicIpAddress", "") - private_ip = instance.get("PrivateIpAddress", "") - - # Find the matching status object - for status in instances: - if status.instance_id == instance_id: - if public_ip and not status.public_ip: - status.public_ip = public_ip - status.detailed_status = "Public IP assigned" - updated_count += 1 - if private_ip: - status.private_ip = private_ip - # Signal for UI update - events_to_progress.append(status) - except Exception as e: - logger.error(f"Error checking instance IPs in {region}: {str(e)}") - - return updated_count - except Exception as e: - logger.error(f"Error querying region {region}: {str(e)}") - return 0 - - # Create and run tasks for all regions in parallel - regions_to_query = get_instances_by_region() - tasks = [ - query_region_instances(region, region_data) - for region, region_data in regions_to_query.items() - ] - - if tasks: - # Wait 
for all regions to be queried with timeout protection - try: - results = await asyncio.gather(*tasks) - - # Sum up the total updated - updated_count = sum(results) - - # Log how many updates we made - if updated_count > 0: - # Count current success stats - fulfilled_requests = sum(1 for status in all_statuses.values() - if status.spot_request_id and status.instance_id) - ip_assigned = sum(1 for status in all_statuses.values() - if status.instance_id and status.public_ip) - - logger.info(f"Updated {updated_count} instances - {fulfilled_requests} spot requests fulfilled, {ip_assigned} instances have IPs") - - # Save the updates to MACHINES.json - save_machines_to_json(operation="update") - - except Exception as e: - logger.error(f"Error waiting for instances: {str(e)}") - - # Wait before next poll - we don't want to hammer the AWS API - await asyncio.sleep(poll_interval) - - # Return whether all instances got IPs or not - return all_ips_received - - -async def list_spot_instances(): - logger.debug("Entering list_spot_instances function") - global all_statuses, events_to_progress, task_total - logger.debug("Resetting global statuses and events") - all_statuses = {} # Reset the global statuses - events_to_progress = [] # Clear the events list - - global task_name - task_name = "Listing Spot Instances" - task_total = 0 # We'll update this as we go - - logger.info("Starting to list spot instances") - - for region in AWS_REGIONS: - logger.info(f"Processing region: {region}") - logger.debug(f"Getting EC2 client for region {region}") - ec2 = get_ec2_client(region) - try: - logger.info(f"Fetching availability zones for region {region}") - az_response = await asyncio.to_thread(ec2.describe_availability_zones) - availability_zones = [ - az["ZoneName"] for az in az_response["AvailabilityZones"] - ] - logger.info( - f"Found {len(availability_zones)} availability zones in {region}: {', '.join(availability_zones)}" - ) - - for az in availability_zones: - logger.info(f"Querying instances in {region}/{az}") - response = await asyncio.to_thread( - ec2.describe_instances, - Filters=[ - { - "Name": "instance-state-name", - "Values": ["pending", "running", "stopped"], - }, - {"Name": "availability-zone", "Values": [az]}, - { - "Name": f"tag:{FILTER_TAG_NAME}", - "Values": [FILTER_TAG_VALUE], - }, - ], - ) - - instance_count = 0 - for reservation in response["Reservations"]: - for instance in reservation["Instances"]: - instance_count += 1 - logger.info( - f"Found instance: {instance['InstanceId']} in {region}/{az}" - ) - instance_id = instance["InstanceId"] - thisInstanceStatusObject = InstanceStatus( - region, az, 0, instance_id - ) - thisInstanceStatusObject.status = instance["State"][ - "Name" - ].capitalize() - thisInstanceStatusObject.elapsed_time = ( - datetime.now(timezone.utc) - instance["LaunchTime"] - ).total_seconds() - thisInstanceStatusObject.public_ip = instance.get( - "PublicIpAddress", "" - ) - thisInstanceStatusObject.private_ip = instance.get( - "PrivateIpAddress", "" - ) - - logger.debug( - f"Adding instance {instance_id} to status tracking" - ) - events_to_progress.append(instance_id) - all_statuses[instance_id] = thisInstanceStatusObject - task_total += 1 - - if instance_count == 0: - logger.info(f"No instances found in {region}/{az}") - - logger.info( - f"Completed scan of region {region}, found {sum(1 for status in all_statuses.values() if status.region == region)} instances" - ) - - except Exception as e: - logger.error( - f"An error occurred while listing instances in {region}: 
{str(e)}", - exc_info=True, - ) - - logger.info( - f"Finished listing spot instances, found {len(all_statuses)} instances in total" - ) - return all_statuses - - -async def destroy_instances(): - """Destroy all managed instances across all regions. - - This function first removes instances from MACHINES.json to provide immediate feedback, - then asynchronously queries AWS APIs to find and terminate any instances that might - have been missed in our tracking file. - """ - global task_name, task_total, events_to_progress - task_name = "Terminating Spot Instances" - events_to_progress = [] - - # Start by loading and clearing MACHINES.json for immediate feedback - logger.info("Loading existing machine records from MACHINES.json") - existing_data = load_machines_from_json() - existing_machines = existing_data.get("machines", {}) - - # If we have existing machines in the file, create status objects for them first - if existing_machines: - logger.info(f"Found {len(existing_machines)} existing machines in MACHINES.json") - for machine_id, machine_data in existing_machines.items(): - try: - # Extract needed information for termination - region = machine_data.get("region") - zone = machine_data.get("zone") - instance_id = machine_data.get("instance_id") - vpc_id = machine_data.get("vpc_id") - - if not all([region, zone, instance_id]): - logger.warning(f"Incomplete data for machine {machine_id}, skipping") - continue - - # Create a status object for tracking - status = InstanceStatus(region, zone) - status.instance_id = instance_id - status.status = "Terminating" - status.detailed_status = "From MACHINES.json" - status.vpc_id = vpc_id - all_statuses[instance_id] = status - events_to_progress.append(status) - - logger.info(f"Added instance {instance_id} in {region} for termination from MACHINES.json") - - except Exception as e: - logger.error(f"Error processing machine record {machine_id}: {str(e)}") - - # Remove all machines from MACHINES.json immediately - if existing_machines: - logger.info("Clearing MACHINES.json to provide immediate feedback") - try: - # Create empty machine data - output_data = { - "timestamp": datetime.now(timezone.utc).isoformat(), - "machines": {}, - "total_count": 0, - "regions": [], - "last_operation": "delete", - "last_updated": datetime.now(timezone.utc).isoformat() - } - - # Write to temporary file first - temp_file = "MACHINES.json.tmp" - with open(temp_file, "w") as f: - # Use fcntl for file locking on Unix systems - try: - import fcntl - fcntl.flock(f, fcntl.LOCK_EX) # Exclusive lock for writing - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - os.fsync(f.fileno()) # Sync filesystem - fcntl.flock(f, fcntl.LOCK_UN) # Release lock - except (ImportError, AttributeError): - # On Windows or if fcntl not available - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - - # Atomic rename to ensure file is either fully written or not at all - os.replace(temp_file, "MACHINES.json") - logger.info("Successfully cleared MACHINES.json") - - except Exception as e: - logger.error(f"Error clearing MACHINES.json: {str(e)}") - - # Now asynchronously query AWS APIs to find any instances we might have missed - logger.info("Asynchronously querying AWS APIs for any additional instances...") - - # Create a map to track instance-to-region mapping for later termination - instance_region_map = {} - - # Add all instances from MACHINES.json to our map - for instance_id, 
status in all_statuses.items(): - instance_region_map[instance_id] = { - "region": status.region, - "vpc_id": status.vpc_id, - } - - # Query each region in parallel - async def query_region_for_instances(region): - logger.info(f"Checking region {region} for instances to terminate...") - region_instances = {} # Store instances found in this region - - try: - ec2 = get_ec2_client(region) - # Use safe_aws_call for proper timeout handling - logger.info(f"Querying AWS API for instances in {region}...") - response = await safe_aws_call( - ec2.describe_instances, - Filters=[ - { - "Name": "instance-state-name", - "Values": ["pending", "running", "stopping", "stopped"], - }, - {"Name": f"tag:{FILTER_TAG_NAME}", "Values": [FILTER_TAG_VALUE]}, - ], - ) - - instance_count = 0 - for reservation in response["Reservations"]: - for instance in reservation["Instances"]: - instance_count += 1 - instance_id = instance["InstanceId"] - az = instance["Placement"]["AvailabilityZone"] - vpc_id = instance.get("VpcId") - - # Check if we already have this instance in our tracking or instance_region_map - if instance_id not in all_statuses and instance_id not in instance_region_map: - logger.info(f"Found additional instance {instance_id} in {az} from AWS API") - thisInstanceStatusObject = InstanceStatus(region, az) - thisInstanceStatusObject.instance_id = instance_id - thisInstanceStatusObject.status = "Terminating" - thisInstanceStatusObject.detailed_status = "Found via AWS API" - thisInstanceStatusObject.vpc_id = vpc_id - all_statuses[instance_id] = thisInstanceStatusObject - region_instances[instance_id] = { - "region": region, - "vpc_id": vpc_id, - } - - if instance_count == 0: - logger.info(f"No instances found in region {region}") - - return region_instances - - except TimeoutError: - logger.error( - f"Timeout while listing instances in {region}. Check your AWS credentials." - ) - return {} - except Exception as e: - logger.error( - f"An error occurred while listing instances in {region}: {str(e)}" - ) - return {} - - # Query all regions in parallel - tasks = [query_region_for_instances(region) for region in AWS_REGIONS] - region_results = await asyncio.gather(*tasks) - - # Merge results from all regions - for region_instances in region_results: - instance_region_map.update(region_instances) - - if not all_statuses: - logger.info("No instances found to terminate.") - return - - task_total = len(all_statuses) - logger.info(f"Found {task_total} instances to terminate.") - - async def terminate_instances_in_region(region, region_instances): - if not region_instances: - logger.info(f"No instances to terminate in {region}") - return - - # Deduplication check - double check for duplicates - # This is an extra safeguard to ensure we don't try to terminate the same instance twice - unique_instances = list(set(region_instances)) - - if len(unique_instances) != len(region_instances): - logger.warning(f"Removed {len(region_instances) - len(unique_instances)} duplicate instances in {region}") - region_instances = unique_instances - - ec2 = get_ec2_client(region) - try: - logger.info(f"Terminating {len(region_instances)} instances in {region}...") - await safe_aws_call(ec2.terminate_instances, InstanceIds=region_instances) - logger.info( - f"Instances terminate request sent in {region}, waiting for completion..." 
- ) - - waiter = ec2.get_waiter("instance_terminated") - start_time = time.time() - while True: - try: - logger.info(f"Checking if instances in {region} are terminated...") - await safe_aws_call( - waiter.wait, - InstanceIds=region_instances, - WaiterConfig={"MaxAttempts": 1}, - ) - logger.info(f"All instances in {region} terminated successfully") - break - except botocore.exceptions.WaiterError: - elapsed_time = time.time() - start_time - logger.info( - f"Instances in {region} still terminating after {elapsed_time:.0f}s" - ) - for instance_id in region_instances: - thisInstanceStatusObject = all_statuses[instance_id] - thisInstanceStatusObject.elapsed_time = elapsed_time - thisInstanceStatusObject.detailed_status = ( - f"Terminating ({elapsed_time:.0f}s)" - ) - events_to_progress.append(thisInstanceStatusObject) - all_statuses[instance_id] = thisInstanceStatusObject - await asyncio.sleep(10) - except TimeoutError: - # Handle timeout during waiter - logger.error( - f"Timeout waiting for instances to terminate in {region}" - ) - for instance_id in region_instances: - thisInstanceStatusObject = all_statuses[instance_id] - thisInstanceStatusObject.status = "Timeout" - thisInstanceStatusObject.detailed_status = ( - "AWS API timeout during termination" - ) - events_to_progress.append(thisInstanceStatusObject) - all_statuses[instance_id] = thisInstanceStatusObject - break - - # Update status for terminated instances - for instance_id in region_instances: - thisInstanceStatusObject = all_statuses[instance_id] - thisInstanceStatusObject.status = "Terminated" - thisInstanceStatusObject.detailed_status = "Instance terminated" - events_to_progress.append(thisInstanceStatusObject) - all_statuses[instance_id] = thisInstanceStatusObject - - # Clean up resources for each VPC - vpcs_to_delete = set( - info["vpc_id"] - for info in instance_region_map.values() - if info["region"] == region and info["vpc_id"] - ) - - if vpcs_to_delete: - logger.info(f"Cleaning up {len(vpcs_to_delete)} VPCs in {region}") - else: - logger.info(f"No VPCs to clean up in {region}") - - for vpc_id in vpcs_to_delete: - try: - logger.info(f"Starting cleanup of VPC {vpc_id} in {region}") - for instance_id, status in all_statuses.items(): - if status.vpc_id == vpc_id: - status.detailed_status = "Cleaning up VPC resources" - events_to_progress.append(status) - - await clean_up_vpc_resources(ec2, vpc_id) - logger.info(f"Completed cleanup of VPC {vpc_id} in {region}") - - except Exception as e: - logger.error( - f"An error occurred while cleaning up VPC {vpc_id} in {region}: {str(e)}" - ) - - except Exception as e: - logger.error( - f"An error occurred while cleaning up resources in {region}: {str(e)}" - ) - - # Create a deduplicated mapping of instance_id to region/vpc info - # This ensures we don't have duplicate entries for the same instance - deduplicated_map = {} - for instance_id, info in instance_region_map.items(): - # Check if we already have this instance (shouldn't happen, but just in case) - if instance_id not in deduplicated_map: - deduplicated_map[instance_id] = info - else: - logger.warning(f"Duplicate instance found: {instance_id} - keeping first entry") - - # Group instances by region - region_instances = {} - for instance_id, info in deduplicated_map.items(): - region = info["region"] - if region not in region_instances: - region_instances[region] = [] - region_instances[region].append(instance_id) - - # Log the deduplication results - if len(deduplicated_map) != len(instance_region_map): - logger.info(f"Removed 
{len(instance_region_map) - len(deduplicated_map)} duplicate instances") - - # Terminate instances in parallel - termination_tasks = [] - for region, instances in region_instances.items(): - logger.info( - f"Creating termination task for {len(instances)} instances in {region}" - ) - termination_tasks.append(terminate_instances_in_region(region, instances)) - - if termination_tasks: - logger.info(f"Starting {len(termination_tasks)} parallel termination tasks") - await asyncio.gather(*termination_tasks) - logger.info("All termination tasks completed") - else: - logger.info("No termination tasks to execute") - - logger.info("All instances have been terminated.") - - # Create and print a summary of what was terminated - print_termination_summary(deduplicated_map) - - -async def clean_up_vpc_resources(ec2, vpc_id): - async def update_status(message): - logger.info(message) - for status in all_statuses.values(): - if status.vpc_id == vpc_id: - status.detailed_status = message - - await update_status(f"Looking for security groups in VPC {vpc_id}") - sgs = await asyncio.to_thread( - ec2.describe_security_groups, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - - sg_count = 0 - for sg in sgs["SecurityGroups"]: - if sg["GroupName"] != "default": - sg_count += 1 - await update_status( - f"Deleting security group {sg['GroupId']} ({sg['GroupName']})" - ) - await asyncio.to_thread(ec2.delete_security_group, GroupId=sg["GroupId"]) - - if sg_count == 0: - await update_status(f"No non-default security groups found in VPC {vpc_id}") - - await update_status(f"Looking for subnets in VPC {vpc_id}") - subnets = await asyncio.to_thread( - ec2.describe_subnets, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - - subnet_count = 0 - for subnet in subnets["Subnets"]: - subnet_count += 1 - await update_status(f"Deleting subnet {subnet['SubnetId']}") - await asyncio.to_thread(ec2.delete_subnet, SubnetId=subnet["SubnetId"]) - - if subnet_count == 0: - await update_status(f"No subnets found in VPC {vpc_id}") - - await update_status(f"Looking for route tables in VPC {vpc_id}") - rts = await asyncio.to_thread( - ec2.describe_route_tables, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - - rt_count = 0 - for rt in rts["RouteTables"]: - if not any( - association.get("Main", False) for association in rt.get("Associations", []) - ): - rt_count += 1 - await update_status(f"Deleting route table {rt['RouteTableId']}") - await asyncio.to_thread( - ec2.delete_route_table, - RouteTableId=rt["RouteTableId"], - ) - - if rt_count == 0: - await update_status(f"No non-main route tables found in VPC {vpc_id}") - - await update_status(f"Looking for internet gateways attached to VPC {vpc_id}") - igws = await asyncio.to_thread( - ec2.describe_internet_gateways, - Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}], - ) - - igw_count = 0 - for igw in igws["InternetGateways"]: - igw_count += 1 - await update_status(f"Detaching internet gateway {igw['InternetGatewayId']}") - await asyncio.to_thread( - ec2.detach_internet_gateway, - InternetGatewayId=igw["InternetGatewayId"], - VpcId=vpc_id, - ) - await update_status(f"Deleting internet gateway {igw['InternetGatewayId']}") - await asyncio.to_thread( - ec2.delete_internet_gateway, - InternetGatewayId=igw["InternetGatewayId"], - ) - - if igw_count == 0: - await update_status(f"No internet gateways found attached to VPC {vpc_id}") - - await update_status(f"Deleting VPC {vpc_id}") - await asyncio.to_thread(ec2.delete_vpc, VpcId=vpc_id) - await update_status(f"VPC 
{vpc_id} successfully deleted") - - -def print_termination_summary(instance_map): - """Print a summary table of all terminated instances. - - Args: - instance_map: Dictionary mapping instance IDs to region/vpc info - """ - if not instance_map: - console.print("[yellow]No instances were terminated[/yellow]") - return - - # Collect zone information from status objects - zone_info = {} - for instance_id, info in instance_map.items(): - # Try to get zone from status object - region = info.get("region", "unknown") - - # Look for the zone in the status object if available - zone = "unknown" - if instance_id in all_statuses: - zone = all_statuses[instance_id].zone - - # Track by region and zone - if region not in zone_info: - zone_info[region] = {} - - if zone not in zone_info[region]: - zone_info[region][zone] = 0 - - zone_info[region][zone] += 1 - - # Create a summary table - table = Table(title="Terminated Instances Summary", box=box.ROUNDED, show_header=True, header_style="bold red") - - # Add columns - table.add_column("Region", style="cyan") - table.add_column("Zone", style="blue") - table.add_column("Instances", style="red", justify="right") - - # Add rows for each region and zone - total_instances = 0 - - # Sort regions for consistent display - for region in sorted(zone_info.keys()): - regions_zones = zone_info[region] - # Sort zones within each region - for zone in sorted(regions_zones.keys()): - count = regions_zones[zone] - total_instances += count - - # Only show region on first row for this region - if table.row_count > 0 and zone != sorted(regions_zones.keys())[0]: - table.add_row("", zone, str(count)) - else: - table.add_row(region, zone, str(count)) - - # Add a total row - table.add_row("", "[bold]TOTAL[/bold]", f"[bold]{total_instances}[/bold]") - - # Display the table - console.print() - console.print(table) - console.print() - console.print(f"[bold red]✓[/bold red] Successfully terminated {total_instances} instances") - console.print() - -async def delete_disconnected_aws_nodes(): - try: - # Run bacalhau node list command and capture output - logger.info("Running 'bacalhau node list' to find disconnected nodes") - result = subprocess.run( - ["bacalhau", "node", "list", "--output", "json"], - capture_output=True, - text=True, - check=True, - ) - nodes = json.loads(result.stdout) - - disconnected_aws_nodes = [] - - for node in nodes: - if ( - node["Connection"] == "DISCONNECTED" - and node["Info"]["NodeType"] == "Compute" - and "EC2_INSTANCE_FAMILY" in node["Info"]["Labels"] - ): - disconnected_aws_nodes.append(node["Info"]["NodeID"]) - - if not disconnected_aws_nodes: - logger.info("No disconnected AWS nodes found.") - return - - logger.info(f"Found {len(disconnected_aws_nodes)} disconnected AWS node(s).") - - for node_id in disconnected_aws_nodes: - logger.info(f"Deleting node: {node_id}") - try: - # Run bacalhau admin node delete command - subprocess.run(["bacalhau", "node", "delete", node_id], check=True) - logger.info(f"Successfully deleted node: {node_id}") - except subprocess.CalledProcessError as e: - logger.error(f"Failed to delete node {node_id}. 
Error: {e}") - - except subprocess.CalledProcessError as e: - logger.error(f"Error running bacalhau node list: {e}") - except json.JSONDecodeError as e: - logger.error(f"Error parsing JSON output: {e}") - except Exception as e: - logger.error(f"An unexpected error occurred: {e}") - - -def all_statuses_to_dict(): - return { - status.id: { - "id": status.id, - "region": status.region, - "zone": status.zone, - "status": status.status, - "detailed_status": status.detailed_status, - "elapsed_time": status.elapsed_time, - "instance_id": status.instance_id, - "spot_request_id": status.spot_request_id, - "fulfilled": getattr(status, "fulfilled", False), - "public_ip": status.public_ip, - "private_ip": status.private_ip, - "vpc_id": status.vpc_id, - "timestamp": datetime.now(timezone.utc).isoformat(), - } - for status in all_statuses.values() - } - -def load_machines_from_json(): - """Atomically load machine data from MACHINES.json if it exists""" - try: - # Check if the file exists - if not os.path.exists("MACHINES.json"): - logger.debug("MACHINES.json does not exist yet") - return {} - - # Open with exclusive access to ensure atomic read - with open("MACHINES.json", "r") as f: - # Use fcntl for file locking on Unix systems - try: - import fcntl - fcntl.flock(f, fcntl.LOCK_SH) # Shared lock for reading - data = json.load(f) - fcntl.flock(f, fcntl.LOCK_UN) # Release lock - except (ImportError, AttributeError): - # On Windows or if fcntl not available, just read without locking - data = json.load(f) - - return data - except json.JSONDecodeError: - logger.warning("MACHINES.json exists but contains invalid JSON, treating as empty") - return {} - except Exception as e: - logger.error(f"Failed to load machines from JSON: {str(e)}", exc_info=True) - return {} - -def save_machines_to_json(operation="update"): - """Atomically save the current machine statuses to MACHINES.json - - Args: - operation: String indicating the type of operation - "update" or "delete" - """ - try: - # Create temporary file first (atomic write pattern) - temp_file = "MACHINES.json.tmp" - - # First try to load existing data - existing_data = load_machines_from_json() - existing_machines = existing_data.get("machines", {}) - - # Convert all current instances to a dict - current_machines = all_statuses_to_dict() - - if operation == "update": - # Update existing machines with current ones - machines_data = {**existing_machines, **current_machines} - - # Log operations - new_count = len(set(current_machines.keys()) - set(existing_machines.keys())) - updated_count = len(set(current_machines.keys()) & set(existing_machines.keys())) - logger.info(f"Adding {new_count} new and updating {updated_count} existing machines") - - elif operation == "delete": - # For delete, remove current machines from existing ones - machines_to_remove = set(current_machines.keys()) - machines_data = {k: v for k, v in existing_machines.items() - if k not in machines_to_remove} - - # Log operation - removed_count = len(machines_to_remove) - logger.info(f"Removing {removed_count} machines from MACHINES.json") - else: - # Default to just using current machines - machines_data = current_machines - - # Extract regions from the machines data (safely) - regions = set() - for machine_data in machines_data.values(): - # Check if the machine data has a region key - if isinstance(machine_data, dict) and "region" in machine_data: - region = machine_data["region"] - if region: # Only add non-empty regions - regions.add(region) - - # Include metadata - output_data = { - 
"timestamp": datetime.now(timezone.utc).isoformat(), - "machines": machines_data, - "total_count": len(machines_data), - "regions": list(regions), - "last_operation": operation, - "last_updated": datetime.now(timezone.utc).isoformat() - } - - # Write to temporary file first - with open(temp_file, "w") as f: - # Use fcntl for file locking on Unix systems - try: - import fcntl - fcntl.flock(f, fcntl.LOCK_EX) # Exclusive lock for writing - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - os.fsync(f.fileno()) # Sync filesystem - fcntl.flock(f, fcntl.LOCK_UN) # Release lock - except (ImportError, AttributeError): - # On Windows or if fcntl not available - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - - # Atomic rename to ensure file is either fully written or not at all - os.replace(temp_file, "MACHINES.json") - - if operation == "update": - logger.info(f"Saved {len(machines_data)} machine records to MACHINES.json") - else: - logger.info(f"Updated MACHINES.json - {len(machines_data)} machines remain") - - return True - except Exception as e: - logger.error(f"Failed to save machines to JSON: {str(e)}", exc_info=True) - - # Log more debug info to help diagnose the issue - logger.debug(f"machines_data type: {type(machines_data)}") - if isinstance(machines_data, dict): - logger.debug(f"machines_data has {len(machines_data)} entries") - # Log a sample of the data - if machines_data: - sample_key = next(iter(machines_data)) - sample_value = machines_data[sample_key] - logger.debug(f"Sample entry - key: {sample_key}, value type: {type(sample_value)}") - if isinstance(sample_value, dict): - logger.debug(f"Sample keys: {list(sample_value.keys())}") - - # Clean up temp file if it exists - try: - if os.path.exists("MACHINES.json.tmp"): - os.remove("MACHINES.json.tmp") - except Exception as cleanup_error: - logger.error(f"Error cleaning up temp file: {str(cleanup_error)}") - - return False - - -def parse_args(): - """Parse command line arguments""" - parser = argparse.ArgumentParser( - description="Manage spot instances across multiple AWS regions." 
- ) - parser.add_argument( - "action", # Changed from --action to positional argument - choices=["create", "destroy", "list", "delete_disconnected_aws_nodes"], - help="Action to perform", - nargs="?", # Make it optional - default="list", # Default to list if not provided - ) - parser.add_argument( - "--format", choices=["default", "json"], default="default", help="Output format" - ) - parser.add_argument( - "--timeout", type=int, default=30, help="AWS API timeout in seconds" - ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="Enable verbose debug output" - ) - - args = parser.parse_args() - - # Configure unified logging - use the same file_handler for both log file and console - global file_handler - - # Remove any existing handlers to ensure clean configuration - for handler in logger.handlers[:]: - logger.removeHandler(handler) - - # Create/truncate the debug.log file - try: - with open("debug.log", "w") as f: - pass # Just open in write mode to truncate - except Exception as e: - sys.stdout.write(f"Warning: Could not truncate debug.log: {e}\n") - sys.stdout.flush() - - # Create and configure file handler - file_handler = logging.FileHandler("debug.log") - file_handler.setFormatter(log_formatter) - - # Set log levels based on verbose flag - if args.verbose: - file_handler.setLevel(logging.DEBUG) - logger.setLevel(logging.DEBUG) - else: - file_handler.setLevel(logging.INFO) - logger.setLevel(logging.INFO) - - # Add the file handler to our logger - this will be shared with the console handler - logger.addHandler(file_handler) - - # Log initial startup message - logger.info(f"Starting with action: {args.action}, verbose: {args.verbose}") - - # Set global timeout from command line argument - global AWS_API_TIMEOUT - AWS_API_TIMEOUT = args.timeout - logger.info(f"Set AWS API timeout to {AWS_API_TIMEOUT} seconds") - - # Set task name based on action - global task_name, task_total - if args.action == "create": - task_name = "Creating Spot Instances" - task_total = TOTAL_INSTANCES - elif args.action == "destroy": - task_name = "Terminating Spot Instances" - task_total = 100 # Will be updated when we know how many instances to terminate - elif args.action == "list": - task_name = "Listing Spot Instances" - task_total = 100 # Will be updated when we know how many instances to list - elif args.action == "delete_disconnected_aws_nodes": - task_name = "Deleting Disconnected AWS Nodes" - task_total = 100 # Will be updated when we know how many nodes to delete - - logger.info(f"Set task: '{task_name}' with target: {task_total}") - return args - - -async def check_aws_credentials(): - """Check if AWS credentials are valid before proceeding. 
- - Returns: - bool: True if credentials are valid, False otherwise - """ - logger.info("Checking AWS credentials validity...") - try: - # Try to use any region for the check - we'll use the first configured region - region = AWS_REGIONS[0] if AWS_REGIONS else "us-east-1" - ec2 = get_ec2_client(region) - - # Make a simple API call that requires valid credentials - await safe_aws_call(ec2.describe_regions, RegionNames=[region]) - - logger.info("AWS credentials are valid") - return True - except botocore.exceptions.ClientError as e: - error_code = getattr(e, 'response', {}).get('Error', {}).get('Code', '') - error_msg = getattr(e, 'response', {}).get('Error', {}).get('Message', str(e)) - - if error_code in ['ExpiredToken', 'InvalidToken', 'UnauthorizedOperation']: - logger.error(f"AWS credentials have expired or are invalid: {error_msg}") - console.print("[bold red]AWS credentials have expired or are invalid.[/bold red]") - console.print("[yellow]Please run 'aws sso login' to refresh your credentials.[/yellow]") - else: - logger.error(f"Error checking AWS credentials: {error_code} - {error_msg}") - console.print(f"[bold red]AWS credentials error:[/bold red] {error_code} - {error_msg}") - - return False - except Exception as e: - logger.error(f"Error checking AWS credentials: {str(e)}") - console.print(f"[bold red]Error checking AWS credentials:[/bold red] {str(e)}") - console.print("[yellow]Please verify your AWS configuration and connectivity.[/yellow]") - return False - -async def perform_action(): - """Execute the requested action""" - args = parse_args() - logger.debug(f"Starting perform_action with action: {args.action}") - operation_result = { - "success": False, - "action": args.action, - "start_time": datetime.now(timezone.utc).isoformat(), - "end_time": None, - "result_summary": {} - } - - # Check AWS credentials before performing any action that requires AWS API calls - if args.action in ["create", "destroy", "list"]: - credentials_valid = await check_aws_credentials() - if not credentials_valid: - operation_result["error"] = "Invalid AWS credentials" - return operation_result - - try: - if args.action == "create": - logger.info("Initiating create_spot_instances") - # Wait for the create operation to fully complete - creation_success = await create_spot_instances() - - # Count successfully created instances by region - created_instances = {} - for status in all_statuses.values(): - if status.instance_id and status.public_ip: # Successfully created with IP - region = status.region - if region not in created_instances: - created_instances[region] = 0 - created_instances[region] += 1 - - total_created = sum(created_instances.values()) - - # Count instances with public IPs and completed provisioning - provisioned_instances = {} - for status in all_statuses.values(): - if status.instance_id and status.public_ip and status.detailed_status == "Provisioning complete": - region = status.region - if region not in provisioned_instances: - provisioned_instances[region] = 0 - provisioned_instances[region] += 1 - - total_provisioned = sum(provisioned_instances.values()) - - # Set operation result based on success of creation - operation_result["success"] = total_created > 0 - operation_result["result_summary"] = { - "instances_created": total_created, - "instances_by_region": created_instances, - "instances_provisioned": total_provisioned, - "all_received_ips": creation_success - } - - # Save newly created instances to MACHINES.json (operation="update") - if len(all_statuses) > 0: - save_result 
= save_machines_to_json(operation="update") - operation_result["result_summary"]["saved_to_file"] = save_result - - logger.info(f"Creation completed: {total_created} instances created, {total_provisioned} fully provisioned") - - # If we didn't create any instances, that's an issue - if total_created == 0: - raise Exception("Failed to create any instances - check AWS credentials and limits") - - elif args.action == "list": - logger.info("Initiating list_spot_instances") - await list_spot_instances() - - # Count instances by status - instance_counts = {} - for status in all_statuses.values(): - if status.status not in instance_counts: - instance_counts[status.status] = 0 - instance_counts[status.status] += 1 - - operation_result["success"] = True - operation_result["result_summary"] = { - "total_instances": len(all_statuses), - "instances_by_status": instance_counts - } - - # Update MACHINES.json with current instances (operation="update") - if len(all_statuses) > 0: - save_machines_to_json(operation="update") - - elif args.action == "destroy": - # Store counts before destruction for reporting - initial_count = len(all_statuses) - initial_regions = set(status.region for status in all_statuses.values() if status.region) - - # Create a dictionary to track instances per region and zone - region_zone_counts = {} - for status in all_statuses.values(): - if status.region and status.zone: - if status.region not in region_zone_counts: - region_zone_counts[status.region] = {} - if status.zone not in region_zone_counts[status.region]: - region_zone_counts[status.region][status.zone] = 0 - region_zone_counts[status.region][status.zone] += 1 - - # Skip doing any MACHINES.json operations if empty - has_instances = initial_count > 0 - - logger.info("Initiating destroy_instances") - await destroy_instances() - - # Get summary of terminated instances - operation_result["success"] = True - operation_result["result_summary"] = { - "instances_terminated": initial_count, - "regions_affected": list(initial_regions), - "region_zone_distribution": region_zone_counts, - "cleanup_completed": True - } - - # Remove destroyed instances from MACHINES.json (operation="delete") - if has_instances: - save_machines_to_json(operation="delete") - - elif args.action == "delete_disconnected_aws_nodes": - logger.info("Initiating delete_disconnected_aws_nodes") - await delete_disconnected_aws_nodes() - operation_result["success"] = True - - logger.debug(f"Completed action: {args.action}") - - # Set completion timestamp - operation_result["end_time"] = datetime.now(timezone.utc).isoformat() - - except TimeoutError as e: - logger.error(f"TimeoutError occurred: {str(e)}") - console.print(f"[bold red]Error:[/bold red] {str(e)}") - console.print("[yellow]This may be due to AWS credential issues.[/yellow]") - console.print( - "[yellow]Try running 'aws sso login' to refresh your credentials.[/yellow]" - ) - table_update_event.set() - operation_result["error"] = str(e) - return operation_result - - except botocore.exceptions.ClientError as e: - logger.error(f"AWS ClientError occurred: {str(e)}") - if "ExpiredToken" in str(e) or "InvalidToken" in str(e): - console.print("[bold red]AWS credentials have expired.[/bold red]") - console.print( - "[yellow]Try running 'aws sso login' to refresh your credentials.[/yellow]" - ) - else: - console.print(f"[bold red]AWS Error:[/bold red] {str(e)}") - table_update_event.set() - operation_result["error"] = str(e) - return operation_result - - except Exception as e: - logger.error(f"Unexpected error 
occurred: {str(e)}", exc_info=True) - console.print(f"[bold red]Error:[/bold red] {str(e)}") - table_update_event.set() - operation_result["error"] = str(e) - return operation_result - - return operation_result - - -async def main(): - """Main execution function""" - handler = None # Initialize handler to None - try: - args = parse_args() - - # Logging has been configured in parse_args - # We'll see these log messages in both debug.log and the Rich console panel - if args.verbose: - logger.debug("Verbose logging enabled") - - logger.info(f"Starting action: {args.action}") - - if args.format == "json": - logger.info("Using JSON output format") - operation_result = await perform_action() - # Machine updates in MACHINES.json are now handled within perform_action() - - # For JSON output, also show MACHINES.json contents if it exists - machines_from_file = load_machines_from_json().get("machines", {}) - - # Use direct stdout before rich console is initialized - output = { - "current_machines": all_statuses_to_dict(), - "saved_machines_count": len(machines_from_file), - "operation_result": operation_result - } - sys.stdout.write(json.dumps(output, indent=2, default=str) + "\n") - sys.stdout.flush() - return - - # Create initial progress and table - progress = Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), - ) - table = make_progress_table() - - # Create layout before using it in Live - layout = create_layout(progress, table) - - # Initialize the live display with the layout - with Live( - layout, - console=console, - refresh_per_second=5, - auto_refresh=True, - screen=True, - transient=False, # Keep the display visible after exit - ) as live: - try: - # Update our global flag to indicate terminal has been cleared - global is_terminal_cleared - is_terminal_cleared = True - - # Add the rich console handler for logging, sharing the file handler - handler = RichConsoleHandler(live, layout, file_handler) # Pass layout and file handler - logger.addHandler(handler) - - # Start display update task in a separate thread - loop = asyncio.get_event_loop() - display_task = loop.create_task(update_display(live)) - - # Set up exception handler for display_task - def handle_display_task_exception(task): - try: - # Get the exception if any - task.result() - except Exception as e: - logger.error(f"Display task failed: {str(e)}", exc_info=True) - # We don't reraise here - just log it - - display_task.add_done_callback(handle_display_task_exception) - - # Perform the requested action - operation_result = await perform_action() - - # Display summary after operation completes (if successful) - if operation_result.get("success", False): - # Create a nice summary table - summary_table = Table(title=f"{args.action.capitalize()} Operation Summary", - show_header=True, - header_style="bold cyan", - box=box.ROUNDED) - - # Add columns based on the action - if args.action == "create": - summary_table.add_column("Total Created", style="green") - summary_table.add_column("Regions", style="blue") - summary_table.add_column("Distribution", style="cyan") - - # Get summary data - summary = operation_result["result_summary"] - total = summary.get("instances_created", 0) - by_region = summary.get("instances_by_region", {}) - all_ips = summary.get("all_received_ips", True) - - # Add the IP status column - summary_table.add_column("IP Status", style="green") - - # Format region distribution - region_list = ", ".join(by_region.keys()) 
if by_region else "None" - distribution = " | ".join([f"{region}: {count}" for region, count in by_region.items()]) if by_region else "None" - - # Format IP status message - ip_status = "✓ All Received" if all_ips else "⚠ Some missing IPs" - - # Add the row with status - summary_table.add_row(str(total), region_list, distribution, ip_status) - - elif args.action == "destroy": - summary_table.add_column("Instances Terminated", style="red") - summary_table.add_column("Regions Affected", style="cyan") - summary_table.add_column("Result", style="magenta") - - # Get summary data - summary = operation_result["result_summary"] - terminated = summary.get("instances_terminated", 0) - regions = summary.get("regions_affected", []) - - # Format for display - region_text = ", ".join(regions) if regions else "None" - - # Add the row - show if machines file was updated - if terminated > 0: - summary_table.add_row(str(terminated), region_text, "✓ Successful") - else: - summary_table.add_row(str(terminated), region_text, "No machines found") - - # Print the summary - console.print("\n") # Add some space - console.print(summary_table) - console.print("\n") # Add some space after - - # Show appropriate message based on the operation - if args.action == "create" and operation_result.get("result_summary", {}).get("instances_created", 0) > 0: - console.print("[green]✓ Machine information saved to MACHINES.json[/green]") - elif args.action == "list" and operation_result.get("result_summary", {}).get("total_instances", 0) > 0: - console.print("[green]✓ Machine information updated in MACHINES.json[/green]") - elif args.action == "destroy" and operation_result.get("result_summary", {}).get("instances_terminated", 0) > 0: - console.print("[red]✓ Terminated machines removed from MACHINES.json[/red]") - - # Signal display task to stop and wait for completion - logger.debug("Signaling display task to stop") - table_update_event.set() - - # For create action, make sure we keep the display up just long enough - # to let users see the results but not block on full provisioning - if args.action == "create": - # Just wait a short time to ensure users see the final IP table - logger.debug("Keeping display open briefly to show final IP table") - await asyncio.sleep(5.0) - - # Signal display task to stop (normal case) - logger.debug("Ending display task") - - # Wait for display to finish updating with a timeout - try: - logger.debug("Waiting for display task to complete") - - # Short timeout for display task cleanup - display_timeout = 5.0 - await asyncio.wait_for(asyncio.shield(display_task), timeout=display_timeout) - logger.debug("Display task completed") - except asyncio.TimeoutError: - logger.warning(f"Display task did not complete within {display_timeout}s timeout") - # We continue anyway, the task will be cancelled in the finally block - - except Exception as e: - logger.error(f"Error in main execution: {str(e)}", exc_info=True) - # Don't try to use rich console here, as it might be the source of the error - # Error will be printed by our outer exception handler - raise - finally: - # Stop the display task if it's still running - if display_task and not display_task.done(): - display_task.cancel() - - # Remove the rich console handler if it was added - if handler is not None and handler in logger.handlers: - logger.removeHandler(handler) - - except Exception as e: - logger.error(f"Fatal error occurred: {str(e)}", exc_info=True) - console.print(f"\n[bold red]Fatal error:[/bold red] {str(e)}") - raise - - -if __name__ == 
"__main__": - # Store the original terminal settings to ensure we can properly display errors - is_terminal_cleared = False - - # Function to print error outside of rich Live display context - def print_error_message(message): - # Ensure we're writing directly to stdout to avoid stderr - if is_terminal_cleared: - # If terminal was cleared by rich Live display, add newlines for visibility - sys.stdout.write("\n\n") - sys.stdout.write(f"\n[ERROR] {message}\n") - sys.stdout.write("Check debug.log for more details.\n") - sys.stdout.flush() - - # Add a simple info message directly to console for initial startup - # This is only for user feedback before the rich console is ready - sys.stdout.write("Initializing...\n") - sys.stdout.flush() - - try: - # Log to file only, not stdout - logger.info("Starting main execution") - asyncio.run(main()) - logger.info("Main execution completed") - except KeyboardInterrupt: - logger.info("Operation cancelled by user") - sys.stderr = open(os.devnull, 'w') # Suppress any stderr output - print_error_message("Operation cancelled by user.") - sys.exit(1) - except Exception as e: - # Log detailed error - logger.error(f"Fatal error occurred: {str(e)}", exc_info=True) - - # Silence stderr completely - sys.stderr = open(os.devnull, 'w') - - # Print user-friendly error message outside of any rich context - error_msg = f"Fatal error occurred: {str(e)}" - - # Add additional context for common errors - if "TimeoutError" in str(e): - error_msg += "\nThis may be due to AWS credential issues or network problems." - error_msg += "\nTry running 'aws sso login' to refresh your credentials." - elif "ExpiredToken" in str(e) or "InvalidToken" in str(e): - error_msg += "\nAWS credentials have expired. Try running 'aws sso login'." - elif "InstanceId" in str(e) and "does not exist" in str(e): - error_msg += "\nThe specified instance may have been terminated or never created." 
- - print_error_message(error_msg) - sys.exit(1) diff --git a/edge-data-transfer-demo-v2/edge-data-spots/deploy_spot_no_bacalhau.py b/edge-data-transfer-demo-v2/edge-data-spots/deploy_spot_no_bacalhau.py deleted file mode 100644 index f47c27d8..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/deploy_spot_no_bacalhau.py +++ /dev/null @@ -1,2962 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "boto3", -# "botocore", -# "pyyaml", -# "rich", -# ] -# /// - -import argparse -import asyncio -import base64 -import hashlib -import json -import logging -import os -import subprocess -import sys -import time -from concurrent.futures import TimeoutError -from datetime import datetime, timezone - -import boto3 -import botocore -from rich.console import Console -from rich.layout import Layout -from rich.live import Live -from rich.panel import Panel -from rich.progress import ( - BarColumn, - Progress, - SpinnerColumn, - TaskProgressColumn, - TextColumn, - TimeElapsedColumn, -) -from rich.table import Table, box - -from util.config import Config -from util.scripts_provider import ScriptsProvider - -# Set up logging with a unified approach - everything will go to the console panel -# and be written to the debug.log file as a backup - -# Set up logging with a unified stream approach -# All logs will go to both debug.log and the Rich console panel - -# Formatter for logs - concise but informative -log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - -# Set up main logger -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) # Default level, will be updated based on args -# Important: Prevent propagation to root logger to avoid stderr output -logger.propagate = False - -# The file handler will be shared with the Rich console handler -file_handler = None - -# Tag to filter instances by -FILTER_TAG_NAME = "ManagedBy" -FILTER_TAG_VALUE = "SpotInstanceScript" - -# Initialize console with auto-detection of width -console = Console() - -config = Config("config.yaml") -scripts_provider = ScriptsProvider(config) - -AWS_REGIONS = config.get_regions() -TOTAL_INSTANCES = config.get_total_instances() -global_node_count = 0 -INSTANCES_PER_REGION = ( - TOTAL_INSTANCES // len(AWS_REGIONS) -) or TOTAL_INSTANCES # Evenly distribute instances if set to 'auto' in config - -MAX_NODES = ( - config.get_total_instances() -) # Global limit for total nodes across all regions -current_dir = os.path.dirname(__file__) - -SCRIPT_DIR = "instance/scripts" - -# Status tracking -all_statuses = {} # Dictionary to track all instance statuses -status_lock = asyncio.Lock() # Lock for thread-safe updates to all_statuses - -# Event for signaling the table update task to stop -table_update_event = asyncio.Event() - -# Task tracking -task_name = "TASK NAME" -task_total = 10000 -events_to_progress = [] - -# AWS API timeouts -AWS_API_TIMEOUT = 30 # seconds - - -async def update_status(status): - """Thread-safe update of instance status""" - async with status_lock: - all_statuses[status.id] = status - # Add to events queue for progress tracking - events_to_progress.append(status) - - -class InstanceStatus: - def __init__(self, region, zone, index=0, instance_id=None): - input_string = f"{region}-{zone}-{index}" - hashed_string = hashlib.sha256(input_string.encode()).hexdigest() - - self.id = hashed_string[:6] - self.region = region - self.zone = zone - self.status = "Initializing" - self.detailed_status = "Initializing" - self.start_time = 
time.time() - self.elapsed_time = 0 - self.instance_id = instance_id - self.public_ip = None - self.private_ip = None - self.vpc_id = None - self.spot_request_id = None # Track the spot request ID for monitoring - self.fulfilled = False # Track if the spot request was fulfilled - - if self.instance_id is not None: - self.id = self.instance_id - - def update_elapsed_time(self): - self.elapsed_time = time.time() - self.start_time - return self.elapsed_time - - def combined_status(self): - if self.detailed_status and self.detailed_status != self.status: - combined = f"{self.detailed_status}" - if len(combined) > 30: - return combined[:27] + "..." - return combined - return self.status - - -def format_elapsed_time(seconds): - """Format elapsed time in a human-readable format""" - if seconds < 60: - return f"{seconds:.1f}s" - elif seconds < 3600: - minutes = seconds / 60 - return f"{minutes:.1f}m" - else: - hours = seconds / 3600 - return f"{hours:.1f}h" - - -def make_progress_table(): - """Create a table showing instance status with adaptive column widths""" - # Get terminal width - width = console.width - - # Calculate column widths based on available space - id_width = 6 - region_width = min(15, max(10, int(width * 0.10))) - zone_width = min(15, max(10, int(width * 0.10))) - status_width = min(30, max(20, int(width * 0.20))) # Wider status column - elapsed_width = 8 - instance_id_width = min(20, max(10, int(width * 0.12))) - ip_width = min(15, max(10, int(width * 0.08))) - - # Create table with adaptive column widths - table = Table(show_header=True, header_style="bold magenta", expand=False) - - # Add columns with appropriate widths - table.add_column("ID", width=id_width, style="cyan", no_wrap=True) - table.add_column("Region", width=region_width, style="cyan", no_wrap=True) - table.add_column("Zone", width=zone_width, style="cyan", no_wrap=True) - table.add_column("Status", width=status_width, style="yellow", no_wrap=True) - table.add_column( - "Time", width=elapsed_width, justify="right", style="magenta", no_wrap=True - ) - table.add_column("Instance ID", width=instance_id_width, style="blue", no_wrap=True) - table.add_column("Public IP", width=ip_width, style="green", no_wrap=True) - table.add_column("Private IP", width=ip_width, style="blue", no_wrap=True) - - # Update elapsed time for all statuses - for status in all_statuses.values(): - status.update_elapsed_time() - - # Sort statuses for consistent display - sorted_statuses = sorted(all_statuses.values(), key=lambda x: (x.region, x.zone)) - - # Add rows to the table - for status in sorted_statuses: - table.add_row( - status.id, - status.region, - status.zone, - status.combined_status(), - format_elapsed_time(status.elapsed_time), - status.instance_id or "", - status.public_ip or "", - status.private_ip or "", - ) - - return table - - -def create_layout(progress, table): - """Create a responsive layout that adapts to terminal size""" - layout = Layout() - - # Calculate panel heights based on terminal height - height = console.height - progress_height = min(4, max(3, int(height * 0.1))) # 10% for progress - console_height = min(6, max(4, int(height * 0.2))) # 20% for console - - # Create progress panel - progress_panel = Panel( - progress, - title="Progress", - border_style="green", - padding=(1, 1), - ) - - # Create console panel for log messages - console_panel = Panel( - "", # Start with empty content - title="Console Output", - border_style="blue", - padding=(0, 1), - ) - - # Split layout with responsive sizing - layout.split( - 
Layout(progress_panel, size=progress_height), - Layout(table), # This will take the remaining space (about 70%) - Layout(console_panel, size=console_height), - ) - - return layout - - -# Configure console handler to use rich console -class RichConsoleHandler(logging.Handler): - """Unified console handler that shows log messages from debug.log in the Rich UI. - - This handler streams the debug.log content to the console panel in the Rich UI. - It also forwards log records to the file handler, creating a single logging path. - """ - def __init__(self, live, layout, file_handler=None): - super().__init__() - self.live = live - self.layout = layout # Store the layout directly - self.messages = ["Logs will appear here..."] # Start with a simple message - - # Use the same formatter as the file handler for consistency - self.setFormatter(log_formatter) - - # Keep reference to file handler for forwarding - self.file_handler = file_handler - - # Set the level to match the file handler if provided - if file_handler: - self.setLevel(file_handler.level) - else: - self.setLevel(logging.INFO) - - # Initialize the console panel content right away - console_panel = self.layout.children[-1].renderable - console_panel.renderable = "\n".join(self.messages) - - # Read any existing content from debug.log to show history - self._load_existing_logs() - - def _load_existing_logs(self): - """Load the last few lines from debug.log to provide context""" - try: - if os.path.exists("debug.log"): - with open("debug.log", "r") as f: - # Get the last 10 lines from the file - lines = f.readlines()[-10:] - if lines: - # Replace our waiting message with actual log content - self.messages = [line.strip() for line in lines] - - # Update the console panel right away - console_panel = self.layout.children[-1].renderable - console_panel.renderable = "\n".join(self.messages) - except Exception: - # If we can't read the log file, just continue with the default message - pass - - def emit(self, record): - """Process log records and update the console panel""" - try: - # Format the message using our formatter - msg = self.format(record) - - # If we still have the default message, clear it first - if len(self.messages) == 1 and self.messages[0] == "Logs will appear here...": - self.messages = [] - - # Add the new message - self.messages.append(msg) - - # Keep only the last 20 messages (increased from 10 for more context) - if len(self.messages) > 20: - self.messages = self.messages[-20:] - - # Update the console panel content - console_panel = self.layout.children[-1].renderable - console_panel.renderable = "\n".join(self.messages) - - # Forward to file handler if we have one and it's not already handling this record - if self.file_handler and record.levelno >= self.file_handler.level: - self.file_handler.emit(record) - - except Exception: - self.handleError(record) - - -async def update_display(live): - """Update the live display with current status information""" - logger.debug("Entering update_display function") - try: - logger.debug("Creating progress bar") - progress = Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), - TextColumn("[progress.completed]{task.completed} of {task.total}"), - expand=True, - ) - - logger.debug(f"Adding task: {task_name} with total: {task_total}") - task = progress.add_task(task_name, total=task_total) - - # Create initial layout - logger.debug("Creating table") - table = make_progress_table() - 
logger.debug("Creating layout") - layout = create_layout(progress, table) - - # For display updates we don't need to create a new handler - # Just update the existing one with the new layout - rich_handler = None - for h in logger.handlers: - if isinstance(h, RichConsoleHandler): - rich_handler = h - break - - if rich_handler is None: - logger.debug("No existing RichConsoleHandler found - display updates may not work") - else: - # Update the existing handler with the new layout - logger.debug("Updating existing RichConsoleHandler layout") - rich_handler.layout = layout - - logger.debug("Starting update loop") - while not table_update_event.is_set(): - logger.debug("Processing status updates") - async with status_lock: - events_to_progress.clear() - progress.update(task, completed=len(all_statuses), refresh=True) - - logger.debug("Creating table and layout") - table = make_progress_table() - layout = create_layout(progress, table) - - # Find and update the RichConsoleHandler with the new layout - for h in logger.handlers: - if isinstance(h, RichConsoleHandler): - h.layout = layout - break - - logger.debug("Updating live display") - live.update(layout) - - # Slightly longer sleep to reduce log volume - await asyncio.sleep(0.5) - - except Exception as e: - logger.error(f"Error updating display: {str(e)}", exc_info=True) - # Don't re-raise the exception to keep the display running - - -def get_ec2_client(region): - """Get EC2 client with proper configuration for the specified region""" - logger.debug(f"Creating EC2 client for region {region}") - try: - # Create a boto3 client with explicit timeout configuration - logger.debug(f"Configuring boto3 client with timeout={AWS_API_TIMEOUT}") - config = botocore.config.Config( - connect_timeout=AWS_API_TIMEOUT, - read_timeout=AWS_API_TIMEOUT, - retries={"max_attempts": 3, "mode": "standard"}, - ) - logger.debug("Creating boto3 client") - client = boto3.client("ec2", region_name=region, config=config) - logger.debug("Successfully created EC2 client") - return client - except Exception as e: - logger.error( - f"Error creating EC2 client for region {region}: {str(e)}", exc_info=True - ) - raise - - -async def safe_aws_call(func, *args, **kwargs): - """Execute AWS API calls with proper timeout handling""" - try: - # Set a timeout for the AWS API call - return await asyncio.wait_for( - asyncio.to_thread(func, *args, **kwargs), timeout=AWS_API_TIMEOUT - ) - except asyncio.TimeoutError: - error_msg = ( - f"AWS API call timed out after {AWS_API_TIMEOUT} seconds: {func.__name__}" - ) - logging.error(error_msg) - if "describe_instances" in func.__name__: - logging.error( - "This may be due to SSO credential issues. Please check your AWS credentials." - ) - logging.error("Try running 'aws sso login' to refresh your credentials.") - raise TimeoutError(error_msg) - except botocore.exceptions.ClientError as e: - if "ExpiredToken" in str(e) or "InvalidToken" in str(e): - logging.error( - "AWS credentials have expired. Please refresh your credentials." 
- ) - logging.error("Try running 'aws sso login' to refresh your credentials.") - raise - except Exception as e: - logging.error(f"Error in AWS API call {func.__name__}: {str(e)}") - raise - - -async def get_availability_zones(ec2): - response = await safe_aws_call( - ec2.describe_availability_zones, - Filters=[{"Name": "opt-in-status", "Values": ["opt-in-not-required"]}], - ) - return [zone["ZoneName"] for zone in response["AvailabilityZones"]][ - :1 - ] # Get 1 AZ per region - - -async def create_spot_instances_in_region(config: Config, instances_to_create, region): - global all_statuses, events_to_progress - - ec2 = get_ec2_client(region) - region_cfg = config.get_region_config(region) - - try: - user_data = scripts_provider.create_cloud_init_script() - if not user_data: - logging.error("User data is empty. Stopping creation.") - return [], {} - - encoded_user_data = base64.b64encode(user_data.encode()).decode() - - vpc_id = await create_vpc_if_not_exists(ec2) - igw_id = await create_internet_gateway(ec2, vpc_id) - route_table_id = await create_route_table(ec2, vpc_id, igw_id) - security_group_id = await create_security_group_if_not_exists(ec2, vpc_id) - - instance_ids = [] - zones = await get_availability_zones(ec2) - for i in range(instances_to_create): - zone = zones[i % len(zones)] # Distribute instances across available zones - - subnet_id = await create_subnet(ec2, vpc_id, zone, f"10.0.{i}.0/24") - try: - await associate_route_table(ec2, route_table_id, subnet_id) - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "Resource.AlreadyAssociated": - logging.info( - f"Route table already associated in {region}-{zone}: {str(e)}" - ) - else: - logging.warning( - f"Error associating route table in {region}-{zone}: {str(e)}" - ) - - thisInstanceStatusObject = InstanceStatus(region, zone, i) - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - start_time = time.time() - launch_specification = { - "ImageId": config.get_image_for_region(region), - "InstanceType": region_cfg.get("machine_type", "t2.medium"), - "UserData": encoded_user_data, - "BlockDeviceMappings": [ - { - "DeviceName": "/dev/sda1", - "Ebs": {"DeleteOnTermination": True}, - } - ], - "NetworkInterfaces": [ - { - "DeviceIndex": 0, - "AssociatePublicIpAddress": True, - "DeleteOnTermination": True, - "SubnetId": subnet_id, - "Groups": [security_group_id], - } - ], - } - - thisInstanceStatusObject.status = "Requesting" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - logging.debug(f"Requesting spot instance in {region}-{zone}") - response = await asyncio.to_thread( - ec2.request_spot_instances, - InstanceCount=1, # Create a single instance per request - Type="one-time", - InstanceInterruptionBehavior="terminate", - LaunchSpecification=launch_specification, - TagSpecifications=[ - { - "ResourceType": "spot-instances-request", - "Tags": [ - {"Key": "Name", "Value": f"SpotInstance-{region}-{zone}"}, - {"Key": FILTER_TAG_NAME, "Value": FILTER_TAG_VALUE}, - ], - }, - ], - ) - - spot_request_ids = [ - request["SpotInstanceRequestId"] - for request in response["SpotInstanceRequests"] - ] - logging.debug(f"Spot request IDs: {spot_request_ids}") - - # Store the spot request ID in the status object for tracking - if spot_request_ids: - thisInstanceStatusObject.spot_request_id = spot_request_ids[0] - - thisInstanceStatusObject.status = "Waiting for 
fulfillment" - - # Wait for spot instances to be fulfilled - waiter = ec2.get_waiter("spot_instance_request_fulfilled") - max_wait_time = 600 # 10 minutes timeout - start_wait_time = time.time() - - # Update instance status - thisInstanceStatusObject.status = "Waiting for fulfillment" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - # Setup polling for spot request status with timeout - async def poll_spot_request_status(): - timeout_reached = False - while not timeout_reached: - # Check if timeout reached - if time.time() - start_wait_time > max_wait_time: - logging.error(f"Timeout waiting for spot instance in {region}-{zone}") - return None - - # Check spot request status - try: - describe_response = await asyncio.to_thread( - ec2.describe_spot_instance_requests, - SpotInstanceRequestIds=spot_request_ids, - ) - - for request in describe_response["SpotInstanceRequests"]: - status_code = request["Status"]["Code"] - status_message = request["Status"].get("Message", "No message") - - # Update status object with details - thisInstanceStatusObject.detailed_status = f"{status_code}: {status_message}" - thisInstanceStatusObject.elapsed_time = time.time() - start_time - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - logging.debug(f"Status in {region}-{zone}: {status_code} - {status_message}") - - # Check for failures - if status_code in ["price-too-low", "capacity-not-available"]: - logging.error(f"Spot request failed: {status_code} - {status_message}") - return None - - # Check for success - instance ID is present - if "InstanceId" in request: - return describe_response - - except Exception as e: - logging.error(f"Error checking spot request status: {str(e)}") - - # Sleep before next poll - await asyncio.sleep(5) - - return None - - # Try to use waiter first (faster) with timeout protection - waiter_task = asyncio.create_task( - asyncio.wait_for( - asyncio.to_thread( - waiter.wait, - SpotInstanceRequestIds=spot_request_ids, - WaiterConfig={"MaxAttempts": 40, "Delay": 15}, # 40 attempts * 15 sec = 10 min max - ), - timeout=max_wait_time - ) - ) - - # Start the polling task as a backup - polling_task = asyncio.create_task(poll_spot_request_status()) - - # Wait for either task to complete - done, pending = await asyncio.wait( - [waiter_task, polling_task], - return_when=asyncio.FIRST_COMPLETED - ) - - # Cancel the pending task - for task in pending: - task.cancel() - - # Get results - describe_response = None - waiter_succeeded = False - - for task in done: - try: - if task == waiter_task: - await task # Just to get any exceptions - waiter_succeeded = True - logging.debug(f"Waiter succeeded for {region}-{zone}") - elif task == polling_task: - describe_response = await task - - except (asyncio.TimeoutError, asyncio.CancelledError): - pass - except Exception as e: - logging.error(f"Error in spot instance fulfillment: {str(e)}") - - # If waiter succeeded but we don't have response, get it now - if waiter_succeeded and not describe_response: - try: - describe_response = await asyncio.to_thread( - ec2.describe_spot_instance_requests, - SpotInstanceRequestIds=spot_request_ids, - ) - except Exception as e: - logging.error(f"Error getting spot request details: {str(e)}") - describe_response = None - - # Check if we got a valid response - if describe_response is None: - thisInstanceStatusObject.status = "Failed to request spot instance" - 
thisInstanceStatusObject.detailed_status = "Timeout or API error" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - continue # Skip to next instance - - # Get instance IDs - zone_instance_ids = [ - request["InstanceId"] - for request in describe_response.get("SpotInstanceRequests", []) - if "InstanceId" in request - ] - - if not zone_instance_ids: - thisInstanceStatusObject.status = "Failed to request spot instance" - thisInstanceStatusObject.detailed_status = "No instance ID returned" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - continue # Skip to next instance - - # Add to our overall list of instance IDs - instance_ids.extend(zone_instance_ids) - - # Process the first instance ID (we request only one per spot request) - thisInstanceStatusObject.instance_id = zone_instance_ids[0] - thisInstanceStatusObject.status = "Tagging" - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - try: - # Run tagging and instance details fetching in parallel - tagging_task = asyncio.create_task( - asyncio.to_thread( - ec2.create_tags, - Resources=zone_instance_ids, - Tags=[ - {"Key": FILTER_TAG_NAME, "Value": FILTER_TAG_VALUE}, - {"Key": "Name", "Value": f"SpotInstance-{region}-{zone}"}, - {"Key": "AZ", "Value": zone}, - ], - ) - ) - - fetching_task = asyncio.create_task( - asyncio.to_thread( - ec2.describe_instances, - InstanceIds=[thisInstanceStatusObject.instance_id], - ) - ) - - # Wait for both tasks to complete with timeout - done, pending = await asyncio.wait( - [tagging_task, fetching_task], - timeout=30 - ) - - # Cancel any pending tasks that didn't complete - for task in pending: - task.cancel() - - # Process the results - instance_details = None - tagging_completed = False - - for task in done: - try: - if task == tagging_task: - await task - tagging_completed = True - elif task == fetching_task: - instance_details = await task - except Exception as e: - logging.error(f"Error in instance initialization: {str(e)}") - - # Extract IP addresses if we got instance details - if instance_details and instance_details.get("Reservations"): - instance = instance_details["Reservations"][0]["Instances"][0] - thisInstanceStatusObject.public_ip = instance.get("PublicIpAddress", "") - thisInstanceStatusObject.private_ip = instance.get("PrivateIpAddress", "") - - # Update final status - if tagging_completed: - thisInstanceStatusObject.status = "Done" - else: - thisInstanceStatusObject.status = "Tagged with warnings" - thisInstanceStatusObject.detailed_status = "Tagging may not have completed" - - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - except Exception as e: - logging.error(f"Error processing instance {thisInstanceStatusObject.instance_id}: {str(e)}") - thisInstanceStatusObject.status = "Error processing instance" - thisInstanceStatusObject.detailed_status = str(e)[:30] - all_statuses[thisInstanceStatusObject.id] = thisInstanceStatusObject - events_to_progress.append(thisInstanceStatusObject) - - except Exception as e: - logging.error(f"An error occurred in {region}: {str(e)}", exc_info=True) - return [], {} - - return instance_ids - - -async def create_vpc_if_not_exists(ec2): - vpcs = await asyncio.to_thread( - ec2.describe_vpcs, Filters=[{"Name": "tag:Name", "Values": ["SpotInstanceVPC"]}] - ) - 
if vpcs["Vpcs"]: - return vpcs["Vpcs"][0]["VpcId"] - else: - vpc = await asyncio.to_thread(ec2.create_vpc, CidrBlock="10.0.0.0/16") - vpc_id = vpc["Vpc"]["VpcId"] - await asyncio.to_thread( - ec2.create_tags, - Resources=[vpc_id], - Tags=[{"Key": "Name", "Value": "SpotInstanceVPC"}], - ) - await asyncio.to_thread( - ec2.modify_vpc_attribute, VpcId=vpc_id, EnableDnsHostnames={"Value": True} - ) - await asyncio.to_thread( - ec2.modify_vpc_attribute, VpcId=vpc_id, EnableDnsSupport={"Value": True} - ) - return vpc_id - - -async def create_subnet(ec2, vpc_id, zone, cidr_block=None): - # First, check if a subnet already exists in this zone - existing_subnets = await asyncio.to_thread( - ec2.describe_subnets, - Filters=[ - {"Name": "vpc-id", "Values": [vpc_id]}, - {"Name": "availability-zone", "Values": [zone]}, - ], - ) - - if existing_subnets["Subnets"]: - # If a subnet exists, return its ID - return existing_subnets["Subnets"][0]["SubnetId"] - - # If no subnet exists, try to create one - cidr_base_prefix = "10.0." - cidr_base_suffix = ".0/24" - for i in range(256): - try: - cidrBlock = ( - cidr_block - if cidr_block - else cidr_base_prefix + str(i) + cidr_base_suffix - ) - logging.debug(f"Creating subnet in {zone} with CIDR block {cidrBlock}") - subnet = await asyncio.to_thread( - ec2.create_subnet, - VpcId=vpc_id, - CidrBlock=cidrBlock, - AvailabilityZone=zone, - ) - return subnet["Subnet"]["SubnetId"] - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "InvalidSubnet.Conflict": - # If this CIDR is in use, try the next one - continue - else: - # If it's a different error, raise it - raise - - # If we've tried all possible CIDRs and none worked, raise an error - raise Exception(f"Unable to create subnet in {zone}. All CIDR blocks are in use.") - - -async def create_internet_gateway(ec2, vpc_id): - # First, check if the VPC already has an Internet Gateway attached - igws = await asyncio.to_thread( - ec2.describe_internet_gateways, - Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}], - ) - - if igws["InternetGateways"]: - # If an Internet Gateway is already attached, return its ID - return igws["InternetGateways"][0]["InternetGatewayId"] - - # If no Internet Gateway is attached, create and attach a new one - igw = await asyncio.to_thread(ec2.create_internet_gateway) - igw_id = igw["InternetGateway"]["InternetGatewayId"] - - try: - await asyncio.to_thread( - ec2.attach_internet_gateway, InternetGatewayId=igw_id, VpcId=vpc_id - ) - except botocore.exceptions.ClientError: - # If an error occurs during attachment, delete the created IGW - await asyncio.to_thread(ec2.delete_internet_gateway, InternetGatewayId=igw_id) - # Re-check for existing IGW in case one was attached concurrently - igws = await asyncio.to_thread( - ec2.describe_internet_gateways, - Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}], - ) - if igws["InternetGateways"]: - return igws["InternetGateways"][0]["InternetGatewayId"] - else: - # If still no IGW found, re-raise the original error - raise - - return igw_id - - -async def create_route_table(ec2, vpc_id, igw_id): - # Check if a route table already exists for the VPC - route_tables = await asyncio.to_thread( - ec2.describe_route_tables, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - for rt in route_tables["RouteTables"]: - for association in rt.get("Associations", []): - if association.get("Main", False): - # Found the main route table, add a route to the IGW if it doesn't exist - route_table_id = rt["RouteTableId"] - 
routes = rt.get("Routes", []) - if not any(route.get("GatewayId") == igw_id for route in routes): - await asyncio.to_thread( - ec2.create_route, - RouteTableId=route_table_id, - DestinationCidrBlock="0.0.0.0/0", - GatewayId=igw_id, - ) - return route_table_id - - # If no route table exists, create a new one - route_table = await asyncio.to_thread(ec2.create_route_table, VpcId=vpc_id) - route_table_id = route_table["RouteTable"]["RouteTableId"] - - # Create a route to the Internet Gateway - await asyncio.to_thread( - ec2.create_route, - RouteTableId=route_table_id, - DestinationCidrBlock="0.0.0.0/0", - GatewayId=igw_id, - ) - - # Associate the route table with the VPC (make it the main route table) - await asyncio.to_thread( - ec2.associate_route_table, - RouteTableId=route_table_id, - VpcId=vpc_id, - ) - - return route_table_id - - -async def associate_route_table(ec2, route_table_id, subnet_id): - try: - await asyncio.to_thread( - ec2.associate_route_table, RouteTableId=route_table_id, SubnetId=subnet_id - ) - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "Resource.AlreadyAssociated": - logging.debug( - f"Route table already associated in {route_table_id}-{subnet_id}: {str(e)}" - ) - else: - raise - - -async def create_security_group_if_not_exists(ec2, vpc_id): - security_groups = await asyncio.to_thread( - ec2.describe_security_groups, - Filters=[ - {"Name": "group-name", "Values": ["SpotInstanceSG"]}, - {"Name": "vpc-id", "Values": [vpc_id]}, - ], - ) - if security_groups["SecurityGroups"]: - return security_groups["SecurityGroups"][0]["GroupId"] - else: - security_group = await asyncio.to_thread( - ec2.create_security_group, - GroupName="SpotInstanceSG", - Description="Security group for Spot Instances", - VpcId=vpc_id, - ) - security_group_id = security_group["GroupId"] - await asyncio.to_thread( - ec2.authorize_security_group_ingress, - GroupId=security_group_id, - IpPermissions=[ - { - "IpProtocol": "tcp", - "FromPort": 22, - "ToPort": 22, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 1234, - "ToPort": 1234, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 1235, - "ToPort": 1235, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - { - "IpProtocol": "tcp", - "FromPort": 9123, - "ToPort": 9123, - "IpRanges": [{"CidrIp": "0.0.0.0/0"}], - }, - ], - ) - return security_group_id - - -async def create_spot_instances(): - """Create spot instances across all configured regions. - - This is the main function for instance creation that: - 1. Distributes instances across regions based on configuration - 2. Creates the instances in parallel - 3. Waits for all instances to get their public IPs - 4. Displays final node information and continues - - The function doesn't wait for SSH or Bacalhau services to be available. - It only ensures machines have IP addresses assigned. - - Returns: - bool: True if all instances were successfully created with IPs, False otherwise - """ - global task_name, task_total - task_name = "Creating Spot Instances" - task_total = MAX_NODES - - logger.info(f"Starting spot instance creation - target: {MAX_NODES} instances") - - async def create_in_region(region): - global global_node_count - available_slots = MAX_NODES - global_node_count - region_cfg = config.get_region_config(region) - - if available_slots <= 0: - logger.warning(f"Reached maximum nodes. 
Skipping region: {region}") - return [], {} - - instances_to_create = ( - min(INSTANCES_PER_REGION, available_slots) - if region_cfg.get("node_count") == "auto" - else (min(region_cfg.get("node_count"), available_slots)) - ) - - if instances_to_create == 0: - logger.info(f"No instances to create in region {region}") - return [], {} - - logger.info(f"Creating {instances_to_create} spot instances in region: {region}") - global_node_count += instances_to_create - instance_ids = await create_spot_instances_in_region( - config, instances_to_create, region - ) - - # Log success or failure - if instance_ids: - logger.info(f"Successfully created {len(instance_ids)} instances in {region}") - else: - logger.warning(f"Failed to create any instances in {region}") - - return instance_ids - - # Process regions in batches to start machine creation sooner - # Choose a batch size that gives good parallelism without overwhelming the system - batch_size = 10 # Process 10 regions at a time - total_created = 0 - logger.info(f"Creating instances in batches of {batch_size} regions") - - # Group regions into batches - region_batches = [AWS_REGIONS[i:i+batch_size] for i in range(0, len(AWS_REGIONS), batch_size)] - - for batch_num, region_batch in enumerate(region_batches, 1): - logger.info(f"Processing batch {batch_num}/{len(region_batches)} with {len(region_batch)} regions") - - # Create instances in this batch of regions in parallel - create_tasks = [create_in_region(region) for region in region_batch] - batch_results = await asyncio.gather(*create_tasks) - - # Count created instances in this batch - batch_created = sum(len(ids) for ids in batch_results if ids) - total_created += batch_created - logger.info(f"Batch {batch_num} created {batch_created} instances") - - # Wait for public IPs for instances in this batch only - # We'll do this processing in a background task so we can continue - if batch_created > 0: - # Start getting public IPs for this batch in the background - # We don't await this - just let it run - asyncio.create_task(wait_for_batch_public_ips()) - - logger.info(f"All batches processed, created {total_created} instances across all regions") - - # Don't continue if no instances were created - if total_created == 0: - logger.warning("No instances were created - skipping IP address waiting") - return False - - # Wait for any remaining IP address assignments to complete - logger.info("Ensuring all instances have received public IP addresses...") - all_ips_received = await wait_for_public_ips() - - if all_ips_received: - logger.info("All instances have been successfully created with public IPs") - - # Display final node information in a table - but don't wait for provisioning - print_node_table() - else: - logger.warning("Some instances did not receive public IPs within the timeout") - - return all_ips_received - - -def print_node_table(): - """Display a table of all nodes showing hostname, region, zone, and IP addresses. - - This presents a clean summary of all nodes that were created during the operation, - making it easy for users to see what resources are available. - - This is a synchronous function to ensure it works outside of an async context. 
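
The creation path above caps each region at the remaining global slots and then requests regions in batches of ten with `asyncio.gather`, so early batches can start receiving IPs while later ones are still in flight. Below is a minimal, self-contained sketch of that batching pattern; the stubbed `create_in_one_region` coroutine and the constant values are placeholders, not the script's real provisioning logic.

```python
import asyncio

MAX_NODES = 5             # global cap, mirrors the config value (illustrative)
INSTANCES_PER_REGION = 1  # per-region share used when node_count is "auto" (illustrative)
BATCH_SIZE = 10           # regions requested concurrently per batch

async def create_in_one_region(region: str, count: int) -> list[str]:
    """Placeholder for the real per-region spot request call."""
    await asyncio.sleep(0.1)  # stand-in for the AWS round trip
    return [f"i-{region}-{n}" for n in range(count)]

async def create_everywhere(regions: list[str]) -> list[str]:
    created: list[str] = []
    remaining = MAX_NODES
    # Process regions in batches so earlier batches can start getting IPs
    # while later batches are still being requested.
    batches = [regions[i:i + BATCH_SIZE] for i in range(0, len(regions), BATCH_SIZE)]
    for batch in batches:
        tasks = []
        for region in batch:
            if remaining <= 0:
                break
            count = min(INSTANCES_PER_REGION, remaining)
            remaining -= count
            tasks.append(create_in_one_region(region, count))
        for ids in await asyncio.gather(*tasks):
            created.extend(ids)
    return created

if __name__ == "__main__":
    print(asyncio.run(create_everywhere(["us-west-1", "us-east-1", "eu-west-1"])))
```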
- """ - # Get sorted list of statuses for consistent display - sorted_statuses = sorted(all_statuses.values(), key=lambda x: (x.region, x.zone)) - - # Only include instances that have a public IP (successfully created) - nodes_with_ip = [s for s in sorted_statuses if s.public_ip] - - # Count pending spot requests that didn't get fulfilled - pending_spot_requests = [s for s in sorted_statuses if s.spot_request_id and not s.instance_id] - - # First create and show the successful nodes table - if nodes_with_ip: - # Create a new table specifically for the final display - table = Table(title="Bacalhau Cluster Nodes", box=box.ROUNDED, show_header=True, header_style="bold cyan") - - # Add columns with appropriate alignment and style - table.add_column("Node #", style="dim", justify="right") - table.add_column("Hostname", style="cyan") - table.add_column("Region", style="green") - table.add_column("Zone", style="blue") - table.add_column("Public IP", style="yellow") - table.add_column("Private IP", style="dim cyan") - - # Add rows for each node - for i, status in enumerate(nodes_with_ip, 1): - # Generate a hostname from region and zone - hostname = f"bacalhau-{status.region}-{status.zone.split('-')[-1]}" - - table.add_row( - str(i), - hostname, - status.region, - status.zone, - status.public_ip or "N/A", - status.private_ip or "N/A" - ) - - # Log first for debug - logger.info(f"Displaying final table with {len(nodes_with_ip)} nodes") - - # Display the table outside of the Live context - console.print() # Add some space - console.print(table) - console.print() # Add some space after - else: - logger.warning("No nodes with IP addresses to display") - console.print("[bold yellow]No nodes received IP addresses![/bold yellow]") - console.print() - - # Show a summary of successful vs. pending spot requests - console.print(f"[bold]Spot Instance Summary:[/bold]") - console.print(f"- Successfully provisioned: [green]{len(nodes_with_ip)}[/green] nodes") - console.print(f"- Pending spot requests: [yellow]{len(pending_spot_requests)}[/yellow]") - console.print(f"- Total spot requests: [blue]{len(sorted_statuses)}[/blue]") - console.print() - - # Also print a helpful message about how to connect to nodes with proper key authentication - if nodes_with_ip: - console.print("[bold green]✓[/bold green] Your Bacalhau cluster is being provisioned!") - console.print("[yellow]Machines have IP addresses but may need a few minutes to complete setup[/yellow]") - - # Get the username and private key path from config - username = config.get_username() - private_key_path = config.get_private_ssh_key_path() - - # Create the SSH command with key file if available - if private_key_path: - ssh_cmd = f"ssh -i {private_key_path} {username}@" - else: - ssh_cmd = f"ssh {username}@" - - console.print(f"[dim]To connect to any node: {ssh_cmd}[/dim]") - else: - console.print("[bold red]⚠ No instances were successfully provisioned with IP addresses.[/bold red]") - console.print("[yellow]This could be due to spot capacity issues in the selected regions.[/yellow]") - console.print("[yellow]Consider trying again, selecting different instance types, or using different regions.[/yellow]") - - console.print() - -async def wait_for_provisioning(): - """Wait for all instances to complete their provisioning process. - - This function checks SSH connectivity and whether the Bacalhau services - are running on each instance. It updates the statuses throughout the - provisioning process. 
- - Returns: - bool: True when all instances are fully provisioned - """ - global all_statuses - max_timeout = 600 # 10 minutes timeout - start_time = time.time() - poll_interval = 15 # seconds between polls - - logger.info(f"Monitoring provisioning status for all instances (timeout: {max_timeout}s)") - - # Count instances we're monitoring - instances_to_monitor = [s for s in all_statuses.values() if s.instance_id and s.public_ip] - - if not instances_to_monitor: - logger.warning("No instances to monitor for provisioning") - return False - - logger.info(f"Monitoring provisioning for {len(instances_to_monitor)} instances") - - # Initialize provisioning statuses - for status in instances_to_monitor: - status.detailed_status = "Waiting for provisioning" - # Make sure to signal for UI update - events_to_progress.append(status) - - # Track completion - while True: - # Check timeout - elapsed_time = time.time() - start_time - if elapsed_time > max_timeout: - logger.warning(f"Timeout reached after {max_timeout}s waiting for provisioning") - # Update statuses for those that didn't complete - for status in instances_to_monitor: - if status.detailed_status != "Provisioning complete": - status.detailed_status = "Provisioning timeout" - events_to_progress.append(status) - return False - - # Check all instances in parallel - async def check_instance(status): - try: - # Skip already completed instances - if status.detailed_status == "Provisioning complete": - return True - - # Update status to show we're checking - status.detailed_status = f"Checking provisioning ({int(elapsed_time)}s)" - events_to_progress.append(status) - - # Check SSH connectivity first - if not await check_ssh_connectivity(status.public_ip): - status.detailed_status = "Waiting for SSH access" - events_to_progress.append(status) - return False - - # Then check if Docker is running - if not await check_docker_running(status.public_ip): - status.detailed_status = "Waiting for Docker" - events_to_progress.append(status) - return False - - # Finally check if Bacalhau service is running - if not await check_bacalhau_service(status.public_ip): - status.detailed_status = "Waiting for Bacalhau" - events_to_progress.append(status) - return False - - # All checks passed, provisioning is complete - status.detailed_status = "Provisioning complete" - events_to_progress.append(status) - return True - - except Exception as e: - logger.error(f"Error checking instance {status.instance_id}: {str(e)}") - status.detailed_status = f"Check error: {str(e)[:20]}" - events_to_progress.append(status) - return False - - # Check all instances in parallel - check_tasks = [check_instance(status) for status in instances_to_monitor] - results = await asyncio.gather(*check_tasks) - - # Count how many are complete - complete_count = sum(1 for r in results if r) - logger.info(f"Provisioning progress: {complete_count}/{len(instances_to_monitor)} instances ready") - - # Check if all are complete - if all(results): - logger.info("All instances have completed provisioning") - - # Keep the display up for a few more seconds to show the final status - logger.info("Keeping display open for 5 more seconds to show provisioning complete") - await asyncio.sleep(5) - - return True - - # Wait before next check - await asyncio.sleep(poll_interval) - -async def check_ssh_connectivity(ip_address): - """Check if an instance is accessible via SSH. 
- - Args: - ip_address: The public IP address of the instance - - Returns: - bool: True if SSH connection succeeds, False otherwise - """ - try: - # Use socket connection to check if port 22 is open - reader, writer = await asyncio.wait_for( - asyncio.open_connection(ip_address, 22), - timeout=5.0 - ) - - # Close the connection - writer.close() - await writer.wait_closed() - - return True - except Exception: - return False - -async def check_docker_running(ip_address): - """Check if Docker is running on the instance. - - Args: - ip_address: The public IP address of the instance - - Returns: - bool: True if docker appears to be running, False otherwise - """ - # For now, we'll just check SSH since we can't easily run commands remotely - # In a production version, this would use SSH to execute 'docker ps' - return await check_ssh_connectivity(ip_address) - -async def check_bacalhau_service(ip_address): - """Check if the Bacalhau service is running on the instance. - - Args: - ip_address: The public IP address of the instance - - Returns: - bool: True if Bacalhau service appears to be running, False otherwise - """ - try: - # Try to connect to the bacalhau healthcheck port (assuming it's 1234) - reader, writer = await asyncio.wait_for( - asyncio.open_connection(ip_address, 1234), - timeout=5.0 - ) - - # Close the connection - writer.close() - await writer.wait_closed() - - return True - except Exception: - return False - -async def wait_for_batch_public_ips(): - """Wait for public IPs for instances in the most recent batch. - - This is a non-blocking function that can be called as a background task. - It identifies instances without IPs that were created in recent batches - and polls for their IP addresses. - - This allows us to start getting IPs while other machines are still creating. 
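
The provisioning checks above boil down to "is a TCP port reachable": port 22 stands in for SSH (and, for now, Docker), and port 1234 for the Bacalhau health endpoint. A self-contained sketch of that probe follows; the host address is an illustrative placeholder, since the real script takes IPs from its instance status objects.

```python
import asyncio

async def port_open(host: str, port: int, timeout: float = 5.0) -> bool:
    """Return True if a TCP connection to host:port succeeds within the timeout."""
    try:
        reader, writer = await asyncio.wait_for(
            asyncio.open_connection(host, port), timeout=timeout
        )
        writer.close()
        await writer.wait_closed()
        return True
    except Exception:
        return False

async def node_looks_provisioned(ip: str) -> bool:
    # Same heuristic as the checks above: SSH reachable and Bacalhau port answering.
    return await port_open(ip, 22) and await port_open(ip, 1234)

if __name__ == "__main__":
    print(asyncio.run(node_looks_provisioned("203.0.113.10")))  # placeholder IP
```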
- """ - # Find instances without public IPs among the most recently created ones - # These will be instances that have an instance_id but no public_ip - pending_instances = [status for status in all_statuses.values() - if status.instance_id and not status.public_ip] - - if not pending_instances: - logger.debug("No pending instances waiting for IPs in this batch") - return - - logger.info(f"Background task: Getting public IPs for {len(pending_instances)} new instances") - - # Group instances by region for efficient API calls - instances_by_region = {} - for status in pending_instances: - if status.region not in instances_by_region: - instances_by_region[status.region] = [] - instances_by_region[status.region].append(status) - - # Set a reasonable timeout for this specific batch (shorter than the main wait) - timeout = 120 # 2 minutes timeout per batch - start_time = time.time() - poll_interval = 5 # seconds between polls - - # Poll for public IPs - while time.time() - start_time < timeout: - # Count how many still need IPs - still_pending = sum(1 for status in pending_instances if not status.public_ip) - - if still_pending == 0: - logger.info(f"Background task: All {len(pending_instances)} instances in batch received IPs") - return - - logger.debug(f"Background task: Still waiting for {still_pending} instances to get public IPs") - - # Update the IPs in parallel per region - async def update_region_ips(region, statuses): - # Skip if no instances still need IPs in this region - if all(status.public_ip for status in statuses): - return 0 - - try: - # Get EC2 client for this region - ec2 = get_ec2_client(region) - - # Get instance IDs that still need IPs - instance_ids = [status.instance_id for status in statuses if not status.public_ip] - - # Skip if no instances - if not instance_ids: - return 0 - - # Query AWS API for current instance information - response = await asyncio.to_thread( - ec2.describe_instances, - InstanceIds=instance_ids - ) - - # Process results and update statuses - updated_count = 0 - for reservation in response.get("Reservations", []): - for instance in reservation.get("Instances", []): - instance_id = instance["InstanceId"] - public_ip = instance.get("PublicIpAddress", "") - private_ip = instance.get("PrivateIpAddress", "") - - # Find the matching status - for status in statuses: - if status.instance_id == instance_id: - if public_ip and not status.public_ip: - status.public_ip = public_ip - status.detailed_status = "Public IP assigned" - updated_count += 1 - if private_ip: - status.private_ip = private_ip - # Signal for UI update - events_to_progress.append(status) - - return updated_count - - except Exception as e: - logger.error(f"Error updating IPs for region {region}: {str(e)}") - return 0 - - # Create tasks for each region - tasks = [update_region_ips(region, statuses) - for region, statuses in instances_by_region.items()] - - # Run all tasks in parallel - results = await asyncio.gather(*tasks) - - # Sum up the total updated - updated_count = sum(results) - if updated_count > 0: - logger.info(f"Background task: Received {updated_count} new public IPs") - - # Save the updates to MACHINES.json - save_machines_to_json(operation="update") - - # Wait before next poll - await asyncio.sleep(poll_interval) - - # If we get here, we hit the timeout - logger.warning(f"Background task: Timeout waiting for IPs after {timeout}s") - -async def wait_for_public_ips(): - """Wait for all instances to get their public IP addresses. 
- - This function monitors the instance statuses and waits until all have IP addresses - or until a timeout is reached. It updates the progress display throughout. - - Returns: - bool: True if all instances got IPs, False if any timed out - """ - global all_statuses - timeout = 300 # 5 minutes timeout - start_time = time.time() - poll_interval = 5 # seconds between polls - - logger.info(f"Waiting for public IP addresses (timeout: {timeout}s)") - - # Count all instances we're waiting for - both spot requests and instances without IPs - pending_spot_requests = sum(1 for status in all_statuses.values() - if status.spot_request_id and not status.instance_id) - pending_ips = sum(1 for status in all_statuses.values() - if status.instance_id and not status.public_ip) - - total_pending = pending_spot_requests + pending_ips - logger.info(f"Waiting for {total_pending} instances to complete ({pending_spot_requests} spot requests still pending, {pending_ips} awaiting IPs)") - - # Group instances by region for parallel processing - def get_instances_by_region(): - instances_by_region = {} - spot_requests_by_region = {} - - # First, organize by region - for status in all_statuses.values(): - region = status.region - if not region: - continue - - # Handle instances waiting for IP addresses - if status.instance_id and not status.public_ip: - if region not in instances_by_region: - instances_by_region[region] = [] - instances_by_region[region].append(status) - - # Handle spot requests waiting for fulfillment - elif status.spot_request_id and not status.instance_id: - if region not in spot_requests_by_region: - spot_requests_by_region[region] = [] - spot_requests_by_region[region].append(status) - - # Combine both mappings for return - combined_by_region = {} - all_regions = set(instances_by_region.keys()) | set(spot_requests_by_region.keys()) - - for region in all_regions: - combined_by_region[region] = { - "instances": instances_by_region.get(region, []), - "spot_requests": spot_requests_by_region.get(region, []) - } - - return combined_by_region - - # Track completion status - all_ips_received = False - - while True: - # Count pending spot requests and instances waiting for IPs - pending_spot_requests = sum(1 for status in all_statuses.values() - if status.spot_request_id and not status.instance_id) - pending_ips = sum(1 for status in all_statuses.values() - if status.instance_id and not status.public_ip) - - total_pending = pending_spot_requests + pending_ips - - # Check if we're done with both spot requests and IP assignment - all_complete = total_pending == 0 - - # Check for timeout - time_elapsed = time.time() - start_time - timed_out = time_elapsed > timeout - - # Exit conditions - if all_complete: - provisioned_count = sum(1 for status in all_statuses.values() if status.public_ip) - logger.info(f"All instances processed - {provisioned_count} successfully provisioned with public IPs") - all_ips_received = True - break - - if timed_out: - # Update status for all pending instances - for status in all_statuses.values(): - if status.spot_request_id and not status.instance_id: - status.detailed_status = "Spot request not fulfilled after timeout" - events_to_progress.append(status) - elif status.instance_id and not status.public_ip: - status.detailed_status = "No public IP after timeout" - events_to_progress.append(status) - - provisioned_count = sum(1 for status in all_statuses.values() if status.public_ip) - logger.warning(f"Timed out after {timeout}s - {provisioned_count} instances provisioned, 
{pending_spot_requests} spot requests pending, {pending_ips} instances missing IPs") - break - - # Get instances grouped by region - instances_by_region = get_instances_by_region() - if not instances_by_region: - # No instances need IPs, we're done - logger.info("No instances waiting for IPs") - all_ips_received = True - break - - # Log progress - pending_count = sum(len(ids) for ids in instances_by_region.values()) - logger.info(f"Still waiting for {pending_count} instances to get public IPs ({int(time_elapsed)}s elapsed)") - - # Create tasks to query each region in parallel - async def query_region_instances(region, region_data): - try: - ec2 = get_ec2_client(region) - updated_count = 0 - - # First check spot request status for any pending requests - spot_requests = region_data.get("spot_requests", []) - if spot_requests: - # Get all the spot request IDs - spot_request_ids = [sr.spot_request_id for sr in spot_requests if sr.spot_request_id] - - if spot_request_ids: - logger.debug(f"Checking {len(spot_request_ids)} spot requests in {region}") - try: - spot_response = await asyncio.to_thread( - ec2.describe_spot_instance_requests, - SpotInstanceRequestIds=spot_request_ids - ) - - # Process spot request results - for request in spot_response.get("SpotInstanceRequests", []): - request_id = request.get("SpotInstanceRequestId") - instance_id = request.get("InstanceId") - status_code = request.get("Status", {}).get("Code", "") - status_message = request.get("Status", {}).get("Message", "") - - # Find the matching status object - for status in spot_requests: - if status.spot_request_id == request_id: - # Update status with details - status.detailed_status = f"{status_code}: {status_message}" - - # If the request has an instance ID, it's fulfilled - if instance_id: - status.instance_id = instance_id - status.fulfilled = True - updated_count += 1 - - # Signal for UI update - events_to_progress.append(status) - except Exception as e: - logger.error(f"Error checking spot requests in {region}: {str(e)}") - - # Now check for IP addresses for instances - instances = region_data.get("instances", []) - if instances: - # Get all instance IDs - instance_ids = [i.instance_id for i in instances if i.instance_id] - - if instance_ids: - logger.debug(f"Checking {len(instance_ids)} instances for IPs in {region}") - try: - instance_response = await asyncio.to_thread( - ec2.describe_instances, InstanceIds=instance_ids - ) - - # Process results and update statuses - for reservation in instance_response.get("Reservations", []): - for instance in reservation.get("Instances", []): - instance_id = instance["InstanceId"] - public_ip = instance.get("PublicIpAddress", "") - private_ip = instance.get("PrivateIpAddress", "") - - # Find the matching status object - for status in instances: - if status.instance_id == instance_id: - if public_ip and not status.public_ip: - status.public_ip = public_ip - status.detailed_status = "Public IP assigned" - updated_count += 1 - if private_ip: - status.private_ip = private_ip - # Signal for UI update - events_to_progress.append(status) - except Exception as e: - logger.error(f"Error checking instance IPs in {region}: {str(e)}") - - return updated_count - except Exception as e: - logger.error(f"Error querying region {region}: {str(e)}") - return 0 - - # Create and run tasks for all regions in parallel - regions_to_query = get_instances_by_region() - tasks = [ - query_region_instances(region, region_data) - for region, region_data in regions_to_query.items() - ] - - if tasks: - # Wait 
for all regions to be queried with timeout protection - try: - results = await asyncio.gather(*tasks) - - # Sum up the total updated - updated_count = sum(results) - - # Log how many updates we made - if updated_count > 0: - # Count current success stats - fulfilled_requests = sum(1 for status in all_statuses.values() - if status.spot_request_id and status.instance_id) - ip_assigned = sum(1 for status in all_statuses.values() - if status.instance_id and status.public_ip) - - logger.info(f"Updated {updated_count} instances - {fulfilled_requests} spot requests fulfilled, {ip_assigned} instances have IPs") - - # Save the updates to MACHINES.json - save_machines_to_json(operation="update") - - except Exception as e: - logger.error(f"Error waiting for instances: {str(e)}") - - # Wait before next poll - we don't want to hammer the AWS API - await asyncio.sleep(poll_interval) - - # Return whether all instances got IPs or not - return all_ips_received - - -async def list_spot_instances(): - logger.debug("Entering list_spot_instances function") - global all_statuses, events_to_progress, task_total - logger.debug("Resetting global statuses and events") - all_statuses = {} # Reset the global statuses - events_to_progress = [] # Clear the events list - - global task_name - task_name = "Listing Spot Instances" - task_total = 0 # We'll update this as we go - - logger.info("Starting to list spot instances") - - for region in AWS_REGIONS: - logger.info(f"Processing region: {region}") - logger.debug(f"Getting EC2 client for region {region}") - ec2 = get_ec2_client(region) - try: - logger.info(f"Fetching availability zones for region {region}") - az_response = await asyncio.to_thread(ec2.describe_availability_zones) - availability_zones = [ - az["ZoneName"] for az in az_response["AvailabilityZones"] - ] - logger.info( - f"Found {len(availability_zones)} availability zones in {region}: {', '.join(availability_zones)}" - ) - - for az in availability_zones: - logger.info(f"Querying instances in {region}/{az}") - response = await asyncio.to_thread( - ec2.describe_instances, - Filters=[ - { - "Name": "instance-state-name", - "Values": ["pending", "running", "stopped"], - }, - {"Name": "availability-zone", "Values": [az]}, - { - "Name": f"tag:{FILTER_TAG_NAME}", - "Values": [FILTER_TAG_VALUE], - }, - ], - ) - - instance_count = 0 - for reservation in response["Reservations"]: - for instance in reservation["Instances"]: - instance_count += 1 - logger.info( - f"Found instance: {instance['InstanceId']} in {region}/{az}" - ) - instance_id = instance["InstanceId"] - thisInstanceStatusObject = InstanceStatus( - region, az, 0, instance_id - ) - thisInstanceStatusObject.status = instance["State"][ - "Name" - ].capitalize() - thisInstanceStatusObject.elapsed_time = ( - datetime.now(timezone.utc) - instance["LaunchTime"] - ).total_seconds() - thisInstanceStatusObject.public_ip = instance.get( - "PublicIpAddress", "" - ) - thisInstanceStatusObject.private_ip = instance.get( - "PrivateIpAddress", "" - ) - - logger.debug( - f"Adding instance {instance_id} to status tracking" - ) - events_to_progress.append(instance_id) - all_statuses[instance_id] = thisInstanceStatusObject - task_total += 1 - - if instance_count == 0: - logger.info(f"No instances found in {region}/{az}") - - logger.info( - f"Completed scan of region {region}, found {sum(1 for status in all_statuses.values() if status.region == region)} instances" - ) - - except Exception as e: - logger.error( - f"An error occurred while listing instances in {region}: 
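
The polling loop above does two things per region: it checks whether pending spot requests have been fulfilled (`describe_spot_instance_requests`) and whether fulfilled instances have received public IPs (`describe_instances`). A condensed synchronous sketch of those two calls for a single region is shown below; the region name and IDs are placeholders, and the real script wraps the calls in `asyncio.to_thread` and feeds the results back into its status objects.

```python
import boto3

def check_spot_and_ips(region: str, spot_request_ids: list[str]) -> dict:
    """Return {spot_request_id: {"instance_id", "status", "public_ip"}} for one region."""
    ec2 = boto3.client("ec2", region_name=region)
    result: dict[str, dict] = {}

    # 1. Which spot requests have been fulfilled with an instance?
    resp = ec2.describe_spot_instance_requests(SpotInstanceRequestIds=spot_request_ids)
    for req in resp.get("SpotInstanceRequests", []):
        result[req["SpotInstanceRequestId"]] = {
            "instance_id": req.get("InstanceId"),
            "status": req.get("Status", {}).get("Code", ""),
            "public_ip": None,
        }

    # 2. For fulfilled requests, look up the assigned public IP addresses.
    instance_ids = [v["instance_id"] for v in result.values() if v["instance_id"]]
    if instance_ids:
        resp = ec2.describe_instances(InstanceIds=instance_ids)
        ips = {}
        for reservation in resp.get("Reservations", []):
            for inst in reservation.get("Instances", []):
                ips[inst["InstanceId"]] = inst.get("PublicIpAddress")
        for entry in result.values():
            entry["public_ip"] = ips.get(entry["instance_id"])

    return result

# Example (placeholder request ID):
# check_spot_and_ips("us-west-1", ["sir-example123"])
```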
{str(e)}", - exc_info=True, - ) - - logger.info( - f"Finished listing spot instances, found {len(all_statuses)} instances in total" - ) - return all_statuses - - -async def destroy_instances(): - """Destroy all managed instances across all regions. - - This function first removes instances from MACHINES.json to provide immediate feedback, - then asynchronously queries AWS APIs to find and terminate any instances that might - have been missed in our tracking file. - """ - global task_name, task_total, events_to_progress - task_name = "Terminating Spot Instances" - events_to_progress = [] - - # Start by loading and clearing MACHINES.json for immediate feedback - logger.info("Loading existing machine records from MACHINES.json") - existing_data = load_machines_from_json() - existing_machines = existing_data.get("machines", {}) - - # If we have existing machines in the file, create status objects for them first - if existing_machines: - logger.info(f"Found {len(existing_machines)} existing machines in MACHINES.json") - for machine_id, machine_data in existing_machines.items(): - try: - # Extract needed information for termination - region = machine_data.get("region") - zone = machine_data.get("zone") - instance_id = machine_data.get("instance_id") - vpc_id = machine_data.get("vpc_id") - - if not all([region, zone, instance_id]): - logger.warning(f"Incomplete data for machine {machine_id}, skipping") - continue - - # Create a status object for tracking - status = InstanceStatus(region, zone) - status.instance_id = instance_id - status.status = "Terminating" - status.detailed_status = "From MACHINES.json" - status.vpc_id = vpc_id - all_statuses[instance_id] = status - events_to_progress.append(status) - - logger.info(f"Added instance {instance_id} in {region} for termination from MACHINES.json") - - except Exception as e: - logger.error(f"Error processing machine record {machine_id}: {str(e)}") - - # Remove all machines from MACHINES.json immediately - if existing_machines: - logger.info("Clearing MACHINES.json to provide immediate feedback") - try: - # Create empty machine data - output_data = { - "timestamp": datetime.now(timezone.utc).isoformat(), - "machines": {}, - "total_count": 0, - "regions": [], - "last_operation": "delete", - "last_updated": datetime.now(timezone.utc).isoformat() - } - - # Write to temporary file first - temp_file = "MACHINES.json.tmp" - with open(temp_file, "w") as f: - # Use fcntl for file locking on Unix systems - try: - import fcntl - fcntl.flock(f, fcntl.LOCK_EX) # Exclusive lock for writing - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - os.fsync(f.fileno()) # Sync filesystem - fcntl.flock(f, fcntl.LOCK_UN) # Release lock - except (ImportError, AttributeError): - # On Windows or if fcntl not available - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - - # Atomic rename to ensure file is either fully written or not at all - os.replace(temp_file, "MACHINES.json") - logger.info("Successfully cleared MACHINES.json") - - except Exception as e: - logger.error(f"Error clearing MACHINES.json: {str(e)}") - - # Now asynchronously query AWS APIs to find any instances we might have missed - logger.info("Asynchronously querying AWS APIs for any additional instances...") - - # Create a map to track instance-to-region mapping for later termination - instance_region_map = {} - - # Add all instances from MACHINES.json to our map - for instance_id, 
status in all_statuses.items(): - instance_region_map[instance_id] = { - "region": status.region, - "vpc_id": status.vpc_id, - } - - # Query each region in parallel - async def query_region_for_instances(region): - logger.info(f"Checking region {region} for instances to terminate...") - region_instances = {} # Store instances found in this region - - try: - ec2 = get_ec2_client(region) - # Use safe_aws_call for proper timeout handling - logger.info(f"Querying AWS API for instances in {region}...") - response = await safe_aws_call( - ec2.describe_instances, - Filters=[ - { - "Name": "instance-state-name", - "Values": ["pending", "running", "stopping", "stopped"], - }, - {"Name": f"tag:{FILTER_TAG_NAME}", "Values": [FILTER_TAG_VALUE]}, - ], - ) - - instance_count = 0 - for reservation in response["Reservations"]: - for instance in reservation["Instances"]: - instance_count += 1 - instance_id = instance["InstanceId"] - az = instance["Placement"]["AvailabilityZone"] - vpc_id = instance.get("VpcId") - - # Check if we already have this instance in our tracking or instance_region_map - if instance_id not in all_statuses and instance_id not in instance_region_map: - logger.info(f"Found additional instance {instance_id} in {az} from AWS API") - thisInstanceStatusObject = InstanceStatus(region, az) - thisInstanceStatusObject.instance_id = instance_id - thisInstanceStatusObject.status = "Terminating" - thisInstanceStatusObject.detailed_status = "Found via AWS API" - thisInstanceStatusObject.vpc_id = vpc_id - all_statuses[instance_id] = thisInstanceStatusObject - region_instances[instance_id] = { - "region": region, - "vpc_id": vpc_id, - } - - if instance_count == 0: - logger.info(f"No instances found in region {region}") - - return region_instances - - except TimeoutError: - logger.error( - f"Timeout while listing instances in {region}. Check your AWS credentials." - ) - return {} - except Exception as e: - logger.error( - f"An error occurred while listing instances in {region}: {str(e)}" - ) - return {} - - # Query all regions in parallel - tasks = [query_region_for_instances(region) for region in AWS_REGIONS] - region_results = await asyncio.gather(*tasks) - - # Merge results from all regions - for region_instances in region_results: - instance_region_map.update(region_instances) - - if not all_statuses: - logger.info("No instances found to terminate.") - return - - task_total = len(all_statuses) - logger.info(f"Found {task_total} instances to terminate.") - - async def terminate_instances_in_region(region, region_instances): - if not region_instances: - logger.info(f"No instances to terminate in {region}") - return - - # Deduplication check - double check for duplicates - # This is an extra safeguard to ensure we don't try to terminate the same instance twice - unique_instances = list(set(region_instances)) - - if len(unique_instances) != len(region_instances): - logger.warning(f"Removed {len(region_instances) - len(unique_instances)} duplicate instances in {region}") - region_instances = unique_instances - - ec2 = get_ec2_client(region) - try: - logger.info(f"Terminating {len(region_instances)} instances in {region}...") - await safe_aws_call(ec2.terminate_instances, InstanceIds=region_instances) - logger.info( - f"Instances terminate request sent in {region}, waiting for completion..." 
- ) - - waiter = ec2.get_waiter("instance_terminated") - start_time = time.time() - while True: - try: - logger.info(f"Checking if instances in {region} are terminated...") - await safe_aws_call( - waiter.wait, - InstanceIds=region_instances, - WaiterConfig={"MaxAttempts": 1}, - ) - logger.info(f"All instances in {region} terminated successfully") - break - except botocore.exceptions.WaiterError: - elapsed_time = time.time() - start_time - logger.info( - f"Instances in {region} still terminating after {elapsed_time:.0f}s" - ) - for instance_id in region_instances: - thisInstanceStatusObject = all_statuses[instance_id] - thisInstanceStatusObject.elapsed_time = elapsed_time - thisInstanceStatusObject.detailed_status = ( - f"Terminating ({elapsed_time:.0f}s)" - ) - events_to_progress.append(thisInstanceStatusObject) - all_statuses[instance_id] = thisInstanceStatusObject - await asyncio.sleep(10) - except TimeoutError: - # Handle timeout during waiter - logger.error( - f"Timeout waiting for instances to terminate in {region}" - ) - for instance_id in region_instances: - thisInstanceStatusObject = all_statuses[instance_id] - thisInstanceStatusObject.status = "Timeout" - thisInstanceStatusObject.detailed_status = ( - "AWS API timeout during termination" - ) - events_to_progress.append(thisInstanceStatusObject) - all_statuses[instance_id] = thisInstanceStatusObject - break - - # Update status for terminated instances - for instance_id in region_instances: - thisInstanceStatusObject = all_statuses[instance_id] - thisInstanceStatusObject.status = "Terminated" - thisInstanceStatusObject.detailed_status = "Instance terminated" - events_to_progress.append(thisInstanceStatusObject) - all_statuses[instance_id] = thisInstanceStatusObject - - # Clean up resources for each VPC - vpcs_to_delete = set( - info["vpc_id"] - for info in instance_region_map.values() - if info["region"] == region and info["vpc_id"] - ) - - if vpcs_to_delete: - logger.info(f"Cleaning up {len(vpcs_to_delete)} VPCs in {region}") - else: - logger.info(f"No VPCs to clean up in {region}") - - for vpc_id in vpcs_to_delete: - try: - logger.info(f"Starting cleanup of VPC {vpc_id} in {region}") - for instance_id, status in all_statuses.items(): - if status.vpc_id == vpc_id: - status.detailed_status = "Cleaning up VPC resources" - events_to_progress.append(status) - - await clean_up_vpc_resources(ec2, vpc_id) - logger.info(f"Completed cleanup of VPC {vpc_id} in {region}") - - except Exception as e: - logger.error( - f"An error occurred while cleaning up VPC {vpc_id} in {region}: {str(e)}" - ) - - except Exception as e: - logger.error( - f"An error occurred while cleaning up resources in {region}: {str(e)}" - ) - - # Create a deduplicated mapping of instance_id to region/vpc info - # This ensures we don't have duplicate entries for the same instance - deduplicated_map = {} - for instance_id, info in instance_region_map.items(): - # Check if we already have this instance (shouldn't happen, but just in case) - if instance_id not in deduplicated_map: - deduplicated_map[instance_id] = info - else: - logger.warning(f"Duplicate instance found: {instance_id} - keeping first entry") - - # Group instances by region - region_instances = {} - for instance_id, info in deduplicated_map.items(): - region = info["region"] - if region not in region_instances: - region_instances[region] = [] - region_instances[region].append(instance_id) - - # Log the deduplication results - if len(deduplicated_map) != len(instance_region_map): - logger.info(f"Removed 
{len(instance_region_map) - len(deduplicated_map)} duplicate instances") - - # Terminate instances in parallel - termination_tasks = [] - for region, instances in region_instances.items(): - logger.info( - f"Creating termination task for {len(instances)} instances in {region}" - ) - termination_tasks.append(terminate_instances_in_region(region, instances)) - - if termination_tasks: - logger.info(f"Starting {len(termination_tasks)} parallel termination tasks") - await asyncio.gather(*termination_tasks) - logger.info("All termination tasks completed") - else: - logger.info("No termination tasks to execute") - - logger.info("All instances have been terminated.") - - # Create and print a summary of what was terminated - print_termination_summary(deduplicated_map) - - -async def clean_up_vpc_resources(ec2, vpc_id): - async def update_status(message): - logger.info(message) - for status in all_statuses.values(): - if status.vpc_id == vpc_id: - status.detailed_status = message - - await update_status(f"Looking for security groups in VPC {vpc_id}") - sgs = await asyncio.to_thread( - ec2.describe_security_groups, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - - sg_count = 0 - for sg in sgs["SecurityGroups"]: - if sg["GroupName"] != "default": - sg_count += 1 - await update_status( - f"Deleting security group {sg['GroupId']} ({sg['GroupName']})" - ) - await asyncio.to_thread(ec2.delete_security_group, GroupId=sg["GroupId"]) - - if sg_count == 0: - await update_status(f"No non-default security groups found in VPC {vpc_id}") - - await update_status(f"Looking for subnets in VPC {vpc_id}") - subnets = await asyncio.to_thread( - ec2.describe_subnets, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - - subnet_count = 0 - for subnet in subnets["Subnets"]: - subnet_count += 1 - await update_status(f"Deleting subnet {subnet['SubnetId']}") - await asyncio.to_thread(ec2.delete_subnet, SubnetId=subnet["SubnetId"]) - - if subnet_count == 0: - await update_status(f"No subnets found in VPC {vpc_id}") - - await update_status(f"Looking for route tables in VPC {vpc_id}") - rts = await asyncio.to_thread( - ec2.describe_route_tables, - Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], - ) - - rt_count = 0 - for rt in rts["RouteTables"]: - if not any( - association.get("Main", False) for association in rt.get("Associations", []) - ): - rt_count += 1 - await update_status(f"Deleting route table {rt['RouteTableId']}") - await asyncio.to_thread( - ec2.delete_route_table, - RouteTableId=rt["RouteTableId"], - ) - - if rt_count == 0: - await update_status(f"No non-main route tables found in VPC {vpc_id}") - - await update_status(f"Looking for internet gateways attached to VPC {vpc_id}") - igws = await asyncio.to_thread( - ec2.describe_internet_gateways, - Filters=[{"Name": "attachment.vpc-id", "Values": [vpc_id]}], - ) - - igw_count = 0 - for igw in igws["InternetGateways"]: - igw_count += 1 - await update_status(f"Detaching internet gateway {igw['InternetGatewayId']}") - await asyncio.to_thread( - ec2.detach_internet_gateway, - InternetGatewayId=igw["InternetGatewayId"], - VpcId=vpc_id, - ) - await update_status(f"Deleting internet gateway {igw['InternetGatewayId']}") - await asyncio.to_thread( - ec2.delete_internet_gateway, - InternetGatewayId=igw["InternetGatewayId"], - ) - - if igw_count == 0: - await update_status(f"No internet gateways found attached to VPC {vpc_id}") - - await update_status(f"Deleting VPC {vpc_id}") - await asyncio.to_thread(ec2.delete_vpc, VpcId=vpc_id) - await update_status(f"VPC 
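
The termination path above calls `terminate_instances` and then drives the `instance_terminated` waiter one attempt at a time so it can refresh the status display between checks. When that live feedback is not needed, the waiter's plain blocking form achieves the same end state; the sketch below shows that simpler variant, with region and instance IDs as placeholders.

```python
import boto3

def terminate_and_wait(region: str, instance_ids: list[str]) -> None:
    """Terminate instances in one region and block until AWS reports them terminated."""
    ec2 = boto3.client("ec2", region_name=region)
    ec2.terminate_instances(InstanceIds=instance_ids)

    waiter = ec2.get_waiter("instance_terminated")
    # Poll every 15 seconds, up to 40 attempts (roughly 10 minutes).
    waiter.wait(InstanceIds=instance_ids, WaiterConfig={"Delay": 15, "MaxAttempts": 40})

# Example (placeholder instance ID):
# terminate_and_wait("us-west-1", ["i-0123456789abcdef0"])
```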
{vpc_id} successfully deleted") - - -def print_termination_summary(instance_map): - """Print a summary table of all terminated instances. - - Args: - instance_map: Dictionary mapping instance IDs to region/vpc info - """ - if not instance_map: - console.print("[yellow]No instances were terminated[/yellow]") - return - - # Collect zone information from status objects - zone_info = {} - for instance_id, info in instance_map.items(): - # Try to get zone from status object - region = info.get("region", "unknown") - - # Look for the zone in the status object if available - zone = "unknown" - if instance_id in all_statuses: - zone = all_statuses[instance_id].zone - - # Track by region and zone - if region not in zone_info: - zone_info[region] = {} - - if zone not in zone_info[region]: - zone_info[region][zone] = 0 - - zone_info[region][zone] += 1 - - # Create a summary table - table = Table(title="Terminated Instances Summary", box=box.ROUNDED, show_header=True, header_style="bold red") - - # Add columns - table.add_column("Region", style="cyan") - table.add_column("Zone", style="blue") - table.add_column("Instances", style="red", justify="right") - - # Add rows for each region and zone - total_instances = 0 - - # Sort regions for consistent display - for region in sorted(zone_info.keys()): - regions_zones = zone_info[region] - # Sort zones within each region - for zone in sorted(regions_zones.keys()): - count = regions_zones[zone] - total_instances += count - - # Only show region on first row for this region - if table.row_count > 0 and zone != sorted(regions_zones.keys())[0]: - table.add_row("", zone, str(count)) - else: - table.add_row(region, zone, str(count)) - - # Add a total row - table.add_row("", "[bold]TOTAL[/bold]", f"[bold]{total_instances}[/bold]") - - # Display the table - console.print() - console.print(table) - console.print() - console.print(f"[bold red]✓[/bold red] Successfully terminated {total_instances} instances") - console.print() - -async def delete_disconnected_aws_nodes(): - try: - # Run bacalhau node list command and capture output - logger.info("Running 'bacalhau node list' to find disconnected nodes") - result = subprocess.run( - ["bacalhau", "node", "list", "--output", "json"], - capture_output=True, - text=True, - check=True, - ) - nodes = json.loads(result.stdout) - - disconnected_aws_nodes = [] - - for node in nodes: - if ( - node["Connection"] == "DISCONNECTED" - and node["Info"]["NodeType"] == "Compute" - and "EC2_INSTANCE_FAMILY" in node["Info"]["Labels"] - ): - disconnected_aws_nodes.append(node["Info"]["NodeID"]) - - if not disconnected_aws_nodes: - logger.info("No disconnected AWS nodes found.") - return - - logger.info(f"Found {len(disconnected_aws_nodes)} disconnected AWS node(s).") - - for node_id in disconnected_aws_nodes: - logger.info(f"Deleting node: {node_id}") - try: - # Run bacalhau admin node delete command - subprocess.run(["bacalhau", "node", "delete", node_id], check=True) - logger.info(f"Successfully deleted node: {node_id}") - except subprocess.CalledProcessError as e: - logger.error(f"Failed to delete node {node_id}. 
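
`delete_disconnected_aws_nodes` above shells out to the Bacalhau CLI, parses the JSON node list, and removes compute nodes that are disconnected and carry the `EC2_INSTANCE_FAMILY` label. A trimmed sketch of the detection half, assuming the same CLI commands and JSON field layout used above:

```python
import json
import subprocess

def find_disconnected_aws_nodes() -> list[str]:
    """Return NodeIDs of disconnected AWS compute nodes reported by the Bacalhau CLI."""
    result = subprocess.run(
        ["bacalhau", "node", "list", "--output", "json"],
        capture_output=True, text=True, check=True,
    )
    nodes = json.loads(result.stdout)
    return [
        node["Info"]["NodeID"]
        for node in nodes
        if node["Connection"] == "DISCONNECTED"
        and node["Info"]["NodeType"] == "Compute"
        and "EC2_INSTANCE_FAMILY" in node["Info"]["Labels"]
    ]

# Each returned ID can then be removed with: bacalhau node delete <node_id>
```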
Error: {e}") - - except subprocess.CalledProcessError as e: - logger.error(f"Error running bacalhau node list: {e}") - except json.JSONDecodeError as e: - logger.error(f"Error parsing JSON output: {e}") - except Exception as e: - logger.error(f"An unexpected error occurred: {e}") - - -def all_statuses_to_dict(): - return { - status.id: { - "id": status.id, - "region": status.region, - "zone": status.zone, - "status": status.status, - "detailed_status": status.detailed_status, - "elapsed_time": status.elapsed_time, - "instance_id": status.instance_id, - "spot_request_id": status.spot_request_id, - "fulfilled": getattr(status, "fulfilled", False), - "public_ip": status.public_ip, - "private_ip": status.private_ip, - "vpc_id": status.vpc_id, - "timestamp": datetime.now(timezone.utc).isoformat(), - } - for status in all_statuses.values() - } - -def load_machines_from_json(): - """Atomically load machine data from MACHINES.json if it exists""" - try: - # Check if the file exists - if not os.path.exists("MACHINES.json"): - logger.debug("MACHINES.json does not exist yet") - return {} - - # Open with exclusive access to ensure atomic read - with open("MACHINES.json", "r") as f: - # Use fcntl for file locking on Unix systems - try: - import fcntl - fcntl.flock(f, fcntl.LOCK_SH) # Shared lock for reading - data = json.load(f) - fcntl.flock(f, fcntl.LOCK_UN) # Release lock - except (ImportError, AttributeError): - # On Windows or if fcntl not available, just read without locking - data = json.load(f) - - return data - except json.JSONDecodeError: - logger.warning("MACHINES.json exists but contains invalid JSON, treating as empty") - return {} - except Exception as e: - logger.error(f"Failed to load machines from JSON: {str(e)}", exc_info=True) - return {} - -def save_machines_to_json(operation="update"): - """Atomically save the current machine statuses to MACHINES.json - - Args: - operation: String indicating the type of operation - "update" or "delete" - """ - try: - # Create temporary file first (atomic write pattern) - temp_file = "MACHINES.json.tmp" - - # First try to load existing data - existing_data = load_machines_from_json() - existing_machines = existing_data.get("machines", {}) - - # Convert all current instances to a dict - current_machines = all_statuses_to_dict() - - if operation == "update": - # Update existing machines with current ones - machines_data = {**existing_machines, **current_machines} - - # Log operations - new_count = len(set(current_machines.keys()) - set(existing_machines.keys())) - updated_count = len(set(current_machines.keys()) & set(existing_machines.keys())) - logger.info(f"Adding {new_count} new and updating {updated_count} existing machines") - - elif operation == "delete": - # For delete, remove current machines from existing ones - machines_to_remove = set(current_machines.keys()) - machines_data = {k: v for k, v in existing_machines.items() - if k not in machines_to_remove} - - # Log operation - removed_count = len(machines_to_remove) - logger.info(f"Removing {removed_count} machines from MACHINES.json") - else: - # Default to just using current machines - machines_data = current_machines - - # Extract regions from the machines data (safely) - regions = set() - for machine_data in machines_data.values(): - # Check if the machine data has a region key - if isinstance(machine_data, dict) and "region" in machine_data: - region = machine_data["region"] - if region: # Only add non-empty regions - regions.add(region) - - # Include metadata - output_data = { - 
"timestamp": datetime.now(timezone.utc).isoformat(), - "machines": machines_data, - "total_count": len(machines_data), - "regions": list(regions), - "last_operation": operation, - "last_updated": datetime.now(timezone.utc).isoformat() - } - - # Write to temporary file first - with open(temp_file, "w") as f: - # Use fcntl for file locking on Unix systems - try: - import fcntl - fcntl.flock(f, fcntl.LOCK_EX) # Exclusive lock for writing - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - os.fsync(f.fileno()) # Sync filesystem - fcntl.flock(f, fcntl.LOCK_UN) # Release lock - except (ImportError, AttributeError): - # On Windows or if fcntl not available - json.dump(output_data, indent=2, default=str, sort_keys=True, fp=f) - f.flush() # Ensure data is written to disk - - # Atomic rename to ensure file is either fully written or not at all - os.replace(temp_file, "MACHINES.json") - - if operation == "update": - logger.info(f"Saved {len(machines_data)} machine records to MACHINES.json") - else: - logger.info(f"Updated MACHINES.json - {len(machines_data)} machines remain") - - return True - except Exception as e: - logger.error(f"Failed to save machines to JSON: {str(e)}", exc_info=True) - - # Log more debug info to help diagnose the issue - logger.debug(f"machines_data type: {type(machines_data)}") - if isinstance(machines_data, dict): - logger.debug(f"machines_data has {len(machines_data)} entries") - # Log a sample of the data - if machines_data: - sample_key = next(iter(machines_data)) - sample_value = machines_data[sample_key] - logger.debug(f"Sample entry - key: {sample_key}, value type: {type(sample_value)}") - if isinstance(sample_value, dict): - logger.debug(f"Sample keys: {list(sample_value.keys())}") - - # Clean up temp file if it exists - try: - if os.path.exists("MACHINES.json.tmp"): - os.remove("MACHINES.json.tmp") - except Exception as cleanup_error: - logger.error(f"Error cleaning up temp file: {str(cleanup_error)}") - - return False - - -def parse_args(): - """Parse command line arguments""" - parser = argparse.ArgumentParser( - description="Manage spot instances across multiple AWS regions." 
- ) - parser.add_argument( - "action", # Changed from --action to positional argument - choices=["create", "destroy", "list", "delete_disconnected_aws_nodes"], - help="Action to perform", - nargs="?", # Make it optional - default="list", # Default to list if not provided - ) - parser.add_argument( - "--format", choices=["default", "json"], default="default", help="Output format" - ) - parser.add_argument( - "--timeout", type=int, default=30, help="AWS API timeout in seconds" - ) - parser.add_argument( - "-v", "--verbose", action="store_true", help="Enable verbose debug output" - ) - - args = parser.parse_args() - - # Configure unified logging - use the same file_handler for both log file and console - global file_handler - - # Remove any existing handlers to ensure clean configuration - for handler in logger.handlers[:]: - logger.removeHandler(handler) - - # Create/truncate the debug.log file - try: - with open("debug.log", "w") as f: - pass # Just open in write mode to truncate - except Exception as e: - sys.stdout.write(f"Warning: Could not truncate debug.log: {e}\n") - sys.stdout.flush() - - # Create and configure file handler - file_handler = logging.FileHandler("debug.log") - file_handler.setFormatter(log_formatter) - - # Set log levels based on verbose flag - if args.verbose: - file_handler.setLevel(logging.DEBUG) - logger.setLevel(logging.DEBUG) - else: - file_handler.setLevel(logging.INFO) - logger.setLevel(logging.INFO) - - # Add the file handler to our logger - this will be shared with the console handler - logger.addHandler(file_handler) - - # Log initial startup message - logger.info(f"Starting with action: {args.action}, verbose: {args.verbose}") - - # Set global timeout from command line argument - global AWS_API_TIMEOUT - AWS_API_TIMEOUT = args.timeout - logger.info(f"Set AWS API timeout to {AWS_API_TIMEOUT} seconds") - - # Set task name based on action - global task_name, task_total - if args.action == "create": - task_name = "Creating Spot Instances" - task_total = TOTAL_INSTANCES - elif args.action == "destroy": - task_name = "Terminating Spot Instances" - task_total = 100 # Will be updated when we know how many instances to terminate - elif args.action == "list": - task_name = "Listing Spot Instances" - task_total = 100 # Will be updated when we know how many instances to list - elif args.action == "delete_disconnected_aws_nodes": - task_name = "Deleting Disconnected AWS Nodes" - task_total = 100 # Will be updated when we know how many nodes to delete - - logger.info(f"Set task: '{task_name}' with target: {task_total}") - return args - - -async def check_aws_credentials(): - """Check if AWS credentials are valid before proceeding. 
- - Returns: - bool: True if credentials are valid, False otherwise - """ - logger.info("Checking AWS credentials validity...") - try: - # Try to use any region for the check - we'll use the first configured region - region = AWS_REGIONS[0] if AWS_REGIONS else "us-east-1" - ec2 = get_ec2_client(region) - - # Make a simple API call that requires valid credentials - await safe_aws_call(ec2.describe_regions, RegionNames=[region]) - - logger.info("AWS credentials are valid") - return True - except botocore.exceptions.ClientError as e: - error_code = getattr(e, 'response', {}).get('Error', {}).get('Code', '') - error_msg = getattr(e, 'response', {}).get('Error', {}).get('Message', str(e)) - - if error_code in ['ExpiredToken', 'InvalidToken', 'UnauthorizedOperation']: - logger.error(f"AWS credentials have expired or are invalid: {error_msg}") - console.print("[bold red]AWS credentials have expired or are invalid.[/bold red]") - console.print("[yellow]Please run 'aws sso login' to refresh your credentials.[/yellow]") - else: - logger.error(f"Error checking AWS credentials: {error_code} - {error_msg}") - console.print(f"[bold red]AWS credentials error:[/bold red] {error_code} - {error_msg}") - - return False - except Exception as e: - logger.error(f"Error checking AWS credentials: {str(e)}") - console.print(f"[bold red]Error checking AWS credentials:[/bold red] {str(e)}") - console.print("[yellow]Please verify your AWS configuration and connectivity.[/yellow]") - return False - -async def perform_action(): - """Execute the requested action""" - args = parse_args() - logger.debug(f"Starting perform_action with action: {args.action}") - operation_result = { - "success": False, - "action": args.action, - "start_time": datetime.now(timezone.utc).isoformat(), - "end_time": None, - "result_summary": {} - } - - # Check AWS credentials before performing any action that requires AWS API calls - if args.action in ["create", "destroy", "list"]: - credentials_valid = await check_aws_credentials() - if not credentials_valid: - operation_result["error"] = "Invalid AWS credentials" - return operation_result - - try: - if args.action == "create": - logger.info("Initiating create_spot_instances") - # Wait for the create operation to fully complete - creation_success = await create_spot_instances() - - # Count successfully created instances by region - created_instances = {} - for status in all_statuses.values(): - if status.instance_id and status.public_ip: # Successfully created with IP - region = status.region - if region not in created_instances: - created_instances[region] = 0 - created_instances[region] += 1 - - total_created = sum(created_instances.values()) - - # Count instances with public IPs and completed provisioning - provisioned_instances = {} - for status in all_statuses.values(): - if status.instance_id and status.public_ip and status.detailed_status == "Provisioning complete": - region = status.region - if region not in provisioned_instances: - provisioned_instances[region] = 0 - provisioned_instances[region] += 1 - - total_provisioned = sum(provisioned_instances.values()) - - # Set operation result based on success of creation - operation_result["success"] = total_created > 0 - operation_result["result_summary"] = { - "instances_created": total_created, - "instances_by_region": created_instances, - "instances_provisioned": total_provisioned, - "all_received_ips": creation_success - } - - # Save newly created instances to MACHINES.json (operation="update") - if len(all_statuses) > 0: - save_result 
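
The credential check above issues a cheap `describe_regions` call and inspects the `ClientError` code for expired or invalid tokens. An even smaller pre-check can be done with STS `get_caller_identity`, which needs no EC2 permissions; the sketch below shows that alternative approach, not what the script itself does.

```python
import boto3
import botocore.exceptions

def aws_credentials_valid() -> bool:
    """Return True if the current AWS credentials can sign a basic STS call."""
    try:
        boto3.client("sts").get_caller_identity()
        return True
    except botocore.exceptions.NoCredentialsError:
        print("No AWS credentials configured.")
        return False
    except botocore.exceptions.ClientError as exc:
        code = exc.response.get("Error", {}).get("Code", "")
        if code in ("ExpiredToken", "InvalidToken", "InvalidClientTokenId"):
            print("AWS credentials have expired or are invalid; try 'aws sso login'.")
        else:
            print(f"AWS credentials error: {code}")
        return False
```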
= save_machines_to_json(operation="update") - operation_result["result_summary"]["saved_to_file"] = save_result - - logger.info(f"Creation completed: {total_created} instances created, {total_provisioned} fully provisioned") - - # If we didn't create any instances, that's an issue - if total_created == 0: - raise Exception("Failed to create any instances - check AWS credentials and limits") - - elif args.action == "list": - logger.info("Initiating list_spot_instances") - await list_spot_instances() - - # Count instances by status - instance_counts = {} - for status in all_statuses.values(): - if status.status not in instance_counts: - instance_counts[status.status] = 0 - instance_counts[status.status] += 1 - - operation_result["success"] = True - operation_result["result_summary"] = { - "total_instances": len(all_statuses), - "instances_by_status": instance_counts - } - - # Update MACHINES.json with current instances (operation="update") - if len(all_statuses) > 0: - save_machines_to_json(operation="update") - - elif args.action == "destroy": - # Store counts before destruction for reporting - initial_count = len(all_statuses) - initial_regions = set(status.region for status in all_statuses.values() if status.region) - - # Create a dictionary to track instances per region and zone - region_zone_counts = {} - for status in all_statuses.values(): - if status.region and status.zone: - if status.region not in region_zone_counts: - region_zone_counts[status.region] = {} - if status.zone not in region_zone_counts[status.region]: - region_zone_counts[status.region][status.zone] = 0 - region_zone_counts[status.region][status.zone] += 1 - - # Skip doing any MACHINES.json operations if empty - has_instances = initial_count > 0 - - logger.info("Initiating destroy_instances") - await destroy_instances() - - # Get summary of terminated instances - operation_result["success"] = True - operation_result["result_summary"] = { - "instances_terminated": initial_count, - "regions_affected": list(initial_regions), - "region_zone_distribution": region_zone_counts, - "cleanup_completed": True - } - - # Remove destroyed instances from MACHINES.json (operation="delete") - if has_instances: - save_machines_to_json(operation="delete") - - elif args.action == "delete_disconnected_aws_nodes": - logger.info("Initiating delete_disconnected_aws_nodes") - await delete_disconnected_aws_nodes() - operation_result["success"] = True - - logger.debug(f"Completed action: {args.action}") - - # Set completion timestamp - operation_result["end_time"] = datetime.now(timezone.utc).isoformat() - - except TimeoutError as e: - logger.error(f"TimeoutError occurred: {str(e)}") - console.print(f"[bold red]Error:[/bold red] {str(e)}") - console.print("[yellow]This may be due to AWS credential issues.[/yellow]") - console.print( - "[yellow]Try running 'aws sso login' to refresh your credentials.[/yellow]" - ) - table_update_event.set() - operation_result["error"] = str(e) - return operation_result - - except botocore.exceptions.ClientError as e: - logger.error(f"AWS ClientError occurred: {str(e)}") - if "ExpiredToken" in str(e) or "InvalidToken" in str(e): - console.print("[bold red]AWS credentials have expired.[/bold red]") - console.print( - "[yellow]Try running 'aws sso login' to refresh your credentials.[/yellow]" - ) - else: - console.print(f"[bold red]AWS Error:[/bold red] {str(e)}") - table_update_event.set() - operation_result["error"] = str(e) - return operation_result - - except Exception as e: - logger.error(f"Unexpected error 
occurred: {str(e)}", exc_info=True) - console.print(f"[bold red]Error:[/bold red] {str(e)}") - table_update_event.set() - operation_result["error"] = str(e) - return operation_result - - return operation_result - - -async def main(): - """Main execution function""" - handler = None # Initialize handler to None - try: - args = parse_args() - - # Logging has been configured in parse_args - # We'll see these log messages in both debug.log and the Rich console panel - if args.verbose: - logger.debug("Verbose logging enabled") - - logger.info(f"Starting action: {args.action}") - - if args.format == "json": - logger.info("Using JSON output format") - operation_result = await perform_action() - # Machine updates in MACHINES.json are now handled within perform_action() - - # For JSON output, also show MACHINES.json contents if it exists - machines_from_file = load_machines_from_json().get("machines", {}) - - # Use direct stdout before rich console is initialized - output = { - "current_machines": all_statuses_to_dict(), - "saved_machines_count": len(machines_from_file), - "operation_result": operation_result - } - sys.stdout.write(json.dumps(output, indent=2, default=str) + "\n") - sys.stdout.flush() - return - - # Create initial progress and table - progress = Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), - ) - table = make_progress_table() - - # Create layout before using it in Live - layout = create_layout(progress, table) - - # Initialize the live display with the layout - with Live( - layout, - console=console, - refresh_per_second=5, - auto_refresh=True, - screen=True, - transient=False, # Keep the display visible after exit - ) as live: - try: - # Update our global flag to indicate terminal has been cleared - global is_terminal_cleared - is_terminal_cleared = True - - # Add the rich console handler for logging, sharing the file handler - handler = RichConsoleHandler(live, layout, file_handler) # Pass layout and file handler - logger.addHandler(handler) - - # Start display update task in a separate thread - loop = asyncio.get_event_loop() - display_task = loop.create_task(update_display(live)) - - # Set up exception handler for display_task - def handle_display_task_exception(task): - try: - # Get the exception if any - task.result() - except Exception as e: - logger.error(f"Display task failed: {str(e)}", exc_info=True) - # We don't reraise here - just log it - - display_task.add_done_callback(handle_display_task_exception) - - # Perform the requested action - operation_result = await perform_action() - - # Display summary after operation completes (if successful) - if operation_result.get("success", False): - # Create a nice summary table - summary_table = Table(title=f"{args.action.capitalize()} Operation Summary", - show_header=True, - header_style="bold cyan", - box=box.ROUNDED) - - # Add columns based on the action - if args.action == "create": - summary_table.add_column("Total Created", style="green") - summary_table.add_column("Regions", style="blue") - summary_table.add_column("Distribution", style="cyan") - - # Get summary data - summary = operation_result["result_summary"] - total = summary.get("instances_created", 0) - by_region = summary.get("instances_by_region", {}) - all_ips = summary.get("all_received_ips", True) - - # Add the IP status column - summary_table.add_column("IP Status", style="green") - - # Format region distribution - region_list = ", ".join(by_region.keys()) 
if by_region else "None" - distribution = " | ".join([f"{region}: {count}" for region, count in by_region.items()]) if by_region else "None" - - # Format IP status message - ip_status = "✓ All Received" if all_ips else "⚠ Some missing IPs" - - # Add the row with status - summary_table.add_row(str(total), region_list, distribution, ip_status) - - elif args.action == "destroy": - summary_table.add_column("Instances Terminated", style="red") - summary_table.add_column("Regions Affected", style="cyan") - summary_table.add_column("Result", style="magenta") - - # Get summary data - summary = operation_result["result_summary"] - terminated = summary.get("instances_terminated", 0) - regions = summary.get("regions_affected", []) - - # Format for display - region_text = ", ".join(regions) if regions else "None" - - # Add the row - show if machines file was updated - if terminated > 0: - summary_table.add_row(str(terminated), region_text, "✓ Successful") - else: - summary_table.add_row(str(terminated), region_text, "No machines found") - - # Print the summary - console.print("\n") # Add some space - console.print(summary_table) - console.print("\n") # Add some space after - - # Show appropriate message based on the operation - if args.action == "create" and operation_result.get("result_summary", {}).get("instances_created", 0) > 0: - console.print("[green]✓ Machine information saved to MACHINES.json[/green]") - elif args.action == "list" and operation_result.get("result_summary", {}).get("total_instances", 0) > 0: - console.print("[green]✓ Machine information updated in MACHINES.json[/green]") - elif args.action == "destroy" and operation_result.get("result_summary", {}).get("instances_terminated", 0) > 0: - console.print("[red]✓ Terminated machines removed from MACHINES.json[/red]") - - # Signal display task to stop and wait for completion - logger.debug("Signaling display task to stop") - table_update_event.set() - - # For create action, make sure we keep the display up just long enough - # to let users see the results but not block on full provisioning - if args.action == "create": - # Just wait a short time to ensure users see the final IP table - logger.debug("Keeping display open briefly to show final IP table") - await asyncio.sleep(5.0) - - # Signal display task to stop (normal case) - logger.debug("Ending display task") - - # Wait for display to finish updating with a timeout - try: - logger.debug("Waiting for display task to complete") - - # Short timeout for display task cleanup - display_timeout = 5.0 - await asyncio.wait_for(asyncio.shield(display_task), timeout=display_timeout) - logger.debug("Display task completed") - except asyncio.TimeoutError: - logger.warning(f"Display task did not complete within {display_timeout}s timeout") - # We continue anyway, the task will be cancelled in the finally block - - except Exception as e: - logger.error(f"Error in main execution: {str(e)}", exc_info=True) - # Don't try to use rich console here, as it might be the source of the error - # Error will be printed by our outer exception handler - raise - finally: - # Stop the display task if it's still running - if display_task and not display_task.done(): - display_task.cancel() - - # Remove the rich console handler if it was added - if handler is not None and handler in logger.handlers: - logger.removeHandler(handler) - - except Exception as e: - logger.error(f"Fatal error occurred: {str(e)}", exc_info=True) - console.print(f"\n[bold red]Fatal error:[/bold red] {str(e)}") - raise - - -if __name__ == 
"__main__": - # Store the original terminal settings to ensure we can properly display errors - is_terminal_cleared = False - - # Function to print error outside of rich Live display context - def print_error_message(message): - # Ensure we're writing directly to stdout to avoid stderr - if is_terminal_cleared: - # If terminal was cleared by rich Live display, add newlines for visibility - sys.stdout.write("\n\n") - sys.stdout.write(f"\n[ERROR] {message}\n") - sys.stdout.write("Check debug.log for more details.\n") - sys.stdout.flush() - - # Add a simple info message directly to console for initial startup - # This is only for user feedback before the rich console is ready - sys.stdout.write("Initializing...\n") - sys.stdout.flush() - - try: - # Log to file only, not stdout - logger.info("Starting main execution") - asyncio.run(main()) - logger.info("Main execution completed") - except KeyboardInterrupt: - logger.info("Operation cancelled by user") - sys.stderr = open(os.devnull, 'w') # Suppress any stderr output - print_error_message("Operation cancelled by user.") - sys.exit(1) - except Exception as e: - # Log detailed error - logger.error(f"Fatal error occurred: {str(e)}", exc_info=True) - - # Silence stderr completely - sys.stderr = open(os.devnull, 'w') - - # Print user-friendly error message outside of any rich context - error_msg = f"Fatal error occurred: {str(e)}" - - # Add additional context for common errors - if "TimeoutError" in str(e): - error_msg += "\nThis may be due to AWS credential issues or network problems." - error_msg += "\nTry running 'aws sso login' to refresh your credentials." - elif "ExpiredToken" in str(e) or "InvalidToken" in str(e): - error_msg += "\nAWS credentials have expired. Try running 'aws sso login'." - elif "InstanceId" in str(e) and "does not exist" in str(e): - error_msg += "\nThe specified instance may have been terminated or never created." - - print_error_message(error_msg) - sys.exit(1) diff --git a/edge-data-transfer-demo-v2/edge-data-spots/edge-data.md b/edge-data-transfer-demo-v2/edge-data-spots/edge-data.md deleted file mode 100644 index ed24ce3d..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/edge-data.md +++ /dev/null @@ -1,71 +0,0 @@ - -# Edge Data Deployment – Beginner Guide - -## 1. Prerequisites – Required Tools - -Make sure your system has the following installed: - -- Python 3.10 or higher -- python3-pip -- `uv` (install via `pip install uv`) -- `aws-cli` – [Installation Guide](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) - ---- - -## 2. Configure AWS CLI - -Run the following command: - -`aws configure` - -## 3. Navigate to the project directory - -Run: - -`uv run -s util/get_ubuntu_amis.py` - -Choose the desired AMI ID from the `ubuntu_amis` output (all are ARM-based) and update your `config.yaml`: - -```yaml -machine_type: -``` - -## 5. Update config.yaml -Fill in the following fields: -```yaml -orchestrators: -- nats://:4222 -public_ssh_key_path: - -token: "" - - -``` - -## 6. Deploy EFS and Spot Instances -Run the deployment script: - -`uv run -s ./deploy_spot.py create` - -Check if instances have registered correctly on demo machine: - -`bacalhau node list` - - -## 7. Verify NFS mount on a node -SSH into one of the Spot instances and run: - -`df -h` - -Confirm `/mnt/data` is mounted properly. - -## 8. Generate test data -Run the test job to generate random files: - - -`bacalhau job submit generate.yaml` - -## 9. 
Run the metadata generation job -Submit the main processing job: - -`bacalhau job submit create_metadata.yaml` diff --git a/edge-data-transfer-demo-v2/edge-data-spots/generate_example_config.sh b/edge-data-transfer-demo-v2/edge-data-spots/generate_example_config.sh deleted file mode 100755 index 3f5886e6..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/generate_example_config.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -CONFIG_FILE="config.yaml_example" - -if [[ -z "$COMPUTE_ORCHESTRATOR" || -z "$COMPUTE_AUTH_TOKEN" || -z "$COMPUTE_AWS_REGION" ]]; then - echo "Error: COMPUTE_ORCHESTRATOR, COMPUTE_AUTH_TOKEN and COMPUTE_AWS_REGION must be set." - exit 1 -fi - -cat < "$CONFIG_FILE" -max_instances: 5 -username: bacalhau-runner -public_ssh_key_path: /root/.ssh/id_rsa -compute_orchestrators: - - $COMPUTE_ORCHESTRATOR -compute_auth_token: $COMPUTE_AUTH_TOKEN -compute_tls: "true" -regions: - - $COMPUTE_AWS_REGION: - image: "auto" - machine_type: "m6gd.medium" - node_count: auto -EOF - -echo "Config written to $CONFIG_FILE" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/cloud-init/init-vm-template.yml b/edge-data-transfer-demo-v2/edge-data-spots/instance/cloud-init/init-vm-template.yml deleted file mode 100644 index 00daf935..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/cloud-init/init-vm-template.yml +++ /dev/null @@ -1,150 +0,0 @@ -#cloud-config - -write_files: - - path: /tmp/scripts.tar.gz - encoding: base64 - content: ${compressed_scripts} - permissions: '0600' - -users: - - name: ${username} - sudo: ALL=(ALL) NOPASSWD:ALL - shell: /bin/bash - ssh_authorized_keys: - - ${public_ssh_key} - groups: docker - -package_update: true -package_upgrade: true - -runcmd: - - mkdir -p /tmp/exs - - tar -xzf /tmp/scripts.tar.gz -C /tmp/exs - - | - # Remove minimal packages only if DNF is available (i.e., on Amazon Linux 2023). 
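- # Fall back to apt or yum on other distributions; nfs-common is needed for the NFS mount of the EFS share performed later in this file.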
- all_packages="curl gnupg2 jq python3 python3-pip nfs-common apache2-utils" - if command -v dnf >/dev/null 2>&1; then - dnf install --allowerasing -y $all_packages - elif command -v apt >/dev/null 2>&1; then - apt update - apt install -y $all_packages - elif command -v yum >/dev/null 2>&1; then - yum install -y $all_packages - else - echo "Unsupported package manager" - exit 1 - fi - - # Install python package - - pip install flask gunicorn - - # Install Docker - - mv /tmp/exs/install-docker.sh /root/install-docker.sh - - chmod 755 /root/install-docker.sh - - /root/install-docker.sh - - # add scripts - - mv /tmp/exs/disable-network.sh /opt/disable-network.sh - - mv /tmp/exs/enable-network.sh /opt/enable-network.sh - - chmod +x /opt/disable-network.sh - - chmod +x /opt/enable-network.sh - - - mv /tmp/exs/disable-nfs.sh /opt/disable-nfs.sh - - mv /tmp/exs/enable-nfs.sh /opt/enable-nfs.sh - - chmod +x /opt/disable-nfs.sh - - chmod +x /opt/enable-nfs.sh - - # Ensure the authorized key is properly added to the user - - mkdir -p /home/${username}/.ssh - - echo "${public_ssh_key}" > /home/${username}/.ssh/authorized_keys - - chown -R ${username}:${username} /home/${username}/.ssh - - chmod 0600 /home/${username}/.ssh/authorized_keys - - # Create necessary directories first - - mkdir -p ${bacalhau_data_dir} ${bacalhau_node_dir} /etc/bacalhau /etc/systemd/system /usr/local/bin - - # Write files after directories are created - - mv /tmp/exs/bacalhau-startup.service /etc/systemd/system/bacalhau-startup.service - - mv /tmp/exs/startup.sh /usr/local/bin/startup.sh - - echo "${bacalhau_config_file}" | base64 -d > /${bacalhau_node_dir}/config.yaml - - mv /tmp/exs/docker-compose.yaml ${bacalhau_node_dir}/docker-compose.yaml - - # Set correct permissions - - chmod 0600 /etc/systemd/system/bacalhau-startup.service - - chmod 0700 /usr/local/bin/startup.sh - - chmod 0400 ${bacalhau_node_dir}/config.yaml - - chmod 0400 ${bacalhau_node_dir}/docker-compose.yaml - - chmod 0777 ${bacalhau_data_dir} - - # Set ownership - - chown -R ${username}:${username} ${bacalhau_data_dir} - - chown -R ${username}:${username} ${bacalhau_node_dir} - - chown ${username}:${username} ${bacalhau_node_dir}/config.yaml - - chown ${username}:${username} ${bacalhau_node_dir}/docker-compose.yaml - - # Add user to docker group - - usermod -aG docker ${username} - - # Install uv globally and set permissions - - export HOME=/root - - curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" HOME=/root sh - - chmod 755 /usr/local/bin/uv || true - - chown ${username}:${username} /usr/local/bin/uv || true - - # Create uv cache directory for user - - mkdir -p /home/${username}/.cache/uv - - chown -R ${username}:${username} /home/${username}/.cache - - # Install health check web server - - mv /tmp/exs/healthz-web-server.py /usr/local/bin/healthz-web-server.py - - chmod 755 /usr/local/bin/healthz-web-server.py - - chown ${username}:${username} /usr/local/bin/healthz-web-server.py - - # Create a symlink without .py extension for Gunicorn - - ln -sf /usr/local/bin/healthz-web-server.py /usr/local/bin/healthz-web-server - - # Install service - - mv /tmp/exs/healthz-web.service /etc/systemd/system/healthz-web.service - - chmod 644 /etc/systemd/system/healthz-web.service - - # Configure Docker to start on boot - - systemctl enable docker.service - - systemctl enable containerd.service - - #set lables - - - mkdir /opt/test10 - - # Create the mount point for NFScd - - mkdir -p /mnt/data - -# # Mount the NFS share -# - mount -t nfs 
foxyfutures.pl:/mnt/data /mnt/data -# -# # Ensure NFS mount persists on reboot -# - echo "foxyfutures.pl:/mnt/data /mnt/data nfs defaults,_netdev 0 0" >> /etc/fstab - - mount -t nfs ${efs_mount_ip}:/ /mnt/data - - - echo "${efs_mount_ip}:/ /mnt/data nfs defaults,_netdev 0 0" >> /etc/fstab - - - mv /tmp/exs/generate.py /bacalhau_data/generate.py - - mv /tmp/exs/metadata.sh /bacalhau_data/metadata.sh - - - - # Start services - - systemctl daemon-reload - - systemctl enable docker - - systemctl start docker - - systemctl enable healthz-web.service - - systemctl start healthz-web.service - - systemctl enable bacalhau-startup.service - - systemctl start bacalhau-startup.service - - - - -power_state: - mode: reboot - timeout: 1800 - condition: True diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/config/config-template.yaml b/edge-data-transfer-demo-v2/edge-data-spots/instance/config/config-template.yaml deleted file mode 100644 index c8580cc6..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/config/config-template.yaml +++ /dev/null @@ -1,15 +0,0 @@ -NameProvider: puuid -API: - Port: 1234 -Compute: - Enabled: true - Orchestrators: ${orchestrators_list} - Auth: - Token: ${bacalhau_token} - TLS: - RequireTLS: ${tls} - AllowListedLocalPaths: - - /bacalhau_data:rw - - /mnt/data:rw -JobAdmissionControl: - AcceptNetworkedJobs: true diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/bacalhau-startup.service b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/bacalhau-startup.service deleted file mode 100644 index b94cbee9..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/bacalhau-startup.service +++ /dev/null @@ -1,17 +0,0 @@ -[Unit] -Description=Bacalhau Startup Script -After=network-online.target -Wants=network-online.target - -[Service] -Type=simple -Environment=BACALHAU_DATA_DIR=/bacalhau_data -Environment=BACALHAU_NODE_DIR=/bacalhau_node -ExecStart=/usr/local/bin/startup.sh -Restart=on-failure -RestartSec=5 -StandardOutput=journal -StandardError=journal - -[Install] -WantedBy=multi-user.target diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/disable-network.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/disable-network.sh deleted file mode 100644 index 491be53f..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/disable-network.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Run as root" - exit 1 -fi - - - - -echo "Config iptables..." - - - -iptables -I DOCKER-USER -p tcp --dport 4222 -j DROP -iptables -I DOCKER-USER -p udp --dport 4222 -j DROP - - - -echo "Block all" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/disable-nfs.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/disable-nfs.sh deleted file mode 100644 index 53a3c19d..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/disable-nfs.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Run as root" - exit 1 -fi - - - -echo "Config iptables..." 
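-# The rules below drop NFS traffic (TCP and UDP, port 2049) on the INPUT, OUTPUT and FORWARD chains,
-# cutting the node off from the /mnt/data share; /opt/enable-nfs.sh removes the same rules with -D.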
- - - -iptables -A INPUT -p tcp --dport 2049 -j DROP -iptables -A OUTPUT -p tcp --dport 2049 -j DROP -iptables -A FORWARD -p tcp --dport 2049 -j DROP - -iptables -A INPUT -p udp --dport 2049 -j DROP -iptables -A OUTPUT -p udp --dport 2049 -j DROP -iptables -A FORWARD -p udp --dport 2049 -j DROP - - -echo "Block all" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/docker-compose.yaml b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/docker-compose.yaml deleted file mode 100644 index eb8b68ae..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/docker-compose.yaml +++ /dev/null @@ -1,32 +0,0 @@ -services: - bacalhau-node: - image: ghcr.io/bacalhau-project/bacalhau:latest-dind - privileged: true - restart: always - volumes: - - type: bind - source: /bacalhau_node/config.yaml - target: /etc/bacalhau/config.yaml - - type: bind - source: /bacalhau_node/node-info - target: /etc/node-info - - type: bind - source: /bacalhau_data - target: /bacalhau_data - - type: bind - source: /mnt/data - target: /mnt/data - healthcheck: - test: ["CMD", "curl", "-f", "localhost:1234"] - interval: 2s - timeout: 2s - retries: 1 - start_period: 40s - command: - - "serve" - - "--config" - - "/etc/bacalhau/config.yaml" - - "-c" - - "Logging.Level=info" - - "-c" - - "LABELS=${LABELS}" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/enable-network.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/enable-network.sh deleted file mode 100644 index ee147501..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/enable-network.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Start as root" - exit 1 -fi - -echo "Iptables config" - -iptables -D DOCKER-USER -p tcp --dport 4222 -j DROP -iptables -D DOCKER-USER -p udp --dport 4222 -j DROP - - -echo "Allow All Connect" - diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/enable-nfs.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/enable-nfs.sh deleted file mode 100644 index 70b5bbc8..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/enable-nfs.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Start as root" - exit 1 -fi - -echo "Iptables config" - - -iptables -D INPUT -p tcp --dport 2049 -j DROP -iptables -D OUTPUT -p tcp --dport 2049 -j DROP -iptables -D FORWARD -p tcp --dport 2049 -j DROP - -iptables -D INPUT -p udp --dport 2049 -j DROP -iptables -D OUTPUT -p udp --dport 2049 -j DROP -iptables -D FORWARD -p udp --dport 2049 -j DROP - -echo "Allow All Connect" - diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/generate.py b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/generate.py deleted file mode 100644 index e9c8db54..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/generate.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import random -import string - -# Directory to store files -output_dir = "/mnt/data" -os.makedirs(output_dir, exist_ok=True) - -# Number of files to generate -num_files = 1000 - -# File size range in bytes (change as needed) -min_size = 5000 * 1024 -max_size = 20000 * 1024 - -for i in range(num_files): - file_size = random.randint(min_size, max_size) # Random size - filename = os.path.join(output_dir, f"file_{i+1}.txt") - - # Generate random content - content = ''.join(random.choices(string.ascii_letters + string.digits, k=file_size)) - - # 
Write to file - with open(filename, "w") as f: - f.write(content) - - print(f"Generated: {filename} ({file_size} bytes)") - -print(f"\n✅ Successfully generated {num_files} random text files in '{output_dir}'") diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/healthz-web-server.py b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/healthz-web-server.py deleted file mode 100644 index 97f7dfc0..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/healthz-web-server.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "flask", -# "gunicorn", -# "flask_cors", -# ] -# /// - -import logging -import subprocess -import shutil -import os -import re -from flask import Flask, jsonify, request, abort -from flask_cors import CORS - -# Disable Flask logging -log = logging.getLogger("werkzeug") -log.disabled = True -app = Flask(__name__) -app.logger.disabled = True - -CORS(app, resources={r"/*": {"origins": "*"}}) - -CONTAINER_PREFIX = "bacalhau_node" -AUTH_TOKEN = "abrakadabra1234!@#" - - -def check_docker_health(): - try: - cmd = f"docker ps --filter name=^/{CONTAINER_PREFIX} --format '{{{{.Status}}}}'" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - - if "(healthy)" in result.stdout: - return True, "Container is healthy" - elif result.stdout.strip(): - return False, "Container is running but not healthy" - else: - return False, f"No containers found matching prefix '{CONTAINER_PREFIX}'" - except Exception as e: - return False, f"Error checking health: {str(e)}" - - -def run_shell_script(script_path): - try: - result = subprocess.run(["sudo", script_path], capture_output=True, text=True) - if result.returncode == 0: - return True, f"Script executed successfully: {script_path}" - else: - return False, f"Error executing {script_path}: {result.stderr.strip()}" - except Exception as e: - return False, f"Execution failed: {str(e)}" - - -def authenticate(): - auth_header = request.headers.get("Authorization", "") - if auth_header != f"Bearer {AUTH_TOKEN}": - abort(401, description="Unauthorized") - - -@app.route("/nfs-healthz") -def nfs_healthz(): - authenticate() - mount_point = "/mnt/data" - if not os.path.ismount(mount_point): - return jsonify({"status": "unhealthy", "message": f"{mount_point} is not mounted"}), 503 - - try: - # użycie `timeout` + `ls` na /mnt/data - result = subprocess.run( - ["timeout", "1", "ls", "-1", mount_point], - capture_output=True, - text=True, - check=False - ) - if result.returncode == 0: - return jsonify({"status": "healthy", "message": f"NFS mount {mount_point} is healthy"}), 200 - else: - return jsonify({ - "status": "unhealthy", - "message": f"NFS I/O failed or timeout. 
Exit code: {result.returncode}, stderr: {result.stderr.strip()}" - }), 503 - except Exception as e: - return jsonify({"status": "unhealthy", "message": f"Exception: {str(e)}"}), 503 - -@app.route("/healthz") -def healthz(): - is_healthy, message = check_docker_health() - response = {"status": "healthy" if is_healthy else "unhealthy", "message": message} - return jsonify(response), 200 if is_healthy else 503 - - -@app.route("/close-network", methods=["POST"]) -def close_ports(): - authenticate() - success, message = run_shell_script("/opt/disable-network.sh") - return jsonify({"status": "success" if success else "error", "message": message}) - - -@app.route("/open-network", methods=["POST"]) -def open_ports(): - authenticate() - success, message = run_shell_script("/opt/enable-network.sh") - return jsonify({"status": "success" if success else "error", "message": message}) - -@app.route("/close-nfs", methods=["POST"]) -def close_nfs(): - authenticate() - success, message = run_shell_script("/opt/disable-nfs.sh") - return jsonify({"status": "success" if success else "error", "message": message}) - - -@app.route("/open-nfs", methods=["POST"]) -def open_nfs(): - authenticate() - success, message = run_shell_script("/opt/enable-nfs.sh") - return jsonify({"status": "success" if success else "error", "message": message}) - -@app.errorhandler(404) -def all_routes(e): - return "", 404 - -def natural_sort_key(filename): - match = re.search(r'(\d+)', filename) - return int(match.group(1)) if match else float('inf') - -@app.route("/file", methods=["GET"]) -def list_files(): - authenticate() - directory = "/mnt/data" - try: - files = [ - f for f in os.listdir(directory) - if os.path.isfile(os.path.join(directory, f)) - ] - files_sorted = sorted(files, key=natural_sort_key) - return jsonify({"files": files_sorted}) - except Exception as e: - return jsonify({"status": "error", "message": str(e)}), 500 - - -@app.route("/process_file", methods=["GET"]) -def list_processed_files(): - authenticate() - directory = "/bacalhau_data/metadata" - try: - files = [ - f for f in os.listdir(directory) - if os.path.isfile(os.path.join(directory, f)) - ] - return jsonify({"files": files}) - except Exception as e: - return jsonify({"status": "error", "message": str(e)}), 500 - -@app.route("/clear-metadata", methods=["POST"]) -def clear_metadata(): - authenticate() - target_dir = "/bacalhau_data/metadata" - try: - if os.path.exists(target_dir): - for item in os.listdir(target_dir): - item_path = os.path.join(target_dir, item) - if os.path.isfile(item_path) or os.path.islink(item_path): - os.unlink(item_path) - elif os.path.isdir(item_path): - shutil.rmtree(item_path) - return jsonify({"status": "success", "message": "Metadata directory cleared."}) - else: - return jsonify({"status": "error", "message": f"Directory {target_dir} does not exist."}), 400 - except Exception as e: - return jsonify({"status": "error", "message": f"Failed to clear directory: {str(e)}"}), 500 - - - -if __name__ == "__main__": - from gunicorn.app.base import BaseApplication - - class StandaloneApplication(BaseApplication): - def __init__(self, app, options=None): - self.options = options or {} - self.application = app - super().__init__() - - def load_config(self): - for key, value in self.options.items(): - if key in self.cfg.settings and value is not None: - self.cfg.set(key.lower(), value) - - def load(self): - return self.application - - options = { - "bind": "0.0.0.0:9123", - "workers": 16, - "accesslog": None, - "errorlog": None, - 
"worker_class": "sync", - "timeout": 15, - "logger_class": "gunicorn.glogging.Logger", - "loglevel": "critical", - "disable_redirect_access_to_syslog": True, - "capture_output": False, - } - - StandaloneApplication(app, options).run() diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/healthz-web.service b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/healthz-web.service deleted file mode 100644 index 562312ae..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/healthz-web.service +++ /dev/null @@ -1,15 +0,0 @@ -[Unit] -Description=Health Check Web Server -After=network-online.target -Wants=network-online.target - -[Service] -Type=simple -Environment=PATH=/usr/local/bin:/usr/bin:/bin -ExecStart=/usr/local/bin/uv run /usr/local/bin/healthz-web-server.py -WorkingDirectory=/usr/local/bin -Restart=always -RestartSec=10 - -[Install] -WantedBy=multi-user.target diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/install-docker.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/install-docker.sh deleted file mode 100644 index 72b6db98..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/install-docker.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env bash - -set -e - -if [ -f /etc/os-release ]; then - . /etc/os-release - OS=$NAME -fi - -echo "Detected OS: $OS" -retry_command() { - local n=0 - local max=5 - local delay=15 - while true; do - "$@" && break || { - if [[ $n -lt $max ]]; then - ((n++)) - echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." - sleep $delay - else - echo "The command has failed after $n attempts." - return 1 - fi - } - done -} - -if command -v apt-get >/dev/null 2>&1; then - echo "Using apt package manager..." - - retry_command apt-get update - retry_command apt-get install -y \ - ca-certificates \ - curl \ - gnupg \ - pigz \ - jq \ - libltdl7 \ - libslirp0 \ - slirp4netns \ - apt-transport-https \ - software-properties-common - - install -m 0755 -d /etc/apt/keyrings - rm -f /etc/apt/keyrings/docker.gpg - retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg - chmod a+r /etc/apt/keyrings/docker.gpg - - echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null - - retry_command apt-get update - - retry_command apt-get install -y \ - docker-ce \ - docker-ce-cli \ - containerd.io \ - docker-buildx-plugin \ - docker-compose-plugin - -elif command -v yum >/dev/null 2>&1; then - echo "Using yum package manager..." - retry_command yum install -y docker - mkdir -p /usr/local/lib/docker/cli-plugins/ - retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose - chmod +x /usr/local/lib/docker/cli-plugins/docker-compose - -else - echo "No supported package manager found (apt-get, dnf)" - exit 1 -fi - -echo "Starting Docker service..." -systemctl start docker || { - echo "Failed to start Docker service. Waiting 10 seconds and trying again..." - sleep 10 - systemctl start docker -} - -echo "Enabling Docker service..." -systemctl enable docker || { - echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." 
- sleep 10 - systemctl enable docker -} - -echo "Verifying Docker installation..." -if command -v docker >/dev/null 2>&1; then - docker --version - docker compose version -else - echo "Docker installation verification failed" - exit 1 -fi diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/metadata.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/metadata.sh deleted file mode 100644 index 707e31a0..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/metadata.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash - -# Configuration -INPUT_DIR="/mnt/data" -OUTPUT_DIR="/bacalhau_data/metadata" -mkdir -p "$OUTPUT_DIR" - -# Bacalhau Environment Variables -PARTITION_INDEX=${BACALHAU_PARTITION_INDEX:-0} -PARTITION_COUNT=${BACALHAU_PARTITION_COUNT:-5} -NODE_ID=${BACALHAU_NODE_ID:-$(hostname)} -JOB_ID=${BACALHAU_JOB_ID:-"unknown"} -EXECUTION_ID=${BACALHAU_EXECUTION_ID:-"unknown"} -JOB_NAME=${BACALHAU_JOB_NAME:-"unknown"} -JOB_NAMESPACE=${BACALHAU_JOB_NAMESPACE:-"default"} -TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S") -PROCESSING_SLEEP=${PROCESSING_SLEEP:-2} - -# Unique files for each node -NODE_CSV_FILE="$OUTPUT_DIR/metadata_${NODE_ID}.csv" -NODE_LOG_FILE="$OUTPUT_DIR/debug_${NODE_ID}.log" - -# Function to compute file hash safely -compute_hash() { - if [[ -f "$1" ]]; then - sha256sum "$1" | awk '{print $1}' - else - echo "MISSING_FILE" - fi -} - -# Get the list of files, ensuring order is consistent across nodes -FILES=($(ls -1 "$INPUT_DIR"/*.txt 2>/dev/null | sort)) -TOTAL_FILES=${#FILES[@]} - -echo "Processing on Node: $NODE_ID | Partition Index: $PARTITION_INDEX | Total Files: $TOTAL_FILES" | tee -a "$NODE_LOG_FILE" - -# Ensure CSV file has a header if it does not exist -if [ ! -f "$NODE_CSV_FILE" ]; then - echo "file,node,partition_index,execution_id,job_id,timestamp" > "$NODE_CSV_FILE" -fi - - -for ((i=0; i "$METADATA_FILE" -{ - "file": "$FILE_NAME", - "hash": "$FILE_HASH", - "node": "$NODE_ID", - "partition": "$PARTITION_INDEX", - "execution_id": "$EXECUTION_ID", - "job_id": "$JOB_ID", - "job_name": "$JOB_NAME", - "job_namespace": "$JOB_NAMESPACE", - "timestamp": "$TIMESTAMP" -} -EOF - - echo "$FILE_NAME,$NODE_ID,$PARTITION_INDEX,$EXECUTION_ID,$JOB_ID,$TIMESTAMP" >> "$NODE_CSV_FILE" - - echo "✅ Processed: $FILE_NAME -> $METADATA_FILENAME" | tee -a "$NODE_LOG_FILE" - fi -done - -echo "✅ Metadata generation complete for Node $NODE_ID (Partition $PARTITION_INDEX)" | tee -a "$NODE_LOG_FILE" -echo "CSV metadata saved to $NODE_CSV_FILE" | tee -a "$NODE_LOG_FILE" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/startup.sh b/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/startup.sh deleted file mode 100644 index 84a39ff1..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/instance/scripts/startup.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/bash - -set -e - -BACALHAU_NODE_DIR="${BACALHAU_NODE_DIR:-/bacalhau_node}" - -get_cloud_metadata() { - cloud=$(cloud-init query cloud-name) - - if [ "${cloud}" = "gce" ]; then - echo "Detected GCP environment" - CLOUD_PROVIDER="GCP" - REGION=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/zone" | cut -d'/' -f4) - ZONE=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/zone" | cut -d'/' -f4) - PUBLIC_IP=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/access-configs/0/external-ip") - PRIVATE_IP=$(curl -s -H 
"Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/ip") - INSTANCE_ID=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/id") - INSTANCE_TYPE=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/machine-type" | cut -d'/' -f4) - PROJECT_ID=$(curl -s -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/project/project-id") - return 0 - elif [ "${cloud}" = "aws" ]; then - echo "Detected AWS environment" - CLOUD_PROVIDER="AWS" - TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") - REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region) - ZONE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone) - PUBLIC_IP=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/public-ipv4) - PRIVATE_IP=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/local-ipv4) - INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) - INSTANCE_TYPE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-type) - return 0 - elif [ "${cloud}" = "azure" ]; then - echo "Detected Azure environment" - CLOUD_PROVIDER="AZURE" - METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") - REGION=$(echo "$METADATA" | jq -r .compute.location) - ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s ip.me) - PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) - INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) - INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) - return 0 - else - echo "Could not detect cloud provider - no node info will be set" - return 0 - fi -} - -get_cloud_metadata -cat > "${BACALHAU_NODE_DIR}/node-info" << EOF -CLOUD_PROVIDER=${CLOUD_PROVIDER} -REGION=${REGION} -ZONE=${ZONE} -PUBLIC_IP=${PUBLIC_IP} -PRIVATE_IP=${PRIVATE_IP} -INSTANCE_ID=${INSTANCE_ID} -INSTANCE_TYPE=${INSTANCE_TYPE} -EOF - - -LABELS=$(awk -F= '{print $1 "=" $2}' /bacalhau_node/node-info | tr '\n' ',' | sed 's/,$//') - - -sed -i '/^LABELS=/d' /etc/environment -echo "LABELS=${LABELS}" >> /etc/environment - - -sed -i '/^export LABELS=/d' ~/.profile -echo 'export LABELS=$(grep LABELS /etc/environment | cut -d "=" -f2-)' >> ~/.profile - - -source ~/.profile - -if [ "$CLOUD_PROVIDER" = "GCP" ]; then - echo "PROJECT_ID=${PROJECT_ID}" >> "${BACALHAU_NODE_DIR}/node-info" -fi - -# shellcheck disable=SC1091 -source "${BACALHAU_NODE_DIR}/node-info" - -echo "Verifying Docker service..." -if ! systemctl is-active --quiet docker; then - echo "Docker is not running. Starting Docker..." - systemctl start docker - sleep 5 # Give Docker time to start -fi - -echo "Setting up configuration..." -if [ -f "${BACALHAU_NODE_DIR}/config.yaml" ]; then - echo "Configuration file exists at ${BACALHAU_NODE_DIR}/config.yaml" -else - echo "Error: Configuration file not found at ${BACALHAU_NODE_DIR}/config.yaml" - exit 1 -fi - -echo "Starting Docker Compose services..." -if [ -f "${BACALHAU_NODE_DIR}/docker-compose.yaml" ]; then - cd "${BACALHAU_NODE_DIR}" || exit - echo "Stopping and removing any existing containers..." 
- docker compose down - if docker ps -a | grep -q "bacalhau_node-bacalhau-node"; then - echo "Found stray containers, removing them..." - docker ps -a | grep "bacalhau_node-bacalhau-node" | awk '{print $1}' | xargs -r docker rm -f - fi - echo "Pulling latest images..." - docker compose pull - echo "Starting services..." - docker compose up -d - echo "Docker Compose started." -else - echo "Error: docker-compose.yaml not found at ${BACALHAU_NODE_DIR}/docker-compose.yaml" - exit 1 -fi - -echo "Bacalhau node setup complete in ${CLOUD_PROVIDER} region ${REGION}" -echo "Public IP: ${PUBLIC_IP}" -echo "Private IP: ${PRIVATE_IP}" - -exit 0 diff --git a/edge-data-transfer-demo-v2/edge-data-spots/job/create_matadata.yaml b/edge-data-transfer-demo-v2/edge-data-spots/job/create_matadata.yaml deleted file mode 100644 index c8d87431..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/job/create_matadata.yaml +++ /dev/null @@ -1,33 +0,0 @@ -name: File Metadata Collection Job -type: batch -count: 5 -tasks: - - name: Collect Metadata - engine: - type: docker - params: - Image: httpd:latest - Entrypoint: - - /bin/bash - Parameters: - - -c - - | - export PROCESSING_SLEEP=3 - chmod +x /bacalhau_data/metadata.sh - /bacalhau_data/metadata.sh - InputSources: - - Target: /mnt/data - Source: - Type: localdirectory - Params: - SourcePath: /mnt/data - readWrite: true - - Target: /bacalhau_data - Source: - Type: localdirectory - Params: - SourcePath: /bacalhau_data - readWrite: true - annotations: - bacalhau.io/partitioning: "true" - bacalhau.io/slice_count: "4" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/job/generate.yaml b/edge-data-transfer-demo-v2/edge-data-spots/job/generate.yaml deleted file mode 100644 index 97ff768b..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/job/generate.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: File Metadata Collection Job -type: batch -count: 1 -tasks: - - name: Collect Metadata - engine: - type: docker - params: - Image: python:latest - Entrypoint: - - /bin/bash - Parameters: - - -c - - python3 /bacalhau_data/generate.py - InputSources: - - Target: /mnt/data - Source: - Type: localdirectory - Params: - SourcePath: /mnt/data - readWrite: true - - Target: /bacalhau_data - Source: - Type: localdirectory - Params: - SourcePath: /bacalhau_data - readWrite: true - annotations: - bacalhau.io/partitioning: "true" - bacalhau.io/slice_count: "4" diff --git a/edge-data-transfer-demo-v2/edge-data-spots/ubuntu_amis.csv b/edge-data-transfer-demo-v2/edge-data-spots/ubuntu_amis.csv deleted file mode 100644 index cb260e56..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/ubuntu_amis.csv +++ /dev/null @@ -1,18 +0,0 @@ -Region,AMI ID,Instance Type,vCPUs,Memory (GiB),Spot Price ($/hr) -ap-northeast-3,ami-04e399c868c7e93bc,t3.medium,2,4.0,$0.0198 -eu-north-1,ami-0465922f90fe7a04d,c5d.large,2,4.0,$0.0243 -sa-east-1,ami-04d1bc0e151921904,m6gd.xlarge,4,16.0,$0.0457 -eu-west-3,ami-0304cc3ca822a6399,m5ad.large,2,8.0,$0.0514 -ap-southeast-1,ami-05c0c6dec1211bbe0,m5dn.large,2,8.0,$0.0570 -us-west-1,ami-0389531f2df0f6ac5,m5zn.large,2,8.0,$0.0580 -ap-south-1,ami-03cae1bb2387bab30,m6gd.xlarge,4,16.0,$0.0621 -ap-northeast-2,ami-022e9d7267d316fdd,m5zn.large,2,8.0,$0.0655 -ap-northeast-1,ami-0250df909421e8cd6,m5dn.large,2,8.0,$0.0709 -eu-central-1,ami-02d2483b6a1edec55,m6gd.xlarge,4,16.0,$0.0716 -ap-southeast-2,ami-036f7470d40c93850,m6gd.xlarge,4,16.0,$0.0741 -eu-west-2,ami-00f187893a184aa2e,m6gd.xlarge,4,16.0,$0.0746 -us-east-2,ami-014997e9e4213f4e0,m6gd.xlarge,4,16.0,$0.0766 
-ca-central-1,ami-0086af577f1f819e4,m6gd.xlarge,4,16.0,$0.0781 -us-west-2,ami-054136d4bdcfd7639,m6gd.xlarge,4,16.0,$0.0826 -eu-west-1,ami-0336482046f91c7b8,m6gd.xlarge,4,16.0,$0.0910 -us-east-1,ami-010f8f76d7f4486c8,m6gd.xlarge,4,16.0,$0.0928 diff --git a/edge-data-transfer-demo-v2/edge-data-spots/util/config.py b/edge-data-transfer-demo-v2/edge-data-spots/util/config.py deleted file mode 100644 index 444b5f4a..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/util/config.py +++ /dev/null @@ -1,129 +0,0 @@ -import csv -import logging -import os - -import yaml - -logger = logging.getLogger(__name__) - - -class Config(dict): - def __init__(self, file_path): - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Initializing Config with file_path: {file_path}") - super().__init__() - self.file_path = file_path - self._load_yaml() - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug("Config initialization completed") - - def _load_yaml(self): - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Loading YAML from {self.file_path}") - try: - with open(self.file_path, "r") as file: - config_data = yaml.safe_load(file) - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Loaded config data: {config_data}") - self.update(config_data) - except Exception as e: - logger.error( - f"Error loading config file {self.file_path}: {str(e)}", exc_info=True - ) - raise - - def get_regions(self): - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug("Getting regions from config") - regions = [list(region.keys())[0] for region in self.get("regions", [])] - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Found regions: {regions}") - return regions - - def get_total_instances(self): - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug("Getting total instances from config") - total = self.get("max_instances", 0) - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Total instances: {total}") - return total - - def get_ssh_keypair(self): - return self.get("ssh_key_name") - - def get_region_config(self, region_name): - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Getting config for region: {region_name}") - for region in self.get("regions", []): - if region_name in region: - config = region[region_name] - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Found config for {region_name}: {config}") - return config - logger.warning(f"No config found for region: {region_name}") - return None - - def get_amis_file_path(self): - parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - return os.path.join(parent_dir, "ubuntu_amis.csv") - - def get_image_for_region(self, region_name): - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Getting image for region: {region_name}") - region_config = self.get_region_config(region_name) - if not region_config: - logger.error(f"Region '{region_name}' not found in config") - raise ValueError(f"Region '{region_name}' not found in config.") - - ami_value = region_config.get("image") - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"AMI value from config: {ami_value}") - - if ami_value != "auto": - return ami_value - - amis_file = self.get_amis_file_path() - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Looking up AMI in file: {amis_file}") - - if not os.path.exists(amis_file): - logger.error(f"AMI file '{amis_file}' not found") - raise FileNotFoundError(f"AMI file 
'{amis_file}' not found.") - - try: - with open(amis_file, mode="r") as file: - reader = csv.DictReader(file) - for row in reader: - if row["Region"] == region_name: - ami_id = row["AMI ID"] - if logger.getEffectiveLevel() <= logging.DEBUG: - logger.debug(f"Found AMI for {region_name}: {ami_id}") - return ami_id - - logger.error(f"No AMI found for region '{region_name}' in '{amis_file}'") - raise ValueError( - f"No AMI found for region '{region_name}' in '{amis_file}'." - ) - except Exception as e: - logger.error(f"Error reading AMI file: {str(e)}", exc_info=True) - raise - - def get_orchestrators(self): - return self.get("compute_orchestrators", []) - - def get_token(self): - return self.get("compute_auth_token") - - def get_tls(self): - return self.get("compute_tls", False) - - def get_public_ssh_key_path(self): - path = self.get("public_ssh_key_path", "") - return os.path.expanduser(path) if path else "" - - def get_private_ssh_key_path(self): - path = self.get("private_ssh_key_path", "") - return os.path.expanduser(path) if path else "" - - def get_username(self): - return self.get("username", "bacalhau-runner") diff --git a/edge-data-transfer-demo-v2/edge-data-spots/util/get_available_regions.py b/edge-data-transfer-demo-v2/edge-data-spots/util/get_available_regions.py deleted file mode 100644 index 28a62239..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/util/get_available_regions.py +++ /dev/null @@ -1,283 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "boto3", -# "botocore", -# ] -# /// - -import argparse -import json -import os -from concurrent.futures import ThreadPoolExecutor, as_completed - -import boto3 - -# Minimum requirements for running Docker and one small Python container -# These are minimal requirements - Docker needs about 2GB and a small Python container ~512MB -MIN_VCPU = 2 -MIN_MEMORY_GIB = 2 - -# Instance families that are good candidates for small workloads -PREFERRED_INSTANCE_FAMILIES = [ - "t3", - "t3a", - "t4g", # Burstable instances - good for intermittent workloads - "t2", # Older burstable instances - "a1", # ARM-based instances - can be cheaper - "m6g", - "m5", - "m5a", # General purpose instances -] - - -def check_region_spot_availability(region): - """ - Check if a region has spot instances available that meet our requirements - """ - try: - # Create EC2 client for the region - ec2_client = boto3.client("ec2", region_name=region) - - # Get available instance types in the region - response = ec2_client.describe_instance_types() - instance_types = response["InstanceTypes"] - - # Filter for instance types that meet our minimum requirements - suitable_instances = [] - for instance in instance_types: - instance_type = instance.get("InstanceType", "") - - # Check if instance meets minimum requirements - if ( - instance.get("VCpuInfo", {}).get("DefaultVCpus", 0) >= MIN_VCPU - and instance.get("MemoryInfo", {}).get("SizeInMiB", 0) / 1024 - >= MIN_MEMORY_GIB - ): - # Calculate a "size score" - lower is better (smaller instance) - vcpus = instance.get("VCpuInfo", {}).get("DefaultVCpus", 0) - memory_gib = instance.get("MemoryInfo", {}).get("SizeInMiB", 0) / 1024 - size_score = vcpus * 10 + memory_gib - - # Check if it's in our preferred families - is_preferred = any( - instance_type.startswith(family) - for family in PREFERRED_INSTANCE_FAMILIES - ) - - suitable_instances.append( - { - "instance_type": instance_type, - "vcpus": vcpus, - "memory_gib": memory_gib, - "size_score": 
size_score, - "is_preferred": is_preferred, - } - ) - - if not suitable_instances: - return {"region": region, "available": False} - - # Sort by preference first, then by size score (smallest first) - suitable_instances.sort( - key=lambda x: (0 if x["is_preferred"] else 1, x["size_score"]) - ) - - # Check spot pricing and availability for suitable instances - available_instances = [] - for instance_info in suitable_instances[ - :10 - ]: # Check first 10 suitable instances - instance_type = instance_info["instance_type"] - try: - # Check spot price history - spot_response = ec2_client.describe_spot_price_history( - InstanceTypes=[instance_type], - ProductDescriptions=["Linux/UNIX"], - MaxResults=1, - ) - - # If we got a price, the instance type is available for spot - if spot_response.get("SpotPriceHistory"): - spot_price = float( - spot_response["SpotPriceHistory"][0]["SpotPrice"] - ) - print( - f"Region {region} has spot availability for {instance_type} - " - f"{instance_info['vcpus']} vCPUs, {instance_info['memory_gib']:.1f} GiB RAM, " - f"${spot_price:.4f}/hr" - ) - - available_instances.append( - { - "instance_type": instance_type, - "vcpus": instance_info["vcpus"], - "memory_gib": round(instance_info["memory_gib"], 1), - "spot_price": spot_price, - } - ) - except Exception as e: - continue - - if available_instances: - # Sort available instances by price - available_instances.sort(key=lambda x: x["spot_price"]) - return { - "region": region, - "available": True, - "instances": available_instances, - "cheapest_instance": available_instances[0], - } - else: - return {"region": region, "available": False} - except Exception as e: - print(f"Error checking region {region}: {str(e)}") - return {"region": region, "available": False, "error": str(e)} - - -def get_all_aws_regions(): - """ - Get a list of all AWS regions - """ - ec2 = boto3.client("ec2", region_name="us-east-1") - regions = [region["RegionName"] for region in ec2.describe_regions()["Regions"]] - return regions - - -def main(): - # Parse command line arguments - parser = argparse.ArgumentParser( - description="Find AWS regions with suitable spot instances for Docker and containers" - ) - parser.add_argument( - "--show-all", - action="store_true", - help="Show all available regions, not just the top 5", - ) - parser.add_argument( - "--max-workers", - type=int, - default=10, - help="Maximum number of parallel workers (default: 10)", - ) - args = parser.parse_args() - - # Get all AWS regions - all_regions = get_all_aws_regions() - print(f"Checking {len(all_regions)} AWS regions for spot availability...") - print( - f"Looking for instances with at least {MIN_VCPU} vCPUs and {MIN_MEMORY_GIB} GiB RAM" - ) - - # Results will store detailed information about each region - results = [] - - # Check each region in parallel - with ThreadPoolExecutor(max_workers=args.max_workers) as executor: - future_to_region = { - executor.submit(check_region_spot_availability, region): region - for region in all_regions - } - - for future in as_completed(future_to_region): - region_result = future.result() - results.append(region_result) - - # Filter for available regions - available_regions = [r for r in results if r.get("available", False)] - - # Sort available regions by cheapest instance price - available_regions.sort(key=lambda x: x["cheapest_instance"]["spot_price"]) - - # Create a list of just the region names for backward compatibility - region_names = [r["region"] for r in available_regions] - - # Save the results to JSON - parent_dir = 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - json_path = os.path.join(parent_dir, "available_regions.json") - - output_data = { - "available_regions": region_names, - "region_details": {r["region"]: r for r in available_regions}, - "all_regions_checked": len(all_regions), - "available_regions_count": len(available_regions), - "min_requirements": {"vcpu": MIN_VCPU, "memory_gib": MIN_MEMORY_GIB}, - "timestamp": import_time.strftime( - "%Y-%m-%d %H:%M:%S UTC", import_time.gmtime() - ), - } - - with open(json_path, "w") as f: - json.dump(output_data, f, indent=2) - - print( - f"\nFound {len(available_regions)} regions with suitable spot instances out of {len(all_regions)} total regions" - ) - print(f"Available regions saved to: {json_path}") - - # For backward compatibility, also create the Python module - output_path = os.path.join(parent_dir, "available_regions.py") - with open(output_path, "w") as f: - f.write( - "# AWS regions with spot instances suitable for Docker and containers\n" - ) - f.write("# This file is auto-generated by get_available_regions.py\n\n") - f.write("AVAILABLE_REGIONS = [\n") - for region in sorted(region_names): - f.write(f' "{region}",\n') - f.write("]\n\n") - - # Add detailed information about each region - f.write( - "# Detailed information about each region's smallest suitable instance\n" - ) - f.write("REGION_DETAILS = {\n") - for region_data in available_regions: - region = region_data["region"] - instance = region_data["cheapest_instance"] - f.write(f' "{region}": {{\n') - f.write(f' "instance_type": "{instance["instance_type"]}",\n') - f.write(f' "vcpus": {instance["vcpus"]},\n') - f.write(f' "memory_gib": {instance["memory_gib"]},\n') - f.write(f' "spot_price": {instance["spot_price"]:.6f},\n') - f.write(f" }},\n") - f.write("}\n") - - print(f"Python module also saved to: {output_path}") - - # Print a summary of the available regions - if args.show_all: - print( - f"\nAll {len(available_regions)} available regions for running Docker with a small Python container:" - ) - for i, region_data in enumerate(available_regions, 1): - region = region_data["region"] - instance = region_data["cheapest_instance"] - print( - f"{i}. {region} - {instance['instance_type']} - " - f"{instance['vcpus']} vCPUs, {instance['memory_gib']} GiB RAM, " - f"${instance['spot_price']:.4f}/hr" - ) - else: - # Just show the top 5 by default - display_count = min(5, len(available_regions)) - print( - f"\nTop {display_count} cheapest regions for running Docker with a small Python container:" - ) - print(f"(Use --show-all to see all {len(available_regions)} available regions)") - for i, region_data in enumerate(available_regions[:display_count], 1): - region = region_data["region"] - instance = region_data["cheapest_instance"] - print( - f"{i}. 
{region} - {instance['instance_type']} - " - f"{instance['vcpus']} vCPUs, {instance['memory_gib']} GiB RAM, " - f"${instance['spot_price']:.4f}/hr" - ) - - -if __name__ == "__main__": - import time as import_time - - main() diff --git a/edge-data-transfer-demo-v2/edge-data-spots/util/get_ubuntu_amis.py b/edge-data-transfer-demo-v2/edge-data-spots/util/get_ubuntu_amis.py deleted file mode 100644 index db2ee44a..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/util/get_ubuntu_amis.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "boto3", -# "botocore", -# ] -# /// - -import csv -import json -import os -import sys - -import boto3 - -# Dictionary to store AMI IDs by region -UBUNTU_AMIS = {} - - -def get_latest_ubuntu_ami(region): - """ - Get the latest Ubuntu 22.04 LTS AMI ID in a region - """ - try: - client = boto3.client("ec2", region_name=region) - response = client.describe_images( - Owners=["099720109477"], # Canonical's AWS account ID - Filters=[ - { - "Name": "name", - "Values": [ - "ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-arm64-server-*" - ], - }, - {"Name": "architecture", "Values": ["arm64"]}, - {"Name": "root-device-type", "Values": ["ebs"]}, - {"Name": "virtualization-type", "Values": ["hvm"]}, - ], - MaxResults=1000, # Ensure we get a sufficient number of results - ) - # Sort images by creation date - images = sorted( - response["Images"], key=lambda x: x["CreationDate"], reverse=True - ) - if not images: - print(f"Warning: No Ubuntu 22.04 LTS AMIs found in region {region}") - return None - return images[0]["ImageId"] - except Exception as e: - print(f"Error getting AMI for region {region}: {str(e)}") - return None - - -def main(): - # Get the parent directory - parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - json_path = os.path.join(parent_dir, "available_regions.json") - - # Check if the JSON file exists - if not os.path.exists(json_path): - print( - f"Error: {json_path} not found. Please run get_available_regions.py first." 
- ) - sys.exit(1) - - # Load the JSON file - try: - with open(json_path, "r") as f: - data = json.load(f) - - # Get the list of available regions - regions = data.get("available_regions", []) - - if not regions: - print("No available regions found in the JSON file.") - sys.exit(1) - - print(f"Found {len(regions)} available regions in {json_path}") - - # Get region details for additional information - region_details = data.get("region_details", {}) - - except Exception as e: - print(f"Error reading {json_path}: {str(e)}") - print("Falling back to default regions...") - regions = [ - "us-west-2", - "us-east-1", - "eu-central-1", - "eu-west-1", - "eu-west-2", - "ap-southeast-1", - "sa-east-1", - "ap-northeast-1", - "ap-southeast-2", - "ca-central-1", - ] - - # Loop through each region and get the AMI ID - print(f"Getting Ubuntu AMIs for {len(regions)} regions...") - for region in regions: - ami_id = get_latest_ubuntu_ami(region) - if ami_id: - UBUNTU_AMIS[region] = ami_id - print(f"Found AMI {ami_id} for region {region}") - - # Create the CSV file - csv_path = os.path.join(parent_dir, "ubuntu_amis.csv") - with open(csv_path, mode="w", newline="") as file: - writer = csv.writer(file) - - # Write header with additional information if available - if region_details: - writer.writerow( - [ - "Region", - "AMI ID", - "Instance Type", - "vCPUs", - "Memory (GiB)", - "Spot Price ($/hr)", - ] - ) - - # Write data with instance details - for region, ami_id in UBUNTU_AMIS.items(): - details = region_details.get(region, {}) - instance = ( - details.get("cheapest_instance", {}) - if details.get("available", False) - else {} - ) - - if instance: - writer.writerow( - [ - region, - ami_id, - instance.get("instance_type", ""), - instance.get("vcpus", ""), - instance.get("memory_gib", ""), - f"${instance.get('spot_price', 0):.4f}", - ] - ) - else: - writer.writerow([region, ami_id, "", "", "", ""]) - else: - # Simple format if no instance details are available - writer.writerow(["Region", "AMI ID"]) - for region, ami_id in UBUNTU_AMIS.items(): - writer.writerow([region, ami_id]) - - print(f"Ubuntu AMIs CSV saved at: {csv_path}") - print(f"Found AMIs for {len(UBUNTU_AMIS)} out of {len(regions)} regions") - - -if __name__ == "__main__": - main() diff --git a/edge-data-transfer-demo-v2/edge-data-spots/util/scripts_provider.py b/edge-data-transfer-demo-v2/edge-data-spots/util/scripts_provider.py deleted file mode 100644 index 985fa744..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/util/scripts_provider.py +++ /dev/null @@ -1,107 +0,0 @@ -import base64 -import io -import os -import tarfile - -from util.config import Config - -class ScriptsProvider: - def __init__(self, config: Config): - super().__init__() - self.config = config - - @staticmethod - def _file_path(*path_parts): - return os.path.join(os.path.dirname(__file__), '..', 'instance', *path_parts) - - @staticmethod - def get_ssh_public_key(file_path): - """Read and validate a public SSH key from the given file path. - - Args: - file_path: Path to the public SSH key file - - Returns: - The SSH public key content as a string - """ - # Handle empty path - if not file_path: - return "" - - # Expand any tilde in file path - expanded_path = os.path.expanduser(file_path) - - # Read and validate key - try: - with open(expanded_path, "r") as file: - content = file.read().strip() - - # Basic validation - public keys should start with ssh-rsa, ssh-ed25519, etc. 
- if not (content.startswith('ssh-rsa') or - content.startswith('ssh-ed25519') or - content.startswith('ssh-dss') or - content.startswith('ecdsa-sha2')): - raise ValueError(f"Invalid SSH public key format in {file_path}") - - return content - - except FileNotFoundError: - print(f"Warning: SSH public key file not found at {expanded_path}") - return "" - except Exception as e: - print(f"Error reading SSH public key: {str(e)}") - return "" - - @staticmethod - def encode_file_to_base64(file_path): - with open(file_path, "rb") as file: - encoded_content = base64.b64encode(file.read()).decode("utf-8") - return encoded_content - - def create_bacalhau_config(self): - values = { - "bacalhau_token": self.config.get_token(), - "tls": "true" if self.config.get_tls() else "false" - } - with open(self._file_path("config", "config-template.yaml"), "r") as file: - bacalhau_config = file.read() - - for key, value in values.items(): - bacalhau_config = bacalhau_config.replace(f"${{{key}}}", value) - - bacalhau_config = bacalhau_config.replace("${orchestrators_list}", - "\n - ".join(self.config.get_orchestrators())) - return base64.b64encode(bacalhau_config.encode()).decode("utf-8") - - def tar_and_encode_scripts(self): - memory_file = io.BytesIO() - script_dir = self._file_path("scripts") - with tarfile.open(fileobj=memory_file, mode="w:gz") as tar: - for script_file in sorted(os.listdir(script_dir)): - script_path = os.path.join(script_dir, script_file) - tar.add(script_path, arcname=script_file) - - memory_file.seek(0) - return base64.b64encode(memory_file.getvalue()).decode() - - def create_cloud_init_script(self, efs_mount_ip=""): - # Get public SSH key - handle properly without base64 encoding - ssh_public_key = self.get_ssh_public_key(self.config.get_public_ssh_key_path()) - - values = { - "compressed_scripts": self.tar_and_encode_scripts(), - "username": self.config.get_username(), - "public_ssh_key": ssh_public_key, # No longer needs base64 encoding - "bacalhau_data_dir": "/bacalhau_data", - "bacalhau_node_dir": "/bacalhau_node", - "bacalhau_config_file": self.create_bacalhau_config(), - "efs_mount_ip": efs_mount_ip or "", - } - - with open(self._file_path("cloud-init", "init-vm-template.yml"), "r") as file: - cloud_init_script = file.read() - - for key, value in values.items(): - cloud_init_script = cloud_init_script.replace(f"${{{key}}}", value) - - return cloud_init_script diff --git a/edge-data-transfer-demo-v2/edge-data-spots/util/update_config_with_regions.py b/edge-data-transfer-demo-v2/edge-data-spots/util/update_config_with_regions.py deleted file mode 100644 index 99e125d1..00000000 --- a/edge-data-transfer-demo-v2/edge-data-spots/util/update_config_with_regions.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# /// script -# requires-python = ">=3.11" -# dependencies = [ -# "pyyaml" -# ] -# /// -import json -import os -import sys - -import yaml - - -def main(): - # Get the parent directory - parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - json_path = os.path.join(parent_dir, "available_regions.json") - config_path = os.path.join(parent_dir, "config.yaml") - - # Check if the JSON file exists - if not os.path.exists(json_path): - print( - f"Error: {json_path} not found. Please run get_available_regions.py first." 
- ) - sys.exit(1) - - # Check if the config file exists - if not os.path.exists(config_path): - print(f"Error: {config_path} not found.") - sys.exit(1) - - # Load the JSON file - try: - with open(json_path, "r") as f: - data = json.load(f) - - # Get the list of available regions - regions = data.get("available_regions", []) - - if not regions: - print("No available regions found in the JSON file.") - sys.exit(1) - - print(f"Found {len(regions)} available regions in {json_path}") - - # Get region details for additional information - region_details = data.get("region_details", {}) - - except Exception as e: - print(f"Error reading {json_path}: {str(e)}") - sys.exit(1) - - # Load the config file - try: - with open(config_path, "r") as f: - config = yaml.safe_load(f) - - if config is None: - config = {} - - print(f"Loaded configuration from {config_path}") - - except Exception as e: - print(f"Error reading {config_path}: {str(e)}") - sys.exit(1) - - # Check if regions key exists - if "regions" not in config: - config["regions"] = [] - - # Get existing regions to avoid duplicates - existing_regions = set() - for region_entry in config["regions"]: - if isinstance(region_entry, dict): - existing_regions.update(region_entry.keys()) - - # Count how many regions we'll add - new_regions = [r for r in regions if r not in existing_regions] - print(f"Adding {len(new_regions)} new regions to config.yaml") - - # Get the default machine type from existing regions or use t3.small - default_machine_type = "t3.small" # Default - - # Try to find a better default from existing config - if config["regions"]: - for region_entry in config["regions"]: - if isinstance(region_entry, dict): - for region_name, region_config in region_entry.items(): - if "machine_type" in region_config: - default_machine_type = region_config["machine_type"] - break - - # Add new regions to the config - for region in new_regions: - # Get the recommended instance type from region_details if available - recommended_instance = None - if region in region_details and region_details[region].get("available", False): - recommended_instance = ( - region_details[region].get("cheapest_instance", {}).get("instance_type") - ) - - machine_type = ( - recommended_instance if recommended_instance else default_machine_type - ) - - config["regions"].append( - { - region: { - "image": "auto", - "machine_type": machine_type, - "node_count": "auto", - } - } - ) - - # Save the updated config - try: - # Create a backup of the original config - backup_path = f"{config_path}.bak" - with open(backup_path, "w") as f: - yaml.dump(config, f, default_flow_style=False) - print(f"Created backup of original config at {backup_path}") - - # Write the updated config - with open(config_path, "w") as f: - yaml.dump(config, f, default_flow_style=False) - - print(f"Updated {config_path} with {len(new_regions)} new regions") - print(f"Total regions in config: {len(config['regions'])}") - - except Exception as e: - print(f"Error writing to {config_path}: {str(e)}") - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/edge-data-transfer-demo/.envrc b/edge-data-transfer-demo/.envrc deleted file mode 100644 index e69de29b..00000000 diff --git a/edge-data-transfer-demo/Readme.md b/edge-data-transfer-demo/Readme.md deleted file mode 100644 index f8f2f375..00000000 --- a/edge-data-transfer-demo/Readme.md +++ /dev/null @@ -1,105 +0,0 @@ -# Edge Data Transfer Demo - -## Requirements - -- [Docker](https://www.docker.com/get-started) (version 20.10 or higher) - -## 
Configuration
-
-# Building and Running
-
-## 1. Start the application
-
-Build the images and start the containers:
-
-```bash
-docker-compose up -d
-```
-
-Once the application is running, open your browser and navigate to:
-
-```
-http://localhost:3000
-```
-
-## 2. Verify Instance Registration
-
-One of the containers is the client container. You can use it to check whether the nodes have registered correctly with Bacalhau:
-
-```bash
-docker exec -ti edge-data-transfer-client-1 /bin/bash
-```
-
-Once you are in the client container, you can list the nodes:
-```bash
-bacalhau node list
-```
-
-If you see 5 nodes connected, your nodes are registered correctly.
-
-
----
-
-**TODO**
-- Change the generate job to write to /mnt/data
-- Change the process_metadata job to read from /mnt/data
-- Update the UI to "fake" disabling access to the NFS volume (it's not NFS anymore, but we can pretend it is)
-- Update the UI to "fake" disabling access to the Bacalhau server
-
-
------
-
-(Didn't change anything below here)
-
----
-
-## 3. Verify NFS Mount
-
-SSH into one of the Spot instances.
-
-The private SSH key for the machines is located at /root/.ssh/id_rsa in the backend Docker container.
-
-By default, the user on the nodes is 'bacalhau-runner'. The public IP address of a node can be seen in its labels when listing Bacalhau nodes.
-
-```bash
-ssh bacalhau-runner@
-```
-and verify that the NFS volume is mounted:
-```bash
-df -h
-```
-
-You should see `/mnt/data` listed in the output.
-
----
-
-## 7. Generate Test Data
-
-Submit a job to generate random test files. Job definitions are available in the /backend/job directory:
-```bash
-bacalhau job run generate.yaml
-```
-
-> **Warning:** This job can take up to 40 minutes. After about 5 minutes you'll see a timeout while tracking the job execution, but the job itself keeps running on the network.
-
----
-
-## 8. Run Metadata Generation Job
-
-Submit the main processing job to generate metadata:
-
-```bash
-bacalhau job run process_metadata.yaml
-```
-
-## 9. Cleanup
-
-After the demo, you can destroy the EC2 instances by running
-
-```bash
-uv run -s ./deploy_spot.py destroy
-```
-in the backend container.
-
-You should also delete the data from the EFS share.
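As a companion to the "Verify Instance Registration" step in the Readme above, here is a minimal Python sketch that shells out to the same `bacalhau node list --output json` call the dashboard's `/api/nodes` route uses and counts the connected compute nodes. It is an illustrative sketch, not part of the demo: the field names (`Info.NodeType`, `Info.NodeID`, `Info.Labels.PUBLIC_IP`, `ConnectionState.Status`) are taken from the dashboard components in this diff, the expected count of 5 matches the `backend` replicas in `docker-compose.yml`, and the script name is hypothetical.

```python
#!/usr/bin/env python3
"""Sketch: verify that the expected compute nodes have registered with Bacalhau.

Assumes it runs where the Bacalhau CLI is configured (e.g. the client container,
which already has BACALHAU_API_HOST set). Field names mirror what the dashboard
components read from `bacalhau node list --output json`.
"""
import json
import subprocess

EXPECTED_COMPUTE_NODES = 5  # docker-compose.yml starts 5 backend replicas


def list_nodes() -> list[dict]:
    # Same CLI call the dashboard's /api/nodes route shells out to.
    result = subprocess.run(
        ["bacalhau", "node", "list", "--output", "json"],
        check=True,
        capture_output=True,
        text=True,
    )
    return json.loads(result.stdout)


def main() -> None:
    nodes = list_nodes()
    # The dashboard lowercases ConnectionState.Status before comparing to "connected".
    compute = [
        n
        for n in nodes
        if n.get("Info", {}).get("NodeType") != "Requester"
        and str(n.get("ConnectionState", {}).get("Status", "")).lower() == "connected"
    ]
    print(
        f"{len(compute)} of {EXPECTED_COMPUTE_NODES} expected compute nodes are connected"
    )
    for n in compute:
        labels = n.get("Info", {}).get("Labels", {}) or {}
        node_id = n.get("Info", {}).get("NodeID", "unknown")
        print(f"  - {node_id} public_ip={labels.get('PUBLIC_IP', 'n/a')}")


if __name__ == "__main__":
    main()
```

Run it inside the client container (for example as `python3 check_nodes.py`, a hypothetical file name); if fewer than 5 compute nodes show up, fall back to the manual `bacalhau node list` check described above.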
diff --git a/edge-data-transfer-demo/bacalhau-config/compute.yaml b/edge-data-transfer-demo/bacalhau-config/compute.yaml deleted file mode 100644 index 84603512..00000000 --- a/edge-data-transfer-demo/bacalhau-config/compute.yaml +++ /dev/null @@ -1,7 +0,0 @@ -NameProvider: hostname -Compute: - Enabled: true - Orchestrators: - - orchestrator:4222 -UpdateConfig: - Interval: 0 diff --git a/edge-data-transfer-demo/bacalhau-config/orchestrator.yaml b/edge-data-transfer-demo/bacalhau-config/orchestrator.yaml deleted file mode 100644 index 6353dc94..00000000 --- a/edge-data-transfer-demo/bacalhau-config/orchestrator.yaml +++ /dev/null @@ -1,27 +0,0 @@ -Orchestrator: - Enabled: true -API: - Port: 1234 -WebUI: - Enabled: true -UpdateConfig: - Interval: 0 -JobDefaults: - Batch: - Task: - Publisher: - Type: s3 - Params: - Bucket: "my-bucket" - Key: jobs/{jobID}/{executionID} - Endpoint: "http://storage:9000" - Region: "storage-region" - Ops: - Task: - Publisher: - Type: s3 - Params: - Bucket: "my-bucket" - Key: jobs/{jobID}/{executionID} - Endpoint: "http://storage:9000" - Region: "storage-region" diff --git a/edge-data-transfer-demo/data/fooz.txt b/edge-data-transfer-demo/data/fooz.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/edge-data-transfer-demo/docker-compose.yml b/edge-data-transfer-demo/docker-compose.yml deleted file mode 100644 index d35f194d..00000000 --- a/edge-data-transfer-demo/docker-compose.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: edge-data-transfer - -x-common-env-variables: &common-env-variables - MINIO_ROOT_USER: "minioadmin" - MINIO_ROOT_PASSWORD: "minioadmin" - AWS_ACCESS_KEY_ID: "minioadmin" - AWS_SECRET_ACCESS_KEY: "minioadmin" - BACALHAU_DISABLEANALYTICS: true - -services: - orchestrator: - image: ghcr.io/bacalhau-project/bacalhau:v1.7.0 - container_name: orchestrator - command: serve -c /etc/bacalhau/config.yaml --name orchestrator - environment: *common-env-variables - ports: - - "8438:8438" - - "1234:1234" - - "4222:4222" - volumes: - - ./bacalhau-config/orchestrator.yaml:/etc/bacalhau/config.yaml - healthcheck: - test: ["CMD", "bacalhau", "agent", "alive"] - interval: 5s - timeout: 5s - retries: 12 - start_period: 10s - - storage: - image: quay.io/minio/minio - container_name: storage - entrypoint: sh - command: -c 'mkdir -p /data/my-bucket && minio server /data --console-address ":9001"' - environment: *common-env-variables - ports: - - "9000:9000" - - "9001:9001" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] - interval: 5s - timeout: 5s - retries: 10 - start_period: 10s - - backend: - image: ghcr.io/bacalhau-project/edge-data-node:202504071804 - command: serve -c /etc/bacalhau/config.yaml - volumes: - - ./bacalhau-config/compute.yaml:/etc/bacalhau/config.yaml - - ./data:/mnt/data - environment: *common-env-variables - depends_on: - orchestrator: - condition: service_healthy - storage: - condition: service_healthy - deploy: - replicas: 5 - privileged: true - restart: on-failure - - frontend: - image: ghcr.io/bacalhau-project/edge-data-dashboard:202504071534 - container_name: frontend - entrypoint: ["/entrypoint.sh"] - environment: - <<: *common-env-variables - BACALHAU_API_HOST: orchestrator - ports: - - "3000:3000" - depends_on: - orchestrator: - condition: service_healthy - - client: - image: ghcr.io/bacalhau-project/bacalhau:v1.7.0 - entrypoint: /bin/sh - stdin_open: true - tty: true - stop_signal: SIGTERM - stop_grace_period: 3s - environment: - <<: *common-env-variables - BACALHAU_API_HOST: 
orchestrator - depends_on: - - orchestrator diff --git a/edge-data-transfer-demo/edge-data-spots/.cspell/custom-dictionary.txt b/edge-data-transfer-demo/edge-data-spots/.cspell/custom-dictionary.txt deleted file mode 100644 index f08f63f3..00000000 --- a/edge-data-transfer-demo/edge-data-spots/.cspell/custom-dictionary.txt +++ /dev/null @@ -1,2 +0,0 @@ -bacalhau -levelname diff --git a/edge-data-transfer-demo/edge-data-spots/.dockerignore b/edge-data-transfer-demo/edge-data-spots/.dockerignore deleted file mode 100644 index 3f2cd847..00000000 --- a/edge-data-transfer-demo/edge-data-spots/.dockerignore +++ /dev/null @@ -1,5 +0,0 @@ -# Exclude everything except what's needed -* -!scripts/ -!Dockerfile -!build_container.sh \ No newline at end of file diff --git a/edge-data-transfer-demo/edge-data-spots/.gitignore b/edge-data-transfer-demo/edge-data-spots/.gitignore deleted file mode 100644 index 16a84abb..00000000 --- a/edge-data-transfer-demo/edge-data-spots/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -.env -.idea -.pem -config.yaml diff --git a/edge-data-transfer-demo/edge-data-spots/Dockerfile b/edge-data-transfer-demo/edge-data-spots/Dockerfile deleted file mode 100644 index 9c5fc29b..00000000 --- a/edge-data-transfer-demo/edge-data-spots/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM ghcr.io/bacalhau-project/bacalhau:v1.7.0-dind -WORKDIR /backend - -ARG BUILD_DATE -LABEL org.opencontainers.image.created="${BUILD_DATE}" - -RUN apk update && apk add --no-cache \ - python3 \ - py3-pip \ - curl \ - unzip \ - mc \ - vim \ - nano \ - openssh-client - -RUN curl -LsSf https://astral.sh/uv/install.sh | sh - -COPY scripts/ ./ - -RUN ssh-keygen -t rsa -b 4096 -N "" -f /root/.ssh/id_rsa diff --git a/edge-data-transfer-demo/edge-data-spots/README.md b/edge-data-transfer-demo/edge-data-spots/README.md deleted file mode 100644 index 82e5a5ad..00000000 --- a/edge-data-transfer-demo/edge-data-spots/README.md +++ /dev/null @@ -1,185 +0,0 @@ -# AWS Spot Instance Region Finder - -This repository contains scripts to help set up a Bacalhau cluster on AWS spot instances by finding the most cost-effective regions and instance types. - -## Scripts - -### 1. Region Availability Checker (`util/get_available_regions.py`) - -This script checks all AWS regions to find those that have spot instances available that meet the minimum requirements for running Docker and one small Python container: -- At least 1 vCPU -- At least 2 GiB of memory - -The script: -1. Queries all AWS regions (not just a subset) -2. Checks each region for instance types that meet the minimum requirements -3. Prioritizes smaller, cost-effective instance types (t3, t3a, t4g, t2, a1, m6g, m5, m5a families) -4. Verifies spot instance availability and pricing for suitable instance types -5. Outputs the results to: - - `available_regions.json` - Comprehensive JSON file with detailed region and instance information - - `available_regions.py` - Python importable format (for backward compatibility) -6. Displays a summary of the top 5 cheapest regions by default (with an option to show all) - -#### Command-line Options - -``` -usage: get_available_regions.py [-h] [--show-all] [--max-workers MAX_WORKERS] - -Find AWS regions with suitable spot instances for Docker and containers - -options: - -h, --help show this help message and exit - --show-all Show all available regions, not just the top 5 - --max-workers MAX_WORKERS - Maximum number of parallel workers (default: 10) -``` - -### 2. 
Ubuntu AMI Finder (`util/get_ubuntu_amis.py`) - -This script finds the latest Ubuntu 22.04 LTS AMI IDs for each available region: -1. Reads the list of available regions from `available_regions.json` (created by the first script) -2. Queries AWS for the latest Ubuntu 22.04 LTS AMI in each region -3. Outputs the results to `ubuntu_amis.csv` with detailed instance information including: - - Region - - AMI ID - - Instance Type - - vCPUs - - Memory (GiB) - - Spot Price ($/hr) - -### 3. Config Updater (`util/update_config_with_regions.py`) - -This script updates your Bacalhau cluster configuration with the available regions: -1. Reads the list of available regions from `available_regions.json` -2. Loads your existing `config.yaml` file -3. Adds all new regions that aren't already in your configuration -4. Uses recommended instance types from the region details when available -5. Creates a backup of your original configuration -6. Saves the updated configuration with all available regions - -## Workflow - -The scripts are designed to work together in sequence: - -1. First, run `get_available_regions.py` to find regions with suitable spot instances -2. Then, run `get_ubuntu_amis.py` to get the latest Ubuntu AMIs for those regions -3. Finally, run `update_config_with_regions.py` to update your Bacalhau configuration - -This approach ensures you're only looking for AMIs in regions that have suitable spot instances available, and that your configuration includes all viable regions. - -## Usage - -### Prerequisites - -1. AWS CLI configured with appropriate credentials -2. Python 3.6+ with required packages - -You can run these scripts in two ways: - -#### Option 1: Using uv (recommended) - -The scripts include dependency metadata for use with [uv](https://github.com/astral-sh/uv), which will automatically install required dependencies: - -```bash -# Install uv if you don't have it -pip install uv - -# Run scripts directly with uv -uv run -s util/get_available_regions.py -uv run -s util/get_ubuntu_amis.py -uv run -s util/update_config_with_regions.py - -# To see all available regions, not just the top 5 -uv run -s util/get_available_regions.py --show-all -``` - -#### Option 2: Using pip - -```bash -# Install dependencies manually -pip install boto3 botocore argparse pyyaml - -# Run scripts -python util/get_available_regions.py -python util/get_ubuntu_amis.py -python util/update_config_with_regions.py - -# To see all available regions, not just the top 5 -python util/get_available_regions.py --show-all -``` - -### Step 1: Find Available Regions with Smallest Suitable Instances - -```bash -uv run -s util/get_available_regions.py -``` - -This will create: -- `available_regions.json` - Comprehensive JSON file with detailed region and instance information -- `available_regions.py` - Python importable format (for backward compatibility) -- A console output showing the top 5 cheapest regions and their smallest suitable instances - -Example output: -``` -Checking 28 AWS regions for spot availability... -Looking for instances with at least 1 vCPUs and 2 GiB RAM - -Found 18 regions with suitable spot instances out of 28 total regions -Available regions saved to: available_regions.json -Python module also saved to: available_regions.py - -Top 5 cheapest regions for running Docker with a small Python container: -(Use --show-all to see all 18 available regions) -1. us-east-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0078/hr -2. us-west-2 - t3a.small - 2 vCPUs, 2.0 GiB RAM, $0.0084/hr -3. 
eu-west-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0091/hr -4. ap-southeast-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0094/hr -5. eu-central-1 - t3.small - 2 vCPUs, 2.0 GiB RAM, $0.0098/hr -``` - -### Step 2: Get Ubuntu AMIs for Available Regions - -```bash -uv run -s util/get_ubuntu_amis.py -``` - -This will create: -- `ubuntu_amis.csv` - CSV file with region, AMI ID, and instance details - -Example CSV content: -``` -Region,AMI ID,Instance Type,vCPUs,Memory (GiB),Spot Price ($/hr) -us-east-1,ami-0c7217cdde317cfec,t3.small,2,2.0,$0.0078 -us-west-2,ami-0efcece6bed30fd98,t3a.small,2,2.0,$0.0084 -eu-west-1,ami-0694d931cee176e7d,t3.small,2,2.0,$0.0091 -``` - -### Step 3: Update Your Bacalhau Configuration - -```bash -uv run -s util/update_config_with_regions.py -``` - -This will: -- Read your existing `config.yaml` file -- Add all new regions from `available_regions.json` -- Use recommended instance types for each region -- Create a backup of your original configuration at `config.yaml.bak` -- Save the updated configuration with all available regions - -Example output: -``` -Found 30 available regions in available_regions.json -Loaded configuration from config.yaml -Adding 27 new regions to config.yaml -Created backup of original config at config.yaml.bak -Updated config.yaml with 27 new regions -Total regions in config: 30 -``` - -## Notes - -- The region availability script may take several minutes to run as it checks all AWS regions -- If `available_regions.json` is not found, the Ubuntu AMI finder will fall back to a default list of regions -- AWS credentials with EC2 describe permissions are required to run these scripts -- Spot instance pricing is dynamic and may change over time, so it's recommended to run the script periodically to get the latest pricing information diff --git a/edge-data-transfer-demo/edge-data-spots/build_container.sh b/edge-data-transfer-demo/edge-data-spots/build_container.sh deleted file mode 100755 index 70a357cd..00000000 --- a/edge-data-transfer-demo/edge-data-spots/build_container.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# Log function -log() { - echo -e "${GREEN}[BUILD]${NC} $1" -} - -warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -error() { - echo -e "${RED}[ERROR]${NC} $1" - exit 1 -} - -# Check if required commands exist -command -v docker >/dev/null 2>&1 || error "docker is required but not installed" - -# Check if Dockerfile exists -if [ ! 
-f "Dockerfile" ]; then - error "Dockerfile not found in current directory" -fi - -# Generate timestamp for tag -TIMESTAMP=$(date +%Y%m%d%H%M) -BUILD_DATE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" - -# Registry configuration -REGISTRY="ghcr.io" -ORGANIZATION="bacalhau-project" - -# Check for required environment variables -if [ -z "${IMAGE_NAME:-}" ]; then - error "IMAGE_NAME environment variable is not set" -fi - -if [ -z "${LOCAL_TAG:-}" ]; then - error "LOCAL_TAG environment variable is not set" -fi - -REMOTE_TAG="${REGISTRY}/${ORGANIZATION}/${IMAGE_NAME}:${TIMESTAMP}" -LATEST_REMOTE_TAG="${REGISTRY}/${ORGANIZATION}/${IMAGE_NAME}:latest" - -# Validate registry configuration -if [ -z "${REGISTRY}" ] || [ -z "${ORGANIZATION}" ] || [ -z "${IMAGE_NAME}" ]; then - error "Invalid registry configuration" -fi - -# Check for --no-cache option -NO_CACHE="" -if [ "${1:-}" = "--no-cache" ] || [ "${2:-}" = "--no-cache" ]; then - # Prune the buildx cache to force fresh build - log "Pruning buildx cache..." - docker buildx prune -f - - NO_CACHE="--no-cache" - log "Building without cache" -fi - -# Create and use buildx builder if it doesn't exist -if ! docker buildx inspect multiarch-builder >/dev/null 2>&1; then - log "Creating multi-arch builder..." - docker buildx create --name multiarch-builder --driver docker-container --bootstrap -fi -docker buildx use multiarch-builder - - -# Build based on whether we're pushing or not -if [ "${1:-}" = "--push" ] || [ "${2:-}" = "--push" ]; then - # Check if user is logged into GitHub Container Registry - if ! docker login ghcr.io >/dev/null 2>&1; then - error "Not logged into GitHub Container Registry. Please run 'docker login ghcr.io' first." - fi - - # Build and push container for multiple platforms - log "Building Docker container for multiple platforms and pushing..." - if ! docker buildx build \ - --platform linux/amd64,linux/arm64 \ - --push \ - -t "${REMOTE_TAG}" \ - -t "${LATEST_REMOTE_TAG}" \ - --build-arg BUILD_DATE="${BUILD_DATE}" \ - ${NO_CACHE} \ - -f Dockerfile . ; then - error "Failed to build and push Docker container" - fi - - log "Successfully pushed images to ${REGISTRY}:" - log " - ${REMOTE_TAG} (linux/amd64, linux/arm64)" - log " - ${LATEST_REMOTE_TAG} (linux/amd64, linux/arm64)" -else - # Build for local testing (single architecture) - log "Building for local use..." - if ! docker buildx build \ - --platform linux/amd64 \ - --load \ - -t "${LOCAL_TAG}" \ - --build-arg BUILD_DATE="${BUILD_DATE}" \ - ${NO_CACHE} \ - -f Dockerfile . ; then - error "Failed to build Docker container for local use" - fi - - log "Container built successfully for local use: ${LOCAL_TAG}" -fi \ No newline at end of file diff --git a/edge-data-transfer-demo/edge-data-spots/edge-data.md b/edge-data-transfer-demo/edge-data-spots/edge-data.md deleted file mode 100644 index ed24ce3d..00000000 --- a/edge-data-transfer-demo/edge-data-spots/edge-data.md +++ /dev/null @@ -1,71 +0,0 @@ - -# Edge Data Deployment – Beginner Guide - -## 1. Prerequisites – Required Tools - -Make sure your system has the following installed: - -- Python 3.10 or higher -- python3-pip -- `uv` (install via `pip install uv`) -- `aws-cli` – [Installation Guide](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) - ---- - -## 2. Configure AWS CLI - -Run the following command: - -`aws configure` - -## 3. 
Navigate to the project directory - -Run: - -`uv run -s util/get_ubuntu_amis.py` - -Choose the desired AMI ID from the `ubuntu_amis` output (all are ARM-based) and update your `config.yaml`: - -```yaml -machine_type: -``` - -## 5. Update config.yaml -Fill in the following fields: -```yaml -orchestrators: -- nats://:4222 -public_ssh_key_path: - -token: "" - - -``` - -## 6. Deploy EFS and Spot Instances -Run the deployment script: - -`uv run -s ./deploy_spot.py create` - -Check if instances have registered correctly on demo machine: - -`bacalhau node list` - - -## 7. Verify NFS mount on a node -SSH into one of the Spot instances and run: - -`df -h` - -Confirm `/mnt/data` is mounted properly. - -## 8. Generate test data -Run the test job to generate random files: - - -`bacalhau job submit generate.yaml` - -## 9. Run the metadata generation job -Submit the main processing job: - -`bacalhau job submit create_metadata.yaml` diff --git a/edge-data-transfer-demo/edge-data-spots/instance/cloud-init/init-vm-template.yml b/edge-data-transfer-demo/edge-data-spots/instance/cloud-init/init-vm-template.yml deleted file mode 100644 index e0cdfa74..00000000 --- a/edge-data-transfer-demo/edge-data-spots/instance/cloud-init/init-vm-template.yml +++ /dev/null @@ -1,150 +0,0 @@ -#cloud-config - -write_files: - - path: /tmp/scripts.tar.gz - encoding: base64 - content: ${compressed_scripts} - permissions: '0600' - -users: - - name: ${username} - sudo: ALL=(ALL) NOPASSWD:ALL - shell: /bin/bash - ssh_authorized_keys: - - ${public_ssh_key} - groups: docker - -package_update: true -package_upgrade: true - -runcmd: - - mkdir -p /tmp/exs - - tar -xzf /tmp/scripts.tar.gz -C /tmp/exs - - | - # Remove minimal packages only if DNF is available (i.e., on Amazon Linux 2023). - all_packages="curl gnupg2 jq python3 python3-pip nfs-common apache2-utils" - if command -v dnf >/dev/null 2>&1; then - dnf install --allowerasing -y $all_packages - elif command -v apt >/dev/null 2>&1; then - apt update - apt install -y $all_packages - elif command -v yum >/dev/null 2>&1; then - yum install -y $all_packages - else - echo "Unsupported package manager" - exit 1 - fi - - # Install python package - - pip install flask gunicorn - - # Install Docker - - mv /tmp/exs/install-docker.sh /root/install-docker.sh - - chmod 755 /root/install-docker.sh - - /root/install-docker.sh - - # add scripts - - mv /tmp/exs/disable-network.sh /home/bacalhau-runner/disable-network.sh - - mv /tmp/exs/enable-network.sh /home/bacalhau-runner/enable-network.sh - - chmod +x /home/bacalhau-runner/disable-network.sh - - chmod +x /home/bacalhau-runner/enable-network.sh - - - mv /tmp/exs/disable-nfs.sh /home/bacalhau-runner/disable-nfs.sh - - mv /tmp/exs/enable-nfs.sh /home/bacalhau-runner/enable-nfs.sh - - chmod +x /home/bacalhau-runner/disable-nfs.sh - - chmod +x /home/bacalhau-runner/enable-nfs.sh - - # Ensure the authorized key is properly added to the user - - mkdir -p /home/${username}/.ssh - - echo "${public_ssh_key}" > /home/${username}/.ssh/authorized_keys - - chown -R ${username}:${username} /home/${username}/.ssh - - chmod 0600 /home/${username}/.ssh/authorized_keys - - # Create necessary directories first - - mkdir -p ${bacalhau_data_dir} ${bacalhau_node_dir} /etc/bacalhau /etc/systemd/system /usr/local/bin - - # Write files after directories are created - - mv /tmp/exs/bacalhau-startup.service /etc/systemd/system/bacalhau-startup.service - - mv /tmp/exs/startup.sh /usr/local/bin/startup.sh - - echo "${bacalhau_config_file}" | base64 -d > 
/${bacalhau_node_dir}/config.yaml - - mv /tmp/exs/docker-compose.yaml ${bacalhau_node_dir}/docker-compose.yaml - - # Set correct permissions - - chmod 0600 /etc/systemd/system/bacalhau-startup.service - - chmod 0700 /usr/local/bin/startup.sh - - chmod 0400 ${bacalhau_node_dir}/config.yaml - - chmod 0400 ${bacalhau_node_dir}/docker-compose.yaml - - chmod 0777 ${bacalhau_data_dir} - - # Set ownership - - chown -R ${username}:${username} ${bacalhau_data_dir} - - chown -R ${username}:${username} ${bacalhau_node_dir} - - chown ${username}:${username} ${bacalhau_node_dir}/config.yaml - - chown ${username}:${username} ${bacalhau_node_dir}/docker-compose.yaml - - # Add user to docker group - - usermod -aG docker ${username} - - # Install uv globally and set permissions - - export HOME=/root - - curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" HOME=/root sh - - chmod 755 /usr/local/bin/uv || true - - chown ${username}:${username} /usr/local/bin/uv || true - - # Create uv cache directory for user - - mkdir -p /home/${username}/.cache/uv - - chown -R ${username}:${username} /home/${username}/.cache - - # Install health check web server - - mv /tmp/exs/healthz-web-server.py /usr/local/bin/healthz-web-server.py - - chmod 755 /usr/local/bin/healthz-web-server.py - - chown ${username}:${username} /usr/local/bin/healthz-web-server.py - - # Create a symlink without .py extension for Gunicorn - - ln -sf /usr/local/bin/healthz-web-server.py /usr/local/bin/healthz-web-server - - # Install service - - mv /tmp/exs/healthz-web.service /etc/systemd/system/healthz-web.service - - chmod 644 /etc/systemd/system/healthz-web.service - - # Configure Docker to start on boot - - systemctl enable docker.service - - systemctl enable containerd.service - - #set lables - - - mkdir /opt/test10 - - # Create the mount point for NFScd - - mkdir -p /mnt/data - -# # Mount the NFS share -# - mount -t nfs foxyfutures.pl:/mnt/data /mnt/data -# -# # Ensure NFS mount persists on reboot -# - echo "foxyfutures.pl:/mnt/data /mnt/data nfs defaults,_netdev 0 0" >> /etc/fstab - - mount -t nfs ${efs_mount_ip}:/ /mnt/data - - - echo "${efs_mount_ip}:/ /mnt/data nfs defaults,_netdev 0 0" >> /etc/fstab - - - mv /tmp/exs/generate.py /bacalhau_data/generate.py - - mv /tmp/exs/metadata.sh /bacalhau_data/metadata.sh - - - - # Start services - - systemctl daemon-reload - - systemctl enable docker - - systemctl start docker - - systemctl enable healthz-web.service - - systemctl start healthz-web.service - - systemctl enable bacalhau-startup.service - - systemctl start bacalhau-startup.service - - - - -power_state: - mode: reboot - timeout: 1800 - condition: True diff --git a/edge-data-transfer-demo/edge-data-spots/scripts/disable-network.sh b/edge-data-transfer-demo/edge-data-spots/scripts/disable-network.sh deleted file mode 100644 index 491be53f..00000000 --- a/edge-data-transfer-demo/edge-data-spots/scripts/disable-network.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Run as root" - exit 1 -fi - - - - -echo "Config iptables..." 
- - - -iptables -I DOCKER-USER -p tcp --dport 4222 -j DROP -iptables -I DOCKER-USER -p udp --dport 4222 -j DROP - - - -echo "Block all" diff --git a/edge-data-transfer-demo/edge-data-spots/scripts/disable-nfs.sh b/edge-data-transfer-demo/edge-data-spots/scripts/disable-nfs.sh deleted file mode 100644 index 53a3c19d..00000000 --- a/edge-data-transfer-demo/edge-data-spots/scripts/disable-nfs.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Run as root" - exit 1 -fi - - - -echo "Config iptables..." - - - -iptables -A INPUT -p tcp --dport 2049 -j DROP -iptables -A OUTPUT -p tcp --dport 2049 -j DROP -iptables -A FORWARD -p tcp --dport 2049 -j DROP - -iptables -A INPUT -p udp --dport 2049 -j DROP -iptables -A OUTPUT -p udp --dport 2049 -j DROP -iptables -A FORWARD -p udp --dport 2049 -j DROP - - -echo "Block all" diff --git a/edge-data-transfer-demo/edge-data-spots/scripts/enable-network.sh b/edge-data-transfer-demo/edge-data-spots/scripts/enable-network.sh deleted file mode 100644 index ee147501..00000000 --- a/edge-data-transfer-demo/edge-data-spots/scripts/enable-network.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Start as root" - exit 1 -fi - -echo "Iptables config" - -iptables -D DOCKER-USER -p tcp --dport 4222 -j DROP -iptables -D DOCKER-USER -p udp --dport 4222 -j DROP - - -echo "Allow All Connect" - diff --git a/edge-data-transfer-demo/edge-data-spots/scripts/enable-nfs.sh b/edge-data-transfer-demo/edge-data-spots/scripts/enable-nfs.sh deleted file mode 100644 index 70b5bbc8..00000000 --- a/edge-data-transfer-demo/edge-data-spots/scripts/enable-nfs.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - - -if [[ $EUID -ne 0 ]]; then - echo "Start as root" - exit 1 -fi - -echo "Iptables config" - - -iptables -D INPUT -p tcp --dport 2049 -j DROP -iptables -D OUTPUT -p tcp --dport 2049 -j DROP -iptables -D FORWARD -p tcp --dport 2049 -j DROP - -iptables -D INPUT -p udp --dport 2049 -j DROP -iptables -D OUTPUT -p udp --dport 2049 -j DROP -iptables -D FORWARD -p udp --dport 2049 -j DROP - -echo "Allow All Connect" - diff --git a/edge-data-transfer-demo/edge-data-spots/scripts/generate.py b/edge-data-transfer-demo/edge-data-spots/scripts/generate.py deleted file mode 100644 index e9c8db54..00000000 --- a/edge-data-transfer-demo/edge-data-spots/scripts/generate.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import random -import string - -# Directory to store files -output_dir = "/mnt/data" -os.makedirs(output_dir, exist_ok=True) - -# Number of files to generate -num_files = 1000 - -# File size range in bytes (change as needed) -min_size = 5000 * 1024 -max_size = 20000 * 1024 - -for i in range(num_files): - file_size = random.randint(min_size, max_size) # Random size - filename = os.path.join(output_dir, f"file_{i+1}.txt") - - # Generate random content - content = ''.join(random.choices(string.ascii_letters + string.digits, k=file_size)) - - # Write to file - with open(filename, "w") as f: - f.write(content) - - print(f"Generated: {filename} ({file_size} bytes)") - -print(f"\n✅ Successfully generated {num_files} random text files in '{output_dir}'") diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/.gitignore b/edge-data-transfer-demo/edge-data-transfer-dashboard/.gitignore deleted file mode 100644 index 6cdbadc3..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/.gitignore +++ /dev/null @@ -1,29 +0,0 @@ -# See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
- -# dependencies -/node_modules - -# next.js -/.next/ -/out/ - -# production -/build - -# debug -npm-debug.log* -yarn-debug.log* -yarn-error.log* -.pnpm-debug.log* - -# env files -.env* - -# vercel -.vercel - -# typescript -*.tsbuildinfo -next-env.d.ts - -config.yaml diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/Dockerfile b/edge-data-transfer-demo/edge-data-transfer-dashboard/Dockerfile deleted file mode 100644 index 081fc254..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/Dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -# Build stage -FROM node:19-alpine3.18 AS builder -WORKDIR /app - -# Copy dependency files and install dependencies -COPY package.json package-lock.json next.config.mjs ./ -RUN npm install - -# Copy source code and build the Next.js app -COPY . . -RUN npm run build - -# Production stage -FROM ghcr.io/bacalhau-project/bacalhau:v1.7.0 -WORKDIR /app - -# Copy only necessary files for production -COPY --from=builder /app/package.json /app/package-lock.json ./ -COPY --from=builder /app/.next ./.next/ -COPY --from=builder /app/public ./public/ -COPY --from=builder /app/next.config.mjs ./ - -# Install node and npm: -ENV NODE_VERSION 19.9.0 - -RUN apt-get update && apt-get install -y curl xz-utils ca-certificates - -RUN ARCH= && dpkgArch="$(dpkg --print-architecture)" \ - && case "${dpkgArch##*-}" in \ - amd64) ARCH='x64';; \ - arm64) ARCH='arm64';; \ - *) echo "unsupported architecture"; exit 1 ;; \ - esac \ - && curl -fsSLO --compressed "https://nodejs.org/dist/v$NODE_VERSION/node-v$NODE_VERSION-linux-$ARCH.tar.xz" \ - && tar -xJf "node-v$NODE_VERSION-linux-$ARCH.tar.xz" -C /usr/local --strip-components=1 --no-same-owner \ - && rm "node-v$NODE_VERSION-linux-$ARCH.tar.xz" \ - && ln -s /usr/local/bin/node /usr/local/bin/nodejs \ - # smoke tests - && node --version \ - && npm --version - -# Install only production dependencies -RUN npm install --production - -# Copy entrypoint script -COPY entrypoint.sh /entrypoint.sh - -# Make entrypoint script executable -RUN chmod +x /entrypoint.sh - -# Expose port -EXPOSE 3000 - -# Run entrypoint script -ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/Makefile b/edge-data-transfer-demo/edge-data-transfer-dashboard/Makefile deleted file mode 100644 index 1f7f7e2a..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -.PHONY: run - -build: - docker build -t edge-demo-frontend . - -run-container: - docker run --name edge-transfer-demo -p 3000:3000 \ - -e BACALHAU_API_HOST= \ - -e BACALHAU_API_TLS_USETLS=true \ - -e BACALHAU_TOKEN= \ - edge-demo-frontend diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/Readme.md b/edge-data-transfer-demo/edge-data-transfer-dashboard/Readme.md deleted file mode 100644 index 42b4d1fa..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/Readme.md +++ /dev/null @@ -1,42 +0,0 @@ -# Frontend Setup with Bacalhau Environment - -This project uses Bacalhau as its backend service. The following instructions will guide you through setting up your environment, installing dependencies, and running the development server. - -## Prerequisites for local development - -- **Node.js and npm:** Ensure that you have Node.js and npm installed on your machine. -- **Local Bacalhau Installation:** For the application to work, you need to have Bacalhau installed locally. 
Follow the instructions provided in the [Bacalhau documentation](https://docs.bacalhau.org/) to install it. - -## Environment Setup - -Before starting the frontend, you must set up your Bacalhau environment by configuring the following environment variables: - -```bash -export BACALHAU_API_HOST=api.your-expanso.cloud -export BACALHAU_API_TLS_USETLS=true -export BACALHAU_TOKEN=token -``` - -## Building the Docker Image -To build the Docker image for the application, run the following command in the project's root directory: -```bash -docker build -t edge-demo-frontend . -``` - -## Running the Application with Docker -After building the image, you can run the application in a Docker container by executing: -```bash -docker run --name edge-transfer-demo -p 3000:3000 \ - -e BACALHAU_API_HOST= \ - -e BACALHAU_API_TLS_USETLS=true \ - -e BACALHAU_TOKEN= \ - edge-demo-frontend -``` - -## Important Note -Before running the container, ensure that: - -The `BACALHAU_API_HOST` environment variable is set with a valid API address. -The `BACALHAU_TOKEN` environment variable is set with a valid authentication token. - - diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/api/job/route.ts b/edge-data-transfer-demo/edge-data-transfer-dashboard/app/api/job/route.ts deleted file mode 100644 index 005fab45..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/api/job/route.ts +++ /dev/null @@ -1,30 +0,0 @@ -import { exec } from 'child_process'; -import { NextResponse } from 'next/server'; - -export async function POST() { - return new Promise((resolve) => { - const child = exec('bacalhau job run job.yaml --id-only'); - - child.stdout.on('data', (data) => { - if (data) { - console.log('stdout:', data); - child.kill('SIGINT'); - resolve(NextResponse.json({ message: data })); - } - }); - - child.stderr.on('data', (data) => { - console.error('stderr:', data); - }); - - child.on('error', (error) => { - resolve(NextResponse.json({ error: error.message }, { status: 500 })); - }); - - child.on('close', (code) => { - if (code !== 0) { - resolve(NextResponse.json({ error: `Process exited with code ${code}` })); - } - }); - }); -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/api/nodes/route.ts b/edge-data-transfer-demo/edge-data-transfer-dashboard/app/api/nodes/route.ts deleted file mode 100644 index b8f55660..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/api/nodes/route.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { NextResponse } from 'next/server'; -import { exec } from 'child_process'; -import { access } from 'fs/promises'; -import { promisify } from 'util'; -import { constants } from 'fs'; - -const execPromise = promisify(exec); - -export async function GET() { - try { - let command = 'bacalhau node list --output json'; - try { - await access('config.yaml', constants.F_OK); - command = 'bacalhau node list --config config.yaml --output json'; - } catch {} - const { stdout } = await execPromise(command); - return NextResponse.json({ output: JSON.parse(stdout) }); - } catch (error) { - return NextResponse.json({ error: error.message }, { status: 500 }); - } -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/globals.css b/edge-data-transfer-demo/edge-data-transfer-dashboard/app/globals.css deleted file mode 100644 index ac684423..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/globals.css +++ /dev/null @@ -1,94 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; - -body { - 
font-family: Arial, Helvetica, sans-serif; -} - -@layer utilities { - .text-balance { - text-wrap: balance; - } -} - -@layer base { - :root { - --background: 0 0% 100%; - --foreground: 0 0% 3.9%; - --card: 0 0% 100%; - --card-foreground: 0 0% 3.9%; - --popover: 0 0% 100%; - --popover-foreground: 0 0% 3.9%; - --primary: 0 0% 9%; - --primary-foreground: 0 0% 98%; - --secondary: 0 0% 96.1%; - --secondary-foreground: 0 0% 9%; - --muted: 0 0% 96.1%; - --muted-foreground: 0 0% 45.1%; - --accent: 0 0% 96.1%; - --accent-foreground: 0 0% 9%; - --destructive: 0 84.2% 60.2%; - --destructive-foreground: 0 0% 98%; - --border: 0 0% 89.8%; - --input: 0 0% 89.8%; - --ring: 0 0% 3.9%; - --chart-1: 12 76% 61%; - --chart-2: 173 58% 39%; - --chart-3: 197 37% 24%; - --chart-4: 43 74% 66%; - --chart-5: 27 87% 67%; - --radius: 0.5rem; - --sidebar-background: 0 0% 98%; - --sidebar-foreground: 240 5.3% 26.1%; - --sidebar-primary: 240 5.9% 10%; - --sidebar-primary-foreground: 0 0% 98%; - --sidebar-accent: 240 4.8% 95.9%; - --sidebar-accent-foreground: 240 5.9% 10%; - --sidebar-border: 220 13% 91%; - --sidebar-ring: 217.2 91.2% 59.8%; - } - .dark { - --background: 0 0% 3.9%; - --foreground: 0 0% 98%; - --card: 0 0% 3.9%; - --card-foreground: 0 0% 98%; - --popover: 0 0% 3.9%; - --popover-foreground: 0 0% 98%; - --primary: 0 0% 98%; - --primary-foreground: 0 0% 9%; - --secondary: 0 0% 14.9%; - --secondary-foreground: 0 0% 98%; - --muted: 0 0% 14.9%; - --muted-foreground: 0 0% 63.9%; - --accent: 0 0% 14.9%; - --accent-foreground: 0 0% 98%; - --destructive: 0 62.8% 30.6%; - --destructive-foreground: 0 0% 98%; - --border: 0 0% 14.9%; - --input: 0 0% 14.9%; - --ring: 0 0% 83.1%; - --chart-1: 220 70% 50%; - --chart-2: 160 60% 45%; - --chart-3: 30 80% 55%; - --chart-4: 280 65% 60%; - --chart-5: 340 75% 55%; - --sidebar-background: 240 5.9% 10%; - --sidebar-foreground: 240 4.8% 95.9%; - --sidebar-primary: 224.3 76.3% 48%; - --sidebar-primary-foreground: 0 0% 100%; - --sidebar-accent: 240 3.7% 15.9%; - --sidebar-accent-foreground: 240 4.8% 95.9%; - --sidebar-border: 240 3.7% 15.9%; - --sidebar-ring: 217.2 91.2% 59.8%; - } -} - -@layer base { - * { - @apply border-border; - } - body { - @apply bg-background text-foreground; - } -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/layout.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/app/layout.tsx deleted file mode 100644 index 1b6365a6..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/layout.tsx +++ /dev/null @@ -1,20 +0,0 @@ -import type { Metadata } from 'next' -import './globals.css' - -export const metadata: Metadata = { - title: 'Edge Data Transfer Demo', - description: 'Created with v0', - generator: 'v0.dev', -} - -export default function RootLayout({ - children, -}: Readonly<{ - children: React.ReactNode -}>) { - return ( - - {children} - - ) -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/page.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/app/page.tsx deleted file mode 100644 index 348880a3..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/app/page.tsx +++ /dev/null @@ -1,13 +0,0 @@ -"use client" - -import EdgeDataTransferDashboard from "../dashboard" -import {NodeProvider} from "@/lib/NodeProvider"; -import {JobsProvider} from "@/lib/JobProvider"; - -export default function SyntheticV0PageForDeployment() { - return ( - - - - ) -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/build_container.sh 
b/edge-data-transfer-demo/edge-data-transfer-dashboard/build_container.sh deleted file mode 100755 index 5876d6a0..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/build_container.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# Log function -log() { - echo -e "${GREEN}[BUILD]${NC} $1" -} - -warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -error() { - echo -e "${RED}[ERROR]${NC} $1" - exit 1 -} - -# Check if required commands exist -command -v docker >/dev/null 2>&1 || error "docker is required but not installed" - -# Check if Dockerfile exists -if [ ! -f "Dockerfile" ]; then - error "Dockerfile not found in current directory" -fi - -# Generate timestamp for tag -TIMESTAMP=$(date +%Y%m%d%H%M) - -# Registry configuration -REGISTRY="ghcr.io" -ORGANIZATION="bacalhau-project" - -# Check for required environment variables -if [ -z "${IMAGE_NAME:-}" ]; then - error "IMAGE_NAME environment variable is not set" -fi - -if [ -z "${LOCAL_TAG:-}" ]; then - error "LOCAL_TAG environment variable is not set" -fi - -REMOTE_TAG="${REGISTRY}/${ORGANIZATION}/${IMAGE_NAME}:${TIMESTAMP}" -LATEST_REMOTE_TAG="${REGISTRY}/${ORGANIZATION}/${IMAGE_NAME}:latest" - -# Validate registry configuration -if [ -z "${REGISTRY}" ] || [ -z "${ORGANIZATION}" ] || [ -z "${IMAGE_NAME}" ]; then - error "Invalid registry configuration" -fi - -# Create and use buildx builder if it doesn't exist -if ! docker buildx inspect multiarch-builder >/dev/null 2>&1; then - log "Creating multi-arch builder..." - docker buildx create --name multiarch-builder --driver docker-container --bootstrap -fi -docker buildx use multiarch-builder - -# Build container for multiple platforms -log "Building Docker container for multiple platforms..." -if ! docker buildx build \ - --platform linux/amd64,linux/arm64 \ - -t "${LOCAL_TAG}" \ - -t "${REMOTE_TAG}" \ - -t "${LATEST_REMOTE_TAG}" \ - -f Dockerfile . ; then - error "Failed to build Docker container" -fi - -log "Container built successfully for all platforms!" - -# Push to registry if requested -if [ "${1:-}" = "--push" ]; then - # Check if user is logged into GitHub Container Registry - if ! docker login ghcr.io >/dev/null 2>&1; then - error "Not logged into GitHub Container Registry. Please run 'docker login ghcr.io' first." - fi - - # Tag and push with timestamp - log "Tagging image with timestamp ${TIMESTAMP}..." - if ! docker tag "${LOCAL_TAG}" "${REMOTE_TAG}"; then - error "Failed to tag Docker image with timestamp" - fi - - log "Pushing timestamped image..." - if ! docker push "${REMOTE_TAG}"; then - error "Failed to push timestamped image" - fi - - # Tag and push as latest - log "Tagging image as latest..." - if ! docker tag "${LOCAL_TAG}" "${LATEST_REMOTE_TAG}"; then - error "Failed to tag Docker image as latest" - fi - - log "Pushing latest tag..." - if ! 
docker push "${LATEST_REMOTE_TAG}"; then - error "Failed to push latest tag" - fi - - log "Successfully pushed images to ${REGISTRY}:" - log " - ${REMOTE_TAG} (linux/amd64, linux/arm64)" - log " - ${LATEST_REMOTE_TAG} (linux/amd64, linux/arm64)" -fi \ No newline at end of file diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components.json b/edge-data-transfer-demo/edge-data-transfer-dashboard/components.json deleted file mode 100644 index 13f24bf4..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "https://ui.shadcn.com/schema.json", - "style": "default", - "rsc": true, - "tsx": true, - "tailwind": { - "config": "tailwind.config.ts", - "css": "app/globals.css", - "baseColor": "neutral", - "cssVariables": true, - "prefix": "" - }, - "aliases": { - "components": "@/components", - "utils": "@/lib/utils", - "ui": "@/components/ui", - "lib": "@/lib", - "hooks": "@/hooks" - }, - "iconLibrary": "lucide" -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ClearMetadataButton.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ClearMetadataButton.tsx deleted file mode 100644 index 983530cf..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ClearMetadataButton.tsx +++ /dev/null @@ -1,48 +0,0 @@ -import { Button } from "@/components/ui/button"; -import { FileJson } from "lucide-react"; -import {NodeProps} from "@/lib/JobProvider"; - -export const ClearMetadataButton = ({ nodes} : {nodes: NodeProps[]}) => { - const handleClick = async () => { - try { - await Promise.all( - nodes.map(async (node) => { - try { - const { - Info: { - NodeType, - Labels: { PUBLIC_IP }, - }, - } = node; - if(NodeType !== 'Requester') { - // Adjust the URL and port as needed. - const response = await fetch(`http://${PUBLIC_IP}:9123/clear-metadata`, { - method: "POST", - headers: { - "Content-Type": "application/json", - "Authorization": `Bearer abrakadabra1234!@#`, - }, - body: JSON.stringify({}), - }); - if (!response.ok) { - throw new Error("Error sending request"); - } - const data = await response.json(); - console.log("Server response:", data); - } - } catch (error) { - console.error("An error occurred:", error); - } - }) - ); - } catch (error) { - console.error("An error occurred:", error); - } - }; - - return ( - - ); -}; diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/DisconnectButton.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/DisconnectButton.tsx deleted file mode 100644 index a0dfb1aa..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/DisconnectButton.tsx +++ /dev/null @@ -1,50 +0,0 @@ -import React, { useState } from 'react'; -import { Button } from "@/components/ui/button"; -import { Slash } from "lucide-react"; - -export const DisconnectButton = ({ ip, isDisconnected }: {ip: string, isDisconnected: boolean}) => { - const [buttonDisabled, setButtonDisabled] = useState(false); - - const handleClick = async (ip: string) => { - if (buttonDisabled) return; - - setButtonDisabled(true); - const url = isDisconnected ? 
'open-network' : 'close-network'; - try { - const response = await fetch(`http://${ip}:9123/${url}`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Authorization': `Bearer abrakadabra1234!@#`, - }, - body: JSON.stringify({}), - }); - - if (!response.ok) { - throw new Error('Error sending request'); - } - - const data = await response.json(); - console.log('Server response:', data); - } catch (error) { - console.error('An error occurred:', error); - } - - setTimeout(() => { - setButtonDisabled(false); - }, 3000); - }; - - return ( - - ); -}; diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/FilesGrid.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/FilesGrid.tsx deleted file mode 100644 index ca0f7027..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/FilesGrid.tsx +++ /dev/null @@ -1,45 +0,0 @@ -"use client"; - -import React from "react"; -import {Tooltip, TooltipContent, TooltipProvider, TooltipTrigger,} from "@/components/ui/tooltip"; -import {useJobs} from "@/lib/JobProvider"; - -interface Job { - id: number; - fileName: string; - nodeId: string; // "0" means empty; otherwise the real NodeID from backend - metaInvalid?: boolean; -} - -const JobGrid = React.memo(({ jobs, nodeColorsMapping }: { jobs: Job[]; nodeColorsMapping: Record }) => { - return ( - -
- {jobs.map((job) => ( - - -
- - -

{job.fileName}

-
- - ))} -
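The markup inside this `jobs.map` did not survive the flattened diff; judging from the tooltip imports at the top of FilesGrid.tsx, the `nodeColorsMapping` lookup, and the `{job.fileName}` tooltip text, each file likely renders as a small colored cell wrapped in a tooltip. A minimal sketch of such a cell — the `JobCell` name, class names, and the gray fallback color are assumptions, not the original code:

```tsx
// Hypothetical reconstruction -- the original cell markup was lost when the diff was flattened.
// Must be rendered inside the <TooltipProvider> that JobGrid imports.
import React from "react";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";

interface Job {
  id: number;
  fileName: string;
  nodeId: string;        // "0" means the file has not been claimed by a node yet
  metaInvalid?: boolean;
}

export const JobCell = ({ job, color }: { job: Job; color?: string }) => (
  <Tooltip>
    <TooltipTrigger asChild>
      {/* One small square per file, tinted with the owning node's color */}
      <div
        className="h-3 w-3 rounded-sm border"
        style={{ backgroundColor: color ?? "#e5e7eb" }} // gray fallback for unclaimed files (assumption)
      />
    </TooltipTrigger>
    <TooltipContent>{job.fileName}</TooltipContent>
  </Tooltip>
);
```

In the original grid this cell would presumably be produced once per job, keyed by `job.id`, with `color={nodeColorsMapping[job.nodeId]}`.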
- - ); -}); - -const FilesGrid = () => { - const {jobs, nodeColorsMapping} = useJobs() - - return ( -
- -
- ); -} - -export default FilesGrid; diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/NetworkLostButton.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/NetworkLostButton.tsx deleted file mode 100644 index 067a3cbd..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/NetworkLostButton.tsx +++ /dev/null @@ -1,79 +0,0 @@ -import React, {useCallback, useEffect, useState} from 'react'; -import {Button} from "@/components/ui/button"; -import {WifiOff} from "lucide-react"; - -export const NetworkLossButton = ({ ip }: {ip: string}) => { - const [isDisconnected, setIsDisconnected] = useState(true); - const [buttonDisabled, setButtonDisabled] = useState(false); - - useEffect(() => { - if (!ip) return; - - const checkHealth = async () => { - try { - const response = await fetch(`http://${ip}:9123/nfs-healthz`, { - method: "GET", - headers: { - "Content-Type": "application/json", - Authorization: "Bearer abrakadabra1234!@#", - }, - }); - - if (!response.ok) { - throw new Error("Health check failed"); - } - - const data = await response.json(); - setIsDisconnected(data.status !== "healthy"); - } catch (error) { - setIsDisconnected(true); - } - }; - - checkHealth(); - const intervalId = setInterval(checkHealth, 3000); - - return () => { - clearInterval(intervalId); - }; - }, [ip]); - - const handleClick = useCallback(async () => { - if (!ip || buttonDisabled) return; - - setButtonDisabled(true); - const endpoint = isDisconnected ? "open-nfs" : "close-nfs"; - - try { - const response = await fetch(`http://${ip}:9123/${endpoint}`, { - method: "POST", - headers: { - "Content-Type": "application/json", - Authorization: "Bearer abrakadabra1234!@#", - }, - body: JSON.stringify({}), - }); - if (!response.ok) { - throw new Error("Error sending request"); - } - } catch (error) { - console.error("Error during request:", error); - } - - setTimeout(() => { - setButtonDisabled(false); - }, 3000); - }, [ip, isDisconnected, buttonDisabled]); - - return ( - - ); -}; diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/Node.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/Node.tsx deleted file mode 100644 index 3b2f036b..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/Node.tsx +++ /dev/null @@ -1,50 +0,0 @@ -import {getStatusBadge, ProgressBar} from "@/components/NodesList"; -import React from "react"; -import {DisconnectButton} from "@/components/DisconnectButton"; -import {NetworkLossButton} from "@/components/NetworkLostButton"; -import {NodeProps} from "@/lib/JobProvider"; - -export const Node = ({node, color, jobs}: {node: NodeProps, color: string, jobs: any[]}) => { - const nodeLabel = node.Info?.NodeID - const idToCompare = nodeLabel === "Empty" ? "0" : nodeLabel; - const count = jobs.filter((job) => job.nodeId === idToCompare).length; - if(node.Info?.NodeType === 'Requester'){ - return null - } - - return ( -
- {/* Column 1: Color and Node Label */} -
-
- - {nodeLabel === "Empty" ? 'Files to process' : nodeLabel} - -
- - {/* Column 2: Status */} -
- {getStatusBadge(String(node?.ConnectionState?.Status).toLowerCase())} -
- - {/* Column 3: Buttons */} -
- - -
- - {/* Column 4: Progress Bar fills the remaining space */} -
- -
-
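Node.tsx also lost its returned JSX; the surviving column comments describe a four-column row: node color and label, connection-status badge, the two action buttons, and a per-node progress bar. A sketch of that row under those assumptions (the `NodeRow` name, grid classes, and the `isDisconnected` derivation are guesses; the buttons' own stripped returns appear to be plain shadcn `<Button>`s with the `Slash` and `WifiOff` icons they import):

```tsx
// Hypothetical reconstruction of the row Node.tsx appears to render; layout classes are assumptions.
import React from "react";
import { getStatusBadge, ProgressBar } from "@/components/NodesList";
import { DisconnectButton } from "@/components/DisconnectButton";
import { NetworkLossButton } from "@/components/NetworkLostButton";
import { NodeProps } from "@/lib/JobProvider";

export const NodeRow = ({ node, color, jobs }: { node: NodeProps; color: string; jobs: { nodeId: string }[] }) => {
  const nodeLabel = node.Info?.NodeID ?? "";
  const idToCompare = nodeLabel === "Empty" ? "0" : nodeLabel;
  const count = jobs.filter((job) => job.nodeId === idToCompare).length;
  const publicIp = node.Info?.Labels?.PUBLIC_IP ?? "";
  const status = String(node?.ConnectionState?.Status).toLowerCase();

  if (node.Info?.NodeType === "Requester") {
    return null; // the requester/orchestrator node is not listed as a source node
  }

  return (
    <div className="grid grid-cols-[auto_auto_auto_1fr] items-center gap-4">
      {/* Column 1: color swatch and node label */}
      <div className="flex items-center gap-2">
        <span className="h-3 w-3 rounded-full" style={{ backgroundColor: color }} />
        <span>{nodeLabel === "Empty" ? "Files to process" : nodeLabel}</span>
      </div>

      {/* Column 2: connection status */}
      <div>{getStatusBadge(status)}</div>

      {/* Column 3: disconnect-from-orchestrator and NFS-loss buttons */}
      <div className="flex gap-2">
        {/* isDisconnected is derived here from the status string; the original derivation is unknown */}
        <DisconnectButton ip={publicIp} isDisconnected={status === "disconnected"} />
        <NetworkLossButton ip={publicIp} />
      </div>

      {/* Column 4: progress bar fills the remaining space */}
      <ProgressBar nodeLabel={nodeLabel} color={color} count={count} jobsLength={jobs.length} />
    </div>
  );
};
```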
- ); -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/NodesList.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/NodesList.tsx deleted file mode 100644 index d72d462c..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/NodesList.tsx +++ /dev/null @@ -1,136 +0,0 @@ -"use client"; - -import {Card, CardContent, CardHeader, CardTitle} from "@/components/ui/card"; -import {CheckCircle2, HardDrive, RefreshCw, Slash, Wifi, WifiOff} from "lucide-react"; -import {Badge} from "@/components/ui/badge"; -import React from "react"; -import {Node} from "./Node"; -import {useNodes} from "@/lib/NodeProvider"; -import {useJobs} from "@/lib/JobProvider"; - -export const getStatusBadge = (status: string) => { - switch (status) { - case "connected": - return ( - - Connected - - ); - case "running": - return ( - - Running - - ); - case "completed": - return ( - - Completed - - ); - case "disconnected": - return ( - - Disconnected - - ); - case "network-loss": - return ( - - Network Loss - - ); - default: - return Unknown; - } -}; - -export const FilesProgressBar = ({count, jobsLength} : { count: number, jobsLength: number}) => { - const progress = jobsLength > 0 ? ((jobsLength - count) / jobsLength) * 100 : 100; - return ( -
-
-
-
- - {count} files to process ({progress.toFixed(1)}%) - -
- ) -} - -export const ProgressBar = ({nodeLabel, color, count, jobsLength} : {nodeLabel: string, color: string, count: number, jobsLength: number}) => { - const expectedPerNode = jobsLength * 0.2; - let progress: number; - - if (nodeLabel === "Empty") { - progress = count === 0 ? 100 : 0; - } else { - progress = expectedPerNode > 0 ? (count / expectedPerNode) * 100 : 0; - if (progress > 100) progress = 100; - } - - return ( -
-
-
-
- - {count} files ({progress.toFixed(1)}%) - -
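FilesProgressBar and ProgressBar above both compute a `progress` percentage, but the bar markup they return was stripped; presumably each renders a track whose fill width follows `progress`, next to the caption text that did survive. A sketch of that shared bar — the `Bar` name, classes, and green fallback color are assumptions:

```tsx
// Hypothetical sketch of the bar markup that FilesProgressBar and ProgressBar appear to share.
import React from "react";

export const Bar = ({ progress, color, label }: { progress: number; color?: string; label: string }) => (
  <div className="flex items-center gap-2">
    {/* Track */}
    <div className="h-2 w-full overflow-hidden rounded-full bg-muted">
      {/* Fill width follows the computed percentage */}
      <div
        className="h-full rounded-full transition-all"
        style={{ width: `${progress}%`, backgroundColor: color ?? "#22c55e" }}
      />
    </div>
    <span className="whitespace-nowrap text-xs text-muted-foreground">{label}</span>
  </div>
);
```

ProgressBar would pass a caption like `${count} files (${progress.toFixed(1)}%)`, and FilesProgressBar the "files to process" variant that survives above.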
- ); -}; - -export function NodesList() { - const {nodeColorsMapping, nodes} = useNodes() - const {jobs, files} = useJobs() - - return ( - - -
-
Source Nodes
-
- job.nodeId === "0").length } jobsLength={jobs.length}/> -
-
-
- -
- {nodes?.map((node) => { - if (node.Info?.NodeType === "Requester") return null; - return ( -
-
- -
-
- ); - })} -
-

-                    {/* Commented-out "Destination Node" card with an "Online" status badge */}
- -
-
-
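The card returned by NodesList lost its structure as well; from the "Source Nodes" title, the FilesProgressBar fed with jobs still on nodeId "0", and the map over non-Requester nodes, it appears to be a single Card with a header row and one `<Node/>` per compute node. A sketch under those assumptions (layout classes and the fallback swatch color are guesses):

```tsx
// Hypothetical reconstruction of the NodesList render; structure inferred from the surviving fragments.
import React from "react";
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card";
import { Node } from "@/components/Node";
import { FilesProgressBar } from "@/components/NodesList";
import { useNodes } from "@/lib/NodeProvider";
import { useJobs } from "@/lib/JobProvider";

export function NodesListSketch() {
  const { nodeColorsMapping, nodes } = useNodes();
  const { jobs } = useJobs();

  return (
    <Card>
      <CardHeader>
        <div className="flex items-center justify-between gap-4">
          <CardTitle>Source Nodes</CardTitle>
          {/* Overall progress: jobs whose nodeId is "0" have not been claimed by any node yet */}
          <FilesProgressBar
            count={jobs.filter((job) => job.nodeId === "0").length}
            jobsLength={jobs.length}
          />
        </div>
      </CardHeader>
      <CardContent>
        <div className="space-y-2">
          {nodes?.map((node) => {
            if (node.Info?.NodeType === "Requester") return null; // requester is not a source node
            const id = node.Info?.NodeID ?? "";
            return <Node key={id} node={node} color={nodeColorsMapping[id] ?? "#94a3b8"} jobs={jobs} />;
          })}
        </div>
      </CardContent>
    </Card>
  );
}
```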
- ); -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/theme-provider.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/theme-provider.tsx deleted file mode 100644 index 55c2f6eb..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/theme-provider.tsx +++ /dev/null @@ -1,11 +0,0 @@ -'use client' - -import * as React from 'react' -import { - ThemeProvider as NextThemesProvider, - type ThemeProviderProps, -} from 'next-themes' - -export function ThemeProvider({ children, ...props }: ThemeProviderProps) { - return {children} -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/accordion.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/accordion.tsx deleted file mode 100644 index 24c788c2..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/accordion.tsx +++ /dev/null @@ -1,58 +0,0 @@ -"use client" - -import * as React from "react" -import * as AccordionPrimitive from "@radix-ui/react-accordion" -import { ChevronDown } from "lucide-react" - -import { cn } from "@/lib/utils" - -const Accordion = AccordionPrimitive.Root - -const AccordionItem = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AccordionItem.displayName = "AccordionItem" - -const AccordionTrigger = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, children, ...props }, ref) => ( - - svg]:rotate-180", - className - )} - {...props} - > - {children} - - - -)) -AccordionTrigger.displayName = AccordionPrimitive.Trigger.displayName - -const AccordionContent = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, children, ...props }, ref) => ( - -
{children}
-
-)) - -AccordionContent.displayName = AccordionPrimitive.Content.displayName - -export { Accordion, AccordionItem, AccordionTrigger, AccordionContent } diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/alert-dialog.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/alert-dialog.tsx deleted file mode 100644 index 25e7b474..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/alert-dialog.tsx +++ /dev/null @@ -1,141 +0,0 @@ -"use client" - -import * as React from "react" -import * as AlertDialogPrimitive from "@radix-ui/react-alert-dialog" - -import { cn } from "@/lib/utils" -import { buttonVariants } from "@/components/ui/button" - -const AlertDialog = AlertDialogPrimitive.Root - -const AlertDialogTrigger = AlertDialogPrimitive.Trigger - -const AlertDialogPortal = AlertDialogPrimitive.Portal - -const AlertDialogOverlay = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AlertDialogOverlay.displayName = AlertDialogPrimitive.Overlay.displayName - -const AlertDialogContent = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - - - - -)) -AlertDialogContent.displayName = AlertDialogPrimitive.Content.displayName - -const AlertDialogHeader = ({ - className, - ...props -}: React.HTMLAttributes) => ( -
-) -AlertDialogHeader.displayName = "AlertDialogHeader" - -const AlertDialogFooter = ({ - className, - ...props -}: React.HTMLAttributes) => ( -
-) -AlertDialogFooter.displayName = "AlertDialogFooter" - -const AlertDialogTitle = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AlertDialogTitle.displayName = AlertDialogPrimitive.Title.displayName - -const AlertDialogDescription = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AlertDialogDescription.displayName = - AlertDialogPrimitive.Description.displayName - -const AlertDialogAction = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AlertDialogAction.displayName = AlertDialogPrimitive.Action.displayName - -const AlertDialogCancel = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AlertDialogCancel.displayName = AlertDialogPrimitive.Cancel.displayName - -export { - AlertDialog, - AlertDialogPortal, - AlertDialogOverlay, - AlertDialogTrigger, - AlertDialogContent, - AlertDialogHeader, - AlertDialogFooter, - AlertDialogTitle, - AlertDialogDescription, - AlertDialogAction, - AlertDialogCancel, -} diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/alert.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/alert.tsx deleted file mode 100644 index 41fa7e05..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/alert.tsx +++ /dev/null @@ -1,59 +0,0 @@ -import * as React from "react" -import { cva, type VariantProps } from "class-variance-authority" - -import { cn } from "@/lib/utils" - -const alertVariants = cva( - "relative w-full rounded-lg border p-4 [&>svg~*]:pl-7 [&>svg+div]:translate-y-[-3px] [&>svg]:absolute [&>svg]:left-4 [&>svg]:top-4 [&>svg]:text-foreground", - { - variants: { - variant: { - default: "bg-background text-foreground", - destructive: - "border-destructive/50 text-destructive dark:border-destructive [&>svg]:text-destructive", - }, - }, - defaultVariants: { - variant: "default", - }, - } -) - -const Alert = React.forwardRef< - HTMLDivElement, - React.HTMLAttributes & VariantProps ->(({ className, variant, ...props }, ref) => ( -
-)) -Alert.displayName = "Alert" - -const AlertTitle = React.forwardRef< - HTMLParagraphElement, - React.HTMLAttributes ->(({ className, ...props }, ref) => ( -
-)) -AlertTitle.displayName = "AlertTitle" - -const AlertDescription = React.forwardRef< - HTMLParagraphElement, - React.HTMLAttributes ->(({ className, ...props }, ref) => ( -
-)) -AlertDescription.displayName = "AlertDescription" - -export { Alert, AlertTitle, AlertDescription } diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/aspect-ratio.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/aspect-ratio.tsx deleted file mode 100644 index d6a5226f..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/aspect-ratio.tsx +++ /dev/null @@ -1,7 +0,0 @@ -"use client" - -import * as AspectRatioPrimitive from "@radix-ui/react-aspect-ratio" - -const AspectRatio = AspectRatioPrimitive.Root - -export { AspectRatio } diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/avatar.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/avatar.tsx deleted file mode 100644 index 51e507ba..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/avatar.tsx +++ /dev/null @@ -1,50 +0,0 @@ -"use client" - -import * as React from "react" -import * as AvatarPrimitive from "@radix-ui/react-avatar" - -import { cn } from "@/lib/utils" - -const Avatar = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -Avatar.displayName = AvatarPrimitive.Root.displayName - -const AvatarImage = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AvatarImage.displayName = AvatarPrimitive.Image.displayName - -const AvatarFallback = React.forwardRef< - React.ElementRef, - React.ComponentPropsWithoutRef ->(({ className, ...props }, ref) => ( - -)) -AvatarFallback.displayName = AvatarPrimitive.Fallback.displayName - -export { Avatar, AvatarImage, AvatarFallback } diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/badge.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/badge.tsx deleted file mode 100644 index f000e3ef..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/badge.tsx +++ /dev/null @@ -1,36 +0,0 @@ -import * as React from "react" -import { cva, type VariantProps } from "class-variance-authority" - -import { cn } from "@/lib/utils" - -const badgeVariants = cva( - "inline-flex items-center rounded-full border px-2.5 py-0.5 text-xs font-semibold transition-colors focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2", - { - variants: { - variant: { - default: - "border-transparent bg-primary text-primary-foreground hover:bg-primary/80", - secondary: - "border-transparent bg-secondary text-secondary-foreground hover:bg-secondary/80", - destructive: - "border-transparent bg-destructive text-destructive-foreground hover:bg-destructive/80", - outline: "text-foreground", - }, - }, - defaultVariants: { - variant: "default", - }, - } -) - -export interface BadgeProps - extends React.HTMLAttributes, - VariantProps {} - -function Badge({ className, variant, ...props }: BadgeProps) { - return ( -
- ) -} - -export { Badge, badgeVariants } diff --git a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/breadcrumb.tsx b/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/breadcrumb.tsx deleted file mode 100644 index 60e6c96f..00000000 --- a/edge-data-transfer-demo/edge-data-transfer-dashboard/components/ui/breadcrumb.tsx +++ /dev/null @@ -1,115 +0,0 @@ -import * as React from "react" -import { Slot } from "@radix-ui/react-slot" -import { ChevronRight, MoreHorizontal } from "lucide-react" - -import { cn } from "@/lib/utils" - -const Breadcrumb = React.forwardRef< - HTMLElement, - React.ComponentPropsWithoutRef<"nav"> & { - separator?: React.ReactNode - } ->(({ ...props }, ref) =>