diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..dd84ea7824 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,38 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..bbcbbe7d61 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/image/Dockerfile b/.github/image/Dockerfile new file mode 100644 index 0000000000..ddacbcde4c --- /dev/null +++ b/.github/image/Dockerfile @@ -0,0 +1,76 @@ +FROM ubuntu:latest + +# build using command: docker build --progress=plain -t chronon-base . + +# Install necessary tools and Python +RUN apt update && apt install -y wget curl bash python3 python3-pip openjdk-17-jdk python3.12-venv + +# java +ENV JAVA_HOME=/usr/lib/jvm/default-jvm +ENV PATH=$PATH:$JAVA_HOME/bin + +# sbt for scala +RUN curl -L "https://github.com/sbt/sbt/releases/download/v1.8.2/sbt-1.8.2.tgz" | tar -xz -C /usr/local +ENV PATH="/usr/local/sbt/bin:${PATH}" + +# bazel +RUN curl -fsSL "https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-amd64" -o /usr/local/bin/bazel +RUN chmod +x /usr/local/bin/bazel +ENV PATH="/usr/local/bin:${PATH}" + +# thrift +ARG THRIFT_VERSION=0.21.0 +RUN apt install -y \ + build-essential \ + cmake \ + libboost-dev \ + libssl-dev \ + libevent-dev \ + bison \ + flex \ + autoconf \ + automake \ + libtool \ + curl && \ + curl -LSs https://archive.apache.org/dist/thrift/${THRIFT_VERSION}/thrift-${THRIFT_VERSION}.tar.gz -o thrift-${THRIFT_VERSION}.tar.gz && \ + tar -xzf thrift-${THRIFT_VERSION}.tar.gz && \ + cd thrift-${THRIFT_VERSION} && \ + ./configure --without-python --without-cpp --without-nodejs --without-java && \ + make && \ + make install && \ + cd .. 
&& \ + rm -rf thrift-${THRIFT_VERSION} thrift-${THRIFT_VERSION}.tar.gz && \ + apt purge -y \ + build-essential \ + cmake \ + libboost-dev \ + libssl-dev \ + libevent-dev \ + bison \ + flex \ + autoconf \ + automake \ + libtool \ + curl && \ + apt autoremove -y && \ + rm -rf /var/lib/apt/lists/* + +# Upgrade pip and install some common Python packages +RUN pip3 install --break-system-packages pytest tox flake8 ruff + +RUN apt update && apt install -y build-essential git +RUN mkdir -p /usr/lib/jvm && ln -s /usr/lib/jvm/java-17-openjdk-amd64/ /usr/lib/jvm/default-jvm + +# Verify installations +RUN java -version && \ + thrift -version && \ + python3 --version && \ + pip3 --version && \ + bazel --version && \ + git --version + +# Set working directory +WORKDIR /app + +# Cmd to run when starting the container +CMD ["/bin/bash"] diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..ba58c4b648 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ +## Summary + +## Checklist +- [ ] Added Unit Tests +- [ ] Covered by existing CI +- [ ] Integration tested +- [ ] Documentation update + diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..31a30d3b8a --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,17 @@ +changelog: + exclude: + labels: + - ignore-for-release + categories: + - title: Major features / breaking changes + labels: + - Semver-Major + - title: Minor features + labels: + - Semver-Minor + - title: Bug fixes + labels: + - Semver-Patch + - title: Other changes + labels: + - "*" \ No newline at end of file diff --git a/.github/workflows/build_and_push_docker.yaml b/.github/workflows/build_and_push_docker.yaml new file mode 100644 index 0000000000..d6de80b95c --- /dev/null +++ b/.github/workflows/build_and_push_docker.yaml @@ -0,0 +1,35 @@ +name: Build and Push Docker Image + +on: + push: + paths: + - '.github/image/Dockerfile' + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }}-ci + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - uses: actions/checkout@v4 + + - name: Log in to the Container registry + uses: docker/login-action@v1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: .github/image + push: true + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest \ No newline at end of file diff --git a/.github/workflows/require_triggered_status_checks.yaml b/.github/workflows/require_triggered_status_checks.yaml new file mode 100644 index 0000000000..d9a714ce32 --- /dev/null +++ b/.github/workflows/require_triggered_status_checks.yaml @@ -0,0 +1,14 @@ +name: branch_protection +on: + push: +jobs: + enforce_triggered_workflows: + runs-on: ubuntu-latest + permissions: + checks: read + steps: + - name: GitHub Checks + uses: poseidon/wait-for-status-checks@v0.6.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + delay: "10s" diff --git a/.github/workflows/test_bazel_config.yaml b/.github/workflows/test_bazel_config.yaml new file mode 100644 index 0000000000..2a158b5062 --- /dev/null +++ b/.github/workflows/test_bazel_config.yaml @@ -0,0 +1,53 @@ +name: Test Bazel Config + +on: + push: + branches: + - main + paths: + - 'tools/**' + - '.github/workflows/test_bazel_config.yaml' + - '.bazelrc' + - '.bazeliskrc' + - 'WORKSPACE' + 
pull_request: + branches: + - main + paths: + - 'tools/**' + - '.github/workflows/test_bazel_config.yaml' + - '.bazelrc' + - '.bazeliskrc' + - 'WORKSPACE' + + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + bazel_config_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run bazel sync + run: | + bazel build \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --nobuild \ + //... diff --git a/.github/workflows/test_python.yaml b/.github/workflows/test_python.yaml new file mode 100644 index 0000000000..6c0467e0f3 --- /dev/null +++ b/.github/workflows/test_python.yaml @@ -0,0 +1,71 @@ +name: Test Python + +on: + push: + branches: + - main + paths: + - 'api/python/**' + - '.github/workflows/test_python.yaml' + pull_request: + branches: + - main + paths: + - 'api/python/**' + - '.github/workflows/test_python.yaml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + python_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Configure Git + run: | + git config --global user.email "github-actions@github.com" + git config --global user.name "GitHub Actions" + + - name: Set up Python virtual environment + shell: bash + run: | + python3 -m venv chronon_py_env + source chronon_py_env/bin/activate + + - name: Run Chronon Python lint (ruff) + shell: bash + run: | + source chronon_py_env/bin/activate + cd api/python + pip install ruff + ruff check . 
+ + - name: Run Chronon Python tests + shell: bash + run: | + set -euxo pipefail + source chronon_py_env/bin/activate + for file in api/thrift/*.thrift; do + thrift --gen py -out api/python/ "$file" + done + cd api/python + pip3 install -r requirements/dev.txt + pip3 install tox + tox + + - uses: actions/upload-artifact@v4 + with: + name: htmlcov + path: api/python/htmlcov \ No newline at end of file diff --git a/.github/workflows/test_scala_2_12_non_spark.yaml b/.github/workflows/test_scala_2_12_non_spark.yaml new file mode 100644 index 0000000000..07ed57bc84 --- /dev/null +++ b/.github/workflows/test_scala_2_12_non_spark.yaml @@ -0,0 +1,260 @@ +name: Test non-spark modules on scala 2.12 + +on: + push: + branches: + - main + paths: + - 'flink/**' + - 'aggregator/**' + - 'online/**' + - 'api/**' + - 'service/**' + - 'service_commons/**' + - 'cloud_aws/**' + - 'cloud_gcp/**' + - '.github/workflows/test_scala_2_12_non_spark.yaml' + pull_request: + branches: + - main + paths: + - 'flink/**' + - 'aggregator/**' + - 'online/**' + - 'api/**' + - 'service/**' + - 'service_commons/**' + - 'cloud_aws/**' + - 'cloud_gcp/**' + - '.github/workflows/test_scala_2_12_non_spark.yaml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + flink_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run Flink tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //flink:tests + + aggregator_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run Aggregator tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //aggregator:tests + + online_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run Online tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //online:tests + + api_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + 
working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run api tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //api:tests + + service_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run service tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //service:tests + + service_commons_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run service_commons tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //service_commons:tests + + cloud_gcp_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run cloud gcp tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //cloud_gcp:tests + + cloud_aws_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run cloud aws tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --java_language_version=17 \ + --java_runtime_version=17 \ + //cloud_aws:tests diff --git a/.github/workflows/test_scala_2_12_spark.yaml b/.github/workflows/test_scala_2_12_spark.yaml new file mode 100644 index 0000000000..39b200e320 --- /dev/null +++ b/.github/workflows/test_scala_2_12_spark.yaml @@ -0,0 +1,201 @@ +name: Test Spark 
module on scala 2.12 + +on: + push: + branches: + - main + paths: + - 'spark/**' + - '.github/workflows/test_scala_2_12_spark.yaml' + pull_request: + branches: + - main + paths: + - 'spark/**' + - '.github/workflows/test_scala_2_12_spark.yaml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + spark_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Spark tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx8G -Xms2G" \ + //spark:tests + batch_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Batch tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx8G -Xms2G" \ + //spark:batch_test + + fetcher_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Fetcher tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:fetcher_test + + join_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Join tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:join_test + + groupby_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run GroupBy tests + run: | + bazel test \ + 
--remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:groupby_test + + analyzer_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Analyzer tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:analyzer_test + + streaming_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Streaming tests + run: | + bazel test \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:streaming_test \ No newline at end of file diff --git a/.github/workflows/test_scala_2_13_non_spark.yaml b/.github/workflows/test_scala_2_13_non_spark.yaml new file mode 100644 index 0000000000..752de0a874 --- /dev/null +++ b/.github/workflows/test_scala_2_13_non_spark.yaml @@ -0,0 +1,237 @@ +name: Test non-spark modules on scala 2.13 + +on: + push: + branches: + - main + paths: + - 'flink/**' + - 'aggregator/**' + - 'online/**' + - 'api/**' + - 'service/**' + - 'cloud_aws/**' + - 'cloud_gcp/**' + - '.github/workflows/test_scala_2_13_non_spark.yaml' + pull_request: + branches: + - main + paths: + - 'flink/**' + - 'aggregator/**' + - 'online/**' + - 'api/**' + - 'service/**' + - 'cloud_aws/**' + - 'cloud_gcp/**' + - '.github/workflows/test_scala_2_13_non_spark.yaml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + flink_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run Flink tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //flink:tests + + aggregator_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: 
Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run Aggregator tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //aggregator:tests + + online_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run Online tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //online:tests + + api_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run api tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //api:tests + + service_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run service tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //service:tests + + cloud_gcp_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run cloud gcp tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + //cloud_gcp:tests + + cloud_aws_tests: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Setup 
Sim-Links + id: sim-links + run: ./startup.sh + + - name: Run cloud aws tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --java_language_version=17 \ + --java_runtime_version=17 \ + //cloud_aws:tests diff --git a/.github/workflows/test_scala_2_13_spark.yaml b/.github/workflows/test_scala_2_13_spark.yaml new file mode 100644 index 0000000000..3090eab231 --- /dev/null +++ b/.github/workflows/test_scala_2_13_spark.yaml @@ -0,0 +1,209 @@ +name: Test Spark module on scala 2.13 + +on: + push: + branches: + - main + paths: + - 'spark/**' + - '.github/workflows/test_scala_2_13_spark.yaml' + pull_request: + branches: + - main + paths: + - 'spark/**' + - '.github/workflows/test_scala_2_13_spark.yaml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + spark_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Spark tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx8G -Xms2G" \ + //spark:tests + + batch_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Batch tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx8G -Xms2G" \ + //spark:batch_test + + fetcher_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Fetcher tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:fetcher_test + + join_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Join tests + run: | + bazel test \ + --config=scala_2.13 \ + 
--remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:join_test + + groupby_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run GroupBy tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:groupby_test + + analyzer_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Analyzer tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:analyzer_test + + streaming_tests: + runs-on: ubuntu-8_cores-32_gb + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Run Streaming tests + run: | + bazel test \ + --config=scala_2.13 \ + --remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + --test_env=JAVA_OPTS="-Xmx16G -Xms8G" \ + //spark:streaming_test \ No newline at end of file diff --git a/.github/workflows/test_scala_fmt.yaml b/.github/workflows/test_scala_fmt.yaml new file mode 100644 index 0000000000..d6ee84cd23 --- /dev/null +++ b/.github/workflows/test_scala_fmt.yaml @@ -0,0 +1,50 @@ +name: Scala Fmt + +on: + push: + branches: + - main + paths: + - '**/*.scala' + - '.github/workflows/test_scala_fmt.yaml' + pull_request: + branches: + - main + paths: + - '**/*.scala' + - '.github/workflows/test_scala_fmt.yaml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + scala_compile_fmt_fix: + runs-on: ubuntu-latest + container: + image: ghcr.io/${{ github.repository }}-ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + defaults: + run: + working-directory: ${{ github.workspace }} + + steps: + - uses: actions/checkout@v4 + + - name: Set up locale + run: | + export LANG=en_US.UTF-8 + export LC_ALL=en_US.UTF-8 + + - name: Setup Bazel cache credentials + run: | + echo "${{ secrets.BAZEL_CACHE_CREDENTIALS }}" | base64 -d > bazel-cache-key.json + + - name: Check Scalafmt + run: | + bazel query 'kind("scala_library.*", //...)' | xargs -I {} bazel run \ + 
--remote_cache=https://storage.googleapis.com/zipline-bazel-cache \ + --google_credentials=bazel-cache-key.json \ + {}.format-test \ No newline at end of file diff --git a/BUILD.bazel b/BUILD.bazel index a05fee3160..e69de29bb2 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1,36 +0,0 @@ -load("@rules_license//rules:license.bzl", "license") -load("@rules_license//rules:license_kind.bzl", "license_kind") - -license_kind( - name = "zipline_ai", -) - -license( - name = "license", - package_name = "zipline_packages", - license_kinds = [":zipline_ai"], - license_text = "Refer to Zipline Agreement Document", -) - -package( - default_package_metadata = [":license"], - default_visibility = ["//visibility:public"], -) - -exports_files(["maven_install.json"]) - -# Don't try to build this target, instead build individual module uber jars with appropriate main class -# This is currently only used to generate single SBOM file with all our dependencies across modules -jvm_binary( - name = "zipline_packages", - deploy_env = [ - "//tools/build_rules/cloud_gcp:cloud_gcp", - "//tools/build_rules/flink:flink", - ], - main_class = "None", - runtime_deps = [ - "//cloud_gcp:base_cloud_gcp_lib", - "//flink:lib", - "//service:lib", - ], -) diff --git a/api/python/ai/chronon/cli/compile/conf_validator.py b/api/python/ai/chronon/cli/compile/conf_validator.py index 09e12c5fb4..a3c20f8bf6 100644 --- a/api/python/ai/chronon/cli/compile/conf_validator.py +++ b/api/python/ai/chronon/cli/compile/conf_validator.py @@ -20,6 +20,7 @@ import re from collections import defaultdict from typing import Dict, List, Set +import textwrap import ai.chronon.api.common.ttypes as common from ai.chronon.api.ttypes import ( @@ -29,7 +30,7 @@ ExternalPart, GroupBy, Join, - Source, + Source, JoinPart, ) from ai.chronon.group_by import get_output_col_names from ai.chronon.logger import get_logger @@ -377,6 +378,80 @@ def _validate_derivations( derived_columns.add(derivation.name) return errors + def _validate_join_part_keys(self, join_part: JoinPart, left_cols: List[str]) -> BaseException: + keys = [] + + key_mapping = join_part.keyMapping if join_part.keyMapping else {} + for key in join_part.groupBy.keyColumns: + keys.append(key_mapping.get(key, key)) + + missing = [k for k in keys if k not in left_cols] + + err_string = "" + left_cols_as_str = ", ".join(left_cols) + if missing: + err_string += textwrap.dedent(f""" + Join is missing keys {missing} on left side. Required for JoinPart: {join_part.groupBy.metaData.name}. 
+ Existing columns on left side: {left_cols_as_str} + All required Keys: {join_part.groupBy.keyColumns} + Key Mapping: {key_mapping if key_mapping else 'None'} + Consider renaming a column on the left, or including the key_mapping argument to your join_part.""") + + if key_mapping: + # Left side of key mapping should include columns on the left + key_map_keys_missing_from_left = [k for k in key_mapping.keys() if k not in left_cols] + if key_map_keys_missing_from_left: + err_string += f"\nThe following keys in your key_mapping: {str(key_map_keys_missing_from_left)} are not included in the left side of the join: {left_cols_as_str}" + + # Right side of key mapping should only include keys in GroupBy + keys_missing_from_key_map_values = [v for v in key_mapping.values() if v not in join_part.groupBy.keyColumns] + if keys_missing_from_key_map_values: + err_string += f"\nThe following values in your key_mapping: {str(keys_missing_from_key_map_values)} do not cover any group by key columns: {join_part.groupBy.keyColumns}" + + if key_map_keys_missing_from_left or keys_missing_from_key_map_values: + err_string += f"\nKey Mapping should be formatted as column_from_left -> group_by_key." + + if err_string: + return ValueError(err_string) + + + def _validate_keys(self, join: Join) -> List[BaseException]: + left = join.left + + left_selects = None + if left.events: + left_selects = left.events.query.selects + elif left.entities: + left_selects = left.entities.query.selects + elif left.joinSource: + left_selects = left.joinSource.query.selects + # TODO -- if selects are not selected here, get output cols from join + + left_cols = [] + + if left_selects: + left_cols = left_selects.keys() + + errors = [] + + if left_cols: + join_parts = join.joinParts + + # Add label_parts to join_parts to validate if set + label_parts = join.labelParts + if label_parts: + for label_jp in label_parts.labels: + join_parts.append(label_jp) + + # Validate join_parts + for join_part in join_parts: + join_part_err = self._validate_join_part_keys(join_part, left_cols) + if join_part_err: + errors.append(join_part_err) + + return errors + + def _validate_join(self, join: Join) -> List[BaseException]: """ Validate join's status with materialized versions of group_bys @@ -437,6 +512,9 @@ def _validate_join(self, join: Join) -> List[BaseException]: keys = get_pre_derived_source_keys(join.left) columns = features + keys errors.extend(self._validate_derivations(columns, join.derivations)) + + errors.extend(self._validate_keys(join)) + return errors def _validate_group_by(self, group_by: GroupBy) -> List[BaseException]: diff --git a/api/thrift/agent.thrift b/api/thrift/agent.thrift index fb510c7017..da81c72a7a 100644 --- a/api/thrift/agent.thrift +++ b/api/thrift/agent.thrift @@ -5,8 +5,8 @@ include "common.thrift" struct YarnAutoScalingSpec { 1: optional i32 minInstances 2: optional i32 maxInstances - 3: optional i32 scaleUpFactor // 1.5x, 2x etc - 4: optional i32 scaleDownFactor + 3: optional double scaleUpFactor // 1.5x, 2x etc + 4: optional double scaleDownFactor 5: optional string cooldownPeriod } diff --git a/cloud_aws/src/main/scala/ai/chronon/integrations/aws/EmrSubmitter.scala b/cloud_aws/src/main/scala/ai/chronon/integrations/aws/EmrSubmitter.scala index 3051a4f7a2..b757ff04a9 100644 --- a/cloud_aws/src/main/scala/ai/chronon/integrations/aws/EmrSubmitter.scala +++ b/cloud_aws/src/main/scala/ai/chronon/integrations/aws/EmrSubmitter.scala @@ -56,10 +56,17 @@ class EmrSubmitter(customerId: String, emrClient: EmrClient) extends 
JobSubmitte clusterIdleTimeout: Int = DefaultClusterIdleTimeout, masterInstanceType: String = DefaultClusterInstanceType, slaveInstanceType: String = DefaultClusterInstanceType, - instanceCount: Int = DefaultClusterInstanceCount) = { - val runJobFlowRequestBuilder = RunJobFlowRequest - .builder() - .name(s"job-${java.util.UUID.randomUUID.toString}") + instanceCount: Int = DefaultClusterInstanceCount, + clusterName: Option[String] = None) = { + val runJobFlowRequestBuilder = if (clusterName.isDefined) { + RunJobFlowRequest + .builder() + .name(clusterName.get) + } else { + RunJobFlowRequest + .builder() + .name(s"job-${java.util.UUID.randomUUID.toString}") + } // Cluster infra configurations: val customerSecurityGroupId = CustomerToSecurityGroupIdMap.getOrElse( @@ -170,7 +177,9 @@ class EmrSubmitter(customerId: String, emrClient: EmrClient) extends JobSubmitte submissionProperties.getOrElse(ClusterIdleTimeout, DefaultClusterIdleTimeout.toString).toInt, masterInstanceType = submissionProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType), slaveInstanceType = submissionProperties.getOrElse(ClusterInstanceType, DefaultClusterInstanceType), - instanceCount = submissionProperties.getOrElse(ClusterInstanceCount, DefaultClusterInstanceCount.toString).toInt + instanceCount = + submissionProperties.getOrElse(ClusterInstanceCount, DefaultClusterInstanceCount.toString).toInt, + clusterName = submissionProperties.get(ClusterName) ) runJobFlowBuilder.steps( @@ -200,10 +209,11 @@ class EmrSubmitter(customerId: String, emrClient: EmrClient) extends JobSubmitte } } - override def status(jobId: String): Unit = { + override def status(jobId: String): String = { val describeStepResponse = emrClient.describeStep(DescribeStepRequest.builder().stepId(jobId).build()) val status = describeStepResponse.step().status() println(status) + status.toString } override def kill(stepId: String): Unit = { diff --git a/cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitter.scala b/cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitter.scala index 1d967c6c23..b11f9628a0 100644 --- a/cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitter.scala +++ b/cloud_gcp/src/main/scala/ai/chronon/integrations/cloud_gcp/DataprocSubmitter.scala @@ -30,17 +30,17 @@ case class GeneralJob( class DataprocSubmitter(jobControllerClient: JobControllerClient, conf: SubmitterConf) extends JobSubmitter { - override def status(jobId: String): Unit = { + override def status(jobId: String): String = { try { val currentJob: Job = jobControllerClient.getJob(conf.projectId, conf.region, jobId) - currentJob.getStatus.getState + currentJob.getStatus.getState.toString } catch { case e: ApiException => println(s"Error monitoring job: ${e.getMessage}") - + "UNKNOWN" // If there's an error, we return UNKNOWN status } } diff --git a/docs/source/dev/devnotes.md b/docs/source/dev/devnotes.md index 7db7318623..d57ed134a7 100644 --- a/docs/source/dev/devnotes.md +++ b/docs/source/dev/devnotes.md @@ -147,18 +147,6 @@ bazel build //hub:hub_assembly_deploy.jar > transitive dependencies, otherwise `bazel build //{module}:{target}` will only include > dependencies specified in the target definition -### Generate SBOM file for security review - -We created `zipline_packages` target in root directory BUILD.bazel file including all runtime deps -used across our deployed uber jars so we include all the necessary packages - -> Note: Do not use `zipline_packages` target for other purposes - 
-```shell -# This will generate SBOM file at bazel-bin/tools/compliance/zipline_sbom.json -bazel build //tools/compliance:zipline_sbom -``` - ### All tests for a specific module Also it's lot easier to just run from IntelliJ diff --git a/scripts/codemod/BUILD.bazel b/scripts/codemod/BUILD.bazel new file mode 100644 index 0000000000..0f607ce2ee --- /dev/null +++ b/scripts/codemod/BUILD.bazel @@ -0,0 +1,6 @@ +py_binary( + name = "thrift_package_replace", + srcs = ["thrift_package_replace.py"], + main = "thrift_package_replace.py", + visibility = ["//visibility:public"], +) \ No newline at end of file diff --git a/scripts/codemod/test_replace.py b/scripts/codemod/test_replace.py new file mode 100644 index 0000000000..0a1a17eda2 --- /dev/null +++ b/scripts/codemod/test_replace.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + + +import glob + +""" +we have tests written in multiple flavors + +- extending junit TestCase class +- using @test annotation +- using AnyFunSuite +- using AnyFlatSpec +- using vertx junit runner + +bazel silently fails to run the tests when they are not uniform! + +This script translates almost all of the tests to AnyFlatSpec except for vertx tests. + +NOTE: CWD needs to be the root of the repo. + +USAGE: python3 scripts/codemod/test_replace.py +""" + + +def get_test_class_name(path): + # Get the file name from the path + filename = path.split("/")[-1] + # Remove 'Test.scala' and return + return filename.replace("Test.scala", "") + + +def convert_fun_suite_to_flatspec(lines, test_name): + modified_lines = [] + + for line in lines: + # Replace import statement + if "import org.scalatest.funsuite.AnyFunSuite" in line: + line = line.replace("funsuite.AnyFunSuite", "flatspec.AnyFlatSpec") + modified_lines.append(line) + continue + + # Replace AnyFunSuite with AnyFlatSpec + if "extends AnyFunSuite" in line: + line = line.replace("AnyFunSuite", "AnyFlatSpec") + modified_lines.append(line) + continue + + # Handle ignore tests and regular tests + if ("ignore(" in line or "test(" in line) and "{" in line: + start = line.find('"') + end = line.find('"', start + 1) + if start != -1 and end != -1: + test_desc = line[start + 1 : end] # Get description without quotes + words = test_desc.split() + + # Check if second word is "should" + if len(words) > 1 and words[1].lower() == "should": + subject = words[0] # Use first word as subject + remaining_desc = " ".join( + words[2:] + ) # Rest of description including "should" + new_desc = f'"{subject}" should "{remaining_desc}"' + else: + new_desc = f' it should "{test_desc}"' + + # Add appropriate suffix based on whether it's ignore or test + if "ignore(" in line: + new_line = f"{new_desc} ignore {{" + else: + new_line = f"{new_desc} in {{" + + modified_lines.append(new_line + "\n") + continue + + # Keep other lines unchanged + modified_lines.append(line) + + return "".join(modified_lines) + + +def split_camel_case(word): + if not word: + return [] + + result = [] + current_word = word[0].lower() + + for i in range(1, len(word)): + current_char = word[i] + prev_char = word[i - 1] + + # Split on transition from lowercase to uppercase + if current_char.isupper() and prev_char.islower(): + result.append(current_word) + current_word = current_char.lower() + # Split on transition from uppercase to lowercase, but only if it's not + # part of an acronym (i.e., if the previous char was also uppercase and + # not at the start of a word) + elif ( + current_char.islower() + and prev_char.isupper() + and i > 1 + and word[i - 2].isupper() + ): + 
result.append(current_word[:-1]) + current_word = prev_char.lower() + current_char + else: + current_word += current_char.lower() + + result.append(current_word) + return [token for token in result if token != "test"] + + +def convert_junit_to_flatspec(lines, test_name): + modified_lines = [] + is_test_method = False + class_modified = False + + for line in lines: + # Replace JUnit import with FlatSpec import + if "import org.junit.Test" in line: + modified_lines.append("import org.scalatest.flatspec.AnyFlatSpec\n") + continue + + # Handle class definition + if "class" in line and "Test" in line and (not class_modified): + class_modified = True + class_name = line.split("class")[1].split("{")[0].strip() + modified_lines.append(f"class {class_name} extends AnyFlatSpec {{\n") + continue + + # Mark start of a test method + if "@Test" in line: + is_test_method = True + continue + + # Convert only test methods marked with @Test and not private + if ( + is_test_method + and "def " in line + and "private" not in line + and (("(): Unit" in line) or ("): Unit" not in line)) + ): + is_test_method = False + + method_name = line.split("def ")[1].split("(")[0] + + test_description = " ".join(split_camel_case(method_name)) + + modified_lines.append(f' it should "{test_description}" in {{\n') + continue + + is_test_method = False + modified_lines.append(line) + + return "".join(modified_lines) + + +def convert_testcase_to_flatspec(lines, test_name): + modified_lines = [] + + for line in lines: + # Replace TestCase import with FlatSpec import + if "junit.framework.TestCase" in line: + modified_lines.append("import org.scalatest.flatspec.AnyFlatSpec\n") + continue + + # Handle imports that we want to keep + if line.startswith("import") and "TestCase" not in line: + modified_lines.append(line) + continue + + # Handle class definition + if "class" in line and "extends TestCase" in line: + class_name = line.split("class")[1].split("extends")[0].strip() + modified_lines.append(f"class {class_name} extends AnyFlatSpec {{\n") + continue + + # Convert test methods (they start with "def test") + if ( + "def test" in line + and "private" not in line + and ("(): Unit" in line or "): Unit" not in line) + ): + method_name = line.split("def test")[1].split("(")[0].strip() + # If there are parameters, capture them + + test_description = " ".join(split_camel_case(method_name)) + + modified_lines.append(f' it should "{test_description}" in {{\n') + continue + + modified_lines.append(line) + + return "".join(modified_lines) + + +def convert(handler, file_path): + test_name = get_test_class_name(file_path) + with open(file_path, "r") as file: + lines = file.readlines() + converted = handler(lines, test_name) + + with open(file_path, "w") as file: + file.write(converted) + + print(f"Converted {file_path}") + + +# Few challenging test cases below + +# convert( +# convert_junit_to_flatspec, +# "spark/src/test/scala/ai/chronon/spark/test/JoinUtilsTest.scala", +# ) + +# convert( +# convert_junit_to_flatspec, +# "spark/src/test/scala/ai/chronon/spark/test/LocalExportTableAbilityTest.scala", +# ) + +# convert( +# convert_testcase_to_flatspec, +# "aggregator/src/test/scala/ai/chronon/aggregator/test/FrequentItemsTest.scala", +# ) + +# convert( +# convert_fun_suite_to_flatspec, +# "spark/src/test/scala/ai/chronon/spark/test/FetcherTest.scala", +# ) + + +if __name__ == "__main__": + test_files = glob.glob("**/*Test.scala", recursive=True) + + fun_suite_files = [] + junit_files = [] + others = [] + junit_test_case_files = [] + 
flat_spec_files = [] + + for file_path in test_files: + try: + with open(file_path, "r") as file: + content = file.read() + if "AnyFunSuite" in content: + fun_suite_files.append(file_path) + elif "import org.junit.Test" in content: + junit_files.append(file_path) + elif "extends TestCase" in content: + junit_test_case_files.append(file_path) + elif "extends AnyFlatSpec" in content: + flat_spec_files.append(file_path) + else: + others.append(file_path) + except Exception as e: + print(f"Error reading {file_path}: {e}") + + print(f"funsuite files:\n {"\n ".join(fun_suite_files)}") + + for file in fun_suite_files: + convert(convert_fun_suite_to_flatspec, file) + + print(f"junit files:\n {"\n ".join(junit_files)}") + + for file in junit_files: + convert(convert_junit_to_flatspec, file) + + print(f"test case files:\n {"\n ".join(junit_test_case_files)}") + + for file in junit_test_case_files: + convert(convert_testcase_to_flatspec, file) + + print(f"flat spec files:\n {"\n ".join(flat_spec_files)}") + print(f"Other files:\n {"\n ".join(others)}") diff --git a/scripts/codemod/thrift_package_replace.py b/scripts/codemod/thrift_package_replace.py new file mode 100644 index 0000000000..b080831bfe --- /dev/null +++ b/scripts/codemod/thrift_package_replace.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +from pathlib import Path + + +def replace_in_file(input_file: Path, output_file: Path) -> None: + """Replace package names in a single file.""" + os.makedirs(output_file.parent, exist_ok=True) + + with open(input_file, 'r') as f: + content = f.read() + + modified_content = content.replace('org.apache.thrift', 'ai.chronon.api.thrift') + + with open(output_file, 'w') as f: + f.write(modified_content) + + +def process_directory(input_dir: Path, output_dir: Path, verbose: bool = False) -> None: + """Process all Java files in the input directory and its subdirectories.""" + if verbose: + print(f"Scanning directory: {input_dir}") + print(f"Output directory: {output_dir}") + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + # Find all Java files + for java_file in input_dir.rglob('*.java'): + if verbose: + print(f"Processing file: {java_file}") + + # Calculate relative path to maintain directory structure + rel_path = java_file.relative_to(input_dir) + output_file = output_dir / rel_path + + if verbose: + print(f"Writing to: {output_file}") + + replace_in_file(java_file, output_file) + + +def main(): + parser = argparse.ArgumentParser(description='Replace package names in Java files') + parser.add_argument('input_dir', type=str, help='Input directory containing Java files') + parser.add_argument('output_dir', type=str, help='Output directory for modified files') + parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose output') + + args = parser.parse_args() + + input_path = Path(args.input_dir) + output_path = Path(args.output_dir) + + if not input_path.exists(): + print(f"Error: Input directory '{input_path}' does not exist", file=sys.stderr) + sys.exit(1) + + if not input_path.is_dir(): + print(f"Error: '{input_path}' is not a directory", file=sys.stderr) + sys.exit(1) + + try: + process_directory(input_path, output_path, args.verbose) + if args.verbose: + print("Replacement complete!") + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git 
a/spark/src/main/scala/ai/chronon/spark/submission/JobSubmitter.scala b/spark/src/main/scala/ai/chronon/spark/submission/JobSubmitter.scala index de8e6927a2..48953e5be8 100644 --- a/spark/src/main/scala/ai/chronon/spark/submission/JobSubmitter.scala +++ b/spark/src/main/scala/ai/chronon/spark/submission/JobSubmitter.scala @@ -21,7 +21,7 @@ trait JobSubmitter { files: List[String], args: String*): String - def status(jobId: String): Unit + def status(jobId: String): String def kill(jobId: String): Unit } @@ -109,6 +109,7 @@ object JobSubmitterConstants { val EmrReleaseLabel = "emrReleaseLabel" val ShouldCreateCluster = "shouldCreateCluster" val ClusterId = "jobFlowId" + val ClusterName = "clusterName" val JarUriArgKeyword = "--jar-uri" val JobTypeArgKeyword = "--job-type" diff --git a/tools/compliance/BUILD b/tools/compliance/BUILD deleted file mode 100644 index ce4b287bc9..0000000000 --- a/tools/compliance/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -load(":gather_packages.bzl", "packages_used") -load(":sbom.bzl", "sbom") - -packages_used( - name = "zipline_packages", - out = "zipline_packages.json", - target = "//:zipline_packages_deploy.jar", -) - -sbom( - name = "zipline_sbom", - out = "zipline_sbom.json", - target = "//:zipline_packages_deploy.jar", -) - -# Add the write_sbom_private target similar to the reference -py_binary( - name = "write_sbom_private", - srcs = ["write_sbom.py"], - main = "write_sbom.py", -) diff --git a/tools/compliance/gather_packages.bzl b/tools/compliance/gather_packages.bzl deleted file mode 100644 index ff26347224..0000000000 --- a/tools/compliance/gather_packages.bzl +++ /dev/null @@ -1,258 +0,0 @@ -# This file is copied from the bazel github repository https://github.com/bazelbuild/bazel/blob/master/tools/compliance -# as it was recently added in latest bazel version but we are currently using old 5.4.0 because of pending -# bazelmod migration for rules_scala package -# *** DO NOT MODIFY THIS FILE UNLESS WE REALLY NEED TO *** - -"""Rules and macros for collecting LicenseInfo providers.""" - -load("@rules_license//rules:filtered_rule_kinds.bzl", "aspect_filters") -load( - "@rules_license//rules:providers.bzl", - "LicenseInfo", - "PackageInfo", -) -load("@rules_license//rules_gathering:trace.bzl", "TraceInfo") -load(":to_json.bzl", "labels_to_json", "licenses_to_json", "package_infos_to_json") -load(":user_filtered_rule_kinds.bzl", "user_aspect_filters") - -TransitivePackageInfo = provider( - """Transitive list of all SBOM relevant dependencies.""", - fields = { - "top_level_target": "Label: The top level target label we are examining.", - "license_info": "depset(LicenseInfo)", - "package_info": "depset(PackageInfo)", - "packages": "depset(label)", - "target_under_license": "Label: A target which will be associated with some licenses.", - "traces": "list(string) - diagnostic for tracing a dependency relationship to a target.", - }, -) - -# Singleton instance of nothing present. Each level of the aspect must return something, -# so we use one to save space instead of making a lot of empty ones. -NULL_INFO = TransitivePackageInfo(license_info = depset(), package_info = depset(), packages = depset()) - -def should_traverse(ctx, attr): - """Checks if the dependent attribute should be traversed. - - Args: - ctx: The aspect evaluation context. - attr: The name of the attribute to be checked. - - Returns: - True iff the attribute should be traversed. 
- """ - k = ctx.rule.kind - for filters in [aspect_filters, user_aspect_filters]: - always_ignored = filters.get("*", []) - if k in filters: - attr_matches = filters[k] - if (attr in attr_matches or - "*" in attr_matches or - ("_*" in attr_matches and attr.startswith("_")) or - attr in always_ignored): - return False - - for m in attr_matches: - if attr == m: - return False - return True - -def _get_transitive_metadata(ctx, trans_license_info, trans_package_info, trans_packages, traces, provider, filter_func): - """Pulls the transitive data up from attributes we care about. - - Collapses all the TransitivePackageInfo providers from my deps into lists - that we can then turn into a single TPI. - """ - attrs = [a for a in dir(ctx.rule.attr)] - for name in attrs: - if not filter_func(ctx, name): - continue - a = getattr(ctx.rule.attr, name) - - # Make anything singleton into a list for convenience. - if type(a) != type([]): - a = [a] - for dep in a: - # Ignore anything that isn't a target - if type(dep) != "Target": - continue - - # Targets can also include things like input files that won't have the - # aspect, so we additionally check for the aspect rather than assume - # it's on all targets. Even some regular targets may be synthetic and - # not have the aspect. This provides protection against those outlier - # cases. - if provider in dep: - info = dep[provider] - if info.license_info: - trans_license_info.append(info.license_info) - if info.package_info: - trans_package_info.append(info.package_info) - if info.packages: - trans_packages.append(info.packages) - - if hasattr(info, "traces"): - if info.traces: - for trace in info.traces: - traces.append("(" + ", ".join([str(ctx.label), ctx.rule.kind, name]) + ") -> " + trace) - -def gather_package_common(target, ctx, provider_factory, metadata_providers, filter_func): - """Collect license and other metadata info from myself and my deps. - - Any single target might directly depend on a license, or depend on - something that transitively depends on a license, or neither. - This aspect bundles all those into a single provider. At each level, we add - in new direct license deps found and forward up the transitive information - collected so far. - - This is a common abstraction for crawling the dependency graph. It is - parameterized to allow specifying the provider that is populated with - results. It is configurable to select only a subset of providers. It - is also configurable to specify which dependency edges should not - be traced for the purpose of tracing the graph. - - Args: - target: The target of the aspect. - ctx: The aspect evaluation context. - provider_factory: abstracts the provider returned by this aspect - metadata_providers: a list of other providers of interest - filter_func: a function that returns true iff the dep edge should be ignored - - Returns: - provider of parameterized type - """ - - # A hack until https://github.com/bazelbuild/rules_license/issues/89 is - # fully resolved. If exec is in the bin_dir path, then the current - # configuration is probably cfg = exec. - if "-exec" in ctx.bin_dir.path: - return [NULL_INFO] - - # First we gather my direct license attachments - licenses = [] - package_info = [] - if ctx.rule.kind == "_license": - # Don't try to gather licenses from the license rule itself. We'll just - # blunder into the text file of the license and pick up the default - # attribute of the package, which we don't want. 
- pass - elif hasattr(ctx.rule.attr, "applicable_licenses"): - for dep in ctx.rule.attr.applicable_licenses: - if LicenseInfo in dep: - licenses.append(dep[LicenseInfo]) - if PackageInfo in dep: - package_info.depend(dep[LicenseInfo]) - elif hasattr(ctx.rule.attr, "package_metadata"): - for dep in ctx.rule.attr.package_metadata: - if LicenseInfo in dep: - licenses.append(dep[LicenseInfo]) - if PackageInfo in dep: - package_info.depend(dep[LicenseInfo]) - - # Record all the external repos anyway. - target_name = str(target.label) - packages = [] - if target_name.startswith("@") and target_name[1] != "/": - packages.append(target.label) - # DBG print(str(target.label)) - - elif hasattr(ctx.rule.attr, "tags"): - for tag in ctx.rule.attr.tags: - if tag.startswith("maven_coordinates="): - packages.append(target.label) - - # Now gather transitive collection of providers from the targets - # this target depends upon. - trans_license_info = [] - trans_package_info = [] - trans_packages = [] - traces = [] - _get_transitive_metadata(ctx, trans_license_info, trans_package_info, trans_packages, traces, provider_factory, filter_func) - - if (not licenses and - not package_info and - not packages and - not trans_license_info and - not trans_package_info and - not trans_packages): - return [NULL_INFO] - - # If this is the target, start the sequence of traces. - if ctx.attr._trace[TraceInfo].trace and ctx.attr._trace[TraceInfo].trace in str(ctx.label): - traces = [ctx.attr._trace[TraceInfo].trace] - - # Trim the number of traces accumulated since the output can be quite large. - # A few representative traces are generally sufficient to identify why a dependency - # is incorrectly incorporated. - if len(traces) > 10: - traces = traces[0:10] - - return [provider_factory( - target_under_license = target.label, - license_info = depset(direct = licenses, transitive = trans_license_info), - package_info = depset(direct = package_info, transitive = trans_package_info), - packages = depset(direct = packages, transitive = trans_packages), - traces = traces, - )] - -def _gather_package_impl(target, ctx): - ret = gather_package_common( - target, - ctx, - TransitivePackageInfo, - # [ExperimentalMetadataInfo, PackageInfo], - [PackageInfo], - should_traverse, - ) - - # print(ret) - return ret - -gather_package_info = aspect( - doc = """Collects License and Package providers into a single TransitivePackageInfo provider.""", - implementation = _gather_package_impl, - attr_aspects = ["*"], - attrs = { - "_trace": attr.label(default = "@rules_license//rules:trace_target"), - }, - provides = [TransitivePackageInfo], - apply_to_generating_rules = True, -) - -def _packages_used_impl(ctx): - """Write the TransitivePackageInfo as JSON.""" - tpi = ctx.attr.target[TransitivePackageInfo] - licenses_json = licenses_to_json(tpi.license_info) - package_info_json = package_infos_to_json(tpi.package_info) - packages = labels_to_json(tpi.packages.to_list()) - - # Create a single dict of all the info. 
- main_template = """{{ - "top_level_target": "{top_level_target}", - "licenses": {licenses}, - "package_info": {package_info}, - "packages": {packages} - \n}}""" - - content = main_template.format( - top_level_target = ctx.attr.target.label, - licenses = licenses_json, - package_info = package_info_json, - packages = packages, - ) - ctx.actions.write( - output = ctx.outputs.out, - content = content, - ) - -packages_used = rule( - doc = """Gather transitive package information for a target and write as JSON.""", - implementation = _packages_used_impl, - attrs = { - "target": attr.label( - aspects = [gather_package_info], - allow_files = True, - ), - "out": attr.output(mandatory = True), - }, -) diff --git a/tools/compliance/packages_used_test.py b/tools/compliance/packages_used_test.py deleted file mode 100644 index bafb855004..0000000000 --- a/tools/compliance/packages_used_test.py +++ /dev/null @@ -1,54 +0,0 @@ -# This file is copied from the bazel github repository https://github.com/bazelbuild/bazel/blob/master/tools/compliance -# as it was recently added in latest bazel version but we are currently using old 5.4.0 because of pending -# bazelmod migration for rules_scala package -# *** DO NOT MODIFY THIS FILE UNLESS WE REALLY NEED TO *** - -"""Smoke test for packages_used.""" - -import json -import os -import unittest - - -def read_data_file(basename): - path = os.path.join( - os.getenv("TEST_SRCDIR"), - os.getenv("TEST_WORKSPACE"), - "tools/compliance", - basename - ) - with open(path, "rt", encoding="utf-8") as f: - return f.read() - - -class PackagesUsedTest(unittest.TestCase): - - def test_found_key_licenses(self): - raw_json = read_data_file("bazel_packages.json") - content = json.loads(raw_json) - found_top_level_license = False - found_zlib = False - for l in content["licenses"]: - print(l["label"]) # for debugging the test - if l["label"] == "//:license": - found_top_level_license = True - if l["label"] == "//third_party/bazel:license": - found_top_level_license = True - if l["label"] == "//third_party/zlib:license": - found_zlib = True - self.assertTrue(found_top_level_license) - self.assertTrue(found_zlib) - - def test_found_remote_packages(self): - if os.getenv("TEST_WORKSPACE") != "bazel": - return - raw_json = read_data_file("bazel_packages.json") - content = json.loads(raw_json) - self.assertIn( - "@@remoteapis+//:build_bazel_remote_execution_v2_remote_execution_proto", - content["packages"], - ) - - -if __name__ == "__main__": - unittest.main() \ No newline at end of file diff --git a/tools/compliance/sbom.bzl b/tools/compliance/sbom.bzl deleted file mode 100644 index 128e927b1a..0000000000 --- a/tools/compliance/sbom.bzl +++ /dev/null @@ -1,79 +0,0 @@ -# This file is copied from the bazel github repository https://github.com/bazelbuild/bazel/blob/master/tools/compliance -# as it was recently added in latest bazel version but we are currently using old 5.4.0 because of pending -# bazelmod migration for rules_scala package -# *** DO NOT MODIFY THIS FILE UNLESS WE REALLY NEED TO *** - -"""Generate an SBOM for a target.""" - -load(":gather_packages.bzl", "packages_used") - -def _sbom_impl(ctx): - # Gather all licenses and write information to one place - - # Now turn the big blob of data into something consumable. 
- outputs = [ctx.outputs.out] - args = ctx.actions.args() - inputs = [ctx.file.packages_used] - args.add("--packages_used", ctx.file.packages_used.path) - args.add("--out", ctx.outputs.out.path) - if ctx.attr.maven_install: - args.add("--maven_install", ctx.file.maven_install.path) - inputs.append(ctx.file.maven_install) - ctx.actions.run( - mnemonic = "CreateSBOM", - progress_message = "Creating SBOM for %s" % ctx.label, - inputs = inputs, - outputs = outputs, - executable = ctx.executable._sbom_generator, - arguments = [args], - ) - return [DefaultInfo(files = depset(outputs))] - -_sbom = rule( - implementation = _sbom_impl, - attrs = { - "packages_used": attr.label( - allow_single_file = True, - mandatory = True, - ), - "out": attr.output(mandatory = True), - "_sbom_generator": attr.label( - default = Label("//tools/compliance:write_sbom_private"), - executable = True, - allow_files = True, - cfg = "exec", - ), - "maven_install": attr.label( - mandatory = False, - allow_single_file = True, - ), - }, -) - -def sbom( - name, - target, - out = None, - maven_install = "//:maven_install.json"): - """Wrapper for sbom rule. - - Args: - name: name - target: Target to create sbom for - out: output file name - maven_install: maven lock file - """ - packages = "_packages_" + name - packages_used( - name = packages, - target = target, - out = packages + ".json", - ) - if not out: - out = name + "_sbom.json" - _sbom( - name = name, - out = out, - packages_used = ":" + packages + ".json", - maven_install = maven_install, - ) diff --git a/tools/compliance/to_json.bzl b/tools/compliance/to_json.bzl deleted file mode 100644 index 2b69a73d1f..0000000000 --- a/tools/compliance/to_json.bzl +++ /dev/null @@ -1,136 +0,0 @@ -# This file is copied from the bazel github repository https://github.com/bazelbuild/bazel/blob/master/tools/compliance -# as it was recently added in latest bazel version but we are currently using old 5.4.0 because of pending -# bazelmod migration for rules_scala package -# *** DO NOT MODIFY THIS FILE UNLESS WE REALLY NEED TO *** - -"""Utility methods for turning package metadata to JSON. - -These should eventually be part of rules_license. -""" - -def _strip_null_repo(label): - """Removes the null repo name (e.g. @//) from a string. - - The is to make str(label) compatible between bazel 5.x and 6.x - """ - s = str(label) - if s.startswith("@//"): - return s[1:] - elif s.startswith("@@//"): - return s[2:] - return s - -def _bazel_package(label): - """Returns the package containing a label.""" - clean_label = _strip_null_repo(label) - return clean_label[0:-(len(label.name) + 1)] - -_license_template = """{{ - "label": "{label}", - "bazel_package": "{bazel_package}", - "license_kinds": [{kinds}], - "copyright_notice": "{copyright_notice}", - "package_name": "{package_name}", - "package_url": "{package_url}", - "package_version": "{package_version}", - "license_text": "{license_text}" -}}""" - -_kind_template = """{{ - "target": "{kind_path}", - "name": "{kind_name}", - "conditions": {kind_conditions} -}}""" - -def license_info_to_json(license): - """Converts a LicenseInfo to JSON. - - Args: - license: a LicenseInfo - Returns: - JSON representation of license. 
- """ - kinds = [] - for kind in sorted(license.license_kinds, key = lambda x: x.name): - kinds.append(_kind_template.format( - kind_name = kind.name, - kind_path = kind.label, - kind_conditions = kind.conditions, - )) - - return _license_template.format( - copyright_notice = license.copyright_notice, - kinds = ",".join(kinds), - license_text = license.license_text.path, - package_name = license.package_name, - package_url = license.package_url, - package_version = license.package_version, - label = _strip_null_repo(license.label), - bazel_package = _bazel_package(license.label), - ) - -def licenses_to_json(licenses): - """Converts a list of LicenseInfo to JSON. - - This list is sorted by label for stability. - - Args: - licenses: list(LicenseInfo) - Returns: - JSON representation of licenses - """ - all_licenses = [] - for license in sorted(licenses.to_list(), key = lambda x: x.label): - all_licenses.append(license_info_to_json(license)) - return "[" + ",".join(all_licenses) + "]" - -_package_info_template = """{{ - "target": "{label}", - "bazel_package": "{bazel_package}", - "package_name": "{package_name}", - "package_url": "{package_url}", - "package_version": "{package_version}" -}}""" - -def package_info_to_json(package_info): - """Converts a PackageInfo to json. - - Args: - package_info: a PackageInfo - Returns: - JSON representation of package_info. - """ - return _package_info_template.format( - label = _strip_null_repo(package_info.label), - bazel_package = _bazel_package(package_info.label), - package_name = package_info.package_name, - package_url = package_info.package_url, - package_version = package_info.package_version, - ) - -def package_infos_to_json(packages): - """Converts a list of PackageInfo to JSON. - - This list is sorted by label for stability. - - Args: - packages: list(PackageInfo) - Returns: - JSON representation of packages. - """ - all_packages = [] - for package in sorted(packages.to_list(), key = lambda x: x.label): - all_packages.append(package_info_to_json(package)) - return "[" + ",".join(all_packages) + "]" - -def labels_to_json(labels): - """Converts a list of Labels to JSON. - - This list is sorted for stability. - - Args: - labels: list(Label) - Returns: - JSON representation of the labels. - """ - return "[%s]" % ",".join(['"%s"' % _strip_null_repo(label) for label in sorted(labels)]) diff --git a/tools/compliance/user_filtered_rule_kinds.bzl b/tools/compliance/user_filtered_rule_kinds.bzl deleted file mode 100644 index 3a37a8353a..0000000000 --- a/tools/compliance/user_filtered_rule_kinds.bzl +++ /dev/null @@ -1,20 +0,0 @@ -# This file is copied from the bazel github repository https://github.com/bazelbuild/bazel/blob/master/tools/compliance -# as it was recently added in latest bazel version but we are currently using old 5.4.0 because of pending -# bazelmod migration for rules_scala package -# *** DO NOT MODIFY THIS FILE UNLESS WE REALLY NEED TO *** - -"""Filtered rule kinds for aspect inspection. - -The format of this dictionary is: - rule_name: [attr, attr, ...] - -Filters for rules that are not part of the Bazel distribution should be added -to this file. - -Attributes are either the explicit list of attributes to filter, or '_*' which -would ignore all attributes prefixed with a _. 
-""" - -# Rule kinds with attributes the aspect currently needs to ignore -user_aspect_filters = { -} diff --git a/tools/compliance/write_sbom.py b/tools/compliance/write_sbom.py deleted file mode 100644 index b621447e0f..0000000000 --- a/tools/compliance/write_sbom.py +++ /dev/null @@ -1,230 +0,0 @@ -# This file is copied from the bazel github repository https://github.com/bazelbuild/bazel/blob/master/tools/compliance -# as it was recently added in latest bazel version but we are currently using old 5.4.0 because of pending -# bazelmod migration for rules_scala package -# *** DO NOT MODIFY THIS FILE UNLESS WE REALLY NEED TO *** - -"""SBOM generator. - -This tool takes input from several sources and weaves together an SBOM. - -Inputs: - - the output of packages_used. This is a JSON block of license, package_info - and other declarations, plus a list of all remote packages referenced. - - the maven lock file (maven_install.json) - - FUTURE: other packgage lock files - - FUTURE: a user provided override of package URL to corrected information - -This tool is private to the sbom() rule. -""" - -import argparse -import datetime -import hashlib -import json - - -# pylint: disable=g-bare-generic -def create_sbom(package_info: dict, maven_packages: dict) -> dict: - """Creates a dict representing an SBOM. - - Args: - package_info: dict of data from packages_used output. - maven_packages: packages gleaned from Maven lock file. - - Returns: - dict of SBOM data - """ - now = datetime.datetime.now(datetime.timezone.utc) - ret = { - "spdxVersion": "SPDX-2.3", - "dataLicense": "CC0-1.0", - "SPDXID": "SPDXRef-DOCUMENT", - "documentNamespace": ( - "https://spdx.google/be852459-4c54-4c50-9d2f-0e48890418fc" - ), - "name": package_info["top_level_target"], - "creationInfo": { - "licenseListVersion": "", - "creators": [ - "Tool: github.com/bazelbuild/bazel/tools/compliance/write_sbom", - "Organization: Google LLC", - ], - "created": now.isoformat(), - }, - } - - packages = [] - relationships = [] - - relationships.append({ - "spdxElementId": "SPDXRef-DOCUMENT", - "relatedSpdxElement": "SPDXRef-Package-main", - "relationshipType": "DESCRIBES" - }) - - # This is bazel private shenanigans. - magic_file_suffix = "//file:file" - - for pkg in package_info["packages"]: - tmp_id = hashlib.md5() - tmp_id.update(pkg.encode("utf-8")) - spdxid = "SPDXRef-GooglePackage-%s" % tmp_id.hexdigest() - pi = { - "name": pkg, - "downloadLocation": "NOASSERTION", - "SPDXID": spdxid, - # TODO(aiuto): Fill in the rest - # "supplier": "Organization: Google LLC", - # "licenseConcluded": "License-XXXXXX", - # "copyrightText": "" - } - - have_maven = None - if pkg.startswith("@maven//:"): - have_maven = maven_packages.get(pkg[9:]) - elif pkg.endswith(magic_file_suffix): - # Bazel hacks jvm_external to add //file:file as a target, then we depend - # on that rather than the correct thing. - # Example: @org_apache_tomcat_tomcat_annotations_api_8_0_5//file:file - # Check for just the versioned root - have_maven = maven_packages.get(pkg[1 : -len(magic_file_suffix)]) - - if have_maven: - pi["downloadLocation"] = have_maven["url"] - else: - # TODO(aiuto): Do something better for this case. 
- print("MISSING ", pkg) - - packages.append(pi) - relationships.append({ - "spdxElementId": "SPDXRef-Package-main", - "relatedSpdxElement": spdxid, - "relationshipType": "CONTAINS", - }) - - ret["packages"] = packages - ret["relationships"] = relationships - return ret - - -def maven_to_bazel(s): - """Returns a string with maven separators mapped to what we use in Bazel. - - Essentially '.', '-', ':' => '_'. - - Args: - s: a string - - Returns: - a string - """ - return s.replace(".", "_").replace("-", "_").replace(":", "_") - - -# pylint: disable=g-bare-generic -def maven_install_to_packages(maven_install: dict) -> dict: - """Convert raw maven lock file into a dict keyed by bazel package names. - - Args: - maven_install: raw maven lock file data - - Returns: - dict keyed by names created by rules_jvm_external - """ - - # Map repo coordinate back to the download repository. - # The input dict is of the form - # "https//repo1.maven.org/": [ com.google.foo:some.package, ...] - # But.... sometimes the artifact is - # com.google.foo:some.package.jar.arch - # and then that means the artifact table has an entry - # in their shasums table keyed by arch. - - repo_to_url = {} - for url, repos in maven_install["repositories"].items(): - for repo in repos: - if repo in repo_to_url: - print( - "WARNING: Duplicate download path for <%s>. Using %s" - % (repo, repo_to_url[repo]) - ) - continue - repo_to_url[repo] = url - - ret = {} - for name, info in maven_install["artifacts"].items(): - # With something like this: - parts = name.split(":") - if len(parts) >= 2: - repo = parts[0] - artifact = ":".join(parts[1:]) # Join remaining parts - else: - # Handle case with no colon or just a single part - repo = name - artifact = "" - version = info["version"] - - for arch in info["shasums"].keys(): - # build the download URL - sub_version = version - repo_name = name - if arch != "jar": - sub_version = version + "-" + arch - repo_name = "%s:jar:%s" % (name, arch) - - url = ( - "{mirror}{repo}/{artifact}/{version}/{artifact}-{version}.jar".format( - mirror = repo_to_url.get(repo_name, "https://repo1.maven.org/maven2/"), - repo=repo.replace(".", "/"), - artifact=artifact, - version=version, - ) - ) - tmp = info.copy() - tmp["maven_name"] = name - tmp["url"] = url - bazel_name = maven_to_bazel(name) + "_" + maven_to_bazel(sub_version) - ret[bazel_name] = tmp - if arch == "jar": - ret[bazel_name] = tmp - return ret - - -def main() -> None: - parser = argparse.ArgumentParser( - description="Helper for creating SBOMs", fromfile_prefix_chars="@" - ) - parser.add_argument( - "--out", required=True, help="The output file, mandatory." - ) - parser.add_argument( - "--packages_used", - required=True, - help="JSON list of transitive package data for a target", - ) - parser.add_argument( - "--maven_install", - required=False, - default="", - help="Maven lock file", - ) - opts = parser.parse_args() - - with open(opts.packages_used, "rt", encoding="utf-8") as inp: - package_info = json.loads(inp.read()) - - maven_packages = None - if opts.maven_install: - with open(opts.maven_install, "rt", encoding="utf-8") as inp: - maven_install = json.loads(inp.read()) - maven_packages = maven_install_to_packages(maven_install) - # Useful for debugging - # print(json.dumps(maven_packages, indent=2)) - - sbom = create_sbom(package_info, maven_packages) - with open(opts.out, "w", encoding="utf-8") as out: - out.write(json.dumps(sbom, indent=2)) - - -if __name__ == "__main__": - main() \ No newline at end of file
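For reference, the removed tools/compliance/write_sbom.py keyed Maven artifacts by the names rules_jvm_external generates. A minimal sketch of that key derivation, with an assumed example coordinate (illustrative only, not read from maven_install.json):

    # Sketch of the mapping used by the removed write_sbom.py:
    # '.', '-' and ':' in the Maven coordinate and version become '_'.
    def maven_to_bazel(s: str) -> str:
        return s.replace(".", "_").replace("-", "_").replace(":", "_")

    name, version = "org.apache.thrift:libthrift", "0.21.0"  # assumed example coordinate
    print(maven_to_bazel(name) + "_" + maven_to_bazel(version))
    # -> org_apache_thrift_libthrift_0_21_0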