Skip to content

Commit b9abb0a

Browse files
authored
ci: Faster builds (#13142) (#13144)
* ci: Faster builds
* f
* test
* fix
* f
* f

---------

Signed-off-by: oliver könig <[email protected]>
1 parent d65ca6f commit b9abb0a

File tree

3 files changed

+91
-43
lines changed

3 files changed

+91
-43
lines changed

.github/workflows/_test_template.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,37 @@ jobs:
6262
run: |
6363
docker pull nemoci.azurecr.io/nemo_container:${{ github.run_id }}
6464
65+
- name: Clean repos
66+
run: |
67+
68+
- name: Install jq
69+
run: |
70+
curl -s https://webinstall.dev/jq | bash
71+
72+
- name: Create UUID
73+
id: uuid
74+
run: |
75+
echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
76+
77+
- name: Checkout NeMo
78+
uses: actions/checkout@v2
79+
with:
80+
repository: NVIDIA/NeMo
81+
path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
82+
83+
- name: Fetch Mcore tag from manifest.json
84+
id: mcore
85+
run: |
86+
REF=$(cat ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo/requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
87+
echo "ref=$REF" | tee -a "$GITHUB_OUTPUT"
88+
89+
- name: Checkout Megatron-LM
90+
uses: actions/checkout@v2
91+
with:
92+
repository: NVIDIA/Megatron-LM
93+
path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/Megatron-LM
94+
ref: ${{ steps.mcore.outputs.ref }}
95+
6596
- name: Start container
6697
run: |
6798
mkdir -p $DIR
@@ -92,6 +123,8 @@ jobs:
92123
--env TRANSFORMERS_OFFLINE=0 \
93124
--env HYDRA_FULL_ERROR=1 \
94125
--env HF_HOME=/home/TestData/HF_HOME \
126+
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
127+
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/Megatron-LM:/workspace/Megatron-LM \
95128
--volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
96129
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
97130
RUN_TEST_EOF
@@ -194,4 +227,5 @@ jobs:
194227
- name: Container shutdown
195228
if: always()
196229
run: |
230+
rm -rf $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }} || true
197231
docker container rm -f nemo_container_${{ github.run_id }}_${{ runner.name }} || true

.github/workflows/cicd-main.yml

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ jobs:
4242
outputs:
4343
test_to_run: ${{ steps.test_to_run.outputs.main }}
4444
build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
45+
cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
4546
env:
4647
TESTS_TO_RUN: ${{ inputs.test_to_run }}
4748
EVENT_NAME: ${{ github.event_name }}
@@ -102,14 +103,36 @@ jobs:
102103
echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
103104
echo "EOF" >> $GITHUB_OUTPUT
104105
106+
- name: Get last merged PR
107+
id: cache_from
108+
env:
109+
GH_TOKEN: ${{ github.token }}
110+
run: |
111+
LAST_PRS=$(gh api graphql -f query='
112+
query {
113+
repository(owner: "NVIDIA", name: "NeMo") {
114+
pullRequests(states: MERGED, first: 10, orderBy: {field: UPDATED_AT, direction: DESC}) {
115+
nodes {
116+
number
117+
}
118+
}
119+
}
120+
}' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
121+
echo "nemoci.azurecr.io/nemo_container-buildcache:$number"
122+
done)
123+
124+
echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
125+
echo "$LAST_PRS" >> $GITHUB_OUTPUT
126+
echo "EOF" >> $GITHUB_OUTPUT
127+
105128
code-linting:
106129
if: ${{ needs.pre-flight.outputs.test_to_run != '[]' }}
107130
needs: [pre-flight]
108131
uses: ./.github/workflows/code-linting.yml
109132

110133
cicd-test-container-build:
111134
if: ${{ needs.pre-flight.outputs.test_to_run != '[]' }}
112-
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.26.0
135+
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.27.0
113136
needs: [pre-flight, code-linting]
114137
with:
115138
image-name: nemo_container
@@ -122,15 +145,36 @@ jobs:
122145
${{ needs.pre-flight.outputs.BUILD_ARGS }}
123146
prune-filter-timerange: 24h
124147
use-inline-cache: false
148+
cache-from: |
149+
nemoci.azurecr.io/$IMAGE_NAME-buildcache:main
150+
${{ needs.pre-flight.outputs.cache-from }}
125151
126152
cicd-import-tests:
127153
if: ${{ needs.pre-flight.outputs.test_to_run != '[]' }}
128154
needs: [cicd-test-container-build, pre-flight]
129155
runs-on: self-hosted-azure-gpus-1
130156
steps:
157+
- name: Create UUID
158+
id: uuid
159+
run: |
160+
echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
161+
162+
- name: Checkout NeMo
163+
uses: actions/checkout@v2
164+
with:
165+
repository: NVIDIA/NeMo
166+
path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
167+
131168
- name: Run some checks
132169
run: |
133-
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
170+
docker run \
171+
--rm \
172+
--device=/dev/nvidia0 \
173+
--gpus all \
174+
--shm-size=8g \
175+
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
176+
--env TRANSFORMERS_OFFLINE=0 \
177+
--env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
134178
# PyTorch Lightning version
135179
python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"
136180

Dockerfile.ci

Lines changed: 11 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,6 @@
1616

1717
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
1818

19-
FROM ${BASE_IMAGE} as nemo-bump
20-
ARG NEMO_TAG
21-
WORKDIR /opt
22-
# NeMo
23-
RUN <<"EOF" bash -exu
24-
if [[ ! -d NeMo ]]; then
25-
git clone https://github.com/NVIDIA/NeMo.git
26-
fi
27-
cd NeMo/
28-
git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge'
29-
git fetch origin $NEMO_TAG
30-
git checkout -f $NEMO_TAG
31-
EOF
32-
33-
3419
FROM ${BASE_IMAGE} AS trt-base
3520
ARG IMAGE_LABEL
3621
LABEL "nemo.library"=${IMAGE_LABEL}
@@ -48,7 +33,7 @@ EOF
4833

4934
ARG TRTLLM_REPO
5035
ARG TRTLLM_TAG
51-
RUN --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
36+
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
5237

5338
bash /tmp/NeMo/reinstall.sh --library trt --mode install
5439
EOF
@@ -58,17 +43,17 @@ FROM trt-base AS trt-llm-wheel
5843

5944
ARG TRTLLM_REPO
6045
ARG TRTLLM_TAG
61-
RUN --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
46+
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
6247

6348
bash /tmp/NeMo/reinstall.sh --library trtllm --mode build
6449
EOF
6550

6651

67-
FROM trt-base AS test-image
52+
FROM trt-base as final
6853

6954
ARG MLM_REPO
7055
ARG MLM_TAG
71-
RUN --mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
56+
RUN --mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh <<"EOF" bash -ex
7257

7358
bash /tmp/NeMo/reinstall.sh --library mcore --mode build
7459
ls -al /tmp/Megatron-LM || true
@@ -77,33 +62,18 @@ EOF
7762
WORKDIR /workspace
7863
RUN \
7964
--mount=type=bind,from=trt-llm-wheel,source=/opt/wheels/trtllm,target=/opt/wheels/trtllm \
80-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/requirements,target=/tmp/NeMo/requirements \
81-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
82-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/tmp/NeMo/reinstall.sh \
83-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/setup.py,target=/tmp/NeMo/setup.py \
84-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/README.md,target=/tmp/NeMo/README.md \
85-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
86-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py <<"EOF" bash -ex
65+
--mount=type=bind,source=requirements,target=/tmp/NeMo/requirements \
66+
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
67+
--mount=type=bind,source=reinstall.sh,target=/tmp/NeMo/reinstall.sh \
68+
--mount=type=bind,source=setup.py,target=/tmp/NeMo/setup.py \
69+
--mount=type=bind,source=README.md,target=/tmp/NeMo/README.md \
70+
--mount=type=bind,source=nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
71+
--mount=type=bind,source=nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py <<"EOF" bash -ex
8772

8873
bash /tmp/NeMo/reinstall.sh --library all --mode install
8974
rm -rf $NEMO_DIR || true
9075
EOF
9176

92-
# Copy over NeMo code
93-
ARG NEMO_REPO
94-
ARG NEMO_TAG
95-
RUN \
96-
--mount=type=bind,from=nemo-bump,source=/opt/NeMo/reinstall.sh,target=/tmp/reinstall.sh <<"EOF" bash -ex
97-
98-
bash /tmp/reinstall.sh --library all --mode install
99-
100-
# Copy into workspace
101-
cp -a /opt/NeMo/. /workspace/
102-
cp -r /opt/Megatron-LM/ /workspace/
103-
104-
# set permission
105-
chmod 777 -R /workspace
106-
EOF
10777

10878
ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
10979
ENV NEMO_HOME="/home/TestData/nemo_home"

0 commit comments

Comments
 (0)