Commit 7443a07
WIP ilab wrapper script adjustments
WORK IN PROGRESS

Ticket: [RHELAI-442](https://issues.redhat.com/browse/RHELAI-442)

# Background

RHEL AI ships with a script in `/usr/local/bin` called `ilab` which makes running `ilab` commands feel native even though they are actually running in a podman container.

# Issues

* The script was outdated: it used several different container images for different purposes, while it should use just the single instructlab image
* The volume mounts were incorrect, as instructlab now uses XDG paths
* Unnecessary directory creation for `HF_CACHE`
* Unnecessary GPU count logic
* Unnecessary fiddling with `ilab` parameters, creating a UX that deviates from the natural `ilab` CLI

# Solutions

* Changed the script to use the single container image `IMAGE_NAME` (this was already mostly the case, except for old references to `VLLM_NAME` and `TRAIN_NAME` which no longer get replaced, leading to a broken `PODMAN_COMMAND_SERVE`)
* The script now mounts the host's `~/.config` and `~/.local` into the container's corresponding directories, for `instructlab` to use and for its config / data to persist across invocations (see the sketch after this message)
* The script now also mounts `~/.cache` into the container's corresponding `.cache` directory, so that the contents of the default `HF_CACHE` persist across invocations as well
* Removed the unnecessary GPU count logic
* Removed all parameter parsing / fiddling

# Other changes

Added a secret/fake `shell` `ilab` subcommand which opens a shell in the wrapper's container, useful for troubleshooting issues with the wrapper itself.
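For illustration, the reworked wrapper boils down to a single `podman run` invocation. A minimal sketch of what `ilab --help` effectively runs, assuming the NVIDIA substitutions applied in `training/nvidia-bootc/Containerfile` (the device and image values below are those defaults, and `--help` is just an arbitrary forwarded argument):

```bash
# Sketch only: device and image are the values substituted at build time for
# the NVIDIA bootc image; the host's XDG directories are mounted so config,
# data, and the HF cache persist across invocations.
podman run --rm -it \
    --device nvidia.com/gpu=all \
    --security-opt label=disable --net host \
    -v "$HOME/.cache:/root/.cache" \
    -v "$HOME/.config:/root/.config" \
    -v "$HOME/.local:/root/.local" \
    --entrypoint /opt/python3.11/venv/bin/ilab \
    -e "HF_TOKEN=$HF_TOKEN" \
    quay.io/ai-lab/instructlab-nvidia:latest \
    --help
```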
1 parent 28ee9a8 commit 7443a07

File tree

2 files changed (+14, -171 lines)

training/ilab-wrapper/ilab

+14-169
```diff
@@ -5,181 +5,26 @@ ENDPOINT_URL="__REPLACE_ENDPOINT_URL__"
 TRAIN_DEVICE="__REPLACE_TRAIN_DEVICE__"
 CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__"
 IMAGE_NAME="__REPLACE_IMAGE_NAME__"
-VLLM_NAME="__REPLACE_VLLM_NAME__"
-TRAIN_NAME="__REPLACE_TRAIN_NAME__"
-GPU_COUNT_COMMAND="__REPLACE_GPU_COUNT_COMMAND__"
 
-# ENDPOINT_URL="http://0.0.0.0:8080/v1"
-# TRAIN_DEVICE="cuda"
-# CONTAINER_DEVICE="nvidia.com/gpu=all"
-# IMAGE_NAME="quay.io/ai-lab/instructlab-nvidia:latest"
-# VLLM_NAME="quay.io/ai-lab/vllm:latest"
-# TRAIN_NAME="quay.io/ai-lab/deepspeed-trainer:latest"
-# GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\d+' | wc -l"
+export ENTRYPOINT="/opt/python3.11/venv/bin/ilab"
+export PARAMS="$@"
 
-# HF caching uses relative symlink structures, so keep cache relative to
-# the central working directory
-CONTAINER_CACHE="/instructlab/cache"
-HOST_CACHE="$(pwd)/cache"
-WORKDIR="$(pwd)"
-SCRIPT_DIR=$(dirname "$0")
-DEFAULT_SERVE_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
-
-if [[ -z "${GPU_AMOUNT}" ]]; then
-    GPU_AMOUNT=$(bash -c "${GPU_COUNT_COMMAND}")
-    if [[ "$?" != "0" ]]; then
-        echo "Could not determine GPU count, set export GPU_AMOUNT= manually"
-        exit
-    fi
-fi
-
-if [[ "$GPU_AMOUNT" -lt 2 ]]; then
-    echo "WARNING: You need at least 2 GPUs to load full precision models"
+if [[ "$1" = "shell" ]]; then
+    export ENTRYPOINT=bash
+    export PARAMS=""
 fi
 
-NPROC_PER_NODE=${GPU_AMOUNT}
-EFFECTIVE_BATCH_SIZE=$((12*${GPU_AMOUNT}))
-NUM_INSTRUCTIONS=5000
-NUM_EPOCHS=10
-
-has_argument() {
-    match=$1
-    shift
-    for arg in "$@"; do
-        if [[ "$arg" == *"$match"* ]]; then
-            return 0
-        fi
-    done
-    return 1
-}
-
-get_argument() {
-    local match=$1
-    shift
-
-    local found=false
-    local arg
-    while [ "$#" -gt 0 ]; do
-        arg="$1"
-        shift
-        if [[ "$arg" == "$match" ]]; then
-            found=true
-            if [ "$#" -gt 0 ]; then
-                echo "$1"
-                return 0
-            else
-                echo ""
-                return 0
-            fi
-        fi
-    done
-
-    if ! $found; then
-        echo ""
-        return 0
-    fi
-}
-
-get_argument_default() {
-    local match=$1
-    local default=$2
-    shift
-    shift
-    local result=$(get_argument ${match} "$@")
-    if [[ -z "${result}" ]]; then
-        echo $default
-        return 0
-    fi
-    echo "${result}"
-}
-
-get_model() {
-    model=$(get_argument_default "--model" "${DEFAULT_SERVE_MODEL}" "$@")
-    if [[ ! "${model}" =~ ^/instructlab/models.* ]]; then
-        echo /instructlab/models/"${model}"
-    else
-        echo "${model}"
-    fi
-}
-
-mkdir -p "${HOST_CACHE}"
-PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \
+PODMAN_COMMAND=("podman" "run" "--rm" "-it" \
+    "--device" "${CONTAINER_DEVICE}" \
     "--security-opt" "label=disable" "--net" "host" \
-    "-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \
-    "-e" "HF_HOME=${CONTAINER_CACHE}" \
-    "-e" "HF_TOKEN=${HF_TOKEN}" \
+    "-v" "$HOME/.cache:/root/.cache" \
+    "-v" "$HOME/.config:/root/.config" \
+    "-v" "$HOME/.local:/root/.local" \
+    "--entrypoint" "$ENTRYPOINT" \
+    "-e" "HF_HOME=$CONTAINER_CACHE" \
+    "-e" "HF_TOKEN=$HF_TOKEN" \
     "${IMAGE_NAME}")
-PODMAN_COMMAND_SERVE=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \
-    "--security-opt" "label=disable" "--net" "host" \
-    "-v" "${WORKDIR}:/instructlab" \
-    "--shm-size=10gb" \
-    "-e" "HF_HOME=${CONTAINER_CACHE}/" \
-    "-e" "HF_TOKEN=${HF_TOKEN}" \
-    "${VLLM_NAME}" "--host=0.0.0.0" "--port=8080" "--tensor-parallel-size=${GPU_AMOUNT}")
-
-if [[ "$1" = "init" ]]; then
-    if ! has_argument "--repository" "$@"; then
-        shift
-        "${PODMAN_COMMAND[@]}" ilab init \
-            --repository https://github.com/instructlab/taxonomy.git "$@"
-        exit $?
-    fi
-elif [[ "$1" = "train" ]]; then
-    samples=$(get_argument_default "--num-samples" ${NUM_INSTRUCTIONS} "$@")
-    epochs=$(get_argument_default "--num-epochs" ${NUM_EPOCHS} "$@")
-    ${SCRIPT_DIR}/ilab-training-launcher ${NPROC_PER_NODE} ${EFFECTIVE_BATCH_SIZE} \
-        ${TRAIN_DEVICE} ${samples} ${epochs} ${CONTAINER_DEVICE} ${TRAIN_NAME}
-    exit $?
-elif [[ "$1" = "serve" ]]; then
-    # run vllm container which will serve vllm and ilab generate
-    args=()
-    model=$(get_model "$@")
-    if [[ "${model}" == *"${DEFAULT_SERVE_MODEL}" ]]; then
-        args+=("--chat-template=mixtral.jinja")
-    fi
-    args+=("--model" "${model}")
-    "${PODMAN_COMMAND_SERVE[@]}" "${args[@]}"
-    exit $?
-elif [[ "$1" = "chat" ]]; then
-    shift
-    args=($@)
-    if ! has_argument "--endpoint-url" "$@"; then
-        args+=("--endpoint-url" "http://0.0.0.0:8080/v1")
-    fi
-    if ! has_argument "--model-family" "$@"; then
-        args+=("--model-family" "mixtral")
-    fi
-    args+=("--model" $(get_model "$@"))
-    "${PODMAN_COMMAND[@]}" ilab chat "${args[@]}"
-    exit $?
-elif [[ "$1" = "generate" ]]; then
-    shift
-    args=($@)
-    if ! has_argument "--endpoint-url" "$@"; then
-        args+=("--endpoint-url" "http://0.0.0.0:8080/v1")
-    fi
-    if ! has_argument "--model-family" "$@"; then
-        args+=("--model-family" "mixtral")
-    fi
-    if ! has_argument "--num-instructions" "$@"; then
-        args+=("--num-instructions" "5000")
-    fi
-    args+=("--model" $(get_model "$@"))
-    echo ilab generate "${args[@]}"
 
-    "${PODMAN_COMMAND[@]}" ilab generate "${args[@]}"
-    exit $?
-elif [[ "$1" == "download" && $# -lt 2 ]]; then
-    echo "You must specify the model to download."
-    echo
-    echo "High-fidelity generation and training requires two models:"
-    echo
-    echo "Mixtral: ilab download --repository ${DEFAULT_SERVE_MODEL}"
-    echo "Granite: ilab download --repository ibm/granite-7b-base"
-    echo
-    echo "For more options type ilab --help"
-    exit 1
-fi
 
-"${PODMAN_COMMAND[@]}" ilab "$@"
+"${PODMAN_COMMAND[@]}" "$@"
```
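With these changes the wrapper is a thin passthrough. A quick usage sketch, assuming the script is installed as `/usr/local/bin/ilab` (the commit is WIP, so the `shell` path may still change):

```bash
ilab --help   # argv is forwarded verbatim to the containerized ilab CLI
ilab shell    # secret subcommand: swaps the entrypoint for bash to debug the wrapper
```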

training/nvidia-bootc/Containerfile

-2
```diff
@@ -188,14 +188,12 @@ RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
     && chmod +x /usr/bin/ilab
 
 ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"
-ARG GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\\\\d+' | wc -l"
 
 RUN for i in /usr/bin/ilab*; do \
     sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \
     sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' $i; \
     sed -i "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" $i; \
     sed -i 's%__REPLACE_ENDPOINT_URL__%http://0.0.0.0:8080/v1%' $i; \
-    sed -i "s%__REPLACE_GPU_COUNT_COMMAND__%${GPU_COUNT_COMMAND}%" $i; \
     sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \
     done
```
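Since both sides of the `GPU_COUNT_COMMAND` substitution are gone (the `ARG` here and the placeholder in the wrapper), a build-time sanity check can confirm that no placeholder is left behind. A hypothetical sketch, not part of this commit:

```bash
# Hypothetical post-substitution check: fail the image build if any
# __REPLACE_* placeholder survived the sed loop above.
if grep -l '__REPLACE_' /usr/bin/ilab*; then
    echo "ERROR: unsubstituted placeholders remain" >&2
    exit 1
fi
```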
