Skip to content

Commit 48eabdd

Browse files
authored
Merge pull request #680 from omertuc/wrapper
`ilab` wrapper script adjustments
2 parents a6dd048 + 2627588 commit 48eabdd

File tree

2 files changed

+19
-179
lines changed

2 files changed

+19
-179
lines changed

training/ilab-wrapper/ilab

+19-175
Original file line numberDiff line numberDiff line change
@@ -1,185 +1,29 @@
11
#!/bin/bash
22

33
# Template values replaced by container build
4-
ENDPOINT_URL="__REPLACE_ENDPOINT_URL__"
5-
TRAIN_DEVICE="__REPLACE_TRAIN_DEVICE__"
64
CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__"
75
IMAGE_NAME="__REPLACE_IMAGE_NAME__"
8-
VLLM_NAME="__REPLACE_VLLM_NAME__"
9-
TRAIN_NAME="__REPLACE_TRAIN_NAME__"
10-
GPU_COUNT_COMMAND="__REPLACE_GPU_COUNT_COMMAND__"
116

12-
# ENDPOINT_URL="http://0.0.0.0:8080/v1"
13-
# TRAIN_DEVICE="cuda"
14-
# CONTAINER_DEVICE="nvidia.com/gpu=all"
15-
# IMAGE_NAME="quay.io/ai-lab/instructlab-nvidia:latest"
16-
# VLLM_NAME="quay.io/ai-lab/vllm:latest"
17-
# TRAIN_NAME="quay.io/ai-lab/deepspeed-trainer:latest"
18-
# GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\d+' | wc -l"
7+
export ENTRYPOINT="/opt/python3.11/venv/bin/ilab"
8+
export PARAMS=("$@")
199

20-
# HF caching uses relative symlink structures, so keep cache relative to
21-
# the central working directory
22-
CONTAINER_CACHE="/instructlab/cache"
23-
HOST_CACHE="$(pwd)/cache"
24-
WORKDIR="$(pwd)"
25-
SCRIPT_DIR=$(dirname "$0")
26-
DEFAULT_SERVE_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
10+
for dir in "$HOME/.cache" "$HOME/.config" "$HOME/.local"; do
11+
mkdir -p "$dir"
12+
done
2713

28-
if [[ -z "${GPU_AMOUNT}" ]]; then
29-
GPU_AMOUNT=$(bash -c "${GPU_COUNT_COMMAND}")
30-
if [[ "$?" != "0" ]]; then
31-
echo "Could not determine GPU count, set export GPU_AMOUNT= manually"
32-
exit
33-
fi
14+
if [[ "$1" = "shell" ]]; then
15+
export ENTRYPOINT=bash
16+
export PARAMS=()
3417
fi
3518

36-
if [[ "$GPU_AMOUNT" -lt 2 ]]; then
37-
echo "WARNING: You need at least 2 GPUs to load full precision models"
38-
fi
39-
40-
NPROC_PER_NODE=${GPU_AMOUNT}
41-
EFFECTIVE_BATCH_SIZE=$((12*${GPU_AMOUNT}))
42-
NUM_INSTRUCTIONS=5000
43-
NUM_EPOCHS=10
44-
45-
has_argument() {
46-
match=$1
47-
shift
48-
for arg in "$@"; do
49-
if [[ "$arg" == *"$match"* ]]; then
50-
return 0
51-
fi
52-
done
53-
return 1
54-
}
55-
56-
get_argument() {
57-
local match=$1
58-
shift
59-
60-
local found=false
61-
local arg
62-
while [ "$#" -gt 0 ]; do
63-
arg="$1"
64-
shift
65-
if [[ "$arg" == "$match" ]]; then
66-
found=true
67-
if [ "$#" -gt 0 ]; then
68-
echo "$1"
69-
return 0
70-
else
71-
echo ""
72-
return 0
73-
fi
74-
fi
75-
done
76-
77-
if ! $found; then
78-
echo ""
79-
return 0
80-
fi
81-
}
82-
83-
get_argument_default() {
84-
local match=$1
85-
local default=$2
86-
shift
87-
shift
88-
local result=$(get_argument ${match} "$@")
89-
if [[ -z "${result}" ]]; then
90-
echo $default
91-
return 0
92-
fi
93-
echo "${result}"
94-
}
95-
96-
get_model() {
97-
model=$(get_argument_default "--model" "${DEFAULT_SERVE_MODEL}" "$@")
98-
if [[ ! "${model}" =~ ^/instructlab/models.* ]]; then
99-
echo /instructlab/models/"${model}"
100-
else
101-
echo "${model}"
102-
fi
103-
}
104-
105-
mkdir -p "${HOST_CACHE}"
106-
PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \
107-
"--security-opt" "label=disable" "--net" "host" \
108-
"-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \
109-
"-e" "HF_HOME=${CONTAINER_CACHE}" \
110-
"-e" "HF_TOKEN=${HF_TOKEN}" \
111-
"${IMAGE_NAME}")
112-
PODMAN_COMMAND_SERVE=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \
113-
"--security-opt" "label=disable" "--net" "host" \
114-
"-v" "${WORKDIR}:/instructlab" \
115-
"--shm-size=10gb" \
116-
"-e" "HF_HOME=${CONTAINER_CACHE}/" \
117-
"-e" "HF_TOKEN=${HF_TOKEN}" \
118-
"${VLLM_NAME}" "--host=0.0.0.0" "--port=8080" "--tensor-parallel-size=${GPU_AMOUNT}")
119-
120-
if [[ "$1" = "init" ]]; then
121-
if ! has_argument "--repository" "$@"; then
122-
shift
123-
"${PODMAN_COMMAND[@]}" ilab init \
124-
--repository https://github.com/instructlab/taxonomy.git "$@"
125-
exit $?
126-
fi
127-
elif [[ "$1" = "train" ]]; then
128-
samples=$(get_argument_default "--num-samples" ${NUM_INSTRUCTIONS} "$@")
129-
epochs=$(get_argument_default "--num-epochs" ${NUM_EPOCHS} "$@")
130-
${SCRIPT_DIR}/ilab-training-launcher ${NPROC_PER_NODE} ${EFFECTIVE_BATCH_SIZE} \
131-
${TRAIN_DEVICE} ${samples} ${epochs} ${CONTAINER_DEVICE} ${TRAIN_NAME}
132-
exit $?
133-
elif [[ "$1" = "serve" ]]; then
134-
# run vllm container which will serve vllm and ilab generate
135-
args=()
136-
model=$(get_model "$@")
137-
if [[ "${model}" == *"${DEFAULT_SERVE_MODEL}" ]]; then
138-
args+=("--chat-template=mixtral.jinja")
139-
fi
140-
args+=("--model" "${model}")
141-
"${PODMAN_COMMAND_SERVE[@]}" "${args[@]}"
142-
exit $?
143-
elif [[ "$1" = "chat" ]]; then
144-
shift
145-
args=($@)
146-
if ! has_argument "--endpoint-url" "$@"; then
147-
args+=("--endpoint-url" "http://0.0.0.0:8080/v1")
148-
fi
149-
if ! has_argument "--model-family" "$@"; then
150-
args+=("--model-family" "mixtral")
151-
fi
152-
args+=("--model" $(get_model "$@"))
153-
"${PODMAN_COMMAND[@]}" ilab chat "${args[@]}"
154-
exit $?
155-
elif [[ "$1" = "generate" ]]; then
156-
shift
157-
args=($@)
158-
if ! has_argument "--endpoint-url" "$@"; then
159-
args+=("--endpoint-url" "http://0.0.0.0:8080/v1")
160-
fi
161-
if ! has_argument "--model-family" "$@"; then
162-
args+=("--model-family" "mixtral")
163-
fi
164-
if ! has_argument "--num-instructions" "$@"; then
165-
args+=("--num-instructions" "5000")
166-
fi
167-
args+=("--model" $(get_model "$@"))
168-
echo ilab generate "${args[@]}"
169-
170-
"${PODMAN_COMMAND[@]}" ilab generate "${args[@]}"
171-
exit $?
172-
elif [[ "$1" == "download" && $# -lt 2 ]]; then
173-
echo "You must specify the model to download."
174-
echo
175-
echo "High-fidelity generation and training requires two models:"
176-
echo
177-
echo "Mixtral: ilab download --repository ${DEFAULT_SERVE_MODEL}"
178-
echo "Granite: ilab download --repository ibm/granite-7b-base"
179-
echo
180-
echo "For more options type ilab --help"
181-
exit 1
182-
fi
183-
184-
"${PODMAN_COMMAND[@]}" ilab "$@"
185-
19+
PODMAN_COMMAND=("podman" "run" "--rm" "-it"
20+
"--device" "${CONTAINER_DEVICE}"
21+
"--security-opt" "label=disable" "--net" "host"
22+
"-v" "$HOME/.cache:/root/.cache"
23+
"-v" "$HOME/.config:/root/.config"
24+
"-v" "$HOME/.local:/root/.local"
25+
"--entrypoint" "$ENTRYPOINT"
26+
"--env" "HF_TOKEN"
27+
"${IMAGE_NAME}")
28+
29+
"${PODMAN_COMMAND[@]}" "${PARAMS[@]}"

training/nvidia-bootc/Containerfile

-4
Original file line numberDiff line numberDiff line change
@@ -188,15 +188,11 @@ RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
188188
&& chmod +x /usr/bin/ilab
189189

190190
ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"
191-
ARG GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\\\\d+' | wc -l"
192191

193192
RUN for i in /usr/bin/ilab*; do \
194193
sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \
195194
sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' $i; \
196195
sed -i "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" $i; \
197-
sed -i 's%__REPLACE_ENDPOINT_URL__%http://0.0.0.0:8080/v1%' $i; \
198-
sed -i "s%__REPLACE_GPU_COUNT_COMMAND__%${GPU_COUNT_COMMAND}%" $i; \
199-
sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \
200196
done
201197

202198
# Added for running as an OCI Container to prevent Overlay on Overlay issues.

0 commit comments

Comments (0)