|
1 | 1 | #!/bin/bash
|
2 | 2 |
|
3 | 3 | # Template values replaced by container build
|
4 |
| -ENDPOINT_URL="__REPLACE_ENDPOINT_URL__" |
5 |
| -TRAIN_DEVICE="__REPLACE_TRAIN_DEVICE__" |
6 | 4 | CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__"
|
7 | 5 | IMAGE_NAME="__REPLACE_IMAGE_NAME__"
|
8 |
| -VLLM_NAME="__REPLACE_VLLM_NAME__" |
9 |
| -TRAIN_NAME="__REPLACE_TRAIN_NAME__" |
10 |
| -GPU_COUNT_COMMAND="__REPLACE_GPU_COUNT_COMMAND__" |
11 | 6 |
|
12 |
| -# ENDPOINT_URL="http://0.0.0.0:8080/v1" |
13 |
| -# TRAIN_DEVICE="cuda" |
14 |
| -# CONTAINER_DEVICE="nvidia.com/gpu=all" |
15 |
| -# IMAGE_NAME="quay.io/ai-lab/instructlab-nvidia:latest" |
16 |
| -# VLLM_NAME="quay.io/ai-lab/vllm:latest" |
17 |
| -# TRAIN_NAME="quay.io/ai-lab/deepspeed-trainer:latest" |
18 |
| -# GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\d+' | wc -l" |
| 7 | +export ENTRYPOINT="/opt/python3.11/venv/bin/ilab" |
| 8 | +export PARAMS=("$@") |
19 | 9 |
|
20 |
| -# HF caching uses relative symlink structures, so keep cache relative to |
21 |
| -# the central working directory |
22 |
| -CONTAINER_CACHE="/instructlab/cache" |
23 |
| -HOST_CACHE="$(pwd)/cache" |
24 |
| -WORKDIR="$(pwd)" |
25 |
| -SCRIPT_DIR=$(dirname "$0") |
26 |
| -DEFAULT_SERVE_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1" |
| 10 | +for dir in "$HOME/.cache" "$HOME/.config" "$HOME/.local"; do |
| 11 | + mkdir -p "$dir" |
| 12 | +done |
27 | 13 |
|
28 |
| -if [[ -z "${GPU_AMOUNT}" ]]; then |
29 |
| - GPU_AMOUNT=$(bash -c "${GPU_COUNT_COMMAND}") |
30 |
| - if [[ "$?" != "0" ]]; then |
31 |
| - echo "Could not determine GPU count, set export GPU_AMOUNT= manually" |
32 |
| - exit |
33 |
| - fi |
| 14 | +if [[ "$1" = "shell" ]]; then |
| 15 | + export ENTRYPOINT=bash |
| 16 | + export PARAMS=() |
34 | 17 | fi
|
35 | 18 |
|
36 |
| -if [[ "$GPU_AMOUNT" -lt 2 ]]; then |
37 |
| - echo "WARNING: You need at least 2 GPUs to load full precision models" |
38 |
| -fi |
39 |
| - |
40 |
| -NPROC_PER_NODE=${GPU_AMOUNT} |
41 |
| -EFFECTIVE_BATCH_SIZE=$((12*${GPU_AMOUNT})) |
42 |
| -NUM_INSTRUCTIONS=5000 |
43 |
| -NUM_EPOCHS=10 |
44 |
| - |
45 |
| -has_argument() { |
46 |
| - match=$1 |
47 |
| - shift |
48 |
| - for arg in "$@"; do |
49 |
| - if [[ "$arg" == *"$match"* ]]; then |
50 |
| - return 0 |
51 |
| - fi |
52 |
| - done |
53 |
| - return 1 |
54 |
| -} |
55 |
| - |
56 |
| -get_argument() { |
57 |
| - local match=$1 |
58 |
| - shift |
59 |
| - |
60 |
| - local found=false |
61 |
| - local arg |
62 |
| - while [ "$#" -gt 0 ]; do |
63 |
| - arg="$1" |
64 |
| - shift |
65 |
| - if [[ "$arg" == "$match" ]]; then |
66 |
| - found=true |
67 |
| - if [ "$#" -gt 0 ]; then |
68 |
| - echo "$1" |
69 |
| - return 0 |
70 |
| - else |
71 |
| - echo "" |
72 |
| - return 0 |
73 |
| - fi |
74 |
| - fi |
75 |
| - done |
76 |
| - |
77 |
| - if ! $found; then |
78 |
| - echo "" |
79 |
| - return 0 |
80 |
| - fi |
81 |
| -} |
82 |
| - |
83 |
| -get_argument_default() { |
84 |
| - local match=$1 |
85 |
| - local default=$2 |
86 |
| - shift |
87 |
| - shift |
88 |
| - local result=$(get_argument ${match} "$@") |
89 |
| - if [[ -z "${result}" ]]; then |
90 |
| - echo $default |
91 |
| - return 0 |
92 |
| - fi |
93 |
| - echo "${result}" |
94 |
| -} |
95 |
| - |
96 |
| -get_model() { |
97 |
| - model=$(get_argument_default "--model" "${DEFAULT_SERVE_MODEL}" "$@") |
98 |
| - if [[ ! "${model}" =~ ^/instructlab/models.* ]]; then |
99 |
| - echo /instructlab/models/"${model}" |
100 |
| - else |
101 |
| - echo "${model}" |
102 |
| - fi |
103 |
| -} |
104 |
| - |
105 |
| -mkdir -p "${HOST_CACHE}" |
106 |
| -PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \ |
107 |
| - "--security-opt" "label=disable" "--net" "host" \ |
108 |
| - "-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \ |
109 |
| - "-e" "HF_HOME=${CONTAINER_CACHE}" \ |
110 |
| - "-e" "HF_TOKEN=${HF_TOKEN}" \ |
111 |
| - "${IMAGE_NAME}") |
112 |
| -PODMAN_COMMAND_SERVE=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \ |
113 |
| - "--security-opt" "label=disable" "--net" "host" \ |
114 |
| - "-v" "${WORKDIR}:/instructlab" \ |
115 |
| - "--shm-size=10gb" \ |
116 |
| - "-e" "HF_HOME=${CONTAINER_CACHE}/" \ |
117 |
| - "-e" "HF_TOKEN=${HF_TOKEN}" \ |
118 |
| - "${VLLM_NAME}" "--host=0.0.0.0" "--port=8080" "--tensor-parallel-size=${GPU_AMOUNT}") |
119 |
| - |
120 |
| -if [[ "$1" = "init" ]]; then |
121 |
| - if ! has_argument "--repository" "$@"; then |
122 |
| - shift |
123 |
| - "${PODMAN_COMMAND[@]}" ilab init \ |
124 |
| - --repository https://github.com/instructlab/taxonomy.git "$@" |
125 |
| - exit $? |
126 |
| - fi |
127 |
| -elif [[ "$1" = "train" ]]; then |
128 |
| - samples=$(get_argument_default "--num-samples" ${NUM_INSTRUCTIONS} "$@") |
129 |
| - epochs=$(get_argument_default "--num-epochs" ${NUM_EPOCHS} "$@") |
130 |
| - ${SCRIPT_DIR}/ilab-training-launcher ${NPROC_PER_NODE} ${EFFECTIVE_BATCH_SIZE} \ |
131 |
| - ${TRAIN_DEVICE} ${samples} ${epochs} ${CONTAINER_DEVICE} ${TRAIN_NAME} |
132 |
| - exit $? |
133 |
| -elif [[ "$1" = "serve" ]]; then |
134 |
| - # run vllm container which will serve vllm and ilab generate |
135 |
| - args=() |
136 |
| - model=$(get_model "$@") |
137 |
| - if [[ "${model}" == *"${DEFAULT_SERVE_MODEL}" ]]; then |
138 |
| - args+=("--chat-template=mixtral.jinja") |
139 |
| - fi |
140 |
| - args+=("--model" "${model}") |
141 |
| - "${PODMAN_COMMAND_SERVE[@]}" "${args[@]}" |
142 |
| - exit $? |
143 |
| -elif [[ "$1" = "chat" ]]; then |
144 |
| - shift |
145 |
| - args=($@) |
146 |
| - if ! has_argument "--endpoint-url" "$@"; then |
147 |
| - args+=("--endpoint-url" "http://0.0.0.0:8080/v1") |
148 |
| - fi |
149 |
| - if ! has_argument "--model-family" "$@"; then |
150 |
| - args+=("--model-family" "mixtral") |
151 |
| - fi |
152 |
| - args+=("--model" $(get_model "$@")) |
153 |
| - "${PODMAN_COMMAND[@]}" ilab chat "${args[@]}" |
154 |
| - exit $? |
155 |
| -elif [[ "$1" = "generate" ]]; then |
156 |
| - shift |
157 |
| - args=($@) |
158 |
| - if ! has_argument "--endpoint-url" "$@"; then |
159 |
| - args+=("--endpoint-url" "http://0.0.0.0:8080/v1") |
160 |
| - fi |
161 |
| - if ! has_argument "--model-family" "$@"; then |
162 |
| - args+=("--model-family" "mixtral") |
163 |
| - fi |
164 |
| - if ! has_argument "--num-instructions" "$@"; then |
165 |
| - args+=("--num-instructions" "5000") |
166 |
| - fi |
167 |
| - args+=("--model" $(get_model "$@")) |
168 |
| - echo ilab generate "${args[@]}" |
169 |
| - |
170 |
| - "${PODMAN_COMMAND[@]}" ilab generate "${args[@]}" |
171 |
| - exit $? |
172 |
| -elif [[ "$1" == "download" && $# -lt 2 ]]; then |
173 |
| - echo "You must specify the model to download." |
174 |
| - echo |
175 |
| - echo "High-fidelity generation and training requires two models:" |
176 |
| - echo |
177 |
| - echo "Mixtral: ilab download --repository ${DEFAULT_SERVE_MODEL}" |
178 |
| - echo "Granite: ilab download --repository ibm/granite-7b-base" |
179 |
| - echo |
180 |
| - echo "For more options type ilab --help" |
181 |
| - exit 1 |
182 |
| -fi |
183 |
| - |
184 |
| -"${PODMAN_COMMAND[@]}" ilab "$@" |
185 |
| - |
| 19 | +PODMAN_COMMAND=("podman" "run" "--rm" "-it" |
| 20 | + "--device" "${CONTAINER_DEVICE}" |
| 21 | + "--security-opt" "label=disable" "--net" "host" |
| 22 | + "-v" "$HOME/.cache:/root/.cache" |
| 23 | + "-v" "$HOME/.config:/root/.config" |
| 24 | + "-v" "$HOME/.local:/root/.local" |
| 25 | + "--entrypoint" "$ENTRYPOINT" |
| 26 | + "--env" "HF_TOKEN" |
| 27 | + "${IMAGE_NAME}") |
| 28 | + |
| 29 | +"${PODMAN_COMMAND[@]}" "${PARAMS[@]}" |
0 commit comments