Commit 7443a07
WIP ilab wrapper script adjustments
WORK IN PROGRESS

Ticket: [RHELAI-442](https://issues.redhat.com/browse/RHELAI-442)

# Background

RHEL AI ships with a script in `/usr/local/bin` called `ilab` which makes running `ilab` commands feel native even though they are actually running in a podman container.

# Issues

* The script was outdated: it used several different container images for different purposes, while it should use just the single instructlab image
* The volume mounts were incorrect, as instructlab now uses XDG paths
* Unnecessary directory creation for `HF_CACHE`
* Unnecessary GPU count logic
* Unnecessary fiddling with `ilab` parameters, creating a UX that deviates from the natural `ilab` CLI

# Solutions

* Changed the script to use the single container image `IMAGE_NAME` (this was already mostly the case, except for old references to `VLLM_NAME` and `TRAIN_NAME` which no longer get replaced, leading to a broken `PODMAN_COMMAND_SERVE`)
* The script now mounts the host's `~/.config` and `~/.local` into the container's corresponding directories, for `instructlab` to use and for its config / data to persist across invocations (see the sketch after this message)
* The script now also mounts `~/.cache` into the container's corresponding `.cache` directory, so that the contents of the default `HF_CACHE` persist across invocations as well
* Removed the unnecessary GPU count logic
* Removed all parameter parsing / fiddling

# Other changes

Added a secret/fake `shell` `ilab` subcommand which opens a shell in the wrapper's container, useful for troubleshooting issues with the wrapper itself.
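For illustration, the reworked wrapper boils down to a single `podman run` invocation. A minimal sketch of what `ilab --help` effectively runs, assuming the NVIDIA substitutions applied in `training/nvidia-bootc/Containerfile` (the device and image values below are those defaults, and `--help` is just an arbitrary forwarded argument):

```bash
# Sketch only: device and image are the values substituted at build time for
# the NVIDIA bootc image; the host's XDG directories are mounted so config,
# data, and the HF cache persist across invocations.
podman run --rm -it \
    --device nvidia.com/gpu=all \
    --security-opt label=disable --net host \
    -v "$HOME/.cache:/root/.cache" \
    -v "$HOME/.config:/root/.config" \
    -v "$HOME/.local:/root/.local" \
    --entrypoint /opt/python3.11/venv/bin/ilab \
    -e "HF_TOKEN=$HF_TOKEN" \
    quay.io/ai-lab/instructlab-nvidia:latest \
    --help
```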
1 parent 28ee9a8 commit 7443a07

File tree

2 files changed (+14, -171 lines)

training/ilab-wrapper/ilab

+14-169
```diff
@@ -5,181 +5,26 @@ ENDPOINT_URL="__REPLACE_ENDPOINT_URL__"
 TRAIN_DEVICE="__REPLACE_TRAIN_DEVICE__"
 CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__"
 IMAGE_NAME="__REPLACE_IMAGE_NAME__"
-VLLM_NAME="__REPLACE_VLLM_NAME__"
-TRAIN_NAME="__REPLACE_TRAIN_NAME__"
-GPU_COUNT_COMMAND="__REPLACE_GPU_COUNT_COMMAND__"
 
-# ENDPOINT_URL="http://0.0.0.0:8080/v1"
-# TRAIN_DEVICE="cuda"
-# CONTAINER_DEVICE="nvidia.com/gpu=all"
-# IMAGE_NAME="quay.io/ai-lab/instructlab-nvidia:latest"
-# VLLM_NAME="quay.io/ai-lab/vllm:latest"
-# TRAIN_NAME="quay.io/ai-lab/deepspeed-trainer:latest"
-# GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\d+' | wc -l"
+export ENTRYPOINT="/opt/python3.11/venv/bin/ilab"
+export PARAMS="$@"
 
-# HF caching uses relative symlink structures, so keep cache relative to
-# the central working directory
-CONTAINER_CACHE="/instructlab/cache"
-HOST_CACHE="$(pwd)/cache"
-WORKDIR="$(pwd)"
-SCRIPT_DIR=$(dirname "$0")
-DEFAULT_SERVE_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
-
-if [[ -z "${GPU_AMOUNT}" ]]; then
-    GPU_AMOUNT=$(bash -c "${GPU_COUNT_COMMAND}")
-    if [[ "$?" != "0" ]]; then
-        echo "Could not determine GPU count, set export GPU_AMOUNT= manually"
-        exit
-    fi
-fi
-
-if [[ "$GPU_AMOUNT" -lt 2 ]]; then
-    echo "WARNING: You need at least 2 GPUs to load full precision models"
+if [[ "$1" = "shell" ]]; then
+    export ENTRYPOINT=bash
+    export PARAMS=""
 fi
 
-NPROC_PER_NODE=${GPU_AMOUNT}
-EFFECTIVE_BATCH_SIZE=$((12*${GPU_AMOUNT}))
-NUM_INSTRUCTIONS=5000
-NUM_EPOCHS=10
-
-has_argument() {
-    match=$1
-    shift
-    for arg in "$@"; do
-        if [[ "$arg" == *"$match"* ]]; then
-            return 0
-        fi
-    done
-    return 1
-}
-
-get_argument() {
-    local match=$1
-    shift
-
-    local found=false
-    local arg
-    while [ "$#" -gt 0 ]; do
-        arg="$1"
-        shift
-        if [[ "$arg" == "$match" ]]; then
-            found=true
-            if [ "$#" -gt 0 ]; then
-                echo "$1"
-                return 0
-            else
-                echo ""
-                return 0
-            fi
-        fi
-    done
-
-    if ! $found; then
-        echo ""
-        return 0
-    fi
-}
-
-get_argument_default() {
-    local match=$1
-    local default=$2
-    shift
-    shift
-    local result=$(get_argument ${match} "$@")
-    if [[ -z "${result}" ]]; then
-        echo $default
-        return 0
-    fi
-    echo "${result}"
-}
-
-get_model() {
-    model=$(get_argument_default "--model" "${DEFAULT_SERVE_MODEL}" "$@")
-    if [[ ! "${model}" =~ ^/instructlab/models.* ]]; then
-        echo /instructlab/models/"${model}"
-    else
-        echo "${model}"
-    fi
-}
-
-mkdir -p "${HOST_CACHE}"
-PODMAN_COMMAND=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \
+PODMAN_COMMAND=("podman" "run" "--rm" "-it" \
+    "--device" "${CONTAINER_DEVICE}" \
     "--security-opt" "label=disable" "--net" "host" \
-    "-v" "${WORKDIR}:/instructlab" "--entrypoint" "" \
-    "-e" "HF_HOME=${CONTAINER_CACHE}" \
-    "-e" "HF_TOKEN=${HF_TOKEN}" \
+    "-v" "$HOME/.cache:/root/.cache" \
+    "-v" "$HOME/.config:/root/.config" \
+    "-v" "$HOME/.local:/root/.local" \
+    "--entrypoint" "$ENTRYPOINT" \
+    "-e" "HF_HOME=$CONTAINER_CACHE" \
+    "-e" "HF_TOKEN=$HF_TOKEN" \
     "${IMAGE_NAME}")
-PODMAN_COMMAND_SERVE=("podman" "run" "--rm" "-it" "--device" "${CONTAINER_DEVICE}" \
-    "--security-opt" "label=disable" "--net" "host" \
-    "-v" "${WORKDIR}:/instructlab" \
-    "--shm-size=10gb" \
-    "-e" "HF_HOME=${CONTAINER_CACHE}/" \
-    "-e" "HF_TOKEN=${HF_TOKEN}" \
-    "${VLLM_NAME}" "--host=0.0.0.0" "--port=8080" "--tensor-parallel-size=${GPU_AMOUNT}")
-
-if [[ "$1" = "init" ]]; then
-    if ! has_argument "--repository" "$@"; then
-        shift
-        "${PODMAN_COMMAND[@]}" ilab init \
-            --repository https://github.com/instructlab/taxonomy.git "$@"
-        exit $?
-    fi
-elif [[ "$1" = "train" ]]; then
-    samples=$(get_argument_default "--num-samples" ${NUM_INSTRUCTIONS} "$@")
-    epochs=$(get_argument_default "--num-epochs" ${NUM_EPOCHS} "$@")
-    ${SCRIPT_DIR}/ilab-training-launcher ${NPROC_PER_NODE} ${EFFECTIVE_BATCH_SIZE} \
-        ${TRAIN_DEVICE} ${samples} ${epochs} ${CONTAINER_DEVICE} ${TRAIN_NAME}
-    exit $?
-elif [[ "$1" = "serve" ]]; then
-    # run vllm container which will serve vllm and ilab generate
-    args=()
-    model=$(get_model "$@")
-    if [[ "${model}" == *"${DEFAULT_SERVE_MODEL}" ]]; then
-        args+=("--chat-template=mixtral.jinja")
-    fi
-    args+=("--model" "${model}")
-    "${PODMAN_COMMAND_SERVE[@]}" "${args[@]}"
-    exit $?
-elif [[ "$1" = "chat" ]]; then
-    shift
-    args=($@)
-    if ! has_argument "--endpoint-url" "$@"; then
-        args+=("--endpoint-url" "http://0.0.0.0:8080/v1")
-    fi
-    if ! has_argument "--model-family" "$@"; then
-        args+=("--model-family" "mixtral")
-    fi
-    args+=("--model" $(get_model "$@"))
-    "${PODMAN_COMMAND[@]}" ilab chat "${args[@]}"
-    exit $?
-elif [[ "$1" = "generate" ]]; then
-    shift
-    args=($@)
-    if ! has_argument "--endpoint-url" "$@"; then
-        args+=("--endpoint-url" "http://0.0.0.0:8080/v1")
-    fi
-    if ! has_argument "--model-family" "$@"; then
-        args+=("--model-family" "mixtral")
-    fi
-    if ! has_argument "--num-instructions" "$@"; then
-        args+=("--num-instructions" "5000")
-    fi
-    args+=("--model" $(get_model "$@"))
-    echo ilab generate "${args[@]}"
 
-    "${PODMAN_COMMAND[@]}" ilab generate "${args[@]}"
-    exit $?
-elif [[ "$1" == "download" && $# -lt 2 ]]; then
-    echo "You must specify the model to download."
-    echo
-    echo "High-fidelity generation and training requires two models:"
-    echo
-    echo "Mixtral: ilab download --repository ${DEFAULT_SERVE_MODEL}"
-    echo "Granite: ilab download --repository ibm/granite-7b-base"
-    echo
-    echo "For more options type ilab --help"
-    exit 1
-fi
 
-"${PODMAN_COMMAND[@]}" ilab "$@"
+"${PODMAN_COMMAND[@]}" "$@"
```
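With these changes the wrapper is a thin passthrough. A quick usage sketch, assuming the script is installed as `/usr/local/bin/ilab` (the commit is WIP, so the `shell` path may still change):

```bash
ilab --help   # argv is forwarded verbatim to the containerized ilab CLI
ilab shell    # secret subcommand: swaps the entrypoint for bash to debug the wrapper
```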

training/nvidia-bootc/Containerfile

-2
```diff
@@ -188,14 +188,12 @@ RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
     && chmod +x /usr/bin/ilab
 
 ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"
-ARG GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\\\\d+' | wc -l"
 
 RUN for i in /usr/bin/ilab*; do \
     sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \
     sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' $i; \
     sed -i "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" $i; \
     sed -i 's%__REPLACE_ENDPOINT_URL__%http://0.0.0.0:8080/v1%' $i; \
-    sed -i "s%__REPLACE_GPU_COUNT_COMMAND__%${GPU_COUNT_COMMAND}%" $i; \
     sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' $i; \
     done
```
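Since both sides of the `GPU_COUNT_COMMAND` substitution are gone (the `ARG` here and the placeholder in the wrapper), a build-time sanity check can confirm that no placeholder is left behind. A hypothetical sketch, not part of this commit:

```bash
# Hypothetical post-substitution check: fail the image build if any
# __REPLACE_* placeholder survived the sed loop above.
if grep -l '__REPLACE_' /usr/bin/ilab*; then
    echo "ERROR: unsubstituted placeholders remain" >&2
    exit 1
fi
```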
