(base) testuser@clftower:~/ollama/ipex/ipex-llm/docker/llm/serving/xpu/docker$ docker exec -it $CONTAINER_NAME bash
root@clftower:/llm# sycl-ls
[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Arc(TM) A770 Graphics 12.55.8 [1.6.32224.500000]
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [24.52.32224.5]
root@clftower:/llm# cd /ipex-llm/python/llm/scripts
bash env-check.sh
bash: cd: /ipex-llm/python/llm/scripts: No such file or directory
bash: env-check.sh: No such file or directory
root@clftower:/llm# cd /
root@clftower:/# ll
total 252
drwxr-xr-x   1 root root   4096 Jun 25 21:34 ./
drwxr-xr-x   1 root root   4096 Jun 25 21:34 ../
-rwxr-xr-x   1 root root      0 Jun 25 21:34 .dockerenv*
drwxr-xr-x   8 root root   4096 Jun 25 21:24 benchmark/
lrwxrwxrwx   1 root root      7 Sep 11  2024 bin -> usr/bin/
drwxr-xr-x   2 root root   4096 Apr 18  2022 boot/
drwxr-xr-x   6 root root    380 Jun 25 21:34 dev/
drwxr-xr-x   1 root root   4096 Jun 25 21:34 etc/
drwxr-xr-x  56 root root   4096 Jun 25 21:24 examples/
drwxr-xr-x   2 root root   4096 Apr 18  2022 home/
lrwxrwxrwx   1 root root      7 Sep 11  2024 lib -> usr/lib/
lrwxrwxrwx   1 root root      9 Sep 11  2024 lib32 -> usr/lib32/
lrwxrwxrwx   1 root root      9 Sep 11  2024 lib64 -> usr/lib64/
lrwxrwxrwx   1 root root     10 Sep 11  2024 libx32 -> usr/libx32/
drwxr-xr-x   1 root root   4096 Jun 25 21:34 llm/
drwxr-xr-x   2 root root   4096 Sep 11  2024 media/
drwxr-xr-x   2 root root   4096 Sep 11  2024 mnt/
drwxr-xr-x   1 root root   4096 Jun 25 21:21 opt/
dr-xr-xr-x 415 root root      0 Jun 25 21:34 proc/
drwx------   1 root root   4096 Jun 25 21:34 root/
drwxr-xr-x   1 root root   4096 Dec 12  2024 run/
lrwxrwxrwx   1 root root      8 Sep 11  2024 sbin -> usr/sbin/
drwxr-xr-x   2 root root   4096 Sep 11  2024 srv/
dr-xr-xr-x  13 root root      0 Jun 25 19:15 sys/
-rw-r--r--   1 root root 160864 Oct 31  2024 third-party-programs.txt
drwxrwxrwt   1 root root  12288 Jun 25 21:32 tmp/
drwxr-xr-x   1 root root   4096 Sep 11  2024 usr/
drwxr-xr-x   2 root root   4096 Jun 25 21:24 vLLM-Serving/
drwxr-xr-x   1 root root   4096 Sep 11  2024 var/
root@clftower:/# find . -name 'scripts' -type d
./opt/intel/oneapi/advisor/2025.0/documentation/en/help/scripts
./opt/intel/oneapi/2025.0/opt/debugger/lib/python3.12/venv/scripts
./opt/intel/oneapi/vtune/2025.0/bin64/os-perf/scripts
./opt/intel/oneapi/vtune/2025.0/bin64/resources/app/scripts
./opt/intel/oneapi/debugger/2025.0/opt/debugger/lib/python3.12/venv/scripts
./usr/share/doc/wrk/examples/scripts
./usr/lib/python3.10/venv/scripts
./usr/lib/python3.11/venv/scripts
./usr/local/lib/python3.11/dist-packages/scripts
./usr/local/lib/python3.11/dist-packages/numba/scripts
./usr/local/lib/python3.11/dist-packages/ray/scripts
./usr/local/lib/python3.11/dist-packages/accelerate/test_utils/scripts
./llm/vllm/.buildkite/scripts
./llm/vllm/.buildkite/nightly-benchmarks/scripts
./llm/vllm/.github/scripts
./llm/vllm/.github/workflows/scripts
root@clftower:/# find . -name 'env-check.sh'
root@clftower:/#

# Disable code related to XETLA; only Intel Data Center GPU Max Series supports XETLA, so non-Max machines should set this to OFF.
# Recommended for Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series.
export USE_XETLA=OFF

# Enable immediate command lists mode for the Level Zero plugin.
# Can improve performance on Intel Arc™ A-Series Graphics and Intel Data Center GPU Max Series, but the benefit depends on the Linux kernel; upstream i915 kernel drivers may cause performance regressions.
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

# Controls the persistent device compiled-code cache. Set to '1' to turn it on and '0' to turn it off.
# Recommended for all hardware environments; this variable is already set by default in the Docker images.
export SYCL_CACHE_PERSISTENT=1
root@clftower:/# cd /benchmark/all-in-one
root@clftower:/benchmark/all-in-one# env
TBBROOT=/opt/intel/oneapi/tbb/2022.0/env/..
PYTHONUNBUFFERED=1
ONEAPI_ROOT=/opt/intel/oneapi
PKG_CONFIG_PATH=/opt/intel/oneapi/vtune/2025.0/include/pkgconfig/lib64:/opt/intel/oneapi/tbb/2022.0/env/../lib/pkgconfig:/opt/intel/oneapi/mpi/2021.14/lib/pkgconfig:/opt/intel/oneapi/mkl/2025.0/lib/pkgconfig:/opt/intel/oneapi/ippcp/2025.0/lib/pkgconfig:/opt/intel/oneapi/dpl/2022.7/lib/pkgconfig:/opt/intel/oneapi/dnnl/2025.0/lib/pkgconfig:/opt/intel/oneapi/dal/2025.0/lib/pkgconfig:/opt/intel/oneapi/compiler/2025.0/lib/pkgconfig:/opt/intel/oneapi/ccl/2021.14/lib/pkgconfig/:/opt/intel/oneapi/advisor/2025.0/include/pkgconfig/lib64:
USE_XETLA=OFF
HOSTNAME=clftower
ADVISOR_2025_DIR=/opt/intel/oneapi/advisor/2025.0
CCL_ROOT=/opt/intel/oneapi/ccl/2021.14
I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.14
FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/2021.14/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
DNNLROOT=/opt/intel/oneapi/dnnl/2025.0
DIAGUTIL_PATH=/opt/intel/oneapi/dpcpp-ct/2025.0/etc/dpct/sys_check/sys_check.sh:/opt/intel/oneapi/compiler/2025.0/etc/compiler/sys_check/sys_check.sh
PWD=/benchmark/all-in-one
CCL_CONFIGURATION=cpu_gpu_dpcpp
DPL_ROOT=/opt/intel/oneapi/dpl/2022.7
MANPATH=/opt/intel/oneapi/mpi/2021.14/share/man:/opt/intel/oneapi/debugger/2025.0/share/man:/opt/intel/oneapi/compiler/2025.0/share/man:
TCM_ROOT=/opt/intel/oneapi/tcm/1.2
TZ=Asia/Shanghai
HOME=/root
GDB_INFO=/opt/intel/oneapi/debugger/2025.0/share/info/
CCL_CONFIGURATION_PATH=
LANG=C.UTF-8
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
SETVARS_COMPLETED=1
APM=/opt/intel/oneapi/advisor/2025.0/perfmodels
CMAKE_PREFIX_PATH=/opt/intel/oneapi/tbb/2022.0/env/..:/opt/intel/oneapi/pti/0.10/lib/cmake/pti:/opt/intel/oneapi/mkl/2025.0/lib/cmake:/opt/intel/oneapi/ipp/2022.0/lib/cmake/ipp:/opt/intel/oneapi/dpl/2022.7/lib/cmake/oneDPL:/opt/intel/oneapi/dnnl/2025.0/lib/cmake:/opt/intel/oneapi/dal/2025.0:/opt/intel/oneapi/compiler/2025.0
CMPLR_ROOT=/opt/intel/oneapi/compiler/2025.0
Pti_DIR=/opt/intel/oneapi/pti/0.10/lib/cmake/pti
INFOPATH=/opt/intel/oneapi/debugger/2025.0/share/info
IPPROOT=/opt/intel/oneapi/ipp/2022.0
IPP_TARGET_ARCH=intel64
LESSCLOSE=/usr/bin/lesspipe %s %s
PYTHONPATH=/opt/intel/oneapi/advisor/2025.0/pythonapi
TERM=xterm
DALROOT=/opt/intel/oneapi/dal/2025.0
LESSOPEN=| /usr/bin/lesspipe %s
UMF_ROOT=/opt/intel/oneapi/umf/0.9
LIBRARY_PATH=/opt/intel/oneapi/tcm/1.2/lib:/opt/intel/oneapi/umf/0.9/lib:/opt/intel/oneapi/tbb/2022.0/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/pti/0.10/lib:/opt/intel/oneapi/mpi/2021.14/lib:/opt/intel/oneapi/mkl/2025.0/lib:/opt/intel/oneapi/ippcp/2025.0/lib/:/opt/intel/oneapi/ipp/2022.0/lib:/opt/intel/oneapi/dnnl/2025.0/lib:/opt/intel/oneapi/dal/2025.0/lib:/opt/intel/oneapi/compiler/2025.0/lib:/opt/intel/oneapi/ccl/2021.14/lib/
DAL_MAJOR_BINARY=3
SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
IPPCRYPTOROOT=/opt/intel/oneapi/ippcp/2025.0
IPPCP_TARGET_ARCH=intel64
SHLVL=1
OCL_ICD_FILENAMES=/opt/intel/oneapi/compiler/2025.0/lib/libintelocl.so
CLASSPATH=/opt/intel/oneapi/mpi/2021.14/share/java/mpi.jar
LD_LIBRARY_PATH=/opt/intel/oneapi/tcm/1.2/lib:/opt/intel/oneapi/umf/0.9/lib:/opt/intel/oneapi/tbb/2022.0/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/pti/0.10/lib:/opt/intel/oneapi/mpi/2021.14/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.14/lib:/opt/intel/oneapi/mkl/2025.0/lib:/opt/intel/oneapi/ippcp/2025.0/lib/:/opt/intel/oneapi/ipp/2022.0/lib:/opt/intel/oneapi/dnnl/2025.0/lib:/opt/intel/oneapi/debugger/2025.0/opt/debugger/lib:/opt/intel/oneapi/dal/2025.0/lib:/opt/intel/oneapi/compiler/2025.0/opt/compiler/lib:/opt/intel/oneapi/compiler/2025.0/lib:/opt/intel/oneapi/ccl/2021.14/lib/
VTUNE_PROFILER_DIR=/opt/intel/oneapi/vtune/2025.0
MKLROOT=/opt/intel/oneapi/mkl/2025.0
DAL_MINOR_BINARY=0
VTUNE_PROFILER_2025_DIR=/opt/intel/oneapi/vtune/2025.0
NLSPATH=/opt/intel/oneapi/compiler/2025.0/lib/compiler/locale/%l_%t/%N
PATH=/opt/intel/oneapi/vtune/2025.0/bin64:/opt/intel/oneapi/mpi/2021.14/bin:/opt/intel/oneapi/mkl/2025.0/bin:/opt/intel/oneapi/dpcpp-ct/2025.0/bin:/opt/intel/oneapi/dev-utilities/2025.0/bin:/opt/intel/oneapi/debugger/2025.0/opt/debugger/bin:/opt/intel/oneapi/compiler/2025.0/bin:/opt/intel/oneapi/advisor/2025.0/bin64:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
SYCL_CACHE_PERSISTENT=1
INTEL_PYTHONHOME=/opt/intel/oneapi/debugger/2025.0/opt/debugger
IPEX_LLM_FORCE_BATCH_FORWARD=1
VLLM_RPC_TIMEOUT=100000
CPATH=/opt/intel/oneapi/umf/0.9/include:/opt/intel/oneapi/tbb/2022.0/env/../include:/opt/intel/oneapi/pti/0.10/include:/opt/intel/oneapi/mpi/2021.14/include:/opt/intel/oneapi/mkl/2025.0/include:/opt/intel/oneapi/ippcp/2025.0/include:/opt/intel/oneapi/ipp/2022.0/include:/opt/intel/oneapi/dpl/2022.7/include:/opt/intel/oneapi/dpcpp-ct/2025.0/include:/opt/intel/oneapi/dnnl/2025.0/include:/opt/intel/oneapi/dev-utilities/2025.0/include:/opt/intel/oneapi/dal/2025.0/include:/opt/intel/oneapi/ccl/2021.14/include
OLDPWD=/
_=/usr/bin/env
root@clftower:/benchmark/all-in-one#
root@clftower:/benchmark/all-in-one# source ipex-llm-init --gpu --device Arc
found intel-openmp in /usr/local/lib/libiomp5.so
found oneapi in /opt/intel/oneapi/setvars.sh

:: initializing oneAPI environment ...
   bash: BASH_VERSION = 5.1.16(1)-release
   args: Using "$@" for setvars.sh arguments: --force
:: advisor -- latest
:: ccl -- latest
:: compiler -- latest
:: dal -- latest
:: debugger -- latest
:: dev-utilities -- latest
:: dnnl -- latest
:: dpcpp-ct -- latest
:: dpl -- latest
:: ipp -- latest
:: ippcp -- latest
:: mkl -- latest
:: mpi -- latest
:: pti -- latest
:: tbb -- latest
:: umf -- latest
:: vtune -- latest
:: oneAPI environment initialized ::

[W625 21:39:04.779711591 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
[W625 21:39:06.124302589 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())

+++++ Env Variables +++++
Internal:
    ENABLE_IOMP     = 1
    ENABLE_GPU      = 1
    ENABLE_JEMALLOC = 0
    ENABLE_TCMALLOC = 0
    LIB_DIR         = /usr/local/lib
    BIN_DIR         = bin64
    LLM_DIR         = /usr/local/lib/python3.11/dist-packages/ipex_llm

Exported:
    LD_PRELOAD      = /usr/local/lib/libiomp5.so
    OMP_NUM_THREADS = 6
    MALLOC_CONF     =
    USE_XETLA       = OFF
    ENABLE_SDP_FUSION =
    SYCL_CACHE_PERSISTENT = 1
    BIGDL_LLM_XMX_DISABLED =
    SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS = 1
+++++++++++++++++++++++++
Complete.
root@clftower:/benchmark/all-in-one# python run.py
[W625 21:39:17.602247727 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
Traceback (most recent call last):
  File "/benchmark/all-in-one/run.py", line 2288, in <module>
    from omegaconf import OmegaConf
ModuleNotFoundError: No module named 'omegaconf'
[W625 21:39:21.038149895 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
root@clftower:/benchmark/all-in-one# pip install omegaconf
Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from omegaconf)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: PyYAML>=5.1.0 in /usr/local/lib/python3.11/dist-packages (from omegaconf) (6.0.2)
Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
Building wheels for collected packages: antlr4-python3-runtime
  DEPRECATION: Building 'antlr4-python3-runtime' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'antlr4-python3-runtime'.
  Discussion can be found at https://github.com/pypa/pip/issues/6334
  Building wheel for antlr4-python3-runtime (setup.py) ... done
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144592 sha256=d71ecd9c96205c679d76c2607a44994a095a1fac3fac1620262348069667deb6
  Stored in directory: /root/.cache/pip/wheels/1a/97/32/461f837398029ad76911109f07047fde1d7b661a147c7c56d1
Successfully built antlr4-python3-runtime
Installing collected packages: antlr4-python3-runtime, omegaconf
Successfully installed antlr4-python3-runtime-4.9.3 omegaconf-2.3.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
root@clftower:/benchmark/all-in-one# python run.py
[W625 21:39:59.753849880 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
2025-06-25 21:40:02,001 - ipex_llm.utils.common.log4Error - ERROR -
****************************Usage Error************************
/llm/llm-models/Llama-2-7b-chat-hf not exists!, Please check your models' folder.
2025-06-25 21:40:02,001 - ipex_llm.utils.common.log4Error - ERROR -
****************************Call Stack*************************
Traceback (most recent call last):
  File "/benchmark/all-in-one/run.py", line 2338, in <module>
    run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
  File "/benchmark/all-in-one/run.py", line 157, in run_model
    result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, cpu_embedding, fp16=True, lookahead=lookahead, task=task)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/benchmark/all-in-one/run.py", line 479, in run_transformer_int4_gpu
    model_path = get_model_path(repo_id, local_model_hub)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/benchmark/all-in-one/run.py", line 232, in get_model_path
    invalidInputError(os.path.isdir(local_model_path),
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/common/log4Error.py", line 32, in invalidInputError
    raise RuntimeError(errMsg)
RuntimeError: /llm/llm-models/Llama-2-7b-chat-hf not exists!, Please check your models' folder.
[W625 21:40:02.181337868 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
(base) testuser@clftower:~/ollama/ipex/ipex-llm/docker/llm/serving/xpu/docker$ docker exec -it $CONTAINER_NAME bash
root@clftower:/benchmark/all-in-one# ls -l /llm/models/
total 7045904
drwxr-xr-x 4 root root       4096 Jun 24 06:54 Llama-3.1-8B-Instruct
drwxrwxr-x 2 1000 1000       4096 Jun 24 22:42 Llama-3.1-8B-Instruct-ov-int8
-rw-rw-r-- 1 1000 1000 7214987928 Jun 24 22:46 Llama-3.1-8B-Instruct-ov-int8.tar.gz
-rw-rw-r-- 1 1000 1000        156 Jun 25 00:16 Modelfile
root@clftower:/benchmark/all-in-one# python run.py
[W625 21:42:57.820825994 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]
[W625 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.65s/it]
2025-06-25 21:43:15,236 - ipex_llm.transformers.utils - WARNING - sym_int4 is deprecated, use woq_int4 instead, if you are loading saved sym_int4 low bit model, please resaved it with woq_int4
2025-06-25 21:43:15,236 - ipex_llm.transformers.utils - INFO - Converting the current model to sym_int4 format......
/usr/local/lib/python3.11/dist-packages/torch/nn/init.py:511: UserWarning: Initializing zero-element tensors is a no-op
  warnings.warn("Initializing zero-element tensors is a no-op")
INFO 06-25 21:43:16 [__init__.py:239] Automatically detected platform xpu.
>> loading of model costs 25.695578124999884s and 0.0GB
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025-06-25 21:43:26,045 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-06-25 21:43:26,045 - ipex_llm.utils.benchmark_util_4_47 - WARNING - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2025-06-25 21:43:26,179 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in thread Thread-3 (run_model_in_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/benchmark/all-in-one/run.py", line 70, in run_model_in_thread
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 2305, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 3309, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 524, in __call__
    return self.model(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 103, in llama_model_forward
    return LlamaModel.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 453, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 308, in forward
    hidden_states, self_attn_weights = self.self_attn(
                                       ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 144, in llama_attention_forward
    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
    raise AttributeError(
AttributeError: 'LlamaAttention' object has no attribute 'num_heads'
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025-06-25 21:43:28,882 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-06-25 21:43:28,882 - ipex_llm.utils.benchmark_util_4_47 - WARNING - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Exception in thread Thread-4 (run_model_in_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/benchmark/all-in-one/run.py", line 70, in run_model_in_thread
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 2305, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 3309, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 524, in __call__
    return self.model(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 103, in llama_model_forward
    return LlamaModel.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 453, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 308, in forward
    hidden_states, self_attn_weights = self.self_attn(
                                       ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 144, in llama_attention_forward
    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
    raise AttributeError(
AttributeError: 'LlamaAttention' object has no attribute 'num_heads'
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025-06-25 21:43:29,466 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-06-25 21:43:29,467 - ipex_llm.utils.benchmark_util_4_47 - WARNING - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Exception in thread Thread-5 (run_model_in_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/benchmark/all-in-one/run.py", line 70, in run_model_in_thread
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 2305, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 3309, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 524, in __call__
    return self.model(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 103, in llama_model_forward
    return LlamaModel.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 453, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 308, in forward
    hidden_states, self_attn_weights = self.self_attn(
                                       ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 144, in llama_attention_forward
    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
    raise AttributeError(
AttributeError: 'LlamaAttention' object has no attribute 'num_heads'
root@clftower:/benchmark/all-in-one#
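A note on the env-check.sh step near the top of the session: the diagnostic script is not shipped inside this container image (the find returns nothing), but the host prompt shows an ipex-llm checkout at ~/ollama/ipex/ipex-llm, and the path that was attempted (python/llm/scripts) matches that repository layout. Assuming the script is present in that host checkout, a minimal sketch of copying it into the running container and executing it there (host-side commands; CONTAINER_NAME is the same variable used above):

# Run on the host. Assumes ~/ollama/ipex/ipex-llm is a full ipex-llm checkout containing python/llm/scripts/env-check.sh.
docker cp ~/ollama/ipex/ipex-llm/python/llm/scripts "$CONTAINER_NAME":/llm/scripts
docker exec -it "$CONTAINER_NAME" bash -c "cd /llm/scripts && bash env-check.sh"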
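A note on the first run.py failure: the all-in-one benchmark reads its model list from config.yaml in /benchmark/all-in-one, and the stock configuration points at /llm/llm-models/Llama-2-7b-chat-hf, which does not exist in this container. The run that follows the ls of /llm/models only succeeds after config.yaml has been edited; that edit is not captured in the session above. A minimal sketch of the two entries that matter, assuming the key names of the all-in-one config.yaml shipped with ipex-llm and leaving all other keys untouched:

# Edit /benchmark/all-in-one/config.yaml so the model hub and repo_id match what is actually on disk, roughly:
#   local_model_hub: '/llm/models'
#   repo_id:
#     - 'meta-llama/Llama-3.1-8B-Instruct'   # only the part after the last '/' has to match the folder name under /llm/models
grep -n "local_model_hub\|repo_id" /benchmark/all-in-one/config.yaml   # quick check that the edit took effect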
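A note on the final AttributeError: ipex_llm's patched llama_attention_forward still reads self.num_heads (and self.num_key_value_heads) from LlamaAttention, attributes that newer transformers releases removed from the attention modules and now expose only on the model config. The helper in the traceback is named benchmark_util_4_47, which suggests this ipex-llm build was written against transformers 4.47.x, while the installed transformers is newer (it already contains modeling_layers.py). Two possible workarounds follow; both are assumptions to verify against this image's own requirements, not a documented fix:

# Option 1: check the installed version, then pin transformers to the 4.47.x series this ipex-llm build appears to target.
python -c "import transformers; print(transformers.__version__)"
pip install "transformers==4.47.1"

# Option 2 (hypothetical monkey-patch): before calling model.generate(), restore the attributes the patch expects, e.g.
#   for module in model.modules():
#       if module.__class__.__name__ == "LlamaAttention":
#           module.num_heads = model.config.num_attention_heads
#           module.num_key_value_heads = model.config.num_key_value_heads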