(base) testuser@clftower:~/ollama/ipex/ipex-llm/docker/llm/serving/xpu/docker$ docker exec -it $CONTAINER_NAME bash
root@clftower:/llm# sycl-ls
[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Arc(TM) A770 Graphics 12.55.8 [1.6.32224.500000]
[opencl:cpu][opencl:0] Intel(R) OpenCL, Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz OpenCL 3.0 (Build 0) [2024.18.12.0.05_160000]
[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [24.52.32224.5]
root@clftower:/llm# cd /ipex-llm/python/llm/scripts
bash env-check.sh
bash: cd: /ipex-llm/python/llm/scripts: No such file or directory
bash: env-check.sh: No such file or directory
root@clftower:/llm# cd /
root@clftower:/# ll
total 252
drwxr-xr-x   1 root root   4096 Jun 25 21:34 ./
drwxr-xr-x   1 root root   4096 Jun 25 21:34 ../
-rwxr-xr-x   1 root root      0 Jun 25 21:34 .dockerenv*
drwxr-xr-x   8 root root   4096 Jun 25 21:24 benchmark/
lrwxrwxrwx   1 root root      7 Sep 11  2024 bin -> usr/bin/
drwxr-xr-x   2 root root   4096 Apr 18  2022 boot/
drwxr-xr-x   6 root root    380 Jun 25 21:34 dev/
drwxr-xr-x   1 root root   4096 Jun 25 21:34 etc/
drwxr-xr-x  56 root root   4096 Jun 25 21:24 examples/
drwxr-xr-x   2 root root   4096 Apr 18  2022 home/
lrwxrwxrwx   1 root root      7 Sep 11  2024 lib -> usr/lib/
lrwxrwxrwx   1 root root      9 Sep 11  2024 lib32 -> usr/lib32/
lrwxrwxrwx   1 root root      9 Sep 11  2024 lib64 -> usr/lib64/
lrwxrwxrwx   1 root root     10 Sep 11  2024 libx32 -> usr/libx32/
drwxr-xr-x   1 root root   4096 Jun 25 21:34 llm/
drwxr-xr-x   2 root root   4096 Sep 11  2024 media/
drwxr-xr-x   2 root root   4096 Sep 11  2024 mnt/
drwxr-xr-x   1 root root   4096 Jun 25 21:21 opt/
dr-xr-xr-x 415 root root      0 Jun 25 21:34 proc/
drwx------   1 root root   4096 Jun 25 21:34 root/
drwxr-xr-x   1 root root   4096 Dec 12  2024 run/
lrwxrwxrwx   1 root root      8 Sep 11  2024 sbin -> usr/sbin/
drwxr-xr-x   2 root root   4096 Sep 11  2024 srv/
dr-xr-xr-x  13 root root      0 Jun 25 19:15 sys/
-rw-r--r--   1 root root 160864 Oct 31  2024 third-party-programs.txt
drwxrwxrwt   1 root root  12288 Jun 25 21:32 tmp/
drwxr-xr-x   1 root root   4096 Sep 11  2024 usr/
drwxr-xr-x   2 root root   4096 Jun 25 21:24 vLLM-Serving/
drwxr-xr-x   1 root root   4096 Sep 11  2024 var/
root@clftower:/# find . -name 'scripts' -type d
./opt/intel/oneapi/advisor/2025.0/documentation/en/help/scripts
./opt/intel/oneapi/2025.0/opt/debugger/lib/python3.12/venv/scripts
./opt/intel/oneapi/vtune/2025.0/bin64/os-perf/scripts
./opt/intel/oneapi/vtune/2025.0/bin64/resources/app/scripts
./opt/intel/oneapi/debugger/2025.0/opt/debugger/lib/python3.12/venv/scripts
./usr/share/doc/wrk/examples/scripts
./usr/lib/python3.10/venv/scripts
./usr/lib/python3.11/venv/scripts
./usr/local/lib/python3.11/dist-packages/scripts
./usr/local/lib/python3.11/dist-packages/numba/scripts
./usr/local/lib/python3.11/dist-packages/ray/scripts
./usr/local/lib/python3.11/dist-packages/accelerate/test_utils/scripts
./llm/vllm/.buildkite/scripts
./llm/vllm/.buildkite/nightly-benchmarks/scripts
./llm/vllm/.github/scripts
./llm/vllm/.github/workflows/scripts
root@clftower:/# find . -name 'env-check.sh'
root@clftower:/#

# Disable code related to XETLA; only Intel Data Center GPU Max Series supports XETLA, so non-Max machines should set this to OFF.
# Recommended for Intel Arc™ A-Series Graphics and Intel Data Center GPU Flex Series.
export USE_XETLA=OFF

# Enable immediate command lists mode for the Level Zero plugin.
# Can improve performance on Intel Arc™ A-Series Graphics and Intel Data Center GPU Max Series, but the benefit depends on the Linux kernel; upstream i915 kernel drivers may cause performance regressions.
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1

# Controls the persistent device compiled-code cache. Set to '1' to turn it on and '0' to turn it off.
# Recommended for all hardware environments; this variable is already set by default in the Docker images.
export SYCL_CACHE_PERSISTENT=1
root@clftower:/# cd /benchmark/all-in-one
root@clftower:/benchmark/all-in-one# env
TBBROOT=/opt/intel/oneapi/tbb/2022.0/env/..
PYTHONUNBUFFERED=1
ONEAPI_ROOT=/opt/intel/oneapi
PKG_CONFIG_PATH=/opt/intel/oneapi/vtune/2025.0/include/pkgconfig/lib64:/opt/intel/oneapi/tbb/2022.0/env/../lib/pkgconfig:/opt/intel/oneapi/mpi/2021.14/lib/pkgconfig:/opt/intel/oneapi/mkl/2025.0/lib/pkgconfig:/opt/intel/oneapi/ippcp/2025.0/lib/pkgconfig:/opt/intel/oneapi/dpl/2022.7/lib/pkgconfig:/opt/intel/oneapi/dnnl/2025.0/lib/pkgconfig:/opt/intel/oneapi/dal/2025.0/lib/pkgconfig:/opt/intel/oneapi/compiler/2025.0/lib/pkgconfig:/opt/intel/oneapi/ccl/2021.14/lib/pkgconfig/:/opt/intel/oneapi/advisor/2025.0/include/pkgconfig/lib64:
USE_XETLA=OFF
HOSTNAME=clftower
ADVISOR_2025_DIR=/opt/intel/oneapi/advisor/2025.0
CCL_ROOT=/opt/intel/oneapi/ccl/2021.14
I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.14
FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/2021.14/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
DNNLROOT=/opt/intel/oneapi/dnnl/2025.0
DIAGUTIL_PATH=/opt/intel/oneapi/dpcpp-ct/2025.0/etc/dpct/sys_check/sys_check.sh:/opt/intel/oneapi/compiler/2025.0/etc/compiler/sys_check/sys_check.sh
PWD=/benchmark/all-in-one
CCL_CONFIGURATION=cpu_gpu_dpcpp
DPL_ROOT=/opt/intel/oneapi/dpl/2022.7
MANPATH=/opt/intel/oneapi/mpi/2021.14/share/man:/opt/intel/oneapi/debugger/2025.0/share/man:/opt/intel/oneapi/compiler/2025.0/share/man:
TCM_ROOT=/opt/intel/oneapi/tcm/1.2
TZ=Asia/Shanghai
HOME=/root
GDB_INFO=/opt/intel/oneapi/debugger/2025.0/share/info/
CCL_CONFIGURATION_PATH=
LANG=C.UTF-8
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
SETVARS_COMPLETED=1
APM=/opt/intel/oneapi/advisor/2025.0/perfmodels
CMAKE_PREFIX_PATH=/opt/intel/oneapi/tbb/2022.0/env/..:/opt/intel/oneapi/pti/0.10/lib/cmake/pti:/opt/intel/oneapi/mkl/2025.0/lib/cmake:/opt/intel/oneapi/ipp/2022.0/lib/cmake/ipp:/opt/intel/oneapi/dpl/2022.7/lib/cmake/oneDPL:/opt/intel/oneapi/dnnl/2025.0/lib/cmake:/opt/intel/oneapi/dal/2025.0:/opt/intel/oneapi/compiler/2025.0
CMPLR_ROOT=/opt/intel/oneapi/compiler/2025.0
Pti_DIR=/opt/intel/oneapi/pti/0.10/lib/cmake/pti
INFOPATH=/opt/intel/oneapi/debugger/2025.0/share/info
IPPROOT=/opt/intel/oneapi/ipp/2022.0
IPP_TARGET_ARCH=intel64
LESSCLOSE=/usr/bin/lesspipe %s %s
PYTHONPATH=/opt/intel/oneapi/advisor/2025.0/pythonapi
TERM=xterm
DALROOT=/opt/intel/oneapi/dal/2025.0
LESSOPEN=| /usr/bin/lesspipe %s
UMF_ROOT=/opt/intel/oneapi/umf/0.9
LIBRARY_PATH=/opt/intel/oneapi/tcm/1.2/lib:/opt/intel/oneapi/umf/0.9/lib:/opt/intel/oneapi/tbb/2022.0/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/pti/0.10/lib:/opt/intel/oneapi/mpi/2021.14/lib:/opt/intel/oneapi/mkl/2025.0/lib:/opt/intel/oneapi/ippcp/2025.0/lib/:/opt/intel/oneapi/ipp/2022.0/lib:/opt/intel/oneapi/dnnl/2025.0/lib:/opt/intel/oneapi/dal/2025.0/lib:/opt/intel/oneapi/compiler/2025.0/lib:/opt/intel/oneapi/ccl/2021.14/lib/
DAL_MAJOR_BINARY=3
SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
IPPCRYPTOROOT=/opt/intel/oneapi/ippcp/2025.0
IPPCP_TARGET_ARCH=intel64
SHLVL=1
OCL_ICD_FILENAMES=/opt/intel/oneapi/compiler/2025.0/lib/libintelocl.so
CLASSPATH=/opt/intel/oneapi/mpi/2021.14/share/java/mpi.jar
LD_LIBRARY_PATH=/opt/intel/oneapi/tcm/1.2/lib:/opt/intel/oneapi/umf/0.9/lib:/opt/intel/oneapi/tbb/2022.0/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/pti/0.10/lib:/opt/intel/oneapi/mpi/2021.14/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/2021.14/lib:/opt/intel/oneapi/mkl/2025.0/lib:/opt/intel/oneapi/ippcp/2025.0/lib/:/opt/intel/oneapi/ipp/2022.0/lib:/opt/intel/oneapi/dnnl/2025.0/lib:/opt/intel/oneapi/debugger/2025.0/opt/debugger/lib:/opt/intel/oneapi/dal/2025.0/lib:/opt/intel/oneapi/compiler/2025.0/opt/compiler/lib:/opt/intel/oneapi/compiler/2025.0/lib:/opt/intel/oneapi/ccl/2021.14/lib/
VTUNE_PROFILER_DIR=/opt/intel/oneapi/vtune/2025.0
MKLROOT=/opt/intel/oneapi/mkl/2025.0
DAL_MINOR_BINARY=0
VTUNE_PROFILER_2025_DIR=/opt/intel/oneapi/vtune/2025.0
NLSPATH=/opt/intel/oneapi/compiler/2025.0/lib/compiler/locale/%l_%t/%N
PATH=/opt/intel/oneapi/vtune/2025.0/bin64:/opt/intel/oneapi/mpi/2021.14/bin:/opt/intel/oneapi/mkl/2025.0/bin:/opt/intel/oneapi/dpcpp-ct/2025.0/bin:/opt/intel/oneapi/dev-utilities/2025.0/bin:/opt/intel/oneapi/debugger/2025.0/opt/debugger/bin:/opt/intel/oneapi/compiler/2025.0/bin:/opt/intel/oneapi/advisor/2025.0/bin64:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
SYCL_CACHE_PERSISTENT=1
INTEL_PYTHONHOME=/opt/intel/oneapi/debugger/2025.0/opt/debugger
IPEX_LLM_FORCE_BATCH_FORWARD=1
VLLM_RPC_TIMEOUT=100000
CPATH=/opt/intel/oneapi/umf/0.9/include:/opt/intel/oneapi/tbb/2022.0/env/../include:/opt/intel/oneapi/pti/0.10/include:/opt/intel/oneapi/mpi/2021.14/include:/opt/intel/oneapi/mkl/2025.0/include:/opt/intel/oneapi/ippcp/2025.0/include:/opt/intel/oneapi/ipp/2022.0/include:/opt/intel/oneapi/dpl/2022.7/include:/opt/intel/oneapi/dpcpp-ct/2025.0/include:/opt/intel/oneapi/dnnl/2025.0/include:/opt/intel/oneapi/dev-utilities/2025.0/include:/opt/intel/oneapi/dal/2025.0/include:/opt/intel/oneapi/ccl/2021.14/include
OLDPWD=/
_=/usr/bin/env
root@clftower:/benchmark/all-in-one#
root@clftower:/benchmark/all-in-one# source ipex-llm-init --gpu --device Arc
found intel-openmp in /usr/local/lib/libiomp5.so
found oneapi in /opt/intel/oneapi/setvars.sh

:: initializing oneAPI environment ...
   bash: BASH_VERSION = 5.1.16(1)-release
   args: Using "$@" for setvars.sh arguments: --force
:: advisor -- latest
:: ccl -- latest
:: compiler -- latest
:: dal -- latest
:: debugger -- latest
:: dev-utilities -- latest
:: dnnl -- latest
:: dpcpp-ct -- latest
:: dpl -- latest
:: ipp -- latest
:: ippcp -- latest
:: mkl -- latest
:: mpi -- latest
:: pti -- latest
:: tbb -- latest
:: umf -- latest
:: vtune -- latest
:: oneAPI environment initialized ::

[W625 21:39:04.779711591 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
[W625 21:39:06.124302589 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())

+++++ Env Variables +++++
Internal:
    ENABLE_IOMP     = 1
    ENABLE_GPU      = 1
    ENABLE_JEMALLOC = 0
    ENABLE_TCMALLOC = 0
    LIB_DIR         = /usr/local/lib
    BIN_DIR         = bin64
    LLM_DIR         = /usr/local/lib/python3.11/dist-packages/ipex_llm

Exported:
    LD_PRELOAD      = /usr/local/lib/libiomp5.so
    OMP_NUM_THREADS = 6
    MALLOC_CONF     =
    USE_XETLA       = OFF
    ENABLE_SDP_FUSION =
    SYCL_CACHE_PERSISTENT = 1
    BIGDL_LLM_XMX_DISABLED =
    SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS = 1
+++++++++++++++++++++++++
Complete.
root@clftower:/benchmark/all-in-one# python run.py
[W625 21:39:17.602247727 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
Traceback (most recent call last):
  File "/benchmark/all-in-one/run.py", line 2288, in <module>
    from omegaconf import OmegaConf
ModuleNotFoundError: No module named 'omegaconf'
[W625 21:39:21.038149895 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
root@clftower:/benchmark/all-in-one# pip install omegaconf
Collecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting antlr4-python3-runtime==4.9.* (from omegaconf)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: PyYAML>=5.1.0 in /usr/local/lib/python3.11/dist-packages (from omegaconf) (6.0.2)
Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
Building wheels for collected packages: antlr4-python3-runtime
  DEPRECATION: Building 'antlr4-python3-runtime' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'antlr4-python3-runtime'.
  Discussion can be found at https://github.com/pypa/pip/issues/6334
  Building wheel for antlr4-python3-runtime (setup.py) ... done
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144592 sha256=d71ecd9c96205c679d76c2607a44994a095a1fac3fac1620262348069667deb6
  Stored in directory: /root/.cache/pip/wheels/1a/97/32/461f837398029ad76911109f07047fde1d7b661a147c7c56d1
Successfully built antlr4-python3-runtime
Installing collected packages: antlr4-python3-runtime, omegaconf
Successfully installed antlr4-python3-runtime-4.9.3 omegaconf-2.3.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
root@clftower:/benchmark/all-in-one# python run.py
[W625 21:39:59.753849880 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
2025-06-25 21:40:02,001 - ipex_llm.utils.common.log4Error - ERROR -
****************************Usage Error************************
/llm/llm-models/Llama-2-7b-chat-hf not exists!, Please check your models' folder.
2025-06-25 21:40:02,001 - ipex_llm.utils.common.log4Error - ERROR -
****************************Call Stack*************************
Traceback (most recent call last):
  File "/benchmark/all-in-one/run.py", line 2338, in <module>
    run_model(model, api, in_out_pairs, conf['local_model_hub'], conf['warm_up'], conf['num_trials'], conf['num_beams'],
  File "/benchmark/all-in-one/run.py", line 157, in run_model
    result = run_transformer_int4_gpu(repo_id, local_model_hub, in_out_pairs, warm_up, num_trials, num_beams, low_bit, batch_size, cpu_embedding, fp16=True, lookahead=lookahead, task=task)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/benchmark/all-in-one/run.py", line 479, in run_transformer_int4_gpu
    model_path = get_model_path(repo_id, local_model_hub)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/benchmark/all-in-one/run.py", line 232, in get_model_path
    invalidInputError(os.path.isdir(local_model_path),
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/common/log4Error.py", line 32, in invalidInputError
    raise RuntimeError(errMsg)
RuntimeError: /llm/llm-models/Llama-2-7b-chat-hf not exists!, Please check your models' folder.
[W625 21:40:02.181337868 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
(base) testuser@clftower:~/ollama/ipex/ipex-llm/docker/llm/serving/xpu/docker$ docker exec -it $CONTAINER_NAME bash
root@clftower:/benchmark/all-in-one# ls -l /llm/models/
total 7045904
drwxr-xr-x 4 root root       4096 Jun 24 06:54 Llama-3.1-8B-Instruct
drwxrwxr-x 2 1000 1000       4096 Jun 24 22:42 Llama-3.1-8B-Instruct-ov-int8
-rw-rw-r-- 1 1000 1000 7214987928 Jun 24 22:46 Llama-3.1-8B-Instruct-ov-int8.tar.gz
-rw-rw-r-- 1 1000 1000        156 Jun 25 00:16 Modelfile
root@clftower:/benchmark/all-in-one# python run.py
[W625 21:42:57.820825994 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]
[W625 OperatorEntry.cpp:154] Warning: Warning only once for all operators, other operators may also be overridden.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: XPU
  previous kernel: registered at /pytorch/build/aten/src/ATen/RegisterCPU.cpp:30477
       new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/aten/generated/ATen/RegisterXPU.cpp:468 (function operator())
Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.65s/it]
2025-06-25 21:43:15,236 - ipex_llm.transformers.utils - WARNING - sym_int4 is deprecated, use woq_int4 instead, if you are loading saved sym_int4 low bit model, please resaved it with woq_int4
2025-06-25 21:43:15,236 - ipex_llm.transformers.utils - INFO - Converting the current model to sym_int4 format......
/usr/local/lib/python3.11/dist-packages/torch/nn/init.py:511: UserWarning: Initializing zero-element tensors is a no-op
  warnings.warn("Initializing zero-element tensors is a no-op")
INFO 06-25 21:43:16 [__init__.py:239] Automatically detected platform xpu.
>> loading of model costs 25.695578124999884s and 0.0GB
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025-06-25 21:43:26,045 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-06-25 21:43:26,045 - ipex_llm.utils.benchmark_util_4_47 - WARNING - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2025-06-25 21:43:26,179 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Exception in thread Thread-3 (run_model_in_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/benchmark/all-in-one/run.py", line 70, in run_model_in_thread
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 2305, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 3309, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 524, in __call__
    return self.model(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 103, in llama_model_forward
    return LlamaModel.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 453, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 308, in forward
    hidden_states, self_attn_weights = self.self_attn(
                                       ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 144, in llama_attention_forward
    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
    raise AttributeError(
AttributeError: 'LlamaAttention' object has no attribute 'num_heads'
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025-06-25 21:43:28,882 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-06-25 21:43:28,882 - ipex_llm.utils.benchmark_util_4_47 - WARNING - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Exception in thread Thread-4 (run_model_in_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/benchmark/all-in-one/run.py", line 70, in run_model_in_thread
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 2305, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 3309, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 524, in __call__
    return self.model(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 103, in llama_model_forward
    return LlamaModel.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 453, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 308, in forward
    hidden_states, self_attn_weights = self.self_attn(
                                       ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 144, in llama_attention_forward
    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
    raise AttributeError(
AttributeError: 'LlamaAttention' object has no attribute 'num_heads'
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
2025-06-25 21:43:29,466 - ipex_llm.utils.benchmark_util_4_47 - WARNING - The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2025-06-25 21:43:29,467 - ipex_llm.utils.benchmark_util_4_47 - WARNING - Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Exception in thread Thread-5 (run_model_in_thread):
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.11/threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "/benchmark/all-in-one/run.py", line 70, in run_model_in_thread
    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 2305, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 3309, in _sample
    outputs = self(**model_inputs, return_dict=True)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/utils/benchmark_util_4_47.py", line 524, in __call__
    return self.model(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 103, in llama_model_forward
    return LlamaModel.forward(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/utils/generic.py", line 969, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 453, in forward
    layer_outputs = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/modeling_layers.py", line 48, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/llama/modeling_llama.py", line 308, in forward
    hidden_states, self_attn_weights = self.self_attn(
                                       ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipex_llm/transformers/models/llama.py", line 144, in llama_attention_forward
    qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
                                ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1928, in __getattr__
    raise AttributeError(
AttributeError: 'LlamaAttention' object has no attribute 'num_heads'
root@clftower:/benchmark/all-in-one#
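A note on the env-check.sh step near the top of the session: the diagnostic script is not shipped inside this container image (the find returns nothing), but the host prompt shows an ipex-llm checkout at ~/ollama/ipex/ipex-llm, and the path that was attempted (python/llm/scripts) matches that repository layout. Assuming the script is present in that host checkout, a minimal sketch of copying it into the running container and executing it there (host-side commands; CONTAINER_NAME is the same variable used above):

# Run on the host. Assumes ~/ollama/ipex/ipex-llm is a full ipex-llm checkout containing python/llm/scripts/env-check.sh.
docker cp ~/ollama/ipex/ipex-llm/python/llm/scripts "$CONTAINER_NAME":/llm/scripts
docker exec -it "$CONTAINER_NAME" bash -c "cd /llm/scripts && bash env-check.sh"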
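A note on the first run.py failure: the all-in-one benchmark reads its model list from config.yaml in /benchmark/all-in-one, and the stock configuration points at /llm/llm-models/Llama-2-7b-chat-hf, which does not exist in this container. The run that follows the ls of /llm/models only succeeds after config.yaml has been edited; that edit is not captured in the session above. A minimal sketch of the two entries that matter, assuming the key names of the all-in-one config.yaml shipped with ipex-llm and leaving all other keys untouched:

# Edit /benchmark/all-in-one/config.yaml so the model hub and repo_id match what is actually on disk, roughly:
#   local_model_hub: '/llm/models'
#   repo_id:
#     - 'meta-llama/Llama-3.1-8B-Instruct'   # only the part after the last '/' has to match the folder name under /llm/models
grep -n "local_model_hub\|repo_id" /benchmark/all-in-one/config.yaml   # quick check that the edit took effect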
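A note on the final AttributeError: ipex_llm's patched llama_attention_forward still reads self.num_heads (and self.num_key_value_heads) from LlamaAttention, attributes that newer transformers releases removed from the attention modules and now expose only on the model config. The helper in the traceback is named benchmark_util_4_47, which suggests this ipex-llm build was written against transformers 4.47.x, while the installed transformers is newer (it already contains modeling_layers.py). Two possible workarounds follow; both are assumptions to verify against this image's own requirements, not a documented fix:

# Option 1: check the installed version, then pin transformers to the 4.47.x series this ipex-llm build appears to target.
python -c "import transformers; print(transformers.__version__)"
pip install "transformers==4.47.1"

# Option 2 (hypothetical monkey-patch): before calling model.generate(), restore the attributes the patch expects, e.g.
#   for module in model.modules():
#       if module.__class__.__name__ == "LlamaAttention":
#           module.num_heads = model.config.num_attention_heads
#           module.num_key_value_heads = model.config.num_key_value_heads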