Feature: Integrate with unified SYCL backend for Intel GPUs #2690
Changes from 53 commits
CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
+include(CheckIncludeFileCXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -96,13 +97,14 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
-option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
+option(LLAMA_SYCL "llama: use SYCL" OFF)
+option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -121,8 +123,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
+if (LLAMA_SYCL)
+    set(CMAKE_CXX_STANDARD 17)
+else()
+    set(CMAKE_CXX_STANDARD 11)
+endif()

> **Review comment:** How deep is the C++17 dependency in the SYCL backend? It's okay to optionally include it like this, but I'm wondering if it is realistic to implement this in C++11 at some point - it would be in better harmony with the rest of the codebase.
>
> **Reply:** The icpx compiler expects the C++17 standard, and SYCL depends on that version. We considered keeping the same C++11 version, but it causes compilation errors due to dependencies on C++17 headers.
>
> **Reply:** To add a bit more info: it is not just that the SYCL compiler, icpx, expects C++17. C++17 is a core aspect of the SYCL open standard. Any SYCL 2020 code is expected to be C++17 conformant, so the relationship is deeper than the specific implementation of the Khronos specification. The dependency between SYCL and C++17 is hard, and SYCL-specific features would likely not work if compiled as C++11. From the spec: https://registry.khronos.org/SYCL/specs/sycl-2020/pdf/sycl-2020.pdf

-set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
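To make the reviewers' point concrete, here is a minimal, hypothetical SYCL 2020 snippet (not part of this PR) that relies on C++17 class template argument deduction, which the SYCL 2020 API and headers use pervasively, so it cannot build under `-std=c++11`:

```
// Minimal SYCL 2020 sketch (illustrative only, not from this PR).
// CTAD on sycl::buffer, sycl::accessor and sycl::range is a C++17 feature.
#include <sycl/sycl.hpp>
#include <vector>

int main() {
    std::vector<float> a(1024, 1.0f), b(1024, 2.0f);
    sycl::queue q;                                    // default device selection
    {
        sycl::buffer buf_a{a};                        // CTAD deduces buffer<float, 1>
        sycl::buffer buf_b{b};
        q.submit([&](sycl::handler &h) {
            sycl::accessor A{buf_a, h};               // read-write accessor via CTAD
            sycl::accessor B{buf_b, h, sycl::read_only};
            h.parallel_for(sycl::range{1024}, [=](sycl::id<1> i) {
                A[i] += B[i];                         // element-wise add on the device
            });
        });
    }                                                 // buffer destructor copies results back to a
    return 0;
}
```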
@@ -338,18 +344,11 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

     if (LLAMA_STATIC)
-        if (WIN32)
-            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-        else ()
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
     else()
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)

     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
         # 60 == f16 CUDA intrinsics
@@ -426,9 +425,6 @@ if (LLAMA_HIPBLAS)
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
         if (BUILD_SHARED_LIBS)
             set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -454,6 +450,35 @@ if (LLAMA_HIPBLAS)
     endif()
 endif()

+if (LLAMA_SYCL)
+    if (NOT DEFINED ENV{ONEAPI_ROOT})
+        message(FATAL_ERROR "Cannot detect ENV{ONEAPI_ROOT}, please install oneAPI and source it, e.g.: source /opt/intel/oneapi/setvars.sh")
+    endif()
+    #todo: AOT
+
+    find_package(IntelSYCL REQUIRED)
+    if (LLAMA_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
+    add_compile_definitions(GGML_USE_SYCL)
+
+    add_compile_options(-I./) #include DPCT
+    add_compile_options(-I/${SYCL_INCLUDE_DIR})
+
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
+    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+endif()

 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -790,6 +815,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}   ${GGML_HEADERS_SYCL}
             )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@@ -0,0 +1,252 @@
# llama.cpp for SYCL

[Background](#background)

[OS](#os)

[Intel GPU](#intel-gpu)

[Linux](#linux)

[Environment Variable](#environment-variable)

[Known Issue](#known-issue)

[Todo](#todo)
## Background

SYCL is a higher-level programming model designed to improve programming productivity on various hardware accelerators, such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

oneAPI is an open, standards-based specification that supports multiple architecture types, including but not limited to GPUs, CPUs, and FPGAs. The spec covers both direct programming and API-based programming paradigms.

Intel uses SYCL as the direct programming language to support its CPUs, GPUs, and FPGAs.
To avoid reinventing the wheel, this code follows the other accelerator code paths in llama.cpp (such as OpenBLAS, cuBLAS, and CLBlast). We used the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate the CUDA code to SYCL, as sketched below.
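For reference, a hypothetical SYCLomatic invocation for migrating a single CUDA source file might look like this (`c2s` is SYCLomatic's command-line tool, `dpct` its commercial counterpart; the exact flags vary by version and are shown here as an assumption):

```
# Hypothetical example: migrate a CUDA file to SYCL with SYCLomatic.
c2s --in-root=. --out-root=./migrated ggml-cuda.cu
```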
llama.cpp for SYCL is used to support Intel GPUs.

For Intel CPUs, we recommend using llama.cpp for x86 (Intel MKL build).
## OS

|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |

## Intel GPU

|Intel GPU|Status|Verified Model|
|-|-|-|
|Intel Data Center Max Series|Support|Max 1550|
|Intel Data Center Flex Series|Support|Flex 170|
|Intel Arc Series|Support|Arc 770|
|Intel built-in Arc GPU|Support|built-in Arc GPU in Meteor Lake|
|Intel iGPU|Support|iGPU in i5-1250P, i7-1165G7|
## Linux

### Setup Environment

1. Install the Intel GPU driver.

a. Install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

Note: for an iGPU, install the client GPU driver.

b. Add your user to the **render** and **video** groups:

```
sudo usermod -aG render username
sudo usermod -aG video username
```

Note: log out and log back in for the change to take effect.

c. Check the installation:

```
sudo apt install clinfo
sudo clinfo -l
```

Output (example):

```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics

Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install the Intel® oneAPI Base Toolkit.

a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

We recommend installing to the default folder: **/opt/intel/oneapi**.

The following guide uses the default folder as an example. If you installed to another folder, adjust the paths below to match yours.

b. Check the installation:

```
source /opt/intel/oneapi/setvars.sh

sycl-ls
```

There should be one or more Level Zero devices, like **[ext_oneapi_level_zero:gpu:0]**.

Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
3. Build locally:

```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh

#for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference

#for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

#build example/main only
#cmake --build . --config Release --target main

#build all binaries
cmake --build . --config Release -v
```

or

```
./examples/sycl/build.sh
```

Note:

- By default, all binaries are built, which takes more time. To reduce build time, we recommend building **example/main** only.
### Run

1. Put the model file into the **models** folder.

2. Enable the oneAPI runtime environment:

```
source /opt/intel/oneapi/setvars.sh
```

3. List the device IDs.

Run without parameters:

```
./build/bin/ls-sycl-device

or

./build/bin/main
```

Check the IDs in the startup log, like:

```
found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```

|Attribute|Note|
|-|-|
|compute capability 1.3|Level Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level Zero in most cases|
4. Set the device ID and execute llama.cpp.

Set device ID = 0 via **GGML_SYCL_DEVICE=0**:

```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:

```
./examples/sycl/run_llama2.sh
```
Note:

- By default, mmap is used to read the model file. In some cases, this causes a hang at startup. We recommend adding the **--no-mmap** parameter to disable mmap() and avoid this issue.

5. Check the device ID in the output.

Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable

#### Build

|Name|Value|Function|
|-|-|-|
|LLAMA_SYCL|ON (mandatory)|Enable the SYCL code path.<br>LLAMA_SYCL=ON is mandatory for both FP32 and FP16.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 in the SYCL code path; faster for long-prompt inference.<br>For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use the icpx compiler for the SYCL code path|
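For example, a configure line combining these variables for an FP16 build (the same flags used in the build section above):

```
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
```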
#### Running

|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the device IDs in the default startup output|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging via the GGML_SYCL_DEBUG macro|
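For example, a run that selects device 1 and enables debug logging (model and prompt as in step 4 above):

```
GGML_SYCL_DEVICE=1 GGML_SYCL_DEBUG=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```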
## Known Issue

- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  The oneAPI runtime environment is not enabled.

  Install the oneAPI Base Toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
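A quick hypothetical check for whether the SYCL runtime library resolves (if oneAPI is not sourced, it will show as "not found"):

```
ldd ./build/bin/main | grep -i sycl
```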
- Hang during startup.

  llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems, the memcpy misbehaves and blocks.

  Solution: add **--no-mmap**, as in the example below.
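For example, the run command from step 4 with mmap disabled:

```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 --no-mmap
```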
## Todo

- Support building on Windows.

- Support multiple cards.