
Commit 6bbc598

Authored by SlyEcho, with YellowRoseCx, ardfork, funnbot, and Engininja2
ROCm Port (#1087)
* use hipblas based on cublas
* Update Makefile for the Cuda kernels
* Expand arch list and make it overrideable
* Fix multi GPU on multiple amd architectures with rocblas_initialize() (#5)
* add hipBLAS to README
* new build arg LLAMA_CUDA_MMQ_Y
* fix half2 decomposition
* Add intrinsics polyfills for AMD
* AMD assembly optimized __dp4a
* Allow overriding CC_TURING
* use "ROCm" instead of "CUDA"
* ignore all build dirs
* Add Dockerfiles
* fix llama-bench
* fix -nommq help for non CUDA/HIP

---------

Co-authored-by: YellowRoseCx <[email protected]>
Co-authored-by: ardfork <[email protected]>
Co-authored-by: funnbot <[email protected]>
Co-authored-by: Engininja2 <[email protected]>
Co-authored-by: Kerfuffle <[email protected]>
Co-authored-by: jammm <[email protected]>
Co-authored-by: jdecourval <[email protected]>
1 parent 3f460a2 commit 6bbc598

12 files changed: +335 −59 lines

.devops/full-rocm.Dockerfile

+44 lines

@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/main-rocm.Dockerfile

+44 lines

@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]

.dockerignore

+1 −8 lines

@@ -5,14 +5,7 @@
 .vscode/
 .DS_Store
 
-build/
-build-em/
-build-debug/
-build-release/
-build-static/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 
 models/*

.gitignore

+1 −14 lines

@@ -16,20 +16,7 @@
 .vs/
 .vscode/
 
-build/
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/

CMakeLists.txt

+38 lines

@@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
@@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
     endif()
 endif()
 
+if (LLAMA_HIPBLAS)
+    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+    endif()
+
+    find_package(hip)
+    find_package(hipblas)
+    find_package(rocblas)
+
+    if (${hipblas_FOUND} AND ${hip_FOUND})
+        message(STATUS "HIP and hipBLAS found")
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+
+        if (LLAMA_STATIC)
+            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
+    else()
+        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
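Roughly, the compiler warnings and `find_package()` calls above translate into a configure line like the following sketch; the build directory name is arbitrary, and `-DCMAKE_PREFIX_PATH` only needs overriding when ROCm is not installed under `/opt/rocm`:

```bash
# Use ROCm's LLVM as the toolchain, as the warnings above hint.
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
    cmake -B build -DLLAMA_HIPBLAS=ON -DCMAKE_PREFIX_PATH=/opt/rocm
cmake --build build -j $(nproc)
```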

Makefile

+24 lines

@@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 
+ifdef LLAMA_HIPBLAS
+    ROCM_PATH               ?= /opt/rocm
+    HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
+    GPU_TARGETS             ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+    LLAMA_CUDA_DMMV_X       ?= 32
+    LLAMA_CUDA_MMV_Y        ?= 1
+    LLAMA_CUDA_KQUANTS_ITER ?= 2
+    CFLAGS   += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+    CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+    LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+    LDFLAGS  += -lhipblas -lamdhip64 -lrocblas
+    HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
+    HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+    HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+    HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+    HIPFLAGS += -DCC_TURING=1000000000
+ifdef LLAMA_CUDA_FORCE_DMMV
+    HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
+    OBJS     += ggml-cuda.o
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # LLAMA_HIPBLAS
+
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
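A sketch of driving the new Makefile block directly. `GPU_TARGETS` defaults to whatever `amdgpu-arch` detects on the build machine, so the explicit architecture list here is only an example, as is the `LLAMA_CUDA_MMV_Y` override:

```bash
# LLAMA_HIPBLAS=1 enables the ifdef block above; everything else has defaults.
make LLAMA_HIPBLAS=1 \
     GPU_TARGETS="gfx1030 gfx1100" \
     LLAMA_CUDA_MMV_Y=2 \
     -j $(nproc)
```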

README.md

+29 lines

@@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
   | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
   | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
+- #### hipBLAS
+
+  This provides BLAS acceleration on HIP-supported GPUs such as AMD GPUs.
+  Make sure to have ROCm installed.
+  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  Windows support is coming soon...
+
+  - Using `make`:
+    ```bash
+    make LLAMA_HIPBLAS=1
+    ```
+  - Using `CMake`:
+    ```bash
+    mkdir build
+    cd build
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+    cmake --build .
+    ```
+
+  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+  If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
+
+  | Option | Legal values | Default | Description |
+  |-------------------------|------------------------|---------|-------------|
+  | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+
 - #### CLBlast
 
   OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
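To make the `HIP_VISIBLE_DEVICES` and `HSA_OVERRIDE_GFX_VERSION` notes from the new README section concrete, a hedged runtime sketch; the model path, prompt, and `-ngl` layer count are placeholders:

```bash
# Pin inference to the first AMD GPU.
HIP_VISIBLE_DEVICES=0 ./main -m ./models/7B/ggml-model-q4_0.bin -p "Hello" -ngl 32

# On an unsupported RDNA2 card, report it to the runtime as gfx1030 (10.3.0).
HSA_OVERRIDE_GFX_VERSION=10.3.0 ./main -m ./models/7B/ggml-model-q4_0.bin -p "Hello" -ngl 32
```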

common/common.cpp

+3 −1 lines

@@ -613,9 +613,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
     fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, "  use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "  use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
     fprintf(stdout, "  Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
     fprintf(stdout, "  --mtest               compute maximum memory usage\n");
     fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");

examples/llama-bench/llama-bench.cpp

+1 −3 lines

@@ -18,9 +18,7 @@
 #include "llama.h"
 #include "common.h"
 #include "build-info.h"
-#ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
-#endif
 
 // utils
 static uint64_t get_time_ns() {
@@ -504,7 +502,7 @@ struct test {
 
     static std::string get_backend() {
         if (cuda) {
-            return "CUDA";
+            return GGML_CUDA_NAME;
        }
        if (opencl) {
            return "OpenCL";
