Skip to content

Commit 3f460a2

Browse files
authored
cuda : add RoPE kernel for mode == 2 (NeoX) (#2760)
* cuda : add RoPE kernel for mode == 2 (NeoX) * falcon : do not offload the embeddings layer
1 parent 87e3733 commit 3f460a2

File tree

2 files changed

+54
-26
lines changed

2 files changed

+54
-26
lines changed

ggml-cuda.cu

+33-25
Original file line numberDiff line numberDiff line change
@@ -3907,28 +3907,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
39073907
dst[i + 1] = x0*sin_theta + x1*cos_theta;
39083908
}
39093909

3910-
// TODO: this implementation is wrong!
3911-
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
3912-
// const float p_delta, const int p_delta_rows, const float theta_scale) {
3913-
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
3914-
//
3915-
// if (col >= ncols) {
3916-
// return;
3917-
// }
3918-
//
3919-
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
3920-
// const int i = row*ncols + col/2;
3921-
//
3922-
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
3923-
// const float sin_theta = sinf(theta);
3924-
// const float cos_theta = cosf(theta);
3925-
//
3926-
// const float x0 = x[i + 0];
3927-
// const float x1 = x[i + ncols/2];
3928-
//
3929-
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
3930-
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
3931-
//}
3910+
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
3911+
const float p_delta, const int p_delta_rows, const float theta_scale) {
3912+
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
3913+
3914+
if (col >= ncols) {
3915+
return;
3916+
}
3917+
3918+
const int row = blockDim.x*blockIdx.x + threadIdx.x;
3919+
const int i = row*ncols + col/2;
3920+
3921+
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
3922+
const float sin_theta = sinf(theta);
3923+
const float cos_theta = cosf(theta);
3924+
3925+
const float x0 = x[i + 0];
3926+
const float x1 = x[i + ncols/2];
3927+
3928+
dst[i + 0] = x0*cos_theta - x1*sin_theta;
3929+
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
3930+
}
39323931

39333932
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
39343933
const int col = blockDim.x*blockIdx.x + threadIdx.x;
@@ -4799,13 +4798,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
47994798

48004799
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
48014800
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4802-
GGML_ASSERT(nrows % 2 == 0);
4801+
GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
48034802
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
48044803
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
48054804
const dim3 block_nums(nrows, num_blocks_x, 1);
48064805
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
48074806
}
48084807

4808+
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
4809+
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
4810+
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
4811+
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
4812+
const dim3 block_nums(nrows, num_blocks_x, 1);
4813+
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
4814+
}
4815+
48094816
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
48104817
GGML_ASSERT(nrows % 4 == 0);
48114818
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@@ -5548,8 +5555,9 @@ inline void ggml_cuda_op_rope(
55485555
const float block_p = max(p - (n_ctx - 2.f), 0.f);
55495556
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
55505557
} else if (is_neox) {
5551-
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
5552-
#pragma message("TODO: implement RoPE NeoX for CUDA")
5558+
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
5559+
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
5560+
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
55535561
} else {
55545562
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
55555563
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);

llama.cpp

+21-1
Original file line numberDiff line numberDiff line change
@@ -1958,6 +1958,14 @@ static void llm_load_tensors(
19581958
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
19591959
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
19601960
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
1961+
1962+
if (backend_norm == GGML_BACKEND_GPU) {
1963+
vram_weights += ggml_nbytes(model.output_norm);
1964+
vram_weights += ggml_nbytes(model.output_norm_b);
1965+
}
1966+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
1967+
vram_weights += ggml_nbytes(model.output);
1968+
}
19611969
}
19621970

19631971
const uint32_t n_ff = hparams.n_ff;
@@ -1967,7 +1975,7 @@ static void llm_load_tensors(
19671975
model.layers.resize(n_layer);
19681976

19691977
for (uint32_t i = 0; i < n_layer; ++i) {
1970-
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1978+
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
19711979
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
19721980

19731981
auto & layer = model.layers[i];
@@ -1978,13 +1986,25 @@ static void llm_load_tensors(
19781986
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
19791987
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
19801988
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
1989+
1990+
if (backend == GGML_BACKEND_GPU) {
1991+
vram_weights += ggml_nbytes(layer.attn_norm_2);
1992+
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
1993+
}
19811994
}
19821995

19831996
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
19841997
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
19851998

19861999
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
19872000
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2001+
2002+
if (backend == GGML_BACKEND_GPU) {
2003+
vram_weights +=
2004+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
2005+
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
2006+
ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
2007+
}
19882008
}
19892009
} break;
19902010
default:

0 commit comments

Comments
 (0)