
Commit 32102c2

Merge branch 'master' into concedo_experimental
# Conflicts:
#   README.md
2 parents 8424a35 + 481f793 commit 32102c2

File tree

9 files changed (+30, -15 lines)


convert.py

+6
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
+        if n_layer < 1:
+            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+
         n_head=n_embd // 128 # guessed
 
         return Params(

examples/alpaca.sh

+1 -1
@@ -7,7 +7,7 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin \
+./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
     --color \
     -f ./prompts/alpaca.txt \
     --ctx_size 2048 \

examples/embedding/embedding.cpp

+1 -1
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
     params.embedding = true;
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
 

examples/main/main.cpp

+1 -1
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
     }
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
         fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);

examples/perplexity/perplexity.cpp

+1 -1
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
     }
 

examples/server/README.md

+1 -1
@@ -7,7 +7,7 @@ Command line options:
 - `--threads N`, `-t N`: Set the number of threads to use during computation.
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 - `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
-- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
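
A usage illustration for the `--ctx-size` bullet above: a model trained with a 4096-token context, such as the baichuan models the added sentence refers to, would typically be started with the larger value passed explicitly, for example `./server -m <model-path> -c 4096` (the model path here is only a placeholder).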

ggml-opencl.cpp

+10 -6
@@ -654,13 +654,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int im = tid/step;      // 0 or 1. 0 computes 0..., 1 computes 128...
     const int in = tid - step*im; // 0...15 or 0...7
 
-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
     const int is = 0;
-#else
+
+\n#else\n
+
     const int l0 = 4 * in;  // 0, 4, 8, ..., 28
     const int is = in / 4;
-#endif
+
+\n#endif\n
+
     const int ql_offset = 64*im + l0;
     const int qh_offset = 32*im + l0;
     const int s_offset  =  8*im + is;
@@ -677,7 +681,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
 
     const float d = vload_half(0, &x[i].d);
 
-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
               + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
               + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -687,7 +691,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
               + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
               +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
     tmp[16 * ix + tid] += sum;
-#else
+\n#else\n
     float sum = 0;
     for (int l = 0; l < 4; ++l) {
         sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -696,7 +700,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
             + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
     }
     tmp[16 * ix + tid] += sum;
-#endif
+\n#endif\n
 
 }
 
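
The literal `\n` escapes added above are intentional: the OpenCL kernel text in this file is built by stringizing a macro body (a MULTILINE_QUOTE-style `#__VA_ARGS__` macro), and stringization collapses real newlines into single spaces, so without explicit `\n` tokens the `#if`/`#else`/`#endif` directives would not begin a line when the OpenCL compiler parses the string. A minimal, self-contained sketch of that mechanism follows; the kernel body and names are illustrative, not the real q6_K kernel:

#include <cstdio>

// Stringize the macro body into one C string. Real newlines in the body
// are collapsed by the preprocessor, but a backslash-n token pair survives
// stringization and later becomes an actual newline inside the resulting
// string literal.
#define MULTILINE_QUOTE(...) #__VA_ARGS__

// Illustrative kernel text only; the directives must start their own lines
// once the string reaches the OpenCL compiler, hence the explicit \n.
static const char * example_kernel = MULTILINE_QUOTE(
__kernel void fill(__global float * dst) {
\n#if K_QUANTS_PER_ITERATION == 1\n
    dst[get_global_id(0)] = 1.0f;
\n#else\n
    dst[get_global_id(0)] = 2.0f;
\n#endif\n
}
);

int main() {
    // Printing shows each #if/#else/#endif beginning a fresh line.
    std::puts(example_kernel);
    return 0;
}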

ggml.h

+9 -3
@@ -1516,9 +1516,15 @@ extern "C" {
     // Internal types and functions exposed for tests and benchmarks
     //
 
-    typedef void (*ggml_to_float_t)(const void * x, float * y, int k);
-    typedef void (*ggml_from_float_t)(const float * x, void * y, int k);
-    typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y);
+#ifdef __cplusplus
+    // restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+    typedef void (*ggml_to_float_t)  (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
         ggml_to_float_t to_float;
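
Background on the pattern in this hunk: `restrict` is a C99 keyword with no standard C++ counterpart, so a header shared between C and C++ translation units wraps it in a macro that expands to the keyword only for C. A small self-contained sketch of the idiom follows; the typedef name and stub function beyond what the hunk shows are illustrative:

// Expand to the C99 keyword only when compiled as C; C++ has no standard
// `restrict`, so the macro disappears there and the same declarations
// remain valid in both languages.
#ifdef __cplusplus
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#endif

// Function-pointer type in the spirit of ggml_vec_dot_t: the annotated
// pointers promise not to alias, which lets a C compiler optimize more
// aggressively.
typedef void (*example_vec_dot_t)(int n, float * GGML_RESTRICT s,
                                  const void * GGML_RESTRICT x,
                                  const void * GGML_RESTRICT y);

static void vec_dot_stub(int n, float * GGML_RESTRICT s,
                         const void * GGML_RESTRICT x,
                         const void * GGML_RESTRICT y) {
    (void)n; (void)x; (void)y;
    *s = 0.0f; // placeholder: a real implementation would accumulate a dot product
}

int main() {
    example_vec_dot_t fn = vec_dot_stub;
    float acc = -1.0f;
    fn(0, &acc, nullptr, nullptr);
    return acc == 0.0f ? 0 : 1;
}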

koboldcpp.py

-1
@@ -198,7 +198,6 @@ def load_model(model_filename):
     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
     banned_tokens = args.bantokens
-    print(banned_tokens)
     for n in range(ban_token_max):
         if not banned_tokens or n >= len(banned_tokens):
             inputs.banned_tokens[n] = "".encode("UTF-8")
