Skip to content

Commit 203bbdd

Browse files
committed
Merge branch 'master' into mistral.cpp
2 parents fa2261a + 5e2727f commit 203bbdd

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

52 files changed

+1018
-842
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -327,9 +327,9 @@ ifdef LLAMA_DEBUG
327327
endif
328328
else
329329
MK_CPPFLAGS += -DNDEBUG
330-
MK_CFLAGS += -O3
331-
MK_CXXFLAGS += -O3
332-
MK_NVCCFLAGS += -O3
330+
MK_CFLAGS += -O3 -g
331+
MK_CXXFLAGS += -O3 -g
332+
MK_NVCCFLAGS += -O3 -g
333333
endif
334334

335335
ifdef LLAMA_SANITIZE_THREAD

common/common.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
13241324
else { invalid_param = true; }
13251325
return true;
13261326
}
1327+
if (arg == "--no-warmup") {
1328+
params.warmup = false;
1329+
return true;
1330+
}
13271331
#ifndef LOG_DISABLE_LOGS
13281332
// Parse args for logging parameters
13291333
if (log_param_single_parse(argv[i])) {
@@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
14461450
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
14471451
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
14481452
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
1453+
options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
14491454
options.push_back({ "server infill",
14501455
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });
14511456

convert_hf_to_gguf.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1570,6 +1570,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
15701570
return [(self.map_tensor_name(name), data_torch)]
15711571

15721572
def prepare_tensors(self):
1573+
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
1574+
if rope_scaling.get("rope_type", '').lower() == "llama3":
1575+
base = self.hparams.get("rope_theta", 10000.0)
1576+
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
1577+
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
1578+
1579+
factor = rope_scaling.get("factor", 8.0)
1580+
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
1581+
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
1582+
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
1583+
1584+
low_freq_wavelen = old_context_len / low_freq_factor
1585+
high_freq_wavelen = old_context_len / high_freq_factor
1586+
assert low_freq_wavelen != high_freq_wavelen
1587+
1588+
rope_factors = []
1589+
for freq in freqs:
1590+
wavelen = 2 * math.pi / freq
1591+
if wavelen < high_freq_wavelen:
1592+
rope_factors.append(1)
1593+
elif wavelen > low_freq_wavelen:
1594+
rope_factors.append(factor)
1595+
else:
1596+
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
1597+
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
1598+
1599+
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
1600+
15731601
super().prepare_tensors()
15741602

15751603
if self._experts is not None:

examples/eval-callback/eval-callback.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
6262
} else if (type == GGML_TYPE_I8) {
6363
v = (float) *(int8_t *) &data[i];
6464
} else {
65-
GGML_ASSERT(false);
65+
GGML_ABORT("fatal error");
6666
}
6767
printf("%12.4f", v);
6868
sum += v;

examples/imatrix/imatrix.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
127127
}
128128
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
129129
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
130-
exit(1); //GGML_ASSERT(false);
130+
exit(1); //GGML_ABORT("fatal error");
131131
}
132132
if (m_params.verbosity > 1) {
133133
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
176176
}
177177
else if (e.values.size() != (size_t)src1->ne[0]) {
178178
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
179-
exit(1); //GGML_ASSERT(false);
179+
exit(1); //GGML_ABORT("fatal error");
180180
}
181181
++e.ncall;
182182
if (m_params.verbosity > 1) {

examples/llama-bench/llama-bench.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
150150
case JSON: return "json";
151151
case MARKDOWN: return "md";
152152
case SQL: return "sql";
153-
default: GGML_ASSERT(!"invalid output format");
153+
default: GGML_ABORT("invalid output format");
154154
}
155155
}
156156

@@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
176176
case LLAMA_SPLIT_MODE_NONE: return "none";
177177
case LLAMA_SPLIT_MODE_LAYER: return "layer";
178178
case LLAMA_SPLIT_MODE_ROW: return "row";
179-
default: GGML_ASSERT(!"invalid split mode");
179+
default: GGML_ABORT("invalid split mode");
180180
}
181181
}
182182

@@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
13261326
case SQL:
13271327
return std::unique_ptr<printer>(new sql_printer());
13281328
}
1329-
GGML_ASSERT(false);
1329+
GGML_ABORT("fatal error");
13301330
}
13311331

13321332
int main(int argc, char ** argv) {

examples/llava/clip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
869869
embeddings = peg_0;
870870
}
871871
else {
872-
GGML_ASSERT(false);
872+
GGML_ABORT("fatal error");
873873
}
874874
}
875875

examples/tokenize/tokenize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
163163
printf(">");
164164
return;
165165
}
166-
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
166+
GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
167167
}
168168

169169
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));

ggml/CMakeLists.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,15 @@ else()
5050
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
5151
endif()
5252

53+
if (CMAKE_CROSSCOMPILING)
54+
set(GGML_NATIVE_DEFAULT OFF)
55+
else()
56+
set(GGML_NATIVE_DEFAULT ON)
57+
endif()
58+
5359
# general
5460
option(GGML_STATIC "ggml: static link libraries" OFF)
55-
option(GGML_NATIVE "ggml: enable -march=native flag" ON)
61+
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
5662
option(GGML_LTO "ggml: enable link time optimization" OFF)
5763
option(GGML_CCACHE "ggml: use ccache if available" ON)
5864

@@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
7076
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
7177

7278
# instruction set specific
73-
if (GGML_NATIVE)
79+
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
7480
set(INS_ENB OFF)
7581
else()
7682
set(INS_ENB ON)

ggml/include/ggml.h

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -254,18 +254,8 @@
254254

255255
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
256256

257-
#define GGML_ASSERT(x) \
258-
do { \
259-
if (!(x)) { \
260-
fflush(stdout); \
261-
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
262-
ggml_print_backtrace(); \
263-
abort(); \
264-
} \
265-
} while (0)
266-
267257
#ifndef NDEBUG
268-
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
258+
#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
269259
#elif defined(__GNUC__)
270260
#define GGML_UNREACHABLE() __builtin_unreachable()
271261
#elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
274264
#define GGML_UNREACHABLE() ((void) 0)
275265
#endif
276266

267+
#ifdef __cplusplus
268+
#define GGML_NORETURN [[noreturn]]
269+
#elif defined(_MSC_VER)
270+
#define GGML_NORETURN __declspec(noreturn)
271+
#else
272+
#define GGML_NORETURN _Noreturn
273+
#endif
274+
275+
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
276+
#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
277+
277278
// used to copy the number of elements and stride in bytes of tensors into local variables.
278279
// main purpose is to reduce code duplication and improve readability.
279280
//
@@ -322,6 +323,9 @@
322323
extern "C" {
323324
#endif
324325

326+
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
327+
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
328+
325329
enum ggml_status {
326330
GGML_STATUS_ALLOC_FAILED = -2,
327331
GGML_STATUS_FAILED = -1,
@@ -636,8 +640,11 @@ extern "C" {
636640
GGML_CGRAPH_EVAL_ORDER_COUNT
637641
};
638642

643+
typedef uint32_t ggml_bitset_t;
644+
639645
struct ggml_hash_set {
640646
size_t size;
647+
ggml_bitset_t * used;
641648
struct ggml_tensor ** keys;
642649
};
643650

@@ -651,7 +658,7 @@ extern "C" {
651658
struct ggml_tensor ** grads;
652659
struct ggml_tensor ** leafs;
653660

654-
struct ggml_hash_set visited_hash_table;
661+
struct ggml_hash_set visited_hash_set;
655662

656663
enum ggml_cgraph_eval_order order;
657664
};
@@ -698,8 +705,6 @@ extern "C" {
698705
GGML_API int64_t ggml_cycles(void);
699706
GGML_API int64_t ggml_cycles_per_ms(void);
700707

701-
GGML_API void ggml_print_backtrace(void);
702-
703708
// accepts a UTF-8 path, even on Windows
704709
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
705710

@@ -2005,8 +2010,8 @@ extern "C" {
20052010

20062011
// ggml_graph_plan() has to be called before ggml_graph_compute()
20072012
// when plan.work_size > 0, caller must allocate memory for plan.work_data
2008-
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
2009-
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
2013+
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
2014+
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
20102015
// same as ggml_graph_compute() but the work data is allocated as a part of the context
20112016
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
20122017
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

ggml/src/ggml-alloc.c

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
9191
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
9292
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
9393
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94-
GGML_ASSERT(!"not enough space in the buffer");
95-
return;
94+
GGML_ABORT("not enough space in the buffer");
9695
}
9796

9897
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
133132
return;
134133
}
135134
}
136-
GGML_ASSERT(!"out of allocated_tensors");
135+
GGML_ABORT("out of allocated_tensors");
137136
}
138137
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
139138
for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
142141
return;
143142
}
144143
}
145-
fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
146-
GGML_ASSERT(!"tensor not found");
144+
GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
147145
}
148146
#endif
149147

@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
176174
// this should never happen
177175
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
178176
__func__, size, max_avail);
179-
GGML_ASSERT(!"not enough space in the buffer");
180-
GGML_UNREACHABLE();
177+
GGML_ABORT("not enough space in the buffer");
181178
}
182179
}
183180

@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
443440
}
444441
}
445442

446-
free(galloc->hash_set.keys);
443+
ggml_hash_set_free(&galloc->hash_set);
447444
free(galloc->hash_values);
448445
free(galloc->bufts);
449446
free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
456453
typedef struct ggml_gallocr * ggml_gallocr_t;
457454

458455
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
459-
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
456+
size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
460457
return &galloc->hash_values[i];
461458
}
462459

@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
565562

566563
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
567564
// clear hash tables
568-
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
569-
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
565+
ggml_hash_set_reset(&galloc->hash_set);
566+
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
570567

571568
// allocate leafs
572569
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
671668
}
672669

673670
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
674-
size_t hash_size = graph->visited_hash_table.size;
671+
size_t min_hash_size = graph->n_nodes + graph->n_leafs;
672+
// add 25% margin to avoid hash collisions
673+
min_hash_size += min_hash_size / 4;
675674

676675
// initialize hash table
677-
if (galloc->hash_set.size < hash_size) {
678-
free(galloc->hash_set.keys);
679-
free(galloc->hash_values);
680-
galloc->hash_set.size = hash_size;
681-
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
682-
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
676+
if (galloc->hash_set.size < min_hash_size) {
677+
ggml_hash_set_free(&galloc->hash_set);
678+
galloc->hash_set = ggml_hash_set_new(min_hash_size);
683679
GGML_ASSERT(galloc->hash_set.keys != NULL);
680+
681+
free(galloc->hash_values);
682+
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
684683
GGML_ASSERT(galloc->hash_values != NULL);
685-
} else {
686-
// reset hash table
687-
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
688-
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
689684
}
690685

691686
// reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
817812
}
818813

819814
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
820-
ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
821-
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
815+
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
822816
return talloc->size_max >= node_size;
823817
}
824818

0 commit comments

Comments
 (0)