ggerganov committed
Commit 050ba38 · unverified · 1 parent: ecef312

tests : remove test-backend-ops (#2434)

Files changed (2)
  1. Makefile +2 -8
  2. tests/test-backend-ops.cpp +0 -2564
Makefile CHANGED
@@ -3,12 +3,11 @@ BUILD_TARGETS = \
 	main \
 	bench \
 	quantize \
-	server \
-	tests/test-c.o
+	server
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-backend-ops
+	tests/test-c.o
 
 # Deprecation aliases
 ifdef WHISPER_CUBLAS
@@ -1101,11 +1100,6 @@ tests: $(TEST_TARGETS)
 tests/test-c.o: tests/test-c.c include/whisper.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
 
-tests/test-backend-ops: tests/test-backend-ops.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 #
 # Audio samples
 #
tests/test-backend-ops.cpp DELETED
@@ -1,2564 +0,0 @@
1
- #include <ggml.h>
2
- #include <ggml-alloc.h>
3
- #include <ggml-backend.h>
4
-
5
- #include <algorithm>
6
- #include <array>
7
- #include <cfloat>
8
- #include <cstring>
9
- #include <functional>
10
- #include <memory>
11
- #include <random>
12
- #include <stdio.h>
13
- #include <stdlib.h>
14
- #include <string>
15
- #include <thread>
16
- #include <vector>
17
-
18
-
19
- static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
20
- // static RNG initialization (revisit if n_threads stops being constant)
21
- static const size_t n_threads = std::thread::hardware_concurrency();
22
- static std::vector<std::default_random_engine> generators = []() {
23
- std::random_device rd;
24
- std::vector<std::default_random_engine> vec;
25
- vec.reserve(n_threads);
26
- //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
27
- for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
28
- return vec;
29
- }();
30
-
31
- size_t size = ggml_nelements(tensor);
32
- std::vector<float> data(size);
33
-
34
- auto init_thread = [&](size_t ith, size_t start, size_t end) {
35
- std::uniform_real_distribution<float> distribution(min, max);
36
- for (size_t i = start; i < end; i++) {
37
- data[i] = distribution(generators[ith]);
38
- }
39
- };
40
-
41
- std::vector<std::thread> threads;
42
- threads.reserve(n_threads);
43
- for (size_t i = 0; i < n_threads; i++) {
44
- size_t start = i*size/n_threads;
45
- size_t end = (i+1)*size/n_threads;
46
- threads.emplace_back(init_thread, i, start, end);
47
- }
48
- for (auto & t : threads) {
49
- t.join();
50
- }
51
-
52
- #if 0
53
- const char * val_str = getenv("GGML_TEST_EPS");
54
- float val = 1e-9f;
55
- if (val_str != nullptr) {
56
- val = std::stof(val_str);
57
- printf("GGML_TEST_EPS=%e\n", val);
58
- }
59
-
60
- // test quantization with very small values that may result in nan scales due to division by zero
61
- if (ggml_is_quantized(tensor->type)) {
62
- for (int i = 0; i < 256; i++) {
63
- data[i] = val;
64
- }
65
- }
66
- #endif
67
-
68
- if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
69
- ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
70
- } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
71
- GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
72
- std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
73
- std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
74
- const float * im = imatrix.data();
75
- if (!ggml_quantize_requires_imatrix(tensor->type)) {
76
- // when the imatrix is optional, we want to test both quantization with and without imatrix
77
- // use one of the random numbers to decide
78
- if (data[0] > 0.5f*(min + max)) {
79
- im = nullptr;
80
- }
81
- }
82
-
83
- ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
84
- GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
85
- // TODO: other cases
86
- //#pragma omp parallel for
87
- //for (int i = 0; i < tensor->ne[1]; i++) {
88
- // ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
89
- // i * tensor->ne[0], 1, tensor->ne[0], im);
90
- //}
91
-
92
- ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
93
- } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
94
- // This is going to create some weird integers though.
95
- ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
96
- } else {
97
- GGML_ABORT("fatal error");
98
- }
99
- }
100
-
101
- static std::vector<float> tensor_to_float(const ggml_tensor * t) {
102
- std::vector<float> tv;
103
- tv.reserve(ggml_nelements(t));
104
-
105
- std::vector<uint8_t> buf(ggml_nbytes(t));
106
- ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
107
-
108
- ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
109
- size_t bs = ggml_blck_size(t->type);
110
- std::vector<float> vq(ggml_blck_size(t->type));
111
- bool quantized = ggml_is_quantized(t->type);
112
-
113
- // access elements by index to avoid gaps in views
114
- for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
115
- for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
116
- for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
117
- for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
118
- size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
119
- if (t->type == GGML_TYPE_F16) {
120
- tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
121
- } else if (t->type == GGML_TYPE_BF16) {
122
- tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
123
- } else if (t->type == GGML_TYPE_F32) {
124
- tv.push_back(*(float *) &buf[i]);
125
- } else if (t->type == GGML_TYPE_I32) {
126
- tv.push_back((float)*(int32_t *) &buf[i]);
127
- } else if (t->type == GGML_TYPE_I16) {
128
- tv.push_back((float)*(int16_t *) &buf[i]);
129
- } else if (t->type == GGML_TYPE_I8) {
130
- tv.push_back((float)*(int8_t *) &buf[i]);
131
- } else if (quantized) {
132
- tt.to_float(&buf[i], vq.data(), bs);
133
- tv.insert(tv.end(), vq.begin(), vq.end());
134
- } else {
135
- GGML_ABORT("fatal error");
136
- }
137
- }
138
- }
139
- }
140
- }
141
-
142
- return tv;
143
- }
144
-
145
- /*
146
- static double cosine_similarity(const float * v1, const float * v2, size_t n) {
147
- double dot = 0.0;
148
- double mag1 = 0.0;
149
- double mag2 = 0.0;
150
-
151
- for (size_t i = 0; i < n; i++) {
152
- if (std::isnan(v1[i]) || std::isnan(v2[i])) {
153
- return -1.0f;
154
- }
155
- if (std::isinf(v1[i]) && std::isinf(v2[i])) {
156
- continue;
157
- }
158
- dot += v1[i]*v2[i];
159
- mag1 += v1[i]*v1[i];
160
- mag2 += v2[i]*v2[i];
161
- }
162
-
163
- return dot/sqrt(mag1*mag2);
164
- }
165
-
166
- static float distance(const float * v1, const float * v2, size_t n) {
167
- double d = 0.0;
168
-
169
- for (size_t i = 0; i < n; i++) {
170
- if (std::isnan(v1[i]) || std::isnan(v2[i])) {
171
- return INFINITY;
172
- }
173
- if (std::isinf(v1[i]) && std::isinf(v2[i])) {
174
- continue;
175
- }
176
- d += (v1[i] - v2[i])*(v1[i] - v2[i]);
177
- }
178
-
179
- return sqrt(d);
180
- }
181
-
182
- static float vec_len(const float * v, size_t n) {
183
- double d = 0.0;
184
-
185
- for (size_t i = 0; i < n; i++) {
186
- if (std::isnan(v[i])) {
187
- return INFINITY;
188
- }
189
- if (std::isinf(v[i])) {
190
- continue;
191
- }
192
- d += v[i]*v[i];
193
- }
194
-
195
- return sqrt(d);
196
- }
197
- */
198
-
199
- // normalized mean squared error = mse(a, b) / mse(a, 0)
200
- static double nmse(const float * a, const float * b, size_t n) {
201
- double mse_a_b = 0.0;
202
- double mse_a_0 = 0.0;
203
-
204
- for (size_t i = 0; i < n; i++) {
205
- float a_i = a[i];
206
- float b_i = b[i];
207
-
208
- mse_a_b += (a_i - b_i) * (a_i - b_i);
209
- mse_a_0 += a_i * a_i;
210
- }
211
-
212
- return mse_a_b / mse_a_0;
213
- }
214
-
215
- // utils for printing the variables of the test cases
216
- #define VAR_TO_STR(x) (#x "=" + var_to_str(x))
217
-
218
- template<typename T>
219
- static std::string var_to_str(const T & x) {
220
- return std::to_string(x);
221
- }
222
-
223
- template<typename T, size_t N>
224
- static std::string var_to_str(const T (&x)[N]) {
225
- std::string s = "[";
226
- for (size_t i = 0; i < N; i++) {
227
- if (i > 0) {
228
- s += ",";
229
- }
230
- s += var_to_str(x[i]);
231
- }
232
- s += "]";
233
- return s;
234
- }
235
-
236
- template<typename T, size_t N>
237
- static std::string var_to_str(const std::array<T, N> & x) {
238
- std::string s = "[";
239
- for (size_t i = 0; i < N; i++) {
240
- if (i > 0) {
241
- s += ",";
242
- }
243
- s += var_to_str(x[i]);
244
- }
245
- s += "]";
246
- return s;
247
- }
248
-
249
- //static std::string var_to_str(ggml_unary_op unary_op) {
250
- // return ggml_unary_op_name(unary_op);
251
- //}
252
-
253
- static std::string var_to_str(ggml_type type) {
254
- return ggml_type_name(type);
255
- }
256
-
257
- static std::string var_to_str(ggml_op_pool pool) {
258
- switch (pool) {
259
- case GGML_OP_POOL_AVG: return "avg";
260
- case GGML_OP_POOL_MAX: return "max";
261
- default: return std::to_string(pool);
262
- }
263
- }
264
-
265
- #define VARS_TO_STR1(a) VAR_TO_STR(a)
266
- #define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
267
- #define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
268
- #define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d)
269
- #define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e)
270
- #define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f)
271
- #define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g)
272
- #define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h)
273
- #define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
274
- #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
275
- #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
276
- #define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
277
-
278
- #ifdef GGML_USE_SYCL
279
- static bool inline _isinf(float f) {
280
- return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
281
- }
282
- #else
283
- static bool inline _isinf(float f) { return std::isinf(f); }
284
- #endif
285
-
286
- // accept FLT_MAX as infinity
287
- static bool isinf_or_max(float f) {
288
- return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
289
- }
290
-
291
- static bool ggml_is_view_op(enum ggml_op op) {
292
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
293
- }
294
-
295
- enum test_mode {
296
- MODE_TEST,
297
- MODE_PERF,
298
- };
299
-
300
- struct test_case {
301
- virtual ~test_case() {}
302
-
303
- virtual std::string op_desc(ggml_tensor * t) {
304
- return ggml_op_desc(t);
305
- }
306
-
307
- virtual std::string vars() {
308
- return "";
309
- }
310
-
311
- virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
312
-
313
- virtual double max_nmse_err() {
314
- return 1e-7;
315
- }
316
-
317
- virtual void initialize_tensors(ggml_context * ctx) {
318
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
319
- init_tensor_uniform(t);
320
- }
321
- }
322
-
323
- virtual size_t op_size(ggml_tensor * t) {
324
- size_t size = ggml_nbytes(t);
325
- // add source tensors
326
- for (int i = 0; i < GGML_MAX_SRC; i++) {
327
- if (t->src[i] != NULL) {
328
- size += ggml_nbytes(t->src[i]);
329
- }
330
- }
331
- return size;
332
- }
333
-
334
- ggml_cgraph * gf = nullptr;
335
-
336
- static const int sentinel_size = 1024;
337
-
338
- test_mode mode;
339
-
340
- std::vector<ggml_tensor *> sentinels;
341
-
342
- void add_sentinel(ggml_context * ctx) {
343
- if (mode == MODE_PERF) {
344
- return;
345
- }
346
- ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
347
- ggml_format_name(sentinel, "sent_%zu", sentinels.size());
348
- sentinels.push_back(sentinel);
349
- }
350
-
351
- // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
352
-
353
- ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
354
- ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
355
- add_sentinel(ctx);
356
- return t;
357
- }
358
-
359
- ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
360
- ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
361
- add_sentinel(ctx);
362
- return t;
363
- }
364
-
365
- ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
366
- ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
367
- add_sentinel(ctx);
368
- return t;
369
- }
370
-
371
- ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
372
- ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
373
- add_sentinel(ctx);
374
- return t;
375
- }
376
-
377
- ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
378
- ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
379
- add_sentinel(ctx);
380
- return t;
381
- }
382
-
383
- bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
384
- mode = MODE_TEST;
385
-
386
- ggml_init_params params = {
387
- /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
388
- /* .mem_base = */ NULL,
389
- /* .no_alloc = */ true,
390
- };
391
- ggml_context * ctx = ggml_init(params);
392
-
393
- gf = ggml_new_graph(ctx);
394
-
395
- // pre-graph sentinel
396
- add_sentinel(ctx);
397
-
398
- ggml_tensor * out = build_graph(ctx);
399
-
400
- if (op_name != nullptr && op_desc(out) != op_name) {
401
- //printf(" %s: skipping\n", op_desc(out).c_str());
402
- ggml_free(ctx);
403
- return true;
404
- }
405
-
406
- printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
407
- fflush(stdout);
408
-
409
- // check if the backends support the ops
410
- bool supported = true;
411
- for (ggml_backend_t backend : {backend1, backend2}) {
412
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
413
- if (!ggml_backend_supports_op(backend, t)) {
414
- printf("not supported [%s] ", ggml_backend_name(backend));
415
- supported = false;
416
- break;
417
- }
418
- }
419
- }
420
- if (!supported) {
421
- printf("\n");
422
- ggml_free(ctx);
423
- return true;
424
- }
425
-
426
- // post-graph sentinel
427
- add_sentinel(ctx);
428
-
429
- // allocate
430
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
431
- if (buf == NULL) {
432
- printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
433
- ggml_free(ctx);
434
- return false;
435
- }
436
-
437
- // build graph
438
- ggml_build_forward_expand(gf, out);
439
-
440
- // add sentinels as graph nodes so that they are checked in the callback
441
- for (ggml_tensor * sentinel : sentinels) {
442
- gf->nodes[gf->n_nodes++] = sentinel;
443
- }
444
-
445
- // randomize tensors
446
- initialize_tensors(ctx);
447
-
448
- // compare
449
- struct callback_userdata {
450
- bool ok;
451
- double max_err;
452
- ggml_backend_t backend1;
453
- ggml_backend_t backend2;
454
- };
455
-
456
- callback_userdata ud {
457
- true,
458
- max_nmse_err(),
459
- backend1,
460
- backend2
461
- };
462
-
463
- auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
464
- callback_userdata * ud = (callback_userdata *) user_data;
465
- const char * bn1 = ggml_backend_name(ud->backend1);
466
- const char * bn2 = ggml_backend_name(ud->backend2);
467
-
468
- if (t1->op == GGML_OP_NONE) {
469
- // sentinels must be unchanged
470
- std::vector<uint8_t> t1_data(ggml_nbytes(t1));
471
- std::vector<uint8_t> t2_data(ggml_nbytes(t2));
472
- ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
473
- ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
474
-
475
- if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
476
- printf("sentinel mismatch: %s ", t1->name);
477
- ud->ok = false;
478
- return true;
479
- }
480
- }
481
-
482
- std::vector<float> f1 = tensor_to_float(t1);
483
- std::vector<float> f2 = tensor_to_float(t2);
484
-
485
- for (size_t i = 0; i < f1.size(); i++) {
486
- // check for nans
487
- if (std::isnan(f1[i]) || std::isnan(f2[i])) {
488
- printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
489
- ud->ok = false;
490
- return true;
491
- }
492
- // check for infs: both must be inf of the same sign, or both must be finite
493
- if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
494
- if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
495
- if (std::signbit(f1[i]) != std::signbit(f2[i])) {
496
- printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
497
- ud->ok = false;
498
- return true;
499
- }
500
- } else {
501
- printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
502
- ud->ok = false;
503
- return true;
504
- }
505
- }
506
- }
507
-
508
- double err = nmse(f1.data(), f2.data(), f1.size());
509
- if (err > ud->max_err) {
510
- printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
511
- //for (int i = 0; i < (int) f1.size(); i++) {
512
- // printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
513
- //}
514
- //printf("\n");
515
- //exit(1);
516
- ud->ok = false;
517
- }
518
- return true;
519
-
520
- GGML_UNUSED(index);
521
- };
522
-
523
- const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
524
-
525
- if (!cmp_ok) {
526
- printf("compare failed ");
527
- }
528
-
529
- ggml_backend_buffer_free(buf);
530
-
531
- ggml_free(ctx);
532
-
533
- if (ud.ok && cmp_ok) {
534
- printf("\033[1;32mOK\033[0m\n");
535
- return true;
536
- }
537
-
538
- printf("\033[1;31mFAIL\033[0m\n");
539
- return false;
540
- }
541
-
542
- bool eval_perf(ggml_backend_t backend, const char * op_name) {
543
- mode = MODE_PERF;
544
-
545
- static const size_t graph_nodes = 8192;
546
-
547
- ggml_init_params params = {
548
- /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
549
- /* .mem_base = */ NULL,
550
- /* .no_alloc = */ true,
551
- };
552
- ggml_context * ctx = ggml_init(params);
553
-
554
- ggml_tensor * out = build_graph(ctx);
555
-
556
- if (op_name != nullptr && op_desc(out) != op_name) {
557
- //printf(" %s: skipping\n", op_desc(out).c_str());
558
- ggml_free(ctx);
559
- return true;
560
- }
561
-
562
- int len = printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
563
- fflush(stdout);
564
-
565
- // check if backends support op
566
- if (!ggml_backend_supports_op(backend, out)) {
567
- printf("not supported\n");
568
- ggml_free(ctx);
569
- return true;
570
- }
571
-
572
- // align while also leaving some margin for variations in parameters
573
- int align = 20;
574
- int last = (len + align - 1) / align * align;
575
- if (last - len < 5) {
576
- last += align;
577
- }
578
- last = std::max(last, 60);
579
- printf("%*s", last - len, "");
580
-
581
- // allocate
582
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
583
- if (buf == NULL) {
584
- printf("failed to allocate tensors\n");
585
- ggml_free(ctx);
586
- return false;
587
- }
588
-
589
- // randomize tensors
590
- initialize_tensors(ctx);
591
-
592
- // build graph
593
- ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
594
- ggml_build_forward_expand(gf, out);
595
-
596
- // warmup run
597
- ggml_backend_graph_compute(backend, gf);
598
-
599
- // duplicate the op
600
- size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
601
- int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
602
- for (int i = 1; i < n_runs; i++) {
603
- gf->nodes[gf->n_nodes++] = out;
604
- }
605
-
606
- // calculate memory
607
- size_t mem = n_runs * op_size(out);
608
- auto tensor_op_size = [](ggml_tensor * t) {
609
- size_t size = ggml_nbytes(t);
610
- // add source tensors
611
- for (int i = 0; i < GGML_MAX_SRC; i++) {
612
- if (t->src[i] != NULL) {
613
- size += ggml_nbytes(t->src[i]);
614
- }
615
- }
616
- return size;
617
- };
618
- for (int i = 0; i < gf->n_nodes; i++) {
619
- if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
620
- continue;
621
- }
622
- mem += tensor_op_size(gf->nodes[i]);
623
- }
624
-
625
- // run
626
- ggml_backend_synchronize(backend);
627
-
628
- int64_t start_time = ggml_time_us();
629
- ggml_backend_graph_compute(backend, gf);
630
- ggml_backend_synchronize(backend);
631
- int64_t end_time = ggml_time_us();
632
- double time_us = end_time - start_time;
633
-
634
- printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
635
- n_runs,
636
- time_us / n_runs,
637
- op_size(out) / 1024,
638
- mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
639
-
640
- ggml_backend_buffer_free(buf);
641
-
642
- ggml_free(ctx);
643
-
644
- return true;
645
- }
646
- };
647
-
648
- // GGML_OP_UNARY
649
- struct test_unary : public test_case {
650
- const ggml_unary_op op;
651
- const ggml_type type;
652
- const std::array<int64_t, 4> ne_a;
653
- int v; // view (1 : non-contiguous a)
654
-
655
- std::string vars() override {
656
- return VARS_TO_STR3(type, ne_a, v);
657
- }
658
-
659
- test_unary(ggml_unary_op op,
660
- ggml_type type = GGML_TYPE_F32,
661
- std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
662
- int v = 0)
663
- : op(op), type(type), ne_a(ne_a), v(v) {}
664
-
665
- ggml_tensor * build_graph(ggml_context * ctx) override {
666
- ggml_tensor * a;
667
- if (v & 1) {
668
- auto ne = ne_a; ne[0] *= 3;
669
- a = ggml_new_tensor(ctx, type, 4, ne.data());
670
- a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
671
- } else {
672
- a = ggml_new_tensor(ctx, type, 4, ne_a.data());
673
- }
674
- ggml_tensor * out = ggml_unary(ctx, a, op);
675
- return out;
676
- }
677
-
678
- void initialize_tensors(ggml_context * ctx) override {
679
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
680
- // test extended range of values to check for NaNs in GELU
681
- init_tensor_uniform(t, -150.f, 150.f);
682
- }
683
- }
684
- };
685
-
686
- // GGML_OP_GET_ROWS
687
- struct test_get_rows : public test_case {
688
- const ggml_type type;
689
- const int n; // cols
690
- const int m; // rows
691
- const int r; // rows to get
692
- const int b; // batch size
693
- const bool v; // view (non-contiguous src1)
694
-
695
- std::string vars() override {
696
- return VARS_TO_STR6(type, n, m, r, b, v);
697
- }
698
-
699
- test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
700
- : type(type), n(n), m(m), r(r), b(b), v(v) {}
701
-
702
- ggml_tensor * build_graph(ggml_context * ctx) override {
703
- ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
704
- ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
705
- if (v) {
706
- rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
707
- }
708
- ggml_tensor * out = ggml_get_rows(ctx, in, rows);
709
- return out;
710
- }
711
-
712
- void initialize_tensors(ggml_context * ctx) override {
713
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
714
- if (t->type == GGML_TYPE_I32) {
715
- if (ggml_is_view_op(t->op)) { continue; }
716
- // rows
717
- std::vector<int> data(r*b);
718
- for (int i = 0; i < r*b; i++) {
719
- data[i] = rand() % m;
720
- }
721
- ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
722
- } else {
723
- init_tensor_uniform(t);
724
- }
725
- }
726
- }
727
- };
728
-
729
- // GGML_OP_REPEAT
730
- struct test_repeat : public test_case {
731
- const ggml_type type;
732
- const std::array<int64_t, 4> ne;
733
- const std::array<int, 4> nr;
734
-
735
- std::string vars() override {
736
- return VARS_TO_STR3(type, ne, nr);
737
- }
738
-
739
- size_t op_size(ggml_tensor * t) override {
740
- return ggml_nbytes(t) * 2;
741
- }
742
-
743
- test_repeat(ggml_type type = GGML_TYPE_F32,
744
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
745
- std::array<int, 4> nr = {2, 2, 2, 2})
746
- : type(type), ne(ne), nr(nr) {}
747
-
748
- ggml_tensor * build_graph(ggml_context * ctx) override {
749
- ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
750
- ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
751
- ggml_tensor * out = ggml_repeat(ctx, src, target);
752
- return out;
753
- }
754
- };
755
-
756
- // GGML_OP_DUP
757
- struct test_dup : public test_case {
758
- const ggml_type type;
759
- const std::array<int64_t, 4> ne;
760
- const std::array<int64_t, 4> permute;
761
- bool _use_permute;
762
-
763
- std::string vars() override {
764
- std::string v = VARS_TO_STR2(type, ne);
765
- if (_use_permute) v += "," + VAR_TO_STR(permute);
766
- return v;
767
- }
768
-
769
- test_dup(ggml_type type = GGML_TYPE_F32,
770
- std::array<int64_t, 4> ne = {10, 10, 20, 1},
771
- std::array<int64_t, 4> permute = {0, 0, 0, 0})
772
- : type(type), ne(ne), permute(permute),
773
- _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
774
-
775
- ggml_tensor * build_graph(ggml_context * ctx) override {
776
- ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
777
- if (_use_permute) {
778
- src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
779
- }
780
- ggml_tensor * out = ggml_dup(ctx, src);
781
- return out;
782
- }
783
- };
784
-
785
- // GGML_OP_CPY
786
- struct test_cpy : public test_case {
787
- const ggml_type type_src;
788
- const ggml_type type_dst;
789
- const std::array<int64_t, 4> ne;
790
- const std::array<int64_t, 4> permute;
791
- bool _src_use_permute;
792
-
793
- std::string vars() override {
794
- return VARS_TO_STR4(type_src, type_dst, ne, permute);
795
- }
796
-
797
- double max_nmse_err() override {
798
- return 1e-6;
799
- }
800
-
801
- size_t op_size(ggml_tensor * t) override {
802
- return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
803
- }
804
-
805
- test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
806
- std::array<int64_t, 4> ne = {10, 10, 10, 1},
807
- std::array<int64_t, 4> permute = {0, 0, 0, 0})
808
- : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
809
- _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
810
-
811
- ggml_tensor * build_graph(ggml_context * ctx) override {
812
- ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
813
- if (_src_use_permute) {
814
- src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
815
- }
816
- ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
817
- ggml_tensor * out = ggml_cpy(ctx, src, dst);
818
- return out;
819
- }
820
- };
821
-
822
- // GGML_OP_CONT
823
- struct test_cont : public test_case {
824
- const ggml_type type;
825
- const std::array<int64_t, 4> ne;
826
-
827
- std::string vars() override {
828
- return VARS_TO_STR2(type, ne);
829
- }
830
-
831
- test_cont(ggml_type type = GGML_TYPE_F32,
832
- std::array<int64_t, 4> ne = {10, 10, 10, 1})
833
- : type(type), ne(ne) {}
834
-
835
- ggml_tensor * build_graph(ggml_context * ctx) override {
836
- ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
837
- src = ggml_transpose(ctx, src);
838
- ggml_tensor * out = ggml_cont(ctx, src);
839
-
840
- return out;
841
- }
842
- };
843
-
844
- // GGML_OP_ADD
845
- // GGML_OP_MUL
846
- // GGML_OP_DIV
847
- struct test_bin_bcast : public test_case {
848
- using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
849
- op_t op;
850
- const ggml_type type;
851
- const std::array<int64_t, 4> ne;
852
- const std::array<int, 4> nr;
853
-
854
- std::string vars() override {
855
- return VARS_TO_STR3(type, ne, nr);
856
- }
857
-
858
- size_t op_size(ggml_tensor * t) override {
859
- return ggml_nbytes(t) * 3;
860
- }
861
-
862
- test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
863
- std::array<int64_t, 4> ne = {10, 10, 1, 1},
864
- std::array<int, 4> nr = {1, 2, 1, 1})
865
- : op(op), type(type), ne(ne), nr(nr) {}
866
-
867
- ggml_tensor * build_graph(ggml_context * ctx) override {
868
- ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
869
- ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
870
- ggml_tensor * out = op(ctx, a, b);
871
- return out;
872
- }
873
-
874
- void initialize_tensors(ggml_context * ctx) override {
875
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
876
- if (op == ggml_div) {
877
- // avoid division by zero
878
- init_tensor_uniform(t, 1.0f, 2.0f);
879
- } else {
880
- init_tensor_uniform(t);
881
- }
882
- }
883
- }
884
- };
885
-
886
- // GGML_OP_SCALE
887
- struct test_scale : public test_case {
888
- const ggml_type type;
889
- const std::array<int64_t, 4> ne;
890
- float scale;
891
-
892
- std::string vars() override {
893
- return VARS_TO_STR3(type, ne, scale);
894
- }
895
-
896
- test_scale(ggml_type type = GGML_TYPE_F32,
897
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
898
- float scale = 2.0f)
899
- : type(type), ne(ne), scale(scale) {}
900
-
901
- ggml_tensor * build_graph(ggml_context * ctx) override {
902
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
903
- ggml_tensor * out = ggml_scale(ctx, a, scale);
904
- return out;
905
- }
906
- };
907
-
908
- // GGML_OP_NORM
909
- struct test_norm : public test_case {
910
- const ggml_type type;
911
- const std::array<int64_t, 4> ne;
912
- float eps;
913
-
914
- std::string vars() override {
915
- return VARS_TO_STR3(type, ne, eps);
916
- }
917
-
918
- test_norm(ggml_type type = GGML_TYPE_F32,
919
- std::array<int64_t, 4> ne = {64, 10, 10, 10},
920
- float eps = 1e-6f)
921
- : type(type), ne(ne), eps(eps) {}
922
-
923
- ggml_tensor * build_graph(ggml_context * ctx) override {
924
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
925
- ggml_tensor * out = ggml_norm(ctx, a, eps);
926
- return out;
927
- }
928
- };
929
-
930
- // GGML_OP_RMS_NORM
931
- struct test_rms_norm : public test_case {
932
- const ggml_type type;
933
- const std::array<int64_t, 4> ne;
934
- float eps;
935
-
936
- std::string vars() override {
937
- return VARS_TO_STR3(type, ne, eps);
938
- }
939
-
940
- test_rms_norm(ggml_type type = GGML_TYPE_F32,
941
- std::array<int64_t, 4> ne = {64, 10, 10, 10},
942
- float eps = 1e-6f)
943
- : type(type), ne(ne), eps(eps) {}
944
-
945
- ggml_tensor * build_graph(ggml_context * ctx) override {
946
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
947
- ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
948
- return out;
949
- }
950
- };
951
-
952
- // GGML_OP_MUL_MAT
953
- struct test_mul_mat : public test_case {
954
- const ggml_type type_a;
955
- const ggml_type type_b;
956
- const int64_t m;
957
- const int64_t n;
958
- const int64_t k;
959
- const std::array<int64_t, 2> bs; // dims 3 and 4
960
- const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
961
-
962
- std::string vars() override {
963
- return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
964
- }
965
-
966
- double max_nmse_err() override {
967
- return 5e-4;
968
- }
969
-
970
- size_t op_size(ggml_tensor * t) override {
971
- size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
972
- size_t b = ggml_nbytes(t->src[1]) * m;
973
- size_t c = ggml_nbytes(t);
974
- return a + b + c;
975
-
976
- GGML_UNUSED(t);
977
- }
978
-
979
- test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
980
- int64_t m = 32, int64_t n = 32, int64_t k = 32,
981
- std::array<int64_t, 2> bs = {10, 10},
982
- std::array<int64_t, 2> nr = {2, 2})
983
- : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
984
-
985
- ggml_tensor * build_graph(ggml_context * ctx) override {
986
- // C^T = A * B^T: (k, m) * (k, n) => (m, n)
987
- ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0] , bs[1]);
988
- ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
989
- ggml_tensor * out = ggml_mul_mat(ctx, a, b);
990
- return out;
991
- }
992
- };
993
-
994
- // GGML_OP_MUL_MAT_ID
995
- struct test_mul_mat_id : public test_case {
996
- const ggml_type type_a;
997
- const ggml_type type_b;
998
- const int n_mats;
999
- const int n_used;
1000
- const bool b; // broadcast b matrix
1001
- const int64_t m;
1002
- const int64_t n;
1003
- const int64_t k;
1004
-
1005
- std::string vars() override {
1006
- return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
1007
- }
1008
-
1009
- double max_nmse_err() override {
1010
- return 5e-4;
1011
- }
1012
-
1013
- size_t op_size(ggml_tensor * t) override {
1014
- size_t a = ggml_nbytes(t->src[2]) * n;
1015
- size_t b = ggml_nbytes(t->src[1]) * m;
1016
- size_t c = ggml_nbytes(t);
1017
- return a + b + c;
1018
-
1019
- GGML_UNUSED(t);
1020
- }
1021
-
1022
- test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
1023
- int n_mats = 8, int n_used = 2, bool b = false,
1024
- int64_t m = 32, int64_t n = 32, int64_t k = 32)
1025
- : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
1026
- m(m), n(n), k(k) {
1027
- GGML_ASSERT(n_used <= n_mats);
1028
- }
1029
-
1030
- ggml_tensor * build_graph(ggml_context * ctx) override {
1031
- // C^T = A * B^T: (k, m) * (k, n) => (m, n)
1032
- ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
1033
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
1034
- if (n_used != n_mats) {
1035
- ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
1036
- }
1037
- ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
1038
- ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
1039
- return out;
1040
- }
1041
-
1042
- void initialize_tensors(ggml_context * ctx) override {
1043
- std::random_device rd;
1044
- std::default_random_engine rng(rd());
1045
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1046
- if (t->type == GGML_TYPE_I32) {
1047
- if (ggml_is_view_op(t->op)) { continue; }
1048
- // ids
1049
- for (int64_t r = 0; r < ggml_nrows(t); r++) {
1050
- std::vector<int32_t> data(t->ne[0]);
1051
- for (int i = 0; i < t->ne[0]; i++) {
1052
- data[i] = i % n_mats;
1053
- }
1054
- std::shuffle(data.begin(), data.end(), rng);
1055
- ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
1056
- }
1057
- } else {
1058
- init_tensor_uniform(t);
1059
- }
1060
- }
1061
- }
1062
- };
1063
-
1064
- // GGML_OP_SQR
1065
- struct test_sqr : public test_case {
1066
- const ggml_type type;
1067
- const std::array<int64_t, 4> ne;
1068
-
1069
- std::string vars() override {
1070
- return VARS_TO_STR2(type, ne);
1071
- }
1072
-
1073
- test_sqr(ggml_type type = GGML_TYPE_F32,
1074
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1075
- : type(type), ne(ne) {}
1076
-
1077
- ggml_tensor * build_graph(ggml_context * ctx) override {
1078
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1079
- ggml_tensor * out = ggml_sqr(ctx, a);
1080
- return out;
1081
- }
1082
- };
1083
-
1084
- // GGML_OP_SQRT
1085
- struct test_sqrt : public test_case {
1086
- const ggml_type type;
1087
- const std::array<int64_t, 4> ne;
1088
-
1089
- std::string vars() override {
1090
- return VARS_TO_STR2(type, ne);
1091
- }
1092
-
1093
- test_sqrt(ggml_type type = GGML_TYPE_F32,
1094
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1095
- : type(type), ne(ne) {}
1096
-
1097
- ggml_tensor * build_graph(ggml_context * ctx) override {
1098
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1099
- ggml_tensor * out = ggml_sqrt(ctx, a);
1100
- return out;
1101
- }
1102
-
1103
- void initialize_tensors(ggml_context * ctx) override {
1104
- // fill with positive values
1105
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1106
- init_tensor_uniform(t, 0.0f, 100.0f);
1107
- }
1108
- }
1109
- };
1110
-
1111
- // GGML_OP_CLAMP
1112
- struct test_clamp : public test_case {
1113
- const ggml_type type;
1114
- const std::array<int64_t, 4> ne;
1115
- float min;
1116
- float max;
1117
-
1118
- std::string vars() override {
1119
- return VARS_TO_STR4(type, ne, min, max);
1120
- }
1121
-
1122
- test_clamp(ggml_type type = GGML_TYPE_F32,
1123
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
1124
- float min = -0.5f, float max = 0.5f)
1125
- : type(type), ne(ne), min(min), max(max) {}
1126
-
1127
- ggml_tensor * build_graph(ggml_context * ctx) override {
1128
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1129
- ggml_tensor * out = ggml_clamp(ctx, a, min, max);
1130
- return out;
1131
- }
1132
- };
1133
-
1134
- // GGML_OP_DIAG_MASK_INF
1135
- struct test_diag_mask_inf : public test_case {
1136
- const ggml_type type;
1137
- const std::array<int64_t, 4> ne;
1138
- const int n_past;
1139
-
1140
- std::string vars() override {
1141
- return VARS_TO_STR3(type, ne, n_past);
1142
- }
1143
-
1144
- test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
1145
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
1146
- int n_past = 5)
1147
- : type(type), ne(ne), n_past(n_past) {}
1148
-
1149
- ggml_tensor * build_graph(ggml_context * ctx) override {
1150
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1151
- ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
1152
- return out;
1153
- }
1154
- };
1155
-
1156
- // GGML_OP_SOFT_MAX
1157
- struct test_soft_max : public test_case {
1158
- const ggml_type type;
1159
- const std::array<int64_t, 4> ne;
1160
- const bool mask;
1161
- const float scale;
1162
- const float max_bias;
1163
-
1164
- std::string vars() override {
1165
- return VARS_TO_STR5(type, ne, mask, scale, max_bias);
1166
- }
1167
-
1168
- // the 1024 test with bias occasionally fails:
1169
- // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
1170
- virtual double max_nmse_err() override {
1171
- return 1e-6;
1172
- }
1173
-
1174
- test_soft_max(ggml_type type = GGML_TYPE_F32,
1175
- std::array<int64_t, 4> ne = {10, 10, 10, 10},
1176
- bool mask = false,
1177
- float scale = 1.0f,
1178
- float max_bias = 0.0f)
1179
- : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
1180
-
1181
- ggml_tensor * build_graph(ggml_context * ctx) override {
1182
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1183
- ggml_tensor * mask = nullptr;
1184
- if (this->mask) {
1185
- mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
1186
- }
1187
- ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
1188
- return out;
1189
- }
1190
- };
1191
-
1192
-
1193
- // GGML_OP_ROPE
1194
- struct test_rope : public test_case {
1195
- const ggml_type type;
1196
- const std::array<int64_t, 4> ne_a;
1197
- int n_dims;
1198
- int mode;
1199
- int n_ctx; // used to generate positions
1200
- float fs; // freq_scale
1201
- float ef; // ext_factor
1202
- float af; // attn_factor
1203
- bool ff;
1204
- int v; // view (1 : non-contiguous a)
1205
-
1206
- std::string vars() override {
1207
- return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
1208
- }
1209
-
1210
- test_rope(ggml_type type = GGML_TYPE_F32,
1211
- std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
1212
- int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
1213
- : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
1214
-
1215
- ggml_tensor * build_graph(ggml_context * ctx) override {
1216
- ggml_tensor * a;
1217
- if (v & 1) {
1218
- auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1219
- a = ggml_new_tensor(ctx, type, 4, ne.data());
1220
- a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
1221
- } else {
1222
- a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1223
- }
1224
- ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
1225
- ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
1226
- ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
1227
- return out;
1228
- }
1229
-
1230
- void initialize_tensors(ggml_context * ctx) override {
1231
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1232
- if (t->type == GGML_TYPE_I32) {
1233
- // pos
1234
- std::vector<int> data(ne_a[2]);
1235
- for (int i = 0; i < ne_a[2]; i++) {
1236
- data[i] = rand() % n_ctx;
1237
- }
1238
- ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
1239
- } else {
1240
- if (t->ne[0] == n_dims/2) {
1241
- // frequency factors in the range [0.9f, 1.1f]
1242
- init_tensor_uniform(t, 0.9f, 1.1f);
1243
- } else {
1244
- init_tensor_uniform(t);
1245
- }
1246
- }
1247
- }
1248
- }
1249
- };
1250
-
1251
- // GGML_OP_POOL2D
1252
- struct test_pool2d : public test_case {
1253
- enum ggml_op_pool pool_type;
1254
- const ggml_type type_input;
1255
- const std::array<int64_t, 4> ne_input;
1256
- // kernel size
1257
- const int k0;
1258
- const int k1;
1259
- // stride
1260
- const int s0;
1261
- const int s1;
1262
- // padding
1263
- const int p0;
1264
- const int p1;
1265
-
1266
- std::string vars() override {
1267
- return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
1268
- }
1269
-
1270
- test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
1271
- ggml_type type_input = GGML_TYPE_F32,
1272
- std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
1273
- int k0 = 3, int k1 = 3,
1274
- int s0 = 1, int s1 = 1,
1275
- int p0 = 1, int p1 = 1)
1276
- : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
1277
-
1278
- ggml_tensor * build_graph(ggml_context * ctx) override {
1279
- ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
1280
- ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
1281
- return out;
1282
- }
1283
- };
1284
-
1285
- // GGML_OP_CONV_TRANSPOSE_1D
1286
- struct test_conv_transpose_1d : public test_case {
1287
- const std::array<int64_t, 4> ne_input;
1288
- const std::array<int64_t, 4> ne_kernel;
1289
-
1290
- const int s0; // stride
1291
- const int p0; // padding
1292
- const int d0; // dilation
1293
-
1294
- std::string vars() override {
1295
- return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
1296
- }
1297
-
1298
- test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
1299
- std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
1300
- int s0 = 1, int p0 = 0, int d0 = 1)
1301
- : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
1302
-
1303
- ggml_tensor * build_graph(ggml_context * ctx) override {
1304
- ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
1305
- ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
1306
- ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
1307
- return out;
1308
- }
1309
- };
1310
-
1311
- // GGML_OP_IM2COL
1312
- struct test_im2col : public test_case {
1313
- const ggml_type type_input;
1314
- const ggml_type type_kernel;
1315
- const ggml_type dst_type;
1316
- const std::array<int64_t, 4> ne_input;
1317
- const std::array<int64_t, 4> ne_kernel;
1318
- // stride
1319
- const int s0;
1320
- const int s1;
1321
- // padding
1322
- const int p0;
1323
- const int p1;
1324
- // dilation
1325
- const int d0;
1326
- const int d1;
1327
- // mode
1328
- const bool is_2D;
1329
-
1330
- std::string vars() override {
1331
- return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
1332
- }
1333
-
1334
- test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
1335
- std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
1336
- std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
1337
- int s0 = 1, int s1 = 1,
1338
- int p0 = 1, int p1 = 1,
1339
- int d0 = 1, int d1 = 1,
1340
- bool is_2D = true)
1341
- : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
1342
-
1343
- ggml_tensor * build_graph(ggml_context * ctx) override {
1344
- ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
1345
- ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
1346
- ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
1347
- return out;
1348
- }
1349
- };
1350
-
1351
- // GGML_OP_CONCAT
1352
- struct test_concat : public test_case {
1353
- const ggml_type type;
1354
- const std::array<int64_t, 4> ne_a;
1355
- const int64_t ne_b_d;
1356
- const int dim;
1357
- const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
1358
-
1359
- std::string vars() override {
1360
- return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
1361
- }
1362
-
1363
- test_concat(ggml_type type = GGML_TYPE_F32,
1364
- std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
1365
- int64_t ne_b_d = 10,
1366
- int dim = 2, int v = 0)
1367
- : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
1368
-
1369
- ggml_tensor * build_graph(ggml_context * ctx) override {
1370
- auto ne_b = ne_a;
1371
- ne_b[dim] = ne_b_d;
1372
- ggml_tensor * a;
1373
- if (v & 1) {
1374
- auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
1375
- a = ggml_new_tensor(ctx, type, 4, ne.data());
1376
- a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
1377
- } else {
1378
- a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1379
- }
1380
- ggml_tensor * b;
1381
- if (v & 2) {
1382
- auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
1383
- b = ggml_new_tensor(ctx, type, 4, ne.data());
1384
- b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
1385
- } else {
1386
- b = ggml_new_tensor(ctx, type, 4, ne_b.data());
1387
- }
1388
- ggml_tensor * out = ggml_concat(ctx, a, b, dim);
1389
- return out;
1390
- }
1391
- };
1392
-
1393
- // GGML_OP_ARGSORT
1394
- struct test_argsort : public test_case {
1395
- const ggml_type type;
1396
- const std::array<int64_t, 4> ne;
1397
- ggml_sort_order order;
1398
-
1399
- std::string vars() override {
1400
- return VARS_TO_STR3(type, ne, order);
1401
- }
1402
-
1403
- test_argsort(ggml_type type = GGML_TYPE_F32,
1404
- std::array<int64_t, 4> ne = {16, 10, 10, 10},
1405
- ggml_sort_order order = GGML_SORT_ORDER_ASC)
1406
- : type(type), ne(ne), order(order) {}
1407
-
1408
- ggml_tensor * build_graph(ggml_context * ctx) override {
1409
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1410
- ggml_tensor * out = ggml_argsort(ctx, a, order);
1411
- return out;
1412
- }
1413
-
1414
- void initialize_tensors(ggml_context * ctx) override {
1415
- std::random_device rd;
1416
- std::default_random_engine rng(rd());
1417
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1418
- if (t->type == GGML_TYPE_I32) {
1419
- // indices
1420
- std::vector<int> data(ggml_nelements(t));
1421
- for (int i = 0; i < ggml_nelements(t); i++) {
1422
- data[i] = rand();
1423
- }
1424
- std::shuffle(data.begin(), data.end(), rng);
1425
- ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
1426
- } else if (t->type == GGML_TYPE_F32) {
1427
- // initialize with unique values to avoid ties
1428
- for (int64_t r = 0; r < ggml_nrows(t); r++) {
1429
- std::vector<float> data(t->ne[0]);
1430
- for (int i = 0; i < t->ne[0]; i++) {
1431
- data[i] = i;
1432
- }
1433
- std::shuffle(data.begin(), data.end(), rng);
1434
- ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
1435
- }
1436
- } else {
1437
- GGML_ABORT("fatal error");
1438
- }
1439
- }
1440
- }
1441
- };
1442
-
1443
- // GGML_OP_SUM_ROWS
1444
- struct test_sum_rows : public test_case {
1445
- const ggml_type type;
1446
- const std::array<int64_t, 4> ne;
1447
-
1448
- std::string vars() override {
1449
- return VARS_TO_STR2(type, ne);
1450
- }
1451
-
1452
- test_sum_rows(ggml_type type = GGML_TYPE_F32,
1453
- std::array<int64_t, 4> ne = {10, 10, 10, 10})
1454
- : type(type), ne(ne) {}
1455
-
1456
- ggml_tensor * build_graph(ggml_context * ctx) override {
1457
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1458
- ggml_tensor * out = ggml_sum_rows(ctx, a);
1459
- return out;
1460
- }
1461
- };
1462
-
1463
- // GGML_OP_UPSCALE
1464
- struct test_upscale : public test_case {
1465
- const ggml_type type;
1466
- const std::array<int64_t, 4> ne;
1467
- const int32_t scale_factor;
1468
- const bool transpose;
1469
-
1470
- std::string vars() override {
1471
- return VARS_TO_STR4(type, ne, scale_factor, transpose);
1472
- }
1473
-
1474
- test_upscale(ggml_type type = GGML_TYPE_F32,
1475
- std::array<int64_t, 4> ne = {512, 512, 3, 1},
1476
- int32_t scale_factor = 2, bool transpose = false)
1477
- : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
1478
-
1479
- ggml_tensor * build_graph(ggml_context * ctx) override {
1480
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1481
- if (transpose) a = ggml_transpose(ctx, a);
1482
- ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
1483
- return out;
1484
- }
1485
- };
1486
-
1487
- // GGML_OP_UPSCALE (ext)
1488
- struct test_upscale_ext : public test_case {
1489
- const ggml_type type;
1490
- const std::array<int64_t, 4> ne;
1491
- const std::array<int64_t, 4> ne_tgt;
1492
-
1493
- std::string vars() override {
1494
- return VARS_TO_STR3(type, ne, ne_tgt);
1495
- }
1496
-
1497
- test_upscale_ext(ggml_type type = GGML_TYPE_F32,
1498
- std::array<int64_t, 4> ne = {2, 5, 7, 11},
1499
- std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
1500
- : type(type), ne(ne), ne_tgt(ne_tgt) {}
1501
-
1502
- ggml_tensor * build_graph(ggml_context * ctx) override {
1503
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1504
- ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
1505
- return out;
1506
- }
1507
- };
1508
-
1509
- // GGML_OP_GROUP_NORM
1510
- struct test_group_norm : public test_case {
1511
- const ggml_type type;
1512
- const std::array<int64_t, 4> ne;
1513
- const int32_t num_groups;
1514
- const float eps;
1515
-
1516
- std::string vars() override {
1517
- return VARS_TO_STR3(type, ne, num_groups);
1518
- }
1519
-
1520
- test_group_norm(ggml_type type = GGML_TYPE_F32,
1521
- std::array<int64_t, 4> ne = {64, 64, 320, 1},
1522
- int32_t num_groups = 32,
1523
- float eps = 1e-6f)
1524
- : type(type), ne(ne), num_groups(num_groups), eps(eps) {}
1525
-
1526
- ggml_tensor * build_graph(ggml_context * ctx) override {
1527
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
1528
- ggml_tensor * out = ggml_group_norm(ctx, a, num_groups, eps);
1529
- return out;
1530
- }
1531
- };
1532
-
1533
- // GGML_OP_ACC
1534
- struct test_acc : public test_case {
1535
- const ggml_type type;
1536
- const std::array<int64_t, 4> ne_a;
1537
- const std::array<int64_t, 4> ne_b;
1538
-
1539
- std::string vars() override {
1540
- return VARS_TO_STR3(type, ne_a, ne_b);
1541
- }
1542
-
1543
- test_acc(ggml_type type = GGML_TYPE_F32,
1544
- std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
1545
- std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
1546
- : type(type), ne_a(ne_a), ne_b(ne_b) {}
1547
-
1548
- ggml_tensor * build_graph(ggml_context * ctx) override {
1549
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1550
- ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
1551
- ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
1552
- return out;
1553
- }
1554
- };
1555
-
1556
- // GGML_OP_PAD
1557
- struct test_pad : public test_case {
1558
- const ggml_type type;
1559
- const std::array<int64_t, 4> ne_a;
1560
- const int pad_0;
1561
- const int pad_1;
1562
-
1563
- std::string vars() override {
1564
- return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
1565
- }
1566
-
1567
- test_pad(ggml_type type = GGML_TYPE_F32,
1568
- std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
1569
- int pad_0 = 1, int pad_1 = 1)
1570
- : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {}
1571
-
1572
- ggml_tensor * build_graph(ggml_context * ctx) override {
1573
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1574
- ggml_tensor * out = ggml_pad(ctx, a, pad_0, pad_1, 0, 0);
1575
- return out;
1576
- }
1577
- };
1578
-
1579
- // GGML_OP_ARANGE
1580
- struct test_arange : public test_case {
1581
- const ggml_type type;
1582
- const float start;
1583
- const float stop;
1584
- const float step;
1585
-
1586
- std::string vars() override {
1587
- return VARS_TO_STR4(type, start, stop, step);
1588
- }
1589
-
1590
- test_arange(ggml_type type = GGML_TYPE_F32,
1591
- float start = 0.f, float stop = 10.f, float step = 1.f)
1592
- : type(type), start(start), stop(stop), step(step) {}
1593
-
1594
- ggml_tensor * build_graph(ggml_context * ctx) override {
1595
- ggml_tensor * out = ggml_arange(ctx, start, stop, step);
1596
- return out;
1597
- }
1598
- };
1599
-
1600
- // GGML_OP_TIMESTEP_EMBEDDING
1601
- struct test_timestep_embedding : public test_case {
1602
- const ggml_type type;
1603
- const std::array<int64_t, 4> ne_a;
1604
- const int dim;
1605
- const int max_period;
1606
-
1607
- std::string vars() override {
1608
- return VARS_TO_STR4(type, ne_a, dim, max_period);
1609
- }
1610
-
1611
- test_timestep_embedding(ggml_type type = GGML_TYPE_F32,
1612
- std::array<int64_t, 4> ne_a = {2, 1, 1, 1},
1613
- int dim = 320, int max_period=10000)
1614
- : type(type), ne_a(ne_a), dim(dim), max_period(max_period) {}
1615
-
1616
- ggml_tensor * build_graph(ggml_context * ctx) override {
1617
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1618
- ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period);
1619
- return out;
1620
- }
1621
- };
1622
-
1623
- // GGML_OP_LEAKY_RELU
1624
- struct test_leaky_relu : public test_case {
1625
- const ggml_type type;
1626
- const std::array<int64_t, 4> ne_a;
1627
- const float negative_slope;
1628
-
1629
- std::string vars() override {
1630
- return VARS_TO_STR3(type, ne_a, negative_slope);
1631
- }
1632
-
1633
- test_leaky_relu(ggml_type type = GGML_TYPE_F32,
1634
- std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
1635
- float negative_slope = 0.1f)
1636
- : type(type), ne_a(ne_a), negative_slope(negative_slope) {}
1637
-
1638
- ggml_tensor * build_graph(ggml_context * ctx) override {
1639
- ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
1640
- ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true);
1641
- return out;
1642
- }
1643
- };
1644
-
1645
- // GGML_OP_FLASH_ATTN_EXT
1646
- struct test_flash_attn_ext : public test_case {
1647
- const int64_t hs; // head size
1648
- const int64_t nh; // num heads
1649
- const int64_t kv; // kv size
1650
- const int64_t nb; // batch size
1651
-
1652
- const bool mask; // use mask
1653
-
1654
- const float max_bias; // ALiBi
1655
-
1656
- const ggml_type type_KV;
1657
-
1658
- std::string vars() override {
1659
- return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV);
1660
- }
1661
-
1662
- double max_nmse_err() override {
1663
- return 5e-4;
1664
- }
1665
-
1666
- test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
1667
- : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {}
1668
-
1669
- ggml_tensor * build_graph(ggml_context * ctx) override {
1670
- const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));
1671
-
1672
- ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
1673
- ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
1674
- ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1);
1675
- ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
1676
- ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
1677
- return out;
1678
- }
1679
- };
1680
-
1681
- enum llm_norm_type {
1682
- LLM_NORM,
1683
- LLM_NORM_RMS,
1684
- };
1685
-
1686
- struct llama_hparams {
1687
- uint32_t n_vocab;
1688
- uint32_t n_embd;
1689
- uint32_t n_head;
1690
- uint32_t n_head_kv;
1691
- static constexpr uint32_t n_layer = 1;
1692
- uint32_t n_rot;
1693
- uint32_t n_embd_head; // dimension of values (d_v)
1694
- uint32_t n_ff;
1695
-
1696
- float f_norm_eps;
1697
- float f_norm_rms_eps;
1698
-
1699
- // cparams
1700
- static constexpr uint32_t n_ctx = 512; // user-specified context size
1701
- static constexpr uint32_t n_ctx_orig = n_ctx;
1702
-
1703
- // batch
1704
- int32_t n_tokens;
1705
-
1706
- // llm_build_context
1707
- static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx)
1708
- static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
1709
-
1710
- uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
1711
- return n_embd_head * n_head_kv;
1712
- }
1713
- };
1714
-
1715
- // LLM base class
1716
- struct test_llm : public test_case {
1717
- llama_hparams hp;
1718
-
1719
- protected:
1720
- test_llm(llama_hparams hp)
1721
- : hp(std::move(hp)) {
1722
- }
1723
-
1724
- public:
1725
- struct ggml_tensor * llm_build_norm(
1726
- struct ggml_context * ctx,
1727
- struct ggml_tensor * cur,
1728
- struct ggml_tensor * mw,
1729
- struct ggml_tensor * mb,
1730
- llm_norm_type type) {
1731
- switch (type) {
1732
- case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
1733
- case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
1734
- }
1735
- cur = ggml_mul(ctx, cur, mw);
1736
- if (mb) {
1737
- cur = ggml_add(ctx, cur, mb);
1738
- }
1739
- return cur;
1740
- }
1741
-
1742
- void llm_build_kv_store(
1743
- struct ggml_context * ctx,
1744
- struct ggml_tensor * k_l,
1745
- struct ggml_tensor * v_l,
1746
- struct ggml_tensor * k_cur,
1747
- struct ggml_tensor * v_cur) {
1748
- // compute the transposed [n_tokens, n_embd] V matrix
1749
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
1750
-
1751
- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
1752
- (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
1753
-
1754
- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
1755
- ( hp.n_ctx)*ggml_element_size(v_l),
1756
- (hp.kv_head)*ggml_element_size(v_l));
1757
-
1758
- // important: storing RoPE-ed version of K in the KV cache!
1759
- ggml_cpy(ctx, k_cur, k_cache_view);
1760
- ggml_cpy(ctx, v_cur_t, v_cache_view);
1761
- }
1762
-
1763
- struct ggml_tensor * llm_build_kqv(
1764
- struct ggml_context * ctx,
1765
- struct ggml_tensor * k_l,
1766
- struct ggml_tensor * v_l,
1767
- struct ggml_tensor * q_cur,
1768
- struct ggml_tensor * kq_mask,
1769
- float kq_scale) {
1770
- struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
1771
-
1772
- struct ggml_tensor * k =
1773
- ggml_view_3d(ctx, k_l,
1774
- hp.n_embd_head, hp.n_kv, hp.n_head_kv,
1775
- ggml_row_size(k_l->type, hp.n_embd_gqa()),
1776
- ggml_row_size(k_l->type, hp.n_embd_head),
1777
- 0);
1778
-
1779
- struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
1780
-
1781
- kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
1782
-
1783
- // split cached v into n_head heads
1784
- struct ggml_tensor * v =
1785
- ggml_view_3d(ctx, v_l,
1786
- hp.n_kv, hp.n_embd_head, hp.n_head_kv,
1787
- ggml_element_size(v_l)*hp.n_ctx,
1788
- ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
1789
- 0);
1790
-
1791
- struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
1792
-
1793
- struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
1794
-
1795
- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
1796
-
1797
- struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
1798
- cur = ggml_mul_mat(ctx, wo, cur);
1799
-
1800
- return cur;
1801
- }
1802
-
1803
- void initialize_tensors(ggml_context * ctx) override {
1804
- for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
1805
- if (t->type == GGML_TYPE_I32) {
1806
- // pos
1807
- std::vector<int> data(hp.n_tokens);
1808
- for (int i = 0; i < hp.n_tokens; i++) {
1809
- data[i] = rand() % hp.n_ctx;
1810
- }
1811
- ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
1812
- } else {
1813
- init_tensor_uniform(t);
1814
- }
1815
- }
1816
- }
1817
- };
1818
-
1819
- // Llama
1820
- struct test_llama : public test_llm {
1821
- static constexpr float freq_base = 10000.0f;
1822
- static constexpr float freq_scale = 1.0f;
1823
- static constexpr float ext_factor = 0.0f;
1824
- static constexpr float attn_factor = 1.0f;
1825
- static constexpr float beta_fast = 32.0f;
1826
- static constexpr float beta_slow = 1.0f;
1827
-
1828
- std::string op_desc(ggml_tensor * t) override {
1829
- GGML_UNUSED(t);
1830
- return "LLAMA";
1831
- }
1832
-
1833
- std::string vars() override {
1834
- auto n_tokens = hp.n_tokens;
1835
- return VARS_TO_STR1(n_tokens);
1836
- }
1837
-
1838
- double max_nmse_err() override {
1839
- return 2e-3;
1840
- }
1841
-
1842
- test_llama(int n_tokens = 1)
1843
- : test_llm({
1844
- /*n_vocab =*/ 32000,
1845
- /*n_embd =*/ 3200,
1846
- /*n_head =*/ 32,
1847
- /*n_head_kv =*/ 32,
1848
- /*n_rot =*/ 100,
1849
- /*n_embd_head =*/ 100,
1850
- /*n_ff =*/ 8640,
1851
- /*f_norm_eps =*/ 0.f,
1852
- /*f_norm_rms_eps =*/ 1e-5f,
1853
- /*n_tokens =*/ n_tokens,
1854
- }) {
1855
- }
1856
-
1857
- ggml_tensor * build_graph(ggml_context * ctx) override {
1858
- struct ggml_tensor * cur;
1859
- struct ggml_tensor * inpL;
1860
-
1861
- inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
1862
-
1863
- // inp_pos - contains the positions
1864
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
1865
-
1866
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
1867
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
1868
-
1869
- ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1870
- ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1871
-
1872
- for (uint32_t il = 0; il < hp.n_layer; ++il) {
1873
- struct ggml_tensor * inpSA = inpL;
1874
-
1875
- // norm
1876
- ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1877
- cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
1878
-
1879
- // self-attention
1880
- {
1881
- ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
1882
- ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
1883
- ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
1884
-
1885
- // compute Q and K and RoPE them
1886
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
1887
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
1888
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
1889
-
1890
- Qcur = ggml_rope_ext(
1891
- ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
1892
- hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
1893
- ext_factor, attn_factor, beta_fast, beta_slow
1894
- );
1895
-
1896
- Kcur = ggml_rope_ext(
1897
- ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
1898
- hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale,
1899
- ext_factor, attn_factor, beta_fast, beta_slow
1900
- );
1901
-
1902
- llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
1903
-
1904
- cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
1905
- }
1906
-
1907
- struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
1908
-
1909
- // feed-forward network
1910
- ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1911
- cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
1912
-
1913
- ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
1914
- ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
1915
- ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
1916
- struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
1917
- cur = ggml_mul_mat(ctx, ffn_gate, cur);
1918
- cur = ggml_silu(ctx, cur);
1919
- cur = ggml_mul(ctx, cur, tmp);
1920
- cur = ggml_mul_mat(ctx, ffn_down, cur);
1921
-
1922
- cur = ggml_add(ctx, cur, ffn_inp);
1923
-
1924
- // input for next layer
1925
- inpL = cur;
1926
- }
1927
-
1928
- cur = inpL;
1929
-
1930
- ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1931
- cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
1932
-
1933
- // lm_head
1934
- ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
1935
- cur = ggml_mul_mat(ctx, output, cur);
1936
-
1937
- return cur;
1938
- }
1939
- };
1940
-
1941
- // Falcon
1942
- struct test_falcon : public test_llm {
1943
- static constexpr float freq_base = 10000.0f;
1944
- static constexpr float freq_scale = 1.0f;
1945
- static constexpr float ext_factor = 0.0f;
1946
- static constexpr float attn_factor = 1.0f;
1947
- static constexpr float beta_fast = 32.0f;
1948
- static constexpr float beta_slow = 1.0f;
1949
-
1950
- std::string op_desc(ggml_tensor * t) override {
1951
- GGML_UNUSED(t);
1952
- return "FALCON";
1953
- }
1954
-
1955
- std::string vars() override {
1956
- auto n_tokens = hp.n_tokens;
1957
- return VARS_TO_STR1(n_tokens);
1958
- }
1959
-
1960
- double max_nmse_err() override {
1961
- return 2e-3;
1962
- }
1963
-
1964
- test_falcon(int n_tokens = 1)
1965
- : test_llm({
1966
- /*n_vocab =*/ 32000,
1967
- /*n_embd =*/ 3200,
1968
- /*n_head =*/ 50,
1969
- /*n_head_kv =*/ 1,
1970
- /*n_rot =*/ 64,
1971
- /*n_embd_head =*/ 64,
1972
- /*n_ff =*/ 8640,
1973
- /*f_norm_eps =*/ 1e-5f,
1974
- /*f_norm_rms_eps =*/ 0.f,
1975
- /*n_tokens =*/ n_tokens,
1976
- }) {
1977
- }
1978
-
1979
- ggml_tensor * build_graph(ggml_context * ctx) override {
1980
- struct ggml_tensor * cur;
1981
- struct ggml_tensor * inpL;
1982
-
1983
- inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
1984
-
1985
- // inp_pos - contains the positions
1986
- struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
1987
-
1988
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
1989
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
1990
-
1991
- ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1992
- ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
1993
-
1994
- for (uint32_t il = 0; il < hp.n_layer; ++il) {
1995
- // norm
1996
- ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1997
- ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
1998
- ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
1999
-
2000
- // self-attention
2001
- {
2002
- cur = attn_norm;
2003
-
2004
- ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
2005
-
2006
- cur = ggml_mul_mat(ctx, wqkv, cur);
2007
-
2008
- struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
2009
- struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
2010
- struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
2011
-
2012
- Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
2013
- Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
2014
-
2015
- // using mode = 2 for neox mode
2016
- Qcur = ggml_rope_ext(
2017
- ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
2018
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
2019
- );
2020
-
2021
- Kcur = ggml_rope_ext(
2022
- ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig,
2023
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
2024
- );
2025
-
2026
- llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
2027
-
2028
- cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
2029
- }
2030
-
2031
- struct ggml_tensor * ffn_inp = cur;
2032
-
2033
- // feed forward
2034
- {
2035
- ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
2036
- ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
2037
- cur = attn_norm;
2038
- cur = ggml_mul_mat(ctx, ffn_up, cur);
2039
- cur = ggml_gelu(ctx, cur);
2040
- cur = ggml_mul_mat(ctx, ffn_down, cur);
2041
- }
2042
-
2043
- cur = ggml_add(ctx, cur, ffn_inp);
2044
-
2045
- cur = ggml_add(ctx, cur, inpL);
2046
-
2047
- // input for next layer
2048
- inpL = cur;
2049
- }
2050
-
2051
- cur = inpL;
2052
-
2053
- ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
2054
- ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
2055
- cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
2056
-
2057
- // lm_head
2058
- ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
2059
- cur = ggml_mul_mat(ctx, output, cur);
2060
-
2061
- return cur;
2062
- }
2063
- };
2064
-
2065
- static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
2066
- std::vector<std::unique_ptr<test_case>> test_cases;
2067
- std::default_random_engine rng(0);
2068
-
2069
- const ggml_type all_types[] = {
2070
- GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
2071
- GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
2072
- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
2073
- GGML_TYPE_Q8_0,
2074
- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
2075
- GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
2076
- GGML_TYPE_Q6_K,
2077
- GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2078
- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2079
- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2080
- };
2081
-
2082
- const ggml_type base_types[] = {
2083
- GGML_TYPE_F32, GGML_TYPE_F16,
2084
- GGML_TYPE_Q4_0,
2085
- GGML_TYPE_Q4_K,
2086
- GGML_TYPE_IQ2_XXS
2087
- };
2088
-
2089
- const ggml_type other_types[] = {
2090
- GGML_TYPE_Q4_1,
2091
- GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
2092
- GGML_TYPE_Q8_0,
2093
- GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
2094
- GGML_TYPE_Q5_K,
2095
- GGML_TYPE_Q6_K,
2096
- GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
2097
- GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
2098
- GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
2099
- GGML_TYPE_BF16,
2100
- };
2101
-
2102
- // unary ops
2103
- for (int v : {0, 1}) {
2104
- for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
2105
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v));
2106
- test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v));
2107
- }
2108
- }
2109
-
2110
- test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
2111
- for (ggml_type type : all_types) {
2112
- for (int b : {1, 7}) {
2113
- for (bool v : {false, true}) {
2114
- test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
2115
- }
2116
- }
2117
- }
2118
- for (int b : {1, 7}) {
2119
- for (bool v : {false, true}) {
2120
- test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v));
2121
- }
2122
- }
2123
-
2124
- for (ggml_type type_input : {GGML_TYPE_F32}) {
2125
- for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
2126
- for (int k0 : {1, 3}) {
2127
- for (int k1 : {1, 3}) {
2128
- for (int s0 : {1, 2}) {
2129
- for (int s1 : {1, 2}) {
2130
- for (int p0 : {0, 1}) {
2131
- for (int p1 : {0, 1}) {
2132
- test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
2133
- }
2134
- }
2135
- }
2136
- }
2137
- }
2138
- }
2139
- }
2140
- }
2141
-
2142
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
2143
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));
2144
- // test cases for 1D im2col
2145
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
2146
- test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
2147
-
2148
- test_cases.emplace_back(new test_conv_transpose_1d());
2149
- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
2150
- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
2151
- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1));
2152
- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1));
2153
- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1));
2154
- test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
2155
- test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
2156
-
2157
-
2158
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
2159
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2160
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));
2161
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1}));
2162
- test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
2163
- test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1}));
2164
- test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2}));
2165
-
2166
- test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
2167
- test_cases.emplace_back(new test_dup(GGML_TYPE_F16));
2168
- test_cases.emplace_back(new test_dup(GGML_TYPE_I32));
2169
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16));
2170
- test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3}));
2171
- test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows
2172
- test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3}));
2173
- test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous
2174
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
2175
- test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));
2176
-
2177
- for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2178
- for (ggml_type type_dst : all_types) {
2179
- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
2180
- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
2181
- }
2182
- }
2183
- for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2184
- for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
2185
- test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
2186
- }
2187
- }
2188
-
2189
- test_cases.emplace_back(new test_cont());
2190
-
2191
- auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
2192
- for (auto op : {ggml_add, ggml_mul, ggml_div}) {
2193
- test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
2194
- }
2195
- };
2196
-
2197
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
2198
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
2199
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
2200
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
2201
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
2202
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1});
2203
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1});
2204
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1});
2205
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1});
2206
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2});
2207
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2});
2208
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2});
2209
- add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2});
2210
-
2211
- // stable diffusion
2212
- add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1});
2213
- add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1});
2214
- add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1});
2215
- add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1});
2216
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1});
2217
- add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1});
2218
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1});
2219
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1});
2220
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1});
2221
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1});
2222
- add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
2223
- add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
2224
- add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
2225
- //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
2226
- //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
2227
-
2228
- test_cases.emplace_back(new test_scale());
2229
-
2230
- for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) {
2231
- test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2232
- test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
2233
- }
2234
-
2235
- #if 1
2236
- for (ggml_type type_a : base_types) {
2237
- for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2238
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
2239
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
2240
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
2241
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1}));
2242
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1}));
2243
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2}));
2244
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2}));
2245
-
2246
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1}));
2247
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1}));
2248
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1}));
2249
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1}));
2250
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1}));
2251
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2}));
2252
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2}));
2253
- }
2254
- }
2255
- #else
2256
- // m = a rows
2257
- // n = b rows
2258
- // k = cols
2259
- std::uniform_int_distribution<> dist_m(1, 128);
2260
- std::uniform_int_distribution<> dist_n(16, 128);
2261
- std::uniform_int_distribution<> dist_k(1, 16);
2262
- for (int i = 0; i < 1000; i++) {
2263
- for (ggml_type type_a : all_types) {
2264
- for (ggml_type type_b : {GGML_TYPE_F32}) {
2265
- int m = dist_m(rng);
2266
- int n = dist_n(rng);
2267
- int k = dist_k(rng) * ggml_blck_size(type_a);
2268
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1}));
2269
- }
2270
- }
2271
- }
2272
- #endif
2273
-
2274
- for (ggml_type type_a : other_types) {
2275
- for (ggml_type type_b : {GGML_TYPE_F32}) {
2276
- if (ggml_blck_size(type_a) != 256) {
2277
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, ggml_blck_size(type_a), {1, 1}, {1, 1}));
2278
- }
2279
- test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1}));
2280
- }
2281
- }
2282
-
2283
- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
2284
- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
2285
- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
2286
- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
2287
- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
2288
- test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
2289
-
2290
- for (ggml_type type_a : base_types) {
2291
- for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
2292
- for (int n_mats : {4, 8}) {
2293
- for (int n_used : {1, 2, 4}) {
2294
- for (bool b : {false, true}) {
2295
- for (int n : {1, 32}) {
2296
- int m = 512;
2297
- int k = 256;
2298
- test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
2299
- }
2300
- }
2301
- }
2302
- }
2303
- }
2304
- }
2305
-
2306
- for (ggml_type type_a : other_types) {
2307
- for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
2308
- for (int n_mats : {4}) {
2309
- for (int n_used : {2}) {
2310
- for (bool b : {false}) {
2311
- for (int n : {1}) {
2312
- int m = 512;
2313
- int k = 256;
2314
- test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));
2315
- }
2316
- }
2317
- }
2318
- }
2319
- }
2320
- }
2321
-
2322
- test_cases.emplace_back(new test_sqr());
2323
- test_cases.emplace_back(new test_sqrt());
2324
- test_cases.emplace_back(new test_clamp());
2325
-
2326
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));
2327
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5));
2328
- test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));
2329
-
2330
- #if 0
2331
- std::uniform_int_distribution<> dist_ne1(1, 50);
2332
- int exponent = 1;
2333
- while (exponent < (1 << 17)) {
2334
- std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent);
2335
-
2336
- for (int n = 0; n < 10; ++n) {
2337
- int64_t ne0 = dist_ne0(rng);
2338
- int64_t ne1 = dist_ne1(rng);
2339
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
2340
- }
2341
-
2342
- exponent <<= 1;
2343
- }
2344
- #endif
2345
- for (bool mask : {false, true}) {
2346
- for (float max_bias : {0.0f, 8.0f}) {
2347
- if (!mask && max_bias > 0.0f) continue;
2348
- for (float scale : {1.0f, 0.1f}) {
2349
- for (int64_t ne0 : {16, 1024}) {
2350
- for (int64_t ne1 : {16, 1024}) {
2351
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
2352
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
2353
- }
2354
- }
2355
- }
2356
- }
2357
- }
2358
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
2359
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
2360
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
2361
- test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
2362
-
2363
- {
2364
- bool all = true;
2365
-
2366
- for (float v : { 0, 1 }) {
2367
- for (float fs : { 1.0f, 1.4245f }) {
2368
- for (float ef : { 0.0f, 0.7465f }) {
2369
- for (float af : { 1.0f, 1.4245f }) {
2370
- for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
2371
- for (bool ff : {false, true}) { // freq_factors
2372
- test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B
2373
-
2374
- if (all) {
2375
- test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B
2376
- test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B
2377
- test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B
2378
- }
2379
-
2380
- if (all) {
2381
- test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
2382
- test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
2383
- test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
2384
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
2385
- test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
2386
- }
2387
-
2388
- test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
2389
- }
2390
- }
2391
-
2392
- all = false;
2393
- }
2394
- }
2395
- }
2396
- }
2397
- }
2398
-
2399
- for (int v : { 0, 1, 2, 3 }) {
2400
- for (int dim : { 0, 1, 2, 3, }) {
2401
- test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
2402
- test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v));
2403
- }
2404
- }
2405
-
2406
- for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
2407
- test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
2408
- test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
2409
- test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
2410
- }
2411
-
2412
- test_cases.emplace_back(new test_sum_rows());
2413
- test_cases.emplace_back(new test_upscale());
2414
- test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
2415
- test_cases.emplace_back(new test_upscale_ext());
2416
- test_cases.emplace_back(new test_group_norm());
2417
- test_cases.emplace_back(new test_acc());
2418
- test_cases.emplace_back(new test_pad());
2419
- test_cases.emplace_back(new test_arange());
2420
- test_cases.emplace_back(new test_timestep_embedding());
2421
- test_cases.emplace_back(new test_leaky_relu());
2422
-
2423
- for (int hs : { 64, 80, 128, 256, }) {
2424
- for (bool mask : { true, false } ) {
2425
- for (float max_bias : { 0.0f, 8.0f }) {
2426
- if (!mask && max_bias > 0.0f) continue;
2427
- for (int nh : { 32, }) {
2428
- for (int kv : { 512, 1024, }) {
2429
- for (int nb : { 1, 2, 4, 8, }) {
2430
- for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
2431
- test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV));
2432
- }
2433
- }
2434
- }
2435
- }
2436
- }
2437
- }
2438
- }
2439
-
2440
- // these tests are disabled to save execution time, but they can be handy for debugging
2441
- #if 0
2442
- test_cases.emplace_back(new test_llama(1));
2443
- test_cases.emplace_back(new test_llama(2));
2444
- test_cases.emplace_back(new test_falcon(1));
2445
- test_cases.emplace_back(new test_falcon(2));
2446
- #endif
2447
-
2448
- // run tests
2449
- if (mode == MODE_TEST) {
2450
- ggml_backend_t backend_cpu = ggml_backend_cpu_init();
2451
-
2452
- size_t n_ok = 0;
2453
- for (auto & test : test_cases) {
2454
- if (test->eval(backend, backend_cpu, op_name)) {
2455
- n_ok++;
2456
- }
2457
- }
2458
- printf(" %zu/%zu tests passed\n", n_ok, test_cases.size());
2459
-
2460
- ggml_backend_free(backend_cpu);
2461
-
2462
- return n_ok == test_cases.size();
2463
- }
2464
-
2465
- if (mode == MODE_PERF) {
2466
- for (auto & test : test_cases) {
2467
- test->eval_perf(backend, op_name);
2468
- }
2469
- return true;
2470
- }
2471
-
2472
- GGML_ABORT("fatal error");
2473
- return false;
2474
- }
2475
-
2476
- static void usage(char ** argv) {
2477
- printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]);
2478
- printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n");
2479
- printf(" op names are as given by ggml_op_desc()\n");
2480
- }
2481
-
2482
- int main(int argc, char ** argv) {
2483
- test_mode mode = MODE_TEST;
2484
- const char * op_name_filter = NULL;
2485
- const char * backend_filter = NULL;
2486
-
2487
- for (int i = 1; i < argc; i++) {
2488
- if (strcmp(argv[i], "test") == 0) {
2489
- mode = MODE_TEST;
2490
- } else if (strcmp(argv[i], "perf") == 0) {
2491
- mode = MODE_PERF;
2492
- } else if (strcmp(argv[i], "-o") == 0) {
2493
- if (i + 1 < argc) {
2494
- op_name_filter = argv[++i];
2495
- } else {
2496
- usage(argv);
2497
- return 1;
2498
- }
2499
- } else if (strcmp(argv[i], "-b") == 0) {
2500
- if (i + 1 < argc) {
2501
- backend_filter = argv[++i];
2502
- } else {
2503
- usage(argv);
2504
- return 1;
2505
- }
2506
- } else {
2507
- usage(argv);
2508
- return 1;
2509
- }
2510
- }
2511
-
2512
- // enumerate backends
2513
- printf("Testing %zu backends\n\n", ggml_backend_reg_get_count());
2514
-
2515
- size_t n_ok = 0;
2516
-
2517
- for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
2518
- printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i));
2519
-
2520
- if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) {
2521
- printf(" Skipping\n");
2522
- n_ok++;
2523
- continue;
2524
- }
2525
-
2526
- ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL);
2527
- GGML_ASSERT(backend != NULL);
2528
-
2529
- if (backend_filter == NULL && ggml_backend_is_cpu(backend)) {
2530
- printf(" Skipping CPU backend\n");
2531
- ggml_backend_free(backend);
2532
- n_ok++;
2533
- continue;
2534
- }
2535
-
2536
- printf(" Backend name: %s\n", ggml_backend_name(backend));
2537
-
2538
- bool ok = test_backend(backend, mode, op_name_filter);
2539
-
2540
- printf(" Backend %s: ", ggml_backend_name(backend));
2541
- if (ok) {
2542
- printf("\033[1;32mOK\033[0m\n");
2543
- n_ok++;
2544
- } else {
2545
- printf("\033[1;31mFAIL\033[0m\n");
2546
- }
2547
-
2548
- printf("\n");
2549
-
2550
- ggml_backend_free(backend);
2551
- }
2552
-
2553
- printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
2554
-
2555
- if (n_ok != ggml_backend_reg_get_count()) {
2556
- printf("\033[1;31mFAIL\033[0m\n");
2557
- return 1;
2558
- }
2559
-
2560
- ggml_quantize_free();
2561
-
2562
- printf("\033[1;32mOK\033[0m\n");
2563
- return 0;
2564
- }
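
One detail in the removed test_flash_attn_ext case above that is easy to miss: the head size hs is rounded up to a multiple of the KV type's block size via GGML_PAD(hs, ggml_blck_size(type_KV)) before the Q/K/V tensors are created, so quantized KV types with 32-wide blocks still work with head sizes such as 80. Below is a minimal standalone sketch of that arithmetic, not code from this commit: pad_to_block is a hypothetical stand-in for GGML_PAD, and the block sizes are the usual ggml values assumed here (1 for F16, 32 for Q8_0/Q4_0).

#include <cstdint>
#include <cstdio>

// hypothetical stand-in for GGML_PAD(x, n): round x up to the next multiple of n
static int64_t pad_to_block(int64_t x, int64_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    // head sizes exercised by the removed flash-attn tests
    const int64_t head_sizes[] = {64, 80, 128, 256};
    // assumed ggml block sizes: F16 -> 1, Q8_0/Q4_0 -> 32
    const int64_t blck_f16  = 1;
    const int64_t blck_q4_0 = 32;

    for (int64_t hs : head_sizes) {
        printf("hs = %3lld  F16 pad = %3lld  Q4_0 pad = %3lld\n",
               (long long) hs,
               (long long) pad_to_block(hs, blck_f16),
               (long long) pad_to_block(hs, blck_q4_0));
    }
    return 0;
}

Only hs = 80 actually changes under a 32-wide block (80 -> 96); the other head sizes tested are already multiples of 32, so their padded size equals hs.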