ggerganov committed
Commit 958f2d3 · 1 Parent(s): e22e2f8

ggml : add ggml-cpu-impl.h (skip) (#0)

ggml/src/ggml-cpu-impl.h ADDED
@@ -0,0 +1,614 @@
+ #pragma once
+
+ // GGML CPU internal header
+
+ #include "ggml.h"
+ #include "ggml-impl.h"
+ #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
+ //#include <stddef.h>
+ #include <stdbool.h>
+ #include <string.h> // memcpy
+ #include <math.h> // fabsf
+
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #if defined(_MSC_VER)
+
+ #define m512bh(p) p
+ #define m512i(p) p
+
+ #else
+
+ #define m512bh(p) (__m512bh)(p)
+ #define m512i(p) (__m512i)(p)
+
+ #endif
+
+ /**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───┐
+ * 0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌──┴───┐┌─┴───────────────────┐
+ * 0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ * ┌sign
+ * │
+ * │ ┌exponent
+ * │ │
+ * │ │ ┌mantissa
+ * │ │ │
+ * │┌─┴─┐┌─┴──────┐
+ * 0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+     union {
+         float f;
+         uint32_t i;
+     } u;
+     u.i = (uint32_t)h.bits << 16;
+     return u.f;
+ }
+
+ /**
+ * Converts float32 to brain16.
+ *
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
+ * This code should vectorize nicely if using modern compilers.
+ */
+ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+     ggml_bf16_t h;
+     union {
+         float f;
+         uint32_t i;
+     } u;
+     u.f = s;
+     if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+         h.bits = (u.i >> 16) | 64; /* force to quiet */
+         return h;
+     }
+     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+     return h;
+ }
+
+ #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+ #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
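// Illustrative sketch, not part of this commit: exercising the bf16 helpers defined above.
// Assumes ggml-cpu-impl.h (and therefore ggml.h, which defines ggml_bf16_t and GGML_ASSERT)
// is in scope; the sample value is an arbitrary assumption, only the round-trip property matters.
static inline void example_bf16_roundtrip(void) {
    const float x = 3.14159265f;
    const ggml_bf16_t b = GGML_FP32_TO_BF16(x); // keep the 8 exponent bits, round the mantissa to 7 bits (nearest-even)
    const float y = GGML_BF16_TO_FP32(b); // widen by shifting the 16 stored bits into the top half of an fp32
    // with an 8-bit significand, the round-trip relative error stays below 2^-8 for normal values
    GGML_ASSERT(fabsf(x - y) <= fabsf(x) / 256.0f);
}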
+
+ // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
+ #ifndef __FMA__
+ #define __FMA__
+ #endif
+ #ifndef __F16C__
+ #define __F16C__
+ #endif
+ #endif
+
+ // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
+ #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
+ #ifndef __SSE3__
+ #define __SSE3__
+ #endif
+ #ifndef __SSSE3__
+ #define __SSSE3__
+ #endif
+ #endif
+
+ #if defined(__ARM_FEATURE_SVE)
+ #include <arm_sve.h>
+ #include <sys/prctl.h>
+ #endif
+
+ // 16-bit float
+ // on Arm, we use __fp16
+ // on x86, we use uint16_t
+ #if defined(__ARM_NEON)
+
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+ //
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+ //
+ #include <arm_neon.h>
+
+ #ifdef _MSC_VER
+
+ typedef uint16_t ggml_fp16_internal_t;
+
+ #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+ #else
+
+ typedef __fp16 ggml_fp16_internal_t;
+
+ #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+ #endif // _MSC_VER
+
+ #if !defined(__aarch64__)
+
+ // 32-bit ARM compatibility
+
+ // vaddlvq_s16
+ // vpaddq_s16
+ // vpaddq_s32
+ // vaddvq_s32
+ // vaddvq_f32
+ // vmaxvq_f32
+ // vcvtnq_s32_f32
+ // vzip1_u8
+ // vzip2_u8
+
+ inline static int32_t vaddlvq_s16(int16x8_t v) {
+     int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+     return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
+ }
+
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+     return vcombine_s16(a0, b0);
+ }
+
+ inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+     int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+     int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+     return vcombine_s32(a0, b0);
+ }
+
+ inline static int32_t vaddvq_s32(int32x4_t v) {
+     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+ }
+
+ inline static float vaddvq_f32(float32x4_t v) {
+     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+ }
+
+ inline static float vmaxvq_f32(float32x4_t v) {
+     return
+         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+ }
+
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+     int32x4_t res;
+
+     res[0] = roundf(vgetq_lane_f32(v, 0));
+     res[1] = roundf(vgetq_lane_f32(v, 1));
+     res[2] = roundf(vgetq_lane_f32(v, 2));
+     res[3] = roundf(vgetq_lane_f32(v, 3));
+
+     return res;
+ }
+
+ inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+     uint8x8_t res;
+
+     res[0] = a[0]; res[1] = b[0];
+     res[2] = a[1]; res[3] = b[1];
+     res[4] = a[2]; res[5] = b[2];
+     res[6] = a[3]; res[7] = b[3];
+
+     return res;
+ }
+
+ inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+     uint8x8_t res;
+
+     res[0] = a[4]; res[1] = b[4];
+     res[2] = a[5]; res[3] = b[5];
+     res[4] = a[6]; res[5] = b[6];
+     res[6] = a[7]; res[7] = b[7];
+
+     return res;
+ }
+
+ // vld1q_s16_x2
+ // vld1q_u8_x2
+ // vld1q_u8_x4
+ // vld1q_s8_x2
+ // vld1q_s8_x4
+ // TODO: double-check these work correctly
+
+ typedef struct ggml_int16x8x2_t {
+     int16x8_t val[2];
+ } ggml_int16x8x2_t;
+
+ inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+     ggml_int16x8x2_t res;
+
+     res.val[0] = vld1q_s16(ptr + 0);
+     res.val[1] = vld1q_s16(ptr + 8);
+
+     return res;
+ }
+
+ typedef struct ggml_uint8x16x2_t {
+     uint8x16_t val[2];
+ } ggml_uint8x16x2_t;
+
+ inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+     ggml_uint8x16x2_t res;
+
+     res.val[0] = vld1q_u8(ptr + 0);
+     res.val[1] = vld1q_u8(ptr + 16);
+
+     return res;
+ }
+
+ typedef struct ggml_uint8x16x4_t {
+     uint8x16_t val[4];
+ } ggml_uint8x16x4_t;
+
+ inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+     ggml_uint8x16x4_t res;
+
+     res.val[0] = vld1q_u8(ptr + 0);
+     res.val[1] = vld1q_u8(ptr + 16);
+     res.val[2] = vld1q_u8(ptr + 32);
+     res.val[3] = vld1q_u8(ptr + 48);
+
+     return res;
+ }
+
+ typedef struct ggml_int8x16x2_t {
+     int8x16_t val[2];
+ } ggml_int8x16x2_t;
+
+ inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+     ggml_int8x16x2_t res;
+
+     res.val[0] = vld1q_s8(ptr + 0);
+     res.val[1] = vld1q_s8(ptr + 16);
+
+     return res;
+ }
+
+ typedef struct ggml_int8x16x4_t {
+     int8x16_t val[4];
+ } ggml_int8x16x4_t;
+
+ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+     ggml_int8x16x4_t res;
+
+     res.val[0] = vld1q_s8(ptr + 0);
+     res.val[1] = vld1q_s8(ptr + 16);
+     res.val[2] = vld1q_s8(ptr + 32);
+     res.val[3] = vld1q_s8(ptr + 48);
+
+     return res;
+ }
+
+ // NOTE: not tested
+ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+     int8x16_t res;
+
+     res[ 0] = a[b[ 0]];
+     res[ 1] = a[b[ 1]];
+     res[ 2] = a[b[ 2]];
+     res[ 3] = a[b[ 3]];
+     res[ 4] = a[b[ 4]];
+     res[ 5] = a[b[ 5]];
+     res[ 6] = a[b[ 6]];
+     res[ 7] = a[b[ 7]];
+     res[ 8] = a[b[ 8]];
+     res[ 9] = a[b[ 9]];
+     res[10] = a[b[10]];
+     res[11] = a[b[11]];
+     res[12] = a[b[12]];
+     res[13] = a[b[13]];
+     res[14] = a[b[14]];
+     res[15] = a[b[15]];
+
+     return res;
+ }
+
+ // NOTE: not tested
+ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+     uint8x16_t res;
+
+     res[ 0] = a[b[ 0]];
+     res[ 1] = a[b[ 1]];
+     res[ 2] = a[b[ 2]];
+     res[ 3] = a[b[ 3]];
+     res[ 4] = a[b[ 4]];
+     res[ 5] = a[b[ 5]];
+     res[ 6] = a[b[ 6]];
+     res[ 7] = a[b[ 7]];
+     res[ 8] = a[b[ 8]];
+     res[ 9] = a[b[ 9]];
+     res[10] = a[b[10]];
+     res[11] = a[b[11]];
+     res[12] = a[b[12]];
+     res[13] = a[b[13]];
+     res[14] = a[b[14]];
+     res[15] = a[b[15]];
+
+     return res;
+ }
+
+ #else
+
+ #define ggml_int16x8x2_t int16x8x2_t
+ #define ggml_uint8x16x2_t uint8x16x2_t
+ #define ggml_uint8x16x4_t uint8x16x4_t
+ #define ggml_int8x16x2_t int8x16x2_t
+ #define ggml_int8x16x4_t int8x16x4_t
+
+ #define ggml_vld1q_s16_x2 vld1q_s16_x2
+ #define ggml_vld1q_u8_x2 vld1q_u8_x2
+ #define ggml_vld1q_u8_x4 vld1q_u8_x4
+ #define ggml_vld1q_s8_x2 vld1q_s8_x2
+ #define ggml_vld1q_s8_x4 vld1q_s8_x4
+ #define ggml_vqtbl1q_s8 vqtbl1q_s8
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
+
+ #endif // !defined(__aarch64__)
+
+ #if !defined(__ARM_FEATURE_DOTPROD)
+
+ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+     const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+     const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+     return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+ }
+
+ #else
+
+ #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+ #endif // !defined(__ARM_FEATURE_DOTPROD)
+
+ #endif // defined(__ARM_NEON)
+
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+ #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+     ggml_fp16_internal_t tmp;
+     memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+     return (float)tmp;
+ }
+
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+     ggml_fp16_t res;
+     ggml_fp16_internal_t tmp = f;
+     memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+     return res;
+ }
+
+ #else
+
+ #ifdef __wasm_simd128__
+ #include <wasm_simd128.h>
+ #else
+ #ifdef __POWER9_VECTOR__
+ #include <altivec.h>
+ #undef bool
+ #define bool _Bool
+ #else
+ #if defined(_MSC_VER) || defined(__MINGW32__)
+ #include <intrin.h>
+ #else
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+ #if !defined(__riscv)
+ #include <immintrin.h>
+ #endif
+ #endif
+ #endif
+ #endif
+ #endif
+
+ #ifdef __riscv_v_intrinsic
+ #include <riscv_vector.h>
+ #endif
+
+ #if defined(__loongarch64)
+ #if defined(__loongarch_asx)
+ #include <lasxintrin.h>
+ #endif
+ #if defined(__loongarch_sx)
+ #include <lsxintrin.h>
+ #endif
+ #endif
+
+ #if defined(__loongarch_asx)
+
+ typedef union {
+     int32_t i;
+     float f;
+ } ft_union;
+
+ /* float type data load instructions */
+ static __m128 __lsx_vreplfr2vr_s(float val) {
+     ft_union fi_tmpval = {.f = val};
+     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ }
+
+ static __m256 __lasx_xvreplfr2vr_s(float val) {
+     ft_union fi_tmpval = {.f = val};
+     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ }
+ #endif
+
+ #ifdef __F16C__
+
+ #ifdef _MSC_VER
+ #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+ #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+ #else
+ #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+ #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+ #endif
+
+ #elif defined(__POWER9_VECTOR__)
+
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+ /* the inline asm below is about 12% faster than the lookup method */
+ #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+ #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+     register float f;
+     register double d;
+     __asm__(
+         "mtfprd %0,%2\n"
+         "xscvhpdp %0,%0\n"
+         "frsp %1,%0\n" :
+         /* temp */ "=d"(d),
+         /* out */  "=f"(f):
+         /* in */   "r"(h));
+     return f;
+ }
+
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+     register double d;
+     register ggml_fp16_t r;
+     __asm__( /* xscvdphp can work on double or single precision */
+         "xscvdphp %0,%2\n"
+         "mffprd %1,%0\n" :
+         /* temp */ "=d"(d),
+         /* out */  "=r"(r):
+         /* in */   "f"(f));
+     return r;
+ }
+
+ #else
+
+ // FP16 <-> FP32
+ // ref: https://github.com/Maratyszcza/FP16
+
+ static inline float fp32_from_bits(uint32_t w) {
+     union {
+         uint32_t as_bits;
+         float as_value;
+     } fp32;
+     fp32.as_bits = w;
+     return fp32.as_value;
+ }
+
+ static inline uint32_t fp32_to_bits(float f) {
+     union {
+         float as_value;
+         uint32_t as_bits;
+     } fp32;
+     fp32.as_value = f;
+     return fp32.as_bits;
+ }
+
+ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+     const uint32_t w = (uint32_t) h << 16;
+     const uint32_t sign = w & UINT32_C(0x80000000);
+     const uint32_t two_w = w + w;
+
+     const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+     const float exp_scale = 0x1.0p-112f;
+ #else
+     const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+ #endif
+     const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+     const uint32_t magic_mask = UINT32_C(126) << 23;
+     const float magic_bias = 0.5f;
+     const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+     const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+     const uint32_t result = sign |
+         (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+     return fp32_from_bits(result);
+ }
+
+ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+     const float scale_to_inf = 0x1.0p+112f;
+     const float scale_to_zero = 0x1.0p-110f;
+ #else
+     const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+     const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+ #endif
+     float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+     const uint32_t w = fp32_to_bits(f);
+     const uint32_t shl1_w = w + w;
+     const uint32_t sign = w & UINT32_C(0x80000000);
+     uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+     if (bias < UINT32_C(0x71000000)) {
+         bias = UINT32_C(0x71000000);
+     }
+
+     base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+     const uint32_t bits = fp32_to_bits(base);
+     const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+     const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+     const uint32_t nonsign = exp_bits + mantissa_bits;
+     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+ }
+
+ #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+ #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+ #endif // __F16C__
+
+ #endif // defined(__ARM_NEON) && !defined(_MSC_VER)
+
+ #ifdef __ARM_FEATURE_SVE
+ #include <arm_sve.h>
+ #endif // __ARM_FEATURE_SVE
+
+ // precomputed f32 table for f16 (256 KB)
+ // defined in ggml.c, initialized in ggml_init()
+ extern float ggml_table_f32_f16[1 << 16];
+
+ // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+ // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+ // This is also true for POWER9.
+ #if !defined(GGML_FP16_TO_FP32)
+ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+     uint16_t s;
+     memcpy(&s, &f, sizeof(uint16_t));
+     return ggml_table_f32_f16[s];
+ }
+
+ #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+ #endif
+
+ #if !defined(GGML_FP32_TO_FP16)
+ #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+ #endif
+
+ #ifdef __cplusplus
+ }
+ #endif
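For context (not part of the commit), a minimal sketch of how CPU code is expected to consume the FP16 macros collected in this header: on the generic path GGML_FP16_TO_FP32 resolves to the ggml_table_f32_f16 lookup (so ggml_init() must have run), while on NEON/F16C/POWER9 it maps to a direct conversion; the call sites look the same either way. The helper names below are assumptions for illustration.

#include "ggml.h"
#include "ggml-cpu-impl.h" // internal header added by this commit

// Hypothetical helper: widen an fp16 row to fp32 before a scalar fallback kernel runs.
static void row_fp16_to_fp32(const ggml_fp16_t * restrict x, float * restrict y, int n) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP16_TO_FP32(x[i]); // table lookup or native conversion, depending on the build
    }
}

// The reverse direction goes through the compute-based conversion on every path in this header.
static void row_fp32_to_fp16(const float * restrict x, ggml_fp16_t * restrict y, int n) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_FP32_TO_FP16(x[i]);
    }
}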
scripts/sync-ggml-am.sh CHANGED
@@ -98,9 +98,9 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then

   # replace filenames:
   #
-  # CMakelists.txt -> ggml/CMakeLists.txt
-  # src/CMakeLists.txt -> ggml/src/CMakeLists.txt
-  # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
+  # CMakelists.txt -> ggml/CMakeLists.txt
+  # src/CMakeLists.txt -> ggml/src/CMakeLists.txt
+  # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
   #
   # src/ggml.c -> ggml/src/ggml.c
   # src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c
@@ -112,6 +112,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
   # src/ggml-cann/* -> ggml/src/ggml-cann/
   # src/ggml-cann.cpp -> ggml/src/ggml-cann.cpp
   # src/ggml-common.h -> ggml/src/ggml-common.h
+  # src/ggml-cpu-impl.h -> ggml/src/ggml-cpu-impl.h
   # src/ggml-cuda/* -> ggml/src/ggml-cuda/
   # src/ggml-cuda.cu -> ggml/src/ggml-cuda.cu
   # src/ggml-impl.h -> ggml/src/ggml-impl.h
@@ -137,10 +138,10 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
   # include/ggml-sycl.h -> ggml/include/ggml-sycl.h
   # include/ggml-vulkan.h -> ggml/include/ggml-vulkan.h
   #
-  # examples/common.h -> examples/common.h
-  # examples/common.cpp -> examples/common.cpp
-  # examples/common-ggml.h -> examples/common-ggml.h
-  # examples/common-ggml.cpp -> examples/common-ggml.cpp
+  # examples/common.h -> examples/common.h
+  # examples/common.cpp -> examples/common.cpp
+  # examples/common-ggml.h -> examples/common-ggml.h
+  # examples/common-ggml.cpp -> examples/common-ggml.cpp
   #
   # LICENSE -> LICENSE
   # ggml/scripts/gen-authors.sh -> scripts/gen-authors.sh
@@ -159,6 +160,7 @@ if [ -f $SRC_WHISPER/ggml-src.patch ]; then
   -e 's/(^[[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
   -e 's/(^[[:space:]]|[ab]\/)src\/ggml-cann\.cpp/\1ggml\/src\/ggml-cann.cpp/g' \
   -e 's/(^[[:space:]]|[ab]\/)src\/ggml-common\.h/\1ggml\/src\/ggml-common.h/g' \
+  -e 's/(^[[:space:]]|[ab]\/)src\/ggml-cpu-impl\.h/\1ggml\/src\/ggml-cpu-impl.h/g' \
   -e 's/(^[[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
   -e 's/(^[[:space:]]|[ab]\/)src\/ggml-cuda\.cu/\1ggml\/src\/ggml-cuda.cu/g' \
   -e 's/(^[[:space:]]|[ab]\/)src\/ggml-impl\.h/\1ggml\/src\/ggml-impl.h/g' \
scripts/sync-ggml.sh CHANGED
@@ -14,6 +14,7 @@ cp -rpv ../ggml/src/ggml-blas.cpp ./ggml/src/ggml-blas.cpp
   cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/
   cp -rpv ../ggml/src/ggml-cann.cpp ./ggml/src/ggml-cann.cpp
   cp -rpv ../ggml/src/ggml-common.h ./ggml/src/ggml-common.h
+  cp -rpv ../ggml/src/ggml-cpu-impl.h ./ggml/src/ggml-cpu-impl.h
   cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/
   cp -rpv ../ggml/src/ggml-cuda.cu ./ggml/src/ggml-cuda.cu
   cp -rpv ../ggml/src/ggml-impl.h ./ggml/src/ggml-impl.h