Spaces:
Running
Running
// GGML internal header
| extern "C" { | |
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
/* the inline asm below is about 12% faster than the lookup method */
// Convert an FP16 value to FP32 using Power ISA VSX scalar instructions
// (per the comment above, ~12% faster than the table lookup on this platform).
// NOTE(review): mtfprd/xscvhpdp are POWER9-era VSX instructions — this body is
// presumably selected by a platform #ifdef that is outside this view; confirm.
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"   // move the 16 raw half bits from a GPR into a VSX register
        "xscvhpdp %0,%0\n" // convert half precision -> double precision in place
        "frsp %1,%0\n" :   // round the double down to single precision
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
}
// Convert an FP32 value to FP16 using Power ISA VSX scalar instructions.
// NOTE(review): companion of the asm ggml_compute_fp16_to_fp32 above — also
// presumably guarded by a platform #ifdef outside this view; confirm.
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n" // convert the input float to half precision (in a VSX register)
        "mffprd %1,%0\n" : // move the resulting bits back out to a GPR
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
}
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
// Reinterpret a 32-bit pattern as an IEEE-754 single-precision float.
// Type punning is done through a union, which is well-defined in C
// (unlike a pointer cast, which would violate strict aliasing).
static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t bits;
        float    value;
    } pun = { .bits = w };
    return pun.value;
}
// Return the raw 32-bit pattern of an IEEE-754 single-precision float.
// Inverse of fp32_from_bits; same union-based punning, no aliasing issues.
static inline uint32_t fp32_to_bits(float f) {
    union {
        float    value;
        uint32_t bits;
    } pun = { .value = f };
    return pun.bits;
}
| static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { | |
| const uint32_t w = (uint32_t) h << 16; | |
| const uint32_t sign = w & UINT32_C(0x80000000); | |
| const uint32_t two_w = w + w; | |
| const uint32_t exp_offset = UINT32_C(0xE0) << 23; | |
| const float exp_scale = 0x1.0p-112f; | |
| const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); | |
| const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; | |
| const uint32_t magic_mask = UINT32_C(126) << 23; | |
| const float magic_bias = 0.5f; | |
| const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; | |
| const uint32_t denormalized_cutoff = UINT32_C(1) << 27; | |
| const uint32_t result = sign | | |
| (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); | |
| return fp32_from_bits(result); | |
| } | |
| static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { | |
| const float scale_to_inf = 0x1.0p+112f; | |
| const float scale_to_zero = 0x1.0p-110f; | |
| const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); | |
| const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); | |
| float base = (fabsf(f) * scale_to_inf) * scale_to_zero; | |
| const uint32_t w = fp32_to_bits(f); | |
| const uint32_t shl1_w = w + w; | |
| const uint32_t sign = w & UINT32_C(0x80000000); | |
| uint32_t bias = shl1_w & UINT32_C(0xFF000000); | |
| if (bias < UINT32_C(0x71000000)) { | |
| bias = UINT32_C(0x71000000); | |
| } | |
| base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; | |
| const uint32_t bits = fp32_to_bits(base); | |
| const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); | |
| const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); | |
| const uint32_t nonsign = exp_bits + mantissa_bits; | |
| return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); | |
| } | |
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.

// Look up the FP32 value for an FP16 bit pattern in the precomputed table.
// memcpy (not a cast) fetches the raw 16 bits, which stays correct even when
// ggml_fp16_t is a native half type (__fp16 on Arm, per the comment above).
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}
// hash set of tensor pointers — presumably open-addressed; see ggml_hash_find. TODO confirm in ggml.c
// returns whether the key is already stored in the hash set
bool ggml_hash_contains       (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find         (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert       (      struct ggml_hash_set hash_set, struct ggml_tensor * key);

// return index, asserts if table is full
size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
| } | |