  * @brief GGML kernels for FP16 activation flow
  */

-#include "ggml-common.h"
-#include "ggml-cpu-quants.h"
-#include "ggml-cpu.h"
-#include "ggml-quants.h"
-#include "ggml.h"
+#include <ggml_interface.h>
+#include <nntr_ggml_impl.h>
+#include <nntr_ggml_impl_common.h>
+
 #include <algorithm>
 #include <assert.h>
 #include <bs_thread_pool_manager.hpp>
-
 #include <cmath>
 #include <cstring>
-#include <ggml_interface.h>
 #include <iostream>
 #include <math.h>
-#include <nntr_ggml_impl.h>
 #include <stdint.h>

 #if defined(__ARM_NEON)
 #include <arm_neon.h>
 #endif

-#ifndef MAX
-#define MAX(x, y) (((x) > (y)) ? (x) : (y))
-#endif
-#ifndef MIN
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-#endif
-
 namespace nntrainer {

-template <int K> constexpr int QK_0() {
-  if constexpr (K == 4) {
-    return QK4_0;
-  }
-  if constexpr (K == 8) {
-    return QK8_0;
-  }
-  return -1;
-}
-/**
- * @brief block of 0-quantization
- *
- * @tparam K quant bit
- * @tparam N number of blocks to be packed
- */
-template <int K, int N> struct block {
-  uint16_t d[N];                      // deltas for N qK_0 blocks
-  int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
-};
-
-using block_q8_0x4 = block<8, 4>;
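The removed `block<K, N>` template was the packed unit consumed by the interleaved GEMM path: N quantization blocks share an array of fp16 deltas followed by their quants (the definition presumably now comes from the nntr_ggml_impl headers added above, since block_q8_0x4 is still used later in this file). A minimal standalone sketch of the layout arithmetic, assuming ggml's usual block lengths QK4_0 = QK8_0 = 32:

```cpp
#include <cstdint>

constexpr int QK4_0 = 32; // q4_0 block length, as in ggml-common.h
constexpr int QK8_0 = 32; // q8_0 block length, as in ggml-common.h

template <int K> constexpr int QK_0() {
  if constexpr (K == 4)
    return QK4_0;
  if constexpr (K == 8)
    return QK8_0;
  return -1;
}

template <int K, int N> struct block {
  uint16_t d[N];                      // one fp16 delta per packed block
  int8_t qs[(QK_0<K>() * N * K) / 8]; // K-bit quants for N blocks
};

// block<8, 4>: 4 deltas (8 bytes) + 4 * 32 int8 quants (128 bytes) = 136 bytes
static_assert(sizeof(block<8, 4>) == 4 * sizeof(uint16_t) + 4 * QK8_0);
```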
-#if defined(__ARM_NEON)
-static inline float nntr_compute_fp16_to_fp32(uint16_t h) {
-  _FP16 tmp;
-  memcpy(&tmp, &h, sizeof(uint16_t));
-  return (float)tmp;
-}
-
-static inline uint16_t nntr_compute_fp32_to_fp16(float f) {
-  uint16_t res;
-  _FP16 tmp = f;
-  memcpy(&res, &tmp, sizeof(uint16_t));
-  return res;
-}
-#else
-static inline float fp32_from_bits(uint32_t w) {
-  union {
-    uint32_t as_bits;
-    float as_value;
-  } fp32;
-  fp32.as_bits = w;
-  return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-  union {
-    float as_value;
-    uint32_t as_bits;
-  } fp32;
-  fp32.as_value = f;
-  return fp32.as_bits;
-}
-
-static inline float nntr_compute_fp16_to_fp32(uint16_t h) {
-  const uint32_t w = (uint32_t)h << 16;
-  const uint32_t sign = w & UINT32_C(0x80000000);
-  const uint32_t two_w = w + w;
-
-  const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) ||            \
-     defined(__GNUC__) && !defined(__STRICT_ANSI__)) &&                       \
-  (!defined(__cplusplus) || __cplusplus >= 201703L)
-  const float exp_scale = 0x1.0p-112f;
-#else
-  const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-  const float normalized_value =
-    fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-  const uint32_t magic_mask = UINT32_C(126) << 23;
-  const float magic_bias = 0.5f;
-  const float denormalized_value =
-    fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-  const uint32_t result =
-    sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value)
-                                        : fp32_to_bits(normalized_value));
-  return fp32_from_bits(result);
-}
-
-static inline uint16_t nntr_compute_fp32_to_fp16(float f) {
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) ||            \
-     defined(__GNUC__) && !defined(__STRICT_ANSI__)) &&                       \
-  (!defined(__cplusplus) || __cplusplus >= 201703L)
-  const float scale_to_inf = 0x1.0p+112f;
-  const float scale_to_zero = 0x1.0p-110f;
-#else
-  const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-  const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-  float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-  const uint32_t w = fp32_to_bits(f);
-  const uint32_t shl1_w = w + w;
-  const uint32_t sign = w & UINT32_C(0x80000000);
-  uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-  if (bias < UINT32_C(0x71000000)) {
-    bias = UINT32_C(0x71000000);
-  }
-
-  base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-  const uint32_t bits = fp32_to_bits(base);
-  const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-  const uint32_t nonsign = exp_bits + mantissa_bits;
-  return (sign >> 16) |
-         (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-#endif
-
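The removed fallback pair is the classic bit-twiddling fp16/fp32 conversion (the same scheme ggml uses), and its defining property is that every non-NaN half value survives a round trip exactly. A quick exhaustive check, as a sketch that assumes the two helpers above are still in scope:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t h = 0; h <= 0xFFFF; ++h) {
    const float f = nntr_compute_fp16_to_fp32((uint16_t)h);
    // NaNs are canonicalized to 0x7E00 by the encoder, so their payload
    // does not round-trip; every other encoding must come back bit-exact.
    const bool is_nan = ((h & 0x7C00u) == 0x7C00u) && ((h & 0x03FFu) != 0);
    if (!is_nan)
      assert(nntr_compute_fp32_to_fp16(f) == (uint16_t)h);
  }
  return 0;
}
```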
-static inline int nearest_int(float fval) {
-  assert(fabsf(fval) <= 4194303.f);
-  float val = fval + 12582912.f;
-  int i;
-  memcpy(&i, &val, sizeof(int));
-  return (i & 0x007fffff) - 0x00400000;
-}
-
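nearest_int() relies on the magic-constant rounding trick: adding 1.5 * 2^23 (12582912.f) lands the rounded integer in the low mantissa bits, so masking 23 bits and subtracting 2^22 (0x00400000) recovers a signed result with the FPU's round-to-nearest-even semantics; the assert bounds inputs to the usable range. A sketch of the behavior, assuming the helper above is in scope:

```cpp
#include <cassert>

int main() {
  // Round-to-nearest-even, inherited from the default FP rounding mode:
  assert(nearest_int(2.5f) == 2);   // tie rounds to even, not away from zero
  assert(nearest_int(3.5f) == 4);
  assert(nearest_int(-3.7f) == -4); // a plain (int) cast would truncate to -3
  return 0;
}
```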
 static inline void __copy_f16_from_f32(const float *src, _FP16 *dst,
                                        int64_t k) {
 #if defined(__ARM_NEON)
@@ -252,7 +122,7 @@ size_t __ggml_quantize_q8_0(const _FP16 *src, void *dst, int64_t nrow,
   return nrow * row_size;
 }

-void __nntr_dequantize_row_q8_0(const void *_x, _FP16 *__restrict y,
+void __ggml_dequantize_row_q8_0(const void *_x, _FP16 *__restrict y,
                                 int64_t k) {
   static const int qk = QK8_0;
   const block_q8_0 *__restrict x = (const block_q8_0 *__restrict)_x;
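For orientation, the q8_0 format this function decodes stores one fp16 scale plus 32 signed 8-bit quants per block, so dequantization is a single scale-multiply per element. A sketch of the layout and the scalar math, following ggml's definitions (the struct name here is hypothetical, to avoid clashing with the real block_q8_0):

```cpp
// Layout assumed from ggml-common.h: fp16 delta followed by QK8_0 quants.
typedef struct {
  uint16_t d;       // delta (scale), stored as fp16
  int8_t qs[QK8_0]; // quants; original value ~= d * qs[j]
} block_q8_0_sketch;

// Quantize:   d = amax / 127;  qs[j] = nearest_int(x[j] / d)
// Dequantize: y[i * QK8_0 + j] = (_FP16)(fp16_to_fp32(x[i].d) * x[i].qs[j])
```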
@@ -271,13 +141,13 @@ void __nntr_dequantize_row_q8_0(const void *_x, _FP16 *__restrict y,
   }
 }

-static void __nntr_quantize_mat_q8_0_4x8(const _FP16 *GGML_RESTRICT x,
-                                         void *GGML_RESTRICT vy, int64_t k) {
+static void __ggml_quantize_mat_q8_0_4x8(const _FP16 *__restrict x,
+                                         void *__restrict vy, int64_t k) {
   assert(QK8_0 == 32);
   assert(k % QK8_0 == 0);
   const int nb = k / QK8_0;

-  block_q8_0x4 *GGML_RESTRICT y = (block_q8_0x4 *)vy;
+  block_q8_0x4 *__restrict y = (block_q8_0x4 *)vy;

 #if defined(__ARM_NEON)
   float16x8_t srcv[4][4];
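The NEON body that follows packs four rows at a time into block_q8_0x4, interleaving the quants in 8-byte lanes so the GEMM microkernel can load corresponding chunks of all four rows contiguously. As a reading aid, a scalar sketch of the same transform, modeled on ggml's reference quantize_mat_q8_0_4x8 and adapted to _FP16 input; it is not the code compiled here, and it assumes the fp32-to-fp16 helper is available:

```cpp
static void quantize_mat_q8_0_4x8_scalar(const _FP16 *x, void *vy, int64_t k) {
  const int nb = k / QK8_0; // 32-wide blocks per row
  block_q8_0x4 *y = (block_q8_0x4 *)vy;

  for (int i = 0; i < nb; i++) {
    float srcv[4][QK8_0], id[4];
    // One fp16 scale per row for this block.
    for (int row = 0; row < 4; row++) {
      float amax = 0.0f;
      for (int j = 0; j < QK8_0; j++) {
        srcv[row][j] = (float)x[row * k + i * QK8_0 + j];
        amax = std::max(amax, fabsf(srcv[row][j]));
      }
      const float d = amax / 127.f;
      id[row] = d ? 1.0f / d : 0.0f;
      y[i].d[row] = nntr_compute_fp32_to_fp16(d);
    }
    // Interleave in 8-byte lanes: row0[0..7], row1[0..7], row2[0..7],
    // row3[0..7], row0[8..15], row1[8..15], ...
    for (int j = 0; j < QK8_0 * 4; j++) {
      const int src_offset = (j / 32) * 8 + (j % 8);
      const int src_id = (j % 32) / 8;
      y[i].qs[j] = (int8_t)roundf(srcv[src_id][src_offset] * id[src_id]);
    }
  }
}
```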
@@ -387,11 +257,11 @@ static void __nntr_quantize_mat_q8_0_4x8(const _FP16 *GGML_RESTRICT x,
 }

 template <>
-void __ggml_dequantize_row_q8_K(const void *GGML_RESTRICT _x,
-                                _FP16 *GGML_RESTRICT y, int64_t k) {
+void __ggml_dequantize_row_q8_K(const void *__restrict _x, _FP16 *__restrict y,
+                                int64_t k) {
   assert(k % QK_K == 0);
   const int64_t nb = k / QK_K;
-  const block_q8_K *GGML_RESTRICT x = (const block_q8_K *GGML_RESTRICT)_x;
+  const block_q8_K *__restrict x = (const block_q8_K *__restrict)_x;

   for (int i = 0; i < nb; i++) {
     for (int j = 0; j < QK_K; ++j) {
@@ -400,11 +270,11 @@ void __ggml_dequantize_row_q8_K(const void *GGML_RESTRICT _x,
   }
 }

-void __ggml_quantize_row_q8_K_ref(const _FP16 *GGML_RESTRICT x,
-                                  void *GGML_RESTRICT _y, int64_t k) {
+void __ggml_quantize_row_q8_K_ref(const _FP16 *__restrict x,
+                                  void *__restrict _y, int64_t k) {
   assert(k % QK_K == 0);
   const int64_t nb = k / QK_K;
-  block_q8_K *GGML_RESTRICT y = (block_q8_K *GGML_RESTRICT)_y;
+  block_q8_K *__restrict y = (block_q8_K *__restrict)_y;

   for (int i = 0; i < nb; i++) {

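Both q8_K routines here fill ggml's 256-wide activation format. For orientation, its assumed layout (QK_K = 256), shown under a hypothetical name:

```cpp
// Assumed from ggml-common.h; shown for orientation only.
typedef struct {
  float d;                  // delta (full fp32 scale)
  int8_t qs[QK_K];          // quants
  int16_t bsums[QK_K / 16]; // sums of quants in groups of 16, used by vec_dot
} block_q8_K_sketch;
```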
@@ -444,8 +314,8 @@ void __ggml_quantize_row_q8_K_ref(const _FP16 *GGML_RESTRICT x,
 }

 template <>
-void __ggml_quantize_row_q8_K(const _FP16 *GGML_RESTRICT x,
-                              void *GGML_RESTRICT y, int64_t k) {
+void __ggml_quantize_row_q8_K(const _FP16 *__restrict x, void *__restrict y,
+                              int64_t k) {
   __ggml_quantize_row_q8_K_ref(x, y, k);
 }

@@ -483,7 +353,7 @@ void __ggml_gemm_q6_K(const unsigned int M, const unsigned int N,

       const void *const B_data = (void *)((char *)B + B_row_data_offset);

-      ggml_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job], bs, B_data, bx,
+      nntr_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job], bs, B_data, bx,
                              quantized_A_data, by, nrc);
     }
   } else { // GEMM
@@ -507,7 +377,7 @@ void __ggml_gemm_q6_K(const unsigned int M, const unsigned int N,
         const int32_t B_row_data_offset = B_row_size * j;
         const void *const B_data = (void *)((char *)B + B_row_data_offset);

-        ggml_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job * ldc + j], bs, B_data,
+        nntr_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job * ldc + j], bs, B_data,
                                bx, A_data, by, nrc);
       }
     }
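The only change in these q6_K hunks is the kernel's prefix: the dot product now resolves to nntr_vec_dot_q6_K_q8_K. Its signature presumably mirrors ggml's ggml_vec_dot_q6_K_q8_K:

```cpp
// Assumed to match ggml's ggml_vec_dot_q6_K_q8_K signature.
void nntr_vec_dot_q6_K_q8_K(int n,          // elements in the dot product
                            float *s,       // out: result(s)
                            size_t bs,      // stride of s in results
                            const void *vx, // q6_K row
                            size_t bx,      // stride of vx in rows
                            const void *vy, // q8_K row
                            size_t by,      // stride of vy in rows
                            int nrc);       // number of result columns
```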
@@ -534,7 +404,7 @@ static inline void __ggml_q4_0_4x8_q8_0_GEMM_BSTP(

   // Quantize 4-divisible-M row portion with matrix-wise function
   for (unsigned int i = 0; i < M4; i++) {
-    __nntr_quantize_mat_q8_0_4x8(A + 4 * i * K, QA.data() + i * qa_4_rows_size,
+    __ggml_quantize_mat_q8_0_4x8(A + 4 * i * K, QA.data() + i * qa_4_rows_size,
                                  K);
   }
   // Quantize leftover 1 ~ 3 rows with row-wise function
@@ -636,7 +506,7 @@ void __ggml_q4_0_4x8_q8_0_GEMM(const unsigned int M, const unsigned int N,

   // Quantize 4-divisible-M row portion with matrix-wise function
   for (unsigned int i = 0; i < M4; i++) {
-    __nntr_quantize_mat_q8_0_4x8(A + 4 * i * K,
+    __ggml_quantize_mat_q8_0_4x8(A + 4 * i * K,
                                  (void *)(QA.data() + i * qa_4_rows_size), K);
   }
   // Quantize leftover 1 ~ 3 rows with row-wise function
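Both GEMM entry points split the M activation rows the same way: M / 4 quartets go through the interleaved quantizer above, and the remaining 1 ~ 3 rows take the row-wise q8_0 path. A hypothetical reconstruction of the QA scratch sizing implied by these loops (names and formula are assumptions, not taken from the diff):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical sizing for the quantized-A scratch buffer: M4 row-quartets of
// interleaved block_q8_0x4, then up to 3 leftover rows of plain block_q8_0.
static std::vector<char> make_qa_buffer(unsigned int M, unsigned int K) {
  const unsigned int M4 = M / 4;
  const size_t qa_4_rows_size = (K / QK8_0) * sizeof(block_q8_0x4);
  const size_t qa_row_size = (K / QK8_0) * sizeof(block_q8_0);
  return std::vector<char>(M4 * qa_4_rows_size + (M % 4) * qa_row_size);
}
```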