Skip to content

Commit 2882b42

Browse files
committed
Made ggml_interface_fp16 independent
Removed ggml includes, renamed some functions, moved some declarations to common nntr_ggml_impl headers Signed-off-by: p-debski2 <[email protected]>
1 parent fabbb0d commit 2882b42

File tree

6 files changed

+94
-223
lines changed

6 files changed

+94
-223
lines changed

nntrainer/tensor/cpu_backend/arm/arm_compute_backend_fp16.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ size_t quantize_q8_0(const _FP16 *src, void *dst, int64_t nrow,
330330

331331
template <> void dequantize_row_q8_0(const void *x_raw, _FP16 *y, int64_t k) {
332332
#ifdef ENABLE_GGML
333-
__nntr_dequantize_row_q8_0(x_raw, y, k);
333+
__ggml_dequantize_row_q8_0(x_raw, y, k);
334334
#else
335335
__fallback_dequantize_row_q8_0(x_raw, y, k);
336336
#endif

nntrainer/tensor/cpu_backend/ggml_interface/ggml_interface.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ void __ggml_dequantize_row_q4_0(const void *x_raw, float *y, int64_t k);
277277
* @param y output destination for dequantized data
278278
* @param k data length
279279
*/
280-
void __nntr_dequantize_row_q8_0(const void *x_raw, float *y, int64_t k);
280+
void __ggml_dequantize_row_q8_0(const void *x_raw, float *y, int64_t k);
281281

282282
/**
283283
* @brief q4K to float dequantize
@@ -372,7 +372,7 @@ size_t __ggml_quantize_q8_0(const _FP16 *src, void *dst, int64_t nrow,
372372
* @param y output destination for dequantized data
373373
* @param k data length
374374
*/
375-
void __nntr_dequantize_row_q8_0(const void *x_raw, _FP16 *y, int64_t k);
375+
void __ggml_dequantize_row_q8_0(const void *x_raw, _FP16 *y, int64_t k);
376376
#endif
377377
} // namespace nntrainer
378378

nntrainer/tensor/cpu_backend/ggml_interface/ggml_interface_fp16.cpp

Lines changed: 20 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -10,155 +10,25 @@
1010
* @brief GGML kernels for FP16 activation flow
1111
*/
1212

13-
#include "ggml-common.h"
14-
#include "ggml-cpu-quants.h"
15-
#include "ggml-cpu.h"
16-
#include "ggml-quants.h"
17-
#include "ggml.h"
13+
#include <ggml_interface.h>
14+
#include <nntr_ggml_impl.h>
15+
#include <nntr_ggml_impl_common.h>
16+
1817
#include <algorithm>
1918
#include <assert.h>
2019
#include <bs_thread_pool_manager.hpp>
21-
2220
#include <cmath>
2321
#include <cstring>
24-
#include <ggml_interface.h>
2522
#include <iostream>
2623
#include <math.h>
27-
#include <nntr_ggml_impl.h>
2824
#include <stdint.h>
2925

3026
#if defined(__ARM_NEON)
3127
#include <arm_neon.h>
3228
#endif
3329

34-
#ifndef MAX
35-
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
36-
#endif
37-
#ifndef MIN
38-
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
39-
#endif
40-
4130
namespace nntrainer {
4231

43-
template <int K> constexpr int QK_0() {
44-
if constexpr (K == 4) {
45-
return QK4_0;
46-
}
47-
if constexpr (K == 8) {
48-
return QK8_0;
49-
}
50-
return -1;
51-
}
52-
/**
53-
* @brief block of 0-quantization
54-
*
55-
* @tparam K quant bit
56-
* @tparam N number of blocks to be packed
57-
*/
58-
template <int K, int N> struct block {
59-
uint16_t d[N]; // deltas for N qK_0 blocks
60-
int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
61-
};
62-
63-
using block_q8_0x4 = block<8, 4>;
64-
#if defined(__ARM_NEON)
65-
static inline float nntr_compute_fp16_to_fp32(uint16_t h) {
66-
_FP16 tmp;
67-
memcpy(&tmp, &h, sizeof(uint16_t));
68-
return (float)tmp;
69-
}
70-
71-
static inline uint16_t nntr_compute_fp32_to_fp16(float f) {
72-
uint16_t res;
73-
_FP16 tmp = f;
74-
memcpy(&res, &tmp, sizeof(uint16_t));
75-
return res;
76-
}
77-
#else
78-
static inline float fp32_from_bits(uint32_t w) {
79-
union {
80-
uint32_t as_bits;
81-
float as_value;
82-
} fp32;
83-
fp32.as_bits = w;
84-
return fp32.as_value;
85-
}
86-
87-
static inline uint32_t fp32_to_bits(float f) {
88-
union {
89-
float as_value;
90-
uint32_t as_bits;
91-
} fp32;
92-
fp32.as_value = f;
93-
return fp32.as_bits;
94-
}
95-
96-
static inline float nntr_compute_fp16_to_fp32(uint16_t h) {
97-
const uint32_t w = (uint32_t)h << 16;
98-
const uint32_t sign = w & UINT32_C(0x80000000);
99-
const uint32_t two_w = w + w;
100-
101-
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
102-
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || \
103-
defined(__GNUC__) && !defined(__STRICT_ANSI__)) && \
104-
(!defined(__cplusplus) || __cplusplus >= 201703L)
105-
const float exp_scale = 0x1.0p-112f;
106-
#else
107-
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
108-
#endif
109-
const float normalized_value =
110-
fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
111-
112-
const uint32_t magic_mask = UINT32_C(126) << 23;
113-
const float magic_bias = 0.5f;
114-
const float denormalized_value =
115-
fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
116-
117-
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
118-
const uint32_t result =
119-
sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value)
120-
: fp32_to_bits(normalized_value));
121-
return fp32_from_bits(result);
122-
}
123-
124-
static inline uint16_t nntr_compute_fp32_to_fp16(float f) {
125-
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || \
126-
defined(__GNUC__) && !defined(__STRICT_ANSI__)) && \
127-
(!defined(__cplusplus) || __cplusplus >= 201703L)
128-
const float scale_to_inf = 0x1.0p+112f;
129-
const float scale_to_zero = 0x1.0p-110f;
130-
#else
131-
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
132-
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
133-
#endif
134-
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
135-
136-
const uint32_t w = fp32_to_bits(f);
137-
const uint32_t shl1_w = w + w;
138-
const uint32_t sign = w & UINT32_C(0x80000000);
139-
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
140-
if (bias < UINT32_C(0x71000000)) {
141-
bias = UINT32_C(0x71000000);
142-
}
143-
144-
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
145-
const uint32_t bits = fp32_to_bits(base);
146-
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
147-
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
148-
const uint32_t nonsign = exp_bits + mantissa_bits;
149-
return (sign >> 16) |
150-
(shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
151-
}
152-
#endif
153-
154-
static inline int nearest_int(float fval) {
155-
assert(fabsf(fval) <= 4194303.f);
156-
float val = fval + 12582912.f;
157-
int i;
158-
memcpy(&i, &val, sizeof(int));
159-
return (i & 0x007fffff) - 0x00400000;
160-
}
161-
16232
static inline void __copy_f16_from_f32(const float *src, _FP16 *dst,
16333
int64_t k) {
16434
#if defined(__ARM_NEON)
@@ -252,7 +122,7 @@ size_t __ggml_quantize_q8_0(const _FP16 *src, void *dst, int64_t nrow,
252122
return nrow * row_size;
253123
}
254124

255-
void __nntr_dequantize_row_q8_0(const void *_x, _FP16 *__restrict y,
125+
void __ggml_dequantize_row_q8_0(const void *_x, _FP16 *__restrict y,
256126
int64_t k) {
257127
static const int qk = QK8_0;
258128
const block_q8_0 *__restrict x = (const block_q8_0 *__restrict)_x;
@@ -271,13 +141,13 @@ void __nntr_dequantize_row_q8_0(const void *_x, _FP16 *__restrict y,
271141
}
272142
}
273143

274-
static void __nntr_quantize_mat_q8_0_4x8(const _FP16 *GGML_RESTRICT x,
275-
void *GGML_RESTRICT vy, int64_t k) {
144+
static void __ggml_quantize_mat_q8_0_4x8(const _FP16 *__restrict x,
145+
void *__restrict vy, int64_t k) {
276146
assert(QK8_0 == 32);
277147
assert(k % QK8_0 == 0);
278148
const int nb = k / QK8_0;
279149

280-
block_q8_0x4 *GGML_RESTRICT y = (block_q8_0x4 *)vy;
150+
block_q8_0x4 *__restrict y = (block_q8_0x4 *)vy;
281151

282152
#if defined(__ARM_NEON)
283153
float16x8_t srcv[4][4];
@@ -387,11 +257,11 @@ static void __nntr_quantize_mat_q8_0_4x8(const _FP16 *GGML_RESTRICT x,
387257
}
388258

389259
template <>
390-
void __ggml_dequantize_row_q8_K(const void *GGML_RESTRICT _x,
391-
_FP16 *GGML_RESTRICT y, int64_t k) {
260+
void __ggml_dequantize_row_q8_K(const void *__restrict _x, _FP16 *__restrict y,
261+
int64_t k) {
392262
assert(k % QK_K == 0);
393263
const int64_t nb = k / QK_K;
394-
const block_q8_K *GGML_RESTRICT x = (const block_q8_K *GGML_RESTRICT)_x;
264+
const block_q8_K *__restrict x = (const block_q8_K *__restrict)_x;
395265

396266
for (int i = 0; i < nb; i++) {
397267
for (int j = 0; j < QK_K; ++j) {
@@ -400,11 +270,11 @@ void __ggml_dequantize_row_q8_K(const void *GGML_RESTRICT _x,
400270
}
401271
}
402272

403-
void __ggml_quantize_row_q8_K_ref(const _FP16 *GGML_RESTRICT x,
404-
void *GGML_RESTRICT _y, int64_t k) {
273+
void __ggml_quantize_row_q8_K_ref(const _FP16 *__restrict x,
274+
void *__restrict _y, int64_t k) {
405275
assert(k % QK_K == 0);
406276
const int64_t nb = k / QK_K;
407-
block_q8_K *GGML_RESTRICT y = (block_q8_K * GGML_RESTRICT) _y;
277+
block_q8_K *__restrict y = (block_q8_K *__restrict)_y;
408278

409279
for (int i = 0; i < nb; i++) {
410280

@@ -444,8 +314,8 @@ void __ggml_quantize_row_q8_K_ref(const _FP16 *GGML_RESTRICT x,
444314
}
445315

446316
template <>
447-
void __ggml_quantize_row_q8_K(const _FP16 *GGML_RESTRICT x,
448-
void *GGML_RESTRICT y, int64_t k) {
317+
void __ggml_quantize_row_q8_K(const _FP16 *__restrict x, void *__restrict y,
318+
int64_t k) {
449319
__ggml_quantize_row_q8_K_ref(x, y, k);
450320
}
451321

@@ -483,7 +353,7 @@ void __ggml_gemm_q6_K(const unsigned int M, const unsigned int N,
483353

484354
const void *const B_data = (void *)((char *)B + B_row_data_offset);
485355

486-
ggml_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job], bs, B_data, bx,
356+
nntr_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job], bs, B_data, bx,
487357
quantized_A_data, by, nrc);
488358
}
489359
} else { // GEMM
@@ -507,7 +377,7 @@ void __ggml_gemm_q6_K(const unsigned int M, const unsigned int N,
507377
const int32_t B_row_data_offset = B_row_size * j;
508378
const void *const B_data = (void *)((char *)B + B_row_data_offset);
509379

510-
ggml_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job * ldc + j], bs, B_data,
380+
nntr_vec_dot_q6_K_q8_K(K, &C32_ptr[thread_job * ldc + j], bs, B_data,
511381
bx, A_data, by, nrc);
512382
}
513383
}
@@ -534,7 +404,7 @@ static inline void __ggml_q4_0_4x8_q8_0_GEMM_BSTP(
534404

535405
// Quantize 4-divisible-M row portion with matrix-wise function
536406
for (unsigned int i = 0; i < M4; i++) {
537-
__nntr_quantize_mat_q8_0_4x8(A + 4 * i * K, QA.data() + i * qa_4_rows_size,
407+
__ggml_quantize_mat_q8_0_4x8(A + 4 * i * K, QA.data() + i * qa_4_rows_size,
538408
K);
539409
}
540410
// Quantize leftover 1 ~ 3 rows with row-wise function
@@ -636,7 +506,7 @@ void __ggml_q4_0_4x8_q8_0_GEMM(const unsigned int M, const unsigned int N,
636506

637507
// Quantize 4-divisible-M row portion with matrix-wise function
638508
for (unsigned int i = 0; i < M4; i++) {
639-
__nntr_quantize_mat_q8_0_4x8(A + 4 * i * K,
509+
__ggml_quantize_mat_q8_0_4x8(A + 4 * i * K,
640510
(void *)(QA.data() + i * qa_4_rows_size), K);
641511
}
642512
// Quantize leftover 1 ~ 3 rows with row-wise function

nntrainer/tensor/cpu_backend/ggml_interface/nntr_ggml_impl/nntr_ggml_impl_common.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include <assert.h>
1818
#include <cmath>
19+
#include <cstring>
1920
#include <stdint.h>
2021

2122
typedef uint16_t nntr_fp16_t;
@@ -147,4 +148,73 @@ inline int nearest_int(float fval) {
147148
return (i & 0x007fffff) - 0x00400000;
148149
}
149150

151+
enum ggml_type {
152+
GGML_TYPE_F32 = 0,
153+
GGML_TYPE_F16 = 1,
154+
GGML_TYPE_Q4_0 = 2,
155+
GGML_TYPE_Q4_1 = 3,
156+
GGML_TYPE_Q8_0 = 8,
157+
GGML_TYPE_Q4_K = 12,
158+
GGML_TYPE_Q6_K = 14,
159+
GGML_TYPE_Q8_K = 15,
160+
GGML_TYPE_COUNT = 39,
161+
};
162+
163+
inline int64_t ggml_blck_size(enum ggml_type type) {
164+
165+
switch (type) {
166+
case GGML_TYPE_Q4_0: {
167+
return QK4_0;
168+
}
169+
case GGML_TYPE_Q8_0: {
170+
return QK8_0;
171+
}
172+
case GGML_TYPE_Q4_K: {
173+
return QK_K;
174+
}
175+
case GGML_TYPE_Q6_K: {
176+
return QK_K;
177+
}
178+
case GGML_TYPE_Q8_K: {
179+
return QK_K;
180+
}
181+
default: {
182+
break;
183+
}
184+
}
185+
186+
return -1;
187+
}
188+
189+
inline size_t ggml_type_size(enum ggml_type type) {
190+
191+
switch (type) {
192+
case GGML_TYPE_Q4_0: {
193+
return sizeof(block_q4_0);
194+
}
195+
case GGML_TYPE_Q8_0: {
196+
return sizeof(block_q8_0);
197+
}
198+
case GGML_TYPE_Q4_K: {
199+
return sizeof(block_q4_K);
200+
}
201+
case GGML_TYPE_Q6_K: {
202+
return sizeof(block_q6_K);
203+
}
204+
case GGML_TYPE_Q8_K: {
205+
return sizeof(block_q8_K);
206+
}
207+
default: {
208+
break;
209+
}
210+
}
211+
212+
return -1;
213+
}
214+
215+
inline size_t ggml_row_size(enum ggml_type type, int64_t ne) {
216+
assert(ne % ggml_blck_size(type) == 0);
217+
return ggml_type_size(type) * ne / ggml_blck_size(type);
218+
}
219+
150220
#endif

0 commit comments

Comments
 (0)