@@ -142,7 +142,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, cons
142
142
GGML_ASSERT (tensor->extra == nullptr );
143
143
GGML_ASSERT (tensor->op == GGML_OP_NONE);
144
144
145
- void *buffer_host;
146
145
size_t n_bytes = ggml_nbytes (tensor);
147
146
int64_t n_elems = ggml_nelements (tensor);
148
147
int64_t groups = n_elems / QK4_0;
@@ -176,7 +175,6 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor* t
176
175
GGML_ASSERT (tensor->extra == nullptr );
177
176
GGML_ASSERT (tensor->op == GGML_OP_NONE);
178
177
179
- void *buffer_host;
180
178
size_t n_bytes = ggml_nbytes (tensor);
181
179
int64_t n_elems = ggml_nelements (tensor);
182
180
int64_t groups = n_elems / QK4_0;
@@ -206,12 +204,66 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor* t
206
204
}
207
205
}
208
206
207
// Number of quantized elements per q8_0 group.
#define QK8_0 32
// Host-side mirror of ggml's q8_0 quantization block: one per-group scale
// followed by QK8_0 signed 8-bit quants. The layout must match the source
// tensor's block_q8_0 byte-for-byte, since the transform code below walks
// the raw buffer with sizeof(block_q8_0) strides.
typedef struct {
    uint16_t d;       // delta (scale) — presumably fp16 bits stored raw as uint16_t; confirm against ggml's block_q8_0
    int8_t qs[QK8_0]; // quants
} block_q8_0;
212
+
213
+ GGML_CALL static void ggml_backend_cann_transform_q8_0 (ggml_tensor* tensor, const void *src, void * dst) {
214
+ GGML_ASSERT (tensor->extra == nullptr );
215
+ GGML_ASSERT (tensor->op == GGML_OP_NONE);
216
+
217
+ size_t n_bytes = ggml_nbytes (tensor);
218
+ int64_t n_elems = ggml_nelements (tensor);
219
+ int64_t groups = n_elems / QK8_0;
220
+ size_t quant_bytes = n_elems * sizeof (uint8_t );
221
+
222
+ uint8_t * quant_offset = (uint8_t *)dst;
223
+ uint16_t * scale_offset = (uint16_t *)((char *)dst + quant_bytes);
224
+
225
+ for (int i = 0 ;i<groups; i++) {
226
+ block_q8_0 *group = (block_q8_0*)((char *)src + i * sizeof (block_q8_0));
227
+ *scale_offset = group->d ;
228
+ scale_offset++;
229
+ size_t group_quant_size = QK8_0 * sizeof (uint8_t );
230
+ memcpy (quant_offset, group->qs , group_quant_size);
231
+ quant_offset += group_quant_size;
232
+ }
233
+ }
234
+
235
+ GGML_CALL static void ggml_backend_cann_transform_back_q8_0 (const ggml_tensor* tensor, const void *src, void * dst) {
236
+ GGML_ASSERT (tensor->extra == nullptr );
237
+ GGML_ASSERT (tensor->op == GGML_OP_NONE);
238
+
239
+ size_t n_bytes = ggml_nbytes (tensor);
240
+ int64_t n_elems = ggml_nelements (tensor);
241
+ int64_t groups = n_elems / QK8_0;
242
+ size_t quant_bytes = n_elems * sizeof (uint8_t );
243
+
244
+ uint8_t * quant_offset = (uint8_t *)src;
245
+ uint16_t * scale_offset = (uint16_t *)((char *)src + quant_bytes);
246
+
247
+ for (int i = 0 ;i<groups; i++) {
248
+ block_q8_0 *group = (block_q8_0*)((char *)dst + i * sizeof (block_q8_0));
249
+ group->d = *scale_offset;
250
+ scale_offset++;
251
+ size_t group_quant_size = QK8_0 * sizeof (uint8_t );
252
+ memcpy (group->qs , quant_offset, group_quant_size);
253
+ quant_offset += group_quant_size;
254
+ }
255
+ }
256
+
257
+
209
258
GGML_CALL static void ggml_backend_cann_transform (ggml_tensor* tensor, const void * src, void *dst) {
210
259
std::cout<<" Transform tensor:" <<tensor->name <<std::endl;
211
260
switch (tensor->type ) {
212
261
case GGML_TYPE_Q4_0:
213
262
ggml_backend_cann_transform_q4_0 (tensor, src, dst);
214
263
break ;
264
+ case GGML_TYPE_Q8_0:
265
+ ggml_backend_cann_transform_q8_0 (tensor, src, dst);
266
+ break ;
215
267
default :
216
268
break ;
217
269
}
@@ -223,6 +275,9 @@ GGML_CALL static void ggml_backend_cann_transform_back(const ggml_tensor* tensor
223
275
case GGML_TYPE_Q4_0:
224
276
ggml_backend_cann_transform_back_q4_0 (tensor, src, dst);
225
277
break ;
278
+ case GGML_TYPE_Q8_0:
279
+ ggml_backend_cann_transform_back_q8_0 (tensor, src, dst);
280
+ break ;
226
281
default :
227
282
break ;
228
283
}
@@ -231,6 +286,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(const ggml_tensor* tensor
231
286
GGML_CALL static bool need_transform (ggml_type type) {
232
287
switch (type) {
233
288
case GGML_TYPE_Q4_0:
289
+ case GGML_TYPE_Q8_0:
234
290
return true ;
235
291
default :
236
292
return false ;
@@ -820,7 +876,16 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
820
876
case GGML_OP_MUL_MAT_ID:
821
877
// embedding
822
878
case GGML_OP_GET_ROWS:
823
- return false ;
879
+ {
880
+ switch (op->src [0 ]->type ) {
881
+ // case GGML_TYPE_Q4_0:
882
+ case GGML_TYPE_Q8_0:
883
+ return true ;
884
+ default :
885
+ return false ;
886
+ }
887
+ }
888
+ break ;
824
889
case GGML_OP_CPY:
825
890
case GGML_OP_DUP:
826
891
case GGML_OP_REPEAT:
0 commit comments