@@ -42,6 +42,8 @@ class GgufIntegrationTests(unittest.TestCase):
    llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
    tinyllama_model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF"
    phi3_model_id = "microsoft/Phi-3-mini-4k-instruct-gguf"
+    bloom_model_id = "afrideva/bloom-560m-GGUF"
+    original_bloom_model_id = "bigscience/bloom-560m"

    # standard quants
    q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
@@ -69,6 +71,8 @@ class GgufIntegrationTests(unittest.TestCase):
    q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
    q4_0_qwen2_moe_model_id = "Qwen1.5-MoE-A2.7B-Chat.Q4_0.gguf"
    q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf"
+    fp16_bloom_model_id = "bloom-560m.fp16.gguf"
+    q8_bloom_model_id = "bloom-560m.q8_0.gguf"
    f16_tinyllama_model_id = "TinyLlama-1.1B-Chat-v1.0.FP16.gguf"

    example_text = "Hello"
@@ -385,6 +389,62 @@ def test_llama3_q4_0(self):
        EXPECTED_TEXT = "Hello, I am interested in [The Park]\n The"
        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)

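+    # Loads BLOOM from the FP16 GGUF file and checks greedy generation against a fixed reference string.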
+    def test_bloom_fp16(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.bloom_model_id, gguf_file=self.fp16_bloom_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.bloom_model_id,
+            gguf_file=self.fp16_bloom_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello, I just want to say that I am very"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
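+    # Same check with Q8_0 quantized weights; after dequantization the output should match the FP16 run.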
+    def test_bloom_q8_0(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.bloom_model_id, gguf_file=self.q8_bloom_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.bloom_model_id,
+            gguf_file=self.q8_bloom_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello, I just want to say that I am very"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
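+    # Compares tensors loaded from the FP16 GGUF file against the original bigscience/bloom-560m
+    # checkpoint, to verify that the GGUF-to-transformers weight conversion preserves the values.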
+    def test_bloom_weights_conversion_fp16(self):
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.bloom_model_id,
+            gguf_file=self.fp16_bloom_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+        original_model = AutoModelForCausalLM.from_pretrained(
+            self.original_bloom_model_id,
+            device_map="auto",
+            torch_dtype=torch.float16,
+        )
+
+        quantized_state_dict = quantized_model.state_dict()
+        original_state_dict = original_model.state_dict()
+
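+        # zip assumes both state dicts enumerate parameters in the same order; only the fused
+        # self_attention.query_key_value projections are compared here.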
+        for (quantized_name, quantized_param), (original_name, original_param) in zip(
+            quantized_state_dict.items(), original_state_dict.items()
+        ):
+            if (
+                "self_attention.query_key_value" in quantized_name
+                and "self_attention.query_key_value" in original_name
+            ):
+                self.assertTrue(quantized_param.shape == original_param.shape)
+                torch.testing.assert_close(quantized_param, original_param)
+
    def test_tokenization_xnli(self):
        import tqdm
        from datasets import load_dataset