
Commit

might work
yangw1234 committed Aug 14, 2024
1 parent 33dff86 commit 464138a
Showing 2 changed files with 20 additions and 17 deletions.
@@ -828,7 +828,10 @@ def run_decode(model, rank, world_size, layer_start, layer_end,
# model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16,
# trust_remote_code=True, attn_implementation="eager",
# load_in_low_bit="sym_int4", pipeline_parallel_stages=world_size)


from ipex_llm.transformers.npu_models.convert import optimize_llm, optimize_llm_post
optimize_llm(model)

num_heads = model.model.layers[layer_start].self_attn.num_heads
num_key_value_heads = model.model.layers[layer_start].self_attn.num_key_value_heads
head_dim = model.model.layers[layer_start].self_attn.head_dim
@@ -879,10 +882,6 @@ def run_decode(model, rank, world_size, layer_start, layer_end,
transpose_value=transpose_value_cache
)

for i in range(len(model.model.layers)):
model.model.layers[i] = None

gc.collect()
model.model.multi_decoder = multi_decoder

result_queue.put("loading success")
@@ -894,16 +893,20 @@ def run_decode(model, rank, world_size, layer_start, layer_end,
result = input_queue.get()
if result == "stop":
break
# input_ids, past_key_value, n_predict = input_queue.get()
# output = model.generate(input_ids, num_beams=1, do_sample=False, max_new_tokens=n_predict, past_key_values=past_key_value)
result_queue.put("result")
input_ids, past_key_value, n_predict = result
output = model.generate(input_ids, num_beams=1, do_sample=False, max_new_tokens=n_predict, past_key_values=past_key_value)
result_queue.put(output)



def run_prefill(model, max_seq_len, transpose_value_cache, input_queue, result_queue):


print("finish loading prefill model")

from ipex_llm.transformers.npu_models.convert import optimize_llm, optimize_llm_post

optimize_llm(model)

layer_start = 0
layer_end = len(model.model.layers)
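
The replacement worker loop in run_decode above follows a simple queue protocol: block on input_queue, treat the string "stop" as a shutdown sentinel, otherwise unpack an (input_ids, past_key_value, n_predict) tuple, call model.generate, and push the output onto result_queue. A minimal sketch of that loop follows; the standalone decode_worker function name is an illustrative assumption, while the sentinel, the tuple layout, and the generate() arguments mirror the diff.

def decode_worker(model, input_queue, result_queue):
    # Tell the parent process that model setup finished (mirrors
    # result_queue.put("loading success") in run_decode above).
    result_queue.put("loading success")
    while True:
        item = input_queue.get()           # block until work arrives
        if item == "stop":                 # sentinel: shut the worker down
            break
        input_ids, past_key_value, n_predict = item
        output = model.generate(input_ids,
                                num_beams=1,
                                do_sample=False,
                                max_new_tokens=n_predict,
                                past_key_values=past_key_value)
        result_queue.put(output)           # hand the generated tokens back

The queues passed into run_decode and run_prefill would typically be multiprocessing.Queue objects fed by a parent process, which sends "stop" once all prompts have been served.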
18 changes: 9 additions & 9 deletions python/llm/src/ipex_llm/transformers/npu_model.py
@@ -146,15 +146,15 @@ def from_pretrained(cls,
from intel_npu_acceleration_library.compiler import create_npu_kernels
with torch.no_grad():
# optimize_llm(model)
if pipeline_parallel_stages == 1:
cls.load_convert(qtype, model, 'cpu', *args, **kwargs)
print("load convert finished")
create_npu_kernels(model)
print("create npu kernels finished")
else:
cls.load_convert(qtype, model.model, 'cpu', *args, **kwargs)
create_npu_kernels(model.model)
optimize_llm_post(model)
# if pipeline_parallel_stages == 1:
cls.load_convert(qtype, model, 'cpu', *args, **kwargs)
print("load convert finished")
create_npu_kernels(model)
print("create npu kernels finished")
# else:
# cls.load_convert(qtype, model.model, 'cpu', *args, **kwargs)
# create_npu_kernels(model.model)
# optimize_llm_post(model)
model = model.eval()

logger.info(f"Finish to convert model")
