From 3d8aa554d26248b77a5834b353b3ee478d24f90a Mon Sep 17 00:00:00 2001
From: lucidrains
Date: Sun, 17 Sep 2023 10:59:56 -0700
Subject: [PATCH] finally seeing the 1.7x speed up

---
 setup.py                                     |  4 ++--
 speculative_decoding/speculative_decoding.py | 18 ++++++++++++------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index eb77a46..c869f0e 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'speculative-decoding',
   packages = find_packages(exclude=[]),
-  version = '0.0.1',
+  version = '0.0.2',
   license='MIT',
   description = 'Speculative Decoding',
   author = 'Phil Wang',
@@ -20,7 +20,7 @@
     'beartype',
     'einops>=0.6.1',
     'rotary-embedding-torch>=0.3.0',
-    'torch>=2.0',
+    'torch>=1.12',
   ],
   classifiers=[
     'Development Status :: 4 - Beta',
diff --git a/speculative_decoding/speculative_decoding.py b/speculative_decoding/speculative_decoding.py
index 7d0fc3f..df38e87 100644
--- a/speculative_decoding/speculative_decoding.py
+++ b/speculative_decoding/speculative_decoding.py
@@ -63,6 +63,11 @@ def base_decoding(
 
     return out[..., prompt_seq_len:]
 
+def safe_div(num, den, eps = 1e-10):
+    return num / max(den, eps)
+
+def find_first_true_index(bool_tensor, dim = -1):
+    return (bool_tensor.cumsum(dim = dim) == 0).sum(dim = dim)
 
 @torch.no_grad()
 def speculative_decoding(
@@ -118,16 +123,17 @@
         # prob and prob of small model (p(x) and q(x) in algorithm 1)
 
-        prob = (logits / temperature).softmax(dim = -1)
-        small_prob = (small_logits / temperature).softmax(dim = -1)
+        prob = safe_div(logits, temperature).softmax(dim = -1)
+        small_prob = safe_div(small_logits, temperature).softmax(dim = -1)
+
+        p, prob_next = prob[:, :-1], prob[:, -1]
 
-        p = prob[:, :-1].gather(-1, q_sampled_out)
+        p = p.gather(-1, q_sampled_out)
         q = small_prob.gather(-1, q_sampled_out) * lenience
 
         r = random_uniform = torch.zeros_like(q).float().uniform_(0, 1)
 
-        n = accepted = (((r > (p / q)).cumsum(dim = -1)) == 0).sum().item()
-
-        prob_next = prob[:, -1]
+        accepted = find_first_true_index(r > (p / q))
+        n = accepted[0][0] # need to handle batched spec decoding
 
         if n < gamma:
             adjusted_prob = F.relu(prob[:, n] - small_prob[:, n])
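
For reference, below is a minimal standalone sketch of the two helpers this commit introduces, showing how they behave on made-up inputs (the tensor values and gamma = 4 are illustrative assumptions, not taken from the patch):

    import torch

    def safe_div(num, den, eps = 1e-10):
        # guards against division by zero, e.g. when temperature is 0
        return num / max(den, eps)

    def find_first_true_index(bool_tensor, dim = -1):
        # counts the leading False entries before the first True along `dim`,
        # i.e. how many drafted tokens are accepted before the first rejection
        return (bool_tensor.cumsum(dim = dim) == 0).sum(dim = dim)

    # hypothetical rejection mask for gamma = 4 drafted tokens (batch of 1)
    rejected = torch.tensor([[False, False, True, False]])

    print(find_first_true_index(rejected))        # tensor([2]) -> keep the first 2 draft tokens
    print(safe_div(torch.tensor([1., 2.]), 0.0))  # finite result even at temperature 0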