From 32edf21f6fefcb654e5ef30d0c6d7cf35cef88fa Mon Sep 17 00:00:00 2001
From: Phil Wang <lucidrains@gmail.com>
Date: Sun, 17 Sep 2023 19:04:26 +0200
Subject: [PATCH] finally seeing the 1.7x speed up

---
 setup.py                                     |  2 +-
 speculative_decoding/speculative_decoding.py | 18 ++++++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index eb77a46..fe1d129 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
     'beartype',
     'einops>=0.6.1',
     'rotary-embedding-torch>=0.3.0',
-    'torch>=2.0',
+    'torch>=1.12',
   ],
   classifiers=[
     'Development Status :: 4 - Beta',
diff --git a/speculative_decoding/speculative_decoding.py b/speculative_decoding/speculative_decoding.py
index 7d0fc3f..6a65450 100644
--- a/speculative_decoding/speculative_decoding.py
+++ b/speculative_decoding/speculative_decoding.py
@@ -63,6 +63,11 @@ def base_decoding(
 
     return out[..., prompt_seq_len:]
 
+def safe_div(num, den, eps = 1e-10):
+    return num / max(den, eps)
+
+def find_first_true_index(bool_tensor, dim = -1):
+    return (bool_tensor.cumsum(dim = dim) == 0).sum(dim = dim)
 
 @torch.no_grad()
 def speculative_decoding(
@@ -118,16 +123,17 @@
 
     # prob and prob of small model (p(x) and q(x) in algorithm 1)
 
-    prob = (logits / temperature).softmax(dim = -1)
-    small_prob = (small_logits / temperature).softmax(dim = -1)
+    prob = safe_div(logits, temperature).softmax(dim = -1)
+    small_prob = safe_div(small_logits, temperature).softmax(dim = -1)
+
+    p, prob_next = prob[:, :-1], prob[:, -1]
 
-    p = prob[:, :-1].gather(-1, q_sampled_out)
+    p = p.gather(-1, q_sampled_out)
     q = small_prob.gather(-1, q_sampled_out) * lenience
 
     r = random_uniform = torch.zeros_like(q).float().uniform_(0, 1)
 
-    n = accepted = (((r > (p / q)).cumsum(dim = -1)) == 0).sum().item()
-
-    prob_next = prob[:, -1]
+    accepted = find_first_true_index(r > (p / q))
+    n = accepted[0][0] # need to handle batched spec decoding
 
     if n < gamma:
         adjusted_prob = F.relu(prob[:, n] - small_prob[:, n])