We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
2 parents 29efe34 + 4c286eb commit da51a44 (Copy full SHA for da51a44)
memory_efficient_attention/attention_torch.py
@@ -27,7 +27,7 @@ def summarize_chunk(key_idx, query, key, value, mask, bias):
27
mask = mask_calc_fn(query_idx, key_idx, mask, attn_weights, calc_fn_data)
28
if mask is not None:
29
big_neg = torch.finfo(attn_weights.dtype).min
30
- big_neg = torch.tensor(big_neg, dtype=torch.float32)
+ big_neg = torch.tensor(big_neg, device=mask.device, dtype=torch.float32)
31
mask = torch.einsum('...hqk->...qhk', mask)
32
attn_weights = torch.where(mask, attn_weights, big_neg)
33
if weights_calc_fn is not None:
0 commit comments