diff --git a/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py b/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py
index d55c224b5c..0f615b6def 100644
--- a/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py
+++ b/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py
@@ -10,7 +10,8 @@ def __init__(self, d_model, max_len=512):
 
         # Compute the positional encodings once in log space.
         pe = torch.zeros(max_len, d_model).float()
-        pe.require_grad = False
+        # Changed from upstream, which misspells this attribute as `require_grad`; see https://github.com/codertimo/BERT-pytorch/pull/104
+        pe.requires_grad = False
 
         position = torch.arange(0, max_len).float().unsqueeze(1)
         div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()