codertimo · watseob · May 14, 2020
diff --git a/bert_pytorch/trainer/pretrain.py b/bert_pytorch/trainer/pretrain.py
@@ -59,8 +59,11 @@ def __init__(self, bert: BERT, vocab_size: int,
         self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)
 
         # Using Negative Log Likelihood Loss function for predicting the masked_token
-        self.criterion = nn.NLLLoss(ignore_index=0)
-
+        self.criterion_mask_lm = nn.NLLLoss(ignore_index=0)
+
+        # Negative Log Likelihood Loss for the is_next prediction; built without ignore_index=0 so targets equal to 0 are not skipped
+        self.criterion_is_next = nn.NLLLoss()
+
         self.log_freq = log_freq
 
         print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))
@@ -102,10 +105,10 @@ def iteration(self, epoch, data_loader, train=True):
             next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"])
 
             # 2-1. NLL(negative log likelihood) loss of is_next classification result
-            next_loss = self.criterion(next_sent_output, data["is_next"])
+            next_loss = self.criterion_is_next(next_sent_output, data["is_next"])
 
             # 2-2. NLLLoss of predicting masked token word
-            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"])
+            mask_loss = self.criterion_mask_lm(mask_lm_output.transpose(1, 2), data["bert_label"])
 
             # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure
             loss = next_loss + mask_loss