aleximmer · wiseodd · Apr 26, 2024 · Apr 27, 2024 · Apr 27, 2024 · Apr 27, 2024
diff --git a/laplace/baselaplace.py b/laplace/baselaplace.py
@@ -69,6 +69,8 @@ class BaseLaplace:
     enable_backprop: bool, default=False
         whether to enable backprop to the input `x` through the Laplace predictive.
         Useful for e.g. Bayesian optimization.
+    logit_class_dim: int, default=-1
+        the dimension of the model's logit tensor that indexes the classes/outputs
     dict_key_x: str, default='input_ids'
         The dictionary key under which the input tensor `x` is stored. Only has effect
         when the model takes a `MutableMapping` as the input. Useful for Huggingface
@@ -95,6 +97,7 @@ def __init__(
         prior_mean: float | torch.Tensor = 0.0,
         temperature: float = 1.0,
         enable_backprop: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
         backend: type[CurvatureInterface] | None = None,
@@ -126,6 +129,7 @@ def __init__(
         self.sigma_noise: float | torch.Tensor = sigma_noise
         self.temperature: float = temperature
         self.enable_backprop: bool = enable_backprop
+        self.logit_class_dim: int = logit_class_dim
 
         # For models with dict-like inputs (e.g. Huggingface LLMs)
         self.dict_key_x = dict_key_x
@@ -178,6 +182,7 @@ def backend(self) -> CurvatureInterface:
             self._backend = self._backend_cls(
                 self.model,
                 likelihood,
+                logit_class_dim=self.logit_class_dim,
                 dict_key_x=self.dict_key_x,
                 dict_key_y=self.dict_key_y,
                 **self._backend_kwargs,
@@ -584,6 +589,7 @@ def __init__(
         prior_mean: float | torch.Tensor = 0.0,
         temperature: float = 1.0,
         enable_backprop: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "inputs_id",
         dict_key_y: str = "labels",
         backend: type[CurvatureInterface] | None = None,
@@ -598,6 +604,7 @@ def __init__(
             prior_mean,
             temperature,
             enable_backprop,
+            logit_class_dim,
             dict_key_x,
             dict_key_y,
             backend,
@@ -913,7 +920,8 @@ def __call__(
                 ).mean(dim=0)
             elif link_approx == LinkApprox.PROBIT:
                 kappa = 1 / torch.sqrt(1.0 + np.pi / 8 * f_var.diagonal(dim1=1, dim2=2))
-                return torch.softmax(kappa * f_mu, dim=-1)
+
+                return torch.softmax(kappa * f_mu, dim=self.logit_class_dim)
             elif "bridge" in link_approx:
                 # zero mean correction
                 f_mu -= (
@@ -1005,7 +1013,7 @@ def predictive_samples(
             if self.likelihood == Likelihood.REGRESSION:
                 return f_samples
             else:
-                return torch.softmax(f_samples, dim=-1)
+                return torch.softmax(f_samples, dim=self.logit_class_dim)
 
         else:  # 'nn'
             return self._nn_predictive_samples(x, n_samples, generator)
@@ -1058,7 +1066,7 @@ def _nn_predictive_samples(
         fs = torch.stack(fs)
 
         if self.likelihood == Likelihood.CLASSIFICATION:
-            fs = torch.softmax(fs, dim=-1)
+            fs = torch.softmax(fs, dim=self.logit_class_dim)
 
         return fs
 
@@ -1074,7 +1082,7 @@ def _nn_predictive_classification(
             logits = self.model(
                 X.to(self._device) if isinstance(X, torch.Tensor) else X, **model_kwargs
             ).detach()
-            py += torch.softmax(logits, dim=-1) / n_samples
+            py += torch.softmax(logits, dim=self.logit_class_dim) / n_samples
 
         vector_to_parameters(self.mean, self.params)
 
@@ -1276,6 +1284,7 @@ def __init__(
         prior_mean: float | torch.Tensor = 0.0,
         temperature: float = 1.0,
         enable_backprop: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
         backend: type[CurvatureInterface] | None = None,
@@ -1289,6 +1298,7 @@ def __init__(
             prior_mean,
             temperature,
             enable_backprop,
+            logit_class_dim,
             dict_key_x,
             dict_key_y,
             backend,
@@ -1411,6 +1421,7 @@ def __init__(
         prior_mean: float | torch.Tensor = 0.0,
         temperature: float = 1.0,
         enable_backprop: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "inputs_id",
         dict_key_y: str = "labels",
         backend: type[CurvatureInterface] | None = None,
@@ -1428,6 +1439,7 @@ def __init__(
             prior_mean,
             temperature,
             enable_backprop,
+            logit_class_dim,
             dict_key_x,
             dict_key_y,
             backend,
@@ -1583,19 +1595,21 @@ def __init__(
         prior_mean: float | torch.Tensor = 0,
         temperature: float = 1,
         enable_backprop: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "inputs_id",
         dict_key_y: str = "labels",
         backend=AsdfghjklHessian,
         backend_kwargs: dict[str, Any] | None = None,
     ):
         super().__init__(
             model,
-            likelihood,
+            likelihood=likelihood,
             sigma_noise=sigma_noise,
             prior_precision=prior_precision,
             prior_mean=prior_mean,
             temperature=temperature,
             enable_backprop=enable_backprop,
+            logit_class_dim=logit_class_dim,
             dict_key_x=dict_key_x,
             dict_key_y=dict_key_y,
             backend=backend,

diff --git a/laplace/curvature/asdfghjkl.py b/laplace/curvature/asdfghjkl.py
@@ -176,6 +176,7 @@ def __init__(
         model: nn.Module,
         likelihood: Likelihood | str,
         last_layer: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
         low_rank: int = 10,
@@ -185,6 +186,7 @@ def __init__(
             likelihood,
             last_layer,
             None,
+            logit_class_dim,
             dict_key_x="input_ids",
             dict_key_y="labels",
         )
@@ -241,14 +243,21 @@ def __init__(
         likelihood: Likelihood | str,
         last_layer: bool = False,
         subnetwork_indices: torch.LongTensor | None = None,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
         stochastic: bool = False,
     ) -> None:
         if likelihood != Likelihood.CLASSIFICATION:
             raise ValueError("This backend only supports classification currently.")
         super().__init__(
-            model, likelihood, last_layer, subnetwork_indices, dict_key_x, dict_key_y
+            model,
+            likelihood,
+            last_layer,
+            subnetwork_indices,
+            logit_class_dim,
+            dict_key_x,
+            dict_key_y,
         )
         self.stochastic = stochastic
 
@@ -265,13 +274,16 @@ def __init__(
         model: nn.Module,
         likelihood: Likelihood | None,
         last_layer: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
     ) -> None:
         if likelihood != Likelihood.CLASSIFICATION:
             raise ValueError("This backend only supports classification currently.")
 
-        super().__init__(model, likelihood, last_layer, None, dict_key_x, dict_key_y)
+        super().__init__(
+            model, likelihood, last_layer, None, logit_class_dim, dict_key_x, dict_key_y
+        )
 
     @property
     def _ggn_type(self) -> str:

diff --git a/laplace/curvature/asdl.py b/laplace/curvature/asdl.py
@@ -34,11 +34,18 @@ def __init__(
         likelihood: Likelihood | str,
         last_layer: bool = False,
         subnetwork_indices: torch.LongTensor | None = None,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
     ):
         super().__init__(
-            model, likelihood, last_layer, subnetwork_indices, dict_key_x, dict_key_y
+            model,
+            likelihood,
+            last_layer,
+            subnetwork_indices,
+            logit_class_dim,
+            dict_key_x,
+            dict_key_y,
         )
 
     @property
@@ -187,14 +194,19 @@ def diag(
         if "emp" in self._ggn_type:
             dummy = fisher_maker.setup_model_call(self._model, x)
             dummy = (
-                dummy if self.loss_type == LOSS_MSE else dummy.view(-1, dummy.size(-1))
+                dummy
+                if self.loss_type == LOSS_MSE
+                else dummy.view(-1, dummy.size(self.logit_class_dim))
             )
             fisher_maker.setup_loss_call(self.lossfunc, dummy, y)
         else:
             fisher_maker.setup_model_call(self._model, x)
         f, _ = fisher_maker.forward_and_backward()
-        # Assumes that the last dimension of f is of size outputs.
-        f = f if self.loss_type == LOSS_MSE else f.view(-1, f.size(-1))
+        f = (
+            f
+            if self.loss_type == LOSS_MSE
+            else f.view(-1, f.size(self.logit_class_dim))
+        )
         loss = self.lossfunc(f.detach(), y)
         vec = list()
         for module in self.model.modules():
@@ -232,14 +244,20 @@ def kron(
         if "emp" in self._ggn_type:
             dummy = fisher_maker.setup_model_call(self._model, x)
             dummy = (
-                dummy if self.loss_type == LOSS_MSE else dummy.view(-1, dummy.size(-1))
+                dummy
+                if self.loss_type == LOSS_MSE
+                else dummy.view(-1, dummy.size(self.logit_class_dim))
             )
             fisher_maker.setup_loss_call(self.lossfunc, dummy, y)
         else:
             fisher_maker.setup_model_call(self._model, x)
         f, _ = fisher_maker.forward_and_backward()
-        # Assumes that the last dimension of f is of size outputs.
+        # For non-MSE losses, flatten f to (-1, C) with C = f.size(self.logit_class_dim).
-        f = f if self.loss_type == LOSS_MSE else f.view(-1, f.size(-1))
+        f = (
+            f
+            if self.loss_type == LOSS_MSE
+            else f.view(-1, f.size(self.logit_class_dim))
+        )
         loss = self.lossfunc(f.detach(), y)
         M = len(y)
         kron = self._get_kron_factors(M)
@@ -271,6 +289,7 @@ def __init__(
         model: nn.Module,
         likelihood: Likelihood | str,
         last_layer: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
     ) -> None:
@@ -279,6 +298,7 @@ def __init__(
             likelihood,
             last_layer,
             subnetwork_indices=None,
+            logit_class_dim=logit_class_dim,
             dict_key_x=dict_key_x,
             dict_key_y=dict_key_y,
         )
@@ -300,7 +320,11 @@ def full(
         hess_maker = HessianMaker(self.model, cfg)
 
         dummy = hess_maker.setup_model_call(self._model, x)
-        dummy = dummy if self.loss_type == LOSS_MSE else dummy.view(-1, dummy.size(-1))
+        dummy = (
+            dummy
+            if self.loss_type == LOSS_MSE
+            else dummy.view(-1, dummy.size(self.logit_class_dim))
+        )
         y = y if self.loss_type == LOSS_MSE else y.view(-1)
 
         hess_maker.setup_loss_call(self.lossfunc, dummy, y)
@@ -309,7 +333,11 @@ def full(
         H = self._model.hessian.data
         f = self.model(x).detach()
-        # Assumes that the last dimension of f is of size outputs.
+        # For non-MSE losses, flatten f to (-1, C) with C = f.size(self.logit_class_dim).
-        f = f if self.loss_type == LOSS_MSE else f.view(-1, f.size(-1))
+        f = (
+            f
+            if self.loss_type == LOSS_MSE
+            else f.view(-1, f.size(self.logit_class_dim))
+        )
         loss = self.lossfunc(f, y)
 
         return self.factor * loss, self.factor * H
@@ -324,12 +352,19 @@ def __init__(
         likelihood: Likelihood | str,
         last_layer: bool = False,
         subnetwork_indices: torch.LongTensor | None = None,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
         stochastic: bool = False,
     ):
         super().__init__(
-            model, likelihood, last_layer, subnetwork_indices, dict_key_x, dict_key_y
+            model,
+            likelihood,
+            last_layer,
+            subnetwork_indices,
+            logit_class_dim,
+            dict_key_x,
+            dict_key_y,
         )
         self.stochastic = stochastic
 
@@ -346,10 +381,13 @@ def __init__(
         model: nn.Module,
         likelihood: Likelihood | str,
         last_layer: bool = False,
+        logit_class_dim: int = -1,
         dict_key_x: str = "input_ids",
         dict_key_y: str = "labels",
     ):
-        super().__init__(model, likelihood, last_layer, None, dict_key_x, dict_key_y)
+        super().__init__(
+            model, likelihood, last_layer, None, logit_class_dim, dict_key_x, dict_key_y
+        )
 
     @property
     def _ggn_type(self) -> str: