 
 from torchcast.internals.batch_design import TransitionModel, MeasurementModel, MeasureFun
 from torchcast.internals.hessian import hessian
-from torchcast.internals.utils import repeat, true1d_idx, get_nan_groups, mask_mats, get_meshgrids
+from torchcast.internals.utils import repeat, true1d_idx, get_nan_groups, get_meshgrids
 from torchcast.covariance import Covariance
 from torchcast.state_space.predictions import Predictions
 from torchcast.process.regression import Process
@@ -74,11 +74,6 @@ def __init__(self,
         else:
             self.dt_unit = process.dt_unit
 
-    # @property
-    # def is_nonlinear(self) -> bool:
-    #     return any(not p.linear_measurement for p in self.processes.values()) or self.measure_funs
-
-    @torch.jit.ignore()
     def forward(self,
                 y: Optional[torch.Tensor] = None,
                 n_step: Union[int, float] = 1,
@@ -88,7 +83,6 @@ def forward(self,
                 every_step: bool = True,
                 include_updates_in_output: bool = False,
                 simulate: Optional[int] = None,
-                last_measured_per_group: Optional[torch.Tensor] = None,
                 prediction_kwargs: Optional[dict] = None,
                 **kwargs) -> 'Predictions':
         """
@@ -119,14 +113,6 @@ def forward(self,
         :param include_updates_in_output: If False, only the ``n_step`` ahead predictions are included in the output.
           This means that we cannot use this output to generate the ``initial_state`` for subsequent forward-passes. Set
           to True to allow this -- False by default to reduce memory.
-        :param last_measured_per_group: This provides a method to reduce unused computations in training. On each call
-          to forward in training, you can supply to this argument a tensor indicating the last measured timestep for
-          each group in the batch (this can be computed with ``last_measured_per_group=batch.get_durations()``, where
-          ``batch`` is a :class:`TimeSeriesDataset`). In this case, predictions will not be generated after the
-          specified timestep for each group; these can be discarded in training because, without any measurements, they
-          wouldn't have been used in loss calculations anyway. Naturally this should never be set for
-          inference/forecasting. This will automatically be set when calling ``fit()``, but if you're instead using a
-          custom training loop, you can pass this manually.
         :param simulate: If specified, will generate `simulate` samples from the model.
         :param prediction_kwargs: A dictionary of kwargs to pass to initialize ``Predictions()``.
         :param kwargs: Further arguments passed to the `processes`. For example, the :class:`.LinearModel` expects an
@@ -176,8 +162,14 @@ def forward(self,
             out_timesteps=out_timesteps
         )
 
+        # used by fit() to reduce unneeded computations:
+        last_measured_per_group = kwargs.pop('last_measured_per_group', None)
         if last_measured_per_group is None:
             last_measured_per_group = torch.full((num_groups,), out_timesteps, dtype=torch.int, device=meanu.device)
+        nan_groups = kwargs.pop('nan_groups', None)
+        if nan_groups is None:
+            nan_groups = [None] * out_timesteps
+        # /
 
         # todo: update Covariance class to make this less hacky:
         mcov_kwargs = {}
@@ -242,6 +234,7 @@ def forward(self,
                 measured_mean=measured_mean,
                 measure_mat=measure_mat,
                 measure_cov=measure_covs[t],
+                nan_groups=nan_groups[t],
                 **{k: v[t] for k, v in update_kwargs.items()}
             )
             if self.adaptive_measure_var and t < len(measure_covs) - 1:
@@ -302,7 +295,13 @@ def forward(self,
             device=meanu.device,
             dtype=meanu.dtype
         )
-        preds = self._generate_predictions(preds, updates, measure_covs, measurement_model, **prediction_kwargs)
+        preds = self._generate_predictions(
+            preds=preds,
+            updates=updates,
+            measure_covs=measure_covs,
+            measurement_model=measurement_model,
+            **prediction_kwargs
+        )
         return preds.set_metadata(
             start_offsets=start_offsets if start_offsets is not None else np.zeros(num_groups, dtype='int'),
             dt_unit=self.dt_unit
@@ -357,9 +356,6 @@ def fit(self,
         if set_initial_values:
             self._set_initial_values(y, verbose=verbose > 1, **kwargs)
 
-        if not get_loss:
-            get_loss = lambda _pred, _y: -_pred.log_prob(_y).mean()
-
         _deprecated = {k: kwargs.pop(k) for k in ['tol', 'patience', 'max_iter'] if k in kwargs}
         _dmsg = f"The following are deprecated, use `stopping` arg instead:\n{set(_deprecated)}"
         if stopping is None:
@@ -381,6 +377,11 @@ def fit(self,
 
         kwargs = self._prepare_fit_kwargs(y, **kwargs)
 
+        if get_loss is None:
+            # precompute nan-groups instead of doing it on each call to log_prob:
+            nan_groups_flat = get_nan_groups(torch.isnan(y).reshape(-1, y.shape[-1]))
+            get_loss = lambda _pred, _y: -_pred.log_prob(_y, nan_groups_flat=nan_groups_flat).mean()
+
         closure = _OptimizerClosure(
             ss_model=self,
             y=y,
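One subtlety worth spelling out in the added default loss: ``log_prob`` scores every (group, time) cell, so the 3D nan-mask can be flattened to 2D rows and grouped once per ``fit()`` call, instead of regrouping on every iteration. A quick illustration of the reshape (pure torch, no torchcast needed):

```python
import torch

y = torch.randn(2, 3, 2)                    # (groups, times, measures)
y[0, 1, 0] = float('nan')
flat_mask = torch.isnan(y).reshape(-1, y.shape[-1])
print(flat_mask.shape)                      # torch.Size([6, 2]): one row per cell
```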
@@ -415,21 +416,25 @@ def is_nonlinear(self) -> bool:
         return any(not p.linear_measurement for p in self.processes.values()) or self.measure_funs
 
     def _prepare_fit_kwargs(self, y: torch.Tensor, **kwargs) -> dict:
-        mc_samples = kwargs.pop('mc_samples', None)
+        # precompute nan-groups for forward pass
+        isnan = torch.isnan(y)
+        kwargs['nan_groups'] = [get_nan_groups(isnan_t) for isnan_t in isnan.unbind(1)]
 
+        #
+        prediction_kwargs = kwargs.pop('prediction_kwargs', None) or {}
+        # monte-carlo for Predictions.log_prob:
+        mc_samples = kwargs.pop('mc_samples', None)
         if self.is_nonlinear and not mc_samples:
             raise ValueError("Nonlinear state-space models require `mc_samples` to be set.")
-
         if mc_samples:
-            prediction_kwargs = kwargs.pop('prediction_kwargs', None) or {}
             if 'mc_white_noise' not in prediction_kwargs:
                 emmat_rank = MeasurementModel.get_extended_mmat_rank(self.processes.values(), self.measures)
                 prediction_kwargs['mc_white_noise'] = torch.randn(
                     (mc_samples, emmat_rank),
                     device=y.device,
                     dtype=y.dtype
                 )
-            kwargs['prediction_kwargs'] = prediction_kwargs
+        kwargs['prediction_kwargs'] = prediction_kwargs
 
         # see `last_measured_per_group` in forward docstring
         # todo: duplicate code in ``TimeSeriesDataset.get_durations()``
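The ``nan_groups`` entry built here is a per-timestep list, matching the ``[None] * out_timesteps`` fallback in ``forward()``: ``isnan.unbind(1)`` yields one (groups, measures) mask per timestep. A small shape check of that contract (pure torch):

```python
import torch

y = torch.randn(4, 5, 2)                      # (groups, times, measures)
isnan = torch.isnan(y)
# _prepare_fit_kwargs builds: [get_nan_groups(m) for m in isnan.unbind(1)]
per_step_masks = isnan.unbind(1)
assert len(per_step_masks) == y.shape[1]      # one entry per timestep
assert per_step_masks[0].shape == (4, 2)      # a (groups, measures) mask
```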
@@ -447,6 +452,7 @@ def _generate_predictions(self,
                               updates: Optional[tuple[list[torch.Tensor], list[torch.Tensor]]],
                               measure_covs: torch.Tensor,
                               measurement_model: 'MeasurementModel',
+                              nan_groups: Optional[List[Sequence[tuple[torch.Tensor, Optional[torch.Tensor]]]]] = None,
                               mc_white_noise: Optional[torch.Tensor] = None,
                               **kwargs
                               ) -> 'Predictions':
@@ -523,41 +529,48 @@ def _update_step_with_nans(self,
                                measured_mean: torch.Tensor,
                                measure_mat: torch.Tensor,
                                measure_cov: torch.Tensor,
+                               nan_groups: Optional[Sequence[tuple[torch.Tensor, Optional[torch.Tensor]]]] = None,
                                **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
-        isnan = torch.isnan(input)
-        if isnan.all():
-            return mean, cov
-        if isnan.any():
-            new_mean = mean.clone()
-            new_cov = cov.clone()
-            for groups, val_idx in get_nan_groups(isnan):
-                masked = self._mask_mats(
-                    groups,
-                    val_idx,
+        if nan_groups is None:
+            nan_groups = get_nan_groups(torch.isnan(input))
+        if len(nan_groups) == 1:
+            group_idx, val_idx = nan_groups[0]
+            if len(group_idx) == len(input) and val_idx is None:
+                # no nans, no masking:
+                return self._update_step(
                     input=input,
+                    mean=mean,
+                    cov=cov,
                     measured_mean=measured_mean,
                     measure_mat=measure_mat,
                     measure_cov=measure_cov,
                     **kwargs
                 )
-                new_mean[groups], new_cov[groups] = self._update_step(
-                    mean=mean[groups],
-                    cov=cov[groups],
-                    **masked,
-                    **{k: v for k, v in kwargs.items() if k not in masked}
-                )
-            return new_mean, new_cov
-        else:
-            return self._update_step(
+        elif not len(nan_groups):
+            # all nans, nothing to do:
+            return mean, cov
+
+        new_mean = mean.clone()
+        new_cov = cov.clone()
+        for groups, val_idx in nan_groups:
+            masked = self._mask_mats(
+                groups,
+                val_idx,
                 input=input,
-                mean=mean,
-                cov=cov,
                 measured_mean=measured_mean,
                 measure_mat=measure_mat,
                 measure_cov=measure_cov,
                 **kwargs
             )
 
+            new_mean[groups], new_cov[groups] = self._update_step(
+                mean=mean[groups],
+                cov=cov[groups],
+                **masked,
+                **{k: v for k, v in kwargs.items() if k not in masked}
+            )
+        return new_mean, new_cov
+
     def _mask_mats(self,
                    groups: torch.Tensor,
                    val_idx: Optional[torch.Tensor],
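For readers new to ``get_nan_groups``: from how it is consumed above (``val_idx is None`` signals a fully observed pattern, and an empty list signals all-nan input), it evidently groups batch rows by their nan-pattern, so masking happens once per pattern rather than once per row. A behavioral sketch under those assumptions, not the library's actual implementation:

```python
import torch

def get_nan_groups_sketch(isnan: torch.Tensor):
    """Group rows of a (groups, measures) nan-mask by shared nan-pattern."""
    out = []
    for pattern in isnan.unique(dim=0):
        if pattern.all():
            continue  # all-nan rows contribute no update at all
        group_idx = (isnan == pattern).all(1).nonzero().squeeze(-1)
        # None signals "every measure observed, no masking needed":
        val_idx = None if not pattern.any() else (~pattern).nonzero().squeeze(-1)
        out.append((group_idx, val_idx))
    return out

mask = torch.tensor([[False, False], [False, True], [False, True], [True, True]])
for g, v in get_nan_groups_sketch(mask):
    print(g.tolist(), v if v is None else v.tolist())
# [0] None
# [1, 2] [0]
```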
@@ -677,14 +690,15 @@ def state_rank(self) -> int:
 
     def _get_measure_scaling(self) -> torch.Tensor:
         mcov = self.measure_covariance({}, num_groups=1, num_times=1, _ignore_input=True)[0, 0]
-        measure_var = mcov.diagonal(dim1=-2, dim2=-1).unbind()
+        measure_var = list(mcov.diagonal(dim1=-2, dim2=-1).unbind())
+        for idx in self.measure_covariance.empty_idx:
+            measure_var[idx] = torch.ones_like(measure_var[idx])  # empty measures have no variance, so set to 1
 
         multi = [
             measure_var[self.measures.index(process.measure)].expand(process.rank).sqrt()
             for process in self.processes.values()
         ]
-        for idx in self.measure_covariance.empty_idx:
-            multi[idx] = torch.ones_like(multi[idx])  # empty measures have no variance, so set to 1
+
         multi = torch.cat(multi)
         if (multi <= 0).any():
             raise RuntimeError(f"measure-cov diagonal is not positive:{measure_var}")
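Relocating the ``empty_idx`` loop from ``multi`` to ``measure_var`` reads like a genuine fix, not just a tidy-up: ``empty_idx`` indexes measures, while ``multi`` has one entry per process, so the old loop could scale the wrong entry whenever processes and measures aren't 1:1. A toy illustration of the mismatch (all names hypothetical):

```python
# two processes share one measure, so the per-process and per-measure
# lists have different lengths, and a measure index can't safely index `multi`
measures = ['y']
processes = [('trend', 'y'), ('season', 'y')]
measure_var = [1.0]  # per measure: a safe target for empty_idx
multi = [measure_var[measures.index(m)] for _, m in processes]  # per process
assert len(multi) == 2 and len(measure_var) == 1
```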
@@ -787,19 +801,12 @@ def simulate(self,
         )
 
 
+def default_get_loss(pred: 'Predictions', y: torch.Tensor, **kwargs) -> torch.Tensor:
+    return -pred.log_prob(y, **kwargs).mean()
+
+
 class _OptimizerClosure:
-    """
-    closure = _OptimizerClosure(
-        ss_model=self,
-        y=y,
-        get_loss=get_loss,
-        prog=prog,
-        callable_kwargs=callable_kwargs,
-        optimizer=optimizer,
-        stopping=stopping,
-        kwargs=kwargs,
-    )
-    """
+
     def __init__(self,
                  ss_model: StateSpaceModel,
                  y: torch.Tensor,
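The new module-level ``default_get_loss`` also makes it easy to wrap the default when passing a custom ``get_loss`` to ``fit()``. A hedged sketch; the import path and the ridge penalty are assumptions, only the ``(pred, y)`` call signature comes from this diff:

```python
from torchcast.state_space.base import default_get_loss  # assumed module path

def make_get_loss(model):
    def get_loss(pred, y):
        nll = default_get_loss(pred, y)          # -pred.log_prob(y).mean()
        ridge = sum((p ** 2).sum() for p in model.parameters())
        return nll + 0.01 * ridge
    return get_loss

# usage (hypothetical): model.fit(y, get_loss=make_get_loss(model))
```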