[ENH] Add ROI association decoder (#536)
* Start work on new decoder.

* More stuff.

* Leverage input validation in Decoders and fix new decoder.

* Add test.

* Fix up docstring.

* Add new decoder to example.

* Fix style issue.

* Add new decoder to decoding.rst.
tsalo authored Jul 14, 2021
1 parent 3164be4 commit 929584b
Showing 10 changed files with 239 additions and 27 deletions.
21 changes: 19 additions & 2 deletions docs/methods/decoding.rst
@@ -12,7 +12,7 @@ Discrete decoding approaches characterize subsets of the Dataset or regions of i

The BrainMap approach
`````````````````````
:func:`nimare.decode.discrete.BrainMapDecoder`, :func:`nimare.decode.discrete.brainmap_decode`
:class:`nimare.decode.discrete.BrainMapDecoder`, :func:`nimare.decode.discrete.brainmap_decode`

The BrainMap method for discrete functional decoding performs both forward and reverse inference
using an annotated coordinate-based database and a target sample of studies within that database.
@@ -78,7 +78,7 @@ of foci associated with each study in the database.

The Neurosynth approach
```````````````````````
:func:`nimare.decode.discrete.NeurosynthDecoder`, :func:`nimare.decode.discrete.neurosynth_decode`
:class:`nimare.decode.discrete.NeurosynthDecoder`, :func:`nimare.decode.discrete.neurosynth_decode`

The Neurosynth method for discrete functional decoding performs both forward and reverse inference
using an annotated coordinate-based database and a target sample of studies within that database.
@@ -135,12 +135,29 @@ of any given experiment including a given label.
- Convert p-value to signed z-value using :math:`P(s^{+}|l^{-})` to determine sign.


The Neurosynth ROI association approach
```````````````````````````````````````
:class:`nimare.decode.discrete.ROIAssociationDecoder`

Neurosynth's ROI association approach is quite simple, but it has been used in at least one publication, `Margulies et al. (2016)`_.

This approach uses the following steps to calculate label-wise correlation values (sketched in code below):

1. Specify a region of interest (ROI) image in the same space as the Dataset.
2. Generate modeled activation (MA) maps for all studies in the Dataset with coordinates.
3. Average the MA values within the ROI to get a study-wise MA regressor.
4. Correlate the MA regressor with study-wise annotation values (e.g., tf-idf values).

.. _Margulies et al. (2016): https://doi.org/10.1073/pnas.1608282113
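
A minimal sketch of this procedure, assuming ``dset`` is a :class:`nimare.dataset.Dataset`
with coordinates and tf-idf annotations and ``roi_img`` is a binary NIfTI image in the same
space as the Dataset's masker (the kernel choice and variable names are illustrative, not the
decoder's exact internals):

.. code-block:: python

    import numpy as np

    from nimare.meta.kernel import MKDAKernel

    # Generate modeled activation (MA) maps for every study with coordinates.
    kernel = MKDAKernel()
    ma_maps = kernel.transform(dset, return_type="array")  # (n_studies, n_voxels)

    # Average MA values within the ROI to get one regressor value per study.
    roi_vec = np.squeeze(dset.masker.transform(roi_img)) > 0
    ma_regressor = ma_maps[:, roi_vec].mean(axis=1)

    # Correlate the study-wise regressor with each label's annotation values.
    # This assumes the annotation rows are in the same study order as the MA maps.
    id_cols = ("id", "study_id", "contrast_id")
    labels = [c for c in dset.annotations.columns if c not in id_cols]
    correlations = {
        label: np.corrcoef(ma_regressor, dset.annotations[label].values)[0, 1]
        for label in labels
    }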


The GC-LDA approach
```````````````````
:func:`nimare.decode.discrete.gclda_decode_roi`

The GC-LDA approach sums :math:`P(topic|voxel)` weights within the region of interest to produce topic-wise weights.
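
In array terms, this amounts to something like the following (a sketch with assumed variable
names, not the function's exact implementation):

.. code-block:: python

    import numpy as np

    # Assumed inputs: `p_topic_g_voxel` is an (n_voxels, n_topics) array of
    # P(topic|voxel) weights from a trained GC-LDA model, and `roi_mask` is a
    # boolean (n_voxels,) array marking voxels inside the region of interest.
    topic_weights = p_topic_g_voxel[roi_mask, :].sum(axis=0)

    # Optionally normalize so the weights sum to one across topics.
    topic_weights = topic_weights / topic_weights.sum()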


Continuous decoding
-------------------

31 changes: 22 additions & 9 deletions examples/04_decoding/plot_discrete_decoders.py
@@ -15,8 +15,7 @@

import nibabel as nib
import numpy as np
import pandas as pd
from nilearn.plotting import plot_roi, plot_stat_map
from nilearn.plotting import plot_roi

import nimare
from nimare.decode import discrete
@@ -34,7 +33,7 @@

###############################################################################
# Create a region of interest
# -------------------------------------------
# ---------------------------

# First we'll make an ROI
arr = np.zeros(dset.masker.mask_img.shape, int)
@@ -46,21 +45,35 @@
ids = dset.get_studies_by_mask(mask_img)

###############################################################################
# Decode an ROI image using the Neurosynth method
# -----------------------------------------------
# Decode an ROI image using the BrainMap method
# ---------------------------------------------

# Run the decoder
decoder = discrete.NeurosynthDecoder(correction=None)
decoder = discrete.BrainMapDecoder(correction=None)
decoder.fit(dset)
decoded_df = decoder.transform(ids=ids)
decoded_df.sort_values(by="probReverse", ascending=False).head()

###############################################################################
# Decode an ROI image using the BrainMap method
# -----------------------------------------------
# Decode an ROI image using the Neurosynth chi-square method
# ----------------------------------------------------------

# Run the decoder
decoder = discrete.BrainMapDecoder(correction=None)
decoder = discrete.NeurosynthDecoder(correction=None)
decoder.fit(dset)
decoded_df = decoder.transform(ids=ids)
decoded_df.sort_values(by="probReverse", ascending=False).head()

###############################################################################
# Decode an ROI image using the Neurosynth ROI association method
# ---------------------------------------------------------------

# This method decodes the ROI image directly, rather than comparing subsets of the
# Dataset like the other two decoders.
decoder = discrete.ROIAssociationDecoder(mask_img)
decoder.fit(dset)

# The `transform` method doesn't take any parameters, because the ROI was provided
# when the decoder was initialized.
decoded_df = decoder.transform()

decoded_df.sort_values(by="r", ascending=False).head()
45 changes: 41 additions & 4 deletions nimare/base.py
@@ -419,6 +419,32 @@ class Decoder(NiMAREBase):

__id_cols = ["id", "study_id", "contrast_id"]

def _validate_input(self, dataset, drop_invalid=True):
"""Search for, and validate, required inputs as necessary."""
if not hasattr(dataset, "slice"):
raise ValueError(
f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}."
)

if self._required_inputs:
data = dataset.get(self._required_inputs, drop_invalid=drop_invalid)
# Do not overwrite existing inputs_ attribute.
# This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates
# in the same object.
# It makes the *strong* assumption that required inputs will not change within an
# Estimator across fit calls, so all fields of inputs_ will be overwritten instead of
# retaining outdated fields from previous fit calls.
if not hasattr(self, "inputs_"):
self.inputs_ = {}

for k, v in data.items():
if v is None:
raise ValueError(
f"Estimator {self.__class__.__name__} requires input dataset to contain "
f"{k}, but no matching data were found."
)
self.inputs_[k] = v

def _preprocess_input(self, dataset):
"""Select features for model based on requested features and feature_group.
@@ -429,32 +455,42 @@ def _preprocess_input(self, dataset):
if self.feature_group is not None:
if not self.feature_group.endswith("__"):
self.feature_group += "__"
feature_names = dataset.annotations.columns.values
feature_names = self.inputs_["annotations"].columns.values
feature_names = [f for f in feature_names if f.startswith(self.feature_group)]
if self.features is not None:
features = [f.split("__")[-1] for f in feature_names if f in self.features]
else:
features = feature_names
else:
if self.features is None:
features = dataset.annotations.columns.values
features = self.inputs_["annotations"].columns.values
else:
features = self.features

features = [f for f in features if f not in self.__id_cols]
n_features_orig = len(features)

# At least one study in the dataset must have each label
counts = (dataset.annotations[features] > self.frequency_threshold).sum(0)
counts = (self.inputs_["annotations"][features] > self.frequency_threshold).sum(0)
features = counts[counts > 0].index.tolist()
if not len(features):
raise Exception("No features identified in Dataset!")
elif len(features) < n_features_orig:
LGR.info(f"Retaining {len(features)}/({n_features_orig} features.")

self.features_ = features

def fit(self, dataset):
def fit(self, dataset, drop_invalid=True):
"""Fit Decoder to Dataset.
Parameters
----------
dataset : :obj:`nimare.dataset.Dataset`
Dataset object to analyze.
drop_invalid : :obj:`bool`, optional
Whether to automatically ignore any studies without the required data or not.
Default is True.
Returns
-------
@@ -471,6 +507,7 @@ def fit(self, dataset):
Selection of features based on requested features and feature group is performed in
`Decoder._preprocess_input`.
"""
self._validate_input(dataset, drop_invalid=drop_invalid)
self._preprocess_input(dataset)
self._fit(dataset)

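With this change, any concrete Decoder can control how studies missing required inputs are handled at fit time. A sketch of the intended call, reusing the `discrete` and `dset` names from the example above (assumed, not part of this diff):

decoder = discrete.BrainMapDecoder(correction=None)
decoder.fit(dset, drop_invalid=True)   # silently drop studies missing required inputs
decoder.fit(dset, drop_invalid=False)  # studies missing required data are not ignored; an error is raised instead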
7 changes: 6 additions & 1 deletion nimare/dataset.py
@@ -401,6 +401,11 @@ def get(self, dict_, drop_invalid=True):
temp = [self.coordinates.loc[self.coordinates["id"] == id_] for id_ in self.ids]
# Replace empty DataFrames with Nones
temp = [t if t.size else None for t in temp]
elif vals[0] == "annotations":
# Break DataFrame down into a list of study-specific DataFrames
temp = [self.annotations.loc[self.annotations["id"] == id_] for id_ in self.ids]
# Replace empty DataFrames with Nones
temp = [t if t.size else None for t in temp]
else:
raise ValueError(f"Input '{vals[0]}' not understood.")

@@ -420,7 +425,7 @@

for k in results:
results[k] = [results[k][i] for i in keep_idx]
if dict_.get(k, [None])[0] == "coordinates":
if dict_.get(k, [None])[0] in ("coordinates", "annotations"):
results[k] = pd.concat(results[k])

return results
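
With annotations now supported, a Decoder's required inputs can be collected in one call. A minimal sketch of the expected usage (the dictionary keys mirror `_required_inputs` and are illustrative):

# Assumed: `dset` is a nimare.dataset.Dataset with an annotations DataFrame.
inputs = dset.get(
    {"coordinates": ("coordinates", None), "annotations": ("annotations", None)},
    drop_invalid=True,
)
# Both entries come back as single DataFrames, restricted to studies that have
# every requested input; studies missing any input are dropped when drop_invalid=True.
coordinates_df = inputs["coordinates"]
annotations_df = inputs["annotations"]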
24 changes: 20 additions & 4 deletions nimare/decode/continuous.py
@@ -134,6 +134,11 @@ class CorrelationDecoder(Decoder):
evaluate results based on significance.
"""

_required_inputs = {
"coordinates": ("coordinates", None),
"annotations": ("annotations", None),
}

def __init__(
self,
feature_group=None,
@@ -183,6 +188,8 @@ def _fit(self, dataset):
feature_ids = dataset.get_studies_by_label(
labels=[feature], label_threshold=self.frequency_threshold
)
feature_ids = sorted(list(set(feature_ids).intersection(self.inputs_["id"])))

LGR.info(
f"Decoding {feature} ({i}/{len(self.features)}): {len(feature_ids)}/"
f"{len(dataset.ids)} studies"
@@ -191,7 +198,7 @@
# This seems like a somewhat inelegant solution
# Check if the meta method is a pairwise estimator
if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args:
nonfeature_ids = sorted(list(set(dataset.ids) - set(feature_ids)))
nonfeature_ids = sorted(list(set(self.inputs_["id"]) - set(feature_ids)))
nonfeature_dset = dataset.slice(nonfeature_ids)
self.meta_estimator.fit(feature_dset, nonfeature_dset)
else:
@@ -247,6 +254,10 @@ class CorrelationDistributionDecoder(Decoder):
evaluate results based on significance.
"""

_required_inputs = {
"annotations": ("annotations", None),
}

def __init__(
self,
feature_group=None,
@@ -258,9 +269,9 @@ def __init__(
self.feature_group = feature_group
self.features = features
self.frequency_threshold = frequency_threshold
self.target_image = target_image
self.memory_limit = memory_limit
self.results = None
self._required_inputs["images"] = ("image", target_image)

def _fit(self, dataset):
"""Collect sets of maps from the Dataset corresponding to each requested feature.
@@ -286,8 +297,13 @@ def _fit(self, dataset):
feature_ids = dataset.get_studies_by_label(
labels=[feature], label_threshold=self.frequency_threshold
)
test_imgs = dataset.get_images(ids=feature_ids, imtype=self.target_image)
test_imgs = list(filter(None, test_imgs))
selected_ids = sorted(list(set(feature_ids).intersection(self.inputs_["id"])))
selected_id_idx = [
i_id for i_id, id_ in enumerate(self.inputs_["id"]) if id_ in selected_ids
]
test_imgs = [
img for i_img, img in enumerate(self.inputs_["images"]) if i_img in selected_id_idx
]
if len(test_imgs):
feature_arr = safe_transform(
test_imgs,
