From 7e094f2eb70d75c3d3b455636303a9f2934b44ec Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 6 Jan 2022 16:31:09 -0500 Subject: [PATCH] Use scikit-learn for LDAModel (#607) * Drop LDA. * Delete 03_lda.py * Use resources instead of test data. * Bundle sklearn model in new class. * More updates. * Fix. * Add test. * Update 03_plot_lda.py * Improve things. * Link to CBMA documentation. * Update 03_plot_lda.py * Update api.rst * More cleanup. * Remove Annotator class. The Annotator and Annotation classes will be developed in #618. * Update 03_plot_lda.py * Remove undefined base class. --- docs/api.rst | 1 - examples/03_annotation/03_lda.py | 43 ---- examples/03_annotation/03_plot_lda.py | 57 ++++++ nimare/annotate/lda.py | 276 +++++++++----------------- nimare/extract/__init__.py | 2 - nimare/extract/extract.py | 49 ----- nimare/tests/test_annotate_lda.py | 26 +++ nimare/tests/test_utils.py | 25 --- nimare/utils.py | 26 --- 9 files changed, 181 insertions(+), 324 deletions(-) delete mode 100644 examples/03_annotation/03_lda.py create mode 100644 examples/03_annotation/03_plot_lda.py create mode 100644 nimare/tests/test_annotate_lda.py diff --git a/docs/api.rst b/docs/api.rst index b110c8267..9d5420fbb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -218,7 +218,6 @@ For more information about fetching data from the internet, see :ref:`fetching t extract.fetch_neuroquery extract.fetch_neurosynth extract.download_nidm_pain - extract.download_mallet extract.download_cognitive_atlas extract.download_abstracts extract.download_peaks2maps_model diff --git a/examples/03_annotation/03_lda.py b/examples/03_annotation/03_lda.py deleted file mode 100644 index def5b69ab..000000000 --- a/examples/03_annotation/03_lda.py +++ /dev/null @@ -1,43 +0,0 @@ -# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- -# ex: set sts=4 ts=4 sw=4 et: -""" - -.. _annotations_lda: - -================== -LDA topic modeling -================== - -This example trains a latent Dirichlet allocation model with MALLET using abstracts from -Neurosynth. -""" -import os - -from nimare import annotate, extract -from nimare.dataset import Dataset -from nimare.utils import get_resource_path - -############################################################################### -# Load dataset with abstracts -# --------------------------- -dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) - -############################################################################### -# Download MALLET -# --------------- -# MALLET is a Java toolbox for natural language processing. -# While LDA is implemented in some Python libraries, like scikit-learn, -# MALLET appears to do a better job at LDA than other tools. -# LDAModel will download MALLET automatically, but it's included here for clarity. -mallet_dir = extract.download_mallet() - -############################################################################### -# Run model -# --------- -# This may take some time, so we won't run it in the gallery. -model = annotate.lda.LDAModel(dset.texts, text_column="abstract", n_iters=5) -model.fit() -model.save("lda_model.pkl.gz") - -# Let's remove the model now that you know how to generate it. 
-os.remove("lda_model.pkl.gz") diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py new file mode 100644 index 000000000..8dc5d2a6c --- /dev/null +++ b/examples/03_annotation/03_plot_lda.py @@ -0,0 +1,57 @@ +# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- +# ex: set sts=4 ts=4 sw=4 et: +""" + +.. _annotations_lda: + +================== +LDA topic modeling +================== + +This example trains a latent Dirichlet allocation model with scikit-learn +using abstracts from Neurosynth. +""" +import os + +import pandas as pd + +from nimare import annotate +from nimare.dataset import Dataset +from nimare.utils import get_resource_path + +############################################################################### +# Load dataset with abstracts +# --------------------------- +dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) + +############################################################################### +# Initialize LDA model +# -------------------- +model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract") + +############################################################################### +# Run model +# --------- +new_dset = model.fit(dset) + +############################################################################### +# View results +# ------------ +# This DataFrame is very large, so we will only show a slice of it. +new_dset.annotations[new_dset.annotations.columns[:10]].head(10) + +############################################################################### +# Given that this DataFrame is very wide (many terms), we will transpose it before presenting it. +model.distributions_["p_topic_g_word_df"].T.head(10) + +############################################################################### +n_top_terms = 10 +top_term_df = model.distributions_["p_topic_g_word_df"].T +temp_df = top_term_df.copy() +top_term_df = pd.DataFrame(columns=top_term_df.columns, index=range(n_top_terms)) +top_term_df.index.name = "Token" +for col in top_term_df.columns: + top_tokens = temp_df.sort_values(by=col, ascending=False).index.tolist()[:n_top_terms] + top_term_df.loc[:, col] = top_tokens + +top_term_df diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index c9075fa36..4ad9e46a1 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -1,219 +1,139 @@ -"""Topic modeling with latent Dirichlet allocation via MALLET.""" -import logging -import os -import shutil - -import numpy as np +"""Topic modeling with latent Dirichlet allocation.""" import pandas as pd +from sklearn.decomposition import LatentDirichletAllocation -from .. import references -from ..base import NiMAREBase -from ..due import due -from ..extract import download_mallet, utils -from ..utils import _run_shell_command - -LGR = logging.getLogger(__name__) +from nimare import references +from nimare.annotate.text import generate_counts +from nimare.base import NiMAREBase +from nimare.due import due @due.dcite(references.LDA, description="Introduces LDA.") -@due.dcite(references.MALLET, description="Citation for MALLET toolbox") @due.dcite( references.LDAMODEL, description="First use of LDA for automated annotation of neuroimaging literature.", ) class LDAModel(NiMAREBase): - """Perform topic modeling using Latent Dirichlet Allocation (LDA). + """Generate a latent Dirichlet allocation (LDA) topic model. 
-    Build an LDA [1]_ topic model with the Java toolbox MALLET [2]_, as
-    performed in [3]_.
+    This class is a light wrapper around scikit-learn tools for tokenization and LDA.
 
     Parameters
     ----------
-    text_df : :obj:`pandas.DataFrame`
-        A pandas DataFrame with two columns ('id' and text_column) containing
-        article text.
+    n_topics : :obj:`int`
+        Number of topics for the topic model. This corresponds to the model's ``n_components``
+        parameter. Must be an integer >= 1.
+    max_iter : :obj:`int`, optional
+        Maximum number of iterations to use during model fitting. Default is 1000.
+    alpha : :obj:`float` or None, optional
+        The ``alpha`` value for the model. This corresponds to the model's ``doc_topic_prior``
+        parameter. Default is None, which evaluates to ``1 / n_topics``, as was used in [2]_.
+    beta : :obj:`float` or None, optional
+        The ``beta`` value for the model. This corresponds to the model's ``topic_word_prior``
+        parameter. If None, it evaluates to ``1 / n_topics``.
+        Default is 0.001, which was used in [2]_.
     text_column : :obj:`str`, optional
-        Name of column in text_df that contains text. Default is 'abstract'.
-    n_topics : :obj:`int`, optional
-        Number of topics to generate. Default=50.
-    n_iters : :obj:`int`, optional
-        Number of iterations to run in training topic model. Default=1000.
-    alpha : :obj:`float` or 'auto', optional
-        The Dirichlet prior on the per-document topic distributions.
-        Default: auto, which calculates 50 / n_topics, based on Poldrack et al.
-        (2012).
-    beta : :obj:`float`, optional
-        The Dirichlet prior on the per-topic word distribution. Default: 0.001,
-        based on Poldrack et al. (2012).
+        The source of text to use for the model. This should correspond to an existing column
+        in the :py:attr:`~nimare.dataset.Dataset.texts` attribute. Default is "abstract".
 
     Attributes
     ----------
-    commands_ : :obj:`list` of :obj:`str`
-        List of MALLET commands called to fit model.
+    model : :obj:`~sklearn.decomposition.LatentDirichletAllocation`
+        The underlying scikit-learn LDA model, which is trained when :meth:`fit` is called.
+
+    Notes
+    -----
+    Latent Dirichlet allocation was first developed in [1]_, and was first applied to
+    neuroimaging articles in [2]_.
 
     References
     ----------
     .. [1] Blei, David M., Andrew Y. Ng, and Michael I. Jordan. "Latent
            dirichlet allocation." Journal of machine Learning research 3.Jan
            (2003): 993-1022.
-    .. [2] McCallum, Andrew Kachites. "Mallet: A machine learning for language
-           toolkit." (2002).
-    .. [3] Poldrack, Russell A., et al. "Discovering relations between mind,
+    .. [2] Poldrack, Russell A., et al. "Discovering relations between mind,
            brain, and mental disorders using topic mapping." PLoS computational
            biology 8.10 (2012): e1002707. https://doi.org/10.1371/journal.pcbi.1002707
 
     See Also
     --------
-    nimare.extract.download_mallet : This function will be called automatically to download MALLET.
+    :class:`~sklearn.feature_extraction.text.CountVectorizer`: Used to build a vocabulary of terms
+        and their associated counts from texts in the ``text_column`` column of the Dataset's
+        ``texts`` attribute.
+    :class:`~sklearn.decomposition.LatentDirichletAllocation`: Used to train the LDA model.
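+
+    Examples
+    --------
+    A minimal usage sketch (not executed here), assuming ``dset`` is a
+    :obj:`~nimare.dataset.Dataset` with abstracts in its ``texts`` attribute:
+
+    >>> from nimare.annotate.lda import LDAModel
+    >>> model = LDAModel(n_topics=10, max_iter=100, text_column="abstract")
+    >>> new_dset = model.fit(dset)  # doctest: +SKIP
+    >>> new_dset.annotations.filter(regex="LDA").head()  # doctest: +SKIP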
""" - def __init__( - self, text_df, text_column="abstract", n_topics=50, n_iters=1000, alpha="auto", beta=0.001 - ): - mallet_dir = download_mallet() - mallet_bin = os.path.join(mallet_dir, "bin/mallet") - - model_dir = utils._get_dataset_dir("mallet_model") - text_dir = os.path.join(model_dir, "texts") - - if not os.path.isdir(model_dir): - os.mkdir(model_dir) - - if alpha == "auto": - alpha = 50.0 / n_topics - elif not isinstance(alpha, float): - raise ValueError('Argument alpha must be float or "auto"') - - self.params = {"n_topics": n_topics, "n_iters": n_iters, "alpha": alpha, "beta": beta} - self.model_dir = model_dir - - # Check for presence of text files and convert if necessary - if not os.path.isdir(text_dir): - LGR.info("Texts folder not found. Creating text files...") - os.mkdir(text_dir) - - # Remove rows with empty text cells - orig_ids = text_df["id"].tolist() - text_df = text_df.dropna(subset=[text_column]) - keep_ids = text_df["id"].tolist() - - if len(keep_ids) != len(orig_ids): - LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies") - - for id_ in text_df["id"].values: - text = text_df.loc[text_df["id"] == id_, text_column].values[0] - with open(os.path.join(text_dir, str(id_) + ".txt"), "w") as fo: - fo.write(text) - - # Run MALLET topic modeling - LGR.info("Compiling MALLET commands...") - import_str = ( - f"{mallet_bin} import-dir " - f"--input {text_dir} " - f"--output {model_dir}/topic-input.mallet " - "--keep-sequence " - "--remove-stopwords" + def __init__(self, n_topics, max_iter=1000, alpha=None, beta=0.001, text_column="abstract"): + self.n_topics = n_topics + self.max_iter = max_iter + self.alpha = alpha + self.beta = beta + self.text_column = text_column + + self.model = LatentDirichletAllocation( + n_components=n_topics, + max_iter=max_iter, + learning_method="online", + doc_topic_prior=alpha, + topic_word_prior=beta, ) - train_str = ( - f"{mallet_bin} train-topics " - f"--input {model_dir}/topic-input.mallet " - f"--num-topics {self.params['n_topics']} " - f"--output-doc-topics {model_dir}/doc_topics.txt " - f"--topic-word-weights-file {model_dir}/topic_word_weights.txt " - f"--num-iterations {self.params['n_iters']} " - f"--output-model {model_dir}/saved_model.mallet " - "--random-seed 1 " - f"--alpha {self.params['alpha']} " - f"--beta {self.params['beta']}" - ) - self.commands_ = [import_str, train_str] + def fit(self, dset): + """Fit the LDA topic model to text from a Dataset. - def fit(self): - """ - Fit LDA model to corpus. + Parameters + ---------- + dset : :obj:`~nimare.dataset.Dataset` + A Dataset with, at minimum, text available in the ``self.text_column`` column of its + :py:attr:`~nimare.dataset.Dataset.texts` attribute. + + Returns + ------- + dset : :obj:`~nimare.dataset.Dataset` + A new Dataset with an updated :py:attr:`~nimare.dataset.Dataset.annotations` attribute. Attributes ---------- - p_topic_g_doc_ : :obj:`numpy.ndarray` - Probability of each topic given a document - p_word_g_topic_ : :obj:`numpy.ndarray` - Probability of each word given a topic + distributions_ : :obj:`dict` + A dictionary containing additional distributions produced by the model, including: + + - ``p_topic_g_word``: :obj:`numpy.ndarray` of shape (n_topics, n_tokens) + containing the topic-term weights for the model. + - ``p_topic_g_word_df``: :obj:`pandas.DataFrame` of shape (n_topics, n_tokens) + containing the topic-term weights for the model. 
""" - LGR.info("Generating topics...") - _run_shell_command(self.commands_[0]) - _run_shell_command(self.commands_[1]) - - # Read in and convert doc_topics and topic_keys. - topic_names = [f"topic_{i:03d}" for i in range(self.params["n_topics"])] - - # doc_topics: Topic weights for each paper. - # The conversion here is pretty ugly at the moment. - # First row should be dropped. First column is row number and can be used - # as the index. - # Second column is 'file: /full/path/to/id.txt' <-- Parse to get id. - # After that, odd columns are topic numbers and even columns are the - # weights for the topics in the preceding column. These columns are sorted - # on an individual id basis by the weights. - n_cols = (2 * self.params["n_topics"]) + 1 - dt_df = pd.read_csv( - os.path.join(self.model_dir, "doc_topics.txt"), - delimiter="\t", - skiprows=1, - header=None, - index_col=0, + counts_df = generate_counts( + dset.texts, + text_column=self.text_column, + tfidf=False, + max_df=len(dset.ids) - 2, + min_df=2, ) - dt_df = dt_df[dt_df.columns[:n_cols]] - - # Get ids from filenames - dt_df[1] = dt_df[1].apply(self._clean_str) - - # Put weights (even cols) and topics (odd cols) into separate dfs. - weights_df = dt_df[dt_df.columns[2::2]] - weights_df.index = dt_df[1] - weights_df.columns = range(self.params["n_topics"]) - - topics_df = dt_df[dt_df.columns[1:-1:2]] - topics_df.index = dt_df[1] - topics_df.columns = range(self.params["n_topics"]) - - # Sort columns in weights_df separately for each row using topics_df. - sorters_df = topics_df.apply(self._get_sort, axis=1) - weights = weights_df.values - sorters = np.vstack(sorters_df.values) - # there has to be a better way to do this. - for i in range(sorters.shape[0]): - weights[i, :] = weights[i, sorters[i, :]] - - # Define topic names (e.g., topic_000) - p_topic_g_doc_df = pd.DataFrame(columns=topic_names, data=weights, index=dt_df[1]) - p_topic_g_doc_df.index.name = "id" - self.p_topic_g_doc_ = p_topic_g_doc_df.values - self.p_topic_g_doc_df_ = p_topic_g_doc_df - - # Topic word weights - p_word_g_topic_df = pd.read_csv( - os.path.join(self.model_dir, "topic_word_weights.txt"), - dtype=str, - keep_default_na=False, - na_values=[], - sep="\t", - header=None, - names=["topic", "word", "weight"], + vocabulary = counts_df.columns.tolist() + count_values = counts_df.values + study_ids = counts_df.index.tolist() + # TODO: LDA50__1_word1_word2_word3 + topic_names = [f"LDA{self.n_topics}__{i + 1}" for i in range(self.n_topics)] + + doc_topic_weights = self.model.fit_transform(count_values) + doc_topic_weights_df = pd.DataFrame( + index=study_ids, + columns=topic_names, + data=doc_topic_weights, ) - p_word_g_topic_df["weight"] = p_word_g_topic_df["weight"].astype(float) - p_word_g_topic_df["topic"] = p_word_g_topic_df["topic"].astype(int) - p_word_g_topic_df = p_word_g_topic_df.pivot(index="topic", columns="word", values="weight") - p_word_g_topic_df = p_word_g_topic_df.div(p_word_g_topic_df.sum(axis=1), axis=0) - self.p_word_g_topic_ = p_word_g_topic_df.values - self.p_word_g_topic_df_ = p_word_g_topic_df - - # Remove all temporary files (text files, model, and outputs). 
- shutil.rmtree(self.model_dir) - - def _clean_str(self, string): - return os.path.basename(os.path.splitext(string)[0]) - - def _get_sort(self, lst): - return [i[0] for i in sorted(enumerate(lst), key=lambda x: x[1])] + topic_word_weights = self.model.components_ + topic_word_weights_df = pd.DataFrame( + index=topic_names, + columns=vocabulary, + data=topic_word_weights, + ) + self.distributions_ = { + "p_topic_g_word": topic_word_weights, + "p_topic_g_word_df": topic_word_weights_df, + } + + annotations = dset.annotations.copy() + annotations = pd.merge(annotations, doc_topic_weights_df, left_on="id", right_index=True) + new_dset = dset.copy() + new_dset.annotations = annotations + return new_dset diff --git a/nimare/extract/__init__.py b/nimare/extract/__init__.py index 709854be8..71afe7526 100644 --- a/nimare/extract/__init__.py +++ b/nimare/extract/__init__.py @@ -3,7 +3,6 @@ from .extract import ( download_abstracts, download_cognitive_atlas, - download_mallet, download_nidm_pain, download_peaks2maps_model, fetch_neuroquery, @@ -12,7 +11,6 @@ __all__ = [ "download_nidm_pain", - "download_mallet", "download_cognitive_atlas", "download_abstracts", "download_peaks2maps_model", diff --git a/nimare/extract/extract.py b/nimare/extract/extract.py index b61b19ed4..066232bd2 100644 --- a/nimare/extract/extract.py +++ b/nimare/extract/extract.py @@ -305,55 +305,6 @@ def download_nidm_pain(data_dir=None, overwrite=False): return data_dir -def download_mallet(data_dir=None, overwrite=False): - """Download the MALLET toolbox for LDA topic modeling. - - .. versionadded:: 0.0.2 - - Parameters - ---------- - data_dir : :obj:`pathlib.Path` or :obj:`str`, optional - Path where data should be downloaded. By default, files are downloaded in home directory. - overwrite : :obj:`bool`, optional - Whether to overwrite existing files or not. Default is False. - - Returns - ------- - data_dir : :obj:`str` - Updated data directory pointing to MALLET files. - """ - url = "http://mallet.cs.umass.edu/dist/mallet-2.0.7.tar.gz" - - temp_dataset_name = "mallet__temp" - temp_data_dir = _get_dataset_dir(temp_dataset_name, data_dir=data_dir) - - dataset_name = "mallet" - data_dir = temp_data_dir.replace(temp_dataset_name, dataset_name) - - desc_file = op.join(data_dir, "description.txt") - if op.isfile(desc_file) and overwrite is False: - shutil.rmtree(temp_data_dir) - return data_dir - - mallet_file = op.join(temp_data_dir, op.basename(url)) - _download_zipped_file(url, mallet_file) - - with tarfile.open(mallet_file) as tf: - tf.extractall(path=temp_data_dir) - - os.rename(op.join(temp_data_dir, "mallet-2.0.7"), data_dir) - - os.remove(mallet_file) - shutil.rmtree(temp_data_dir) - - with open(desc_file, "w") as fo: - fo.write("The MALLET toolbox for latent Dirichlet allocation.") - - LGR.debug(f"Dataset moved to {data_dir}") - - return data_dir - - def download_cognitive_atlas(data_dir=None, overwrite=False): """Download Cognitive Atlas ontology and extract IDs and relationships. 
diff --git a/nimare/tests/test_annotate_lda.py b/nimare/tests/test_annotate_lda.py new file mode 100644 index 000000000..550c3e7fc --- /dev/null +++ b/nimare/tests/test_annotate_lda.py @@ -0,0 +1,26 @@ +"""Test nimare.annotate.lda (LDA).""" +import numpy as np +import pandas as pd + +from nimare import annotate + + +def test_lda(testdata_laird): + """A smoke test for LDA.""" + N_TOPICS = 5 + model = annotate.lda.LDAModel( + n_topics=N_TOPICS, + max_iter=100, + text_column="abstract", + ) + new_dset = model.fit(testdata_laird) + topic_columns = [c for c in new_dset.annotations.columns if c.startswith("LDA")] + assert len(topic_columns) == N_TOPICS + + assert hasattr(model, "distributions_") + assert "p_topic_g_word" in model.distributions_.keys() + assert isinstance(model.distributions_["p_topic_g_word"], np.ndarray) + assert model.distributions_["p_topic_g_word"].shape[0] == N_TOPICS + assert "p_topic_g_word_df" in model.distributions_.keys() + assert isinstance(model.distributions_["p_topic_g_word_df"], pd.DataFrame) + assert model.distributions_["p_topic_g_word_df"].shape[0] == N_TOPICS diff --git a/nimare/tests/test_utils.py b/nimare/tests/test_utils.py index bcce85955..73ec07f01 100644 --- a/nimare/tests/test_utils.py +++ b/nimare/tests/test_utils.py @@ -2,7 +2,6 @@ import logging import os import os.path as op -import time import nibabel as nib import numpy as np @@ -184,27 +183,3 @@ def test_mm2vox(): img = utils.get_template(space="mni152_2mm", mask=None) aff = img.affine assert np.array_equal(utils.mm2vox(test, aff), true) - - -def test_run_shell_command(caplog): - """Test _run_shell_command.""" - with caplog.at_level(logging.INFO): - utils._run_shell_command("echo 'output'") - assert "output" in caplog.text - - # Check that the exception is registered as such - with pytest.raises(Exception) as execinfo: - utils._run_shell_command("echo 'Error!' 1>&2;exit 64") - assert "Error!" in str(execinfo.value) - - # Check that the function actually waits until the command completes - dur = 3 - start = time.time() - with caplog.at_level(logging.INFO): - utils._run_shell_command(f"echo 'hi';sleep {dur}s;echo 'bye'") - end = time.time() - - assert "hi" in caplog.text - assert "bye" in caplog.text - duration = end - start - assert duration >= dur diff --git a/nimare/utils.py b/nimare/utils.py index aeebbd73d..33a7a2776 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -5,7 +5,6 @@ import os import os.path as op import re -import subprocess from functools import wraps from tempfile import mkstemp @@ -937,28 +936,3 @@ def _boolean_unmask(data_array, bool_array): unmasked_data[bool_array] = data_array unmasked_data = unmasked_data.T return unmasked_data - - -def _run_shell_command(command, env=None): - """Run a given command with certain environment variables set.""" - merged_env = os.environ - if env: - merged_env.update(env) - - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - env=merged_env, - ) - while True: - line = process.stdout.readline() - line = str(line, "utf-8")[:-1] - LGR.info(line) - if line == "" and process.poll() is not None: - break - - if process.returncode != 0: - stderr_line = str(process.stderr.read(), "utf-8")[:-1] - raise Exception(f"Non zero return code: {process.returncode}\n{command}\n\n{stderr_line}")
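
Usage after this patch: a minimal sketch of the new scikit-learn-based API, with the old
MALLET-based call (taken from the deleted example above) shown for contrast. Here ``dset``
is assumed to be a Dataset with abstracts available, as in the new example.

    from nimare import annotate

    # Old API (removed): constructed from a texts DataFrame and fit in place.
    # model = annotate.lda.LDAModel(dset.texts, text_column="abstract", n_iters=5)
    # model.fit()

    # New API: construct with model parameters, then fit on a Dataset.
    # fit() returns a copy of the Dataset with one "LDA{n_topics}__{i}" topic-weight
    # column per topic added to its annotations attribute.
    model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract")
    new_dset = model.fit(dset)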