From 7e094f2eb70d75c3d3b455636303a9f2934b44ec Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 6 Jan 2022 16:31:09 -0500 Subject: [PATCH] Use scikit-learn for LDAModel (#607) * Drop LDA. * Delete 03_lda.py * Use resources instead of test data. * Bundle sklearn model in new class. * More updates. * Fix. * Add test. * Update 03_plot_lda.py * Improve things. * Link to CBMA documentation. * Update 03_plot_lda.py * Update api.rst * More cleanup. * Remove Annotator class. The Annotator and Annotation classes will be developed in #618. * Update 03_plot_lda.py * Remove undefined base class. --- docs/api.rst | 1 - examples/03_annotation/03_lda.py | 43 ---- examples/03_annotation/03_plot_lda.py | 57 ++++++ nimare/annotate/lda.py | 276 +++++++++----------------- nimare/extract/__init__.py | 2 - nimare/extract/extract.py | 49 ----- nimare/tests/test_annotate_lda.py | 26 +++ nimare/tests/test_utils.py | 25 --- nimare/utils.py | 26 --- 9 files changed, 181 insertions(+), 324 deletions(-) delete mode 100644 examples/03_annotation/03_lda.py create mode 100644 examples/03_annotation/03_plot_lda.py create mode 100644 nimare/tests/test_annotate_lda.py diff --git a/docs/api.rst b/docs/api.rst index b110c8267..9d5420fbb 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -218,7 +218,6 @@ For more information about fetching data from the internet, see :ref:`fetching t extract.fetch_neuroquery extract.fetch_neurosynth extract.download_nidm_pain - extract.download_mallet extract.download_cognitive_atlas extract.download_abstracts extract.download_peaks2maps_model diff --git a/examples/03_annotation/03_lda.py b/examples/03_annotation/03_lda.py deleted file mode 100644 index def5b69ab..000000000 --- a/examples/03_annotation/03_lda.py +++ /dev/null @@ -1,43 +0,0 @@ -# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- -# ex: set sts=4 ts=4 sw=4 et: -""" - -.. _annotations_lda: - -================== -LDA topic modeling -================== - -This example trains a latent Dirichlet allocation model with MALLET using abstracts from -Neurosynth. -""" -import os - -from nimare import annotate, extract -from nimare.dataset import Dataset -from nimare.utils import get_resource_path - -############################################################################### -# Load dataset with abstracts -# --------------------------- -dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) - -############################################################################### -# Download MALLET -# --------------- -# MALLET is a Java toolbox for natural language processing. -# While LDA is implemented in some Python libraries, like scikit-learn, -# MALLET appears to do a better job at LDA than other tools. -# LDAModel will download MALLET automatically, but it's included here for clarity. -mallet_dir = extract.download_mallet() - -############################################################################### -# Run model -# --------- -# This may take some time, so we won't run it in the gallery. -model = annotate.lda.LDAModel(dset.texts, text_column="abstract", n_iters=5) -model.fit() -model.save("lda_model.pkl.gz") - -# Let's remove the model now that you know how to generate it. 
-os.remove("lda_model.pkl.gz") diff --git a/examples/03_annotation/03_plot_lda.py b/examples/03_annotation/03_plot_lda.py new file mode 100644 index 000000000..8dc5d2a6c --- /dev/null +++ b/examples/03_annotation/03_plot_lda.py @@ -0,0 +1,57 @@ +# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- +# ex: set sts=4 ts=4 sw=4 et: +""" + +.. _annotations_lda: + +================== +LDA topic modeling +================== + +This example trains a latent Dirichlet allocation model with scikit-learn +using abstracts from Neurosynth. +""" +import os + +import pandas as pd + +from nimare import annotate +from nimare.dataset import Dataset +from nimare.utils import get_resource_path + +############################################################################### +# Load dataset with abstracts +# --------------------------- +dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json")) + +############################################################################### +# Initialize LDA model +# -------------------- +model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract") + +############################################################################### +# Run model +# --------- +new_dset = model.fit(dset) + +############################################################################### +# View results +# ------------ +# This DataFrame is very large, so we will only show a slice of it. +new_dset.annotations[new_dset.annotations.columns[:10]].head(10) + +############################################################################### +# Given that this DataFrame is very wide (many terms), we will transpose it before presenting it. +model.distributions_["p_topic_g_word_df"].T.head(10) + +############################################################################### +n_top_terms = 10 +top_term_df = model.distributions_["p_topic_g_word_df"].T +temp_df = top_term_df.copy() +top_term_df = pd.DataFrame(columns=top_term_df.columns, index=range(n_top_terms)) +top_term_df.index.name = "Token" +for col in top_term_df.columns: + top_tokens = temp_df.sort_values(by=col, ascending=False).index.tolist()[:n_top_terms] + top_term_df.loc[:, col] = top_tokens + +top_term_df diff --git a/nimare/annotate/lda.py b/nimare/annotate/lda.py index c9075fa36..4ad9e46a1 100644 --- a/nimare/annotate/lda.py +++ b/nimare/annotate/lda.py @@ -1,219 +1,139 @@ -"""Topic modeling with latent Dirichlet allocation via MALLET.""" -import logging -import os -import shutil - -import numpy as np +"""Topic modeling with latent Dirichlet allocation.""" import pandas as pd +from sklearn.decomposition import LatentDirichletAllocation -from .. import references -from ..base import NiMAREBase -from ..due import due -from ..extract import download_mallet, utils -from ..utils import _run_shell_command - -LGR = logging.getLogger(__name__) +from nimare import references +from nimare.annotate.text import generate_counts +from nimare.base import NiMAREBase +from nimare.due import due @due.dcite(references.LDA, description="Introduces LDA.") -@due.dcite(references.MALLET, description="Citation for MALLET toolbox") @due.dcite( references.LDAMODEL, description="First use of LDA for automated annotation of neuroimaging literature.", ) class LDAModel(NiMAREBase): - """Perform topic modeling using Latent Dirichlet Allocation (LDA). + """Generate a latent Dirichlet allocation (LDA) topic model. 
-    Build an LDA [1]_ topic model with the Java toolbox MALLET [2]_, as
-    performed in [3]_.
+    This class is a light wrapper around scikit-learn tools for tokenization and LDA.
 
     Parameters
     ----------
-    text_df : :obj:`pandas.DataFrame`
-        A pandas DataFrame with two columns ('id' and text_column) containing
-        article text.
+    n_topics : :obj:`int`
+        Number of topics for the topic model. This corresponds to the model's ``n_components``
+        parameter. Must be an integer >= 1.
+    max_iter : :obj:`int`, optional
+        Maximum number of iterations to use during model fitting. Default is 1000.
+    alpha : :obj:`float` or None, optional
+        The ``alpha`` value for the model. This corresponds to the model's ``doc_topic_prior``
+        parameter. Default is None, which evaluates to ``1 / n_topics``, as was used in [2]_.
+    beta : :obj:`float` or None, optional
+        The ``beta`` value for the model. This corresponds to the model's ``topic_word_prior``
+        parameter. If None, it evaluates to ``1 / n_topics``.
+        Default is 0.001, which was used in [2]_.
     text_column : :obj:`str`, optional
-        Name of column in text_df that contains text. Default is 'abstract'.
-    n_topics : :obj:`int`, optional
-        Number of topics to generate. Default=50.
-    n_iters : :obj:`int`, optional
-        Number of iterations to run in training topic model. Default=1000.
-    alpha : :obj:`float` or 'auto', optional
-        The Dirichlet prior on the per-document topic distributions.
-        Default: auto, which calculates 50 / n_topics, based on Poldrack et al.
-        (2012).
-    beta : :obj:`float`, optional
-        The Dirichlet prior on the per-topic word distribution. Default: 0.001,
-        based on Poldrack et al. (2012).
+        The source of text to use for the model. This should correspond to an existing column
+        in the :py:attr:`~nimare.dataset.Dataset.texts` attribute. Default is "abstract".
 
     Attributes
     ----------
-    commands_ : :obj:`list` of :obj:`str`
-        List of MALLET commands called to fit model.
+    model : :obj:`~sklearn.decomposition.LatentDirichletAllocation`
+        The underlying scikit-learn LDA model, which is trained when :meth:`fit` is called.
+
+    Notes
+    -----
+    Latent Dirichlet allocation was first developed in [1]_, and was first applied to
+    neuroimaging articles in [2]_.
 
     References
     ----------
     .. [1] Blei, David M., Andrew Y. Ng, and Michael I. Jordan. "Latent
            dirichlet allocation." Journal of machine Learning research 3.Jan
            (2003): 993-1022.
-    .. [2] McCallum, Andrew Kachites. "Mallet: A machine learning for language
-           toolkit." (2002).
-    .. [3] Poldrack, Russell A., et al. "Discovering relations between mind,
+    .. [2] Poldrack, Russell A., et al. "Discovering relations between mind,
            brain, and mental disorders using topic mapping." PLoS computational
            biology 8.10 (2012): e1002707. https://doi.org/10.1371/journal.pcbi.1002707
 
     See Also
     --------
-    nimare.extract.download_mallet : This function will be called automatically to download MALLET.
+    :class:`~sklearn.feature_extraction.text.CountVectorizer`: Used to build a vocabulary of terms
+        and their associated counts from texts in the ``text_column`` column of the Dataset's
+        ``texts`` attribute.
+    :class:`~sklearn.decomposition.LatentDirichletAllocation`: Used to train the LDA model.
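+
+    Examples
+    --------
+    A minimal usage sketch (not executed here), assuming ``dset`` is a
+    :obj:`~nimare.dataset.Dataset` with abstracts in its ``texts`` attribute:
+
+    >>> from nimare.annotate.lda import LDAModel
+    >>> model = LDAModel(n_topics=10, max_iter=100, text_column="abstract")
+    >>> new_dset = model.fit(dset)  # doctest: +SKIP
+    >>> new_dset.annotations.filter(regex="LDA").head()  # doctest: +SKIP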
""" - def __init__( - self, text_df, text_column="abstract", n_topics=50, n_iters=1000, alpha="auto", beta=0.001 - ): - mallet_dir = download_mallet() - mallet_bin = os.path.join(mallet_dir, "bin/mallet") - - model_dir = utils._get_dataset_dir("mallet_model") - text_dir = os.path.join(model_dir, "texts") - - if not os.path.isdir(model_dir): - os.mkdir(model_dir) - - if alpha == "auto": - alpha = 50.0 / n_topics - elif not isinstance(alpha, float): - raise ValueError('Argument alpha must be float or "auto"') - - self.params = {"n_topics": n_topics, "n_iters": n_iters, "alpha": alpha, "beta": beta} - self.model_dir = model_dir - - # Check for presence of text files and convert if necessary - if not os.path.isdir(text_dir): - LGR.info("Texts folder not found. Creating text files...") - os.mkdir(text_dir) - - # Remove rows with empty text cells - orig_ids = text_df["id"].tolist() - text_df = text_df.dropna(subset=[text_column]) - keep_ids = text_df["id"].tolist() - - if len(keep_ids) != len(orig_ids): - LGR.info(f"Retaining {len(keep_ids)}/{len(orig_ids)} studies") - - for id_ in text_df["id"].values: - text = text_df.loc[text_df["id"] == id_, text_column].values[0] - with open(os.path.join(text_dir, str(id_) + ".txt"), "w") as fo: - fo.write(text) - - # Run MALLET topic modeling - LGR.info("Compiling MALLET commands...") - import_str = ( - f"{mallet_bin} import-dir " - f"--input {text_dir} " - f"--output {model_dir}/topic-input.mallet " - "--keep-sequence " - "--remove-stopwords" + def __init__(self, n_topics, max_iter=1000, alpha=None, beta=0.001, text_column="abstract"): + self.n_topics = n_topics + self.max_iter = max_iter + self.alpha = alpha + self.beta = beta + self.text_column = text_column + + self.model = LatentDirichletAllocation( + n_components=n_topics, + max_iter=max_iter, + learning_method="online", + doc_topic_prior=alpha, + topic_word_prior=beta, ) - train_str = ( - f"{mallet_bin} train-topics " - f"--input {model_dir}/topic-input.mallet " - f"--num-topics {self.params['n_topics']} " - f"--output-doc-topics {model_dir}/doc_topics.txt " - f"--topic-word-weights-file {model_dir}/topic_word_weights.txt " - f"--num-iterations {self.params['n_iters']} " - f"--output-model {model_dir}/saved_model.mallet " - "--random-seed 1 " - f"--alpha {self.params['alpha']} " - f"--beta {self.params['beta']}" - ) - self.commands_ = [import_str, train_str] + def fit(self, dset): + """Fit the LDA topic model to text from a Dataset. - def fit(self): - """ - Fit LDA model to corpus. + Parameters + ---------- + dset : :obj:`~nimare.dataset.Dataset` + A Dataset with, at minimum, text available in the ``self.text_column`` column of its + :py:attr:`~nimare.dataset.Dataset.texts` attribute. + + Returns + ------- + dset : :obj:`~nimare.dataset.Dataset` + A new Dataset with an updated :py:attr:`~nimare.dataset.Dataset.annotations` attribute. Attributes ---------- - p_topic_g_doc_ : :obj:`numpy.ndarray` - Probability of each topic given a document - p_word_g_topic_ : :obj:`numpy.ndarray` - Probability of each word given a topic + distributions_ : :obj:`dict` + A dictionary containing additional distributions produced by the model, including: + + - ``p_topic_g_word``: :obj:`numpy.ndarray` of shape (n_topics, n_tokens) + containing the topic-term weights for the model. + - ``p_topic_g_word_df``: :obj:`pandas.DataFrame` of shape (n_topics, n_tokens) + containing the topic-term weights for the model. 
""" - LGR.info("Generating topics...") - _run_shell_command(self.commands_[0]) - _run_shell_command(self.commands_[1]) - - # Read in and convert doc_topics and topic_keys. - topic_names = [f"topic_{i:03d}" for i in range(self.params["n_topics"])] - - # doc_topics: Topic weights for each paper. - # The conversion here is pretty ugly at the moment. - # First row should be dropped. First column is row number and can be used - # as the index. - # Second column is 'file: /full/path/to/id.txt' <-- Parse to get id. - # After that, odd columns are topic numbers and even columns are the - # weights for the topics in the preceding column. These columns are sorted - # on an individual id basis by the weights. - n_cols = (2 * self.params["n_topics"]) + 1 - dt_df = pd.read_csv( - os.path.join(self.model_dir, "doc_topics.txt"), - delimiter="\t", - skiprows=1, - header=None, - index_col=0, + counts_df = generate_counts( + dset.texts, + text_column=self.text_column, + tfidf=False, + max_df=len(dset.ids) - 2, + min_df=2, ) - dt_df = dt_df[dt_df.columns[:n_cols]] - - # Get ids from filenames - dt_df[1] = dt_df[1].apply(self._clean_str) - - # Put weights (even cols) and topics (odd cols) into separate dfs. - weights_df = dt_df[dt_df.columns[2::2]] - weights_df.index = dt_df[1] - weights_df.columns = range(self.params["n_topics"]) - - topics_df = dt_df[dt_df.columns[1:-1:2]] - topics_df.index = dt_df[1] - topics_df.columns = range(self.params["n_topics"]) - - # Sort columns in weights_df separately for each row using topics_df. - sorters_df = topics_df.apply(self._get_sort, axis=1) - weights = weights_df.values - sorters = np.vstack(sorters_df.values) - # there has to be a better way to do this. - for i in range(sorters.shape[0]): - weights[i, :] = weights[i, sorters[i, :]] - - # Define topic names (e.g., topic_000) - p_topic_g_doc_df = pd.DataFrame(columns=topic_names, data=weights, index=dt_df[1]) - p_topic_g_doc_df.index.name = "id" - self.p_topic_g_doc_ = p_topic_g_doc_df.values - self.p_topic_g_doc_df_ = p_topic_g_doc_df - - # Topic word weights - p_word_g_topic_df = pd.read_csv( - os.path.join(self.model_dir, "topic_word_weights.txt"), - dtype=str, - keep_default_na=False, - na_values=[], - sep="\t", - header=None, - names=["topic", "word", "weight"], + vocabulary = counts_df.columns.tolist() + count_values = counts_df.values + study_ids = counts_df.index.tolist() + # TODO: LDA50__1_word1_word2_word3 + topic_names = [f"LDA{self.n_topics}__{i + 1}" for i in range(self.n_topics)] + + doc_topic_weights = self.model.fit_transform(count_values) + doc_topic_weights_df = pd.DataFrame( + index=study_ids, + columns=topic_names, + data=doc_topic_weights, ) - p_word_g_topic_df["weight"] = p_word_g_topic_df["weight"].astype(float) - p_word_g_topic_df["topic"] = p_word_g_topic_df["topic"].astype(int) - p_word_g_topic_df = p_word_g_topic_df.pivot(index="topic", columns="word", values="weight") - p_word_g_topic_df = p_word_g_topic_df.div(p_word_g_topic_df.sum(axis=1), axis=0) - self.p_word_g_topic_ = p_word_g_topic_df.values - self.p_word_g_topic_df_ = p_word_g_topic_df - - # Remove all temporary files (text files, model, and outputs). 
- shutil.rmtree(self.model_dir) - - def _clean_str(self, string): - return os.path.basename(os.path.splitext(string)[0]) - - def _get_sort(self, lst): - return [i[0] for i in sorted(enumerate(lst), key=lambda x: x[1])] + topic_word_weights = self.model.components_ + topic_word_weights_df = pd.DataFrame( + index=topic_names, + columns=vocabulary, + data=topic_word_weights, + ) + self.distributions_ = { + "p_topic_g_word": topic_word_weights, + "p_topic_g_word_df": topic_word_weights_df, + } + + annotations = dset.annotations.copy() + annotations = pd.merge(annotations, doc_topic_weights_df, left_on="id", right_index=True) + new_dset = dset.copy() + new_dset.annotations = annotations + return new_dset diff --git a/nimare/extract/__init__.py b/nimare/extract/__init__.py index 709854be8..71afe7526 100644 --- a/nimare/extract/__init__.py +++ b/nimare/extract/__init__.py @@ -3,7 +3,6 @@ from .extract import ( download_abstracts, download_cognitive_atlas, - download_mallet, download_nidm_pain, download_peaks2maps_model, fetch_neuroquery, @@ -12,7 +11,6 @@ __all__ = [ "download_nidm_pain", - "download_mallet", "download_cognitive_atlas", "download_abstracts", "download_peaks2maps_model", diff --git a/nimare/extract/extract.py b/nimare/extract/extract.py index b61b19ed4..066232bd2 100644 --- a/nimare/extract/extract.py +++ b/nimare/extract/extract.py @@ -305,55 +305,6 @@ def download_nidm_pain(data_dir=None, overwrite=False): return data_dir -def download_mallet(data_dir=None, overwrite=False): - """Download the MALLET toolbox for LDA topic modeling. - - .. versionadded:: 0.0.2 - - Parameters - ---------- - data_dir : :obj:`pathlib.Path` or :obj:`str`, optional - Path where data should be downloaded. By default, files are downloaded in home directory. - overwrite : :obj:`bool`, optional - Whether to overwrite existing files or not. Default is False. - - Returns - ------- - data_dir : :obj:`str` - Updated data directory pointing to MALLET files. - """ - url = "http://mallet.cs.umass.edu/dist/mallet-2.0.7.tar.gz" - - temp_dataset_name = "mallet__temp" - temp_data_dir = _get_dataset_dir(temp_dataset_name, data_dir=data_dir) - - dataset_name = "mallet" - data_dir = temp_data_dir.replace(temp_dataset_name, dataset_name) - - desc_file = op.join(data_dir, "description.txt") - if op.isfile(desc_file) and overwrite is False: - shutil.rmtree(temp_data_dir) - return data_dir - - mallet_file = op.join(temp_data_dir, op.basename(url)) - _download_zipped_file(url, mallet_file) - - with tarfile.open(mallet_file) as tf: - tf.extractall(path=temp_data_dir) - - os.rename(op.join(temp_data_dir, "mallet-2.0.7"), data_dir) - - os.remove(mallet_file) - shutil.rmtree(temp_data_dir) - - with open(desc_file, "w") as fo: - fo.write("The MALLET toolbox for latent Dirichlet allocation.") - - LGR.debug(f"Dataset moved to {data_dir}") - - return data_dir - - def download_cognitive_atlas(data_dir=None, overwrite=False): """Download Cognitive Atlas ontology and extract IDs and relationships. 
diff --git a/nimare/tests/test_annotate_lda.py b/nimare/tests/test_annotate_lda.py new file mode 100644 index 000000000..550c3e7fc --- /dev/null +++ b/nimare/tests/test_annotate_lda.py @@ -0,0 +1,26 @@ +"""Test nimare.annotate.lda (LDA).""" +import numpy as np +import pandas as pd + +from nimare import annotate + + +def test_lda(testdata_laird): + """A smoke test for LDA.""" + N_TOPICS = 5 + model = annotate.lda.LDAModel( + n_topics=N_TOPICS, + max_iter=100, + text_column="abstract", + ) + new_dset = model.fit(testdata_laird) + topic_columns = [c for c in new_dset.annotations.columns if c.startswith("LDA")] + assert len(topic_columns) == N_TOPICS + + assert hasattr(model, "distributions_") + assert "p_topic_g_word" in model.distributions_.keys() + assert isinstance(model.distributions_["p_topic_g_word"], np.ndarray) + assert model.distributions_["p_topic_g_word"].shape[0] == N_TOPICS + assert "p_topic_g_word_df" in model.distributions_.keys() + assert isinstance(model.distributions_["p_topic_g_word_df"], pd.DataFrame) + assert model.distributions_["p_topic_g_word_df"].shape[0] == N_TOPICS diff --git a/nimare/tests/test_utils.py b/nimare/tests/test_utils.py index bcce85955..73ec07f01 100644 --- a/nimare/tests/test_utils.py +++ b/nimare/tests/test_utils.py @@ -2,7 +2,6 @@ import logging import os import os.path as op -import time import nibabel as nib import numpy as np @@ -184,27 +183,3 @@ def test_mm2vox(): img = utils.get_template(space="mni152_2mm", mask=None) aff = img.affine assert np.array_equal(utils.mm2vox(test, aff), true) - - -def test_run_shell_command(caplog): - """Test _run_shell_command.""" - with caplog.at_level(logging.INFO): - utils._run_shell_command("echo 'output'") - assert "output" in caplog.text - - # Check that the exception is registered as such - with pytest.raises(Exception) as execinfo: - utils._run_shell_command("echo 'Error!' 1>&2;exit 64") - assert "Error!" in str(execinfo.value) - - # Check that the function actually waits until the command completes - dur = 3 - start = time.time() - with caplog.at_level(logging.INFO): - utils._run_shell_command(f"echo 'hi';sleep {dur}s;echo 'bye'") - end = time.time() - - assert "hi" in caplog.text - assert "bye" in caplog.text - duration = end - start - assert duration >= dur diff --git a/nimare/utils.py b/nimare/utils.py index aeebbd73d..33a7a2776 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -5,7 +5,6 @@ import os import os.path as op import re -import subprocess from functools import wraps from tempfile import mkstemp @@ -937,28 +936,3 @@ def _boolean_unmask(data_array, bool_array): unmasked_data[bool_array] = data_array unmasked_data = unmasked_data.T return unmasked_data - - -def _run_shell_command(command, env=None): - """Run a given command with certain environment variables set.""" - merged_env = os.environ - if env: - merged_env.update(env) - - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True, - env=merged_env, - ) - while True: - line = process.stdout.readline() - line = str(line, "utf-8")[:-1] - LGR.info(line) - if line == "" and process.poll() is not None: - break - - if process.returncode != 0: - stderr_line = str(process.stderr.read(), "utf-8")[:-1] - raise Exception(f"Non zero return code: {process.returncode}\n{command}\n\n{stderr_line}")
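
Usage after this patch: a minimal sketch of the new scikit-learn-based API, with the old
MALLET-based call (taken from the deleted example above) shown for contrast. Here ``dset``
is assumed to be a Dataset with abstracts available, as in the new example.

    from nimare import annotate

    # Old API (removed): constructed from a texts DataFrame and fit in place.
    # model = annotate.lda.LDAModel(dset.texts, text_column="abstract", n_iters=5)
    # model.fit()

    # New API: construct with model parameters, then fit on a Dataset.
    # fit() returns a copy of the Dataset with one "LDA{n_topics}__{i}" topic-weight
    # column per topic added to its annotations attribute.
    model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract")
    new_dset = model.fit(dset)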