From 9e7f14b3f3f0272e45ce747e61d99b10601b8382 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Tue, 10 Aug 2021 13:45:54 -0400 Subject: [PATCH] [ENH] Support new format for Neurosynth and NeuroQuery data (#535) * Initial work on new fetcher. * Get fetcher working. * Get conversion working. * Fix test_fetch_neurosynth. * Work on conversion tests. * Fix other test. Now I just need the test files. * Change entity order in new standard. * Fix path. * Update example, add test data, and fix tests. * Add fetch_neuroquery to API. * Clean things up a bit. * Drop ids txt file and add metadata tsv.gz file. * Update test files. * Make some metadata optional. * Add NeuroQuery stuff. * Generalize the download example. Ref #550. * Pin NeuroQuery to commit instead of branch. --- docs/api.rst | 1 + examples/01_datasets/download_neurosynth.py | 84 ++++-- nimare/extract/__init__.py | 12 +- nimare/extract/extract.py | 232 +++++++++++++--- nimare/io.py | 253 +++++++++++++----- nimare/resources/database_file_manifest.json | 142 ++++++++++ ...ta-neurosynth_version-7_coordinates.tsv.gz | Bin 0 -> 8671 bytes .../data-neurosynth_version-7_metadata.tsv.gz | Bin 0 -> 2111 bytes ...ms_source-abstract_type-tfidf_features.npz | Bin 0 -> 1442 bytes ...synth_version-7_vocab-terms_vocabulary.txt | 100 +++++++ .../tests/data/test_neurosynth_database.txt | 1 - .../tests/data/test_neurosynth_features.txt | 1 - nimare/tests/test_extract.py | 29 +- nimare/tests/test_io.py | 52 +++- 14 files changed, 756 insertions(+), 151 deletions(-) create mode 100644 nimare/resources/database_file_manifest.json create mode 100644 nimare/tests/data/data-neurosynth_version-7_coordinates.tsv.gz create mode 100644 nimare/tests/data/data-neurosynth_version-7_metadata.tsv.gz create mode 100644 nimare/tests/data/data-neurosynth_version-7_vocab-terms_source-abstract_type-tfidf_features.npz create mode 100644 nimare/tests/data/data-neurosynth_version-7_vocab-terms_vocabulary.txt delete mode 100644 nimare/tests/data/test_neurosynth_database.txt delete mode 100644 nimare/tests/data/test_neurosynth_features.txt diff --git a/docs/api.rst b/docs/api.rst index f043de9a3..1e94529f9 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -194,6 +194,7 @@ For more information about functional characterization analysis, see :ref:`Meta- :toctree: generated/ :template: function.rst + extract.fetch_neuroquery extract.fetch_neurosynth extract.download_nidm_pain extract.download_mallet diff --git a/examples/01_datasets/download_neurosynth.py b/examples/01_datasets/download_neurosynth.py index 7ab45dc38..c1803eaab 100644 --- a/examples/01_datasets/download_neurosynth.py +++ b/examples/01_datasets/download_neurosynth.py @@ -4,47 +4,95 @@ .. _datasets2: -============================================= - Download and convert the Neurosynth database -============================================= +================================================ + Download the Neurosynth or NeuroQuery databases +================================================ Download and convert the Neurosynth database (with abstracts) for analysis with NiMARE. -.. note:: - This will likely change as we work to shift database querying to a remote - database, rather than handling it locally with NiMARE. +.. warning:: + In August 2021, the Neurosynth database was reorganized according to a new file format. + As such, the ``fetch_neurosynth`` function for NiMARE versions before 0.0.10 will not work + with its default parameters. + In order to download the Neurosynth database in its older format using NiMARE <= 0.0.9, + do the following:: + nimare.extract.fetch_neurosynth( + url=( + "https://github.com/neurosynth/neurosynth-data/blob/" + "e8f27c4a9a44dbfbc0750366166ad2ba34ac72d6/current_data.tar.gz?raw=true" + ), + ) """ ############################################################################### # Start with the necessary imports # -------------------------------- import os - -from neurosynth.base.dataset import download +from pprint import pprint import nimare ############################################################################### # Download Neurosynth -# -------------------------------- +# ------------------- +# Neurosynth's data files are stored at https://github.com/neurosynth/neurosynth-data. out_dir = os.path.abspath("../example_data/") -if not os.path.isdir(out_dir): - os.mkdir(out_dir) +os.makedirs(out_dir, exist_ok=True) -if not os.path.isfile(os.path.join(out_dir, "database.txt")): - download(out_dir, unpack=True) +files = nimare.extract.fetch_neurosynth( + path=out_dir, + version="7", + overwrite=False, + source="abstract", + vocab="terms", +) +pprint(files) +neurosynth_db = files[0] ############################################################################### # Convert Neurosynth database to NiMARE dataset file # -------------------------------------------------- -dset = nimare.io.convert_neurosynth_to_dataset( - os.path.join(out_dir, "database.txt"), os.path.join(out_dir, "features.txt") +neurosynth_dset = nimare.io.convert_neurosynth_to_dataset( + database_file=neurosynth_db["database"], + annotations_files=neurosynth_db["features"], ) -dset.save(os.path.join(out_dir, "neurosynth_dataset.pkl.gz")) +neurosynth_dset.save(os.path.join(out_dir, "neurosynth_dataset.pkl.gz")) +print(neurosynth_dset) ############################################################################### # Add article abstracts to dataset # -------------------------------- -dset = nimare.extract.download_abstracts(dset, "tsalo006@fiu.edu") -dset.save(os.path.join(out_dir, "neurosynth_nimare_with_abstracts.pkl.gz")) +# This is only possible because Neurosynth uses PMIDs as study IDs. +# +# Make sure you replace the example email address with your own. +neurosynth_dset = nimare.extract.download_abstracts(neurosynth_dset, "example@example.edu") +neurosynth_dset.save(os.path.join(out_dir, "neurosynth_dataset_with_abstracts.pkl.gz")) + +############################################################################### +# Do the same with NeuroQuery +# --------------------------- +# NeuroQuery's data files are stored at https://github.com/neuroquery/neuroquery_data. +files = nimare.extract.fetch_neuroquery( + path=out_dir, + version="1", + overwrite=False, + source="combined", + vocab="neuroquery7547", + type="tfidf", +) +pprint(files) +neuroquery_db = files[0] + +# Note that the conversion function says "neurosynth". +# This is just for backwards compatibility. +neuroquery_dset = nimare.io.convert_neurosynth_to_dataset( + database_file=neuroquery_db["database"], + annotations_files=neuroquery_db["features"], +) +neuroquery_dset.save(os.path.join(out_dir, "neuroquery_dataset.pkl.gz")) +print(neuroquery_dset) + +# NeuroQuery also uses PMIDs as study IDs. +neuroquery_dset = nimare.extract.download_abstracts(neuroquery_dset, "example@example.edu") +neuroquery_dset.save(os.path.join(out_dir, "neuroquery_dataset_with_abstracts.pkl.gz")) diff --git a/nimare/extract/__init__.py b/nimare/extract/__init__.py index 24d7b86ad..709854be8 100644 --- a/nimare/extract/__init__.py +++ b/nimare/extract/__init__.py @@ -1,6 +1,4 @@ """Dataset and trained model downloading functions.""" -import warnings - from . import utils from .extract import ( download_abstracts, @@ -8,6 +6,7 @@ download_mallet, download_nidm_pain, download_peaks2maps_model, + fetch_neuroquery, fetch_neurosynth, ) @@ -17,14 +16,7 @@ "download_cognitive_atlas", "download_abstracts", "download_peaks2maps_model", + "fetch_neuroquery", "fetch_neurosynth", "utils", ] - -warnings.simplefilter("default") - -warnings.warn( - "{} is an experimental module under active development; use it at your " - "own risk.".format(__name__), - ImportWarning, -) diff --git a/nimare/extract/extract.py b/nimare/extract/extract.py index 2e4858a0f..64b1c36d1 100644 --- a/nimare/extract/extract.py +++ b/nimare/extract/extract.py @@ -1,10 +1,10 @@ """Tools for downloading datasets.""" +import itertools +import json import logging -import math import os import os.path as op import shutil -import sys import tarfile import time import zipfile @@ -19,6 +19,7 @@ from tqdm.auto import tqdm from ..dataset import Dataset +from ..utils import get_resource_path from .utils import ( _download_zipped_file, _expand_df, @@ -29,59 +30,214 @@ LGR = logging.getLogger(__name__) +VALID_ENTITIES = { + "coordinates.tsv.gz": ["data", "version"], + "metadata.tsv.gz": ["data", "version"], + "features.npz": ["data", "version", "vocab", "source", "type"], + "vocabulary.txt": ["data", "version", "vocab"], + "metadata.json": ["data", "version", "vocab"], + "keys.tsv": ["data", "version", "vocab"], +} + + +def _find_entities(filename, search_pairs, log=False): + """Search file for any matching patterns of entities.""" + # Convert all string-based kwargs to lists + search_pairs = {k: [v] if isinstance(v, str) else v for k, v in search_pairs.items()} + search_pairs = [[f"{k}-{v_i}" for v_i in v] for k, v in search_pairs.items()] + searches = list(itertools.product(*search_pairs)) + + if log: + LGR.info(f"Searching for any feature files matching the following criteria: {searches}") + + file_parts = filename.split("_") + suffix = file_parts[-1] + valid_entities_for_suffix = VALID_ENTITIES[suffix] + for search in searches: + temp_search = [term for term in search if term.split("-")[0] in valid_entities_for_suffix] + if all(term in file_parts for term in temp_search): + return True + + return False + + +def _fetch_database(search_pairs, database_url, out_dir, overwrite=False): + """Fetch generic database.""" + res_dir = get_resource_path() + with open(op.join(res_dir, "database_file_manifest.json"), "r") as fo: + database_file_manifest = json.load(fo) + + out_dir = op.abspath(out_dir) + os.makedirs(out_dir, exist_ok=True) + + found_databases = [] + found_files = [] + log = True + for database in database_file_manifest: + coordinates_file = database["coordinates"] + metadata_file = database["metadata"] + if not _find_entities(coordinates_file, search_pairs, log=log): + log = False + continue + + log = False + + feature_dicts = database["features"] + for feature_dict in feature_dicts: + features_file = feature_dict["features"] + # Other files associated with features have subset of entities, + # so unnecessary to search them if we assume that the hard-coded manifest is valid. + if not _find_entities(features_file, search_pairs): + continue + else: + out_coordinates_file = op.join(out_dir, coordinates_file) + out_metadata_file = op.join(out_dir, metadata_file) + out_feature_dict = {k: op.join(out_dir, v) for k, v in feature_dict.items()} + + db_found = [ + i_db + for i_db, db_dct in enumerate(found_databases) + if db_dct["coordinates"] == out_coordinates_file + ] + if len(db_found): + assert len(db_found) == 1 + + found_databases[db_found[0]]["features"].append(out_feature_dict) + else: + found_databases.append( + { + "coordinates": out_coordinates_file, + "metadata": out_metadata_file, + "features": [out_feature_dict], + } + ) + found_files += [coordinates_file, metadata_file, *feature_dict.values()] + + found_files = sorted(list(set(found_files))) + for found_file in found_files: + print(f"Downloading {found_file}", flush=True) + + url = op.join(database_url, found_file + "?raw=true") + out_file = op.join(out_dir, found_file) + + if op.isfile(out_file) and not overwrite: + print("File exists and overwrite is False. Skipping.") + continue + + with open(out_file, "wb") as fo: + u = urlopen(url) + + block_size = 8192 + while True: + buffer = u.read(block_size) + if not buffer: + break + fo.write(buffer) -def fetch_neurosynth(path=".", url=None, unpack=False): + return found_databases + + +def fetch_neurosynth(path=".", version="7", overwrite=False, **kwargs): """Download the latest data files from NeuroSynth. + .. versionchanged:: 0.0.10 + + * Use new format for Neurosynth and NeuroQuery files. + .. versionadded:: 0.0.4 + Parameters + ---------- + path : str + Location in which to save the retrieved data files. Defaults to current directory. + version : str or list, optional + The version to fetch. The default is "7" (Neurosynth's latest version). + overwrite : bool, optional + Whether to overwrite existing files or not. Default is False. + kwargs : dict, optional + Keyword arguments to select relevant feature files. + Valid kwargs include: source, vocab, type. + Each kwarg may be a string or a list of strings. + If no kwargs are provided, all feature files for the specified database version will be + downloaded. + + Returns + ------- + found_databases : :obj:`list` of :obj:`dict` + List of dictionaries indicating datasets downloaded. + Each list entry is a different database, containing a dictionary with three keys: + "coordinates", "metadata", and "features". "coordinates" and "metadata" will be filenames. + "features" will be a list of dictionaries, each containing "id", "vocab", and "features" + keys with associated files. + + Notes + ----- + This function was adapted from neurosynth.base.dataset.download(). + + Warning + ------- + Starting in version 0.0.10, this function operates on the new Neurosynth/NeuroQuery file + format. Old code using this function **will not work** with the new version. + """ + URL = ( + "https://github.com/neurosynth/neurosynth-data/blob/" + "753c058ac17c69db47689c1bb7c7a2598b443035/" + ) + + kwargs["data"] = "neurosynth" + kwargs["version"] = version + + found_databases = _fetch_database(kwargs, URL, path, overwrite=overwrite) + + return found_databases + + +def fetch_neuroquery(path=".", version="1", overwrite=False, **kwargs): + """Download the latest data files from NeuroQuery. + + .. versionadded:: 0.0.10 + Parameters ---------- path : str Location to save the retrieved data files. Defaults to current directory. + version : str or list, optional + The version to fetch. The default is "7" (Neurosynth's latest version). url : None or str, optional Specific URL to download. If not None, overrides URL to current data. - unpack : bool, optional - If True, unzips the data file post-download. Defaults to False. + If you want to fetch Neurosynth's data from *before* the 2021 reorganization, + you will need to use this argument. + kwargs + Keyword arguments to select relevant feature files. + Valid kwargs include: source, vocab, type. + Each kwarg may be a string or a list of strings. + If no kwargs are provided, all feature files for the specified database version will be + downloaded. + + Returns + ------- + found_databases : :obj:`list` of :obj:`dict` + List of dictionaries indicating datasets downloaded. + Each list entry is a different database, containing a dictionary with three keys: + "coordinates", "metadata", and "features". "coordinates" and "metadata" will be filenames. + "features" will be a list of dictionaries, each containing "id", "vocab", and "features" + keys with associated files. Notes ----- - This function was originally neurosynth.base.dataset.download(). + This function was adapted from neurosynth.base.dataset.download(). """ - if url is None: - url = ( - "https://github.com/neurosynth/neurosynth-data/blob/master/current_data.tar.gz?" - "raw=true" - ) - if os.path.exists(path) and os.path.isdir(path): - basename = os.path.basename(url).split("?")[0] - filename = os.path.join(path, basename) - else: - filename = path - - f = open(filename, "wb") - - u = urlopen(url) - file_size = int(u.headers["Content-Length"][0]) - print("Downloading the latest Neurosynth files: {0} bytes: {1}".format(url, file_size)) + URL = ( + "https://github.com/neuroquery/neuroquery_data/blob/" + "893f7c31ee616a2b05419fab8bcd7c40936f7e0a/data/" + ) - bytes_dl = 0 - block_size = 8192 - while True: - buffer = u.read(block_size) - if not buffer: - break - bytes_dl += len(buffer) - f.write(buffer) - p = float(bytes_dl) / file_size - status = r"{0} [{1:.2%}]".format(bytes_dl, p) - status = status + chr(8) * (len(status) + 1) - sys.stdout.write(status) + kwargs["data"] = "neuroquery" + kwargs["version"] = version - f.close() + found_databases = _fetch_database(kwargs, URL, path, overwrite=overwrite) - if unpack: - tarfile.open(filename, "r:gz").extractall(os.path.dirname(filename)) + return found_databases def download_nidm_pain(data_dir=None, overwrite=False, verbose=1): @@ -408,7 +564,7 @@ def download_peaks2maps_model(data_dir=None, overwrite=False, verbose=1): wrote = 0 for data in tqdm( r.iter_content(block_size), - total=math.ceil(total_size // block_size), + total=np.ceil(total_size // block_size), unit="MB", unit_scale=True, ): diff --git a/nimare/io.py b/nimare/io.py index 0bb8faeea..cf59c0cbc 100644 --- a/nimare/io.py +++ b/nimare/io.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd import requests +from scipy import sparse from .dataset import Dataset from .extract.utils import _get_dataset_dir @@ -25,8 +26,17 @@ } -def convert_neurosynth_to_dict(text_file, annotations_file=None): - """Convert Neurosynth database files to a dictionary. +def convert_neurosynth_to_dict( + coordinates_file, + metadata_file, + annotations_files=None, + feature_groups=None, +): + """Convert Neurosynth/NeuroQuery database files to a dictionary. + + .. versionchanged:: 0.0.10 + + * Use new format for Neurosynth and NeuroQuery files. .. versionchanged:: 0.0.9 @@ -34,81 +44,135 @@ def convert_neurosynth_to_dict(text_file, annotations_file=None): Parameters ---------- - text_file : :obj:`str` - Text file with Neurosynth's coordinates. Normally named "database.txt". - annotations_file : :obj:`str`, :obj:`dict`, or None, optional - Optional file(s) with Neurosynth's annotations. - If a string is provided, then labels from the file will be labeled with the feature group - "Neurosynth_TFIDF". - If a dictionary is provided, then keys must be feature groups and values must be filenames. - The standard Neurosynth annotations file is normally named "features.txt". + coordinates_file : :obj:`str` + TSV.GZ file with Neurosynth/NeuroQuery's coordinates. + metadata_file : :obj:`str` + TSV.GZ file with Neurosynth/NeuroQuery's metadata. + annotations_files : :obj:`dict`, :obj:`list` of :obj:`dict`, or None, optional + Optional file(s) with Neurosynth/NeuroQuery's annotations. + This should consist of a dictionary with two keys: "features" and "vocabulary". + "features" should have an NPZ file containing a sparse matrix of feature values. + "vocabulary" should have a TXT file containing labels. + The vocabulary corresponds to the columns of the feature matrix, while study IDs are + inferred from the metadata file, which MUST be in the same order as the features matrix. + Multiple sets of annotations may be provided, in which case "annotations_files" should be + a list of dictionaries. The appropriate name of each annotation set will be inferred from + the "features" filename, but this can be overwritten by using the "feature_groups" + parameter. + Default is None. + feature_groups : :obj:`list` of :obj:`str`, or None, optional + An optional list of names of annotation sets defined in "annotations_files". + This should only be used if "annotations_files" is used and the users wants to override + the automatically-extracted annotation set names. Default is None. Returns ------- dset_dict : :obj:`dict` - NiMARE-organized dictionary containing experiment information from text - files. + NiMARE-organized dictionary containing experiment information from text files. + + Warning + ------- + Starting in version 0.0.10, this function operates on the new Neurosynth/NeuroQuery file + format. Old code using this function **will not work** with the new version. """ - dset_df = pd.read_table(text_file) - if "space" not in dset_df.columns: + coords_df = pd.read_table(coordinates_file) + metadata_df = pd.read_table(metadata_file) + assert metadata_df["id"].is_unique, "Metadata file must have one row per ID." + + coords_df["id"] = coords_df["id"].astype(str) + metadata_df["id"] = metadata_df["id"].astype(str) + metadata_df = metadata_df.set_index("id", drop=False) + ids = metadata_df["id"].tolist() + + if "space" not in metadata_df.columns: LGR.warning("No 'space' column detected. Defaulting to 'UNKNOWN'.") - dset_df["space"] = "UNKNOWN" + metadata_df["space"] = "UNKNOWN" + + if isinstance(annotations_files, dict): + annotations_files = [annotations_files] - if isinstance(annotations_file, str): - annotations_file = {"Neurosynth_TFIDF": annotations_file} + if isinstance(feature_groups, str): + feature_groups = [feature_groups] - if annotations_file is not None: + # Load labels into a single DataFrame + if annotations_files is not None: label_dfs = [] - for feature_group, features_file in annotations_file.items(): - if feature_group.endswith("__"): - feature_group = feature_group[:-2] - - label_df = pd.read_table(features_file, index_col="pmid") - label_df.index = label_df.index.astype(str) - labels = label_df.columns - if not all("__" in label for label in labels): - labels = {label: f"{feature_group}__" + label for label in labels} - label_df = label_df.rename(columns=labels) - label_dfs.append(label_df) + if feature_groups is not None: + assert len(feature_groups) == len(annotations_files) + + for i_feature_group, annotations_dict in enumerate(annotations_files): + features_file = annotations_dict["features"] + vocabulary_file = annotations_dict["vocabulary"] + + vocab = re.findall("vocab-([a-zA-Z0-9]+)_", features_file)[0] + source = re.findall("source-([a-zA-Z0-9]+)_", features_file)[0] + value_type = re.findall("type-([a-zA-Z0-9]+)_", features_file)[0] + + if feature_groups is not None: + feature_group = feature_groups[i_feature_group] + feature_group = feature_group.rstrip("_") + "__" + else: + feature_group = f"{vocab}_{source}_{value_type}__" + + features = sparse.load_npz(features_file).todense() + vocab = np.loadtxt(vocabulary_file, dtype=str, delimiter="\t") + + labels = [feature_group + label for label in vocab] + + temp_label_df = pd.DataFrame(features, index=ids, columns=labels) + temp_label_df.index.name = "study_id" + + label_dfs.append(temp_label_df) label_df = pd.concat(label_dfs, axis=1) else: label_df = None - dset_df["id"] = dset_df["id"].astype(str) - - ids = dset_df["id"].unique() + # Compile (pseudo-)NIMADS-format dictionary dset_dict = {} - for sid in ids: - study_df = dset_df.loc[dset_df["id"] == sid] + for sid, study_metadata in metadata_df.iterrows(): + study_coords_df = coords_df.loc[coords_df["id"] == sid] study_dict = {} study_dict["metadata"] = {} - study_dict["metadata"]["authors"] = study_df["authors"].tolist()[0] - study_dict["metadata"]["journal"] = study_df["journal"].tolist()[0] - study_dict["metadata"]["year"] = study_df["year"].tolist()[0] - study_dict["metadata"]["title"] = study_df["title"].tolist()[0] + study_dict["metadata"]["authors"] = study_metadata.get("authors", "n/a") + study_dict["metadata"]["journal"] = study_metadata.get("journal", "n/a") + study_dict["metadata"]["year"] = study_metadata.get("year", "n/a") + study_dict["metadata"]["title"] = study_metadata.get("title", "n/a") study_dict["contrasts"] = {} study_dict["contrasts"]["1"] = {} + # Duplicate metadata across study and contrast levels study_dict["contrasts"]["1"]["metadata"] = {} - study_dict["contrasts"]["1"]["metadata"]["authors"] = study_df["authors"].tolist()[0] - study_dict["contrasts"]["1"]["metadata"]["journal"] = study_df["journal"].tolist()[0] - study_dict["contrasts"]["1"]["metadata"]["year"] = study_df["year"].tolist()[0] - study_dict["contrasts"]["1"]["metadata"]["title"] = study_df["title"].tolist()[0] + study_dict["contrasts"]["1"]["metadata"]["authors"] = study_metadata.get("authors", "n/a") + study_dict["contrasts"]["1"]["metadata"]["journal"] = study_metadata.get("journal", "n/a") + study_dict["contrasts"]["1"]["metadata"]["year"] = study_metadata.get("year", "n/a") + study_dict["contrasts"]["1"]["metadata"]["title"] = study_metadata.get("title", "n/a") study_dict["contrasts"]["1"]["coords"] = {} - study_dict["contrasts"]["1"]["coords"]["space"] = study_df["space"].tolist()[0] - study_dict["contrasts"]["1"]["coords"]["x"] = study_df["x"].tolist() - study_dict["contrasts"]["1"]["coords"]["y"] = study_df["y"].tolist() - study_dict["contrasts"]["1"]["coords"]["z"] = study_df["z"].tolist() + study_dict["contrasts"]["1"]["coords"]["space"] = study_metadata["space"] + study_dict["contrasts"]["1"]["coords"]["x"] = study_coords_df["x"].tolist() + study_dict["contrasts"]["1"]["coords"]["y"] = study_coords_df["y"].tolist() + study_dict["contrasts"]["1"]["coords"]["z"] = study_coords_df["z"].tolist() + if label_df is not None: study_dict["contrasts"]["1"]["labels"] = label_df.loc[sid].to_dict() + dset_dict[sid] = study_dict return dset_dict -def convert_neurosynth_to_json(text_file, out_file, annotations_file=None): - """Convert Neurosynth dataset text file to a NiMARE json file. +def convert_neurosynth_to_json( + coordinates_file, + metadata_file, + out_file, + annotations_files=None, + feature_groups=None, +): + """Convert Neurosynth/NeuroQuery dataset text file to a NiMARE json file. + + .. versionchanged:: 0.0.10 + + * Use new format for Neurosynth and NeuroQuery files. .. versionchanged:: 0.0.9 @@ -116,25 +180,54 @@ def convert_neurosynth_to_json(text_file, out_file, annotations_file=None): Parameters ---------- - text_file : :obj:`str` - Text file with Neurosynth's coordinates. Normally named "database.txt". + coordinates_file : :obj:`str` + TSV.GZ file with Neurosynth/NeuroQuery's coordinates and metadata. + metadata_file : :obj:`str` + TSV.GZ file with Neurosynth/NeuroQuery's metadata. out_file : :obj:`str` Output NiMARE-format json file. - annotations_file : :obj:`str`, :obj:`dict`, or None, optional - Optional file(s) with Neurosynth's annotations. - If a string is provided, then labels from the file will be labeled with the feature group - "Neurosynth_TFIDF". - If a dictionary is provided, then keys must be feature groups and values must be filenames. - The standard Neurosynth annotations file is normally named "features.txt". + annotations_files : :obj:`dict`, :obj:`list` of :obj:`dict`, or None, optional + Optional file(s) with Neurosynth/NeuroQuery's annotations. + This should consist of a dictionary with two keys: "features" and "vocabulary". + "features" should have an NPZ file containing a sparse matrix of feature values. + "vocabulary" should have a TXT file containing labels. + The vocabulary corresponds to the columns of the feature matrix, while study IDs are + inferred from the metadata file, which MUST be in the same order as the features matrix. + Multiple sets of annotations may be provided, in which case "annotations_files" should be + a list of dictionaries. The appropriate name of each annotation set will be inferred from + the "features" filename, but this can be overwritten by using the "feature_groups" + parameter. + Default is None. + feature_groups : :obj:`list` of :obj:`str`, or None, optional + An optional list of names of annotation sets defined in "annotations_files". + This should only be used if "annotations_files" is used and the users wants to override + the automatically-extracted annotation set names. Default is None. + + Warning + ------- + Starting in version 0.0.10, this function operates on the new Neurosynth/NeuroQuery file + format. Old code using this function **will not work** with the new version. """ - dset_dict = convert_neurosynth_to_dict(text_file, annotations_file) + dset_dict = convert_neurosynth_to_dict( + coordinates_file, metadata_file, annotations_files, feature_groups + ) with open(out_file, "w") as fo: json.dump(dset_dict, fo, indent=4, sort_keys=True) -def convert_neurosynth_to_dataset(text_file, annotations_file=None, target="mni152_2mm"): - """Convert Neurosynth database files into NiMARE Dataset. +def convert_neurosynth_to_dataset( + coordinates_file, + metadata_file, + annotations_files=None, + feature_groups=None, + target="mni152_2mm", +): + """Convert Neurosynth/NeuroQuery database files into NiMARE Dataset. + + .. versionchanged:: 0.0.10 + + * Use new format for Neurosynth and NeuroQuery files. .. versionchanged:: 0.0.9 @@ -142,24 +235,46 @@ def convert_neurosynth_to_dataset(text_file, annotations_file=None, target="mni1 Parameters ---------- - text_file : :obj:`str` - Text file with Neurosynth's coordinates. Normally named "database.txt". + coordinates_file : :obj:`str` + TSV.GZ file with Neurosynth/NeuroQuery's coordinates and metadata. + metadata_file : :obj:`str` + TSV.GZ file with Neurosynth/NeuroQuery's metadata. + annotations_files : :obj:`dict`, :obj:`list` of :obj:`dict`, or None, optional + Optional file(s) with Neurosynth/NeuroQuery's annotations. + This should consist of a dictionary with two keys: "features" and "vocabulary". + "features" should have an NPZ file containing a sparse matrix of feature values. + "vocabulary" should have a TXT file containing labels. + The vocabulary corresponds to the columns of the feature matrix, while study IDs are + inferred from the metadata file, which MUST be in the same order as the features matrix. + Multiple sets of annotations may be provided, in which case "annotations_files" should be + a list of dictionaries. The appropriate name of each annotation set will be inferred from + the "features" filename, but this can be overwritten by using the "feature_groups" + parameter. + Default is None. + feature_groups : :obj:`list` of :obj:`str`, or None, optional + An optional list of names of annotation sets defined in "annotations_files". + This should only be used if "annotations_files" is used and the users wants to override + the automatically-extracted annotation set names. + Default is None. target : {'mni152_2mm', 'ale_2mm'}, optional Target template space for coordinates. Default is 'mni152_2mm'. - annotations_file : :obj:`str`, :obj:`dict`, or None, optional - Optional file(s) with Neurosynth's annotations. - If a string is provided, then labels from the file will be labeled with the feature group - "Neurosynth_TFIDF". - If a dictionary is provided, then keys must be feature groups and values must be filenames. - The standard Neurosynth annotations file is normally named "features.txt". - Default is None. Returns ------- :obj:`nimare.dataset.Dataset` Dataset object containing experiment information from text_file. + + Warning + ------- + Starting in version 0.0.10, this function operates on the new Neurosynth/NeuroQuery file + format. Old code using this function **will not work** with the new version. """ - dset_dict = convert_neurosynth_to_dict(text_file, annotations_file) + dset_dict = convert_neurosynth_to_dict( + coordinates_file, + metadata_file, + annotations_files, + feature_groups, + ) return Dataset(dset_dict, target=target) diff --git a/nimare/resources/database_file_manifest.json b/nimare/resources/database_file_manifest.json new file mode 100644 index 000000000..82f2ab536 --- /dev/null +++ b/nimare/resources/database_file_manifest.json @@ -0,0 +1,142 @@ +[ + { + "coordinates": "data-neurosynth_version-3_coordinates.tsv.gz", + "metadata": "data-neurosynth_version-3_metadata.tsv.gz", + "features": [ + { + "features": "data-neurosynth_version-3_vocab-terms_source-abstract_type-tfidf_features.npz", + "vocabulary": "data-neurosynth_version-3_vocab-terms_vocabulary.txt" + } + ] + }, + { + "coordinates": "data-neurosynth_version-4_coordinates.tsv.gz", + "metadata": "data-neurosynth_version-4_metadata.tsv.gz", + "features": [ + { + "features": "data-neurosynth_version-4_vocab-terms_source-abstract_type-tfidf_features.npz", + "vocabulary": "data-neurosynth_version-4_vocab-terms_vocabulary.txt" + } + ] + }, + { + "coordinates": "data-neurosynth_version-5_coordinates.tsv.gz", + "metadata": "data-neurosynth_version-5_metadata.tsv.gz", + "features": [ + { + "features": "data-neurosynth_version-5_vocab-terms_source-abstract_type-tfidf_features.npz", + "vocabulary": "data-neurosynth_version-5_vocab-terms_vocabulary.txt" + } + ] + }, + { + "coordinates": "data-neurosynth_version-6_coordinates.tsv.gz", + "metadata": "data-neurosynth_version-6_metadata.tsv.gz", + "features": [ + { + "features": "data-neurosynth_version-6_vocab-terms_source-abstract_type-tfidf_features.npz", + "vocabulary": "data-neurosynth_version-6_vocab-terms_vocabulary.txt" + }, + { + "features": "data-neurosynth_version-6_vocab-LDA50_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-6_vocab-LDA50_vocabulary.txt", + "keys": "data-neurosynth_version-6_vocab-LDA50_keys.tsv", + "metadata": "data-neurosynth_version-6_vocab-LDA50_metadata.json" + }, + { + "features": "data-neurosynth_version-6_vocab-LDA100_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-6_vocab-LDA100_vocabulary.txt", + "keys": "data-neurosynth_version-6_vocab-LDA100_keys.tsv", + "metadata": "data-neurosynth_version-6_vocab-LDA100_metadata.json" + }, + { + "features": "data-neurosynth_version-6_vocab-LDA200_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-6_vocab-LDA200_vocabulary.txt", + "keys": "data-neurosynth_version-6_vocab-LDA200_keys.tsv", + "metadata": "data-neurosynth_version-6_vocab-LDA200_metadata.json" + }, + { + "features": "data-neurosynth_version-6_vocab-LDA400_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-6_vocab-LDA400_vocabulary.txt", + "keys": "data-neurosynth_version-6_vocab-LDA400_keys.tsv", + "metadata": "data-neurosynth_version-6_vocab-LDA400_metadata.json" + } + ] + }, + { + "coordinates": "data-neurosynth_version-7_coordinates.tsv.gz", + "metadata": "data-neurosynth_version-7_metadata.tsv.gz", + "features": [ + { + "features": "data-neurosynth_version-7_vocab-terms_source-abstract_type-tfidf_features.npz", + "vocabulary": "data-neurosynth_version-7_vocab-terms_vocabulary.txt" + }, + { + "features": "data-neurosynth_version-7_vocab-LDA50_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-7_vocab-LDA50_vocabulary.txt", + "keys": "data-neurosynth_version-7_vocab-LDA50_keys.tsv", + "metadata": "data-neurosynth_version-7_vocab-LDA50_metadata.json" + }, + { + "features": "data-neurosynth_version-7_vocab-LDA100_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-7_vocab-LDA100_vocabulary.txt", + "keys": "data-neurosynth_version-7_vocab-LDA100_keys.tsv", + "metadata": "data-neurosynth_version-7_vocab-LDA100_metadata.json" + }, + { + "features": "data-neurosynth_version-7_vocab-LDA200_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-7_vocab-LDA200_vocabulary.txt", + "keys": "data-neurosynth_version-7_vocab-LDA200_keys.tsv", + "metadata": "data-neurosynth_version-7_vocab-LDA200_metadata.json" + }, + { + "features": "data-neurosynth_version-7_vocab-LDA400_source-abstract_type-weight_features.npz", + "vocabulary": "data-neurosynth_version-7_vocab-LDA400_vocabulary.txt", + "keys": "data-neurosynth_version-7_vocab-LDA400_keys.tsv", + "metadata": "data-neurosynth_version-7_vocab-LDA400_metadata.json" + } + ] + }, + { + "coordinates": "data-neuroquery_version-1_coordinates.tsv.gz", + "metadata": "data-neuroquery_version-1_metadata.tsv.gz", + "features": [ + { + "features": "data-neuroquery_version-1_vocab-neuroquery7547_source-combined_type-tfidf_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery7547_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery7547_source-abstract_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery7547_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery7547_source-body_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery7547_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery7547_source-keywords_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery7547_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery7547_source-title_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery7547_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery156521_source-abstract_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery156521_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery156521_source-body_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery156521_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery156521_source-keywords_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery156521_vocabulary.txt" + }, + { + "features": "data-neuroquery_version-1_vocab-neuroquery156521_source-title_type-count_features.npz", + "vocabulary": "data-neuroquery_version-1_vocab-neuroquery156521_vocabulary.txt" + } + ] + } +] diff --git a/nimare/tests/data/data-neurosynth_version-7_coordinates.tsv.gz b/nimare/tests/data/data-neurosynth_version-7_coordinates.tsv.gz new file mode 100644 index 0000000000000000000000000000000000000000..991389a01807cf913be50719d8ace802d7c09d11 GIT binary patch literal 8671 zcmV<5At2r#iwFpZVi92i|72lwVJ&WDb#iZWd2V!QUv_13b7^mGEjM3dZ*OvBX>MV3 zWpgfcb9MlXoy(3kA&;2XHTTlLIGEGbL^GOMjOL&$5-G{mrvTef-Pm~Zs`Y)3C@{u= zG3MX@_4!}_{4f9cZ~x=p>5u>Z-~aXe&wu;p|LFhnf1dyQ{J(!{E~n)E6KZbt@&9-d zynp#8)_=I?2l_9xe>j0_PYRvO{nvxY)#oRqQQ-IuB3Pcu#qnE3E1DBVw2*%M5FWrmHAaVhV`WY$_#0tJj6Q*QlJ!KI8 z_a(<~3}TzxY;$(ha)GPQZ1c6iJd_Hh=tRs92|vDn*Jlv}l-^HPON&nk*lCa4zg^BW zj2^i=5_d<3K`w9h)hy175-IJ8jztGzBT6O3zkIh8M1De-N|O5`J-(tf_#?d4-i%_I zUk6Dr;r(kCmu4+x#jnq@Swof|F1#u?3iNgM|4(uku%PxqJ*w#fGb?^R>ggiWGdo#m z%mQB(j5aco;+#}}VSehvQEL|EP}n)2lz}P z0t;Hn`EA)SB%J6#m9r02Tt1k$Ra&GWGCJkk;&;7K#=QB0(57JKDE$GC(x{yvwj-7~XZ=;B50!D){QMn>A1-0HUp@+Bzz*33QFt++VnX=S)Q75bR&WsJ(;!?}o&|+^Qs0Fo zT1mK*!C7jKqtH9GxEyCIeI@JaUt9YAYuC$&Is&iN|H)&E27MD5s|Y45_zEShy;qUA zKKuhOVq^}QjZ4m0Uj6J1We&a#_7d1l@m${xO^s13G0>hqG_}M&McF|21)W&OMqsa= z2ENkiK9M_EzOBw8n5`fXINT(P93%D2MLe$o{P6~r$el~ePZ7id0Sg>cTM=vc&K+Y6 zjM+Jf-;KQ?-q7McV6i5$;k7Y>YE{BUwh`mQ*yUQs|#ZkZd{mqnfyvH}%b zGf{P;eu9V<#HN19T&7LbcFuc|i^wG%>oak@V>AzEMRd@&^~>0cMx%RoP(hul zQ1!!kx%-sHCcRa*6|Q^_M&j9lJ1C?c1Tt2zcBWs$jP;$m}#$tiUUGfRt_ zil#uFp<{j5XVTG%uA=C5uGu)CwdW1&OKBgMjjkeCtYH3t%BPbR1oS}MxM<|U&PACc zkIS&9GUs`R=-HcgERS_YFAr#7j?*#_2ON|SCyo%HfjN)KWG)#wFmk++WH+4v1}o^l zpe({>5+U>ht8$;Xe6{tkr#<_eYQzbyMcUhY&7?wKcSj57jU4*?8b|V+sM<_aofda| zQkHNnbI)9~xd?hJhJi|p64eeWI-R^u#c_;wfr16{49SwO^VF`YP(lC4g{s@?NR(A6 z7ODX$BIt^fj9fNyWV(hEOmx1bzDmZk*r<0rdunJb9L5YwrXL8o=8nV9>!pQF1rXTR~?80|)2@(7{SW<_7jfd|@Y( zJj)Evpb1PLP&VP+IXJ;_VxWMLO!y59R?vSyc|Hm}%>;eo^34c#kGD;6*)vqsvDMv< z*}{=ad91f&t~&|0At(`4=1h>fEVdb=ln-!WzHG7`VqA#nCQYM zY+W8L-9=;B62-oW#uE8j*l`QUvw3j(UNKt1@Bu;OJaI+EC^}Hby86!pc)ochfYl0? z5BMX-b#%i7d^)9Yyu!M0VC3S?$?aJEp<y9S4B5k zsP|DXkZ%IbD|ljweW$&HqT=_Z_T5Vo`$C&BFJpjM)Uk+BUKfm_#zs7Vs8ZG2Ft=ep zV&C@N^}wQzk(In)#fIe#6MMg;Er>~`y1OV}sTD+_V!?#DxEpS9CWymPR19jFt5JV} zmx`C6GkjU)9#IDHDNvlpV_zQ!8Z}Q6^)8UBcNhzqTdU&>SB+fRIdKggMJRxja;%Vm-?1( zX{x-S8=K9**BV9VUf*k)%HsO& zxB~4iVETY^FApb9TlZ#VFk3$zD1(K%YmVKAW8&P-MOj}Palz?eSgc_FfU?eZ zPuRRydXYAy(*#vhkGcsXn@F5lJ6`MM`FIJV9}k%0Baad}!0S{ThxGy-0t*E;3@D!) z7B5!tHFJ?-aG!aH`iNzWT}km$9-tYS9spbip1?RmG^WXs%L{mswxTH(FqCsFE5pH& zOBA}_R*!^~dyD|M@~#v8Q~F&$|BU_}p1dYO)44HXBj&xN1N1K1QB#Wg#mu|G?$i;# znaMepF~!T7$}=cv_G*@>^rXsRfD**6sqEEudQ{=aQMo6IgJ`49kDs_9e+T-Y9 z&Z|YDD(3c4J2UahYM4>+lX)%(e9@pZ^=lfPxqCTbfQt*Ylj2e`xv0LR-)_W96)O}? zT(vg7n+?8eCs(X*`hitZ?8z2qx|r=sz=;aaENdRdOM2b>!@`$^(Uxl{X)0SA<|ffZ zHV(oj1&aFAGHVqAZ$wU#oknlAQk3kweV@iGYi0Fq`P4F1WIbn3ru)d^7^$Q`Jpq;6 znLVA*mKjMb&EbgCm9dSfnC51wuy?o3cS&UhDw(uGRob4aCs5p-QlHIa z$OdTcGkEQIPdB!nPrZxURUSwCPgSSkz6&3H6;*rXak{AL z0>Q&aJ;bVonNnaHZ%K3Cbt%y8VWnaSp zOW!w63!lB(QK%PDrQ+wJ88#s{??T_q^es%8o0-g6`{wzU3`Xp#xN>wIIX?W0D~H|@ z^iw1EbcRO`-9&Tk^fMC>p1?%OyaHb?IbxK4DXinpjf0V+t8*~x*WAh>cH}bU8T0Fs z=MhYpTrlL5U=~N$>x2d`FwERPRWHt~?Ok$Clp`*m6k(Oe+3vQKfk#2}nN$e+mpQB9 z%Jz8T-u2m@u`9^BI!()-9rV-^pu1#8jqcga21Xex@minE1zb;Mbs$xoc7U^K9x1o? zhFDlx^QI~}%o24L$ifPrA6E6NiMRy14)krHqEw5EMl)1`I*1m>;bS+@8HWc)q};T* z%>K=%{$09PzllMAgJI+W>maStb3Ji!WGL)hel`K-->Trrdo@DQ!V34uBAD>Q0S>yb zFba2$PQ_>t)q|cciqgJbFi(a6j)q`?ym}K%29xgsH2zMkdgQp@ZeV%h?@XFUE`I_{ zln!S9u0Qs$qb$$wB7dxo$q9U9qJaq_vjkjFZ235P7FSpLxj)qUVxpWQPmsn##PoU7 zM9JeOe3Pvza;5CD{@8>y5b1L5%A_iQm_;wJH~dKXhbIfU6)UM^0v)8J&nwm5v9Ykv zWpZQR6|g)G7nQW(x-fV)PsrcdVuA58To)PC;kr1E`S(t(qCX64aH(f5f7jLTY&_Q0 zIdU}9FvpN2E*iODytE=o0nVC15aOE!u0z#J1- z(5+ZA=Uh5r)rK__Hr$T}#3W_ca;3er$`?Yo7FCJHF8C$Do)R5DT7M3L~THtbwAQO!bm)Uc>i-b4ix z73b;w)Gv_&6O~L<-V|S)Z+nc~b$Dc7O;j@tHJZoj(t98JDM1*q9k@iUWzW3bM2xPr zIu3a_Vw9gdZ&UV(qtuyhEqx&;=O#z*x6~f48rUN`qiK#@GID^TTU+iUMmJjGr8o=( z!G{{0$ay2j9_O-rjwdiTD@eJ9hjUGCNiIe%Zvq4vODl)8RvOr_NJ;rJJ4BBhcgLj> z-pIMnVp;}Mvx1o7Wh#Qg6(dJ8ebgn`UB%1i4Wa}S7&%_U-N0Z4JtYU3V?+j>5$w*# z>D4KEr?mKR;&u>AYv_2q=uGzk)4V8_*4_yl_cm2)L+jH&`a?hm!3vVsVZ5BBaKXs= zE#2|a^0Z#7p;1Ef~qCnzv({z%0=Tt_Y}*f6@97j#wOBbSTG=r?>! zy7z8PInkXmh-xy)xB_{KXD(8)JWoEVImyUe#P$|_mGiFTZQOD^1RZona6}+)vylu| zD~RbRy;Bv?QM7jeRka+Qncf=0LE*Woed2-@q)^$Zt;(mJ4J-|3LR~baVZ2&U>sMN_ zVVNeH3zyIzJ%?NnIWTg3FRg$M1}o@s38-&rN^>@@V#a~0E!@k^(YW!q6qrs&;5 zFN>-S;B3Mi*hU9!qfzJOMo<97Wt9zJ<2-Jn)U{9r)_}`qf$!UAeF?$$?Jr!MkGf@b z>l>>D!k5H?E{}fbW5E)htlc>72Hht2$GS`$9}ZtYh{hg^99pC+=(xvZ`7*hBZkjx< zh~}lDGRRwGeSG)hbrVM~WM0+=!C5Pqx2M~b^5*knsxhm0NPK~st?T&A!zP{(YdcN^xsIK$RY9BlKGJQT;v z1Zf2kh@TA@D&(u8O5mwISGsOdWDVwV?o2x2b)nLCobE#oDJR3uV`DEn(=bgPxbtQ>Pfr-uW8q$6&Clx ze3P(X!~8Q-RbDV>!V+CHuN;GUqN<51J0(e@+am*&Oca=?XrbP<;|!;6{94YbERWVy z?{P4x&$|jWd~!IX+%oi8;40naYE>?6M2}qU(1=8#z+7k&lr2zNPyz|ULxQA2m@QF< zXb|PK<~U*AhPfRRt#&^Yn^;t_ibL{(6&toPBYQ<6Gd#?~!G{yotgS#-RUNx~)S|ZQ z-hI}3Hq~|UrKlX|^1W~clhg-19X(UoL?sgiL^X|~fJB`N`(XbOF>k{9Ekb`aL>y76 zE+$48{+fAyq}hU_J)2WX^PES$#I*)Lf$K&j@|Z7uw)%9 zoHX@xC~U5 zNTgvC%6Hg^Tz)W5&6ecq#3dugnGTCS^^QjoOkl8roN)RwM~_p?co((%WhX=v)kLrk zFyQVbu!&kp*-uy`pA%3jq%#LIY1Gd;5j|0!6C5iL-v|Ocz&K+CGx%|`g)>(1J}2-q zEhlO{=ftT>PnhF#zt1T9Qtvdu*-WDdQ7o}qFV9cm0A@BkV|dqLN~}l}ZlvcYvqWJW znA$?lCjyi|<;?l%)P|2W|HWX6j$bZ^Ul3=&hp$I-9vMZ%`pgi>p@2z5PS`wI&fgv2 zF9y|XCLnAQMLANVT2{h?*6sBCfB|{sF14rU+rn=f^otAA7azQiQJhNqZUTQ%_)hc+ zCq6&{Uj&?2ZlY8ajK}>j7-&j1zNVp!3ovnBfa&+JM~B18w|Am&OS0t}J5iKO=YjMVaM zE!XWfK$IX_2xqmZOxr!!4#-a+dmtwmyaWOsko&qqX--_6VFUAyQ`*(f9l!$i|6HD*!+ zRX0j9R%7E(ZnIeCgM>a zTk7ExR;JQksBEIrPKou0NnJEHS=7IIi^zHwnXQ}tSTimyn)*CF_p(MNcmo!2v5`QX z`6-_1f?>WqsOtL6S*Ujxvr&>PJWtlD$b7b8>4S+?!!(GOZYqz#micHBMley{Br<2J zeKSd{zh7WzBz%@sQvLf?Q7&w1Z0V7=X@nfT;_M+ubCM{O-aUv|z$L~7!;pqR$BAc( z+@muwyI5Gl*BsU?ccSRx&J3~^FY<)zVhR1Cki&-K@clOx93OoWh~$VdU(!?MA6oD=5Loo)@U`Z z%D<1N`4Xt-jr^&wIv-Q;E93K_ERWe5X4bo3P<_&XA|^@m>?%D#o}VdQmz7Utg~!p& z-6*LJy&E(8T>4_%6ET5Q`FE#@PN2%Jyc5O=j7~LU3h&dcD?=v;SGzNNv?Pe``m8Kf zFV+k8bpGB;{YcB*m3Zz{SR@zkl$rQZFLx3%6xrlgEZa#fFsIg*Z~jD8ynHm>mQuYu zg(s|w-zO?h)1-Ei$2Hk@1Ls5mW?Z?oa%FbVRf7{d%)~d+;N|=>i4WvXhM?g5<~6HW zKQl=ds7!9NzzI_i)Vq}i&YCNIFyTH3RLjZR?TLxCrx^~iS}_bwoR3~e5YT;McNip0 zvE{F_?DMYs4k2`ZHVZ|*j2NLI@81Df=^*JM};O?fKao(+zjR;PN=OMg)xljN@|oT}|WjilI( zzbPqy!Dt^}wVD+)b@P*wTuWZN2R!dNQD`foYM&~mJA$SxU;Cvx_dQcNQDJHQ2$wxm zex;OJ`*QIzPG`gR=k0h1WfG z2`$v-=AjnQ*WPh2H0Hl<2GWBfW}&g^GDahcM8)Sz;d(!7Qx zx&uZR*Apj@KXZ|;gE|u6nd4__bJRiF!-QpP4%A#D81|f9djC2+VgwCU&l+J zVLY;lDU>R`)6up8d*V1BMcZV20_nw;wlW?cIpDkgVP)Muk!@uKfK8E4Wfpo{%+bwn z0pw&#)3TsCpU6fQIcz$30@;}D7d4M02hq(-y>c~sNPOnVc^QLm--wbdn2}?0Vw(#; z)-Z0nc~L0}B@DdiM8m26^K!zmKuQj^4}lXG$S7cJo?JzbR;H~tnvNXLSuHQyZ~*h% z;Lms+vAB#koU%Ij-h~G3E@dUi80HmJYNQ8SeJghbkK=P+_`a~7Bt?#4grC@o#_E@{ zK0e>Ikmob=`1{NFY(sON#+*N?-^TF+mwwwUpTM%eKcEzt`6ZgLwqqxdru6k<%@Lzl zN<64oeDe3PfFs3EJ9_xr!_;pthZA)Rhi@*5(gBrv7`lt=ofp3MXpa~@Wq2HB8%J-U zRks!!Cob=a5r0oh8;7rAex}wF2NY4+vL+Vy$xf%#Z@qMOwDuc`&Sbqi9M5sH!5+k6 zN$-u7s+!0~I$!qXBkvve=G@FaG%TT*?vSmtezs)9VBV+Ni|!md=sm=PzajWw4;2$KFi& za3&KCD|y-xnFjNQeeR)aNhigCoX>r6lLS-v7oYEGCFebam&lH@o|@7mOmUmOK({bz2CFN zB)LF#N#vkv?EBZBOkl?mA59~k-uDGK%&D@(N{sMpvrw_J-nc?+MCb}bW`DA7njoPOPomcweIl=azu*R)SimHzE ztP1&iYvhQz{oGf!7C^SX@-2vCadCM;lmSbbG_z_@My_6ptd(a;z!S2cs#=p+Hsh{n zC9IviDqx7);F9|bDB&VFb&Ex(9Z>b!Q5;U?htZCWr6(FY$SO4v%#|a2?gF zuJ$xy-4U9mBeb6?XHLd7zj~qqzGhu$4S9m11)?`|=85>UgeNX-h0Vq2iOMFb7{7@} z=PhZn@XaJKu++@gtt?4ojsoDe0H#w}nS)AjzjA_-1p*5c7ZfCk_8i)G$UmG542Ro^ zau&)p6IFLgYGL0h-YE)-(Ury}TOSsKK19ELQL5H!6ETHW<6YXRtY|1}T+#51{fIs3 zC{}QsUWu(t#aLcXRkjjJqVMy>B_qdaPfJ^y<`WpL;M#_l<;OjN-U>P+*y9tn8?_2p ztsq9d;-z1i%SP^#KTH$Ba#*STP@299DD^g;gak`2?O^E2Wq@-F5t)DP;h`bC>a#!)?H zTU5Es&$lF{1^ukvgcTc>cPvy1OEwI~{tQ`%QlRM%!ut0buNLCyCW<1_eY;hGF@)>9kw7jaWKxjX)XbZ(1=e6o9?2#lzy3Qda|v_V z+O66=#n`&^IXZp16%_}uQGv@vB!fZ~sSHF}B*wa6BZUpVnbK;Jp7g^`yWMC6`hy52 z_cy^vuaz}Ale!Qot+aVgWxSN69JzAtq+QFH<{NS{7go$uNio(G>4!CDSWMBfb3Glc3S>cZ0`7(zptQ%)(DKE>coQl0H&_uV%4Ok;3P1 zhhhAFW-zSS(85@{e=`kudaZW9zsEDsqA=Np=VhkA=oIL+(#tyC)aepX8E0f{Azw)7 zm_$(ki87iHr-=Be9*j$w7g}1HoYJh2c_K93;`!35BB6`nDVF4dOS|$bPE2LHY zN@lrX6{Kb^eTc#plD?NBRX8*&BeT@1qIQ;vG^HWP92+fN?aCAk1;4-I@HR=>Rm`@ov8dcQoc!OB=O7|@7yM1iz;W-&xYV;xgEuclHT ze&^Oi(v!|Bt(5~ES%@SPQU5N_{k1)wdeGyvwsvB3bfWhKjXwULv6Td3$@v>!=e2!Ag{l^~Ca$#Q3OZ5Aasbei2A zTj?piHQ7tJa`JvcXf>McMt7eX6X8}E2%wGyz$^RleAd-23*pI?FFeNi@0p3q6f2X+ zHp^I~;*d<7L8dHtc#2ag4!qQZr$nSu(*xXKD#|GFM2w0E2T1pLoJ2QpXe~EBCGMo8 z$vI3#YPG`SfC=fnj~+VhFzg=b;St&E)Lzs`Y+Zq@gbr0?w37zT5Tz7r@2V!Ngx$eZ z3FGAV(#N;2Tu6nLP=wRIRx9K1gFXSj+AuAOVDyj33T+4#U?v=A2r}iNgrA(hBMtpV zzt;~MVZ9MHIv-|XuivS4n%%QM!tl>t*lKrbp%=|1OtDf&P59~z{YUKd=}vBD{!K>g z87gD91RuV>Ib_CKLIu79u_9N;(j&=DNUd5Zrp07>QxDF|XvLL#3W;196LWEagw7Fq zwBqcU8n`<@+(LP-5WxBb-l7BthIvCCfh`DP8e(r6OezjNdVSJw!yP-}8{GXSaJL%a zAN}s1VHh^UT6Zsu19r;9QcKQ#AixWz9_F{)mWK!smd0*sE|-y7R1{(KuJm}oAAuNE zk*6-aQnqfAiJT2HSH)}9v)n2dC6Ec~{uTI;DP4jBnWj7g#dd@tRS9_nbh+8UGgwDl ziRRYCzVC^WXJMhC0T&$gLRS?J=U&}>48lf9oi3`{zLs0v@4lS$JI$~IQ~Iq!x~*E% zE96>c%H;`r4tf0O8(ym;-f3amcGBYK4lYCRoL5Ip2=lX+cFz1^47#c9uMA#=!UYmd z?33%3Y1z=zgi}hsV#7;*jE2%8Aq%=aXZ0LJH5DpKpbIAP!X!}2J@oYn<=I!TF&p%1 zUdHIb{0Ln@H}r_-ncl=0G5G5H8);c(v(d4-JZHVX;jCS2`I>y~Svj`Uin6%0EGjf2 zv9}~n4V(yud@t43rI-`>Qm9IuT!b?9BI6or6_boiB3G30rXJi0G$eEjaa=*78i_c* z;LJk(8GGLyn(~FlfcKEdGdzT_9JxqyC-awm*rw9(;O+G6vP%1AXWP$i2gLgQk!oN=*d0-t28^ z`nFrH@cid(K)2EAhV5T^UXML5Y;9HfhOXUdsA{c$#8y4{eW&x4I~~nAb&eZ^urXWp zj0v6IvJQoVBLA&Lb(_6@_LaUbYF-smkw)h~ z$NJw(uGbC2!@YPw`@(p16D`msimdca)WI_Ni{jg9b)n~l5BGe3>$kX}A+&v~Q4b!F zd}zilxJ8f%<_hhWdG|7Rwpnp`xj;h|!mhii4vA_* pirsPf{0r2M9Df;Ja4wfAlYMn}B>(>d009600{~oaL}A(z005V)65#*< literal 0 HcmV?d00001 diff --git a/nimare/tests/data/data-neurosynth_version-7_vocab-terms_source-abstract_type-tfidf_features.npz b/nimare/tests/data/data-neurosynth_version-7_vocab-terms_source-abstract_type-tfidf_features.npz new file mode 100644 index 0000000000000000000000000000000000000000..c99e73a5fe568a53619cf22e649d06b5a31bfa77 GIT binary patch literal 1442 zcmWIWW@Zs#U|`??Vnqh+xPP;{fh-M11_o{h5r)jXl+5JRV!gbAN=60&urN@T0i*;3 zX1@`CB`t9BM8L6tweu!L&52o(w{$_?HnVTf+2HO3lq`mKE1lt|!cTb!=>Gi3(|qNqGv-+Ak^WW0)i(Eb!SOQ$ma>$xY!| z%PYo2z7;bJrlhrAiV2v-x++d!m3#`HamXB5t>et4>>l&PJAO7uFa)6ZcSOW6}NE8G>DnJ0MFXDuA))0yea7>6Wu6yz*1Aa39@l`G+sh;guk_M@d{vi zya2=;Ku4q`mL$U501^WMkUR+Ba6=kFHy|a*|D5ag%f2zY(eCS?vpZ+cVY}44b^B)= z7W@65NAmgU|N5JsCr?YfV6*q&zc16|H$R$}yJ0%-`>fD;`;Xdr`Sqp$f4TMBncwyE zIOlFWd~d_QKgZM+Z@;~K$$Ij#y?68W@0?WmeR4zYx6jM{_b&XAGX3wh33P;*Sc5i-s}A@bbcv6|E=Hl;j>=-O5FQ5GJ0(=durIN_x5j&U#rS) ztu2}!vEawE-fu6b@m?>>znL}vUQyQOSr-+5ufO@}Q&sWy8~#RF@fL>PH~du%Ubg)E z-t9NaLn7Ak#B90u?^Ivk`bk03Z+XRx*jQ%gY|qcUbgO7t_1wQ7XP#Nlf7d=k{_6G{ z4i#6!et*k;r2ARK>{kAp8~2ocmTv!>HGk&qsr)e;|A&fc?g_7$ymx)mIo9*h(QkM5 z%3XbJxOR*AME`peH^1rFtC0LS=b!fPH@+?Ao8JA~dhFkqx$)O7?Kiv442)q$CJ|;_ z)eSJVH5g83#A8L+3*2fPlRN(mo gT^ni|M%Fe3s125)1H4(;Kys`=C