diff --git a/.gitignore b/.gitignore
index 4141242f..bc4709de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,10 +14,14 @@
 _build
 *.coverage
 *.DS_Store
 .idea/
+
+# Local Scratch Work
 tmp
 
-# Images
+# Example Data
+**wilds_datasets/
+# Images
 examples/*.png
 menelaus/*.png
 tests/*.png
diff --git a/README.md b/README.md
index 84487ce5..146d7bfb 100644
--- a/README.md
+++ b/README.md
@@ -100,9 +100,13 @@ Create a virtual environment as desired, then:
 pip install menelaus
 
 # to allow editing, running tests, generating docs, etc.
-# First, clone the git repo, then:
+# first, clone the git repo, then:
 cd ./menelaus_clone_folder/
 pip install -e .[dev]
+
+# to run examples that use datasets from the wilds library,
+# another install option is:
+pip install menelaus[wilds]
 ```
 
 Menelaus should work with Python 3.8 or higher.
diff --git a/docs/source/examples/nlp/wilds_datasets.ipynb b/docs/source/examples/nlp/wilds_datasets.ipynb
new file mode 100644
index 00000000..419e9ae2
--- /dev/null
+++ b/docs/source/examples/nlp/wilds_datasets.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Overview\n",
+    "\n",
+    "This notebook is a work in progress. Eventually, it will demonstrate an NLP-based drift detection algorithm in action; until that feature is developed, it shows how to load the two datasets the examples will use:\n",
+    "\n",
+    "- Civil Comments dataset: online comments to be used in toxicity classification problems\n",
+    "- Amazon Reviews dataset: Amazon reviews to be used in a variety of NLP problems\n",
+    "\n",
+    "The data is accessed using the `wilds` library, which contains several such datasets and wraps them in an API as shown below.\n",
+    "\n",
+    "#### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from wilds import get_dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Load Data\n",
+    "\n",
+    "Note that the large data files need to be downloaded the first time this notebook is run. Later examples may assume the data is already stored to disk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading dataset to ./wilds_datasets\\amazon_v2.1...\n",
+      "You can also download the dataset manually at https://wilds.stanford.edu/downloads.\n",
+      "Downloading https://worksheets.codalab.org/rest/bundles/0xe3ed909786d34ee79d430d065582aa29/contents/blob/ to ./wilds_datasets\\amazon_v2.1\\archive.tar.gz\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████▉| 1988272128/1989805589 [06:39<00:00, 4982930.49Byte/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Extracting ./wilds_datasets\\amazon_v2.1\\archive.tar.gz to ./wilds_datasets\\amazon_v2.1\n",
+      "\n",
+      "It took 7.56 minutes to download and uncompress the dataset.\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# amazon reviews\n",
+    "dataset_amazon = get_dataset(dataset=\"amazon\", download=True, root_dir=\"./wilds_datasets\")\n",
+    "dataset_amazon"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading dataset to ./wilds_datasets\\civilcomments_v1.0...\n",
+      "You can also download the dataset manually at https://wilds.stanford.edu/downloads.\n",
+      "Downloading https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/ to ./wilds_datasets\\civilcomments_v1.0\\archive.tar.gz\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "90914816Byte [00:17, 5109891.58Byte/s] \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Extracting ./wilds_datasets\\civilcomments_v1.0\\archive.tar.gz to ./wilds_datasets\\civilcomments_v1.0\n",
+      "\n",
+      "It took 0.33 minutes to download and uncompress the dataset.\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       ""
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# civil comments\n",
+    "dataset_civil = get_dataset(dataset=\"civilcomments\", download=True, root_dir=\"./wilds_datasets\")\n",
+    "dataset_civil"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/wilds_datasets.py b/examples/wilds_datasets.py
new file mode 100644
index 00000000..acc5cd96
--- /dev/null
+++ b/examples/wilds_datasets.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# #### Overview
+#
+# This notebook is a work in progress. Eventually, it will demonstrate an NLP-based drift detection algorithm in action; until that feature is developed, it shows how to load the two datasets the examples will use:
+#
+# - Civil Comments dataset: online comments to be used in toxicity classification problems
+# - Amazon Reviews dataset: Amazon reviews to be used in a variety of NLP problems
+#
+# The data is accessed using the `wilds` library, which contains several such datasets and wraps them in an API as shown below.
+#
+# #### Imports

+# In[4]:
+
+
+from wilds import get_dataset
+
+
+# #### Load Data
+#
+# Note that the large data files need to be downloaded the first time this notebook is run. Later examples may assume the data is already stored to disk.
+
+# In[7]:
+
+
+# amazon reviews
+dataset_amazon = get_dataset(dataset="amazon", download=True)
+dataset_amazon
+
+
+# In[8]:
+
+
+# civil comments
+dataset_civil = get_dataset(dataset="civilcomments", download=True)
+dataset_civil
+
+
+# In[ ]:
+
+
+
+
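The notebook and the converted script above stop after downloading the data. For reference, the snippet below sketches how a later example might read from one of the downloaded datasets. It is not part of this changeset; it assumes the standard `wilds` interface (`get_subset` and `(text, label, metadata)` indexing) and that the archive has already been fetched into `./wilds_datasets` by the code above.

```python
from wilds import get_dataset

# reuse the archive fetched above; download=False avoids re-downloading
dataset_civil = get_dataset(
    dataset="civilcomments", download=False, root_dir="./wilds_datasets"
)

# WILDS datasets expose named splits; civilcomments provides "train", "val", and "test"
train_data = dataset_civil.get_subset("train")
print(f"training examples: {len(train_data)}")

# each record is a (text, label, metadata) triple; the text is the raw comment string
x, y, metadata = train_data[0]
print(x[:80], y)
```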
diff --git a/menelaus/datasets/__init__.py b/menelaus/datasets/__init__.py
index c1feb33f..b1307bea 100644
--- a/menelaus/datasets/__init__.py
+++ b/menelaus/datasets/__init__.py
@@ -3,5 +3,3 @@
 from menelaus.datasets.make_example_data import make_example_batch_data
 from menelaus.datasets.make_example_data import fetch_circle_data
 from menelaus.datasets.make_example_data import fetch_rainfall_data
-
-# from menelaus.datasets.generator import DataGenerator
diff --git a/menelaus/datasets/generator.py b/menelaus/datasets/generator.py
deleted file mode 100644
index 7bac394b..00000000
--- a/menelaus/datasets/generator.py
+++ /dev/null
@@ -1,8 +0,0 @@
-"""This submodule is not yet implemented."""
-# XXX A DataGenerator may yield batch/streaming-format records, and uses
-# user-provided settings to draw from distributions, or introduce
-# drift - @Anmol-Srivastava
-# class DataGenerator:
-#     def __init__(self, mode, config):
-#         self.mode = mode
-#         self.config = config
diff --git a/setup.cfg b/setup.cfg
index 296960b8..9e193bef 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,10 @@ install_requires =
     scikit-learn
 
 [options.extras_require]
+wilds =
+    wilds
+    matplotlib
+
 test =
     coverage
     jupyter
@@ -60,7 +64,6 @@ dev =
     sphinx-rtd-theme
     tox
 
-
 format =
     bandit
     black
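With `wilds` exposed as an optional `[wilds]` extra rather than a core dependency, the example scripts could guard the import so that users who installed plain `menelaus` get a pointer to the new install command instead of a bare `ModuleNotFoundError`. Below is a minimal sketch of that pattern; it is not part of this changeset and simply assumes the examples keep importing `get_dataset` at module level.

```python
# guard the optional dependency introduced by the new [wilds] extra
try:
    from wilds import get_dataset
except ImportError as exc:
    raise ImportError(
        "this example requires the optional wilds dependency; "
        "install it with `pip install menelaus[wilds]`"
    ) from exc

# proceed as in examples/wilds_datasets.py once the import succeeds
dataset_civil = get_dataset(
    dataset="civilcomments", download=True, root_dir="./wilds_datasets"
)
print(dataset_civil)
```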