Select wilds datasets for NLP tools (#155)
* remove generator, add wilds as optional install

* add example loading wilds data

* tweak optional wilds install

* edit gitignore to skip downloaded wilds data

* format with black

* tweak download location, update readme with wilds option
Anmol-Srivastava committed Aug 8, 2023
1 parent 170b493 commit 2ca8f03
Showing 7 changed files with 215 additions and 13 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -14,10 +14,14 @@ _build
*.coverage
*.DS_Store
.idea/

# Local Scratch Work
tmp

# Images
# Example Data
**wilds_datasets/

# Images
examples/*.png
menelaus/*.png
tests/*.png
6 changes: 5 additions & 1 deletion README.md
@@ -100,9 +100,13 @@ Create a virtual environment as desired, then:
pip install menelaus

# to allow editing, running tests, generating docs, etc.
# First, clone the git repo, then:
# first, clone the git repo, then:
cd ./menelaus_clone_folder/
pip install -e .[dev]

# to run examples which use datasets from the wilds library,
# another install option is:
pip install menelaus[wilds]
```

Menelaus should work with Python 3.8 or higher.
156 changes: 156 additions & 0 deletions docs/source/examples/nlp/wilds_datasets.ipynb
@@ -0,0 +1,156 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Overview \n",
"\n",
"This notebook is a work in progress. Eventually, the contents will demonstrate an NLP-based drift detection algorithm in action, but until the feature is developed, it shows the loading and use of two datasets to be used in the examples:\n",
"\n",
"- Civil Comments dataset: online comments to be used in toxicity classification problems \n",
"- Amazon Reviews dataset: amazon reviews to be used in a variety of NLP problems\n",
"\n",
"The data is accessed by using the `wilds` library, which contains several such datasets and wraps them in an API as shown below. \n",
"\n",
"#### Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from wilds import get_dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load Data\n",
"\n",
"Note that initially, the large data files need to be downloaded first. Later examples may assume the data is already stored to disk."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading dataset to ./wilds_datasets\\amazon_v2.1...\n",
"You can also download the dataset manually at https://wilds.stanford.edu/downloads.\n",
"Downloading https://worksheets.codalab.org/rest/bundles/0xe3ed909786d34ee79d430d065582aa29/contents/blob/ to ./wilds_datasets\\amazon_v2.1\\archive.tar.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████▉| 1988272128/1989805589 [06:39<00:00, 4982930.49Byte/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./wilds_datasets\\amazon_v2.1\\archive.tar.gz to ./wilds_datasets\\amazon_v2.1\n",
"\n",
"It took 7.56 minutes to download and uncompress the dataset.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<wilds.datasets.amazon_dataset.AmazonDataset at 0x26f9518ac50>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# amazon reviews\n",
"dataset_amazon = get_dataset(dataset=\"amazon\", download=True, root_dir=\"./wilds_datasets\")\n",
"dataset_amazon"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading dataset to ./wilds_datasets\\civilcomments_v1.0...\n",
"You can also download the dataset manually at https://wilds.stanford.edu/downloads.\n",
"Downloading https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/ to ./wilds_datasets\\civilcomments_v1.0\\archive.tar.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"90914816Byte [00:17, 5109891.58Byte/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting ./wilds_datasets\\civilcomments_v1.0\\archive.tar.gz to ./wilds_datasets\\civilcomments_v1.0\n",
"\n",
"It took 0.33 minutes to download and uncompress the dataset.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<wilds.datasets.civilcomments_dataset.CivilCommentsDataset at 0x26f9518b0a0>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# civil comments\n",
"dataset_civil = get_dataset(dataset=\"civilcomments\", download=True, root_dir=\"./wilds_datasets\")\n",
"dataset_civil"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
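For readers following along, the sketch below is not part of the commit; it shows one way to pull individual examples out of a downloaded dataset. It assumes the archives were already fetched into `./wilds_datasets` as in the notebook above, and relies on the standard `wilds` interface, in which `get_subset` selects a split and indexing a subset yields a `(text, label, metadata)` triple.

```python
# Minimal sketch (not part of this commit): inspect a few raw examples from a
# downloaded wilds dataset. Assumes the archive was already fetched into
# ./wilds_datasets, as in the notebook above.
from wilds import get_dataset

dataset_civil = get_dataset(
    dataset="civilcomments", download=False, root_dir="./wilds_datasets"
)

# get_subset() selects one of the predefined splits ("train", "val", "test").
train_data = dataset_civil.get_subset("train")

# Each item is a (text, label, metadata) triple; for CivilComments the text is
# the raw comment string and the label marks toxicity.
for i in range(3):
    text, label, metadata = train_data[i]
    print(label, text[:80])
```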
45 changes: 45 additions & 0 deletions examples/wilds_datasets.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# coding: utf-8

# #### Overview
#
# This notebook is a work in progress. Eventually, it will demonstrate an NLP-based drift detection algorithm in action; until that feature is developed, it shows how to load the two datasets the examples will use:
#
# - Civil Comments dataset: online comments to be used in toxicity classification problems
# - Amazon Reviews dataset: Amazon product reviews to be used in a variety of NLP problems
#
# The data is accessed via the `wilds` library, which contains several such datasets and wraps them in a common API, as shown below.
#
# #### Imports

# In[4]:


from wilds import get_dataset


# #### Load Data
#
# Note that the large data files must be downloaded the first time this script is run. Later examples may assume the data is already stored on disk.

# In[7]:


# amazon reviews
dataset_amazon = get_dataset(dataset="amazon", download=True, root_dir="./wilds_datasets")
dataset_amazon


# In[8]:


# civil comments
dataset_civil = get_dataset(dataset="civilcomments", download=True, root_dir="./wilds_datasets")
dataset_civil


# In[ ]:




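As a further sketch, again not included in the commit, the same datasets can be batched with the standard `wilds` data loader, which wraps a PyTorch `DataLoader`. This assumes `torch` is installed alongside the `wilds` extra and that the data was already downloaded by the script above.

```python
# Sketch only: batch the Amazon reviews training split with the standard wilds
# loader. Assumes torch is available and the data is already on disk.
from wilds import get_dataset
from wilds.common.data_loaders import get_train_loader

dataset_amazon = get_dataset(
    dataset="amazon", download=False, root_dir="./wilds_datasets"
)
train_data = dataset_amazon.get_subset("train")

# The "standard" loader yields uniformly sampled batches of (x, y, metadata).
train_loader = get_train_loader("standard", train_data, batch_size=8)

for x, y, metadata in train_loader:
    print(len(x), y[:3])  # x is a batch of review strings, y the label tensor
    break
```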
2 changes: 0 additions & 2 deletions menelaus/datasets/__init__.py
@@ -3,5 +3,3 @@
from menelaus.datasets.make_example_data import make_example_batch_data
from menelaus.datasets.make_example_data import fetch_circle_data
from menelaus.datasets.make_example_data import fetch_rainfall_data

# from menelaus.datasets.generator import DataGenerator
8 changes: 0 additions & 8 deletions menelaus/datasets/generator.py

This file was deleted.

5 changes: 4 additions & 1 deletion setup.cfg
@@ -30,6 +30,10 @@ install_requires =
scikit-learn

[options.extras_require]
wilds =
wilds
matplotlib

test =
coverage
jupyter
@@ -60,7 +64,6 @@ dev =
sphinx-rtd-theme
tox


format =
bandit
black
