Select wilds datasets for NLP tools #155

Merged · 6 commits · Aug 8, 2023
6 changes: 5 additions & 1 deletion .gitignore
@@ -14,10 +14,14 @@ _build
*.coverage
*.DS_Store
.idea/

# Local Scratch Work
tmp

# Images
# Example Data
docs/source/examples/**/data/

# Images
examples/*.png
menelaus/*.png
tests/*.png
163 changes: 163 additions & 0 deletions docs/source/examples/nlp/wilds_datasets.ipynb
@@ -0,0 +1,163 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Overview \n",
"\n",
"This notebook is a work in progress. Eventually, the contents will demonstrate an NLP-based drift detection algorithm in action, but until the feature is developed, it shows the loading and use of two datasets to be used in the examples:\n",
"\n",
"- Civil Comments dataset: online comments to be used in toxicity classification problems \n",
"- Amazon Reviews dataset: amazon reviews to be used in a variety of NLP problems\n",
"\n",
"The data is accessed by using the `wilds` library, which contains several such datasets and wraps them in an API as shown below. \n",
"\n",
"#### Imports"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from wilds import get_dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Load Data\n",
"\n",
"Note that initially, the large data files need to be downloaded first. Later examples may assume the data is already stored to disk."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading dataset to data\\amazon_v2.1...\n",
"You can also download the dataset manually at https://wilds.stanford.edu/downloads.\n",
"Downloading https://worksheets.codalab.org/rest/bundles/0xe3ed909786d34ee79d430d065582aa29/contents/blob/ to data\\amazon_v2.1\\archive.tar.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████▉| 1988272128/1989805589 [06:36<00:00, 5019175.79Byte/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting data\\amazon_v2.1\\archive.tar.gz to data\\amazon_v2.1\n",
"\n",
"It took 7.57 minutes to download and uncompress the dataset.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<wilds.datasets.amazon_dataset.AmazonDataset at 0x1901b5c8be0>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# amazon reviews\n",
"dataset_amazon = get_dataset(dataset=\"amazon\", download=True)\n",
"dataset_amazon"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading dataset to data\\civilcomments_v1.0...\n",
"You can also download the dataset manually at https://wilds.stanford.edu/downloads.\n",
"Downloading https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/ to data\\civilcomments_v1.0\\archive.tar.gz\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"90914816Byte [00:18, 4889856.78Byte/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting data\\civilcomments_v1.0\\archive.tar.gz to data\\civilcomments_v1.0\n",
"\n",
"It took 0.35 minutes to download and uncompress the dataset.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<wilds.datasets.civilcomments_dataset.CivilCommentsDataset at 0x19061dd7130>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# civil comments\n",
"dataset_civil = get_dataset(dataset=\"civilcomments\", download=True)\n",
"dataset_civil"
]
},
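{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Inspect a Sample\n",
"\n",
"As a brief sketch of downstream use (assuming the standard `wilds` API; the `train_civil` name below is purely illustrative), each dataset can be split with `get_subset`, and indexing a subset yields a `(text, label, metadata)` tuple."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# split the Civil Comments dataset and peek at one record\n",
"train_civil = dataset_civil.get_subset(\"train\")\n",
"text, label, metadata = train_civil[0]\n",
"print(text[:80], label)"
]
},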
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
45 changes: 45 additions & 0 deletions examples/wilds_datasets.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# coding: utf-8

# #### Overview
#
# This notebook is a work in progress. Eventually it will demonstrate an NLP-based drift detection algorithm in action; until that feature is developed, it shows how to load the two datasets that later examples will use:
#
# - Civil Comments dataset: online comments to be used in toxicity classification problems
# - Amazon Reviews dataset: Amazon reviews to be used in a variety of NLP problems
#
# The data is accessed via the `wilds` library, which bundles several such datasets behind a common API, as shown below.
#
# #### Imports

# In[4]:


from wilds import get_dataset


# #### Load Data
#
# Note that the large data files must be downloaded on first use. Later examples may assume the data is already stored to disk.

# In[7]:


# Amazon reviews
dataset_amazon = get_dataset(dataset="amazon", download=True)
dataset_amazon


# In[8]:


# Civil Comments
dataset_civil = get_dataset(dataset="civilcomments", download=True)
dataset_civil


# In[ ]:
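# As a brief sketch of downstream use (assuming the standard `wilds` API; the
# train_civil name is purely illustrative): each dataset can be split with
# get_subset, and indexing a subset yields a (text, label, metadata) tuple.
train_civil = dataset_civil.get_subset("train")
text, label, metadata = train_civil[0]
print(text[:80], label)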




2 changes: 0 additions & 2 deletions menelaus/datasets/__init__.py
@@ -3,5 +3,3 @@
from menelaus.datasets.make_example_data import make_example_batch_data
from menelaus.datasets.make_example_data import fetch_circle_data
from menelaus.datasets.make_example_data import fetch_rainfall_data

# from menelaus.datasets.generator import DataGenerator
8 changes: 0 additions & 8 deletions menelaus/datasets/generator.py

This file was deleted.

5 changes: 4 additions & 1 deletion setup.cfg
@@ -30,6 +30,10 @@ install_requires =
scikit-learn

[options.extras_require]
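# optional extra; install with: pip install menelaus[wilds]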
wilds =
wilds
matplotlib

test =
coverage
jupyter
@@ -60,7 +64,6 @@ dev =
sphinx-rtd-theme
tox


format =
bandit
black