commit data sample for nlp example in RTD, run nbconvert

mitre · Nov 16, 2023 · 11d6299 · 11d6299
1 parent ca02be5
commit 11d6299
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 40 deletions.
diff --git a/docs/source/examples/nlp/civil_comments_sample.pkl b/docs/source/examples/nlp/civil_comments_sample.pkl
diff --git a/docs/source/examples/nlp/wilds_datasets.ipynb b/docs/source/examples/nlp/wilds_datasets.ipynb
@@ -25,20 +25,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\ASRIVASTAVA\\Documents\\repos\\menelaus\\venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import pandas as pd\n",
+    "import pickle\n",
+    "# import pandas as pd\n",
     "# from wilds import get_dataset\n",
     "\n",
     "from menelaus.experimental.transform import auto_tokenize, extract_embedding, uae_reduce_dimension\n",
@@ -52,19 +44,29 @@
    "source": [
     "## Load Data\n",
     "\n",
-    "Since some of the experimental modules are not very performant, the dataset is loaded and then limited to the first 300 data points (comments), which are split into three sequential batches of 100."
+    "Since some of the experimental modules are not very performant, the dataset is loaded and then limited to the first 300 data points (comments), which are split into three sequential batches of 100.\n",
+    "\n",
+    "__Note__: for convenience in generating documentation, the sample is itself saved locally and read from disk in the below examples, but the commented code describes the steps. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "# civil comments\n",
     "# dataset_civil = get_dataset(dataset=\"civilcomments\", download=True, root_dir=\"./wilds_datasets\")\n",
-    "dataset_civil = pd.read_csv('wilds_datasets/civilcomments_v1.0/all_data_with_identities.csv')\n",
-    "dataset_civil = dataset_civil['comment_text'][:300].tolist()\n",
+    "# dataset_civil = pd.read_csv('wilds_datasets/civilcomments_v1.0/all_data_with_identities.csv')\n",
+    "# dataset_civil = dataset_civil['comment_text'][:300].tolist()\n",
+    "\n",
+    "# with open('civil_comments_sample.pkl', 'wb') as f:\n",
+    "#     pickle.dump(dataset_civil, f)\n",
+    "\n",
+    "dataset_civil = None\n",
+    "with open('civil_comments_sample.pkl', 'rb') as f:\n",
+    "    dataset_civil = pickle.load(f)\n",
+    "\n",
     "batch1 = dataset_civil[:100]\n",
     "batch2 = dataset_civil[100:200]\n",
     "batch3 = dataset_civil[200:300]"
@@ -89,7 +91,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,14 +120,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']\n",
+      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
       "- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
       "All the weights of TFBertModel were initialized from the PyTorch model.\n",
@@ -145,7 +147,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']\n",
+      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
       "- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
       "All the weights of TFBertModel were initialized from the PyTorch model.\n",
@@ -165,7 +167,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']\n",
+      "Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n",
       "- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
       "All the weights of TFBertModel were initialized from the PyTorch model.\n",
@@ -189,7 +191,7 @@
     "detector.step(batch1)\n",
     "print(f\"\\nState after initial batch: {detector.state}\\n\")\n",
     "\n",
-    "# detector + add test (copy reference)  \n",
+    "# detector + add test   \n",
     "detector.step(batch2)\n",
     "print(f\"\\nState after test batch: {detector.state}\\n\")\n",
     "\n",

diff --git a/examples/wilds_datasets.py b/examples/wilds_datasets.py
@@ -1,45 +1,113 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-# #### Overview 
+# ## Overview
 # 
-# This notebook is a work in progress. Eventually, the contents will demonstrate an NLP-based drift detection algorithm in action, but until the feature is developed, it shows the loading and use of two datasets to be used in the examples:
+# This example demonstrates an experimental NLP-based drift detection algorithm. It uses the "Civil Comments" dataset ([link](https://github.com/p-lambda/wilds/blob/main/wilds/datasets/civilcomments_dataset.py) to a Python loading script with additional details/links) from the `wilds` library, which contains online comments meant to be used in toxicity classification problems.
 # 
-# - Civil Comments dataset: online comments to be used in toxicity classification problems 
-# - Amazon Reviews dataset: amazon reviews to be used in a variety of NLP problems
+# This example and the experimental modules often pull directly and indirectly from [`alibi-detect`](https://github.com/SeldonIO/alibi-detect/tree/master) and its own [example(s)](https://docs.seldon.io/projects/alibi-detect/en/stable/examples/cd_text_imdb.html).
 # 
-# The data is accessed by using the `wilds` library, which contains several such datasets and wraps them in an API as shown below. 
+# ## Notes
 # 
-# #### Imports
+# This code is experimental, and has notable issues:
+# - transform functions are very slow, even on moderate batch sizes
+# - detector design is not generalized, and may not work on streaming problems, or with data representations of different types/shapes
+# - some warnings below are not addressed
+# - if not present, `toolz`, `tensorflow`, and `transformers` must be added via the `experimental` install option, and are not included by default
+# 
+# ## Imports
+# 
+# Code (transforms, alarm, detector) is pulled from the experimental module in `menelaus`, which is live but not fully tested. Note that the commented-out code shows how `wilds` modules can be used to access the dataset and save it to disk; those steps are excluded here to save time. The example hence assumes the dataset is locally available.
+
+# In[3]:
+
+
+import pickle
+# import pandas as pd
+# from wilds import get_dataset
+
+from menelaus.experimental.transform import auto_tokenize, extract_embedding, uae_reduce_dimension
+from menelaus.experimental.detector import Detector
+from menelaus.experimental.alarm import KolmogorovSmirnovAlarm
+
+
+# ## Load Data
+# 
+# Since some of the experimental modules are not very performant, the dataset is loaded and then limited to the first 300 data points (comments), which are split into three sequential batches of 100.
+# 
+# __Note__: for convenience in generating documentation, the sample itself is saved locally and read from disk in the examples below; the commented-out code shows how that sample was produced.
 
 # In[4]:
 
 
-from wilds import get_dataset
+# civil comments
+# dataset_civil = get_dataset(dataset="civilcomments", download=True, root_dir="./wilds_datasets")
+# dataset_civil = pd.read_csv('wilds_datasets/civilcomments_v1.0/all_data_with_identities.csv')
+# dataset_civil = dataset_civil['comment_text'][:300].tolist()
 
+# with open('civil_comments_sample.pkl', 'wb') as f:
+#     pickle.dump(dataset_civil, f)
 
-# #### Load Data
+dataset_civil = None
+with open('civil_comments_sample.pkl', 'rb') as f:
+    dataset_civil = pickle.load(f)
+
+batch1 = dataset_civil[:100]
+batch2 = dataset_civil[100:200]
+batch3 = dataset_civil[200:300]
+
+
+# ## Transforms Pipeline
+# 
+# The major step is to initialize the transform functions that will be applied to the comments, to turn them into detector-compatible representations. 
+# 
+# First, the comments must be tokenized:
+# - set up an `AutoTokenizer` model from the `transformers` library with a convenience function, by specifying the desired model name and other arguments
+# - the convenience function lets the configured tokenizer be called repeatedly, using batch 1 as the training data
 # 
-# Note that initially, the large data files need to be downloaded first. Later examples may assume the data is already stored to disk.
+# Then, the tokens must be made into embeddings:
+# - an initial transform function uses a `transformers` model to extract embeddings from given tokens
+# - the subsequent transform function reduces the dimension via an `UntrainedAutoEncoder` to a manageable size
 
-# In[7]:
+# In[5]:
 
 
-# amazon reviews
-dataset_amazon = get_dataset(dataset="amazon", download=True)
-dataset_amazon
+# tokens 
+tokenizer = auto_tokenize(model_name='bert-base-cased', padding='longest', return_tensors='tf')
+tokens = tokenizer(data=batch1)
 
+# embedding (TODO abstract this layers line)
+layers = [-_ for _ in range(1, 8 + 1)]
+embedder = extract_embedding(model_name='bert-base-cased', embedding_type='hidden_state', layers=layers)
 
-# In[8]:
+# dimension reduction via Untrained AutoEncoder
+uae_reduce = uae_reduce_dimension(enc_dim=32)
 
 
-# civil comments
-dataset_civil = get_dataset(dataset="civilcomments", download=True)
-dataset_civil
+# ## Detector Setup
+# 
+# Next, a detector is set up. First, a `KolmogorovSmirnovAlarm` is initialized with default settings. When the proportion of columns that reject the null hypothesis of the KS test exceeds the default ratio (0.25), this alarm will indicate drift has occurred.
+# 
+# Then the detector is constructed. It is given the initialized alarm, and the ordered list of transforms configured above. The detector is then made to step through each available batch, and its state is printed as output. Note that the first batch establishes the reference data, the second establishes the test data, and the third will require recalibration (test is combined into reference) if drift is detected.
 
+# In[6]:
 
-# In[ ]:
 
+# detector + set reference
+ks_alarm = KolmogorovSmirnovAlarm()
+detector = Detector(alarm=ks_alarm, transforms=[tokenizer, embedder, uae_reduce])
+detector.step(batch1)
+print(f"\nState after initial batch: {detector.state}\n")
 
+# detector + add test   
+detector.step(batch2)
+print(f"\nState after test batch: {detector.state}\n")
 
+# recalibrate and re-evaluate (XXX - all batches must be same length)
+detector.step(batch3)
+print(f"\nState after new batch, recalibration: {detector.state}\n")
 
+
+# ## Final Notes
+# 
+# We can see the baseline state after processing the initial batch, an alarm raised after observing test data, and then another alarm signal after a new test batch is observed and the reference is internally recalibrated.