Skip to content

Commit

Permalink
fill in recalibration step
Browse files Browse the repository at this point in the history
  • Loading branch information
Anmol-Srivastava committed Oct 19, 2023
1 parent ced0bf6 commit 29303ed
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 32 deletions.
81 changes: 56 additions & 25 deletions docs/source/examples/nlp/wilds_datasets.ipynb
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"__This code is experimental. Some notable issues__\n",
"- transforms are very slow on even moderate batch sizes\n",
"- detection scheme design is not finalized"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -57,7 +66,10 @@
"# civil comments\n",
"# dataset_civil = get_dataset(dataset=\"civilcomments\", download=True, root_dir=\"./wilds_datasets\")\n",
"dataset_civil = pd.read_csv('wilds_datasets/civilcomments_v1.0/all_data_with_identities.csv')\n",
"dataset_civil = dataset_civil['comment_text'][:5].tolist()"
"dataset_civil = dataset_civil['comment_text'][:300].tolist()\n",
"batch1 = dataset_civil[:100]\n",
"batch2 = dataset_civil[100:200]\n",
"batch3 = dataset_civil[200:300]"
]
},
{
Expand All @@ -71,14 +83,45 @@
"text": [
"c:\\Users\\ASRIVASTAVA\\Documents\\repos\\menelaus\\venv\\lib\\site-packages\\transformers\\tokenization_utils_base.py:2418: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
" warnings.warn(\n",
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']\n",
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']\n",
"- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
"All the weights of TFBertModel were initialized from the PyTorch model.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"State after initial batch: baseline\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ASRIVASTAVA\\Documents\\repos\\menelaus\\venv\\lib\\site-packages\\transformers\\tokenization_utils_base.py:2418: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
" warnings.warn(\n",
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']\n",
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']\n",
"- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
"All the weights of TFBertModel were initialized from the PyTorch model.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"State after test batch alarm\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']\n",
"- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).\n",
"All the weights of TFBertModel were initialized from the PyTorch model.\n",
Expand All @@ -89,17 +132,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[0. 0.01 0.0025 0. 0. 0.01 0. 0.\n",
" 0.025625 0. 0. 0. 0. 0.0025 0. 0.\n",
" 0.0025 0.01 0.025625 0.03125 0.025625 0.03125 0. 0.\n",
" 0. 0. 0.0025 0. 0. 0. 0. 0. ]\n"
"State after new batch, recalibration alarm\n"
]
}
],
"source": [
"# tokens \n",
"tokenizer = auto_tokenize(model_name='bert-base-cased', pad_to_max_length=True, return_tensors='tf')\n",
"tokens = tokenizer(data=dataset_civil)\n",
"tokens = tokenizer(data=batch1)\n",
"\n",
"# embedding (TODO abstract this layers line)\n",
"layers = [-_ for _ in range(1, 8 + 1)]\n",
Expand All @@ -111,25 +151,16 @@
"# detector + set reference\n",
"ks_alarm = KolmogorovSmirnovAlarm()\n",
"detector = Detector(alarm=ks_alarm, transforms=[tokenizer, embedder, uae_reduce])\n",
"detector.step(dataset_civil)\n",
"assert detector.rep_test is None and detector.rep_reference.shape == (5, 32)\n",
"detector.step(batch1)\n",
"print(f\"State after initial batch: {detector.state}\")\n",
"\n",
"# detector + add test (copy reference) \n",
"detector.step(dataset_civil)\n",
"assert detector.rep_test.shape == (5, 32)\n",
"detector.step(batch2)\n",
"print(f\"State after test batch {detector.state}\")\n",
"\n",
"# TODO - recalibrate and re-evaluate ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# todo check things work on new branch installs etc\n",
"# todo remove this exp code from test requirement\n",
"# todo ?"
"# recalibrate and re-evaluate (XXX - all batches must be same length)\n",
"detector.step(batch3)\n",
"print(f\"State after new batch, recalibration {detector.state}\")"
]
}
],
Expand Down
3 changes: 0 additions & 3 deletions menelaus/experimental/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from menelaus.nlp_experimental.alarm import Alarm
from menelaus.nlp_experimental.detector import Detector
from menelaus.nlp_experimental.representation import Representation
3 changes: 0 additions & 3 deletions menelaus/experimental/alarm.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ def __init__(self, alpha=0.05, critical_feature_proportion=0.25):
def evaluate(self, rep_reference, rep_test):
rep_reference = rep_reference.reshape(rep_reference.shape[0], -1)
rep_test = rep_test.reshape(rep_test.shape[0], -1)
if rep_reference.shape[0] != rep_test.shape[0]:
raise ValueError(f"...")
n_features = rep_reference.shape[1]
p_values = np.zeros(n_features, dtype=np.float32)
# distances = np.zeros_like(p_values)
Expand All @@ -38,4 +36,3 @@ def evaluate(self, rep_reference, rep_test):
self._state = STATE_DRIFT
else:
self._state = STATE_INIT
print(p_values)
7 changes: 6 additions & 1 deletion menelaus/experimental/detector.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
from typing import List
from toolz import pipe

Expand All @@ -14,7 +15,11 @@ def transform(self, raw_values):
return ret

def recalibrate(self, raw_values):
    """Update the stored test representation from a new batch of raw values.

    The batch is passed through ``self.transform``. If the detector's state
    is ``"alarm"`` and a test representation already exists, the new rows
    are stacked underneath it with ``np.vstack``; otherwise the new
    representation replaces ``self.rep_test``.

    Parameters
    ----------
    raw_values
        Raw batch accepted by ``self.transform``.
        NOTE(review): the transform output is assumed to be a 2-D array
        whose column count matches ``self.rep_test`` -- confirm with caller.

    Returns
    -------
    None
        The result is stored on ``self.rep_test``.
    """
    # Read the state before transforming, as the original branch order did.
    # The ``rep_test is not None`` guard avoids stacking onto a missing
    # representation, which np.vstack cannot do.
    accumulate = self.state == "alarm" and self.rep_test is not None

    rep_new = self.transform(raw_values)
    if accumulate:
        self.rep_test = np.vstack((self.rep_test, rep_new))
    else:
        self.rep_test = rep_new

@property
def state(self):
Expand Down
File renamed without changes.

0 comments on commit 29303ed

Please sign in to comment.