add alibi detect citation, reference in docstrings
Anmol-Srivastava committed Oct 24, 2023
1 parent 1df0165 commit 352eba3
Showing 2 changed files with 49 additions and 1 deletion.
9 changes: 9 additions & 0 deletions docs/source/refs.bib
@@ -183,4 +183,13 @@ @misc{souza2020
year={2020},
howpublished="\url{https://arxiv.org/abs/2005.00113}",
note={Online; accessed 20-July-2022},
}

@software{alibi-detect,
title = {Alibi Detect: Algorithms for outlier, adversarial and drift detection},
author = {Van Looveren, Arnaud and Klaise, Janis and Vacanti, Giovanni and Cobb, Oliver and Scillitoe, Ashley and Samoilescu, Robert and Athorne, Alex},
url = {https://github.com/SeldonIO/alibi-detect},
version = {0.11.4},
date = {2023-07-07},
year = {2019}
}
41 changes: 40 additions & 1 deletion menelaus/experimental/transform.py
@@ -8,13 +8,23 @@

@curry
def auto_tokenize(data, model_name, **kwargs):
"""
Curried function that takes raw data (typically a list of strings), a model
name, and other optional keyword arguments, and returns the tokens produced
by the corresponding pre-trained tokenizer.
"""
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokens = tokenizer.batch_encode_plus(data, **kwargs)
return tokens
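
A minimal usage sketch of the curried tokenizer above (assumes the transformers and toolz packages are installed; the model name and keyword arguments are illustrative, not part of this commit):

tokenize = auto_tokenize(
    model_name="bert-base-uncased", padding=True, return_tensors="tf"
)  # partially applied via @curry; the data is supplied last
tokens = tokenize(["first document", "second document"])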


@curry
def _hidden_state_embeddings(hidden_states, layers, use_cls):
"""
Curried helper that extracts embeddings from a transformer model's hidden
states.
Ref. :cite:t:`alibi-detect`
"""
hs = [
hidden_states[layer][:, 0:1, :] if use_cls else hidden_states[layer]
for layer in layers
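
A small illustration of the use_cls branch above (shapes illustrative; assumes tensorflow is imported as tf): for hidden states of shape (batch, sequence, dim), the slice [:, 0:1, :] keeps only the [CLS] position.

hs = tf.random.normal((2, 10, 768))  # (batch, tokens, hidden dim)
cls_only = hs[:, 0:1, :]             # shape (2, 1, 768): just the [CLS] embedding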
@@ -31,6 +41,8 @@ def __init__(
"""
Extract text embeddings from transformer models.
Ref. :cite:t:`alibi-detect`
Parameters
----------
model_name_or_path
@@ -88,6 +100,13 @@ def call(self, tokens: Dict[str, tf.Tensor]) -> tf.Tensor:
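
A hedged construction sketch for the class above (model name, embedding type, and layer choice are illustrative; 'hidden_state' is one of the embedding types in alibi-detect's original implementation):

te = TransformerEmbedding(
    model_name_or_path="bert-base-uncased",  # illustrative pre-trained model
    embedding_type="hidden_state",           # assumed embedding-type option
    layers=[-1],                             # take the last hidden layer
)
embedding = te(tokens)  # tokens as produced by auto_tokenize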

@curry
def extract_embedding(tokens, model_name, embedding_type, layers):
"""
Curried function that extracts embeddings from tokens. Takes tokens, the
name of a transformer embedding model, an embedding type, and layers;
returns the tokens, the embedding, and the embedding model.
Ref. :cite:t:`alibi-detect`
"""
te = TransformerEmbedding(
model_name_or_path=model_name, embedding_type=embedding_type, layers=layers
)
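
Because extract_embedding is curried, it can be partially applied and then fed tokens; a sketch with illustrative parameter values:

embed = extract_embedding(
    model_name="bert-base-uncased", embedding_type="hidden_state", layers=[-1]
)
tokens, embedding, model = embed(tokens)  # the tuple described in the docstring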
@@ -96,6 +115,13 @@ def extract_embedding(tokens, model_name, embedding_type, layers):


class _Encoder(tf.keras.Model):
"""
Helper class that encodes embeddings into a reduced-dimension output.
Ref. :cite:t:`alibi-detect`
"""

def __init__(
self,
input_layer: Union[tf.keras.layers.Layer, tf.keras.Model],
@@ -128,7 +154,13 @@ def call(self, x: Union[np.ndarray, tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:


class UAE(tf.keras.Model):
# copied from alibi-detect
"""
Untrained AutoEncoder (UAE) class that reduces the dimension of the
embedding output from the previous steps.
Ref. :cite:t:`alibi-detect`
"""

def __init__(
self,
encoder_net: Optional[tf.keras.Model] = None,
@@ -160,6 +192,13 @@ def call(self, x: Union[np.ndarray, tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:
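
Constructing a UAE directly mirrors its use in uae_reduce_dimension below; a sketch with an illustrative encoding dimension, under the assumption that the instance is then called on the tokens (the call site is in the collapsed lines):

uae = UAE(input_layer=model, shape=embedding.shape, enc_dim=32)
reduced = uae(tokens)  # assumed call pattern, matching uae_reduce_dimension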

@curry
def uae_reduce_dimension(input, enc_dim, seed=0, to_numpy=True):
"""
Curried function that reduces the dimension of the embedding output via an
Untrained AutoEncoder. Takes an input tuple (tokens, embedding, input
layer), an encoding dimension size, an optional seed, and a to_numpy flag;
returns the reduced array (NumPy array or tensor).
Ref. :cite:t:`alibi-detect`
"""
tf.random.set_seed(seed)
tokens, embedding, input_layer = input
uae = UAE(input_layer=input_layer, shape=embedding.shape, enc_dim=enc_dim)
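
Since every transform is curried, the steps compose into a single pipeline; a minimal end-to-end sketch (model name and dimensions illustrative; assumes toolz.pipe and the imports above):

from toolz import pipe

reduced = pipe(
    ["first document", "second document"],
    auto_tokenize(model_name="bert-base-uncased", padding=True, return_tensors="tf"),
    extract_embedding(
        model_name="bert-base-uncased", embedding_type="hidden_state", layers=[-1]
    ),
    uae_reduce_dimension(enc_dim=32),
)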
