add docstrings to transforms
Anmol-Srivastava committed Nov 7, 2023
1 parent 21e23ac commit e4ecdaf
Showing 1 changed file with 117 additions and 28 deletions.
145 changes: 117 additions & 28 deletions menelaus/experimental/transform.py
@@ -1,3 +1,10 @@
"""
Contains transform functions: curried functions that are initialized with a certain configuration
and then called in sequence to turn an initial batch of data into a final, formatted data
representation. Applying transforms makes it easier to compare two sets of data and to convert
data into a format accepted by some ``Alarm`` type.
"""

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Flatten, Dense, InputLayer
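
For orientation, a minimal sketch of how these transforms are meant to chain (an illustration, not part of this commit; the model name and tokenizer are assumptions, and it presumes the ``TransformerEmbedding`` definition further down this file):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed model
tokens = tokenizer(["sample input text"], padding=True, return_tensors="tf")
embedder = TransformerEmbedding("bert-base-cased", "hidden_state", layers=[-1])
embedding = embedder(tokens)  # raw text -> transformer embedding
# a UAE instance can then reduce the embedding dimension; see the sketch
# after the UAE class at the bottom of this file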
@@ -35,42 +42,54 @@ def _hidden_state_embeddings(hidden_states, layers, use_cls):


class TransformerEmbedding(tf.keras.Model):
"""
Extracts text embeddings from transformer models. Pulled directly from ``alibi-detect``.
Ref. :cite:t:`alibi-detect`
Attributes:
model_name_or_path
Name of or path to the transformer model.
embedding_type
Type of embedding to extract. Needs to be one of pooler_output,
last_hidden_state, hidden_state or hidden_state_cls.
From the HuggingFace documentation:
- pooler_output
Last layer hidden-state of the first token of the sequence
(classification token) further processed by a Linear layer and a Tanh
activation function. The Linear layer weights are trained from the next
sentence prediction (classification) objective during pre-training.
This output is usually not a good summary of the semantic content of the
input, you’re often better with averaging or pooling the sequence of
hidden-states for the whole input sequence.
- last_hidden_state
Sequence of hidden-states at the output of the last layer of the model.
- hidden_state
Hidden states of the model at the output of each layer.
- hidden_state_cls
See hidden_state but use the CLS token output.
layers
If "hidden_state" or "hidden_state_cls" is used as embedding
type, layers has to be a list with int's referring to the hidden layers used
to extract the embedding.
"""

def __init__(
self, model_name_or_path: str, embedding_type: str, layers: List[int] = None
) -> None:
"""
Args:
model_name_or_path
Name of or path to the transformer model.
embedding_type
Type of embedding to extract. Needs to be one of pooler_output,
last_hidden_state, hidden_state or hidden_state_cls.
From the HuggingFace documentation:
- pooler_output
Last layer hidden-state of the first token of the sequence
(classification token) further processed by a Linear layer and a Tanh
activation function. The Linear layer weights are trained from the next
sentence prediction (classification) objective during pre-training.
This output is usually not a good summary of the semantic content of the
input, you’re often better with averaging or pooling the sequence of
hidden-states for the whole input sequence.
- last_hidden_state
Sequence of hidden-states at the output of the last layer of the model.
- hidden_state
Hidden states of the model at the output of each layer.
- hidden_state_cls
See hidden_state but use the CLS token output.
layers
If "hidden_state" or "hidden_state_cls" is used as embedding
type, layers has to be a list with int's referring to the hidden layers used
to extract the embedding.
"""
super(TransformerEmbedding, self).__init__()
self.config = AutoConfig.from_pretrained(
@@ -83,6 +102,18 @@ def __init__(
)

def call(self, tokens: Dict[str, tf.Tensor]) -> tf.Tensor:
"""
Applies the transformer model to tokens, then extracts embeddings from the output.
Args:
tokens
Dictionary output of the tokenizer applied to raw strings, i.e. a
``transformers.BatchEncoding``. For details on the format, see the return
values of the tokenizer's ``encode_plus``, ``__call__``, or
``batch_encode_plus`` methods.
Returns:
Extracted embeddings, typically a ``tf.Tensor`` or ``numpy.ndarray``.
"""
output = self.model(tokens)
if self.embedding_type == "pooler_output":
return output.pooler_output
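
As a usage illustration for ``call`` (not from this commit; the model name and embedding choices are assumptions):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokens = tokenizer(["text to embed"], padding=True, return_tensors="tf")
# pooler_output needs no layer list; hidden_state-type embeddings require layers
pooled = TransformerEmbedding("bert-base-cased", "pooler_output")(tokens)
layered = TransformerEmbedding("bert-base-cased", "hidden_state", layers=[-2, -1])(tokens)
# each call yields a tf.Tensor of shape (batch_size, hidden_dim)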
@@ -117,9 +148,15 @@ def extract_embedding(tokens, model_name, embedding_type, layers):
class _Encoder(tf.keras.Model):
"""
Helper class for encoding embeddings into a reduced-dimension
output. Pulled directly from ``alibi-detect``.
Ref. :cite:t:`alibi-detect`
Attributes:
input_layer
Input layer from which new encodings will be generated.
mlp
Multilayer perceptron network used for the dimension-reduction step.
"""

def __init__(
@@ -129,6 +166,18 @@ def __init__(
enc_dim: Optional[int] = None,
step_dim: Optional[int] = None,
) -> None:
"""
Args:
input_layer
Input layer from which new encodings will be generated.
mlp
Multilayer perceptron network used for the dimension-reduction step.
Default ``None``.
enc_dim
Desired size for the final encoded output. Default ``None``.
step_dim
Step size used when constructing a default MLP if ``mlp`` is not given.
Default ``None``.
"""
super().__init__()
self.input_layer = input_layer
if isinstance(mlp, tf.keras.Model):
@@ -149,16 +198,31 @@ def __init__(
)

def call(self, x: Union[np.ndarray, tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:
"""
Performs the reduced-dimension encoding step on new data.
Args:
x
New input batch.
Returns:
Encoded data (processed through the input layer and MLP).
"""
x = self.input_layer(x)
return self.mlp(x)
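
A construction sketch for ``_Encoder`` (illustrative, not part of this commit; the dimensions are assumptions, and it uses the explicit-``mlp`` path from the constructor above):

mlp = tf.keras.Sequential([Flatten(), Dense(32)])  # any tf.keras.Model works here
enc = _Encoder(InputLayer(input_shape=(768,)), mlp=mlp)
z = enc(tf.random.normal((4, 768)))  # encoded output of shape (4, 32)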


class UAE(tf.keras.Model):
"""
Untrained AutoEncoder class to reduce the dimension of embedding output from previous
steps. Pulled directly from ``alibi-detect``.
Ref. :cite:t:`alibi-detect`
Attributes:
encoder
Encoder network applied to tokens to produce reduced-dimension
embeddings.
"""

def __init__(
@@ -168,6 +232,21 @@ def __init__(
shape: Optional[tuple] = None,
enc_dim: Optional[int] = None,
) -> None:
"""
Args:
encoder_net
If this is given as a ``tf.keras.Model``, it is used to obtain embeddings.
Default ``None``.
input_layer
If ``encoder_net`` is not given, this is used as the input layer which
accepts tokens. Default ``None``.
shape
If ``encoder_net`` is not given, this is the desired input shape for the
input layer. Default ``None``.
enc_dim
If ``encoder_net`` is not given, this is the desired encoding dimension
for the final output. Default ``None``.
"""
super().__init__()
is_enc = isinstance(encoder_net, tf.keras.Model)
is_enc_dim = isinstance(enc_dim, int)
@@ -187,6 +266,16 @@ def __init__(
)

def call(self, x: Union[np.ndarray, tf.Tensor, Dict[str, tf.Tensor]]) -> tf.Tensor:
"""
Performs the encoding step on new tensors.
Args:
x
New batch of tensors.
Returns:
Encoded, reduced-dimension output from the UAE.
"""
return self.encoder(x)
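
A usage sketch for the ``encoder_net`` path (illustrative, not part of this commit; dimensions are assumptions):

net = tf.keras.Sequential([InputLayer(input_shape=(768,)), Flatten(), Dense(32)])
uae = UAE(encoder_net=net)  # the supplied model is used directly as the encoder
out = uae(tf.random.normal((8, 768)))  # encoded output of shape (8, 32)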


