Keras `TextVectorization` layer usage

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Toy text-only dataset to build the vocabulary from
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])

max_features = 5000  # Maximum vocab size
max_len = 5          # Sequence length to pad the outputs to
embedding_dims = 2   # Embedding dimension (only used if an Embedding layer is added; see the sketch at the end)

# Create the layer
vectorize_layer = TextVectorization(standardize='lower_and_strip_punctuation',  # default
                                    max_tokens=max_features,
                                    output_mode='int',
                                    split='whitespace',                          # default
                                    output_sequence_length=max_len)

# call `adapt` on the text-only dataset to create the vocabulary              
vectorize_layer.adapt(text_dataset.batch(32))
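
# After `adapt`, the learned vocabulary can be inspected. A minimal sketch
# (with these settings index 0 is the padding token and index 1 the OOV token;
# the order of equally frequent words may vary):
print(vectorize_layer.get_vocabulary())
# e.g. ['', '[UNK]', 'foo', 'baz', 'bar']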

# Create model that uses the layer
model = tf.keras.models.Sequential() 

# Start by creating an explicit Input layer.
# It needs shape (1,) and dtype 'string' so that each batch element
# is guaranteed to be exactly one string.
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))  

# The first layer in our model is the vectorization layer
# After this layer, we have a tensor of shape (batch_size, max_len) 
# containing vocab indices.  
model.add(vectorize_layer)

# Input: a batch of two examples, each wrapped in a list as a single string
input_data = [["foo qux bar"],
              ["qux baz"]]

model.predict(input_data)
# array([[2, 1, 4, 0, 0],
#        [1, 3, 0, 0, 0]])
# 'qux' was never seen by `adapt`, so it maps to the OOV index 1;
# the trailing 0s pad each sequence to output_sequence_length.
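
# A common next step (not shown in the original snippet, included here only as
# an illustrative assumption): feed the integer indices into an Embedding layer,
# reusing the `embedding_dims` defined above.
model.add(tf.keras.layers.Embedding(input_dim=len(vectorize_layer.get_vocabulary()),
                                    output_dim=embedding_dims))

# The model now outputs a tensor of shape (batch_size, max_len, embedding_dims)
model.predict(input_data).shape   # (2, 5, 2)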