@@ -65,27 +65,19 @@ def featurize(
             continue  # Skip empty batches
 
         # Encode the batch to get token embeddings
-        token_embeddings = model.encode(
-            list_batch,
-            output_value="token_embeddings",
-            convert_to_tensor=True,
-        )
+        token_embeddings = model.encode(list_batch, output_value="token_embeddings", convert_to_numpy=True)
 
         # Tokenize the batch to get input IDs
         tokenized_ids = model.tokenize(list_batch)["input_ids"]
 
         for tokenized_id, token_embedding in zip(tokenized_ids, token_embeddings):
-            # Convert token IDs to tokens (excluding special tokens)
-            token_ids = tokenized_id[1:-1]
-            # Decode tokens to text
-            text = model.tokenizer.decode(token_ids)
+            # Decode the token IDs to get the text
+            text = model.tokenizer.decode(tokenized_id, skip_special_tokens=True)
             if text in seen:
                 continue
             seen.add(text)
             # Get the corresponding token embeddings (excluding special tokens)
-            token_embeds = token_embedding[1:-1]
-            # Convert embeddings to NumPy arrays
-            token_embeds = token_embeds.detach().cpu().numpy()
+            token_embeds = token_embedding[1:-1].detach().cpu().numpy()
             # Compute the mean of the token embeddings
             mean = np.mean(token_embeds, axis=0)
             txts.append(text)
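
For context, a minimal self-contained sketch of the loop as it reads after this change. `batches` and `vecs` are hypothetical stand-ins for surrounding `featurize` state outside this hunk, and the model name is only an example. Note that some sentence-transformers versions still return per-sentence tensors for output_value="token_embeddings" even when convert_to_numpy=True is passed, so the sketch guards the .detach().cpu().numpy() conversion rather than assuming one return type.

    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")  # example model, not from this hunk
    batches = [["an example sentence", "another example"]]  # hypothetical input batches

    seen, txts, vecs = set(), [], []
    for list_batch in batches:
        if not list_batch:
            continue  # Skip empty batches

        # One sequence of per-token embeddings per input sentence
        token_embeddings = model.encode(list_batch, output_value="token_embeddings", convert_to_numpy=True)
        tokenized_ids = model.tokenize(list_batch)["input_ids"]

        for tokenized_id, token_embedding in zip(tokenized_ids, token_embeddings):
            # Decode the token IDs back to text, dropping special/padding tokens
            text = model.tokenizer.decode(tokenized_id, skip_special_tokens=True)
            if text in seen:
                continue
            seen.add(text)

            # Drop the special tokens at both ends, then mean-pool
            token_embeds = token_embedding[1:-1]
            if hasattr(token_embeds, "detach"):
                # Some versions hand back tensors here despite convert_to_numpy=True
                token_embeds = token_embeds.detach().cpu().numpy()
            vecs.append(np.mean(token_embeds, axis=0))
            txts.append(text)

    print(len(txts), vecs[0].shape)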