-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_embeddings.py
53 lines (42 loc) · 1.61 KB
/
get_embeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import openai
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time
from tqdm import tqdm
from openai import OpenAI
# Module-level OpenAI client used by get_embedding() below.
# The key is taken from the OPENAI_API_KEY environment variable; per the
# original note this matches the client's default, so the argument is optional.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def get_embedding(code, model="text-embedding-3-large"):
    """Return the embedding vector for a single piece of text.

    Args:
        code: Text to embed (callers in this file pass code snippets).
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    # The input is sent as a one-element list, so the result of interest is
    # the first entry of ``data``.
    response = client.embeddings.create(model=model, input=[code])
    first_item = response.data[0]
    return first_item.embedding
def getSimilarity(codeA, codeB):
    """Embed two texts and score how similar their embeddings are.

    Args:
        codeA: First text to embed.
        codeB: Second text to embed.

    Returns:
        A tuple ``(embedding_a, embedding_b, score)``: the two embeddings
        exactly as returned by ``get_embedding`` and their cosine similarity.
    """
    vec_a = get_embedding(codeA)
    vec_b = get_embedding(codeB)

    # cosine_similarity is called with each flattened vector wrapped as a
    # single row, so the score is the lone entry at [0][0] of the result.
    row_a = [np.array(vec_a).flatten()]
    row_b = [np.array(vec_b).flatten()]
    score_matrix = cosine_similarity(row_a, row_b)

    return vec_a, vec_b, score_matrix[0][0]
def main():
    """Add embeddings and a similarity score to every code pair in the dataset.

    Loads ``data/java_cn.json`` (indexed as a list of records that each have
    ``codeA`` and ``codeB`` keys), stores the two embeddings and their cosine
    similarity on each record under ``emb_f1``, ``emb_f2`` and ``Result``, and
    writes the enriched records to ``classifier/data/ada_java_cn.json``.

    Transient API failures (rate limit, connection/timeout, server error) are
    retried every 30 seconds until they succeed. Any other failure is raised
    immediately instead of being retried forever.

    Raises:
        KeyError: If a record has no ``codeA`` or ``codeB`` entry.
        Exception: Any non-transient error raised while computing embeddings.
    """
    name = "java_cn"
    input_path = f'data/{name}.json'
    output_path = f'classifier/data/ada_{name}.json'

    # Only these failures can be fixed by waiting. Retrying anything else
    # (bad key, malformed record, rejected input) would loop forever.
    retryable_errors = (
        openai.RateLimitError,
        openai.APIConnectionError,
        openai.APITimeoutError,
        openai.InternalServerError,
    )

    # Create the output directory up front so a missing folder cannot throw
    # away the results after all the API calls have already been made.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for i, record in enumerate(tqdm(data, desc="data processing")):
        # Read the inputs outside the retry loop: a missing key is a data
        # error that no amount of retrying will repair.
        codeA = record['codeA']
        codeB = record['codeB']

        while True:
            try:
                emb_a, emb_b, score = getSimilarity(codeA=codeA, codeB=codeB)
            except retryable_errors as e:
                # Report first so the cause is visible during the wait.
                print(f"Error processing index {i}: {e}")
                time.sleep(30)
            else:
                record['emb_f1'] = emb_a
                record['emb_f2'] = emb_b
                record['Result'] = score
                break

    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file, indent=1)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()