# run_embeddings.py
# NOTE: GitHub page chrome (navigation text and the 1-90 line-number gutter)
# was captured along with this listing; it is not part of the script.
# Import the SentenceTransformer class, utility functions, and torch
from sentence_transformers import SentenceTransformer, util
import torch
# Banner describing what this example script demonstrates.
print("-------------------------------------------")
print("Hugging Face Ecosystem Example")
print("Task: Semantic Similarity Search")
print("Library: sentence-transformers")
print("Model: all-MiniLM-L6-v2")
print("-------------------------------------------")

# 1. Determine device (GPU or CPU)
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
if device == 'cuda':
    # Only query the GPU name when CUDA was actually selected above.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
# 2. Load the Sentence Transformer model
# The model is placed on the device selected in step 1.
print("Loading Sentence Transformer model (may download on first run)...")
try:
    # Keep the try body to the single call that can fail.
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
except Exception as e:
    # Top-level boundary of the script: report the failure and stop, since
    # nothing below can run without a loaded model.
    print(f"Error loading model: {e}")
    print("Please ensure 'sentence-transformers' and 'torch' (or 'tensorflow') are installed.")
    # Exit with a non-zero status so callers can detect the failure. The
    # builtin exit() is injected by the site module (not always present) and,
    # called without an argument, reports success.
    raise SystemExit(1)
else:
    print("Model loaded successfully.")
# 3. Define the 'corpus' of sentences to search within
# Using context relevant to Friday afternoon in Perth
corpus_sentences = [
    "It's a sunny Friday afternoon here in Perth.",
    "Many people are planning their weekend activities now.",
    "Traffic leaving the city centre can be busy around 2 PM.",
    "A walk along the Swan River seems like a good idea later.",
    "The weather in Western Australia is lovely today.",
    "Commuters might face delays on the freeway system.",
    "Finding parking downtown might be difficult this afternoon.",
]

# 4. Define the query sentence we want to find similar sentences for
# Note it uses different words than the target sentences in the corpus.
query_sentence = "How is the road congestion in the city right now?"

# Echo the inputs so the search results printed later can be read in context.
print("\nCorpus Sentences:")
for sentence in corpus_sentences:
    print(f"- \"{sentence}\"")
print(f"\nQuery Sentence: \"{query_sentence}\"")
# 5. Encode the corpus and the query to get embeddings
print("\nGenerating embeddings for corpus and query...")
try:
    # It's often efficient to encode the corpus once and reuse it.
    # convert_to_tensor=True is recommended for util.semantic_search.
    corpus_embeddings = model.encode(corpus_sentences, convert_to_tensor=True, device=device)
    query_embedding = model.encode(query_sentence, convert_to_tensor=True, device=device)
    print("Embeddings generated.")

    # 6. Perform semantic search using cosine similarity
    # util.semantic_search finds the top_k most similar sentences.
    print("\nPerforming semantic search...")
    # Find the top 3 most similar sentences in the corpus
    top_k = 3
    # Returns a list of lists (one per query) of dicts with 'corpus_id' and 'score'.
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the results for the first (and only) query
    print("Search complete.")

    # 7. Print the results
    print(f"\n--- Top {top_k} Most Similar Sentences to the Query ---")
    if not hits:
        print("No similar sentences found.")
    else:
        print(f"Query: \"{query_sentence}\"\n")
        # Ranks are reported 1-based, in the order the search returned them.
        for rank, hit in enumerate(hits, start=1):
            corpus_id = hit['corpus_id']  # Index of the sentence in corpus_sentences
            score = hit['score']  # Cosine similarity score
            print(f"Rank {rank}: Score: {score:.4f}")
            print(f" Sentence: \"{corpus_sentences[corpus_id]}\"")
            print("-" * 15)  # Separator
    print("-------------------------------------------------")
except Exception as e:
    # Best-effort demo: report the failure and fall through to the closing line.
    print(f"Error during embedding generation or search: {e}")

print("\nExample finished.")