-
Notifications
You must be signed in to change notification settings - Fork 4
/
_53_CassIOVectorDbTest.java
152 lines (132 loc) · 5.87 KB
/
_53_CassIOVectorDbTest.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
package devoxx.demo._5_vectorsearch;
import dev.langchain4j.model.input.Prompt;
import dev.langchain4j.model.input.PromptTemplate;
import dev.langchain4j.model.output.Response;
import dev.langchain4j.store.cassio.AnnQuery;
import dev.langchain4j.store.cassio.AnnResult;
import dev.langchain4j.store.cassio.MetadataVectorRecord;
import dev.langchain4j.store.cassio.MetadataVectorTable;
import devoxx.demo.devoxx.Quote;
import devoxx.demo.utils.AbstractDevoxxTestSupport;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static dev.langchain4j.store.cassio.SimilarityMetric.COSINE;
import static devoxx.demo.devoxx.Utilities.ASTRA_KEYSPACE;
import static devoxx.demo.devoxx.Utilities.EMBEDDING_DIMENSION;
import static devoxx.demo.devoxx.Utilities.TABLE_NAME;
import static devoxx.demo.devoxx.Utilities.loadQuotes;
/**
 * Demo tests that ingest quotes into a CassIO {@link MetadataVectorTable} and
 * run approximate-nearest-neighbour (ANN) searches against it, optionally
 * filtered on metadata, and finally feed the matches into a text-generation prompt.
 *
 * <p>Metadata layout used by this class: the quote's author is stored under the
 * {@code "author"} key, and every tag of a quote is stored as its own key with the
 * value {@code "true"} so that a search can filter on a single tag.
 */
@Slf4j
public class _53_CassIOVectorDbTest extends AbstractDevoxxTestSupport {

    /** Metadata key holding the author of a quote. */
    private static final String AUTHOR_KEY = "author";

    /** Optional metadata key holding a serialized tag list such as {@code "[a, b]"}. */
    private static final String TAGS_KEY = "tags";

    /** Metadata value marking a key as "this tag applies to the quote". */
    private static final String TAG_FLAG = "true";

    /**
     * Creates the vector table, empties it and loads every quote of
     * {@code philo_quotes.json} into it (one record per quote).
     *
     * <p>The test is disabled; note that it clears the table before loading.
     *
     * @throws IOException declared because {@code loadQuotes} is invoked here
     */
    @Test
    @Disabled
    public void shouldIngestDocuments() throws IOException {
        // One table handle for the whole ingestion instead of a new one per call.
        MetadataVectorTable table = getVectorTable();
        table.create();
        table.clear();
        loadQuotes("philo_quotes.json")               // extraction
                .stream().parallel()                  // quotes are embedded and stored in parallel
                .map(this::mapQuoteToCassandraRecord) // bean -> db record (computes the embedding)
                .forEach(table::put);                 // persist
    }

    /**
     * Embeds a sample sentence, searches the three closest records (cosine
     * similarity, threshold 0.8) and prints their bodies.
     */
    @Test
    public void shouldFindSimilarQuotes() {
        // Encode question
        List<Float> vector = getEmbeddingModelGecko()
                .embed("We struggle all our life for nothing")
                .content()
                .vectorAsList();

        // Build Query
        AnnQuery query = AnnQuery.builder()
                .metric(COSINE)
                .topK(3).threshold(.8)
                .embeddings(vector) // add vector here
                .build();

        // Execute query
        Stream<MetadataVectorRecord> results = getVectorTable()
                .similaritySearch(query)
                .stream()
                .map(AnnResult::getEmbedded);

        // Display Results
        results.map(MetadataVectorRecord::getBody)
                .forEach(System.out::println);
    }

    /**
     * Runs the same search twice: once without a metadata filter and once
     * restricted to quotes tagged {@code "politics"}.
     */
    @Test
    public void shouldFindSimilarQuotesWithFilter() {
        log.info("Sample Search");
        findSimilarQuotes("We struggle all our life for nothing", 3, .8, null)
                .map(Quote::body).forEach(System.out::println);

        log.info("Search with filter");
        findSimilarQuotes("We struggle all our life for nothing", 2, .5, "politics")
                .map(Quote::body).forEach(System.out::println);
    }

    /**
     * Builds a prompt from quotes similar to a sample sentence (filtered on the
     * {@code "politics"} tag) and asks the text model to generate a new quote.
     * Both the rendered prompt and the model's answer are printed.
     */
    @Test
    public void shouldGenerateQuotes() {
        log.info("Generate Quotes");
        PromptTemplate promptTemplate = PromptTemplate.from("""
                Generate a single short philosophical quote on the given topic,
                similar in spirit and form to the provided actual example quotes.
                Do not exceed 20-30 words in your quote.
                REFERENCE TOPIC: \n {{topic}} \n
                ACTUAL EXAMPLES:\n{{examples}}
                """);

        // The retrieved quote bodies become the "examples" section of the prompt.
        String examples = findSimilarQuotes("We struggle all our life for nothing", 2, .8, "politics")
                .map(Quote::body)
                .collect(Collectors.joining(","));

        Map<String, Object> variables = new HashMap<>();
        variables.put("topic", "politics");
        variables.put("examples", examples);

        Prompt prompt = promptTemplate.apply(variables);
        System.out.println(prompt.toSystemMessage().text());

        Response<String> response = getLanguageModelTextBison().generate(prompt);
        System.out.println(response.content());
    }

    /**
     * Embeds {@code quote} and returns the stored quotes most similar to it.
     *
     * @param quote     text to embed and search for
     * @param topK      maximum number of results requested
     * @param threshold minimum similarity passed to the query
     * @param topic     tag to filter on, or {@code null} for no metadata filter
     * @return the matching records mapped back to {@link Quote}s
     */
    private Stream<Quote> findSimilarQuotes(String quote, int topK, double threshold, String topic) {
        AnnQuery query = AnnQuery.builder()
                .metric(COSINE)
                .topK(topK)
                .threshold(threshold)
                // Tags are stored as "<tag>" -> "true", so filtering on a topic is a key match.
                .metaData(topic != null ? Map.of(topic, TAG_FLAG) : null)
                .embeddings(getEmbeddingModelGecko()
                        .embed(quote)
                        .content().vectorAsList())
                .build();
        return getVectorTable().similaritySearch(query).stream()
                .map(AnnResult::getEmbedded)
                .map(this::mapCassandraRowToQuote);
    }

    /**
     * Converts a {@link Quote} into a vector record: body, author and tag
     * metadata, the embedding of the body, and the quote's row id.
     *
     * @param quote quote to convert
     * @return the record ready to be stored in the vector table
     */
    private MetadataVectorRecord mapQuoteToCassandraRecord(Quote quote) {
        System.out.println("◾ " + quote );
        MetadataVectorRecord record = new MetadataVectorRecord();
        record.setBody(quote.body());
        record.getMetadata().put(AUTHOR_KEY, quote.author());
        if (quote.tags() != null) {
            // One metadata entry per tag so that searches can filter on a single tag.
            quote.tags().forEach(tag -> record.getMetadata().put(tag, TAG_FLAG));
        }
        record.setVector(getEmbeddingModelGecko().embed(quote.body()).content().vectorAsList());
        record.setRowId(quote.rowId());
        return record;
    }

    /**
     * Converts a stored record back into a {@link Quote}.
     *
     * <p>Tags are taken from the {@code "tags"} metadata entry when it exists.
     * Records written by {@link #mapQuoteToCassandraRecord(Quote)} have no such
     * entry; for them the tags are rebuilt from the metadata keys whose value is
     * {@code "true"} (their order is not preserved).
     *
     * @param r record returned by the vector table
     * @return the quote described by the record
     */
    private Quote mapCassandraRowToQuote(MetadataVectorRecord r) {
        Map<String, String> metadata = r.getMetadata();
        List<String> tags = new ArrayList<>();
        String serializedTags = metadata.get(TAGS_KEY);
        if (serializedTags != null) {
            tags.addAll(parseTagList(serializedTags));
        } else {
            metadata.forEach((key, value) -> {
                if (TAG_FLAG.equals(value) && !AUTHOR_KEY.equals(key)) {
                    tags.add(key);
                }
            });
        }
        return new Quote(
                r.getRowId(),
                metadata.get(AUTHOR_KEY),
                tags,
                r.getBody());
    }

    /**
     * Parses a serialized tag list such as {@code "[wisdom, politics]"}.
     * Surrounding brackets are optional; blank entries are dropped, so
     * {@code "[]"} and {@code ""} both yield an empty list.
     *
     * @param serializedTags non-null serialized list
     * @return the trimmed, non-empty tags in their original order
     */
    private static List<String> parseTagList(String serializedTags) {
        String inner = serializedTags.trim();
        // Strip the brackets only when both are present; never cut arbitrary characters.
        if (inner.startsWith("[") && inner.endsWith("]")) {
            inner = inner.substring(1, inner.length() - 1);
        }
        return Arrays.stream(inner.split(","))
                .map(String::trim)
                .filter(tag -> !tag.isEmpty())
                .toList();
    }

    /**
     * Builds a handle on the vector table from the test's Cassandra session and
     * the keyspace, table name and embedding dimension constants.
     *
     * @return a new table handle (a new object on every call)
     */
    private MetadataVectorTable getVectorTable() {
        return new MetadataVectorTable(getCassandraSession(), ASTRA_KEYSPACE, TABLE_NAME, EMBEDDING_DIMENSION);
    }
}