Skip to content

Commit c8b7de4

Browse files
committed
feat(.service): 创建文档操作-异步任务补充完成
1 parent 1824837 commit c8b7de4

4 files changed

Lines changed: 162 additions & 4 deletions

File tree

llmops-service/src/main/java/com/emcikem/llm/service/provider/LLMOpsDatasetProvider.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,4 +309,10 @@ public boolean createSegmentList(List<LlmOpsSegmentDO> llmOpsSegmentList) {
309309
// return llmOpsSegmentList.batch
310310
return false;
311311
}
312+
313+
public boolean updateSegmentByNodeIds(LlmOpsSegmentDO llmOpsSegmentDO, List<String> nodeIdxList) {
314+
LlmOpsSegmentDOExample example = new LlmOpsSegmentDOExample();
315+
example.createCriteria().andNodeIdIn(nodeIdxList);
316+
return llmOpsSegmentDOMapper.updateByExampleSelective(llmOpsSegmentDO, example) == nodeIdxList.size();
317+
}
312318
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package com.emcikem.llm.service.provider;
2+
3+
import com.emcikem.llm.dao.entity.LlmOpsKeywordTableDO;
4+
import com.emcikem.llm.dao.example.LlmOpsKeywordTableDOExample;
5+
import com.emcikem.llm.dao.mapper.LlmOpsKeywordTableDOMapper;
6+
import jakarta.annotation.Resource;
7+
import org.apache.commons.collections4.CollectionUtils;
8+
import org.springframework.stereotype.Service;
9+
10+
import java.util.List;
11+
12+
/**
13+
* Create with Emcikem on 2025/6/2
14+
*
15+
* @author Emcikem
16+
* @version 1.0.0
17+
*/
18+
@Service
19+
public class LLMOpsKeyWordProvider {
20+
21+
@Resource
22+
private LlmOpsKeywordTableDOMapper llmOpsKeywordTableDOMapper;
23+
24+
public LlmOpsKeywordTableDO getKeyWordTableByDatasetId(String datasetId) {
25+
LlmOpsKeywordTableDOExample example = new LlmOpsKeywordTableDOExample();
26+
example.createCriteria().andDatasetIdEqualTo(datasetId);
27+
List<LlmOpsKeywordTableDO> llmOpsKeywordTableList = llmOpsKeywordTableDOMapper.selectByExampleWithBLOBs(example);
28+
if (CollectionUtils.isEmpty(llmOpsKeywordTableList)) {
29+
return null;
30+
}
31+
return llmOpsKeywordTableList.get(0);
32+
}
33+
34+
public boolean updateKeyword(LlmOpsKeywordTableDO keyWordTableDO) {
35+
return llmOpsKeywordTableDOMapper.updateByPrimaryKeySelective(keyWordTableDO) == 1;
36+
}
37+
38+
public boolean insertKeywordTable(LlmOpsKeywordTableDO llmOpsKeywordTableDO) {
39+
return llmOpsKeywordTableDOMapper.insert(llmOpsKeywordTableDO) == 1;
40+
}
41+
}

llmops-service/src/main/java/com/emcikem/llm/service/service/dataset/LLMOpsDocumentTask.java

Lines changed: 89 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
import com.emcikem.llm.common.util.GsonUtil;
77
import com.emcikem.llm.common.vo.dataset.process.DocumentProcessVO;
88
import com.emcikem.llm.dao.entity.LlmOpsDocumentDO;
9+
import com.emcikem.llm.dao.entity.LlmOpsKeywordTableDO;
910
import com.emcikem.llm.dao.entity.LlmOpsProcessRuleDO;
1011
import com.emcikem.llm.dao.entity.LlmOpsSegmentDO;
1112
import com.emcikem.llm.dao.entity.LlmOpsUploadFileDO;
1213
import com.emcikem.llm.service.provider.LLMOpsDatasetProvider;
14+
import com.emcikem.llm.service.provider.LLMOpsKeyWordProvider;
1315
import com.emcikem.llm.service.provider.LLMOpsProcessRuleProvider;
1416
import com.emcikem.llm.service.provider.LlmOpsUploadFileProvider;
1517
import com.emcikem.llm.service.util.FileUtil;
@@ -23,13 +25,12 @@
2325
import jakarta.annotation.Resource;
2426
import org.apache.commons.collections4.CollectionUtils;
2527
import org.apache.commons.compress.utils.Lists;
28+
import org.apache.commons.lang3.StringUtils;
2629
import org.springframework.stereotype.Service;
2730

28-
import java.util.ArrayList;
2931
import java.util.Date;
3032
import java.util.List;
3133
import java.util.Map;
32-
import java.util.Objects;
3334
import java.util.UUID;
3435

3536
/**
@@ -53,6 +54,12 @@ public class LLMOpsDocumentTask {
5354
@Resource
5455
private LLMOpsProcessRuleProvider llmOpsProcessRuleProvider;
5556

57+
@Resource
58+
private LLMOpsKeyWordProvider llmOpsKeyWordProvider;
59+
60+
@Resource
61+
private VectorDatabaseService vectorDatabaseService;
62+
5663
/**
5764
* 根据传递的文档id列表构建文档,涵盖了加载、分割、索引构建、数据存储等内容
5865
* @param documentIdList
@@ -80,6 +87,7 @@ public void buildDocumentsAsync(List<String> documentIdList) {
8087
indexing(documentDO, lcSegmentList);
8188

8289
// 7. 存储操作,涵盖文档状态更新,以及向量数据库的存储
90+
completed(documentDO, lcSegmentList);
8391

8492
} catch (Exception ex) {
8593
documentDO.setStatus(DataBaseStatusEnum.ERROR.getDesc());
@@ -91,6 +99,43 @@ public void buildDocumentsAsync(List<String> documentIdList) {
9199
}
92100
}
93101

102+
/**
103+
* 村粗文档片段到向量数据库,并完成状态更新
104+
* @param llmOpsDocumentDO
105+
* @param lcSegmentList
106+
*/
107+
public void completed(LlmOpsDocumentDO llmOpsDocumentDO, List<TextSegment> lcSegmentList) {
108+
// 1. 循环遍历片段列表数据,将文档状态以及片段状态设置为True
109+
for (TextSegment lcSegment : lcSegmentList) {
110+
lcSegment.metadata().put("document_enabled", Boolean.TRUE.toString());
111+
lcSegment.metadata().put("segment_enabled", Boolean.TRUE.toString());
112+
}
113+
114+
// 2. 调用向量数据库,每次存储10条数据,避免一次传递过多的数据
115+
for (int i = 0; i < lcSegmentList.size(); i += 10) {
116+
// 3. 提取需要存储的数据和ids
117+
List<TextSegment> textSegments = lcSegmentList.subList(i, Math.min(lcSegmentList.size(), i + 10));
118+
List<String> nodeIdxList = textSegments.stream().map(x->x.metadata().getString("node_id")).toList();
119+
120+
// 4. TODO:调用向量数据库存储对应的数据
121+
vectorDatabaseService.addDocuments(textSegments, nodeIdxList);
122+
123+
// 5. 更新关联片段的状态以及完成时间
124+
LlmOpsSegmentDO llmOpsSegmentDO = new LlmOpsSegmentDO();
125+
llmOpsSegmentDO.setStatus(DataBaseStatusEnum.COMPLETED.getDesc());
126+
llmOpsSegmentDO.setCompletedAt(new Date());
127+
llmOpsSegmentDO.setEnabled(true);
128+
llmOpsDatasetProvider.updateSegmentByNodeIds(llmOpsSegmentDO, nodeIdxList);
129+
}
130+
131+
// 6. 更新文档的状态
132+
llmOpsDocumentDO.setStatus(DataBaseStatusEnum.COMPLETED.getDesc());
133+
llmOpsDocumentDO.setCompletedAt(new Date());
134+
llmOpsDocumentDO.setEnabled(false);
135+
boolean result = llmOpsDatasetProvider.updateDocument(llmOpsDocumentDO);
136+
137+
}
138+
94139
/**
95140
* 根据传递的信息构建索引,涵盖关键词提取,词表构建
96141
* @param documentDO
@@ -108,11 +153,51 @@ private void indexing(LlmOpsDocumentDO documentDO, List<TextSegment> lcSegmentLi
108153
llmOpsSegmentDO.setStatus(DataBaseStatusEnum.INDEXING.getDesc());
109154
llmOpsSegmentDO.setUpdatedAt(new Date());
110155
llmOpsSegmentDO.setIndexCompletedAt(new Date());
111-
llmOpsDatasetProvider.updateSegment(llmOpsSegmentDO);
156+
boolean result = llmOpsDatasetProvider.updateSegment(llmOpsSegmentDO);
157+
158+
// 3. 获取当前知识库的关键词表
159+
LlmOpsKeywordTableDO keyWordTableDO = llmOpsKeyWordProvider.getKeyWordTableByDatasetId(llmOpsSegmentDO.getDatasetId());
160+
Map<String, List<String>> keywordTableMap = Maps.newHashMap();
161+
if (keyWordTableDO != null && StringUtils.isNoneEmpty(keyWordTableDO.getKeywordTable())) {
162+
keywordTableMap = GsonUtil.gsonToMaps(keyWordTableDO.getKeywordTable());
163+
}
112164

113-
// 3. 获取当前知识库的关键词表
165+
// 4. 循环将新的关键词添加到关键词表中
166+
for (String keyWord : keyWords) {
167+
if (!keywordTableMap.containsKey(keyWord)) {
168+
keywordTableMap.put(keyWord, Lists.newArrayList());
169+
}
170+
keywordTableMap.get(keyWord).add(llmOpsSegmentDO.getId());
171+
}
172+
173+
// 5. 更新关键词表
174+
if (keyWordTableDO == null) {
175+
LlmOpsKeywordTableDO llmOpsKeywordTableDO = getKeywordTableDO(keyWordTableDO, keywordTableMap, llmOpsSegmentDO.getDatasetId());
176+
boolean insertResult = llmOpsKeyWordProvider.insertKeywordTable(llmOpsKeywordTableDO);
177+
} else {
178+
boolean updateResult = llmOpsKeyWordProvider.updateKeyword(keyWordTableDO);
179+
}
180+
}
181+
182+
// 7. 更新文档状态
183+
documentDO.setUpdatedAt(new Date());
184+
documentDO.setIndexCompletedAt(new Date());
185+
llmOpsDatasetProvider.updateDocument(documentDO);
186+
}
114187

188+
private LlmOpsKeywordTableDO getKeywordTableDO(LlmOpsKeywordTableDO keyWordTableDO, Map<String, List<String>> keywordTableMap, String datasetId) {
189+
if (keyWordTableDO == null) {
190+
LlmOpsKeywordTableDO llmOpsKeywordTableDO = new LlmOpsKeywordTableDO();
191+
llmOpsKeywordTableDO.setId(UUID.randomUUID().toString());
192+
llmOpsKeywordTableDO.setDatasetId(datasetId);
193+
llmOpsKeywordTableDO.setKeywordTable(GsonUtil.toJSONString(keywordTableMap));
194+
llmOpsKeywordTableDO.setCreatedAt(new Date());
195+
llmOpsKeywordTableDO.setUpdatedAt(new Date());
196+
return llmOpsKeywordTableDO;
115197
}
198+
keyWordTableDO.setUpdatedAt(new Date());
199+
keyWordTableDO.setKeywordTable(GsonUtil.toJSONString(keywordTableMap));
200+
return keyWordTableDO;
116201
}
117202

118203
/**
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.emcikem.llm.service.service.dataset;
2+
3+
import dev.langchain4j.data.embedding.Embedding;
4+
import dev.langchain4j.data.segment.TextSegment;
5+
import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
6+
import lombok.extern.slf4j.Slf4j;
7+
import org.springframework.stereotype.Service;
8+
9+
import java.util.List;
10+
11+
/**
12+
* Create with Emcikem on 2025/6/2
13+
*
14+
* @author Emcikem
15+
* @version 1.0.0
16+
*/
17+
@Service
18+
@Slf4j
19+
public class VectorDatabaseService {
20+
21+
private InMemoryEmbeddingStore<Embedding> embeddingStore;
22+
23+
public boolean addDocuments(List<TextSegment> textSegmentList, List<String> nodeIdList) {
24+
25+
}
26+
}

0 commit comments

Comments
 (0)