66import com .emcikem .llm .common .util .GsonUtil ;
77import com .emcikem .llm .common .vo .dataset .process .DocumentProcessVO ;
88import com .emcikem .llm .dao .entity .LlmOpsDocumentDO ;
9+ import com .emcikem .llm .dao .entity .LlmOpsKeywordTableDO ;
910import com .emcikem .llm .dao .entity .LlmOpsProcessRuleDO ;
1011import com .emcikem .llm .dao .entity .LlmOpsSegmentDO ;
1112import com .emcikem .llm .dao .entity .LlmOpsUploadFileDO ;
1213import com .emcikem .llm .service .provider .LLMOpsDatasetProvider ;
14+ import com .emcikem .llm .service .provider .LLMOpsKeyWordProvider ;
1315import com .emcikem .llm .service .provider .LLMOpsProcessRuleProvider ;
1416import com .emcikem .llm .service .provider .LlmOpsUploadFileProvider ;
1517import com .emcikem .llm .service .util .FileUtil ;
2325import jakarta .annotation .Resource ;
2426import org .apache .commons .collections4 .CollectionUtils ;
2527import org .apache .commons .compress .utils .Lists ;
28+ import org .apache .commons .lang3 .StringUtils ;
2629import org .springframework .stereotype .Service ;
2730
28- import java .util .ArrayList ;
2931import java .util .Date ;
3032import java .util .List ;
3133import java .util .Map ;
32- import java .util .Objects ;
3334import java .util .UUID ;
3435
3536/**
@@ -53,6 +54,12 @@ public class LLMOpsDocumentTask {
5354 @ Resource
5455 private LLMOpsProcessRuleProvider llmOpsProcessRuleProvider ;
5556
57+ @ Resource
58+ private LLMOpsKeyWordProvider llmOpsKeyWordProvider ;
59+
60+ @ Resource
61+ private VectorDatabaseService vectorDatabaseService ;
62+
5663 /**
5865 * 根据传递的文档id列表构建文档,涵盖了加载、分割、索引构建、数据存储等内容
5865 * @param documentIdList
@@ -80,6 +87,7 @@ public void buildDocumentsAsync(List<String> documentIdList) {
8087 indexing (documentDO , lcSegmentList );
8188
8289 // 7. 存储操作,涵盖文档状态更新,以及向量数据库的存储
90+ completed (documentDO , lcSegmentList );
8391
8492 } catch (Exception ex ) {
8593 documentDO .setStatus (DataBaseStatusEnum .ERROR .getDesc ());
@@ -91,6 +99,43 @@ public void buildDocumentsAsync(List<String> documentIdList) {
9199 }
92100 }
93101
102+ /**
103+ * 村粗文档片段到向量数据库,并完成状态更新
104+ * @param llmOpsDocumentDO
105+ * @param lcSegmentList
106+ */
107+ public void completed (LlmOpsDocumentDO llmOpsDocumentDO , List <TextSegment > lcSegmentList ) {
108+ // 1. 循环遍历片段列表数据,将文档状态以及片段状态设置为True
109+ for (TextSegment lcSegment : lcSegmentList ) {
110+ lcSegment .metadata ().put ("document_enabled" , Boolean .TRUE .toString ());
111+ lcSegment .metadata ().put ("segment_enabled" , Boolean .TRUE .toString ());
112+ }
113+
114+ // 2. 调用向量数据库,每次存储10条数据,避免一次传递过多的数据
115+ for (int i = 0 ; i < lcSegmentList .size (); i += 10 ) {
116+ // 3. 提取需要存储的数据和ids
117+ List <TextSegment > textSegments = lcSegmentList .subList (i , Math .min (lcSegmentList .size (), i + 10 ));
118+ List <String > nodeIdxList = textSegments .stream ().map (x ->x .metadata ().getString ("node_id" )).toList ();
119+
120+ // 4. TODO:调用向量数据库存储对应的数据
121+ vectorDatabaseService .addDocuments (textSegments , nodeIdxList );
122+
123+ // 5. 更新关联片段的状态以及完成时间
124+ LlmOpsSegmentDO llmOpsSegmentDO = new LlmOpsSegmentDO ();
125+ llmOpsSegmentDO .setStatus (DataBaseStatusEnum .COMPLETED .getDesc ());
126+ llmOpsSegmentDO .setCompletedAt (new Date ());
127+ llmOpsSegmentDO .setEnabled (true );
128+ llmOpsDatasetProvider .updateSegmentByNodeIds (llmOpsSegmentDO , nodeIdxList );
129+ }
130+
131+ // 6. 更新文档的状态
132+ llmOpsDocumentDO .setStatus (DataBaseStatusEnum .COMPLETED .getDesc ());
133+ llmOpsDocumentDO .setCompletedAt (new Date ());
134+ llmOpsDocumentDO .setEnabled (false );
135+ boolean result = llmOpsDatasetProvider .updateDocument (llmOpsDocumentDO );
136+
137+ }
138+
94139 /**
95140 * 根据传递的信息构建索引,涵盖关键词提取,词表构建
96141 * @param documentDO
@@ -108,11 +153,51 @@ private void indexing(LlmOpsDocumentDO documentDO, List<TextSegment> lcSegmentLi
108153 llmOpsSegmentDO .setStatus (DataBaseStatusEnum .INDEXING .getDesc ());
109154 llmOpsSegmentDO .setUpdatedAt (new Date ());
110155 llmOpsSegmentDO .setIndexCompletedAt (new Date ());
111- llmOpsDatasetProvider .updateSegment (llmOpsSegmentDO );
156+ boolean result = llmOpsDatasetProvider .updateSegment (llmOpsSegmentDO );
157+
158+ // 3. 获取当前知识库的关键词表
159+ LlmOpsKeywordTableDO keyWordTableDO = llmOpsKeyWordProvider .getKeyWordTableByDatasetId (llmOpsSegmentDO .getDatasetId ());
160+ Map <String , List <String >> keywordTableMap = Maps .newHashMap ();
161+ if (keyWordTableDO != null && StringUtils .isNoneEmpty (keyWordTableDO .getKeywordTable ())) {
162+ keywordTableMap = GsonUtil .gsonToMaps (keyWordTableDO .getKeywordTable ());
163+ }
112164
113- // 3. 获取当前知识库的关键词表
165+ // 4. 循环将新的关键词添加到关键词表中
166+ for (String keyWord : keyWords ) {
167+ if (!keywordTableMap .containsKey (keyWord )) {
168+ keywordTableMap .put (keyWord , Lists .newArrayList ());
169+ }
170+ keywordTableMap .get (keyWord ).add (llmOpsSegmentDO .getId ());
171+ }
172+
173+ // 5. 更新关键词表
174+ if (keyWordTableDO == null ) {
175+ LlmOpsKeywordTableDO llmOpsKeywordTableDO = getKeywordTableDO (keyWordTableDO , keywordTableMap , llmOpsSegmentDO .getDatasetId ());
176+ boolean insertResult = llmOpsKeyWordProvider .insertKeywordTable (llmOpsKeywordTableDO );
177+ } else {
178+ boolean updateResult = llmOpsKeyWordProvider .updateKeyword (keyWordTableDO );
179+ }
180+ }
181+
182+ // 7. 更新文档状态
183+ documentDO .setUpdatedAt (new Date ());
184+ documentDO .setIndexCompletedAt (new Date ());
185+ llmOpsDatasetProvider .updateDocument (documentDO );
186+ }
114187
188+ private LlmOpsKeywordTableDO getKeywordTableDO (LlmOpsKeywordTableDO keyWordTableDO , Map <String , List <String >> keywordTableMap , String datasetId ) {
189+ if (keyWordTableDO == null ) {
190+ LlmOpsKeywordTableDO llmOpsKeywordTableDO = new LlmOpsKeywordTableDO ();
191+ llmOpsKeywordTableDO .setId (UUID .randomUUID ().toString ());
192+ llmOpsKeywordTableDO .setDatasetId (datasetId );
193+ llmOpsKeywordTableDO .setKeywordTable (GsonUtil .toJSONString (keywordTableMap ));
194+ llmOpsKeywordTableDO .setCreatedAt (new Date ());
195+ llmOpsKeywordTableDO .setUpdatedAt (new Date ());
196+ return llmOpsKeywordTableDO ;
115197 }
198+ keyWordTableDO .setUpdatedAt (new Date ());
199+ keyWordTableDO .setKeywordTable (GsonUtil .toJSONString (keywordTableMap ));
200+ return keyWordTableDO ;
116201 }
117202
118203 /**
0 commit comments