Skip to content

Commit 036ebd6

Browse files
committed
fix #2856 Add crawl order configuration to control URL processing order
1 parent 70ef1c5 commit 036ebd6

File tree

3 files changed

+70
-0
lines changed

3 files changed

+70
-0
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Copyright 2012-2024 CodeLibs Project and the Others.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13+
* either express or implied. See the License for the specific language
14+
* governing permissions and limitations under the License.
15+
*/
16+
package org.codelibs.fess.crawler.service;
17+
18+
import java.util.List;
19+
import java.util.Map;
20+
21+
import org.apache.logging.log4j.LogManager;
22+
import org.apache.logging.log4j.Logger;
23+
import org.codelibs.fess.crawler.entity.EsUrlQueue;
24+
import org.codelibs.fess.crawler.service.impl.EsUrlQueueService;
25+
import org.codelibs.fess.crawler.util.EsCrawlerConfig;
26+
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
27+
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
28+
import org.codelibs.fess.helper.CrawlingConfigHelper;
29+
import org.codelibs.fess.util.ComponentUtil;
30+
import org.opensearch.index.query.QueryBuilders;
31+
import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder;
32+
import org.opensearch.index.query.functionscore.RandomScoreFunctionBuilder;
33+
import org.opensearch.search.sort.SortBuilders;
34+
import org.opensearch.search.sort.SortOrder;
35+
36+
public class FessUrlQueueService extends EsUrlQueueService {
37+
private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);
38+
39+
public FessUrlQueueService(final EsCrawlerConfig crawlerConfig) {
40+
super(crawlerConfig);
41+
}
42+
43+
@Override
44+
protected List<EsUrlQueue> fetchUrlQueueList(final String sessionId) {
45+
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
46+
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId);
47+
final Map<String, String> configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
48+
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential");
49+
if ("random".equals(crawlOrder)) {
50+
return getList(EsUrlQueue.class, sessionId,
51+
QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(),
52+
new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(
53+
new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }),
54+
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC));
55+
} else if (!"sequential".equals(crawlOrder)) {
56+
logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder);
57+
}
58+
return getList(EsUrlQueue.class, sessionId, null, 0, pollingFetchSize, SortBuilders.fieldSort(CREATE_TIME).order(SortOrder.ASC));
59+
}
60+
}

src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ public static class Config {
133133
public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
134134
public static final String SCRIPT_TYPE = "script.type";
135135
public static final String HTML_CHILD_URL_RULES = "html.child.url.rules";
136+
// Order in which queued URLs are fetched: FessUrlQueueService handles "random" and treats any other value as "sequential" (the default).
public static final String CRAWL_ORDER = "crawl.order";
136137
}
137138

138139
// meta.*
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
3+
"http://dbflute.org/meta/lastadi10.dtd">
4+
<components namespace="fessCrawler">
5+
<!-- Registers FessUrlQueueService under the name urlQueueService, passing crawlerConfig as its constructor argument. -->
<component name="urlQueueService"
6+
class="org.codelibs.fess.crawler.service.FessUrlQueueService">
7+
<arg>crawlerConfig</arg>
8+
</component>
9+
</components>

0 commit comments

Comments
 (0)