Skip to content

Commit d8b9172

Browse files
committed
#67 - Fixed a bug where a URL could be crawled multiple times simultaneously when the source is Common Crawl.
1 parent 5038e41 commit d8b9172

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

loa-application/loa-downloader-application/src/main/java/com/github/loa/downloader/command/batch/generator/commoncrawl/CommonCrawlDocumentGenerator.java

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,9 @@
1515

1616
import java.net.MalformedURLException;
1717
import java.net.URL;
18-
import java.util.ArrayList;
19-
import java.util.Collections;
20-
import java.util.List;
21-
import java.util.Optional;
18+
import java.util.*;
2219
import java.util.concurrent.ExecutorService;
2320
import java.util.concurrent.Executors;
24-
import java.util.concurrent.atomic.AtomicInteger;
2521
import java.util.stream.Collectors;
2622

2723
@Slf4j
@@ -36,6 +32,7 @@ public class CommonCrawlDocumentGenerator implements Generator<String> {
3632
private int processedWarcFiles;
3733
private List<String> crawlLocations = new ArrayList<>();
3834
private List<String> availableUrls = Collections.synchronizedList(new ArrayList<>());
35+
private Set<String> upcomingUrls = Collections.synchronizedSet(new HashSet<>());
3936
private final ExecutorService executorService = Executors.newFixedThreadPool(10);
4037

4138
@Override
@@ -53,6 +50,9 @@ public Optional<String> generate() {
5350

5451
handleWarcFile(crawlLocations.remove(0));
5552

53+
availableUrls.addAll(upcomingUrls);
54+
upcomingUrls.clear();
55+
5656
log.info("Finished the processing of WARC file with ID: " + (
5757
commonCrawlDocumentSourceConfiguration.getWarcId() + processedWarcFiles));
5858

@@ -89,16 +89,16 @@ private void handleWarcRecord(final WarcRecord warcRecord) {
8989
executorService.submit(() -> {
9090
final Document document = Jsoup.parse(contentString, warcRecordUrl);
9191

92-
final List<String> urlsOnPage = document.select("a").stream()
92+
final Set<String> urlsOnPage = document.select("a").stream()
9393
.map(element -> element.attr("abs:href"))
9494
.filter(url -> !url.isEmpty())
95-
.collect(Collectors.toList());
95+
.collect(Collectors.toSet());
9696

97-
final int beforeUrls = availableUrls.size();
97+
final int beforeUrls = upcomingUrls.size();
9898

99-
availableUrls.addAll(urlsOnPage);
99+
upcomingUrls.addAll(urlsOnPage);
100100

101-
final int afterUrls = availableUrls.size();
101+
final int afterUrls = upcomingUrls.size();
102102

103103
if ((afterUrls / 25000) - (beforeUrls / 25000) > 0) {
104104
log.info("Collected " + afterUrls + " urls!");

0 commit comments

Comments
 (0)