Skip to content

Commit e2665b1

Browse files
committed
filter ard topics
1 parent 5e9c030 commit e2665b1

File tree

1 file changed

+20
-6
lines changed

1 file changed

+20
-6
lines changed

src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,41 @@
44
import mServer.crawler.sender.base.UrlUtils;
55
import mServer.crawler.sender.zdf.ZdfConstants;
66
import org.jsoup.nodes.Document;
7+
import org.jsoup.nodes.Element;
78
import org.jsoup.select.Elements;
89

910
import java.util.HashSet;
1011
import java.util.Set;
1112

1213
public class ZdfTopicsPageHtmlDeserializer {
1314

14-
private static final String LINK_SELECTOR = "article h3 a";
15+
private static final String ARTICLE_SELECTOR = "article";
16+
private static final String LINK_SELECTOR = "h3 a";
17+
private static final String TEASER_SELECTOR = "dd.teaser-info span";
1518
private static final String ATTRIBUTE_HREF = "href";
1619

1720
public Set<CrawlerUrlDTO> deserialize(final Document document) {
1821
final Set<CrawlerUrlDTO> results = new HashSet<>();
1922

20-
Elements filmUrls = document.select(LINK_SELECTOR);
23+
Elements filmUrls = document.select(ARTICLE_SELECTOR);
2124
filmUrls.forEach(
22-
filmUrlElement -> {
23-
String url = filmUrlElement.attr(ATTRIBUTE_HREF);
24-
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
25-
results.add(new CrawlerUrlDTO(url));
25+
articleElement -> {
26+
final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR);
27+
final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR);
28+
if (filmUrlElement != null && isRelevant(teaserElement)) {
29+
String url = filmUrlElement.attr(ATTRIBUTE_HREF);
30+
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
31+
results.add(new CrawlerUrlDTO(url));
32+
}
2633
});
2734

2835
return results;
2936
}
37+
38+
private boolean isRelevant(Element teaserElement) {
39+
if (teaserElement == null) {
40+
return true;
41+
}
42+
return !("ARD".equalsIgnoreCase(teaserElement.text()));
43+
}
3044
}

0 commit comments

Comments
 (0)