|
4 | 4 | import mServer.crawler.sender.base.UrlUtils;
|
5 | 5 | import mServer.crawler.sender.zdf.ZdfConstants;
|
6 | 6 | import org.jsoup.nodes.Document;
|
| 7 | +import org.jsoup.nodes.Element; |
7 | 8 | import org.jsoup.select.Elements;
|
8 | 9 |
|
9 | 10 | import java.util.HashSet;
|
10 | 11 | import java.util.Set;
|
11 | 12 |
|
12 | 13 | public class ZdfTopicsPageHtmlDeserializer {
|
13 | 14 |
|
14 |
| - private static final String LINK_SELECTOR = "article h3 a"; |
| 15 | + private static final String ARTICLE_SELECTOR = "article"; |
| 16 | + private static final String LINK_SELECTOR = "h3 a"; |
| 17 | + private static final String TEASER_SELECTOR = "dd.teaser-info span"; |
15 | 18 | private static final String ATTRIBUTE_HREF = "href";
|
16 | 19 |
|
17 | 20 | public Set<CrawlerUrlDTO> deserialize(final Document document) {
|
18 | 21 | final Set<CrawlerUrlDTO> results = new HashSet<>();
|
19 | 22 |
|
20 |
| - Elements filmUrls = document.select(LINK_SELECTOR); |
| 23 | + Elements filmUrls = document.select(ARTICLE_SELECTOR); |
21 | 24 | filmUrls.forEach(
|
22 |
| - filmUrlElement -> { |
23 |
| - String url = filmUrlElement.attr(ATTRIBUTE_HREF); |
24 |
| - url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); |
25 |
| - results.add(new CrawlerUrlDTO(url)); |
| 25 | + articleElement -> { |
| 26 | + final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR); |
| 27 | + final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR); |
| 28 | + if (filmUrlElement != null && isRelevant(teaserElement)) { |
| 29 | + String url = filmUrlElement.attr(ATTRIBUTE_HREF); |
| 30 | + url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE); |
| 31 | + results.add(new CrawlerUrlDTO(url)); |
| 32 | + } |
26 | 33 | });
|
27 | 34 |
|
28 | 35 | return results;
|
29 | 36 | }
|
| 37 | + |
| 38 | + private boolean isRelevant(Element teaserElement) { |
| 39 | + if (teaserElement == null) { |
| 40 | + return true; |
| 41 | + } |
| 42 | + return !("ARD".equalsIgnoreCase(teaserElement.text())); |
| 43 | + } |
30 | 44 | }
|
0 commit comments