diff --git a/build.gradle b/build.gradle
index 438aae770..6c64103e0 100644
--- a/build.gradle
+++ b/build.gradle
@@ -27,7 +27,7 @@ sourceCompatibility = JavaVersion.VERSION_17
 targetCompatibility = JavaVersion.VERSION_17
 group = 'de.mediathekview'
 archivesBaseName = "MServer"
-version = '3.1.228'
+version = '3.1.229'
 
 def jarName = 'MServer.jar'
 def mainClass = 'mServer.Main'
diff --git a/src/main/java/mServer/Main.java b/src/main/java/mServer/Main.java
index f5cf3c242..82e6920c6 100644
--- a/src/main/java/mServer/Main.java
+++ b/src/main/java/mServer/Main.java
@@ -29,6 +29,9 @@ import mServer.tool.MserverDatumZeit;
 import mServer.tool.MserverLog;
 
+import java.time.LocalDateTime;
+import java.time.temporal.ChronoUnit;
+
 public class Main {
 
   public Main() {
@@ -88,11 +91,15 @@ public static void main(String[] args) {
   }
 
   private static void runServer(String[] ar) throws InterruptedException {
+    LocalDateTime beforeRun = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS);
     while (new MServer(ar).starten()) {
-      long timeToSleep = (MserverDatumZeit.getSecondsUntilNextDay() + 120) * 1000; // 0:02
-      MserverLog.systemMeldung("Schlafenlegen bis zum nächsten Tag (" + timeToSleep + "ms)");
-      Thread.sleep(timeToSleep);
-      MserverLog.systemMeldung("Neustart der Suche");
+      if (!LocalDateTime.now().truncatedTo(ChronoUnit.DAYS).isAfter(beforeRun)) { // do not sleep if day changed
+        long timeToSleep = (MserverDatumZeit.getSecondsUntilNextDay() + 120) * 1000; // 0:02
+        MserverLog.systemMeldung("Schlafenlegen bis zum nächsten Tag (" + timeToSleep + "ms)");
+        Thread.sleep(timeToSleep);
+        MserverLog.systemMeldung("Neustart der Suche");
+      }
+      beforeRun = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS);
     }
   }
diff --git a/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java b/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java
index 161f66cbf..12870c8cc 100644
--- a/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java
+++ b/src/main/java/mServer/crawler/sender/ard/json/ArdTopicsLetterDeserializer.java
@@ -10,6 +10,7 @@ import mServer.crawler.sender.base.JsonUtils;
 
 import java.lang.reflect.Type;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Optional;
 import java.util.Set;
@@ -23,9 +24,13 @@ public class ArdTopicsLetterDeserializer implements JsonDeserializer<Set<CrawlerUrlDTO>> {
@@ ... @@ Set<CrawlerUrlDTO> parseTeaser(final JsonObject teaserObject) {
       id = JsonUtils.getAttributeAsString(teaserObject, ATTRIBUTE_ID);
     }
-    id.ifPresent(
-        nonNullId ->
-            results.add(
-                new CrawlerUrlDTO(
-                    String.format(
-                        ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));
+    if (isRelevant(teaserObject)) {
+      id.ifPresent(
+          nonNullId ->
+              results.add(
+                  new CrawlerUrlDTO(
+                      String.format(
+                          ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));
+    }
     return results;
   }
+
+  private boolean isRelevant(final JsonObject teaserObject) {
+    if (teaserObject.has(ELEMENT_PUBLICATION_SERVICE)) {
+      final JsonObject publicationService =
+          teaserObject.get(ELEMENT_PUBLICATION_SERVICE).getAsJsonObject();
+      final Optional<String> attributeAsString =
+          JsonUtils.getAttributeAsString(publicationService, ATTRIBUTE_NAME);
+      if (attributeAsString.isPresent()) {
+
+        return !Arrays.stream(IGNORED_SENDER)
+            .anyMatch(sender -> sender.equalsIgnoreCase(attributeAsString.get()));
+      }
+    }
+
+    return true;
+  }
 }
diff --git a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java
index b73a5be97..96137476e 100644
--- a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java
+++ b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java
@@ -97,7 +97,7 @@ protected Collection<CrawlerUrlDTO> getExtraDaysEntries()
 
   private ConcurrentLinkedQueue<CrawlerUrlDTO> getDayUrls() {
-    int daysPast = CrawlerTool.loadLongMax() ? 60 : 20;
+    int daysPast = CrawlerTool.loadLongMax() ? 30 : 20;
     int daysFuture = CrawlerTool.loadLongMax() ? 30 : 10;
 
     final ConcurrentLinkedQueue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
diff --git a/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java b/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java
index 3bc36186a..a44e0bb36 100644
--- a/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java
+++ b/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java
@@ -4,6 +4,7 @@ import mServer.crawler.sender.base.UrlUtils;
 import mServer.crawler.sender.zdf.ZdfConstants;
 import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 
 import java.util.HashSet;
@@ -11,20 +12,33 @@ public class ZdfTopicsPageHtmlDeserializer {
 
-  private static final String LINK_SELECTOR = "article h3 a";
+  private static final String ARTICLE_SELECTOR = "article";
+  private static final String LINK_SELECTOR = "h3 a";
+  private static final String TEASER_SELECTOR = "dd.teaser-info span";
   private static final String ATTRIBUTE_HREF = "href";
 
   public Set<CrawlerUrlDTO> deserialize(final Document document) {
     final Set<CrawlerUrlDTO> results = new HashSet<>();
-    Elements filmUrls = document.select(LINK_SELECTOR);
+    Elements filmUrls = document.select(ARTICLE_SELECTOR);
 
     filmUrls.forEach(
-        filmUrlElement -> {
-          String url = filmUrlElement.attr(ATTRIBUTE_HREF);
-          url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
-          results.add(new CrawlerUrlDTO(url));
+        articleElement -> {
+          final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR);
+          final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR);
+          if (filmUrlElement != null && isRelevant(teaserElement)) {
+            String url = filmUrlElement.attr(ATTRIBUTE_HREF);
+            url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
+            results.add(new CrawlerUrlDTO(url));
+          }
         });
 
     return results;
   }
+
+  private boolean isRelevant(Element teaserElement) {
+    if (teaserElement == null) {
+      return true;
+    }
+    return !("ARD".equalsIgnoreCase(teaserElement.text()));
+  }
 }
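Note on the ZdfTopicsPageHtmlDeserializer change: relevance is now decided per article by reading the teaser label next to the link. Below is a minimal standalone sketch of that selector logic, assuming only jsoup on the classpath; the class name, the inline HTML snippet and the plain-string URLs are illustrative stand-ins, not the project's actual classes or test data.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.HashSet;
import java.util.Set;

// Sketch of the teaser-based filtering: articles whose "dd.teaser-info span"
// text equals "ARD" are skipped, articles without a teaser stay relevant.
public class ZdfTopicsFilterSketch {

  public static void main(String[] args) {
    final String html =
        "<article><h3><a href='/video/zdf-beitrag'>ZDF Beitrag</a></h3>"
            + "<dl><dd class='teaser-info'><span>Doku</span></dd></dl></article>"
            + "<article><h3><a href='/video/ard-beitrag'>ARD Beitrag</a></h3>"
            + "<dl><dd class='teaser-info'><span>ARD</span></dd></dl></article>";
    final Document document = Jsoup.parse(html);

    final Set<String> urls = new HashSet<>();
    for (final Element article : document.select("article")) {
      final Element link = article.selectFirst("h3 a");
      final Element teaser = article.selectFirst("dd.teaser-info span");
      // keep the entry unless the teaser explicitly marks it as an ARD item
      if (link != null && (teaser == null || !"ARD".equalsIgnoreCase(teaser.text()))) {
        urls.add(link.attr("href"));
      }
    }
    System.out.println(urls); // only /video/zdf-beitrag should remain
  }
}

The same pattern is what the diff applies inside deserialize(), with the kept URLs additionally run through UrlUtils.addDomainIfMissing before being wrapped in CrawlerUrlDTO objects.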