Skip to content

Commit

Permalink
ARD: ignore zdf entries
Browse files Browse the repository at this point in the history
ZDF: optimize runtime
fix #967
  • Loading branch information
alex1702 committed Mar 4, 2024
2 parents 5e9c030 + 851e9d8 commit f35479f
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 18 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ sourceCompatibility = JavaVersion.VERSION_17
targetCompatibility = JavaVersion.VERSION_17
group = 'de.mediathekview'
archivesBaseName = "MServer"
version = '3.1.228'
version = '3.1.229'

def jarName = 'MServer.jar'
def mainClass = 'mServer.Main'
Expand Down
15 changes: 11 additions & 4 deletions src/main/java/mServer/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import mServer.tool.MserverDatumZeit;
import mServer.tool.MserverLog;

import java.time.LocalDateTime;
import java.time.temporal.ChronoUnit;

public class Main {

public Main() {
Expand Down Expand Up @@ -88,11 +91,15 @@ public static void main(String[] args) {
}

private static void runServer(String[] ar) throws InterruptedException {
LocalDateTime beforeRun = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS);
while (new MServer(ar).starten()) {
long timeToSleep = (MserverDatumZeit.getSecondsUntilNextDay() + 120) * 1000; // 0:02
MserverLog.systemMeldung("Schlafenlegen bis zum nächsten Tag (" + timeToSleep + "ms)");
Thread.sleep(timeToSleep);
MserverLog.systemMeldung("Neustart der Suche");
if (!LocalDateTime.now().truncatedTo(ChronoUnit.DAYS).isAfter(beforeRun)) { // do not sleep if day changed
long timeToSleep = (MserverDatumZeit.getSecondsUntilNextDay() + 120) * 1000; // 0:02
MserverLog.systemMeldung("Schlafenlegen bis zum nächsten Tag (" + timeToSleep + "ms)");
Thread.sleep(timeToSleep);
MserverLog.systemMeldung("Neustart der Suche");
}
beforeRun = LocalDateTime.now().truncatedTo(ChronoUnit.DAYS);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import mServer.crawler.sender.base.JsonUtils;

import java.lang.reflect.Type;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
Expand All @@ -23,9 +24,13 @@ public class ArdTopicsLetterDeserializer implements JsonDeserializer<PaginationU
private static final String ELEMENT_TOTAL_ELEMENTS = "totalElements";
private static final String ELEMENT_PAGE_SIZE = "pageSize";
private static final String ELEMENT_PAGINATION = "pagination";
private static final String ELEMENT_PUBLICATION_SERVICE = "publicationService";
private static final String ATTRIBUTE_NAME = "name";

private static final String ATTRIBUTE_ID = "id";

private static final String[] IGNORED_SENDER = new String[] {"zdf", "kika", "3sat", "arte"};

@Override
public PaginationUrlDto deserialize(
final JsonElement jsonElement, final Type type, final JsonDeserializationContext context) {
Expand Down Expand Up @@ -79,13 +84,31 @@ private Set<CrawlerUrlDTO> parseTeaser(final JsonObject teaserObject) {
id = JsonUtils.getAttributeAsString(teaserObject, ATTRIBUTE_ID);
}

id.ifPresent(
nonNullId ->
results.add(
new CrawlerUrlDTO(
String.format(
ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));
if (isRelevant(teaserObject)) {
id.ifPresent(
nonNullId ->
results.add(
new CrawlerUrlDTO(
String.format(
ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));
}

return results;
}

/**
 * Decides whether a teaser should be turned into a crawler URL.
 *
 * <p>A teaser is skipped only when its {@code publicationService} object carries a
 * {@code name} that matches one of {@code IGNORED_SENDER} (case-insensitive). If the
 * publication service is missing, is not a JSON object (e.g. JSON {@code null}), or has
 * no readable name, the teaser is treated as relevant.
 *
 * @param teaserObject the JSON object describing a single teaser
 * @return {@code false} if the teaser's publication service name is in the ignore list,
 *     {@code true} otherwise
 */
private boolean isRelevant(final JsonObject teaserObject) {
  // Guard against "publicationService": null (or any non-object value); calling
  // getAsJsonObject() on such an element would throw an IllegalStateException.
  if (!teaserObject.has(ELEMENT_PUBLICATION_SERVICE)
      || !teaserObject.get(ELEMENT_PUBLICATION_SERVICE).isJsonObject()) {
    return true;
  }

  final JsonObject publicationService =
      teaserObject.get(ELEMENT_PUBLICATION_SERVICE).getAsJsonObject();
  final Optional<String> senderName =
      JsonUtils.getAttributeAsString(publicationService, ATTRIBUTE_NAME);

  // Without a name there is nothing to compare against the ignore list, so keep the teaser.
  return senderName
      .map(name -> Arrays.stream(IGNORED_SENDER)
          .noneMatch(sender -> sender.equalsIgnoreCase(name)))
      .orElse(true);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ protected Collection<CrawlerUrlDTO> getExtraDaysEntries()

private ConcurrentLinkedQueue<CrawlerUrlDTO> getDayUrls() {

int daysPast = CrawlerTool.loadLongMax() ? 60 : 20;
int daysPast = CrawlerTool.loadLongMax() ? 30 : 20;
int daysFuture = CrawlerTool.loadLongMax() ? 30 : 10;

final ConcurrentLinkedQueue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,41 @@
import mServer.crawler.sender.base.UrlUtils;
import mServer.crawler.sender.zdf.ZdfConstants;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.HashSet;
import java.util.Set;

public class ZdfTopicsPageHtmlDeserializer {

private static final String LINK_SELECTOR = "article h3 a";
private static final String ARTICLE_SELECTOR = "article";
private static final String LINK_SELECTOR = "h3 a";
private static final String TEASER_SELECTOR = "dd.teaser-info span";
private static final String ATTRIBUTE_HREF = "href";

public Set<CrawlerUrlDTO> deserialize(final Document document) {
final Set<CrawlerUrlDTO> results = new HashSet<>();

Elements filmUrls = document.select(LINK_SELECTOR);
Elements filmUrls = document.select(ARTICLE_SELECTOR);
filmUrls.forEach(
filmUrlElement -> {
String url = filmUrlElement.attr(ATTRIBUTE_HREF);
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
results.add(new CrawlerUrlDTO(url));
articleElement -> {
final Element filmUrlElement = articleElement.selectFirst(LINK_SELECTOR);
final Element teaserElement = articleElement.selectFirst(TEASER_SELECTOR);
if (filmUrlElement != null && isRelevant(teaserElement)) {
String url = filmUrlElement.attr(ATTRIBUTE_HREF);
url = UrlUtils.addDomainIfMissing(url, ZdfConstants.URL_BASE);
results.add(new CrawlerUrlDTO(url));
}
});

return results;
}

/**
 * Tells whether an article should be kept, based on its teaser element.
 *
 * @param teaserElement the teaser element of the article, may be {@code null}
 * @return {@code true} if there is no teaser element or its text is not "ARD"
 *     (compared case-insensitively); {@code false} if the text is "ARD"
 */
private boolean isRelevant(Element teaserElement) {
  final boolean isArdTeaser =
      teaserElement != null && "ARD".equalsIgnoreCase(teaserElement.text());
  return !isArdTeaser;
}
}

0 comments on commit f35479f

Please sign in to comment.