From 5e00466d9a49bac116593c67528786cc5f0bf848 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Tue, 19 Mar 2024 22:12:14 +0100 Subject: [PATCH 01/10] remove funk entries --- .../sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java b/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java index a44e0bb3..af56bf41 100644 --- a/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java +++ b/src/main/java/mServer/crawler/sender/zdf/parser/ZdfTopicsPageHtmlDeserializer.java @@ -39,6 +39,6 @@ private boolean isRelevant(Element teaserElement) { if (teaserElement == null) { return true; } - return !("ARD".equalsIgnoreCase(teaserElement.text())); + return !("ARD".equalsIgnoreCase(teaserElement.text()) || "funk".equalsIgnoreCase(teaserElement.text())); } } From a5479e47cffae63034fccc0d40fc7ac2615e6599 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Tue, 19 Mar 2024 22:13:21 +0100 Subject: [PATCH 02/10] reduce daysFuture --- .../java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java index 96137476..60b471d9 100644 --- a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java +++ b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java @@ -43,7 +43,7 @@ protected RecursiveTask> createCrawlerTask() { final ZdfConfiguration configuration = loadConfiguration(); if (configuration.getSearchAuthKey().isPresent() && configuration.getVideoAuthKey().isPresent()) { - shows = new HashSet<>(getDaysEntries(configuration)); + //shows = new HashSet<>(getDaysEntries(configuration)); if (CrawlerTool.loadLongMax()) { shows.addAll(getTopicsEntries()); @@ -98,7 +98,7 @@ protected 
Collection getExtraDaysEntries() private ConcurrentLinkedQueue getDayUrls() { int daysPast = CrawlerTool.loadLongMax() ? 30 : 20; - int daysFuture = CrawlerTool.loadLongMax() ? 30 : 10; + int daysFuture = CrawlerTool.loadLongMax() ? 20 : 10; final ConcurrentLinkedQueue urls = new ConcurrentLinkedQueue<>(); for (int i = 0; From 7dac3f1d13fe603b11659ac2d4fdaeb1e2250060 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Tue, 19 Mar 2024 22:14:48 +0100 Subject: [PATCH 03/10] remove comment --- .../java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java index 60b471d9..bbef601d 100644 --- a/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java +++ b/src/main/java/mServer/crawler/sender/zdf/AbstractZdfCrawler.java @@ -43,7 +43,7 @@ protected RecursiveTask> createCrawlerTask() { final ZdfConfiguration configuration = loadConfiguration(); if (configuration.getSearchAuthKey().isPresent() && configuration.getVideoAuthKey().isPresent()) { - //shows = new HashSet<>(getDaysEntries(configuration)); + shows = new HashSet<>(getDaysEntries(configuration)); if (CrawlerTool.loadLongMax()) { shows.addAll(getTopicsEntries()); From 57bf4d0221f6c5fafaad53378c1c481ae7089088 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 15:36:25 +0100 Subject: [PATCH 04/10] initial orfon crawler --- .../crawler/sender/orfon/OrfHttpClient.java | 79 ++++++ .../sender/orfon/OrfOnBreadCrumsUrlDTO.java | 54 ++++ .../crawler/sender/orfon/OrfOnConstants.java | 23 ++ .../crawler/sender/orfon/OrfOnCrawler.java | 125 +++++++++ .../sender/orfon/OrfOnVideoInfoDTO.java | 104 ++++++++ .../orfon/json/OrfOnAZDeserializer.java | 46 ++++ .../orfon/json/OrfOnEpisodeDeserializer.java | 243 ++++++++++++++++++ .../orfon/json/OrfOnEpisodesDeserializer.java | 37 +++ .../OrfOnHistoryChildrenDeserializer.java 
| 60 +++++ .../orfon/json/OrfOnHistoryDeserializer.java | 62 +++++ .../OrfOnHistoryVideoItemDeserializer.java | 58 +++++ .../orfon/json/OrfOnScheduleDeserializer.java | 31 +++ .../sender/orfon/task/OrfOnAZTask.java | 39 +++ .../sender/orfon/task/OrfOnEpisodeTask.java | 143 +++++++++++ .../sender/orfon/task/OrfOnEpisodesTask.java | 40 +++ .../orfon/task/OrfOnHistoryChildrenTask.java | 58 +++++ .../sender/orfon/task/OrfOnHistoryTask.java | 40 +++ .../orfon/task/OrfOnHistoryVideoItemTask.java | 38 +++ .../sender/orfon/task/OrfOnPagedTask.java | 67 +++++ .../sender/orfon/task/OrfOnScheduleTask.java | 41 +++ 20 files changed, 1388 insertions(+) create mode 100644 src/main/java/mServer/crawler/sender/orfon/OrfHttpClient.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/OrfOnBreadCrumsUrlDTO.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/OrfOnCrawler.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/OrfOnVideoInfoDTO.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnAZDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodesDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryChildrenDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryVideoItemDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/json/OrfOnScheduleDeserializer.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnAZTask.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodeTask.java create mode 100644 
src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodesTask.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryChildrenTask.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryTask.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryVideoItemTask.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnPagedTask.java create mode 100644 src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java diff --git a/src/main/java/mServer/crawler/sender/orfon/OrfHttpClient.java b/src/main/java/mServer/crawler/sender/orfon/OrfHttpClient.java new file mode 100644 index 00000000..56575608 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/OrfHttpClient.java @@ -0,0 +1,79 @@ +package mServer.crawler.sender.orfon; + +import com.google.gson.Gson; +import com.google.gson.JsonElement; +import okhttp3.*; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +public class OrfHttpClient { + private static final Logger LOG = LogManager.getLogger(OrfHttpClient.class); + private static final int THREAD_POOL_SIZE = 1; + private static final int TIMEOUT = 60; + + protected OkHttpClient client; + + public OrfHttpClient() { + client = + new OkHttpClient.Builder() + .connectTimeout(TIMEOUT, TimeUnit.SECONDS) + .readTimeout(TIMEOUT, TimeUnit.SECONDS) + .callTimeout(TIMEOUT, TimeUnit.SECONDS) + .connectionPool(new ConnectionPool(THREAD_POOL_SIZE, 5L, TimeUnit.MINUTES)) + .build(); + } + + /** + * Request an url and receive the body as String. Add headers as a string map. 
+   * @param url the url to request
+   * @param headerMap request headers as name/value pairs, may be null
+   * @return the response body; an empty string for 404/410, a missing body or after three unsuccessful tries
+   * @throws IOException if executing the request or reading the body fails
+   */
+  public String requestBodyAsString(final String url, final Map headerMap) throws IOException {
+    int retry = 0;
+    int httpResponseCode;
+    final String responseString = "";
+    do {
+      okhttp3.Headers.Builder headerBuilder = new Headers.Builder();
+      if (headerMap != null) {
+        for (Map.Entry headerValue : headerMap.entrySet()) {
+          headerBuilder.add(headerValue.getKey(), headerValue.getValue());
+        }
+      }
+      Request request = new Request.Builder()
+          .url(url)
+          .headers(headerBuilder.build())
+          .build();
+
+      try (final Response response = client.newCall(request).execute()) {
+        httpResponseCode = response.code();
+        if (response.body() == null || httpResponseCode == 404 || httpResponseCode == 410) {
+          break;
+        }
+        if (response.isSuccessful()) {
+          final ResponseBody responseBody = response.body();
+          return responseBody == null ? "" : responseBody.string();
+        }
+      }
+      retry++;
+      LOG.debug("Retry #{} due to {} for {}", retry, httpResponseCode, url);
+    } while (retry < 3);
+    return responseString;
+  }
+
+  /**
+   * Request an url and parse the response body as JSON.
+   * @param url The url to request.
+   * @param headerMap request headers as name/value pairs, may be null.
+   * @return the parsed body; NOTE(review): Gson returns null for an empty body, callers must check.
+   * @throws IOException If no connection to the url could be opened.
+   */
+  public JsonElement requestBodyAsJsonElement(final String url, final Map headerMap) throws IOException {
+    return new Gson().fromJson(requestBodyAsString(url, headerMap), JsonElement.class);
+  }
+}
diff --git a/src/main/java/mServer/crawler/sender/orfon/OrfOnBreadCrumsUrlDTO.java b/src/main/java/mServer/crawler/sender/orfon/OrfOnBreadCrumsUrlDTO.java
new file mode 100644
index 00000000..682de517
--- /dev/null
+++ b/src/main/java/mServer/crawler/sender/orfon/OrfOnBreadCrumsUrlDTO.java
@@ -0,0 +1,54 @@
+package mServer.crawler.sender.orfon;
+
+
+import mServer.crawler.sender.base.CrawlerUrlDTO;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+public class OrfOnBreadCrumsUrlDTO extends CrawlerUrlDTO {
+  private List breadCrums = new ArrayList<>();
+
+  public OrfOnBreadCrumsUrlDTO(String breadCrum, String aUrl) {
+    super(aUrl);
+    setBreadCrums(List.of(breadCrum));
+  }
+  public OrfOnBreadCrumsUrlDTO(List breadCrums, String aUrl) {
+    super(aUrl);
+    setBreadCrums(breadCrums);
+  }
+
+  public List getBreadCrums() {
+    return breadCrums;
+  }
+
+  public void setBreadCrums(List breadCrums) {
+    this.breadCrums = breadCrums;
+  }
+
+  public void setBreadCrumsPath(List breadCrums) {
+    List fullPath = new ArrayList<>();
+    fullPath.addAll(breadCrums);
+    fullPath.addAll(getBreadCrums());
+    setBreadCrums(fullPath);
+  }
+
+  /**
+   * Equal when the parent's equals accepts obj and both hold the same bread crumbs in the same order.
+   * List equality (not containsAll) keeps equals symmetric and consistent with hashCode below.
+   */
+  @Override
+  public boolean equals(final Object obj) {
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    return super.equals(obj)
+        && Objects.equals(breadCrums, ((OrfOnBreadCrumsUrlDTO) obj).breadCrums);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(super.hashCode(), this.breadCrums);
+  }
+}
diff --git a/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java b/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java
new file mode 100644
index 00000000..cb852f42
--- /dev/null
+++ b/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java
@@
-0,0 +1,23 @@ +package mServer.crawler.sender.orfon; + +public final class OrfOnConstants { + // + public static final String HOST = "https://api-tvthek.orf.at/api/v4.3"; + // + public static final String SCHEDULE = HOST + "/schedule"; + // + public static final String AZ = HOST + "/profiles/lettergroup"; + public static final int PAGE_SIZE = 200; + // + public static final String HISTORY = HOST + "/history"; + // + public static final String EPISODE = HOST + "/episode"; + // + public static final String AUTH = "Basic b3JmX29uX3Y0MzpqRlJzYk5QRmlQU3h1d25MYllEZkNMVU41WU5aMjhtdA=="; + // + private OrfOnConstants() {} + // + public static String createMaxLimmitUrl(String plainUrl) { + return plainUrl + "?limit=" + OrfOnConstants.PAGE_SIZE; + } +} diff --git a/src/main/java/mServer/crawler/sender/orfon/OrfOnCrawler.java b/src/main/java/mServer/crawler/sender/orfon/OrfOnCrawler.java new file mode 100644 index 00000000..c8513fc9 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/OrfOnCrawler.java @@ -0,0 +1,125 @@ +package mServer.crawler.sender.orfon; + +import de.mediathekview.mlib.Const; +import de.mediathekview.mlib.daten.DatenFilm; +import de.mediathekview.mlib.tool.Log; +import mServer.crawler.CrawlerTool; +import mServer.crawler.FilmeSuchen; +import mServer.crawler.sender.MediathekCrawler; +import mServer.crawler.sender.orfon.task.*; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.*; + +public class OrfOnCrawler extends MediathekCrawler { + private static final Logger LOG = LogManager.getLogger(OrfOnCrawler.class); + private static final DateTimeFormatter DAY_PAGE_DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd"); + + public static final String SENDERNAME = Const.ORF; + + public OrfOnCrawler(FilmeSuchen ssearch, int startPrio) { + super(ssearch, 
SENDERNAME, 0, 1, startPrio); + } + + @Override + protected RecursiveTask> createCrawlerTask() { + Set allVideos = new HashSet<>(); + try { + // Sendungen Verpasst (letzten 14 Tage) + // TAG > Episode > Episode2Film + final Set epsiodesFromDay = processDayUrlsToCrawl(); + allVideos.addAll(epsiodesFromDay); + + Log.sysLog("ORF Anzahl Tage: " + allVideos.size()); + + if (CrawlerTool.loadLongMax()) { + // + // Sendungen a-z + // Buchstabe > Episoden > Episode2Film + final Set videosFromTopics = processAZUrlsToCrawl(); + allVideos.addAll(videosFromTopics); + Log.sysLog("ORF Anzahl Topics: " + videosFromTopics.size()); + // + // History (top categories) > children > VideoItem > Episode > Episode2Film + final Set historyVideos = processHistoryUrlToCrawl(); + allVideos.addAll(historyVideos); + Log.sysLog("ORF Anzahl History: " + historyVideos.size()); + } + // + Log.sysLog("ORF Anzahl: " + allVideos.size()); + + meldungAddMax(allVideos.size()); + + } catch (final Exception ex) { + Log.errorLog(56146546, ex); + LOG.fatal("Exception in ORFON crawler.", ex); + } + + return new OrfOnEpisodeTask(this, new ConcurrentLinkedQueue<>(allVideos)); + } + + private Set processDayUrlsToCrawl() throws InterruptedException, ExecutionException { + final ForkJoinTask> dayTask = forkJoinPool.submit(new OrfOnScheduleTask(this, createDayUrlsToCrawl())); + return dayTask.get(); + } + + private ConcurrentLinkedQueue createDayUrlsToCrawl() { + final ConcurrentLinkedQueue dayUrlsToCrawl = new ConcurrentLinkedQueue<>(); + final LocalDateTime now = LocalDateTime.now(); + for (int i = 0; i <= 8; i++) { + final String day = now.minusDays(i).format(DAY_PAGE_DATE_FORMATTER); + final String url = OrfOnConstants.SCHEDULE + "/" + day; + dayUrlsToCrawl.offer(new OrfOnBreadCrumsUrlDTO(day,url)); + } + return dayUrlsToCrawl; + } + + private Set processAZUrlsToCrawl() throws InterruptedException, ExecutionException { + final ForkJoinTask> letterTask = forkJoinPool.submit(new OrfOnAZTask(this, 
createAZUrlsToCrawl())); + final Set letterTaskTopics = letterTask.get(); + final ForkJoinTask> episodesFromTopicsTask = forkJoinPool.submit(new OrfOnEpisodesTask(this, new ConcurrentLinkedQueue<>(letterTaskTopics))); + return episodesFromTopicsTask.get(); + } + + + private ConcurrentLinkedQueue createAZUrlsToCrawl() { + final ConcurrentLinkedQueue letterUrlsToCrawl = new ConcurrentLinkedQueue<>(); + for (char letter = 'A'; letter <= 'Z'; letter++) { + final String url = OrfOnConstants.AZ + "/" + letter + "?limit="+OrfOnConstants.PAGE_SIZE; + letterUrlsToCrawl.offer(new OrfOnBreadCrumsUrlDTO(String.valueOf(letter),url)); + } + // 0 gibt es auch + final String url = OrfOnConstants.AZ + "/0" + "?limit="+OrfOnConstants.PAGE_SIZE; + letterUrlsToCrawl.offer(new OrfOnBreadCrumsUrlDTO("0",url)); + return letterUrlsToCrawl; + } + + private Set processHistoryUrlToCrawl() throws InterruptedException, ExecutionException { + final ForkJoinTask> histroyTask = forkJoinPool.submit(new OrfOnHistoryTask(this, createHistoryUrlToCrawl())); + final Set historyChidrenUrls = histroyTask.get(); + LOG.debug("Found {} entries in OrfOnHistoryTask ", historyChidrenUrls.size()); + // + final ForkJoinTask> historyChildrenTask = forkJoinPool.submit(new OrfOnHistoryChildrenTask(this, new ConcurrentLinkedQueue<>(historyChidrenUrls))); + final Set historyItemUrls = historyChildrenTask.get(); + LOG.debug("Found {} entries in OrfOnHistoryChildrenTask ", historyItemUrls.size()); + // + final ForkJoinTask> historyItemTask = forkJoinPool.submit(new OrfOnHistoryVideoItemTask(this, new ConcurrentLinkedQueue<>(historyItemUrls))); + final Set historyEpisodesUrls = historyItemTask.get(); + LOG.debug("Found {} entries in OrfOnHistoryVideoItemTask ", historyEpisodesUrls.size()); + // + return historyEpisodesUrls; + } + + private ConcurrentLinkedQueue createHistoryUrlToCrawl() { + final ConcurrentLinkedQueue history = new ConcurrentLinkedQueue<>(); + history.offer(new 
OrfOnBreadCrumsUrlDTO("Base",OrfOnConstants.HISTORY)); + return history; + } + + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/OrfOnVideoInfoDTO.java b/src/main/java/mServer/crawler/sender/orfon/OrfOnVideoInfoDTO.java new file mode 100644 index 00000000..8e325a32 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/OrfOnVideoInfoDTO.java @@ -0,0 +1,104 @@ +package mServer.crawler.sender.orfon; + +import mServer.crawler.sender.base.Qualities; + +import java.time.Duration; +import java.time.LocalDateTime; +import java.util.Map; +import java.util.Optional; + +public class OrfOnVideoInfoDTO { + private Optional id; + private Optional channel; + private Optional title; + private Optional titleWithDate; + private Optional topic; + private Optional topicForArchive; + private Optional aired; + private Optional duration; + private Optional description; + private Optional website; + private Optional> videoUrls; + private Optional subtitleUrl; + + public OrfOnVideoInfoDTO( + Optional id, + Optional channel, + Optional title, + Optional titleWithDate, + Optional topic, + Optional topicForArchive, + Optional aired, + Optional duration, + Optional description, + Optional website, + Optional> videoUrls, + Optional subtitleUrl) { + super(); + this.id = id; + this.channel = channel; + this.title = title; + this.titleWithDate = titleWithDate; + this.topic = topic; + this.topicForArchive = topicForArchive; + this.aired = aired; + this.duration = duration; + this.description = description; + this.website = website; + this.videoUrls = videoUrls; + this.subtitleUrl = subtitleUrl; + } + + public Optional getId() { + return id; + } + public Optional getChannel() { + return channel; + } + public Optional getTitle() { + return title; + } + public Optional getTitleWithDate() { + return titleWithDate; + } + public Optional getTopic() { + return topic; + } + public Optional getTopicForArchive() { + return topicForArchive; + } + public Optional getAired() { + return 
aired;
+  }
+  public Optional getDuration() {
+    return duration;
+  }
+  public Optional getDescription() {
+    return description;
+  }
+  public Optional getWebsite() {
+    return website;
+  }
+  public Optional> getVideoUrls() {
+    return videoUrls;
+  }
+  public Optional getSubtitleUrl() {
+    return subtitleUrl;
+  }
+
+  // Hashes the id value itself (identity hash without id), so ids that are no valid int cannot throw.
+  @Override
+  public int hashCode() {
+    return getId().isPresent() ? getId().get().hashCode() : super.hashCode();
+  }
+
+  // Equal when it is the same instance or both sides carry the same id; never by hash collision.
+  @Override
+  public boolean equals(final Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    final boolean sameType = obj != null && getClass() == obj.getClass();
+    return sameType && getId().isPresent() && getId().equals(((OrfOnVideoInfoDTO) obj).getId());
+  }
+}
diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnAZDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnAZDeserializer.java
new file mode 100644
index 00000000..eed4bf8c
--- /dev/null
+++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnAZDeserializer.java
@@ -0,0 +1,46 @@
+package mServer.crawler.sender.orfon.json;
+
+import com.google.gson.*;
+import mServer.crawler.sender.base.JsonUtils;
+import mServer.crawler.sender.base.PagedElementListDTO;
+import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO;
+import mServer.crawler.sender.orfon.OrfOnConstants;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.lang.reflect.Type;
+import java.util.Optional;
+
+
+public class OrfOnAZDeserializer implements JsonDeserializer> {
+  private static final Logger LOG = LogManager.getLogger(OrfOnAZDeserializer.class);
+  private static final String[] TAG_NEXT_PAGE = {"_links", "next", "href"};
+  private static final String[] TAG_ITEMS = {"_embedded", "items"};
+  private static final String TAG_ITEM_ID = "id";
+  private static final String[] TAG_ITEM_EPISODES = {"_links", "episodes", "href"};
+
+  @Override
+  public PagedElementListDTO deserialize(
+      final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context)
+      throws
JsonParseException {
+    JsonObject jsonPage = jsonElement.getAsJsonObject();
+    //
+    PagedElementListDTO page = new PagedElementListDTO<>();
+    page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE));
+    //
+    final Optional items = JsonUtils.getElement(jsonPage, TAG_ITEMS);
+    if (items.isPresent() && items.get().isJsonArray()) {
+      for (JsonElement topic : items.get().getAsJsonArray()) {
+        final Optional id = JsonUtils.getElementValueAsString(topic, TAG_ITEM_ID);
+        final Optional url = JsonUtils.getElementValueAsString(topic, TAG_ITEM_EPISODES);
+        if (id.isPresent() && url.isPresent()) {
+          page.addElement(new OrfOnBreadCrumsUrlDTO(id.get(), OrfOnConstants.createMaxLimmitUrl(url.get())));
+        } else {
+          // log the plain id once; the former second call printed the Optional wrapper instead
+          LOG.debug("No episodes found in item {}", id.orElse(""));
+        }
+      }
+    }
+    return page;
+  }
+}
diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java
new file mode 100644
index 00000000..7f85ba22
--- /dev/null
+++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java
@@ -0,0 +1,243 @@
+package mServer.crawler.sender.orfon.json;
+
+import com.google.gson.JsonDeserializationContext;
+import com.google.gson.JsonDeserializer;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParseException;
+import de.mediathekview.mlib.tool.Log;
+import mServer.crawler.sender.base.GeoLocations;
+import mServer.crawler.sender.base.JsonUtils;
+import mServer.crawler.sender.base.Qualities;
+import mServer.crawler.sender.orfon.OrfHttpClient;
+import mServer.crawler.sender.orfon.OrfOnConstants;
+import mServer.crawler.sender.orfon.OrfOnVideoInfoDTO;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.IOException;
+import java.lang.reflect.Type;
+import java.net.URL;
+import java.time.Duration;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter; +import java.util.*; + + +public class OrfOnEpisodeDeserializer implements JsonDeserializer { + private static final Logger LOG = LogManager.getLogger(OrfOnEpisodeDeserializer.class); + private static final String[] TAG_CHANNEL = {"_embedded", "channel", "name"}; + private static final String TAG_ID = "id"; + private static final String TAG_TITLE = "title"; + private static final String TAG_TITLE_WITH_DATE = "share_subject"; + private static final String TAG_TOPIC = "profile_title"; + private static final String TAG_TOPIC_ARCHIVE = "sub_headline"; + private static final String TAG_AIRED = "date"; + private static final String TAG_DURATION = "duration_seconds"; + private static final String TAG_DESCRIPTION = "description"; + private static final String TAG_SHARE_BODY = "share_body"; + private static final String[] TAG_SUBTITLE = {"_links", "subtitle", "href"}; + private static final String[] TAG_VIDEO_PATH_1 = {"_embedded", "segments"}; + private static final String[] TAG_VIDEO_PATH_2 = {"_embedded", "playlist", "sources"}; + private static final String TAG_VIDEO_URL = "src"; + private static final String TAG_VIDEO_CODEC = "delivery"; + private static final String TAG_VIDEO_QUALITY = "quality"; + private static final String TAG_VIDEO_FALLBACK = "sources"; + private static final String TAG_VIDEO_FALLBACK_URL = "src"; + + private static final String[] TAG_SUBTITLE_SECTION = {"_embedded", "subtitle"}; + private static final String TAG_SUBTITLE_TTML = "ttml_url"; + private static final String[] PREFERED_CODEC = {"hls", "hds", "streaming", "progressive"}; + // + private final OrfHttpClient connection; + // + + public OrfOnEpisodeDeserializer() { + connection = new OrfHttpClient(); + } + + private static Optional getQuality(final String aQuality) { + switch (aQuality) { + case "Q0A": + case "Q1A": + case "Q4A": + return Optional.of(Qualities.SMALL); + case "Q6A": + return Optional.of(Qualities.NORMAL); + case "Q8C": + return 
Optional.of(Qualities.HD);
+      case "QXA":
+      case "QXADRM":
+      case "QXB":
+      case "QXBDRM":
+      case "Q8A":
+        return Optional.empty();
+      default:
+        Log.sysLog("ORF: unknown quality: " + aQuality);
+        LOG.debug("ORF: unknown quality: {}", aQuality);
+    }
+    return Optional.empty();
+  }
+
+  @Override
+  public OrfOnVideoInfoDTO deserialize(
+      final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context)
+      throws JsonParseException {
+    return new OrfOnVideoInfoDTO(
+        JsonUtils.getElementValueAsString(jsonElement, TAG_ID),
+        JsonUtils.getElementValueAsString(jsonElement, TAG_CHANNEL),
+        JsonUtils.getElementValueAsString(jsonElement, TAG_TITLE),
+        JsonUtils.getElementValueAsString(jsonElement, TAG_TITLE_WITH_DATE),
+        JsonUtils.getElementValueAsString(jsonElement, TAG_TOPIC),
+        JsonUtils.getElementValueAsString(jsonElement, TAG_TOPIC_ARCHIVE),
+        parseAiredDate(JsonUtils.getElementValueAsString(jsonElement, TAG_AIRED)),
+        parseDuration(JsonUtils.getElementValueAsString(jsonElement, TAG_DURATION)),
+        JsonUtils.getElementValueAsString(jsonElement, TAG_DESCRIPTION),
+        parseWebsite(JsonUtils.getElementValueAsString(jsonElement, TAG_SHARE_BODY)),
+        optimizeUrls(parseUrl(jsonElement)),
+        buildOrResolveSubs(jsonElement)
+
+    );
+  }
+
+  private Optional> optimizeUrls(Optional> urls) {
+    // A lone entry is only expanded when it is the NORMAL url; for a lone SMALL/HD entry get(NORMAL) is null.
+    if (urls.isPresent() && urls.get().size() == 1 && urls.get().containsKey(Qualities.NORMAL)) {
+      final String urlToOptimize = urls.get().get(Qualities.NORMAL);
+      urls.get().put(Qualities.SMALL, urlToOptimize.replace("QXA", "Q4A"));
+      urls.get().put(Qualities.NORMAL, urlToOptimize.replace("QXA", "Q6A"));
+      urls.get().put(Qualities.HD, urlToOptimize.replace("QXA", "Q8C"));
+    }
+    return urls;
+  }
+
+  private Optional buildOrResolveSubs(JsonElement jsonElement) {
+    Optional subtitleSource = JsonUtils.getElementValueAsString(jsonElement, TAG_SUBTITLE);
+    Optional embeddedSubtitleSection = JsonUtils.getElement(jsonElement, TAG_SUBTITLE_SECTION);
+    Optional setOfSubs = Optional.empty();
+    if
(embeddedSubtitleSection.isPresent()) { + setOfSubs = parseSubtitleUrls(embeddedSubtitleSection.get()); + } else if (subtitleSource.isPresent()) { + Map myMap = Map.ofEntries( + Map.entry("Authorization", OrfOnConstants.AUTH), + Map.entry("Accept-Charset", "UTF_8"), + Map.entry("User-Agent", "Mozilla"), + Map.entry("Accept-Encoding", "*")); + JsonElement newRequestForSubs = null; + try { + newRequestForSubs = connection.requestBodyAsJsonElement(subtitleSource.get(), myMap); + if (newRequestForSubs != null) { + setOfSubs = parseSubtitleUrls(newRequestForSubs); + } + } catch (IOException e) { + Log.errorLog(873673822, e, "Failed to resolve subtitle: " + subtitleSource); + LOG.error("Failed to resolve subtitle from {} error {}", subtitleSource, e); + } + + } + return setOfSubs; + } + + private Optional parseSubtitleUrls(JsonElement element) { + return JsonUtils.getElementValueAsString(element, TAG_SUBTITLE_TTML); + } + + private Optional> parseUrl(JsonElement jsonElement) { + Optional videoPath1 = JsonUtils.getElement(jsonElement, TAG_VIDEO_PATH_1); + if (videoPath1.isEmpty() || !videoPath1.get().isJsonArray() || videoPath1.get().getAsJsonArray().size() == 0) { + return Optional.empty(); + } + // We need to fallback to episode.sources in case there are many elements in the playlist + if (videoPath1.get().getAsJsonArray().size() > 1) { + return parseFallbackVideo(jsonElement); + } + + Optional videoPath2 = JsonUtils.getElement(videoPath1.get().getAsJsonArray().get(0), TAG_VIDEO_PATH_2); + if (videoPath2.isEmpty() || !videoPath2.get().isJsonArray()) { + return Optional.empty(); + } + for (String key : PREFERED_CODEC) { + Optional> resultingVideos = readVideoForTargetCodec(videoPath2.get(), key); + if (resultingVideos.isPresent()) { + return resultingVideos; + } + } + + return Optional.empty(); + } + + private Optional> parseFallbackVideo(JsonElement root) { + Optional videoSources = JsonUtils.getElement(root, TAG_VIDEO_FALLBACK); + if (videoSources.isPresent()) { + Map 
urls = new EnumMap<>(Qualities.class); + for (String key : PREFERED_CODEC) { + Optional codecs = JsonUtils.getElement(videoSources.get(), key); + if (codecs.isPresent() && codecs.get().isJsonArray()) { + for (JsonElement singleVideo : codecs.get().getAsJsonArray()) { + Optional tgtUrl = JsonUtils.getElementValueAsString(singleVideo, TAG_VIDEO_FALLBACK_URL); + if (tgtUrl.isPresent()) { + urls.put(Qualities.NORMAL, tgtUrl.get()); + return Optional.of(urls); + } + } + } + } + } + return Optional.empty(); + } + + private Optional> readVideoForTargetCodec(JsonElement urlArray, String targetCodec) { + Map urls = new EnumMap<>(Qualities.class); + for (JsonElement videoElement : urlArray.getAsJsonArray()) { + Optional codec = JsonUtils.getElementValueAsString(videoElement, TAG_VIDEO_CODEC); + Optional qualityValue = JsonUtils.getElementValueAsString(videoElement, TAG_VIDEO_QUALITY); + Optional url = JsonUtils.getElementValueAsString(videoElement, TAG_VIDEO_URL); + if (url.isPresent() && codec.isPresent() && qualityValue.isPresent() && targetCodec.equalsIgnoreCase(codec.get()) && OrfOnEpisodeDeserializer.getQuality(qualityValue.get()).isPresent()) { + final Optional quality = OrfOnEpisodeDeserializer.getQuality(qualityValue.get()); + if (quality.isPresent()) { + urls.put(quality.get(), url.get()); + } + } + } + if (urls.isEmpty()) { + return Optional.empty(); + } + return Optional.of(urls); + } + + private Optional parseWebsite(Optional text) { + Optional result = Optional.empty(); + if (text.isPresent()) { + result = Optional.of(text.get()); + } + return result; + } + + private Optional parseAiredDate(Optional text) { + Optional result = Optional.empty(); + if (text.isPresent()) { + try { + result = Optional.of(LocalDateTime.parse(text.get(), DateTimeFormatter.ISO_ZONED_DATE_TIME)); + } catch (Exception e) { + Log.errorLog(873673825, e, "datetimeformatter failed: " + text.get()); + LOG.error("DateTimeFormatter failed for string {} exception {}", text.get(), e); + } + } + 
return result; + } + + /////////////// + + private Optional parseDuration(Optional text) { + if (text.isPresent()) { + try { + return Optional.of(Duration.ofSeconds(Integer.parseInt(text.get()))); + } catch (Exception e) { + Log.errorLog(873673826, e, "duration failed: " + text.get()); + LOG.error("Duration failed for string {} exception {}", text.get(), e); + } + } + return Optional.empty(); + + } + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodesDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodesDeserializer.java new file mode 100644 index 00000000..3977bf26 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodesDeserializer.java @@ -0,0 +1,37 @@ +package mServer.crawler.sender.orfon.json; + +import com.google.gson.*; +import mServer.crawler.sender.base.JsonUtils; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; + +import java.lang.reflect.Type; +import java.util.Optional; + +public class OrfOnEpisodesDeserializer implements JsonDeserializer> { + private static final String[] TAG_NEXT_PAGE = {"_links", "next", "href"}; + private static final String[] TAG_ITEMS = {"_embedded", "items"}; + private static final String TAG_EPISODE_ID = "id"; + private static final String[] TAG_EPISODE_LINK = { "_links", "self", "href"}; + + @Override + public PagedElementListDTO deserialize( + final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context) + throws JsonParseException { + JsonObject jsonPage = jsonElement.getAsJsonObject(); + // + PagedElementListDTO page = new PagedElementListDTO<>(); + page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE)); + // + final Optional items = JsonUtils.getElement(jsonPage, TAG_ITEMS); + if (items.isPresent() && items.get().isJsonArray()) { + for (JsonElement item : items.get().getAsJsonArray()) { + Optional episodeId = 
JsonUtils.getElementValueAsString(item, TAG_EPISODE_ID); + Optional episodeLink = JsonUtils.getElementValueAsString(item, TAG_EPISODE_LINK); + episodeLink.ifPresent( link -> page.addElement(new OrfOnBreadCrumsUrlDTO(episodeId.orElse("EMPTY"), link))); + } + } + return page; + } + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryChildrenDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryChildrenDeserializer.java new file mode 100644 index 00000000..57d7b0b9 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryChildrenDeserializer.java @@ -0,0 +1,60 @@ +package mServer.crawler.sender.orfon.json; + +import com.google.gson.JsonDeserializer; +import mServer.crawler.sender.base.JsonUtils; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; + +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonElement; +import com.google.gson.JsonParseException; +import mServer.crawler.sender.orfon.OrfOnConstants; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.lang.reflect.Type; +import java.util.Optional; + + +public class OrfOnHistoryChildrenDeserializer implements JsonDeserializer> { + private static final Logger LOG = LogManager.getLogger(OrfOnHistoryChildrenDeserializer.class); + private static final String[] TAG_NEXT_PAGE = { "next" }; + private static final String[] TAG_ITEM_ARRAY = { "_items" }; + private static final String[] TAG_ITEM_TITLE = {"title"}; + private static final String[] TAG_TARGET_URL = {"_links", "video_items", "href"}; + private static final String[] TAG_TARGET_URL2 = {"_links", "children", "href"}; + + @Override + public PagedElementListDTO deserialize( + final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context) + throws JsonParseException { + // + PagedElementListDTO page = new 
PagedElementListDTO<>(); + page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE)); + // + Optional itemArray = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY); + if (itemArray.isPresent() && itemArray.get().isJsonArray()) { + for (JsonElement item : itemArray.get().getAsJsonArray()) { + final Optional videoItemUrl = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL); + final Optional childrenUrl = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL2); + final Optional title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE); + if (videoItemUrl.isPresent()) { + page.addElement(new OrfOnBreadCrumsUrlDTO( + title.orElse("MISSING TITLE"), + OrfOnConstants.createMaxLimmitUrl(videoItemUrl.get()) + )); + } else if (childrenUrl.isPresent()) { + page.addElement(new OrfOnBreadCrumsUrlDTO( + title.orElse("MISSING TITLE"), + OrfOnConstants.createMaxLimmitUrl(childrenUrl.get()) + )); + } else { + LOG.info("No video_items or children tag found {}",JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE) ); + } + } + } + // + return page; + } + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryDeserializer.java new file mode 100644 index 00000000..04462344 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryDeserializer.java @@ -0,0 +1,62 @@ +package mServer.crawler.sender.orfon.json; + +import com.google.gson.*; +import mServer.crawler.sender.base.JsonUtils; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.lang.reflect.Type; +import java.util.Optional; + + +public class OrfOnHistoryDeserializer implements JsonDeserializer> { + private static final Logger LOG = LogManager.getLogger(OrfOnHistoryDeserializer.class); + private static final 
String[] TAG_NEXT_PAGE = {}; + private static final String[] TAG_ITEM_ARRAY_TOP = {"history_highlights"}; + private static final String[] TAG_ITEM_TITLE = {"title"}; + private static final String[] TAG_ITEM_ARRAY_BUTTOM = {"history_items"}; + private static final String[] TAG_TARGET_URL = {"_links", "children", "href"}; + + @Override + public PagedElementListDTO deserialize( + final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context) + throws JsonParseException { + // + PagedElementListDTO page = new PagedElementListDTO<>(); + page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE)); + // + final Optional itemArrayTop = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY_TOP); + if (itemArrayTop.isPresent() && itemArrayTop.get().isJsonArray()) { + page.addElements(parseSection(itemArrayTop.get().getAsJsonArray()).getElements()); + } + // + final Optional itemArrayButtom = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY_BUTTOM); + if (itemArrayButtom.isPresent() && itemArrayButtom.get().isJsonArray()) { + page.addElements(parseSection(itemArrayButtom.get().getAsJsonArray()).getElements()); + } + // + return page; + } + + public PagedElementListDTO parseSection(JsonArray itemArray) { + PagedElementListDTO items = new PagedElementListDTO<>(); + for (JsonElement item : itemArray) { + final Optional url = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL); + final Optional title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE); + if (url.isPresent()) { + items.addElement(new OrfOnBreadCrumsUrlDTO( + title.orElse("EMPTY"), + url.get() + )); + } else { + LOG.debug("missing url for {}", title); + } + } + return items; + } + + + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryVideoItemDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryVideoItemDeserializer.java new file mode 100644 index 00000000..31ec5d03 --- /dev/null +++ 
b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnHistoryVideoItemDeserializer.java @@ -0,0 +1,58 @@ +package mServer.crawler.sender.orfon.json; + +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonParseException; +import mServer.crawler.sender.base.JsonUtils; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; + +import java.lang.reflect.Type; +import java.util.Optional; + +public class OrfOnHistoryVideoItemDeserializer implements JsonDeserializer> { + private static final String[] TAG_NEXT_PAGE = { "next" }; + private static final String[] TAG_ITEM_ARRAY = { "_items" }; + private static final String[] TAG_ITEM_TITLE = {"title"}; + private static final String[] TAG_TARGET_URL = {"_links", "self", "href"}; + private static final String[] TAG_TARGET_URL_EPISODE = {"_links", "episode", "href"}; + + + + @Override + public PagedElementListDTO deserialize( + final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context) + throws JsonParseException { + // + PagedElementListDTO page = new PagedElementListDTO<>(); + page.setNextPage(JsonUtils.getElementValueAsString(jsonElement, TAG_NEXT_PAGE)); + // + Optional itemArrayTop = JsonUtils.getElement(jsonElement, TAG_ITEM_ARRAY); + if (itemArrayTop.isPresent() && itemArrayTop.get().isJsonArray()) { + for (JsonElement item : itemArrayTop.get().getAsJsonArray()) { + final Optional urlSelf = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL); + final Optional urlEpisode = JsonUtils.getElementValueAsString(item, TAG_TARGET_URL_EPISODE); + final Optional title = JsonUtils.getElementValueAsString(item, TAG_ITEM_TITLE); + // self should be an episode but in some cases a segment - only in this cases we have an additional episode element + if (urlSelf.isPresent() && !urlSelf.get().contains("/segment/")) { + page.addElement(new 
OrfOnBreadCrumsUrlDTO( + title.orElse("MISSING TITLE"), + urlSelf.get() + )); + } else if (urlEpisode.isPresent()) { + page.addElement(new OrfOnBreadCrumsUrlDTO( + title.orElse("MISSING TITLE"), + urlEpisode.get() + )); + } + } + } + // + return page; + } + + + + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnScheduleDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnScheduleDeserializer.java new file mode 100644 index 00000000..e4e4e827 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnScheduleDeserializer.java @@ -0,0 +1,31 @@ +package mServer.crawler.sender.orfon.json; + +import com.google.gson.*; +import mServer.crawler.sender.base.JsonUtils; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.OrfOnConstants; + +import java.lang.reflect.Type; +import java.util.Optional; + + +public class OrfOnScheduleDeserializer implements JsonDeserializer> { + private static final String TAG_FILM_ID = "id"; + + @Override + public PagedElementListDTO deserialize( + final JsonElement jsonElement, final Type typeOfT, final JsonDeserializationContext context) + throws JsonParseException { + PagedElementListDTO collectIds = new PagedElementListDTO<>(); + final JsonArray elements = jsonElement.getAsJsonArray(); + for (JsonElement element : elements) { + final Optional id = JsonUtils.getElementValueAsString(element, TAG_FILM_ID); + if (id.isPresent()) { + final String url = OrfOnConstants.EPISODE + "/" + id.get(); + collectIds.addElement(new OrfOnBreadCrumsUrlDTO(id.get(), url)); + } + } + return collectIds; + } +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnAZTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnAZTask.java new file mode 100644 index 00000000..ed2ffb09 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnAZTask.java @@ -0,0 +1,39 @@ +package 
mServer.crawler.sender.orfon.task; + + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.json.OrfOnAZDeserializer; + +import java.lang.reflect.Type; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnAZTask extends OrfOnPagedTask { + private static final long serialVersionUID = 1L; + + public OrfOnAZTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs); + } + + @Override + protected JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnAZDeserializer(); + } + + @Override + protected Type getType() { + return new TypeToken>() {}.getType(); + } + + @Override + protected AbstractRecursivConverterTask createNewOwnInstance(ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnAZTask(crawler, aElementsToProcess); + } + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodeTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodeTask.java new file mode 100644 index 00000000..e2067bb0 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodeTask.java @@ -0,0 +1,143 @@ +package mServer.crawler.sender.orfon.task; + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import de.mediathekview.mlib.Const; +import de.mediathekview.mlib.daten.DatenFilm; +import de.mediathekview.mlib.tool.Log; +import jakarta.ws.rs.core.Response; +import mServer.crawler.CrawlerTool; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractJsonRestTask; 
+import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.Qualities; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.OrfOnConstants; +import mServer.crawler.sender.orfon.OrfOnVideoInfoDTO; +import mServer.crawler.sender.orfon.json.OrfOnEpisodeDeserializer; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.lang.reflect.Type; +import java.net.URI; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnEpisodeTask extends AbstractJsonRestTask { + private static final long serialVersionUID = 3272445100769901305L; + private static final Logger LOG = LogManager.getLogger(OrfOnEpisodeTask.class); + private static final String ORF_AUDIODESCRIPTION_PREFIX = "AD | "; + + private static final DateTimeFormatter DATE_FORMAT + = DateTimeFormatter.ofPattern("dd.MM.yyyy"); + private static final DateTimeFormatter TIME_FORMAT + = DateTimeFormatter.ofPattern("HH:mm:ss"); + + public OrfOnEpisodeTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs, Optional.of(OrfOnConstants.AUTH)); + } + + @Override + protected JsonDeserializer getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnEpisodeDeserializer(); + } + + @Override + protected Type getType() { + return new TypeToken() { + }.getType(); + } + + @Override + protected void postProcessing(OrfOnVideoInfoDTO aResponseObj, OrfOnBreadCrumsUrlDTO aDTO) { + if (aResponseObj.getTitle().isEmpty() && aResponseObj.getTitleWithDate().isEmpty()) { + LOG.warn("Missing title for {}", aDTO); + return; + } + if (aResponseObj.getTopic().isEmpty()) { + 
LOG.warn("Missing topic for {}", aDTO); + return; + } + if (aResponseObj.getVideoUrls().isEmpty()) { + LOG.warn("Missing videoUrls for {}", aDTO); + return; + } + if (aResponseObj.getDuration().isEmpty()) { + LOG.warn("Missing duration for {}", aDTO); + } + if (aResponseObj.getAired().isEmpty()) { + LOG.warn("Missing aired date for {}", aDTO); + } + if (aResponseObj.getWebsite().isEmpty()) { + LOG.warn("Missing website for {}", aDTO); + } + + final LocalDateTime dateTime = aResponseObj.getAired().orElse(LocalDateTime.of(1970, 1, 1, 0, 0, 0)); + String dateValue = dateTime.format(DATE_FORMAT); + String timeValue = dateTime.format(TIME_FORMAT); + + final Optional> videoUrls = aResponseObj.getVideoUrls(); + if (videoUrls.isPresent() && !videoUrls.get().isEmpty()) { + final Map urls = videoUrls.get(); + DatenFilm film = new DatenFilm(Const.ORF, + buildTopic(aResponseObj.getTopic().orElse(""), aResponseObj.getTopicForArchive().orElse("")), + aResponseObj.getWebsite().orElse(""), + buildTitle(aResponseObj.getTitle().orElse("")), + urls.getOrDefault(Qualities.NORMAL, ""), "", + dateValue, timeValue, + aResponseObj.getDuration().orElse(Duration.ofMinutes(0L)).getSeconds(), + aResponseObj.getDescription().orElse("")); + if (urls.containsKey(Qualities.SMALL)) { + CrawlerTool.addUrlKlein(film, urls.get(Qualities.SMALL)); + } + if (urls.containsKey(Qualities.HD)) { + CrawlerTool.addUrlHd(film, urls.get(Qualities.HD)); + } + + final Optional subtitleUrl = aResponseObj.getSubtitleUrl(); + if (subtitleUrl.isPresent()) { + CrawlerTool.addUrlSubtitle(film, subtitleUrl.get()); + } + taskResults.add(film); + } + } + + @Override + protected AbstractRecursivConverterTask createNewOwnInstance( + ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnEpisodeTask(crawler, aElementsToProcess); + } + + @Override + protected void handleHttpError(OrfOnBreadCrumsUrlDTO dto, URI url, Response response) { + Log.errorLog(87732933, "ORF: http error " + response.getStatus() + ", " + url); + 
LOG.fatal( + "A HTTP error {} occurred when getting REST information from: \"{}\".", + response.getStatus(), + url); + } + + private String buildTopic(String topic, String archiveTopic) { + String newTopic = topic; + if (newTopic.startsWith(ORF_AUDIODESCRIPTION_PREFIX)) { + newTopic = newTopic.replace(ORF_AUDIODESCRIPTION_PREFIX, ""); + } + if (newTopic.equalsIgnoreCase("archiv")) { + newTopic = archiveTopic.replace("History | ", ""); + } + return newTopic; + } + + private String buildTitle(String title) { + if (title.startsWith(ORF_AUDIODESCRIPTION_PREFIX)) { + return title.replace(ORF_AUDIODESCRIPTION_PREFIX, "").concat(" (Audiodeskription)"); + } + return title; + } +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodesTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodesTask.java new file mode 100644 index 00000000..9797939a --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnEpisodesTask.java @@ -0,0 +1,40 @@ +package mServer.crawler.sender.orfon.task; + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.json.OrfOnEpisodesDeserializer; + +import java.lang.reflect.Type; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnEpisodesTask extends OrfOnPagedTask { + private static final long serialVersionUID = 1L; + + public OrfOnEpisodesTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs); + } + + @Override + public JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnEpisodesDeserializer(); + } + 
+ @Override + public Type getType() { + return new TypeToken>() {}.getType(); + } + + @Override + public AbstractRecursivConverterTask createNewOwnInstance(ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnEpisodesTask(crawler, aElementsToProcess); + } + + + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryChildrenTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryChildrenTask.java new file mode 100644 index 00000000..dbc2982d --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryChildrenTask.java @@ -0,0 +1,58 @@ +package mServer.crawler.sender.orfon.task; + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.json.OrfOnHistoryChildrenDeserializer; + +import java.lang.reflect.Type; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnHistoryChildrenTask extends OrfOnPagedTask { + private static final long serialVersionUID = 1L; + + public OrfOnHistoryChildrenTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs); + } + + @Override + protected JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnHistoryChildrenDeserializer(); + } + + @Override + protected Type getType() { + return new TypeToken>() {}.getType(); + } + + @Override + protected void postProcessingElements(Set elements, OrfOnBreadCrumsUrlDTO originalDTO) { + for (OrfOnBreadCrumsUrlDTO element : elements) { + if (element.getUrl().contains("/children")) { + final ConcurrentLinkedQueue 
moreContentOnNewPage = new ConcurrentLinkedQueue<>(); + moreContentOnNewPage.add(element); + AbstractRecursivConverterTask resolveChildren = createNewOwnInstance(moreContentOnNewPage); + resolveChildren.fork(); + for(OrfOnBreadCrumsUrlDTO moreElements : resolveChildren.join()) { + moreElements.setBreadCrumsPath(originalDTO.getBreadCrums()); + taskResults.add(moreElements); + } + } else { + element.setBreadCrumsPath(originalDTO.getBreadCrums()); + taskResults.add(element); + } + } + } + + @Override + protected AbstractRecursivConverterTask createNewOwnInstance(ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnHistoryChildrenTask(crawler, aElementsToProcess); + } + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryTask.java new file mode 100644 index 00000000..eee0a36c --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryTask.java @@ -0,0 +1,40 @@ +package mServer.crawler.sender.orfon.task; + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.json.OrfOnHistoryDeserializer; + +import java.lang.reflect.Type; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnHistoryTask extends OrfOnPagedTask { + private static final long serialVersionUID = 1L; + + + public OrfOnHistoryTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs); + } + + @Override + protected JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnHistoryDeserializer(); + 
} + + @Override + protected Type getType() { + return new TypeToken>() {}.getType(); + } + + @Override + protected AbstractRecursivConverterTask createNewOwnInstance(ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnHistoryTask(crawler, aElementsToProcess); + } + + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryVideoItemTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryVideoItemTask.java new file mode 100644 index 00000000..00c1f9a1 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnHistoryVideoItemTask.java @@ -0,0 +1,38 @@ +package mServer.crawler.sender.orfon.task; + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.json.OrfOnHistoryVideoItemDeserializer; + +import java.lang.reflect.Type; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnHistoryVideoItemTask extends OrfOnPagedTask { + private static final long serialVersionUID = 1L; + + public OrfOnHistoryVideoItemTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs); + } + + @Override + protected JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnHistoryVideoItemDeserializer(); + } + + @Override + protected Type getType() { + return new TypeToken>() {}.getType(); + } + + @Override + protected AbstractRecursivConverterTask createNewOwnInstance(ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnHistoryVideoItemTask(crawler, aElementsToProcess); + } + +} diff --git 
a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnPagedTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnPagedTask.java new file mode 100644 index 00000000..eb23f2b2 --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnPagedTask.java @@ -0,0 +1,67 @@ +package mServer.crawler.sender.orfon.task; + +import de.mediathekview.mlib.tool.Log; +import jakarta.ws.rs.core.Response; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractJsonRestTask; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.OrfOnConstants; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.net.URI; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from this task, desirialisation of class R , D , Reasearch in this url +public abstract class OrfOnPagedTask extends AbstractJsonRestTask, OrfOnBreadCrumsUrlDTO> { + private static final long serialVersionUID = 1L; + protected final transient Logger log = LogManager.getLogger(this.getClass()); + protected transient Optional> nextPageTask = Optional.empty(); + + protected OrfOnPagedTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs, Optional.of(OrfOnConstants.AUTH)); + } + + protected void postProcessingNextPage(PagedElementListDTO aResponseObj, OrfOnBreadCrumsUrlDTO aDTO) { + if (aResponseObj.getNextPage().isEmpty()) { + return; + } + final ConcurrentLinkedQueue nextPageLinks = new ConcurrentLinkedQueue<>(); + final Optional nextPage = aResponseObj.getNextPage(); + if (nextPage.isPresent()) { + nextPageLinks.add(new OrfOnBreadCrumsUrlDTO(aDTO.getBreadCrums(), nextPage.get())); + nextPageTask = 
Optional.of(createNewOwnInstance(nextPageLinks)); + nextPageTask.get().fork(); + log.debug("started paging to url {} for {}", nextPage.get(), aDTO.getUrl()); + } + } + + protected void postProcessingElements(Set elements, OrfOnBreadCrumsUrlDTO originalDTO) { + for (OrfOnBreadCrumsUrlDTO element : elements) { + element.setBreadCrumsPath(originalDTO.getBreadCrums()); + taskResults.add(element); + } + } + + @Override + protected void postProcessing(PagedElementListDTO aResponseObj, OrfOnBreadCrumsUrlDTO aDTO) { + postProcessingNextPage(aResponseObj, aDTO); + postProcessingElements(aResponseObj.getElements(), aDTO); + nextPageTask.ifPresent(paginationResults -> postProcessingElements(paginationResults.join(), aDTO)); + } + + @Override + protected void handleHttpError(OrfOnBreadCrumsUrlDTO dto, URI url, Response response) { + Log.errorLog(874764622, "ORF: http error " + response.getStatus() + ", " + url); + log.fatal( + "A HTTP error {} occurred when getting REST information from: \"{}\".", + response.getStatus(), + url); + } + +} diff --git a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java new file mode 100644 index 00000000..d1ebd2de --- /dev/null +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java @@ -0,0 +1,41 @@ +package mServer.crawler.sender.orfon.task; + +import com.google.gson.JsonDeserializer; +import com.google.gson.reflect.TypeToken; +import mServer.crawler.sender.MediathekReader; +import mServer.crawler.sender.base.AbstractRecursivConverterTask; +import mServer.crawler.sender.base.PagedElementListDTO; +import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; +import mServer.crawler.sender.orfon.json.OrfOnScheduleDeserializer; + +import java.lang.reflect.Type; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; + +// extends AbstractRestTask +// return T Class from 
this task, desirialisation of class R , D , Reasearch in this url +public class OrfOnScheduleTask extends OrfOnPagedTask { + private static final long serialVersionUID = -2556623295745879044L; + + public OrfOnScheduleTask(MediathekReader crawler, ConcurrentLinkedQueue urlToCrawlDTOs) { + super(crawler, urlToCrawlDTOs); + } + + @Override + protected JsonDeserializer> getParser(OrfOnBreadCrumsUrlDTO aDTO) { + return new OrfOnScheduleDeserializer(); + } + + @Override + protected Type getType() { + return new TypeToken>() {}.getType(); + } + + @Override + protected AbstractRecursivConverterTask createNewOwnInstance( + ConcurrentLinkedQueue aElementsToProcess) { + return new OrfOnScheduleTask(crawler, aElementsToProcess); + } + +} From ceaf209bb0022d1220c0bd0aa7696bb99a510701 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 15:37:23 +0100 Subject: [PATCH 05/10] initial orfon crawler --- src/main/java/mServer/crawler/FilmeSuchen.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/mServer/crawler/FilmeSuchen.java b/src/main/java/mServer/crawler/FilmeSuchen.java index 91bd7bf7..f4f97bdd 100644 --- a/src/main/java/mServer/crawler/FilmeSuchen.java +++ b/src/main/java/mServer/crawler/FilmeSuchen.java @@ -32,7 +32,7 @@ import mServer.crawler.sender.dw.DwCrawler; import mServer.crawler.sender.funk.FunkCrawler; import mServer.crawler.sender.kika.KikaApiCrawler; -import mServer.crawler.sender.orf.OrfCrawler; +import mServer.crawler.sender.orfon.OrfOnCrawler; import mServer.crawler.sender.phoenix.PhoenixCrawler; import mServer.crawler.sender.sr.SrCrawler; import mServer.crawler.sender.srf.SrfCrawler; @@ -113,7 +113,7 @@ public FilmeSuchen() { mediathekListe.add(new MediathekSrfPod(this, 1)); } if (crawlerList.contains("ORF")) { - mediathekListe.add(new OrfCrawler(this, 1)); + mediathekListe.add(new OrfOnCrawler(this, 1)); } if (crawlerList.contains("PHONIX")) { mediathekListe.add(new PhoenixCrawler(this, 1)); From 
d10f523c223fdb16881dcfa58ed168f402e258b8 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 15:37:37 +0100 Subject: [PATCH 06/10] initial orfon crawler --- .../mServer/crawler/sender/base/JsonUtils.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/main/java/mServer/crawler/sender/base/JsonUtils.java b/src/main/java/mServer/crawler/sender/base/JsonUtils.java index 16ffdc0c..b21495e3 100644 --- a/src/main/java/mServer/crawler/sender/base/JsonUtils.java +++ b/src/main/java/mServer/crawler/sender/base/JsonUtils.java @@ -133,7 +133,18 @@ public static boolean hasStringElements( } public static Optional getElementValueAsString(final JsonElement aJsonElement, final String... aElementIds) { - Optional rs = Optional.empty(); + Optional rs = JsonUtils.getElement(aJsonElement, aElementIds); + if (rs.isPresent()) { + return Optional.of(rs.get().getAsString()); + } + return Optional.empty(); + } + + public static Optional getElement(final JsonElement aJsonElement, final String... 
aElementIds) { + Optional rs = Optional.empty(); + if (aElementIds == null || aElementIds.length == 0) { + return rs; + } JsonObject aJsonObject = aJsonElement.getAsJsonObject(); for (int i = 0; i < aElementIds.length-1; i++) { String elementId = aElementIds[i]; @@ -147,7 +158,7 @@ public static Optional getElementValueAsString(final JsonElement aJsonEl // String elementId = aElementIds[aElementIds.length-1]; if (aJsonObject != null && aJsonObject.has(elementId) && !aJsonObject.get(elementId).isJsonNull()) { - rs = Optional.of(aJsonObject.get(elementId).getAsString()); + rs = Optional.of(aJsonObject.get(elementId)); } // return rs; From 45d34372067b70a6423fc1dda559122473432a97 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 15:51:44 +0100 Subject: [PATCH 07/10] move TopicUrlDTO to base package --- .../java/mServer/crawler/AddToFilmlist.java | 4 +- .../sender/{orf => base}/TopicUrlDTO.java | 73 +++++++++---------- .../crawler/sender/kika/KikaApiCrawler.java | 2 +- .../crawler/sender/kika/KikaApiTopicDto.java | 2 +- .../json/KikaApiTopicPageDeserializer.java | 2 +- .../sender/kika/tasks/KikaApiTopicTask.java | 2 +- .../crawler/sender/orfon/OrfOnConstants.java | 2 + .../sender/orfon/task/OrfOnScheduleTask.java | 2 +- .../parser/PhoenixFilmDetailDeserializer.java | 3 +- .../PhoenixSendungOverviewDeserializer.java | 3 +- .../crawler/sender/srf/SrfCrawler.java | 2 +- .../srf/parser/SrfTopicsDeserializer.java | 2 +- .../srf/tasks/SrfTopicOverviewTask.java | 2 +- .../srf/tasks/SrfTopicsOverviewTask.java | 2 +- 14 files changed, 53 insertions(+), 50 deletions(-) rename src/main/java/mServer/crawler/sender/{orf => base}/TopicUrlDTO.java (84%) diff --git a/src/main/java/mServer/crawler/AddToFilmlist.java b/src/main/java/mServer/crawler/AddToFilmlist.java index 726b5225..33b690a8 100644 --- a/src/main/java/mServer/crawler/AddToFilmlist.java +++ b/src/main/java/mServer/crawler/AddToFilmlist.java @@ -16,7 +16,7 @@ import java.util.*; import 
mServer.crawler.sender.base.UrlUtils; -import mServer.crawler.sender.orf.OrfVideoInfoDTO; +import mServer.crawler.sender.orfon.OrfOnConstants; import mServer.tool.MserverDaten; import okhttp3.OkHttpClient; import okhttp3.Request; @@ -114,7 +114,7 @@ private void performUrlSearch(HashSet hash, final int size) { private void performInitialCleanup() { listeEinsortieren.removeIf(f -> !f.arr[DatenFilm.FILM_URL].toLowerCase().startsWith("http")); listeEinsortieren.removeIf(f -> f.arr[DatenFilm.FILM_SENDER].equals(Const.ORF) && f.arr[DatenFilm.FILM_URL] - .matches(OrfVideoInfoDTO.FILTER_JUGENDSCHUTZ)); + .matches(OrfOnConstants.FILTER_JUGENDSCHUTZ)); listeEinsortieren.removeIf(f -> f.arr[DatenFilm.FILM_SENDER].equals(Const.ARD) && isArdUrlToRemove(f.arr[DatenFilm.FILM_URL])); listeEinsortieren.removeIf(f -> { String groesse = f.arr[DatenFilm.FILM_GROESSE]; diff --git a/src/main/java/mServer/crawler/sender/orf/TopicUrlDTO.java b/src/main/java/mServer/crawler/sender/base/TopicUrlDTO.java similarity index 84% rename from src/main/java/mServer/crawler/sender/orf/TopicUrlDTO.java rename to src/main/java/mServer/crawler/sender/base/TopicUrlDTO.java index 58a8050b..415ece18 100644 --- a/src/main/java/mServer/crawler/sender/orf/TopicUrlDTO.java +++ b/src/main/java/mServer/crawler/sender/base/TopicUrlDTO.java @@ -1,37 +1,36 @@ -package mServer.crawler.sender.orf; - -import mServer.crawler.sender.base.CrawlerUrlDTO; -import java.util.Objects; - -public class TopicUrlDTO extends CrawlerUrlDTO { - - private final String topic; - - public TopicUrlDTO(String aTopic, String aUrl) { - super(aUrl); - topic = aTopic; - } - - public String getTopic() { - return topic; - } - - @Override - public boolean equals(final Object obj) { - if (obj == null || getClass() != obj.getClass()) { - return false; - } - if (super.equals(obj)) { - return getTopic().equals(((TopicUrlDTO) obj).getTopic()); - } - - return false; - } - - @Override - public int hashCode() { - int hash = 3; - hash = 31 * hash + 
Objects.hashCode(this.topic) + super.hashCode(); - return hash; - } -} +package mServer.crawler.sender.base; + +import java.util.Objects; + +public class TopicUrlDTO extends CrawlerUrlDTO { + + private final String topic; + + public TopicUrlDTO(String aTopic, String aUrl) { + super(aUrl); + topic = aTopic; + } + + public String getTopic() { + return topic; + } + + @Override + public boolean equals(final Object obj) { + if (obj == null || getClass() != obj.getClass()) { + return false; + } + if (super.equals(obj)) { + return getTopic().equals(((TopicUrlDTO) obj).getTopic()); + } + + return false; + } + + @Override + public int hashCode() { + int hash = 3; + hash = 31 * hash + Objects.hashCode(this.topic) + super.hashCode(); + return hash; + } +} diff --git a/src/main/java/mServer/crawler/sender/kika/KikaApiCrawler.java b/src/main/java/mServer/crawler/sender/kika/KikaApiCrawler.java index 5f9ee64b..2832c5fb 100644 --- a/src/main/java/mServer/crawler/sender/kika/KikaApiCrawler.java +++ b/src/main/java/mServer/crawler/sender/kika/KikaApiCrawler.java @@ -13,7 +13,7 @@ import mServer.crawler.sender.base.JsoupConnection; import mServer.crawler.sender.kika.tasks.KikaApiFilmTask; import mServer.crawler.sender.kika.tasks.KikaApiTopicTask; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; diff --git a/src/main/java/mServer/crawler/sender/kika/KikaApiTopicDto.java b/src/main/java/mServer/crawler/sender/kika/KikaApiTopicDto.java index 127de20e..7e11939b 100644 --- a/src/main/java/mServer/crawler/sender/kika/KikaApiTopicDto.java +++ b/src/main/java/mServer/crawler/sender/kika/KikaApiTopicDto.java @@ -4,7 +4,7 @@ import java.util.Optional; import java.util.Set; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; public class KikaApiTopicDto { diff --git 
a/src/main/java/mServer/crawler/sender/kika/json/KikaApiTopicPageDeserializer.java b/src/main/java/mServer/crawler/sender/kika/json/KikaApiTopicPageDeserializer.java index 0eab95d8..4cce1ba3 100644 --- a/src/main/java/mServer/crawler/sender/kika/json/KikaApiTopicPageDeserializer.java +++ b/src/main/java/mServer/crawler/sender/kika/json/KikaApiTopicPageDeserializer.java @@ -2,7 +2,7 @@ import com.google.gson.*; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import mServer.crawler.sender.base.JsonUtils; import mServer.crawler.sender.base.UrlUtils; import mServer.crawler.sender.kika.KikaApiConstants; diff --git a/src/main/java/mServer/crawler/sender/kika/tasks/KikaApiTopicTask.java b/src/main/java/mServer/crawler/sender/kika/tasks/KikaApiTopicTask.java index f08263da..95bc1506 100644 --- a/src/main/java/mServer/crawler/sender/kika/tasks/KikaApiTopicTask.java +++ b/src/main/java/mServer/crawler/sender/kika/tasks/KikaApiTopicTask.java @@ -10,7 +10,7 @@ import mServer.crawler.sender.kika.KikaApiFilmDto; import mServer.crawler.sender.kika.KikaApiTopicDto; import mServer.crawler.sender.kika.json.KikaApiTopicPageDeserializer; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; diff --git a/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java b/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java index cb852f42..57eefa8c 100644 --- a/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java +++ b/src/main/java/mServer/crawler/sender/orfon/OrfOnConstants.java @@ -1,6 +1,8 @@ package mServer.crawler.sender.orfon; public final class OrfOnConstants { + + public static final String FILTER_JUGENDSCHUTZ = ".*/Jugendschutz[0-9][0-9][0-9][0-9]b[0-9][0-9][0-9][0-9]_.*"; // public static final String HOST = "https://api-tvthek.orf.at/api/v4.3"; // diff --git 
a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java index d1ebd2de..51862157 100644 --- a/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java +++ b/src/main/java/mServer/crawler/sender/orfon/task/OrfOnScheduleTask.java @@ -5,7 +5,7 @@ import mServer.crawler.sender.MediathekReader; import mServer.crawler.sender.base.AbstractRecursivConverterTask; import mServer.crawler.sender.base.PagedElementListDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import mServer.crawler.sender.orfon.OrfOnBreadCrumsUrlDTO; import mServer.crawler.sender.orfon.json.OrfOnScheduleDeserializer; diff --git a/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixFilmDetailDeserializer.java b/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixFilmDetailDeserializer.java index 8bc3a3b5..676b0c36 100644 --- a/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixFilmDetailDeserializer.java +++ b/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixFilmDetailDeserializer.java @@ -5,9 +5,10 @@ import com.google.gson.JsonDeserializer; import com.google.gson.JsonElement; import com.google.gson.JsonObject; +import mServer.crawler.sender.base.JsonUtils; + import java.lang.reflect.Type; import java.util.Optional; -import mServer.crawler.sender.orf.JsonUtils; public class PhoenixFilmDetailDeserializer implements JsonDeserializer> { diff --git a/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixSendungOverviewDeserializer.java b/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixSendungOverviewDeserializer.java index 2648a426..05652cbe 100644 --- a/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixSendungOverviewDeserializer.java +++ b/src/main/java/mServer/crawler/sender/phoenix/parser/PhoenixSendungOverviewDeserializer.java @@ -9,7 +9,8 @@ import java.util.HashSet; import java.util.Optional; 
import java.util.Set; -import mServer.crawler.sender.orf.JsonUtils; + +import mServer.crawler.sender.base.JsonUtils; import mServer.crawler.sender.phoenix.PhoenixConstants; import mServer.crawler.sender.base.SendungOverviewDto; diff --git a/src/main/java/mServer/crawler/sender/srf/SrfCrawler.java b/src/main/java/mServer/crawler/sender/srf/SrfCrawler.java index e3921868..a59f4b95 100644 --- a/src/main/java/mServer/crawler/sender/srf/SrfCrawler.java +++ b/src/main/java/mServer/crawler/sender/srf/SrfCrawler.java @@ -12,7 +12,7 @@ import mServer.crawler.FilmeSuchen; import mServer.crawler.sender.MediathekCrawler; import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import mServer.crawler.sender.srf.tasks.SrfTopicOverviewTask; import mServer.crawler.sender.srf.tasks.SrfTopicsOverviewTask; import org.apache.logging.log4j.LogManager; diff --git a/src/main/java/mServer/crawler/sender/srf/parser/SrfTopicsDeserializer.java b/src/main/java/mServer/crawler/sender/srf/parser/SrfTopicsDeserializer.java index 12773d7d..c6643f75 100644 --- a/src/main/java/mServer/crawler/sender/srf/parser/SrfTopicsDeserializer.java +++ b/src/main/java/mServer/crawler/sender/srf/parser/SrfTopicsDeserializer.java @@ -9,7 +9,7 @@ import java.util.Optional; import java.util.Set; import mServer.crawler.sender.base.JsonUtils; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import mServer.crawler.sender.srf.SrfConstants; public class SrfTopicsDeserializer implements JsonDeserializer> { diff --git a/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicOverviewTask.java b/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicOverviewTask.java index ef65ffd8..019ed603 100644 --- a/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicOverviewTask.java +++ b/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicOverviewTask.java @@ -11,7 +11,7 @@ import 
mServer.crawler.sender.base.AbstractRecursivConverterTask; import mServer.crawler.sender.base.CrawlerUrlDTO; import mServer.crawler.sender.base.PagedElementListDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import mServer.crawler.sender.srf.SrfConstants; import mServer.crawler.sender.srf.parser.SrfTopicDeserializer; diff --git a/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicsOverviewTask.java b/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicsOverviewTask.java index 028c015b..6ee878b8 100644 --- a/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicsOverviewTask.java +++ b/src/main/java/mServer/crawler/sender/srf/tasks/SrfTopicsOverviewTask.java @@ -10,7 +10,7 @@ import mServer.crawler.sender.ard.tasks.ArdTaskBase; import mServer.crawler.sender.base.AbstractRecursivConverterTask; import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; +import mServer.crawler.sender.base.TopicUrlDTO; import mServer.crawler.sender.srf.SrfConstants; import mServer.crawler.sender.srf.parser.SrfTopicsDeserializer; From 49cf9f195c3609cedae575c3ab632d0118447fb6 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 15:52:06 +0100 Subject: [PATCH 08/10] remove old orf crawler --- .../mServer/crawler/sender/orf/JsonUtils.java | 51 ---- .../crawler/sender/orf/OrfConstants.java | 35 --- .../crawler/sender/orf/OrfCrawler.java | 104 ------- .../crawler/sender/orf/OrfEpisodeInfoDTO.java | 39 --- .../crawler/sender/orf/OrfVideoInfoDTO.java | 50 ---- .../orf/json/OrfMoreEpisodesDeserializer.java | 26 -- .../orf/parser/OrfMoreEpisodesParser.java | 27 -- .../orf/parser/OrfPlaylistDeserializer.java | 104 ------- .../parser/OrfVideoDetailDeserializer.java | 149 ---------- .../crawler/sender/orf/tasks/OrfDayTask.java | 54 ---- .../sender/orf/tasks/OrfFilmDetailTask.java | 272 ------------------ .../crawler/sender/orf/tasks/OrfHelper.java | 63 ---- 
.../orf/tasks/OrfHistoryOverviewTask.java | 45 --- .../sender/orf/tasks/OrfHistoryTopicTask.java | 39 --- .../sender/orf/tasks/OrfLetterPageTask.java | 57 ---- .../crawler/sender/orf/tasks/OrfTaskBase.java | 110 ------- 16 files changed, 1225 deletions(-) delete mode 100644 src/main/java/mServer/crawler/sender/orf/JsonUtils.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfConstants.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfCrawler.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java delete mode 100644 src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java diff --git a/src/main/java/mServer/crawler/sender/orf/JsonUtils.java b/src/main/java/mServer/crawler/sender/orf/JsonUtils.java deleted file mode 100644 index 989425f0..00000000 --- a/src/main/java/mServer/crawler/sender/orf/JsonUtils.java +++ /dev/null @@ -1,51 +0,0 @@ -package mServer.crawler.sender.orf; - -import com.google.gson.JsonElement; -import 
com.google.gson.JsonObject; -import java.util.Optional; - -public final class JsonUtils { - - private JsonUtils() { - super(); - } - - /** - * Gets the value of an attribute - * - * @param aJsonObject the object - * @param aAttributeName the name of the attribute - * @return the value of the attribute, if it exists, else Optional.empty - */ - public static Optional getAttributeAsString(final JsonObject aJsonObject, - final String aAttributeName) { - if (aJsonObject.has(aAttributeName)) { - final JsonElement aElement = aJsonObject.get(aAttributeName); - if (!aElement.isJsonNull()) { - return Optional.of(aElement.getAsString()); - } - } - - return Optional.empty(); - } - - /** - * Checks if the {@link JsonObject} has all given elements and if no element - * is null. - * - * @param aJsonObject The object to check. - * @param aElementIds The elements which it should has. - * @return true when the object has all given elements and if no element is - * null. - */ - public static boolean hasElements(final JsonObject aJsonObject, - final String... aElementIds) { - for (final String elementId : aElementIds) { - if (!aJsonObject.has(elementId) || aJsonObject.get(elementId).isJsonNull()) { - return false; - } - } - - return true; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfConstants.java b/src/main/java/mServer/crawler/sender/orf/OrfConstants.java deleted file mode 100644 index 3b95c98c..00000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfConstants.java +++ /dev/null @@ -1,35 +0,0 @@ -package mServer.crawler.sender.orf; - -public final class OrfConstants { - - public static final String URL_BASE = "https://tvthek.orf.at"; - - /** - * URL für die Sendungen eines Tages Muss am Ende noch um das Datum dd.MM.yyyy ergänzt werden - */ - public static final String URL_DAY = URL_BASE + "/schedule/"; - - /** - * Basis-URL für Übersichtsseite nach Buchstaben Muss am Ende noch um Buchstabe bzw. 
0 ergänzt - * werden - */ - public static final String URL_SHOW_LETTER_PAGE = URL_BASE + "/profiles/letter/"; - - /** - * URL für erste Übersichtsseite nach Buchstaben - */ - public static final String URL_SHOW_LETTER_PAGE_A = URL_SHOW_LETTER_PAGE + "A"; - - /** - * URL für verpasste Sendungen eines Tages Muss am Ende noch um Datum ergänzt werden im Format - * DD.MM.YYYY - */ - public static final String URL_DATE = URL_BASE + "/schedule/"; - - /** - * URL für Übersichtsseite des Archivs - */ - public static final String URL_ARCHIVE = URL_BASE + "/archive"; - - private OrfConstants() {} -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfCrawler.java b/src/main/java/mServer/crawler/sender/orf/OrfCrawler.java deleted file mode 100644 index b4a039a8..00000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfCrawler.java +++ /dev/null @@ -1,104 +0,0 @@ -package mServer.crawler.sender.orf; - -import de.mediathekview.mlib.Const; -import de.mediathekview.mlib.daten.DatenFilm; -import de.mediathekview.mlib.tool.Log; -import mServer.crawler.CrawlerTool; -import mServer.crawler.FilmeSuchen; -import mServer.crawler.sender.MediathekCrawler; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.orf.tasks.*; - -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.temporal.ChronoUnit; -import java.util.Set; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.RecursiveTask; - -public class OrfCrawler extends MediathekCrawler { - - public static final String SENDERNAME = Const.ORF; - - public OrfCrawler(FilmeSuchen ssearch, int startPrio) { - super(ssearch, SENDERNAME, 0, 1, startPrio); - } - - private Set getDaysEntries() throws InterruptedException, ExecutionException { - final OrfDayTask dayTask = new OrfDayTask(this, getDayUrls()); - final Set shows = forkJoinPool.submit(dayTask).get(); - - Log.sysLog("ORF: Anzahl Sendungen 
aus Verpasst: " + shows.size()); - - return shows; - } - - private ConcurrentLinkedQueue getDayUrls() { - final int maximumDaysForSendungVerpasstSection = 8; - final int maximumDaysForSendungVerpasstSectionFuture = 0; - - final ConcurrentLinkedQueue urls = new ConcurrentLinkedQueue<>(); - for (int i = 0; i < maximumDaysForSendungVerpasstSection - + maximumDaysForSendungVerpasstSectionFuture; i++) { - urls.add(new CrawlerUrlDTO(OrfConstants.URL_DAY + LocalDateTime.now() - .plus(maximumDaysForSendungVerpasstSectionFuture, ChronoUnit.DAYS) - .minus(i, ChronoUnit.DAYS).format(DateTimeFormatter.ofPattern("dd.MM.yyyy")))); - } - - return urls; - } - - - private Set getArchiveEntries() throws InterruptedException, ExecutionException { - final OrfHistoryOverviewTask historyTask = new OrfHistoryOverviewTask(this); - final ConcurrentLinkedQueue topics = forkJoinPool.submit(historyTask).get(); - - final OrfHistoryTopicTask topicTask = new OrfHistoryTopicTask(this, topics); - final Set shows = forkJoinPool.submit(topicTask).get(); - - Log.sysLog("ORF: Anzahl Sendungen History: " + shows.size()); - - return shows; - } - - private ConcurrentLinkedQueue getLetterEntries() throws InterruptedException, ExecutionException { - final OrfLetterPageTask letterTask = new OrfLetterPageTask(); - final ConcurrentLinkedQueue shows = forkJoinPool.submit(letterTask).get(); - - Log.sysLog("ORF: Anzahl Sendungen nach Buchstaben: " + shows.size()); - - return shows; - } - - @Override - protected RecursiveTask> createCrawlerTask() { - - boolean processMoreEpisodes = false; - - final ConcurrentLinkedQueue shows = new ConcurrentLinkedQueue<>(); - try { - - if (CrawlerTool.loadLongMax()) { - shows.addAll(getLetterEntries()); - shows.addAll(getArchiveEntries()); - processMoreEpisodes = true; - } else { - getDaysEntries().forEach(show -> { - if (!shows.contains(show)) { - shows.add(show); - } - }); - } - - } catch (InterruptedException | ExecutionException exception) { - Log.errorLog(56146546, 
exception); - } - Log.sysLog("ORF Anzahl: " + shows.size()); - - meldungAddMax(shows.size()); - - return new OrfFilmDetailTask(this, shows, processMoreEpisodes); - } - -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java b/src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java deleted file mode 100644 index 95947a87..00000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfEpisodeInfoDTO.java +++ /dev/null @@ -1,39 +0,0 @@ -package mServer.crawler.sender.orf; - -import java.time.Duration; -import java.util.Optional; - -public class OrfEpisodeInfoDTO { - - private final OrfVideoInfoDTO videoInfo; - private final Optional description; - private final Optional duration; - private final Optional title; - - public OrfEpisodeInfoDTO(final OrfVideoInfoDTO aVideoInfo, - final Optional aTitle, - final Optional aDescription, - final Optional aDuration - ) { - title = aTitle; - description = aDescription; - duration = aDuration; - videoInfo = aVideoInfo; - } - - public OrfVideoInfoDTO getVideoInfo() { - return videoInfo; - } - - public Optional getDescription() { - return description; - } - - public Optional getDuration() { - return duration; - } - - public Optional getTitle() { - return title; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java b/src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java deleted file mode 100644 index b64f4c07..00000000 --- a/src/main/java/mServer/crawler/sender/orf/OrfVideoInfoDTO.java +++ /dev/null @@ -1,50 +0,0 @@ -package mServer.crawler.sender.orf; - -import java.util.EnumMap; -import java.util.Map; -import mServer.crawler.sender.base.Qualities; - -public class OrfVideoInfoDTO { - - public static final String FILTER_JUGENDSCHUTZ = ".*/Jugendschutz[0-9][0-9][0-9][0-9]b[0-9][0-9][0-9][0-9]_.*"; - private final Map videoUrls; - private String subtitleUrl; - - public OrfVideoInfoDTO() { - videoUrls = new EnumMap<>(Qualities.class); - } - - public boolean hasVideoUrls() { 
- return !videoUrls.isEmpty(); - } - - public Qualities getDefaultQuality() { - if (videoUrls.containsKey(Qualities.NORMAL)) { - return Qualities.NORMAL; - } - return videoUrls.keySet().iterator().next(); - } - - public String getDefaultVideoUrl() { - return videoUrls.get(getDefaultQuality()); - } - - public String getSubtitleUrl() { - return subtitleUrl; - } - - public Map getVideoUrls() { - return videoUrls; - } - - public String put(final Qualities key, final String value) { - if (value == null || value.matches(FILTER_JUGENDSCHUTZ)) { - return ""; - } - return videoUrls.put(key, value); - } - - public void setSubtitleUrl(final String subtitleUrl) { - this.subtitleUrl = subtitleUrl; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java b/src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java deleted file mode 100644 index 4785fee8..00000000 --- a/src/main/java/mServer/crawler/sender/orf/json/OrfMoreEpisodesDeserializer.java +++ /dev/null @@ -1,26 +0,0 @@ -package mServer.crawler.sender.orf.json; - -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.base.JsonUtils; -import mServer.crawler.sender.base.UrlUtils; -import mServer.crawler.sender.orf.OrfConstants; - -import java.lang.reflect.Type; -import java.util.Optional; - -public class OrfMoreEpisodesDeserializer implements JsonDeserializer { - - private static final String ATTRIBUTE_URL = "url"; - - @Override - public CrawlerUrlDTO deserialize( - JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) { - - final Optional url = - JsonUtils.getAttributeAsString(jsonElement.getAsJsonObject(), ATTRIBUTE_URL); - return url.map(s -> new CrawlerUrlDTO(UrlUtils.addDomainIfMissing(s, OrfConstants.URL_BASE))).orElse(null); - } -} diff --git 
a/src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java b/src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java deleted file mode 100644 index b262a36d..00000000 --- a/src/main/java/mServer/crawler/sender/orf/parser/OrfMoreEpisodesParser.java +++ /dev/null @@ -1,27 +0,0 @@ -package mServer.crawler.sender.orf.parser; - - -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; - -import java.util.ArrayList; -import java.util.List; - -public class OrfMoreEpisodesParser { - private static final String EPISODES_SELECTOR = "article.b-teaser > a.teaser-link"; - private static final String ATTRIBUTE_HREF = "href"; - - public List parse(final Document document, final String topic) { - final List result = new ArrayList<>(); - - document - .select(EPISODES_SELECTOR) - .forEach( - episode -> { - final String url = episode.attr(ATTRIBUTE_HREF); - result.add(new TopicUrlDTO(topic, url)); - }); - - return result; - } -} \ No newline at end of file diff --git a/src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java b/src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java deleted file mode 100644 index 36b0cada..00000000 --- a/src/main/java/mServer/crawler/sender/orf/parser/OrfPlaylistDeserializer.java +++ /dev/null @@ -1,104 +0,0 @@ -package mServer.crawler.sender.orf.parser; - -import com.google.gson.JsonArray; -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import java.lang.reflect.Type; -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import mServer.crawler.sender.orf.JsonUtils; -import mServer.crawler.sender.orf.OrfEpisodeInfoDTO; -import mServer.crawler.sender.orf.OrfVideoInfoDTO; - -public class OrfPlaylistDeserializer implements JsonDeserializer> { - - private static final String 
ELEMENT_GAPLESS_VIDEO = "gapless_video"; - private static final String ELEMENT_PLAYLIST = "playlist"; - private static final String ELEMENT_VIDEOS = "videos"; - - private static final String ATTRIBUTE_TITLE = "title"; - private static final String ATTRIBUTE_DESCRIPTION = "description"; - private static final String ATTRIBUTE_DURATION = "duration"; - private static final String ATTRIBUTE_DURATION_IN_SECONDS = "duration_in_seconds"; - - @Override - public List deserialize(JsonElement aJsonElement, Type aType, JsonDeserializationContext aContext) { - - List episodes = new ArrayList<>(); - - if (!aJsonElement.getAsJsonObject().has(ELEMENT_PLAYLIST)) { - return episodes; - } - - JsonObject playlistObject = aJsonElement.getAsJsonObject().get(ELEMENT_PLAYLIST).getAsJsonObject(); - if (JsonUtils.hasElements(playlistObject, ELEMENT_GAPLESS_VIDEO)) { - parseGaplessVideo(episodes, playlistObject); - } - - parseVideos(episodes, playlistObject); - - return episodes; - } - - private void parseGaplessVideo(List aEpisodes, JsonObject aPlaylistObject) { - - final Optional title = JsonUtils.getAttributeAsString(aPlaylistObject, ATTRIBUTE_TITLE); - final Optional duration = parseDurationInSeconds(aPlaylistObject); - - final Optional videoInfoOptional = parseUrls(aPlaylistObject.getAsJsonObject(ELEMENT_GAPLESS_VIDEO)); - - if (videoInfoOptional.isPresent()) { - OrfEpisodeInfoDTO episode = new OrfEpisodeInfoDTO(videoInfoOptional.get(), title, Optional.empty(), duration); - aEpisodes.add(episode); - } - } - - private void parseVideos(List aEpisodes, JsonObject aPlaylistObject) { - JsonArray videosArray = aPlaylistObject.getAsJsonObject().get(ELEMENT_VIDEOS).getAsJsonArray(); - - for (JsonElement videoElement : videosArray) { - JsonObject videoObject = videoElement.getAsJsonObject(); - final Optional title = JsonUtils.getAttributeAsString(videoObject, ATTRIBUTE_TITLE); - final Optional description = JsonUtils.getAttributeAsString(videoObject, ATTRIBUTE_DESCRIPTION); - final Optional 
duration = parseDuration(videoObject); - - final Optional videoInfoOptional = parseUrls(videoObject); - - if (videoInfoOptional.isPresent()) { - OrfEpisodeInfoDTO episode = new OrfEpisodeInfoDTO(videoInfoOptional.get(), title, description, duration); - aEpisodes.add(episode); - } - } - } - - private Optional parseUrls(final JsonObject aVideoObject) { - - OrfVideoDetailDeserializer deserializer = new OrfVideoDetailDeserializer(); - return deserializer.deserializeVideoObject(aVideoObject); - } - - private static Optional parseDuration(final JsonObject aVideoObject) { - if (aVideoObject.has(ATTRIBUTE_DURATION)) { - Long durationValue = aVideoObject.get(ATTRIBUTE_DURATION).getAsLong(); - - // Duration ist in Millisekunden angegeben, diese interessieren aber nicht - return Optional.of(Duration.ofSeconds(durationValue / 1000)); - } - - return Optional.empty(); - } - - private static Optional parseDurationInSeconds(final JsonObject aVideoObject) { - if (aVideoObject.has(ATTRIBUTE_DURATION_IN_SECONDS)) { - Double durationValue = aVideoObject.get(ATTRIBUTE_DURATION_IN_SECONDS).getAsDouble(); - - return Optional.of(Duration.ofSeconds(durationValue.longValue())); - } - - return Optional.empty(); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java b/src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java deleted file mode 100644 index 846aedb8..00000000 --- a/src/main/java/mServer/crawler/sender/orf/parser/OrfVideoDetailDeserializer.java +++ /dev/null @@ -1,149 +0,0 @@ -package mServer.crawler.sender.orf.parser; - -import java.lang.reflect.Type; -import java.util.Optional; -import com.google.gson.JsonDeserializationContext; -import com.google.gson.JsonDeserializer; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParseException; -import de.mediathekview.mlib.tool.Log; -import mServer.crawler.sender.base.Qualities; -import 
mServer.crawler.sender.orf.OrfVideoInfoDTO; - -public class OrfVideoDetailDeserializer implements JsonDeserializer> { - - private static final String WRONG_HTTPS_URL_PART = ".apa."; - private static final String RIGHT_HTTPS_URL_PART = ".sf.apa."; - private static final String ELEMENT_PLAYLIST = "playlist"; - private static final String ELEMENT_VIDEOS = "videos"; - private static final String ELEMENT_SUBTITLES = "subtitles"; - private static final String ELEMENT_SOURCES = "sources"; - - private static final String ATTRIBUTE_DELIVERY = "delivery"; - private static final String ATTRIBUTE_PROTOCOL = "protocol"; - private static final String ATTRIBUTE_QUALITY = "quality"; - private static final String ATTRIBUTE_SRC = "src"; - private static final String ATTRIBUTE_TYPE = "type"; - - private static final String RELEVANT_DELIVERY1 = "progressive"; - private static final String RELEVANT_DELIVERY2 = "hls"; - private static final String RELEVANT_PROTOCOL = "http"; - private static final String RELEVANT_SUBTITLE_TYPE = "ttml"; - private static final String RELEVANT_VIDEO_TYPE1 = "video/mp4"; - private static final String RELEVANT_VIDEO_TYPE2 = "application/x-mpegURL"; - - private static String fixHttpsURL(final String url) { - if (url.contains(RIGHT_HTTPS_URL_PART)) { - return url; - } - return url.replace(WRONG_HTTPS_URL_PART, RIGHT_HTTPS_URL_PART); - } - - private static Optional getQuality(final String aQuality) { - switch (aQuality) { - case "Q1A": - return Optional.empty(); - case "Q4A": - return Optional.of(Qualities.SMALL); - case "Q6A": - return Optional.of(Qualities.NORMAL); - case "Q8C": - return Optional.of(Qualities.HD); - case "Q0A": - // QXA/QXB(DRM): another m3u8 has to be loaded which is often geoblocked - case "QXA": - case "QXADRM": - case "QXB": - case "QXBDRM": - case "Q8A": - return Optional.empty(); - default: - Log.sysLog("ORF: unknown quality: " + aQuality); - } - return Optional.empty(); - } - - private static void parseSubtitles(final JsonElement 
aSubtitlesElement, - final OrfVideoInfoDTO dto) { - if (aSubtitlesElement.isJsonArray()) { - aSubtitlesElement.getAsJsonArray().forEach(subtitleElement -> { - final JsonObject subtitleObject = subtitleElement.getAsJsonObject(); - if (subtitleObject.has(ATTRIBUTE_SRC) && subtitleObject.has(ATTRIBUTE_TYPE)) { - final String type = subtitleObject.get(ATTRIBUTE_TYPE).getAsString(); - - if (type.equalsIgnoreCase(RELEVANT_SUBTITLE_TYPE)) { - final String url = fixHttpsURL(subtitleObject.get(ATTRIBUTE_SRC).getAsString()); - dto.setSubtitleUrl(url); - } - } - }); - } - } - - private static void parseVideo(final JsonElement aVideoElement, final OrfVideoInfoDTO dto) { - if (aVideoElement.isJsonArray()) { - aVideoElement.getAsJsonArray().forEach(videoElement -> { - final JsonObject videoObject = videoElement.getAsJsonObject(); - if (videoObject.has(ATTRIBUTE_PROTOCOL) && videoObject.has(ATTRIBUTE_QUALITY) - && videoObject.has(ATTRIBUTE_SRC) && videoObject.has(ATTRIBUTE_TYPE)) { - final String type = videoObject.get(ATTRIBUTE_TYPE).getAsString(); - final String protocol = videoObject.get(ATTRIBUTE_PROTOCOL).getAsString(); - final String delivery = videoObject.get(ATTRIBUTE_DELIVERY).getAsString(); - - if (isVideoRelevant(type, protocol, delivery)) { - final String quality = videoObject.get(ATTRIBUTE_QUALITY).getAsString(); - final String url = fixHttpsURL(videoObject.get(ATTRIBUTE_SRC).getAsString()); - - final Optional resolution = getQuality(quality); - if (resolution.isPresent()) { - dto.put(resolution.get(), url); - } - } - } - }); - } - } - - private static boolean isVideoRelevant(String type, String protocol, String delivery) { - return (type.equalsIgnoreCase(RELEVANT_VIDEO_TYPE1) || type.equalsIgnoreCase(RELEVANT_VIDEO_TYPE2)) - && protocol.equalsIgnoreCase(RELEVANT_PROTOCOL) - && (delivery.equalsIgnoreCase(RELEVANT_DELIVERY1) || delivery.equalsIgnoreCase(RELEVANT_DELIVERY2)); - } - - @Override - public Optional deserialize(final JsonElement aJsonElement, final Type 
aType, - final JsonDeserializationContext aContext) throws JsonParseException { - - final JsonObject jsonObject = aJsonElement.getAsJsonObject(); - if (jsonObject.has(ELEMENT_PLAYLIST)) { - final JsonObject playlistObject = jsonObject.get(ELEMENT_PLAYLIST).getAsJsonObject(); - if (playlistObject.has(ELEMENT_VIDEOS)) { - final JsonObject videoObject - = playlistObject.get(ELEMENT_VIDEOS).getAsJsonArray().get(0).getAsJsonObject(); - - return deserializeVideoObject(videoObject); - } - } - - return Optional.empty(); - } - - public Optional deserializeVideoObject(final JsonObject aVideoObject) { - final OrfVideoInfoDTO dto = new OrfVideoInfoDTO(); - - if (aVideoObject.has(ELEMENT_SOURCES)) { - parseVideo(aVideoObject.get(ELEMENT_SOURCES), dto); - } - - if (aVideoObject.has(ELEMENT_SUBTITLES)) { - parseSubtitles(aVideoObject.get(ELEMENT_SUBTITLES), dto); - } - - if (dto.hasVideoUrls()) { - return Optional.of(dto); - } - - return Optional.empty(); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java deleted file mode 100644 index 67fb3ded..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfDayTask.java +++ /dev/null @@ -1,54 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.base.AbstractUrlTask; -import java.util.concurrent.ConcurrentLinkedQueue; -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -public class OrfDayTask extends OrfTaskBase { - - private static final String ITEM_SELECTOR = "article a"; - private static final String TITLE_SELECTOR1 = ".item-title"; - private static final String TITLE_SELECTOR2 = ".teaser-title"; - private static final String ATTRIBUTE_HREF = "href"; - - public OrfDayTask(final MediathekReader aCrawler, 
- final ConcurrentLinkedQueue aUrlToCrawlDTOs) { - super(aCrawler, aUrlToCrawlDTOs); - } - - @Override - protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) { - Elements elements = aDocument.select(ITEM_SELECTOR); - elements.forEach( - item -> { - Element titleElement = getTitleElement(item); - if (titleElement != null) { - String theme = OrfHelper.parseTheme(titleElement.text()); - String url = item.attr(ATTRIBUTE_HREF); - - TopicUrlDTO dto = new TopicUrlDTO(theme, url); - taskResults.add(dto); - } - }); - - ORF_LOGGER.trace(String.format("%s: Anzahl Filme: %d", aUrlDTO.getUrl(), taskResults.size())); - } - - private Element getTitleElement(Element item) { - Element titleElement = item.selectFirst(TITLE_SELECTOR1); - if (titleElement == null) { - titleElement = item.selectFirst(TITLE_SELECTOR2); - } - return titleElement; - } - - @Override - protected AbstractUrlTask createNewOwnInstance(ConcurrentLinkedQueue aURLsToCrawl) { - return new OrfDayTask(crawler, aURLsToCrawl); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java deleted file mode 100644 index 2210bd92..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfFilmDetailTask.java +++ /dev/null @@ -1,272 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.base.*; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.gson.reflect.TypeToken; -import de.mediathekview.mlib.daten.DatenFilm; -import de.mediathekview.mlib.tool.Log; - -import java.io.IOException; -import java.lang.reflect.Type; -import java.time.Duration; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.time.format.DateTimeParseException; -import java.time.temporal.ChronoUnit; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import 
java.util.concurrent.ConcurrentLinkedQueue; -import mServer.crawler.CrawlerTool; -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.orf.OrfEpisodeInfoDTO; -import mServer.crawler.sender.orf.OrfVideoInfoDTO; -import mServer.crawler.sender.orf.TopicUrlDTO; -import mServer.crawler.sender.orf.json.OrfMoreEpisodesDeserializer; -import mServer.crawler.sender.orf.parser.OrfMoreEpisodesParser; -import mServer.crawler.sender.orf.parser.OrfPlaylistDeserializer; -import org.apache.commons.lang3.StringUtils; -import org.jsoup.nodes.Document; - -public class OrfFilmDetailTask extends OrfTaskBase { - - private static final String TITLE_SELECTOR = ".description-container .description-title"; - private static final String VIDEO_META_DATA_SELECTOR = ".video-meta-data"; - private static final String TIME_SELECTOR = VIDEO_META_DATA_SELECTOR + " time"; - private static final String DURATION_SELECTOR = VIDEO_META_DATA_SELECTOR + " span.duration"; - private static final String DESCRIPTION_SELECTOR = ".description-container .description-text"; - private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist"; - private static final String MORE_EPISODES_SELECTOR = "div.more-episodes"; - - private static final String ATTRIBUTE_DATETIME = "datetime"; - private static final String ATTRIBUTE_DATA_JSB = "data-jsb"; - - private static final String PREFIX_AUDIO_DESCRIPTION = "AD |"; - - private static final DateTimeFormatter DATE_TIME_FORMATTER - = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); - - private static final DateTimeFormatter DATE_FORMAT - = DateTimeFormatter.ofPattern("dd.MM.yyyy"); - private static final DateTimeFormatter TIME_FORMAT - = DateTimeFormatter.ofPattern("HH:mm:ss"); - - private static final Type CRAWLER_URL_TYPE_TOKEN = new TypeToken() {}.getType(); - private static final Type LIST_EPISODEINFO_TYPE_TOKEN = new TypeToken>() { - }.getType(); - - private final boolean processMoreEpisodes; - private final transient JsoupConnection 
jsoupConnection; - - public OrfFilmDetailTask(final MediathekReader aCrawler, - final ConcurrentLinkedQueue aUrlToCrawlDTOs, boolean processMoreEpisodes) { - super(aCrawler, aUrlToCrawlDTOs); - this.processMoreEpisodes = processMoreEpisodes; - jsoupConnection = new JsoupConnection(); - } - - @Override - protected void processDocument(TopicUrlDTO aUrlDTO, Document aDocument) { - final Optional title = HtmlDocumentUtils.getElementString(TITLE_SELECTOR, aDocument); - final Optional time = parseDate(aDocument); - final Optional duration = parseDuration(aDocument); - final Optional description = HtmlDocumentUtils.getElementString(DESCRIPTION_SELECTOR, aDocument); - - final List episodes = parseEpisodes(aDocument); - - for (int i = 0; i < episodes.size(); i++) { - OrfEpisodeInfoDTO episode = episodes.get(i); - if (i == 0) { - createFilm(aUrlDTO, episode.getVideoInfo(), title, description, time, duration); - } else { - createFilm(aUrlDTO, episode.getVideoInfo(), episode.getTitle(), episode.getDescription(), time, episode.getDuration()); - } - } - - if (processMoreEpisodes) { - final List topicUrlDTOS = parseMoreEpisodes(aDocument, aUrlDTO.getTopic()); - topicUrlDTOS.remove(aUrlDTO); - processMoreEpisodes(topicUrlDTOS); - } - - ORF_LOGGER.trace(String.format("%s - %s: Anzahl Filme: %d", aUrlDTO.getTopic(), aUrlDTO.getUrl(), taskResults.size())); - } - - @Override - protected AbstractUrlTask createNewOwnInstance(ConcurrentLinkedQueue aURLsToCrawl) { - return createNewOwnInstance(aURLsToCrawl, processMoreEpisodes); - } - - private AbstractUrlTask createNewOwnInstance(final ConcurrentLinkedQueue urlsToCrawl, boolean processMoreEpisodes) { - return new OrfFilmDetailTask(crawler, urlsToCrawl, processMoreEpisodes); - } - - private void createFilm(final TopicUrlDTO aUrlDTO, - final OrfVideoInfoDTO aVideoInfo, - final Optional aTitle, - final Optional aDescription, - final Optional aTime, - final Optional aDuration) { - - if (aTitle.isPresent()) { - boolean isAudioDescription = 
aUrlDTO.getTopic().startsWith(PREFIX_AUDIO_DESCRIPTION); - - LocalDateTime time = aTime.orElse(LocalDateTime.now()); - - String datum = time.format(DATE_FORMAT); - String zeit = time.format(TIME_FORMAT); - String url = aVideoInfo.getDefaultVideoUrl(); - - final DatenFilm film = new DatenFilm(crawler.getSendername(), - isAudioDescription - ? trimAudioDescriptionPrefix(aUrlDTO.getTopic()) - : aUrlDTO.getTopic(), - aUrlDTO.getUrl(), - isAudioDescription - ? trimAudioDescriptionPrefix(aTitle.get()) + " (Audiodeskription)" - : aTitle.get(), - url, - "", - datum, - zeit, - aDuration.orElse(Duration.ZERO).getSeconds(), - aDescription.orElse("")); - - if (StringUtils.isNotBlank(aVideoInfo.getSubtitleUrl())) { - CrawlerTool.addUrlSubtitle(film, aVideoInfo.getSubtitleUrl()); - } - - addUrls(film, aVideoInfo.getVideoUrls()); - - taskResults.add(film); - } else { - Log.sysLog("OrfFilmDetailTask: no title or video found for url " + aUrlDTO.getUrl()); - } - } - - private String trimAudioDescriptionPrefix(String text) { - return text.substring(PREFIX_AUDIO_DESCRIPTION.length()); - } - - private void addUrls(final DatenFilm aFilm, final Map aVideoUrls) { - - if (aVideoUrls.containsKey(Qualities.HD)) { - CrawlerTool.addUrlHd(aFilm, aVideoUrls.get(Qualities.HD)); - } - if (aVideoUrls.containsKey(Qualities.SMALL)) { - CrawlerTool.addUrlKlein(aFilm, aVideoUrls.get(Qualities.SMALL)); - } - } - - private List parseEpisodes(Document aDocument) { - Optional json = HtmlDocumentUtils.getElementAttributeString(VIDEO_SELECTOR, ATTRIBUTE_DATA_JSB, aDocument); - - if (json.isPresent()) { - - final Gson gson = new GsonBuilder().registerTypeAdapter(LIST_EPISODEINFO_TYPE_TOKEN, - new OrfPlaylistDeserializer()).create(); - - return gson.fromJson(json.get(), LIST_EPISODEINFO_TYPE_TOKEN); - } - - return new ArrayList<>(); - } - - private static Optional parseDate(Document aDocument) { - Optional date = HtmlDocumentUtils.getElementAttributeString(TIME_SELECTOR, ATTRIBUTE_DATETIME, aDocument); - if 
(date.isPresent()) { - String dateValue = date.get().replace("CET", " ").replace("CEST", " "); - try { - LocalDateTime localDate = LocalDateTime.parse(dateValue, DATE_TIME_FORMATTER); - return Optional.of(localDate); - } catch (DateTimeParseException e) { - Log.sysLog("OrfFilmDetailTask: unknown date format: " + date.get()); - } - } - - return Optional.empty(); - } - - private static Optional parseDuration(Document aDocument) { - Optional duration = HtmlDocumentUtils.getElementString(DURATION_SELECTOR, aDocument); - if (!duration.isPresent()) { - return Optional.empty(); - } - - Optional unit = determineChronoUnit(duration.get()); - if (!unit.isPresent()) { - Log.sysLog("OrfFilmDetailTask: unknown duration type: " + duration.get()); - return Optional.empty(); - } - - String[] parts = duration.get().split(" ")[0].trim().split(":"); - if (parts.length != 2) { - Log.sysLog("OrfFilmDetailTask: unknown duration part count: " + duration.get()); - return Optional.empty(); - } - - ChronoUnit unitValue = unit.get(); - if (unitValue == ChronoUnit.SECONDS || unitValue == ChronoUnit.MINUTES) { - return Optional.of( - Duration.ofMinutes(Long.parseLong(parts[0])) - .plusSeconds(Long.parseLong(parts[1])) - ); - } - if (unitValue == ChronoUnit.HOURS) { - return Optional.of( - Duration.ofHours(Long.parseLong(parts[0])) - .plusMinutes(Long.parseLong(parts[1])) - ); - } - - return Optional.empty(); - } - - private static Optional determineChronoUnit(String aDuration) { - if (aDuration.contains("Min.")) { - return Optional.of(ChronoUnit.MINUTES); - } - if (aDuration.contains("Std.")) { - return Optional.of(ChronoUnit.HOURS); - } - if (aDuration.contains("Sek.")) { - return Optional.of(ChronoUnit.SECONDS); - } - - return Optional.empty(); - } - - private List parseMoreEpisodes(final Document document, final String topic) { - final Optional json = HtmlDocumentUtils.getElementAttributeString(MORE_EPISODES_SELECTOR, ATTRIBUTE_DATA_JSB, document); - if (json.isPresent()) { - final Gson 
gson = - new GsonBuilder() - .registerTypeAdapter(CRAWLER_URL_TYPE_TOKEN, new OrfMoreEpisodesDeserializer()) - .create(); - - CrawlerUrlDTO moreEpisodesUrl = gson.fromJson(json.get(), CRAWLER_URL_TYPE_TOKEN); - if (moreEpisodesUrl != null) { - try { - final Document moreEpisodesDocument = jsoupConnection.getDocument(moreEpisodesUrl.getUrl()); - OrfMoreEpisodesParser parser = new OrfMoreEpisodesParser(); - return parser.parse(moreEpisodesDocument, topic); - } catch (IOException e) { - Log.errorLog(237462889, String.format("OrfFilmDetailTask: loading more episodes url %s failed.", moreEpisodesUrl.getUrl())); - } - } - } - - return new ArrayList<>(); - } - - private void processMoreEpisodes(final List moreFilms) { - if (moreFilms != null && !moreFilms.isEmpty()) { - final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(moreFilms); - final OrfFilmDetailTask task = (OrfFilmDetailTask) createNewOwnInstance(queue, false); - task.fork(); - taskResults.addAll(task.join()); - } - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java deleted file mode 100644 index 2a7f3f18..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHelper.java +++ /dev/null @@ -1,63 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import java.util.ArrayList; -import java.util.List; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import mServer.crawler.sender.orf.OrfConstants; - -/** - * Helper methods for ORF tasks - */ -public class OrfHelper { - - private static final String LETTER_URL_SELECTOR = "li.letter-item > a"; - private static final String ATTRIBUTE_HREF = "href"; - private static final String ATTRIBUTE_TITLE = "title"; - - private OrfHelper() { - } - - public static String parseTheme(final Element aItem) { - String theme = aItem.attr(ATTRIBUTE_TITLE); - return parseTheme(theme); - } - - public static String 
parseTheme(final String theme) { - final String result = theme.replaceAll("[0-9]{1,2}:[0-9][0-9]$", "").trim(); - // Thema steht vor Doppelpunkt - // Ausnahmen - // - ZIB-Sendungen mit Uhrzeit - // - DokEins-Sendungen - // - Ungarisches Magazin - int index = result.indexOf(':'); - if (index > 0 - && !result.startsWith("ZIB") - && !result.startsWith("DOKeins") - && !result.contains("Ungarisches Magazin")) { - return result.substring(0, index).trim(); - } - return result; - } - - /** - * determines the links to the letter pages - * - * @param aDocument the html document with letter links - * @return list with urls - */ - public static List parseLetterLinks(Document aDocument) { - final List results = new ArrayList<>(); - - Elements links = aDocument.select(LETTER_URL_SELECTOR); - links.forEach(element -> { - if (element.hasAttr(ATTRIBUTE_HREF)) { - String subpage = element.attr(ATTRIBUTE_HREF); - results.add(OrfConstants.URL_BASE + subpage); - } - }); - - return results; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java deleted file mode 100644 index c0a5de6e..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryOverviewTask.java +++ /dev/null @@ -1,45 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.JsoupConnection; -import mServer.crawler.sender.orf.OrfConstants; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; - -public class OrfHistoryOverviewTask implements Callable> { - - private static final String ATTRIBUTE_HREF = "href"; - private static final String ATTRIBUTE_TITLE = "title"; - private static final String TOPIC_URL_SELECTOR = "section.has-4-in-row article > a"; - - private 
final MediathekReader crawler; - private final JsoupConnection jsoupConnection; - - public OrfHistoryOverviewTask( - final MediathekReader aCrawler) { - crawler = aCrawler; - jsoupConnection = new JsoupConnection(); - } - - @Override - public ConcurrentLinkedQueue call() throws Exception { - final ConcurrentLinkedQueue results = new ConcurrentLinkedQueue<>(); - - // URLs für Seiten parsen - final Document document = jsoupConnection.getDocument(OrfConstants.URL_ARCHIVE); - - final Elements topics = document.select(TOPIC_URL_SELECTOR); - topics.forEach( - topicElement -> { - final String url = topicElement.attr(ATTRIBUTE_HREF); - final String topic = topicElement.attr(ATTRIBUTE_TITLE); - results.add(new TopicUrlDTO(topic, url)); - }); - - return results; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java deleted file mode 100644 index 7dda27be..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfHistoryTopicTask.java +++ /dev/null @@ -1,39 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.AbstractDocumentTask; -import mServer.crawler.sender.base.AbstractRecursivConverterTask; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.jsoup.nodes.Document; - -import java.util.concurrent.ConcurrentLinkedQueue; - -public class OrfHistoryTopicTask extends AbstractDocumentTask { - - private static final String ATTRIBUTE_HREF = "href"; - private static final String SHOW_URL_SELECTOR = "article > a"; - - public OrfHistoryTopicTask( - final MediathekReader crawler, - final ConcurrentLinkedQueue urlToCrawlDTOs - ) { - super(crawler, urlToCrawlDTOs); - } - - @Override - protected AbstractRecursivConverterTask createNewOwnInstance( - final ConcurrentLinkedQueue aElementsToProcess) { - return new OrfHistoryTopicTask(crawler, aElementsToProcess); - } - - @Override - 
protected void processDocument(final TopicUrlDTO aUrlDto, final Document aDocument) { - aDocument - .select(SHOW_URL_SELECTOR) - .forEach( - showElement -> { - final String url = showElement.attr(ATTRIBUTE_HREF); - taskResults.add(new TopicUrlDTO(aUrlDto.getTopic(), url)); - }); - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java deleted file mode 100644 index 6d52b090..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfLetterPageTask.java +++ /dev/null @@ -1,57 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import java.io.IOException; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ConcurrentLinkedQueue; -import mServer.crawler.sender.orf.OrfConstants; -import mServer.crawler.sender.orf.TopicUrlDTO; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; - -public class OrfLetterPageTask implements Callable> { - - private static final Logger LOG = LogManager.getLogger(OrfLetterPageTask.class); - - private static final String SHOW_URL_SELECTOR = "article > a"; - - @Override - public ConcurrentLinkedQueue call() throws Exception { - final ConcurrentLinkedQueue results = new ConcurrentLinkedQueue<>(); - - // URLs für Seiten parsen - final Document document = Jsoup.connect(OrfConstants.URL_SHOW_LETTER_PAGE_A).get(); - List overviewLinks = OrfHelper.parseLetterLinks(document); - - // Sendungen für die einzelnen Seiten pro Buchstabe ermitteln - overviewLinks.forEach(url -> { - try { - Document subpageDocument = Jsoup.connect(url).get(); - results.addAll(parseOverviewPage(subpageDocument)); - } catch (IOException ex) { - LOG.fatal("OrfLetterPageTask: error parsing url " + url, ex); - } - }); - - return results; - } - - private ConcurrentLinkedQueue parseOverviewPage(Document 
aDocument) { - final ConcurrentLinkedQueue results = new ConcurrentLinkedQueue<>(); - - Elements links = aDocument.select(SHOW_URL_SELECTOR); - links.forEach(element -> { - if (element.hasAttr("href")) { - String link = element.attr("href"); - String theme = OrfHelper.parseTheme(element); - - results.add(new TopicUrlDTO(theme, link)); - } - }); - - return results; - } -} diff --git a/src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java b/src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java deleted file mode 100644 index 3ac1b892..00000000 --- a/src/main/java/mServer/crawler/sender/orf/tasks/OrfTaskBase.java +++ /dev/null @@ -1,110 +0,0 @@ -package mServer.crawler.sender.orf.tasks; - -import de.mediathekview.mlib.Config; -import de.mediathekview.mlib.tool.Log; -import java.io.IOException; -import java.net.SocketException; -import java.net.SocketTimeoutException; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.TimeUnit; -import mServer.crawler.FilmeSuchen; -import mServer.crawler.RunSender; -import mServer.crawler.sender.MediathekReader; -import mServer.crawler.sender.base.AbstractUrlTask; -import mServer.crawler.sender.base.CrawlerUrlDTO; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.jsoup.HttpStatusException; -import org.jsoup.Jsoup; -import org.jsoup.Connection.Response; -import org.jsoup.nodes.Document; - -public abstract class OrfTaskBase - extends AbstractUrlTask { - - private static final long serialVersionUID = -4124779055395250987L; - private static final String LOAD_DOCUMENT_HTTPERROR - = "Some HTTP error happened while crawl the %s page \"%s\"."; - - private static final int MAX_TIMEOUT = (int) TimeUnit.SECONDS.toMillis(300); - - protected static final Logger ORF_LOGGER = LogManager.getLogger("OrfLogger"); - - public OrfTaskBase(final MediathekReader aCrawler, - final ConcurrentLinkedQueue aUrlToCrawlDTOs) { - super(aCrawler, aUrlToCrawlDTOs); - } - - 
/** - * In this method you have to use the JSOUP {@link Document} to create a - * object of the return type {@link T}. Add the results to - * {@link AbstractUrlTask#taskResults}. - * - * @param aUrlDTO A DTO containing at least the URL of the given document. - * @param aDocument The JSOUP {@link Document}. - */ - protected abstract void processDocument(final D aUrlDTO, final Document aDocument); - - @Override - protected void processElement(final D aUrlDTO) { - if (Config.getStop()) { - return; - } - - boolean retry = false; - int timeout = (int) TimeUnit.SECONDS.toMillis(120); - - do { - try { - retry = false; - - final Document document = loadDocument(aUrlDTO, timeout); - processDocument(aUrlDTO, document); - } catch (final HttpStatusException httpStatusError) { - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLER); - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLVERSUCHE); - ORF_LOGGER.trace(httpStatusError); - Log.sysLog(String.format(LOAD_DOCUMENT_HTTPERROR, crawler.getSendername(), aUrlDTO.getUrl())); - - Log.errorLog(96459855, - crawler.getSendername() + ": crawlerDocumentLoadError: " + aUrlDTO.getUrl() + ", " + httpStatusError.getStatusCode()); - } catch (final SocketException | SocketTimeoutException socketException) { - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLVERSUCHE); - ORF_LOGGER.trace(socketException); - retry = true; - timeout *= 2; - try { - Thread.sleep(5000); - } catch (InterruptedException ignored) { - // just try again - } - } catch (final Exception exception) { - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLER); - FilmeSuchen.listeSenderLaufen.inc(crawler.getSendername(), RunSender.Count.FEHLVERSUCHE); - Log.errorLog(96459856, exception); - ORF_LOGGER.trace(exception); - } - } while (retry && timeout <= MAX_TIMEOUT); - } - - private Document loadDocument(final D aUrlDTO, int timeout) throws IOException { - 
long start = System.currentTimeMillis(); - // maxBodySize(0)=unlimited - // necessary for ORF documents which are larger than the default size - Response response = Jsoup.connect(aUrlDTO.getUrl()) - .timeout(timeout) - .maxBodySize(0).execute(); - - long end = System.currentTimeMillis(); - - ORF_LOGGER.trace(String.format("%s: %d - loaded in %d ms", aUrlDTO.getUrl(), response.statusCode(), end - start)); - traceRequest(); - - final Document document = response.parse(); - - end = System.currentTimeMillis(); - ORF_LOGGER.trace(String.format("%s: %d - parsed in %d ms", aUrlDTO.getUrl(), response.statusCode(), end - start)); - - return document; - } -} From 988981c1675cc6cfdf6b7f15192a9686596b79b7 Mon Sep 17 00:00:00 2001 From: pidoubleyou Date: Sun, 24 Mar 2024 16:04:23 +0100 Subject: [PATCH 09/10] fix sonar --- .../crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java index 7f85ba22..5aaaab79 100644 --- a/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java +++ b/src/main/java/mServer/crawler/sender/orfon/json/OrfOnEpisodeDeserializer.java @@ -5,7 +5,6 @@ import com.google.gson.JsonElement; import com.google.gson.JsonParseException; import de.mediathekview.mlib.tool.Log; -import mServer.crawler.sender.base.GeoLocations; import mServer.crawler.sender.base.JsonUtils; import mServer.crawler.sender.base.Qualities; import mServer.crawler.sender.orfon.OrfHttpClient; @@ -16,7 +15,6 @@ import java.io.IOException; import java.lang.reflect.Type; -import java.net.URL; import java.time.Duration; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; From 7ba449bb6b419a7d769aacf7da7607936055c5a7 Mon Sep 17 00:00:00 2001 From: Alexander F Date: Thu, 28 Mar 2024 21:44:31 +0100 Subject: [PATCH 10/10] Version auf 3.1.230 angehoben 
--- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 6c64103e..6e5680a7 100644 --- a/build.gradle +++ b/build.gradle @@ -27,7 +27,7 @@ sourceCompatibility = JavaVersion.VERSION_17 targetCompatibility = JavaVersion.VERSION_17 group = 'de.mediathekview' archivesBaseName = "MServer" -version = '3.1.229' +version = '3.1.230' def jarName = 'MServer.jar' def mainClass = 'mServer.Main'