From 316b6add18a4472922bf72ddd0269074d0ea197b Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Tue, 20 Nov 2018 23:05:29 +0100 Subject: [PATCH 1/8] adding a crawler for detecting dead links on a page --- .../deadlinksniffer/README.adoc | 3 + crawler4j-examples/deadlinksniffer/pom.xml | 54 +++++ .../deadlinksniffer/DeadLinkCrawlConfig.java | 38 ++++ .../DeadLinkCrawlController.java | 209 ++++++++++++++++++ .../deadlinksniffer/DeadLinkCrawler.java | 192 ++++++++++++++++ .../src/main/resources/logback.xml | 26 +++ pom.xml | 1 + 7 files changed, 523 insertions(+) create mode 100644 crawler4j-examples/deadlinksniffer/README.adoc create mode 100644 crawler4j-examples/deadlinksniffer/pom.xml create mode 100644 crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java create mode 100644 crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java create mode 100644 crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java create mode 100644 crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml diff --git a/crawler4j-examples/deadlinksniffer/README.adoc b/crawler4j-examples/deadlinksniffer/README.adoc new file mode 100644 index 000000000..1911db307 --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/README.adoc @@ -0,0 +1,3 @@ += Crawler4j Dead Link Sniffer + +This application scans a web page for dead links. 
\ No newline at end of file diff --git a/crawler4j-examples/deadlinksniffer/pom.xml b/crawler4j-examples/deadlinksniffer/pom.xml new file mode 100644 index 000000000..9bddb0e4b --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/pom.xml @@ -0,0 +1,54 @@ + + + 4.0.0 + + + crawler4j-parent + edu.uci.ics + 4.5.0-SNAPSHOT + ../../pom.xml + + crawler4j-deadlinksniffer + + find dead links on a web page + https://github.com/yasserg/crawler4j + + + + edu.uci.ics + crawler4j + ${project.version} + + + + commons-cli + commons-cli + 1.4 + + + + + + + + org.codehaus.mojo + appassembler-maven-plugin + 2.0.0 + + + + edu.uci.ics.crawler4j.deadlinksniffer.DeadLinkCrawlController + DeadLinkSniffer + + + all + + + + + + + diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java new file mode 100644 index 000000000..fe04b373e --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package edu.uci.ics.crawler4j.deadlinksniffer; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +import edu.uci.ics.crawler4j.crawler.CrawlConfig; + +/** + * @author Mark Struberg + */ +public class DeadLinkCrawlConfig extends CrawlConfig { + private List urlPatterns = new ArrayList<>(); + + public List getUrlPatterns() { + return urlPatterns; + } + + public void addUrlPattern(String urlPattern) { + this.urlPatterns.add(Pattern.compile(urlPattern)); + } +} diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java new file mode 100644 index 000000000..2dc0b3017 --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java @@ -0,0 +1,209 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.deadlinksniffer; + +import edu.uci.ics.crawler4j.crawler.CrawlController; +import edu.uci.ics.crawler4j.fetcher.PageFetcher; +import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig; +import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Scan given web pages (seed) for dead links. + * + * @author Yasser Ganjisaffar + * @author Mark Struberg + */ +public class DeadLinkCrawlController { + private static final Logger logger = LoggerFactory.getLogger(DeadLinkCrawlController.class); + + public static void main(String[] args) throws Exception { + + Options options = new Options(); + + options.addRequiredOption("s", "seed", true, + "Seeding page where the crawling should get started from."); + + options.addOption("u", "url", true, + "Url Regular Expressions for pages which should get crawled. " + + "If not given the seed will act as a starting point"); + + options.addOption("?", "help", false, + "Print detailed infos about the usage."); + + options.addOption("t", "threads", true, + "Number of Threads to use for crawling. Defaults to 1."); + + options.addOption("o", "outDir", true, + "output Directory to store the downloaded pages and information. Defaults to ./crawl"); + + options.addOption("d", "delay", true, + "time delay between requests in ms. Defaults to 1000 (1 second)."); + + options.addOption("m", "maxdepth", true, + "Maximum Depth of Crawling. Defaults to 3."); + + options.addOption("p", "pages", true, + "Maximum number of pages to fetch. 
Defaults to 1000."); + + CommandLine cmd = null; + try { + CommandLineParser parser = new DefaultParser(); + cmd = parser.parse(options, args); + } + catch (ParseException pe) { + logger.info(pe.getMessage()); + printHelpAndExit(options); + } + + if (cmd.hasOption("?")) { + printHelpAndExit(options); + } + + DeadLinkCrawlConfig config = new DeadLinkCrawlConfig(); + + /* + * crawlStorageFolder is a folder where intermediate crawl data is + * stored. + */ + String crawlStorageFolder = + cmd.hasOption("o") + ? cmd.getOptionValue("o") + : "crawl"; + config.setCrawlStorageFolder(crawlStorageFolder); + + + /* + * Be polite: Make sure that we don't send more than 1 request per + * second (1000 milliseconds between requests). + */ + int delay = + cmd.hasOption("d") + ? Integer.parseInt(cmd.getOptionValue("d")) + : 1000; + config.setPolitenessDelay(delay); + + /* + * You can set the maximum crawl depth here. The default value is -1 for + * unlimited depth + */ + int maxDepth = + cmd.hasOption("m") + ? Integer.parseInt(cmd.getOptionValue("m")) + : 3; + config.setMaxDepthOfCrawling(maxDepth); + + /* + * You can set the maximum number of pages to crawl. The default value + * is -1 for unlimited number of pages + */ + int pages = + cmd.hasOption("p") + ? Integer.parseInt(cmd.getOptionValue("p")) + : 2000; + config.setMaxPagesToFetch(pages); + + + /* + * numberOfCrawlers shows the number of concurrent threads that should + * be initiated for crawling. + */ + int numberOfCrawlers = + cmd.hasOption("t") + ? Integer.parseInt(cmd.getOptionValue("t")) + : 1; + + if (cmd.hasOption("u")) { + String[] urlPatterns = cmd.getOptionValues("u"); + + for (String urlPattern : urlPatterns) { + config.addUrlPattern(urlPattern); + } + } + + /** + * Do you want crawler4j to crawl also binary data ? + * example: the contents of pdf, or the metadata of images etc + */ + config.setIncludeBinaryContentInCrawling(false); + + /* + * Do you need to set a proxy? 
If so, you can use: + * config.setProxyHost("proxyserver.example.com"); + * config.setProxyPort(8080); + * + * If your proxy also needs authentication: + * config.setProxyUsername(username); config.getProxyPassword(password); + */ + + /* + * This config parameter can be used to set your crawl to be resumable + * (meaning that you can resume the crawl from a previously + * interrupted/crashed crawl). Note: if you enable resuming feature and + * want to start a fresh crawl, you need to delete the contents of + * rootFolder manually. + */ + config.setResumableCrawling(false); + + + /* + * Instantiate the controller for this crawl. + */ + PageFetcher pageFetcher = new PageFetcher(config); + RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); + RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); + CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); + + /* + * For each crawl, you need to add some seed urls. These are the first + * URLs that are fetched and then the crawler starts following links + * which are found in these pages + */ + boolean addSeedsAsUrls = config.getUrlPatterns().isEmpty(); + String[] seeds = cmd.getOptionValues("s"); + for(String seed : seeds) { + controller.addSeed(seed); + if (addSeedsAsUrls) { + config.addUrlPattern("^" + seed + ".*"); + } + } + + + /* + * Start the crawl. This is a blocking operation, meaning that your code + * will reach the line after this only when crawling is finished. 
+ */ + controller.start(DeadLinkCrawler.class, numberOfCrawlers); + } + + private static void printHelpAndExit(Options options) { + HelpFormatter hf = new HelpFormatter(); + hf.printHelp("DeadLinkSniffer", options); + + System.exit(-1); + } + + +} \ No newline at end of file diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java new file mode 100644 index 000000000..6b423c0b0 --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java @@ -0,0 +1,192 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.deadlinksniffer; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Pattern; + +import edu.uci.ics.crawler4j.crawler.Page; +import edu.uci.ics.crawler4j.crawler.WebCrawler; +import edu.uci.ics.crawler4j.parser.HtmlParseData; +import edu.uci.ics.crawler4j.url.WebURL; +import org.apache.http.Header; + +/** + * TODO: Currently not thread safe! + * + * @author Yasser Ganjisaffar + * @author Mark Struberg + */ +public class DeadLinkCrawler extends WebCrawler { + + private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png|jpeg|css|js)$"); + + + private AtomicInteger maxVisits = new AtomicInteger(0); + + private File rootFolder; + private FileWriter brokenPages; + + + /** + * You should implement this function to specify whether the given url + * should be crawled or not (based on your crawling logic). + */ + @Override + public boolean shouldVisit(Page referringPage, WebURL url) { + String href = url.getURL().toLowerCase(); + // Ignore the url if it has an extension that matches our defined set of image extensions. + if (isImageLink(href)) { + return false; + } + + // Only accept the url if it is in the requested url domains. 
+ return ((DeadLinkCrawlConfig) getMyController().getConfig()).getUrlPatterns() + .stream() + .anyMatch(pattern -> pattern.matcher(href).matches()); + } + + @Override + protected boolean shouldFollowLinksIn(WebURL url) { + int visits = maxVisits.incrementAndGet(); + logger.info("Number of visits so far: {}", visits); + return true; + } + + @Override + protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) { + if (statusCode != 200) { + logger.info("\n\n FEHLERHAFTE SEITE status {} {} \n\n", statusCode, webUrl.getURL()); + try { + getBrokenPages().append("" + statusCode + ", " + webUrl.getURL() + ", " + webUrl.getParentUrl() + "\n"); + getBrokenPages().flush(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + /** + * This function is called when a page is fetched and ready to be processed + * by your program. + */ + @Override + public void visit(Page page) { + int docid = page.getWebURL().getDocid(); + String url = page.getWebURL().getURL(); + String domain = page.getWebURL().getDomain(); + String path = page.getWebURL().getPath(); + String subDomain = page.getWebURL().getSubDomain(); + String parentUrl = page.getWebURL().getParentUrl(); + String anchor = page.getWebURL().getAnchor(); + + logger.debug("Docid: {}", docid); + logger.info("URL: {}", url); + logger.debug("Domain: '{}'", domain); + logger.debug("Sub-domain: '{}'", subDomain); + logger.debug("Path: '{}'", path); + logger.debug("Parent page: {}", parentUrl); + logger.debug("Anchor text: {}", anchor); + + if (page.getParseData() instanceof HtmlParseData) { + HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); + String text = htmlParseData.getText(); + String html = htmlParseData.getHtml(); + Set links = htmlParseData.getOutgoingUrls(); + + logger.debug("Text length: {}", text.length()); + logger.debug("Html length: {}", html.length()); + logger.debug("Number of outgoing links: {}", links.size()); + + storeHtml(url, html); + } 
+ + Header[] responseHeaders = page.getFetchResponseHeaders(); + if (responseHeaders != null) { + logger.debug("Response heade rs:"); + for (Header header : responseHeaders) { + logger.debug("\t{}: {}", header.getName(), header.getValue()); + } + } + + logger.debug("============="); + } + + private void storeHtml(String url, String html) { + File f = new File(getRootFolder(), url.replace("/", "_")); + if (f.exists()) { + return; + } + try (FileWriter fw = new FileWriter(f)) { + fw.write(html); + } + catch (IOException e) { + logger.error("could not store file " + f.toString(), e); + } + } + + @Override + public void onBeforeExit() { + closeFile(brokenPages, "errorPages"); + } + + private void closeFile(FileWriter fw, String name) { + if (fw == null) { + return; + } + + try { + fw.close(); + } + catch (IOException e) { + logger.error("problem with closing" + name, e); + } + } + + public File getRootFolder() { + if (rootFolder == null) { + rootFolder = new File(getMyController().getConfig().getCrawlStorageFolder(), "content"); + rootFolder.mkdirs(); + } + return rootFolder; + } + + private boolean isImageLink(String href) { + return IMAGE_EXTENSIONS.matcher(href).matches(); + } + + + private FileWriter getBrokenPages() { + if (brokenPages == null) { + try { + brokenPages = new FileWriter(new File(getMyController().getConfig().getCrawlStorageFolder(), "brokenPages.csv")); + + } + catch (IOException e) { + throw new RuntimeException(e); + } + + } + return brokenPages; + } +} diff --git a/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml new file mode 100644 index 000000000..275f1d416 --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml @@ -0,0 +1,26 @@ + + + + + + + + %date{HH:mm:ss} %-5level [%thread] - [%logger{0}]- %msg%n + + + + + ${LOG_HOME}/${LOG_FILE_NAME} + + %date %-5level [%thread] - [%logger] - %msg%n + + + + + + + + + + + diff --git a/pom.xml 
b/pom.xml index a5af11dd6..bba99bc31 100644 --- a/pom.xml +++ b/pom.xml @@ -51,6 +51,7 @@ crawler4j crawler4j-examples/crawler4j-examples-base crawler4j-examples/crawler4j-examples-postgres + crawler4j-examples/deadlinksniffer From 8a8b3a2036218952595bdcf29826aef81cf7a5ab Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Wed, 21 Nov 2018 00:02:01 +0100 Subject: [PATCH 2/8] add more docs + minor fixes * also add pdf to the ignored files list * fix logging setup --- .../deadlinksniffer/README.adoc | 44 ++++++++++++++++++- crawler4j-examples/deadlinksniffer/pom.xml | 7 +++ .../DeadLinkCrawlController.java | 7 ++- .../deadlinksniffer/DeadLinkCrawler.java | 2 +- .../src/main/resources/logback.xml | 11 ----- 5 files changed, 56 insertions(+), 15 deletions(-) diff --git a/crawler4j-examples/deadlinksniffer/README.adoc b/crawler4j-examples/deadlinksniffer/README.adoc index 1911db307..d96d6083a 100644 --- a/crawler4j-examples/deadlinksniffer/README.adoc +++ b/crawler4j-examples/deadlinksniffer/README.adoc @@ -1,3 +1,45 @@ = Crawler4j Dead Link Sniffer -This application scans a web page for dead links. \ No newline at end of file +This application scans a web page for dead links. + +== Compiling + +The whole application can be built by using maven + +---- +$> mvn clean install +---- + +This will also bundle an executable application in `crawler4j-examples/deadlinksniffer/target/appassembler`. + +== Usage + +==== Getting more help +For getting the parameter description: +---- +$> ./bin/DeadLinkSniffer -? +---- + +==== Scanning a web page for dead links. +Example how to scan a sample page for dead links: +This will scan all sub pages which are reachable by all the `seed` (`-s`) pages given. +---- +$> ./bin/DeadLinkSniffer -s=http://mypage.org +---- + +You can also define which URLs should be accessed via a list of regExp parameters `-u`. +For defining multiple rules, simply add multiple `-u` parameters. 
+ +---- +$> ./bin/DeadLinkSniffer -s=http://mypage.org -u="https://.*mypage.org.*" +---- + +==== Output + +By default the output files are in `./crawl`. +The output directory can be specified with the `-o` parameter. + +The output directory contains a file `brokenPages.csv` which contains all broken links. +The first row is the HTTP status, e.g. 404 for 'not found'. +The second row is the name of the resource which is missing. +The third row is the html page on which the dead link was found. \ No newline at end of file diff --git a/crawler4j-examples/deadlinksniffer/pom.xml b/crawler4j-examples/deadlinksniffer/pom.xml index 9bddb0e4b..2245bdcaa 100644 --- a/crawler4j-examples/deadlinksniffer/pom.xml +++ b/crawler4j-examples/deadlinksniffer/pom.xml @@ -46,6 +46,13 @@ all + + + bundle + package + assemble + + diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java index 2dc0b3017..fa27650aa 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java @@ -66,7 +66,7 @@ public static void main(String[] args) throws Exception { "Maximum Depth of Crawling. Defaults to 3."); options.addOption("p", "pages", true, - "Maximum number of pages to fetch. Defaults to 1000."); + "Maximum number of pages to fetch. Defaults to 2000."); CommandLine cmd = null; try { @@ -200,7 +200,10 @@ public static void main(String[] args) throws Exception { private static void printHelpAndExit(Options options) { HelpFormatter hf = new HelpFormatter(); - hf.printHelp("DeadLinkSniffer", options); + hf.printHelp("\n\tDeadLinkSniffer -? 
- for help"+ + "\n\tDeadLinkSniffer -s=http://mypage.org - for scanning this page" + + "\n\tDeadLinkSniffer -s=http://mypage.org -u=\"https://.*mypage.org.*\" - for scanning this page with all subdomains, etc" + , options); System.exit(-1); } diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java index 6b423c0b0..e97e1c140 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java @@ -38,7 +38,7 @@ */ public class DeadLinkCrawler extends WebCrawler { - private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png|jpeg|css|js)$"); + private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png|jpeg|css|js|pdf)$"); private AtomicInteger maxVisits = new AtomicInteger(0); diff --git a/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml index 275f1d416..c716bfa50 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml +++ b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml @@ -1,7 +1,5 @@ - - @@ -9,18 +7,9 @@ - - ${LOG_HOME}/${LOG_FILE_NAME} - - %date %-5level [%thread] - [%logger] - %msg%n - - - - - From bb7d4f9273b0c40c5c8be79a1d116c7218cdc01a Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Wed, 21 Nov 2018 13:37:57 +0100 Subject: [PATCH 3/8] move output to single service to avoid concurrency issues --- .../deadlinksniffer/DeadLinkCrawlConfig.java | 13 +++ .../DeadLinkCrawlController.java | 5 ++ .../deadlinksniffer/DeadLinkCrawler.java | 45 +++-------- .../deadlinksniffer/DeadLinkCrawlerStore.java | 81 +++++++++++++++++++ 4 files changed, 109 
insertions(+), 35 deletions(-) create mode 100644 crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java index fe04b373e..0c7760397 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java @@ -27,6 +27,7 @@ */ public class DeadLinkCrawlConfig extends CrawlConfig { private List urlPatterns = new ArrayList<>(); + private volatile DeadLinkCrawlerStore crawlerStore; public List getUrlPatterns() { return urlPatterns; @@ -35,4 +36,16 @@ public List getUrlPatterns() { public void addUrlPattern(String urlPattern) { this.urlPatterns.add(Pattern.compile(urlPattern)); } + + public DeadLinkCrawlerStore getCrawlerStore() { + if (crawlerStore == null) { + synchronized (this) { + if (crawlerStore == null) { + crawlerStore = new DeadLinkCrawlerStore(this); + } + } + } + + return crawlerStore; + } } diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java index fa27650aa..b360e3ab4 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java @@ -167,6 +167,11 @@ public static void main(String[] args) throws Exception { */ config.setResumableCrawling(false); + /* + * Yes, we gonna follow a HTTP-301 + */ + 
config.setFollowRedirects(true); + /* * Instantiate the controller for this crawl. diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java index e97e1c140..248460bfa 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java @@ -29,6 +29,7 @@ import edu.uci.ics.crawler4j.parser.HtmlParseData; import edu.uci.ics.crawler4j.url.WebURL; import org.apache.http.Header; +import org.apache.http.HttpStatus; /** * TODO: Currently not thread safe! @@ -44,7 +45,6 @@ public class DeadLinkCrawler extends WebCrawler { private AtomicInteger maxVisits = new AtomicInteger(0); private File rootFolder; - private FileWriter brokenPages; /** @@ -74,15 +74,12 @@ protected boolean shouldFollowLinksIn(WebURL url) { @Override protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) { - if (statusCode != 200) { + if (statusCode != HttpStatus.SC_OK && + statusCode != HttpStatus.SC_TEMPORARY_REDIRECT && + statusCode != HttpStatus.SC_MOVED_TEMPORARILY && + statusCode != HttpStatus.SC_MOVED_PERMANENTLY) { logger.info("\n\n FEHLERHAFTE SEITE status {} {} \n\n", statusCode, webUrl.getURL()); - try { - getBrokenPages().append("" + statusCode + ", " + webUrl.getURL() + ", " + webUrl.getParentUrl() + "\n"); - getBrokenPages().flush(); - } - catch (IOException e) { - throw new RuntimeException(e); - } + getConfig().getCrawlerStore().storePageStatus(statusCode, webUrl); } } @@ -147,25 +144,16 @@ private void storeHtml(String url, String html) { @Override public void onBeforeExit() { - closeFile(brokenPages, "errorPages"); + getConfig().getCrawlerStore().close(); } - private void closeFile(FileWriter fw, String name) { - if 
(fw == null) { - return; - } - - try { - fw.close(); - } - catch (IOException e) { - logger.error("problem with closing" + name, e); - } + private DeadLinkCrawlConfig getConfig() { + return (DeadLinkCrawlConfig) getMyController().getConfig(); } public File getRootFolder() { if (rootFolder == null) { - rootFolder = new File(getMyController().getConfig().getCrawlStorageFolder(), "content"); + rootFolder = new File(getConfig().getCrawlStorageFolder(), "content"); rootFolder.mkdirs(); } return rootFolder; @@ -176,17 +164,4 @@ private boolean isImageLink(String href) { } - private FileWriter getBrokenPages() { - if (brokenPages == null) { - try { - brokenPages = new FileWriter(new File(getMyController().getConfig().getCrawlStorageFolder(), "brokenPages.csv")); - - } - catch (IOException e) { - throw new RuntimeException(e); - } - - } - return brokenPages; - } } diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java new file mode 100644 index 000000000..1696a5ae5 --- /dev/null +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package edu.uci.ics.crawler4j.deadlinksniffer; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; + +import edu.uci.ics.crawler4j.url.WebURL; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Stores information about erroneous pages to the disk. + * + * @author Mark Struberg + */ +public class DeadLinkCrawlerStore { + private static final Logger logger = LoggerFactory.getLogger(DeadLinkCrawlerStore.class); + + private final DeadLinkCrawlConfig config; + + private FileWriter brokenPages; + + + protected DeadLinkCrawlerStore(DeadLinkCrawlConfig config) { + this.config = config; + try { + brokenPages = new FileWriter(new File(config.getCrawlStorageFolder(), "brokenPages.csv")); + } + catch (IOException e) { + throw new RuntimeException(e); + } + + } + + + + + public synchronized void close() { + closeFile(brokenPages, "errorPages"); + brokenPages = null; + } + + private void closeFile(FileWriter fw, String name) { + if (fw == null) { + return; + } + + try { + fw.close(); + } + catch (IOException e) { + logger.error("problem with closing" + name, e); + } + } + + public synchronized void storePageStatus(int statusCode, WebURL webUrlFail) { + try { + brokenPages.append("" + statusCode + ", " + webUrlFail.getURL() + ", " + webUrlFail.getParentUrl() + "\n"); + brokenPages.flush(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } +} From 055129f78c604c210db258fac98a736d925fd1a2 Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Sun, 25 Nov 2018 14:39:18 +0100 Subject: [PATCH 4/8] add 
detection of images without 'alt' tag. --- .../deadlinksniffer/DeadLinkCrawler.java | 11 ++++- .../deadlinksniffer/DeadLinkCrawlerStore.java | 37 +++++++++++++++- .../crawler4j/parser/HtmlContentHandler.java | 19 ++++++++ .../ics/crawler4j/parser/HtmlParseData.java | 10 +++++ .../uci/ics/crawler4j/parser/ImageData.java | 44 +++++++++++++++++++ .../ics/crawler4j/parser/TikaHtmlParser.java | 2 + .../edu/uci/ics/crawler4j/url/WebURL.java | 12 +++++ 7 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java index 248460bfa..87701ea1c 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.List; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; @@ -27,6 +28,7 @@ import edu.uci.ics.crawler4j.crawler.Page; import edu.uci.ics.crawler4j.crawler.WebCrawler; import edu.uci.ics.crawler4j.parser.HtmlParseData; +import edu.uci.ics.crawler4j.parser.ImageData; import edu.uci.ics.crawler4j.url.WebURL; import org.apache.http.Header; import org.apache.http.HttpStatus; @@ -79,7 +81,7 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status statusCode != HttpStatus.SC_MOVED_TEMPORARILY && statusCode != HttpStatus.SC_MOVED_PERMANENTLY) { logger.info("\n\n FEHLERHAFTE SEITE status {} {} \n\n", statusCode, webUrl.getURL()); - getConfig().getCrawlerStore().storePageStatus(statusCode, webUrl); + 
getConfig().getCrawlerStore().storePageStatus(statusCode, webUrl); } } @@ -116,6 +118,13 @@ public void visit(Page page) { logger.debug("Number of outgoing links: {}", links.size()); storeHtml(url, html); + + List imageDatas = htmlParseData.getImageData(); + int imgNr = 0; + for (ImageData imageData : imageDatas) { + imgNr++; + getConfig().getCrawlerStore().storeImageInfo(page, imgNr, imageData); + } } Header[] responseHeaders = page.getFetchResponseHeaders(); diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java index 1696a5ae5..464421fd4 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java @@ -20,6 +20,8 @@ import java.io.FileWriter; import java.io.IOException; +import edu.uci.ics.crawler4j.crawler.Page; +import edu.uci.ics.crawler4j.parser.ImageData; import edu.uci.ics.crawler4j.url.WebURL; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,12 +37,14 @@ public class DeadLinkCrawlerStore { private final DeadLinkCrawlConfig config; private FileWriter brokenPages; + private FileWriter imageWoAlt; protected DeadLinkCrawlerStore(DeadLinkCrawlConfig config) { this.config = config; try { brokenPages = new FileWriter(new File(config.getCrawlStorageFolder(), "brokenPages.csv")); + imageWoAlt = new FileWriter(new File(config.getCrawlStorageFolder(), "imageWoAlt.csv")); } catch (IOException e) { throw new RuntimeException(e); @@ -52,8 +56,10 @@ protected DeadLinkCrawlerStore(DeadLinkCrawlConfig config) { public synchronized void close() { - closeFile(brokenPages, "errorPages"); + closeFile(brokenPages, "brokenPages"); + closeFile(imageWoAlt, "imageWoAlt"); brokenPages = null; + imageWoAlt 
= null; } private void closeFile(FileWriter fw, String name) { @@ -78,4 +84,33 @@ public synchronized void storePageStatus(int statusCode, WebURL webUrlFail) { throw new RuntimeException(e); } } + + public synchronized void storeImageInfo(Page page, int imgNr, ImageData imageData) { + // log all images with missing alt tag + if (!imageData.getAttrVals().containsKey("alt") || imageData.getAttrVals().get("alt").isEmpty()) { + String url = page.getWebURL().getURL(); + logger.info("\n\n IMAGE without 'alt' tag on page {} img: {}", url, imageData.getSrc()); + try { + String src = imageData.getSrc(); + String imgLink; + if (src.startsWith("https://") || src.startsWith("http://")) { + // absolute image + imgLink = src; + } + else if (src.startsWith("/")) { + // server-root relative image + imgLink = page.getWebURL().getRootUrl() + src; + } + else { + // relative image + imgLink = page.getWebURL().getRootUrl() + page.getWebURL().getPath() + "/" + src; + } + imageWoAlt.append(url + ", " + imgNr + ", " + src + ", " + imgLink + "\n"); + imageWoAlt.flush(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + } } diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java index 67676afcb..1dc428eb5 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java @@ -18,6 +18,7 @@ package edu.uci.ics.crawler4j.parser; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -68,6 +69,7 @@ public static Element getElement(String name) { private final StringBuilder bodyText; private final List outgoingUrls; + private final List imageData; private ExtractedUrlAnchorPair curUrl = null; private boolean anchorFlag = false; @@ -77,6 +79,7 @@ public HtmlContentHandler() { 
isWithinBodyElement = false; bodyText = new StringBuilder(); outgoingUrls = new ArrayList<>(); + imageData = new ArrayList<>(); } @Override @@ -94,7 +97,19 @@ public void startElement(String uri, String localName, String qName, Attributes String imgSrc = attributes.getValue("src"); if (imgSrc != null) { addToOutgoingUrls(imgSrc, localName); + + Map attrVals; + if (attributes.getLength() == 0) { + attrVals = Collections.emptyMap(); + } else { + attrVals = new HashMap<>(); + for (int i = 0; i < attributes.getLength(); i++) { + attrVals.put(attributes.getLocalName(i), attributes.getValue(i)); + } + } + imageData.add(new ImageData(imgSrc, attrVals)); } + } else if ((element == Element.IFRAME) || (element == Element.FRAME) || (element == Element.EMBED) || (element == Element.SCRIPT)) { String src = attributes.getValue("src"); @@ -209,4 +224,8 @@ public String getBaseUrl() { public Map getMetaTags() { return metaTags; } + + public List getImageData() { + return imageData; + } } diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java index cac68fb75..39ffb7f07 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java @@ -17,6 +17,7 @@ package edu.uci.ics.crawler4j.parser; +import java.util.List; import java.util.Map; import java.util.Set; @@ -31,6 +32,7 @@ public class HtmlParseData implements ParseData { private Set outgoingUrls; private String contentCharset; + private List imageData; public String getHtml() { return html; @@ -78,6 +80,10 @@ public void setOutgoingUrls(Set outgoingUrls) { this.outgoingUrls = outgoingUrls; } + public void setImageData(List imageData) { + this.imageData = imageData; + } + @Override public String toString() { return text; @@ -90,4 +96,8 @@ public void setContentCharset(String contentCharset) { public String getContentCharset() { return 
contentCharset; } + + public List getImageData() { + return imageData; + } } \ No newline at end of file diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java new file mode 100644 index 000000000..14f1859de --- /dev/null +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package edu.uci.ics.crawler4j.parser; + +import java.util.Map; + +/** + * Information about images on a page. + * Can be used to e.g. detect images without an 'alt' tag. 
/**
 * Information about a single image (img tag) found on a page.
 * Can be used to e.g. detect images without an 'alt' attribute.
 *
 * @author Mark Struberg
 */
public class ImageData {

    /** Value of the img {@code src} attribute. */
    private final String src;

    /** All attributes of the img tag, keyed by their local name (e.g. "alt"). */
    private final Map<String, String> attrVals;

    public ImageData(String src, Map<String, String> attrVals) {
        this.src = src;
        this.attrVals = attrVals;
    }

    public String getSrc() {
        return src;
    }

    public Map<String, String> getAttrVals() {
        return attrVals;
    }
}
for the url http://somesub.domain.org/myapp?idx=4 this would be + * http://somesub.domain.org/ + */ + public String getRootUrl() { + return rootUrl; + } + /** * @return * crawl depth at which this Url is first observed. Seed Urls From 9fcfb415c8807c8ab52d3e003e3ac8dd8bf64c79 Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Sun, 2 Dec 2018 16:42:11 +0100 Subject: [PATCH 5/8] disable anything which touches palantir! --- pom.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pom.xml b/pom.xml index bba99bc31..06f79abe7 100644 --- a/pom.xml +++ b/pom.xml @@ -50,7 +50,10 @@ crawler4j crawler4j-examples/crawler4j-examples-base + crawler4j-examples/deadlinksniffer From c21fce92ba406477f98aca650ba999615a5e06fe Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Sun, 2 Dec 2018 17:34:07 +0100 Subject: [PATCH 6/8] add a CSV header --- .../ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java index 464421fd4..59f583d65 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java @@ -44,7 +44,12 @@ protected DeadLinkCrawlerStore(DeadLinkCrawlConfig config) { this.config = config; try { brokenPages = new FileWriter(new File(config.getCrawlStorageFolder(), "brokenPages.csv")); + brokenPages.append("status, url, parent_url\n"); + brokenPages.flush(); + imageWoAlt = new FileWriter(new File(config.getCrawlStorageFolder(), "imageWoAlt.csv")); + imageWoAlt.append("onPage, imageNr, imgSrc, imgLink\n"); + imageWoAlt.flush(); } catch (IOException e) { throw new RuntimeException(e); From 
c637618aa6be2cd7741c81ce69f17dddc6870d27 Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Sun, 2 Dec 2018 17:36:46 +0100 Subject: [PATCH 7/8] add -x for excluding pages --- .../deadlinksniffer/DeadLinkCrawlConfig.java | 18 ++++++++++++++++++ .../DeadLinkCrawlController.java | 13 ++++++++++++- .../deadlinksniffer/DeadLinkCrawler.java | 14 ++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java index 0c7760397..b31eac886 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java @@ -28,15 +28,33 @@ public class DeadLinkCrawlConfig extends CrawlConfig { private List urlPatterns = new ArrayList<>(); private volatile DeadLinkCrawlerStore crawlerStore; + private List excludePatterns = new ArrayList<>(); public List getUrlPatterns() { return urlPatterns; } + public List getExcludePatterns() { + return excludePatterns; + } + + /** + * Add a regular expression for URLs which should be followed + * by the crawler. + */ public void addUrlPattern(String urlPattern) { this.urlPatterns.add(Pattern.compile(urlPattern)); } + /** + * Add a regular expression for URLs which should be excluded from scanning. + * This is effectively a stop-criterium and will get evaluated + * after all the patterns added via {@link #addUrlPattern(String)}. 
+ */ + public void addExcludePattern(String excludePattern) { + this.excludePatterns.add(Pattern.compile(excludePattern)); + } + public DeadLinkCrawlerStore getCrawlerStore() { if (crawlerStore == null) { synchronized (this) { diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java index b360e3ab4..c79401086 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java @@ -47,9 +47,12 @@ public static void main(String[] args) throws Exception { "Seeding page where the crawling should get started from."); options.addOption("u", "url", true, - "Url Regular Expressions for pages which should get crawled. " + + "Regular Expressions for page URLs which should get crawled. " + "If not given the seed will act as a starting point"); + options.addOption("x", "exclude", true, + "Regular Expressions for page URLs which should NOT get crawled."); + options.addOption("?", "help", false, "Print detailed infos about the usage."); @@ -143,6 +146,14 @@ public static void main(String[] args) throws Exception { } } + if (cmd.hasOption("x")) { + String[] urlPatterns = cmd.getOptionValues("x"); + + for (String urlPattern : urlPatterns) { + config.addExcludePattern(urlPattern); + } + } + /** * Do you want crawler4j to crawl also binary data ? 
* example: the contents of pdf, or the metadata of images etc diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java index 87701ea1c..136da50bf 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java @@ -62,9 +62,19 @@ public boolean shouldVisit(Page referringPage, WebURL url) { } // Only accept the url if it is in the requested url domains. - return ((DeadLinkCrawlConfig) getMyController().getConfig()).getUrlPatterns() + if (!((DeadLinkCrawlConfig) getMyController().getConfig()).getUrlPatterns() .stream() - .anyMatch(pattern -> pattern.matcher(href).matches()); + .anyMatch(pattern -> pattern.matcher(href).matches())) { + return false; + } + + // and also only if the url is not explicitly excluded + if (((DeadLinkCrawlConfig) getMyController().getConfig()).getExcludePatterns() + .stream() + .anyMatch(pattern -> pattern.matcher(href).matches())) { + return false; + } + return true; } @Override From 719b0f66e14555be5ca23f2341b798d7865d2372 Mon Sep 17 00:00:00 2001 From: Mark Struberg Date: Fri, 7 Dec 2018 13:41:31 +0100 Subject: [PATCH 8/8] report subsequent error links Previously only the first link which leads to an error page got reported. This change will keep track of those links and also write a report if the same link is used on a subsequent page. We now also write subfolders per domain. 
--- .../deadlinksniffer/DeadLinkCrawler.java | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java index 136da50bf..9c25127f5 100644 --- a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java +++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java @@ -20,8 +20,12 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; @@ -48,6 +52,11 @@ public class DeadLinkCrawler extends WebCrawler { private File rootFolder; + /** + * contains all broken Urls detected in {@link #handlePageStatusCode(WebURL, int, String)} + */ + private ConcurrentMap brokenUrls = new ConcurrentHashMap(); + /** * You should implement this function to specify whether the given url @@ -91,6 +100,7 @@ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String status statusCode != HttpStatus.SC_MOVED_TEMPORARILY && statusCode != HttpStatus.SC_MOVED_PERMANENTLY) { logger.info("\n\n FEHLERHAFTE SEITE status {} {} \n\n", statusCode, webUrl.getURL()); + brokenUrls.put(webUrl.getURL(), statusCode); getConfig().getCrawlerStore().storePageStatus(statusCode, webUrl); } } @@ -127,7 +137,14 @@ public void visit(Page page) { logger.debug("Html length: {}", html.length()); logger.debug("Number of outgoing links: {}", links.size()); - storeHtml(url, html); + for (WebURL link : links) { + if (brokenUrls.keySet().contains(link.getURL())) { + 
getConfig().getCrawlerStore().storePageStatus(brokenUrls.get(link.getURL()), link); + } + } + + storeHtml(page.getWebURL(), html); + List imageDatas = htmlParseData.getImageData(); int imgNr = 0; @@ -148,8 +165,15 @@ public void visit(Page page) { logger.debug("============="); } - private void storeHtml(String url, String html) { - File f = new File(getRootFolder(), url.replace("/", "_")); + private void storeHtml(WebURL webURL, String html) { + String rootUrl = webURL.getRootUrl(); + File rootUrlDir = new File(getRootFolder(), rootUrl.replace("/", "_")); + if (!rootUrlDir.exists()) { + rootUrlDir.mkdir(); + } + + + File f = new File(rootUrlDir, webURL.getURL().replace("/", "_")); if (f.exists()) { return; }