diff --git a/crawler4j-examples/deadlinksniffer/README.adoc b/crawler4j-examples/deadlinksniffer/README.adoc
new file mode 100644
index 000000000..d96d6083a
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/README.adoc
@@ -0,0 +1,45 @@
+= Crawler4j Dead Link Sniffer
+
+This application crawls a web site, starting from one or more seed pages, and reports all dead links it finds.
+
+== Compiling
+
+The whole application can be built with Maven:
+
+----
+$> mvn clean install
+----
+
+This will also bundle an executable application in `crawler4j-examples/deadlinksniffer/target/appassembler`.
+
+== Usage
+
+=== Getting more help
+To print a description of all parameters:
+----
+$> ./bin/DeadLinkSniffer -?
+----
+
+=== Scanning a web site for dead links
+The following example scans a sample page for dead links.
+The crawler will visit all sub pages which are reachable from the given seed (`-s`) pages:
+----
+$> ./bin/DeadLinkSniffer -s=http://mypage.org
+----
+
+You can also restrict which URLs get crawled via regular expressions passed with the `-u` parameter.
+To define multiple rules, simply pass multiple `-u` parameters.
+
+----
+$> ./bin/DeadLinkSniffer -s=http://mypage.org -u="https://.*mypage.org.*"
+----
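+
+The `-x` (exclude) parameter works the same way for URLs which should be skipped.
+A combined invocation might look like this (the URLs and expressions are purely illustrative):
+
+----
+$> ./bin/DeadLinkSniffer -s=http://mypage.org -u="https?://.*mypage.org.*" -x=".*/printview/.*"
+----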
+
+=== Output
+
+By default the output files are written to `./crawl`.
+A different output directory can be specified with the `-o` parameter.
+
+The output directory contains a file `brokenPages.csv` which lists all broken links.
+The first column is the HTTP status code, e.g. 404 for 'not found'.
+The second column is the URL of the missing resource.
+The third column is the HTML page on which the dead link was found.
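+
+For illustration, a resulting `brokenPages.csv` might look like this (example data):
+
+----
+status, url, parent_url
+404, http://mypage.org/images/old-logo.png, http://mypage.org/about.html
+500, http://mypage.org/api/stats, http://mypage.org/index.html
+----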
\ No newline at end of file
diff --git a/crawler4j-examples/deadlinksniffer/pom.xml b/crawler4j-examples/deadlinksniffer/pom.xml
new file mode 100644
index 000000000..2245bdcaa
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/pom.xml
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <artifactId>crawler4j-parent</artifactId>
+        <groupId>edu.uci.ics</groupId>
+        <version>4.5.0-SNAPSHOT</version>
+        <relativePath>../../pom.xml</relativePath>
+    </parent>
+
+    <artifactId>crawler4j-deadlinksniffer</artifactId>
+
+    <description>find dead links on a web page</description>
+    <url>https://github.com/yasserg/crawler4j</url>
+
+    <dependencies>
+        <dependency>
+            <groupId>edu.uci.ics</groupId>
+            <artifactId>crawler4j</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-cli</groupId>
+            <artifactId>commons-cli</artifactId>
+            <version>1.4</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>appassembler-maven-plugin</artifactId>
+                <version>2.0.0</version>
+                <configuration>
+                    <programs>
+                        <program>
+                            <mainClass>edu.uci.ics.crawler4j.deadlinksniffer.DeadLinkCrawlController</mainClass>
+                            <name>DeadLinkSniffer</name>
+                        </program>
+                    </programs>
+                    <platforms>
+                        <platform>all</platform>
+                    </platforms>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>bundle</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>assemble</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java
new file mode 100644
index 000000000..b31eac886
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
+
+/**
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawlConfig extends CrawlConfig {
+ private List<Pattern> urlPatterns = new ArrayList<>();
+ private volatile DeadLinkCrawlerStore crawlerStore;
+ private List<Pattern> excludePatterns = new ArrayList<>();
+
+ public List<Pattern> getUrlPatterns() {
+ return urlPatterns;
+ }
+
+ public List<Pattern> getExcludePatterns() {
+ return excludePatterns;
+ }
+
+ /**
+ * Add a regular expression for URLs which should be followed
+ * by the crawler.
+ */
+ public void addUrlPattern(String urlPattern) {
+ this.urlPatterns.add(Pattern.compile(urlPattern));
+ }
+
+ /**
+ * Add a regular expression for URLs which should be excluded from scanning.
+ * This is effectively a stop criterion and gets evaluated
+ * after all the patterns added via {@link #addUrlPattern(String)}.
+ */
+ public void addExcludePattern(String excludePattern) {
+ this.excludePatterns.add(Pattern.compile(excludePattern));
+ }
+
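+ /**
+ * Returns the single {@link DeadLinkCrawlerStore} used for this crawl.
+ * The store is created lazily via double-checked locking on the volatile
+ * {@code crawlerStore} field, so all crawler threads share one instance.
+ */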
+ public DeadLinkCrawlerStore getCrawlerStore() {
+ if (crawlerStore == null) {
+ synchronized (this) {
+ if (crawlerStore == null) {
+ crawlerStore = new DeadLinkCrawlerStore(this);
+ }
+ }
+ }
+
+ return crawlerStore;
+ }
+}
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java
new file mode 100644
index 000000000..c79401086
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java
@@ -0,0 +1,228 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import edu.uci.ics.crawler4j.crawler.CrawlController;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Scan given web pages (seed) for dead links.
+ *
+ * @author Yasser Ganjisaffar
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawlController {
+ private static final Logger logger = LoggerFactory.getLogger(DeadLinkCrawlController.class);
+
+ public static void main(String[] args) throws Exception {
+
+ Options options = new Options();
+
+ options.addRequiredOption("s", "seed", true,
+ "Seed page from which the crawling should get started.");
+
+ options.addOption("u", "url", true,
+ "Regular expressions for page URLs which should get crawled. " +
+ "If not given, each seed URL is used as a prefix pattern.");
+
+ options.addOption("x", "exclude", true,
+ "Regular expressions for page URLs which should NOT get crawled.");
+
+ options.addOption("?", "help", false,
+ "Print detailed information about the usage.");
+
+ options.addOption("t", "threads", true,
+ "Number of threads to use for crawling. Defaults to 1.");
+
+ options.addOption("o", "outDir", true,
+ "Output directory to store the downloaded pages and information. Defaults to ./crawl");
+
+ options.addOption("d", "delay", true,
+ "Time delay between requests in ms. Defaults to 1000 (1 second).");
+
+ options.addOption("m", "maxdepth", true,
+ "Maximum crawl depth. Defaults to 3.");
+
+ options.addOption("p", "pages", true,
+ "Maximum number of pages to fetch. Defaults to 2000.");
+
+ CommandLine cmd = null;
+ try {
+ CommandLineParser parser = new DefaultParser();
+ cmd = parser.parse(options, args);
+ }
+ catch (ParseException pe) {
+ logger.info(pe.getMessage());
+ printHelpAndExit(options);
+ }
+
+ if (cmd.hasOption("?")) {
+ printHelpAndExit(options);
+ }
+
+ DeadLinkCrawlConfig config = new DeadLinkCrawlConfig();
+
+ /*
+ * crawlStorageFolder is a folder where intermediate crawl data is
+ * stored.
+ */
+ String crawlStorageFolder =
+ cmd.hasOption("o")
+ ? cmd.getOptionValue("o")
+ : "crawl";
+ config.setCrawlStorageFolder(crawlStorageFolder);
+
+
+ /*
+ * Be polite: Make sure that we don't send more than 1 request per
+ * second (1000 milliseconds between requests).
+ */
+ int delay =
+ cmd.hasOption("d")
+ ? Integer.parseInt(cmd.getOptionValue("d"))
+ : 1000;
+ config.setPolitenessDelay(delay);
+
+ /*
+ * You can set the maximum crawl depth here. crawler4j's default is -1
+ * (unlimited depth); this example defaults to 3.
+ */
+ int maxDepth =
+ cmd.hasOption("m")
+ ? Integer.parseInt(cmd.getOptionValue("m"))
+ : 3;
+ config.setMaxDepthOfCrawling(maxDepth);
+
+ /*
+ * You can set the maximum number of pages to crawl. crawler4j's default
+ * is -1 (unlimited); this example defaults to 2000.
+ */
+ int pages =
+ cmd.hasOption("p")
+ ? Integer.parseInt(cmd.getOptionValue("p"))
+ : 2000;
+ config.setMaxPagesToFetch(pages);
+
+
+ /*
+ * numberOfCrawlers is the number of concurrent threads that should
+ * be used for crawling.
+ */
+ int numberOfCrawlers =
+ cmd.hasOption("t")
+ ? Integer.parseInt(cmd.getOptionValue("t"))
+ : 1;
+
+ if (cmd.hasOption("u")) {
+ String[] urlPatterns = cmd.getOptionValues("u");
+
+ for (String urlPattern : urlPatterns) {
+ config.addUrlPattern(urlPattern);
+ }
+ }
+
+ if (cmd.hasOption("x")) {
+ String[] urlPatterns = cmd.getOptionValues("x");
+
+ for (String urlPattern : urlPatterns) {
+ config.addExcludePattern(urlPattern);
+ }
+ }
+
+ /*
+ * Do you want crawler4j to also crawl binary data?
+ * For example the contents of PDFs, or the metadata of images, etc.
+ */
+ config.setIncludeBinaryContentInCrawling(false);
+
+ /*
+ * Do you need to set a proxy? If so, you can use:
+ * config.setProxyHost("proxyserver.example.com");
+ * config.setProxyPort(8080);
+ *
+ * If your proxy also needs authentication:
+ * config.setProxyUsername(username); config.setProxyPassword(password);
+ */
+
+ /*
+ * This config parameter can be used to set your crawl to be resumable
+ * (meaning that you can resume the crawl from a previously
+ * interrupted/crashed crawl). Note: if you enable the resumable feature and
+ * want to start a fresh crawl, you need to delete the contents of
+ * rootFolder manually.
+ */
+ config.setResumableCrawling(false);
+
+ /*
+ * Yes, we are going to follow HTTP 301 redirects.
+ */
+ config.setFollowRedirects(true);
+
+
+ /*
+ * Instantiate the controller for this crawl.
+ */
+ PageFetcher pageFetcher = new PageFetcher(config);
+ RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
+ RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+ CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
+
+ /*
+ * For each crawl, you need to add some seed urls. These are the first
+ * URLs that are fetched and then the crawler starts following links
+ * which are found in these pages
+ */
+ boolean addSeedsAsUrls = config.getUrlPatterns().isEmpty();
+ String[] seeds = cmd.getOptionValues("s");
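+ // If no explicit -u patterns were given, register each seed URL as a
+ // prefix pattern so that only pages below the seeds get crawled.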
+ for(String seed : seeds) {
+ controller.addSeed(seed);
+ if (addSeedsAsUrls) {
+ config.addUrlPattern("^" + seed + ".*");
+ }
+ }
+
+
+ /*
+ * Start the crawl. This is a blocking operation, meaning that your code
+ * will reach the line after this only when crawling is finished.
+ */
+ controller.start(DeadLinkCrawler.class, numberOfCrawlers);
+ }
+
+ private static void printHelpAndExit(Options options) {
+ HelpFormatter hf = new HelpFormatter();
+ hf.printHelp("\n\tDeadLinkSniffer -? - for help"+
+ "\n\tDeadLinkSniffer -s=http://mypage.org - for scanning this page" +
+ "\n\tDeadLinkSniffer -s=http://mypage.org -u=\"https://.*mypage.org.*\" - for scanning this page with all subdomains, etc"
+ , options);
+
+ System.exit(-1);
+ }
+
+
+}
\ No newline at end of file
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java
new file mode 100644
index 000000000..9c25127f5
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java
@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
+import edu.uci.ics.crawler4j.parser.ImageData;
+import edu.uci.ics.crawler4j.url.WebURL;
+import org.apache.http.Header;
+import org.apache.http.HttpStatus;
+
+/**
+ * TODO: Currently not thread safe!
+ *
+ * @author Yasser Ganjisaffar
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawler extends WebCrawler {
+
+ private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png|jpeg|css|js|pdf)$");
+
+
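+ /**
+ * Counts how many pages have been visited so far.
+ * Currently only used for the progress log in {@link #shouldFollowLinksIn(WebURL)}.
+ */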
+ private AtomicInteger maxVisits = new AtomicInteger(0);
+
+ private File rootFolder;
+
+ /**
+ * Contains all broken URLs detected in {@link #handlePageStatusCode(WebURL, int, String)}.
+ */
+ private ConcurrentMap<String, Integer> brokenUrls = new ConcurrentHashMap<>();
+
+
+ /**
+ * You should implement this function to specify whether the given url
+ * should be crawled or not (based on your crawling logic).
+ */
+ @Override
+ public boolean shouldVisit(Page referringPage, WebURL url) {
+ String href = url.getURL().toLowerCase();
+ // Ignore the url if its extension matches our defined set of image and resource extensions.
+ if (isImageLink(href)) {
+ return false;
+ }
+
+ // Only accept the url if it is in the requested url domains.
+ if (!((DeadLinkCrawlConfig) getMyController().getConfig()).getUrlPatterns()
+ .stream()
+ .anyMatch(pattern -> pattern.matcher(href).matches())) {
+ return false;
+ }
+
+ // and also only if the url is not explicitly excluded
+ if (((DeadLinkCrawlConfig) getMyController().getConfig()).getExcludePatterns()
+ .stream()
+ .anyMatch(pattern -> pattern.matcher(href).matches())) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ protected boolean shouldFollowLinksIn(WebURL url) {
+ int visits = maxVisits.incrementAndGet();
+ logger.info("Number of visits so far: {}", visits);
+ return true;
+ }
+
+ @Override
+ protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
+ if (statusCode != HttpStatus.SC_OK &&
+ statusCode != HttpStatus.SC_TEMPORARY_REDIRECT &&
+ statusCode != HttpStatus.SC_MOVED_TEMPORARILY &&
+ statusCode != HttpStatus.SC_MOVED_PERMANENTLY) {
+ logger.info("\n\n BROKEN PAGE status {} {} \n\n", statusCode, webUrl.getURL());
+ brokenUrls.put(webUrl.getURL(), statusCode);
+ getConfig().getCrawlerStore().storePageStatus(statusCode, webUrl);
+ }
+ }
+
+ /**
+ * This function is called when a page is fetched and ready to be processed
+ * by your program.
+ */
+ @Override
+ public void visit(Page page) {
+ int docid = page.getWebURL().getDocid();
+ String url = page.getWebURL().getURL();
+ String domain = page.getWebURL().getDomain();
+ String path = page.getWebURL().getPath();
+ String subDomain = page.getWebURL().getSubDomain();
+ String parentUrl = page.getWebURL().getParentUrl();
+ String anchor = page.getWebURL().getAnchor();
+
+ logger.debug("Docid: {}", docid);
+ logger.info("URL: {}", url);
+ logger.debug("Domain: '{}'", domain);
+ logger.debug("Sub-domain: '{}'", subDomain);
+ logger.debug("Path: '{}'", path);
+ logger.debug("Parent page: {}", parentUrl);
+ logger.debug("Anchor text: {}", anchor);
+
+ if (page.getParseData() instanceof HtmlParseData) {
+ HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
+ String text = htmlParseData.getText();
+ String html = htmlParseData.getHtml();
+ Set<WebURL> links = htmlParseData.getOutgoingUrls();
+
+ logger.debug("Text length: {}", text.length());
+ logger.debug("Html length: {}", html.length());
+ logger.debug("Number of outgoing links: {}", links.size());
+
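+ // Re-check all outgoing links of this page: if one of them is already known
+ // to be broken, record this page as a further referrer of the dead link.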
+ for (WebURL link : links) {
+ if (brokenUrls.containsKey(link.getURL())) {
+ getConfig().getCrawlerStore().storePageStatus(brokenUrls.get(link.getURL()), link);
+ }
+ }
+
+ storeHtml(page.getWebURL(), html);
+
+
+ List<ImageData> imageDatas = htmlParseData.getImageData();
+ int imgNr = 0;
+ for (ImageData imageData : imageDatas) {
+ imgNr++;
+ getConfig().getCrawlerStore().storeImageInfo(page, imgNr, imageData);
+ }
+ }
+
+ Header[] responseHeaders = page.getFetchResponseHeaders();
+ if (responseHeaders != null) {
+ logger.debug("Response headers:");
+ for (Header header : responseHeaders) {
+ logger.debug("\t{}: {}", header.getName(), header.getValue());
+ }
+ }
+
+ logger.debug("=============");
+ }
+
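+ /**
+ * Stores the fetched HTML below the 'content' directory inside the crawl
+ * storage folder, using the URL (with '/' replaced by '_') as the file name.
+ * Files which already exist are left untouched.
+ */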
+ private void storeHtml(WebURL webURL, String html) {
+ String rootUrl = webURL.getRootUrl();
+ File rootUrlDir = new File(getRootFolder(), rootUrl.replace("/", "_"));
+ if (!rootUrlDir.exists()) {
+ rootUrlDir.mkdir();
+ }
+
+
+ File f = new File(rootUrlDir, webURL.getURL().replace("/", "_"));
+ if (f.exists()) {
+ return;
+ }
+ try (FileWriter fw = new FileWriter(f)) {
+ fw.write(html);
+ }
+ catch (IOException e) {
+ logger.error("could not store file " + f.toString(), e);
+ }
+ }
+
+ @Override
+ public void onBeforeExit() {
+ getConfig().getCrawlerStore().close();
+ }
+
+ private DeadLinkCrawlConfig getConfig() {
+ return (DeadLinkCrawlConfig) getMyController().getConfig();
+ }
+
+ public File getRootFolder() {
+ if (rootFolder == null) {
+ rootFolder = new File(getConfig().getCrawlStorageFolder(), "content");
+ rootFolder.mkdirs();
+ }
+ return rootFolder;
+ }
+
+ private boolean isImageLink(String href) {
+ return IMAGE_EXTENSIONS.matcher(href).matches();
+ }
+
+
+}
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java
new file mode 100644
index 000000000..59f583d65
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.parser.ImageData;
+import edu.uci.ics.crawler4j.url.WebURL;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Stores information about erroneous pages to the disk.
+ *
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawlerStore {
+ private static final Logger logger = LoggerFactory.getLogger(DeadLinkCrawlerStore.class);
+
+ private final DeadLinkCrawlConfig config;
+
+ private FileWriter brokenPages;
+ private FileWriter imageWoAlt;
+
+
+ protected DeadLinkCrawlerStore(DeadLinkCrawlConfig config) {
+ this.config = config;
+ try {
+ brokenPages = new FileWriter(new File(config.getCrawlStorageFolder(), "brokenPages.csv"));
+ brokenPages.append("status, url, parent_url\n");
+ brokenPages.flush();
+
+ imageWoAlt = new FileWriter(new File(config.getCrawlStorageFolder(), "imageWoAlt.csv"));
+ imageWoAlt.append("onPage, imageNr, imgSrc, imgLink\n");
+ imageWoAlt.flush();
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+
+
+
+ public synchronized void close() {
+ closeFile(brokenPages, "brokenPages");
+ closeFile(imageWoAlt, "imageWoAlt");
+ brokenPages = null;
+ imageWoAlt = null;
+ }
+
+ private void closeFile(FileWriter fw, String name) {
+ if (fw == null) {
+ return;
+ }
+
+ try {
+ fw.close();
+ }
+ catch (IOException e) {
+ logger.error("problem with closing " + name, e);
+ }
+ }
+
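+ /**
+ * Appends one line per broken link to brokenPages.csv: the HTTP status code,
+ * the broken URL and the page which referenced it.
+ */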
+ public synchronized void storePageStatus(int statusCode, WebURL webUrlFail) {
+ try {
+ brokenPages.append("" + statusCode + ", " + webUrlFail.getURL() + ", " + webUrlFail.getParentUrl() + "\n");
+ brokenPages.flush();
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public synchronized void storeImageInfo(Page page, int imgNr, ImageData imageData) {
+ // log all images with missing alt tag
+ if (!imageData.getAttrVals().containsKey("alt") || imageData.getAttrVals().get("alt").isEmpty()) {
+ String url = page.getWebURL().getURL();
+ logger.info("\n\n IMAGE without 'alt' tag on page {} img: {}", url, imageData.getSrc());
+ try {
+ String src = imageData.getSrc();
+ String imgLink;
+ if (src.startsWith("https://") || src.startsWith("http://")) {
+ // absolute image
+ imgLink = src;
+ }
+ else if (src.startsWith("/")) {
+ // server-root relative image
+ imgLink = page.getWebURL().getRootUrl() + src;
+ }
+ else {
+ // relative image
+ imgLink = page.getWebURL().getRootUrl() + page.getWebURL().getPath() + "/" + src;
+ }
+ imageWoAlt.append(url + ", " + imgNr + ", " + src + ", " + imgLink + "\n");
+ imageWoAlt.flush();
+ }
+ catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+}
diff --git a/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml
new file mode 100644
index 000000000..c716bfa50
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%date{HH:mm:ss} %-5level [%thread] - [%logger{0}]- %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="STDOUT"/>
+    </root>
+
+</configuration>
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
index 67676afcb..1dc428eb5 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -18,6 +18,7 @@
package edu.uci.ics.crawler4j.parser;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -68,6 +69,7 @@ public static Element getElement(String name) {
private final StringBuilder bodyText;
private final List<ExtractedUrlAnchorPair> outgoingUrls;
+ private final List<ImageData> imageData;
private ExtractedUrlAnchorPair curUrl = null;
private boolean anchorFlag = false;
@@ -77,6 +79,7 @@ public HtmlContentHandler() {
isWithinBodyElement = false;
bodyText = new StringBuilder();
outgoingUrls = new ArrayList<>();
+ imageData = new ArrayList<>();
}
@Override
@@ -94,7 +97,19 @@ public void startElement(String uri, String localName, String qName, Attributes
String imgSrc = attributes.getValue("src");
if (imgSrc != null) {
addToOutgoingUrls(imgSrc, localName);
+
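+ // Collect all attributes of the img tag so that consumers
+ // (e.g. the DeadLinkSniffer example) can inspect them later,
+ // for instance to detect images without an 'alt' attribute.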
+ Map<String, String> attrVals;
+ if (attributes.getLength() == 0) {
+ attrVals = Collections.emptyMap();
+ } else {
+ attrVals = new HashMap<>();
+ for (int i = 0; i < attributes.getLength(); i++) {
+ attrVals.put(attributes.getLocalName(i), attributes.getValue(i));
+ }
+ }
+ imageData.add(new ImageData(imgSrc, attrVals));
}
+
} else if ((element == Element.IFRAME) || (element == Element.FRAME) ||
(element == Element.EMBED) || (element == Element.SCRIPT)) {
String src = attributes.getValue("src");
@@ -209,4 +224,8 @@ public String getBaseUrl() {
public Map<String, String> getMetaTags() {
return metaTags;
}
+
+ public List<ImageData> getImageData() {
+ return imageData;
+ }
}
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java
index cac68fb75..39ffb7f07 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java
@@ -17,6 +17,7 @@
package edu.uci.ics.crawler4j.parser;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -31,6 +32,7 @@ public class HtmlParseData implements ParseData {
private Set<WebURL> outgoingUrls;
private String contentCharset;
+ private List<ImageData> imageData;
public String getHtml() {
return html;
@@ -78,6 +80,10 @@ public void setOutgoingUrls(Set outgoingUrls) {
this.outgoingUrls = outgoingUrls;
}
+ public void setImageData(List<ImageData> imageData) {
+ this.imageData = imageData;
+ }
+
@Override
public String toString() {
return text;
@@ -90,4 +96,8 @@ public void setContentCharset(String contentCharset) {
public String getContentCharset() {
return contentCharset;
}
+
+ public List<ImageData> getImageData() {
+ return imageData;
+ }
}
\ No newline at end of file
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java
new file mode 100644
index 000000000..14f1859de
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.parser;
+
+import java.util.Map;
+
+/**
+ * Information about images on a page.
+ * Can be used to e.g. detect images without an 'alt' attribute.
+ * @author Mark Struberg
+ */
+public class ImageData {
+
+ private final String src;
+ private final Map<String, String> attrVals;
+
+ public ImageData(String src, Map<String, String> attrVals) {
+ this.src = src;
+ this.attrVals = attrVals;
+ }
+
+ public String getSrc() {
+ return src;
+ }
+
+ public Map<String, String> getAttrVals() {
+ return attrVals;
+ }
+}
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
index 48657bce6..ce9f1ed12 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
@@ -61,6 +61,8 @@ public HtmlParseData parse(Page page, String contextURL) throws ParseException {
Set<WebURL> outgoingUrls = getOutgoingUrls(contextURL, contentHandler, contentCharset);
parsedData.setOutgoingUrls(outgoingUrls);
+ parsedData.setImageData(contentHandler.getImageData());
+
try {
if (page.getContentCharset() == null) {
parsedData.setHtml(new String(page.getContentData()));
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
index bd64052bd..34e3c750c 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -38,6 +38,7 @@ public class WebURL implements Serializable {
private int docid;
private int parentDocid;
private String parentUrl;
+ private String rootUrl;
private short depth;
private String domain;
private String subDomain;
@@ -93,6 +94,8 @@ public void setURL(String url) {
if (pathEndIdx >= 0) {
path = path.substring(0, pathEndIdx);
}
+
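+ // Remember everything up to the end of the host part as the root URL,
+ // e.g. "http://somesub.domain.org" for "http://somesub.domain.org/myapp?idx=4".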
+ this.rootUrl = domainEndIdx > 0 ? url.substring(0, domainEndIdx) : url;
}
/**
@@ -121,6 +124,15 @@ public void setParentUrl(String parentUrl) {
this.parentUrl = parentUrl;
}
+ /**
+ * The root URL of the page.
+ * E.g. for the URL http://somesub.domain.org/myapp?idx=4 this would be
+ * http://somesub.domain.org
+ */
+ public String getRootUrl() {
+ return rootUrl;
+ }
+
/**
* @return
* crawl depth at which this Url is first observed. Seed Urls
diff --git a/pom.xml b/pom.xml
index a5af11dd6..06f79abe7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -50,7 +50,11 @@
<module>crawler4j</module>
<module>crawler4j-examples/crawler4j-examples-base</module>
+
+ <module>crawler4j-examples/deadlinksniffer</module>