diff --git a/crawler4j-examples/deadlinksniffer/README.adoc b/crawler4j-examples/deadlinksniffer/README.adoc
new file mode 100644
index 000000000..d96d6083a
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/README.adoc
@@ -0,0 +1,45 @@
+= Crawler4j Dead Link Sniffer
+
+This application scans a web page for dead links.
+
+== Compiling
+
+The whole application can be built using Maven:
+
+----
+$> mvn clean install
+----
+
+This also bundles an executable application in `crawler4j-examples/deadlinksniffer/target/appassembler`.
+
+== Usage
+
+==== Getting more help
+To print a description of all parameters:
+----
+$> ./bin/DeadLinkSniffer -?
+----
+
+==== Scanning a web page for dead links
+The following example scans a sample page for dead links.
+It crawls all sub pages which are reachable from the given `seed` (`-s`) pages.
+----
+$> ./bin/DeadLinkSniffer -s=http://mypage.org
+----
+
+You can also define which URLs should be crawled via a list of regular expressions given with the `-u` parameter.
+To define multiple rules, simply add multiple `-u` parameters.
+
+----
+$> ./bin/DeadLinkSniffer -s=http://mypage.org -u="https://.*mypage.org.*"
+----
+
+==== Output
+
+By default the output files are written to `./crawl`.
+The output directory can be specified with the `-o` parameter.
+
+The output directory contains a file `brokenPages.csv` which lists all broken links.
+The first column is the HTTP status, e.g. 404 for 'not found'.
+The second column is the URL of the resource which is missing.
+The third column is the HTML page on which the dead link was found.
\ No newline at end of file
diff --git a/crawler4j-examples/deadlinksniffer/pom.xml b/crawler4j-examples/deadlinksniffer/pom.xml
new file mode 100644
index 000000000..2245bdcaa
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/pom.xml
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <artifactId>crawler4j-parent</artifactId>
+        <groupId>edu.uci.ics</groupId>
+        <version>4.5.0-SNAPSHOT</version>
+        <relativePath>../../pom.xml</relativePath>
+    </parent>
+    <artifactId>crawler4j-deadlinksniffer</artifactId>
+
+    <name>find dead links on a web page</name>
+    <url>https://github.com/yasserg/crawler4j</url>
+
+    <dependencies>
+        <dependency>
+            <groupId>edu.uci.ics</groupId>
+            <artifactId>crawler4j</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-cli</groupId>
+            <artifactId>commons-cli</artifactId>
+            <version>1.4</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>appassembler-maven-plugin</artifactId>
+                <version>2.0.0</version>
+                <configuration>
+                    <programs>
+                        <program>
+                            <mainClass>edu.uci.ics.crawler4j.deadlinksniffer.DeadLinkCrawlController</mainClass>
+                            <name>DeadLinkSniffer</name>
+                        </program>
+                    </programs>
+                    <platforms>
+                        <platform>all</platform>
+                    </platforms>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>bundle</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>assemble</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+</project>
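The `brokenPages.csv` described in the README is plain comma-separated text with a single header line. A minimal Java sketch for post-processing it; the `crawl` directory and the naive `split` are assumptions, and URLs containing commas would need a real CSV parser:

[source,java]
----
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class BrokenPagesReport {
    public static void main(String[] args) throws IOException {
        // default output location, see the -o parameter
        List<String> lines = Files.readAllLines(Paths.get("crawl", "brokenPages.csv"));
        lines.stream()
             .skip(1)                              // skip the "status, url, parent_url" header
             .map(line -> line.split(",\\s*", 3))  // status, missing resource, referring page
             .forEach(cols -> System.out.printf("%s -> %s (linked from %s)%n",
                     cols[0], cols[1], cols[2]));
    }
}
----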
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java
new file mode 100644
index 000000000..b31eac886
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlConfig.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
+
+/**
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawlConfig extends CrawlConfig {
+    private List<Pattern> urlPatterns = new ArrayList<>();
+    private volatile DeadLinkCrawlerStore crawlerStore;
+    private List<Pattern> excludePatterns = new ArrayList<>();
+
+    public List<Pattern> getUrlPatterns() {
+        return urlPatterns;
+    }
+
+    public List<Pattern> getExcludePatterns() {
+        return excludePatterns;
+    }
+
+    /**
+     * Add a regular expression for URLs which should be followed
+     * by the crawler.
+     */
+    public void addUrlPattern(String urlPattern) {
+        this.urlPatterns.add(Pattern.compile(urlPattern));
+    }
+
+    /**
+     * Add a regular expression for URLs which should be excluded from scanning.
+     * This is effectively a stop criterion and gets evaluated
+     * after all the patterns added via {@link #addUrlPattern(String)}.
+     */
+    public void addExcludePattern(String excludePattern) {
+        this.excludePatterns.add(Pattern.compile(excludePattern));
+    }
+
+    public DeadLinkCrawlerStore getCrawlerStore() {
+        if (crawlerStore == null) {
+            synchronized (this) {
+                if (crawlerStore == null) {
+                    crawlerStore = new DeadLinkCrawlerStore(this);
+                }
+            }
+        }
+
+        return crawlerStore;
+    }
+}
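The include patterns are consulted first and the exclude patterns act as a veto afterwards (see `DeadLinkCrawler#shouldVisit` further down). A self-contained sketch of that evaluation order using plain `java.util.regex`; the concrete pattern strings and URLs are made up for illustration:

[source,java]
----
import java.util.List;
import java.util.regex.Pattern;

public class PatternCheck {
    public static void main(String[] args) {
        List<Pattern> urlPatterns = List.of(Pattern.compile("https?://.*mypage\\.org.*"));
        List<Pattern> excludePatterns = List.of(Pattern.compile(".*/archive/.*"));

        String[] candidates = {
            "https://www.mypage.org/docs/index.html",   // matches an include pattern -> crawled
            "https://www.mypage.org/archive/2001.html", // matches an exclude pattern -> skipped
            "https://other.example.com/"                // matches nothing -> skipped
        };

        for (String href : candidates) {
            boolean included = urlPatterns.stream().anyMatch(p -> p.matcher(href).matches());
            boolean excluded = excludePatterns.stream().anyMatch(p -> p.matcher(href).matches());
            System.out.println(href + " -> " + (included && !excluded ? "crawl" : "skip"));
        }
    }
}
----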
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java
new file mode 100644
index 000000000..c79401086
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlController.java
@@ -0,0 +1,228 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import edu.uci.ics.crawler4j.crawler.CrawlController;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Scans the given web pages (seeds) for dead links.
+ *
+ * @author Yasser Ganjisaffar
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawlController {
+    private static final Logger logger = LoggerFactory.getLogger(DeadLinkCrawlController.class);
+
+    public static void main(String[] args) throws Exception {
+
+        Options options = new Options();
+
+        options.addRequiredOption("s", "seed", true,
+            "Seed page where the crawling should get started from.");
+
+        options.addOption("u", "url", true,
+            "Regular expressions for page URLs which should get crawled. "
+            + "If not given, the seed URLs act as the starting patterns.");
+
+        options.addOption("x", "exclude", true,
+            "Regular expressions for page URLs which should NOT get crawled.");
+
+        options.addOption("?", "help", false,
+            "Print detailed information about the usage.");
+
+        options.addOption("t", "threads", true,
+            "Number of threads to use for crawling. Defaults to 1.");
+
+        options.addOption("o", "outDir", true,
+            "Output directory to store the downloaded pages and information. Defaults to ./crawl");
+
+        options.addOption("d", "delay", true,
+            "Time delay between requests in ms. Defaults to 1000 (1 second).");
+
+        options.addOption("m", "maxdepth", true,
+            "Maximum depth of crawling. Defaults to 3.");
+
+        options.addOption("p", "pages", true,
+            "Maximum number of pages to fetch. Defaults to 2000.");
+
+        CommandLine cmd = null;
+        try {
+            CommandLineParser parser = new DefaultParser();
+            cmd = parser.parse(options, args);
+        }
+        catch (ParseException pe) {
+            logger.info(pe.getMessage());
+            printHelpAndExit(options);
+        }
+
+        if (cmd.hasOption("?")) {
+            printHelpAndExit(options);
+        }
+
+        DeadLinkCrawlConfig config = new DeadLinkCrawlConfig();
+
+        /*
+         * crawlStorageFolder is a folder where intermediate crawl data is
+         * stored.
+         */
+        String crawlStorageFolder =
+            cmd.hasOption("o")
+                ? cmd.getOptionValue("o")
+                : "crawl";
+        config.setCrawlStorageFolder(crawlStorageFolder);
+
+
+        /*
+         * Be polite: make sure that we don't send more than 1 request per
+         * second (1000 milliseconds between requests).
+         */
+        int delay =
+            cmd.hasOption("d")
+                ? Integer.parseInt(cmd.getOptionValue("d"))
+                : 1000;
+        config.setPolitenessDelay(delay);
+
+        /*
+         * You can set the maximum crawl depth here. The default value is -1 for
+         * unlimited depth.
+         */
+        int maxDepth =
+            cmd.hasOption("m")
+                ? Integer.parseInt(cmd.getOptionValue("m"))
+                : 3;
+        config.setMaxDepthOfCrawling(maxDepth);
+
+        /*
+         * You can set the maximum number of pages to crawl. The default value
+         * is -1 for an unlimited number of pages.
+         */
+        int pages =
+            cmd.hasOption("p")
+                ? Integer.parseInt(cmd.getOptionValue("p"))
+                : 2000;
+        config.setMaxPagesToFetch(pages);
+
+
+        /*
+         * numberOfCrawlers is the number of concurrent threads that should
+         * be initiated for crawling.
+         */
+        int numberOfCrawlers =
+            cmd.hasOption("t")
+                ? Integer.parseInt(cmd.getOptionValue("t"))
+                : 1;
+
+        if (cmd.hasOption("u")) {
+            String[] urlPatterns = cmd.getOptionValues("u");
+
+            for (String urlPattern : urlPatterns) {
+                config.addUrlPattern(urlPattern);
+            }
+        }
+
+        if (cmd.hasOption("x")) {
+            String[] urlPatterns = cmd.getOptionValues("x");
+
+            for (String urlPattern : urlPatterns) {
+                config.addExcludePattern(urlPattern);
+            }
+        }
+
+        /*
+         * Do you want crawler4j to also crawl binary data?
+         * For example the contents of PDFs, or the metadata of images etc.
+         */
+        config.setIncludeBinaryContentInCrawling(false);
+
+        /*
+         * Do you need to set a proxy? If so, you can use:
+         * config.setProxyHost("proxyserver.example.com");
+         * config.setProxyPort(8080);
+         *
+         * If your proxy also needs authentication:
+         * config.setProxyUsername(username); config.setProxyPassword(password);
+         */
+
+        /*
+         * This config parameter can be used to set your crawl to be resumable
+         * (meaning that you can resume the crawl from a previously
+         * interrupted/crashed crawl). Note: if you enable the resuming feature and
+         * want to start a fresh crawl, you need to delete the contents of
+         * rootFolder manually.
+         */
+        config.setResumableCrawling(false);
+
+        /*
+         * Yes, we are going to follow HTTP 301 redirects.
+         */
+        config.setFollowRedirects(true);
+
+
+        /*
+         * Instantiate the controller for this crawl.
+         */
+        PageFetcher pageFetcher = new PageFetcher(config);
+        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
+        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
+
+        /*
+         * For each crawl, you need to add some seed URLs. These are the first
+         * URLs that are fetched; the crawler then starts following the links
+         * found in these pages.
+         */
+        boolean addSeedsAsUrls = config.getUrlPatterns().isEmpty();
+        String[] seeds = cmd.getOptionValues("s");
+        for (String seed : seeds) {
+            controller.addSeed(seed);
+            if (addSeedsAsUrls) {
+                config.addUrlPattern("^" + seed + ".*");
+            }
+        }
+
+
+        /*
+         * Start the crawl. This is a blocking operation, meaning that your code
+         * will only reach the line after this once crawling is finished.
+         */
+        controller.start(DeadLinkCrawler.class, numberOfCrawlers);
+    }
+
+    private static void printHelpAndExit(Options options) {
+        HelpFormatter hf = new HelpFormatter();
+        hf.printHelp("\n\tDeadLinkSniffer -? - for help"
+            + "\n\tDeadLinkSniffer -s=http://mypage.org - for scanning this page"
+            + "\n\tDeadLinkSniffer -s=http://mypage.org -u=\"https://.*mypage.org.*\" - for scanning this page with all subdomains, etc."
+            , options);
+
+        System.exit(-1);
+    }
+
+
+}
\ No newline at end of file
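The `main` method above is CLI-driven; for embedding the sniffer in another application, the essential wiring boils down to the sketch below. It only uses calls that appear in this patch; the seed URL, storage folder, URL pattern, and thread count are placeholder values:

[source,java]
----
package edu.uci.ics.crawler4j.deadlinksniffer;

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class EmbeddedDeadLinkSniffer {
    public static void main(String[] args) throws Exception {
        DeadLinkCrawlConfig config = new DeadLinkCrawlConfig();
        config.setCrawlStorageFolder("crawl");             // placeholder output directory
        config.setPolitenessDelay(1000);
        config.setMaxDepthOfCrawling(3);
        config.setMaxPagesToFetch(2000);
        config.addUrlPattern("^http://mypage\\.org.*");    // placeholder seed domain

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("http://mypage.org");
        controller.start(DeadLinkCrawler.class, 1);        // blocks until the crawl finishes
    }
}
----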
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java
new file mode 100644
index 000000000..9c25127f5
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawler.java
@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
+import edu.uci.ics.crawler4j.parser.ImageData;
+import edu.uci.ics.crawler4j.url.WebURL;
+import org.apache.http.Header;
+import org.apache.http.HttpStatus;
+
+/**
+ * TODO: Currently not thread safe!
+ *
+ * @author Yasser Ganjisaffar
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawler extends WebCrawler {
+
+    private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png|jpeg|css|js|pdf)$");
+
+
+    private AtomicInteger maxVisits = new AtomicInteger(0);
+
+    private File rootFolder;
+
+    /**
+     * Contains all broken URLs detected in {@link #handlePageStatusCode(WebURL, int, String)}.
+     */
+    private ConcurrentMap<String, Integer> brokenUrls = new ConcurrentHashMap<>();
+
+
+    /**
+     * You should implement this function to specify whether the given URL
+     * should be crawled or not (based on your crawling logic).
+     */
+    @Override
+    public boolean shouldVisit(Page referringPage, WebURL url) {
+        String href = url.getURL().toLowerCase();
+        // Ignore the url if it has an extension that matches our defined set of image extensions.
+        if (isImageLink(href)) {
+            return false;
+        }
+
+        // Only accept the url if it is in the requested url domains.
+        if (!((DeadLinkCrawlConfig) getMyController().getConfig()).getUrlPatterns()
+                .stream()
+                .anyMatch(pattern -> pattern.matcher(href).matches())) {
+            return false;
+        }
+
+        // and also only if the url is not explicitly excluded
+        if (((DeadLinkCrawlConfig) getMyController().getConfig()).getExcludePatterns()
+                .stream()
+                .anyMatch(pattern -> pattern.matcher(href).matches())) {
+            return false;
+        }
+        return true;
+    }
+
+    @Override
+    protected boolean shouldFollowLinksIn(WebURL url) {
+        int visits = maxVisits.incrementAndGet();
+        logger.info("Number of visits so far: {}", visits);
+        return true;
+    }
+
+    @Override
+    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
+        if (statusCode != HttpStatus.SC_OK &&
+            statusCode != HttpStatus.SC_TEMPORARY_REDIRECT &&
+            statusCode != HttpStatus.SC_MOVED_TEMPORARILY &&
+            statusCode != HttpStatus.SC_MOVED_PERMANENTLY) {
+            logger.info("\n\n BROKEN PAGE status {} {} \n\n", statusCode, webUrl.getURL());
+            brokenUrls.put(webUrl.getURL(), statusCode);
+            getConfig().getCrawlerStore().storePageStatus(statusCode, webUrl);
+        }
+    }
+
+    /**
+     * This function is called when a page is fetched and ready to be processed
+     * by your program.
+     */
+    @Override
+    public void visit(Page page) {
+        int docid = page.getWebURL().getDocid();
+        String url = page.getWebURL().getURL();
+        String domain = page.getWebURL().getDomain();
+        String path = page.getWebURL().getPath();
+        String subDomain = page.getWebURL().getSubDomain();
+        String parentUrl = page.getWebURL().getParentUrl();
+        String anchor = page.getWebURL().getAnchor();
+
+        logger.debug("Docid: {}", docid);
+        logger.info("URL: {}", url);
+        logger.debug("Domain: '{}'", domain);
+        logger.debug("Sub-domain: '{}'", subDomain);
+        logger.debug("Path: '{}'", path);
+        logger.debug("Parent page: {}", parentUrl);
+        logger.debug("Anchor text: {}", anchor);
+
+        if (page.getParseData() instanceof HtmlParseData) {
+            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
+            String text = htmlParseData.getText();
+            String html = htmlParseData.getHtml();
+            Set<WebURL> links = htmlParseData.getOutgoingUrls();
+
+            logger.debug("Text length: {}", text.length());
+            logger.debug("Html length: {}", html.length());
+            logger.debug("Number of outgoing links: {}", links.size());
+
+            for (WebURL link : links) {
+                if (brokenUrls.containsKey(link.getURL())) {
+                    getConfig().getCrawlerStore().storePageStatus(brokenUrls.get(link.getURL()), link);
+                }
+            }
+
+            storeHtml(page.getWebURL(), html);
+
+
+            List<ImageData> imageDatas = htmlParseData.getImageData();
+            int imgNr = 0;
+            for (ImageData imageData : imageDatas) {
+                imgNr++;
+                getConfig().getCrawlerStore().storeImageInfo(page, imgNr, imageData);
+            }
+        }
+
+        Header[] responseHeaders = page.getFetchResponseHeaders();
+        if (responseHeaders != null) {
+            logger.debug("Response headers:");
+            for (Header header : responseHeaders) {
+                logger.debug("\t{}: {}", header.getName(), header.getValue());
+            }
+        }
+
+        logger.debug("=============");
+    }
+
+    private void storeHtml(WebURL webURL, String html) {
+        String rootUrl = webURL.getRootUrl();
+        File rootUrlDir = new File(getRootFolder(), rootUrl.replace("/", "_"));
+        if (!rootUrlDir.exists()) {
+            rootUrlDir.mkdir();
+        }
+
+
+        File f = new File(rootUrlDir, webURL.getURL().replace("/", "_"));
+        if (f.exists()) {
+            return;
+        }
+        try (FileWriter fw = new FileWriter(f)) {
+            fw.write(html);
+        }
+        catch (IOException e) {
+            logger.error("could not store file " + f.toString(), e);
+        }
+    }
+
+    @Override
+    public void onBeforeExit() {
+        getConfig().getCrawlerStore().close();
+    }
+
+    private DeadLinkCrawlConfig getConfig() {
+        return (DeadLinkCrawlConfig) getMyController().getConfig();
+    }
+
+    public File getRootFolder() {
+        if (rootFolder == null) {
+            rootFolder = new File(getConfig().getCrawlStorageFolder(), "content");
+            rootFolder.mkdirs();
+        }
+        return rootFolder;
+    }
+
+    private boolean isImageLink(String href) {
+        return IMAGE_EXTENSIONS.matcher(href).matches();
+    }
+
+
+}
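As a quick sanity check of the `IMAGE_EXTENSIONS` filter used by `shouldVisit`, the following sketch classifies a few made-up links the same way the crawler would (note the crawler matches against the lower-cased URL):

[source,java]
----
import java.util.regex.Pattern;

public class ImageFilterCheck {
    // same pattern as in DeadLinkCrawler
    private static final Pattern IMAGE_EXTENSIONS =
            Pattern.compile(".*\\.(bmp|gif|jpg|png|jpeg|css|js|pdf)$");

    public static void main(String[] args) {
        String[] hrefs = {
            "http://mypage.org/logo.PNG",       // filtered out (after lower-casing)
            "http://mypage.org/style.css",      // filtered out
            "http://mypage.org/docs/intro.html" // visited as a page
        };
        for (String href : hrefs) {
            boolean skip = IMAGE_EXTENSIONS.matcher(href.toLowerCase()).matches();
            System.out.println(href + " -> " + (skip ? "skip" : "visit"));
        }
    }
}
----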
diff --git a/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java
new file mode 100644
index 000000000..59f583d65
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/java/edu/uci/ics/crawler4j/deadlinksniffer/DeadLinkCrawlerStore.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.crawler4j.deadlinksniffer;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.parser.ImageData;
+import edu.uci.ics.crawler4j.url.WebURL;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Stores information about erroneous pages to the disk.
+ *
+ * @author Mark Struberg
+ */
+public class DeadLinkCrawlerStore {
+    private static final Logger logger = LoggerFactory.getLogger(DeadLinkCrawlerStore.class);
+
+    private final DeadLinkCrawlConfig config;
+
+    private FileWriter brokenPages;
+    private FileWriter imageWoAlt;
+
+
+    protected DeadLinkCrawlerStore(DeadLinkCrawlConfig config) {
+        this.config = config;
+        try {
+            brokenPages = new FileWriter(new File(config.getCrawlStorageFolder(), "brokenPages.csv"));
+            brokenPages.append("status, url, parent_url\n");
+            brokenPages.flush();
+
+            imageWoAlt = new FileWriter(new File(config.getCrawlStorageFolder(), "imageWoAlt.csv"));
+            imageWoAlt.append("onPage, imageNr, imgSrc, imgLink\n");
+            imageWoAlt.flush();
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+    }
+
+
+
+
+    public synchronized void close() {
+        closeFile(brokenPages, "brokenPages");
+        closeFile(imageWoAlt, "imageWoAlt");
+        brokenPages = null;
+        imageWoAlt = null;
+    }
+
+    private void closeFile(FileWriter fw, String name) {
+        if (fw == null) {
+            return;
+        }
+
+        try {
+            fw.close();
+        }
+        catch (IOException e) {
+            logger.error("problem closing " + name, e);
+        }
+    }
+
+    public synchronized void storePageStatus(int statusCode, WebURL webUrlFail) {
+        try {
+            brokenPages.append("" + statusCode + ", " + webUrlFail.getURL() + ", " + webUrlFail.getParentUrl() + "\n");
+            brokenPages.flush();
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public synchronized void storeImageInfo(Page page, int imgNr, ImageData imageData) {
+        // log all images with a missing 'alt' attribute
+        if (!imageData.getAttrVals().containsKey("alt") || imageData.getAttrVals().get("alt").isEmpty()) {
+            String url = page.getWebURL().getURL();
+            logger.info("\n\n IMAGE without 'alt' attribute on page {} img: {}", url, imageData.getSrc());
+            try {
+                String src = imageData.getSrc();
+                String imgLink;
+                if (src.startsWith("https://") || src.startsWith("http://")) {
+                    // absolute image
+                    imgLink = src;
+                }
+                else if (src.startsWith("/")) {
+                    // server-root relative image
+                    imgLink = page.getWebURL().getRootUrl() + src;
+                }
+                else {
+                    // relative image
+                    imgLink = page.getWebURL().getRootUrl() + page.getWebURL().getPath() + "/" + src;
+                }
+                imageWoAlt.append(url + ", " + imgNr + ", " + src + ", " + imgLink + "\n");
+                imageWoAlt.flush();
+            }
+            catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}
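The `storeImageInfo` method resolves an image `src` in three ways: absolute, server-root relative, and relative to the page path. A standalone sketch of that resolution logic; the sample `rootUrl`, `pagePath`, and `src` values are invented:

[source,java]
----
public class ImageLinkResolver {
    /** Mirrors the resolution cases in DeadLinkCrawlerStore#storeImageInfo. */
    static String resolve(String rootUrl, String pagePath, String src) {
        if (src.startsWith("https://") || src.startsWith("http://")) {
            return src;                          // already absolute
        }
        if (src.startsWith("/")) {
            return rootUrl + src;                // server-root relative
        }
        return rootUrl + pagePath + "/" + src;   // relative to the current page path
    }

    public static void main(String[] args) {
        String rootUrl = "http://mypage.org";
        String pagePath = "/docs";
        System.out.println(resolve(rootUrl, pagePath, "http://cdn.example.com/a.png"));
        System.out.println(resolve(rootUrl, pagePath, "/img/logo.png"));
        System.out.println(resolve(rootUrl, pagePath, "header.png"));
    }
}
----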
diff --git a/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml
new file mode 100644
index 000000000..c716bfa50
--- /dev/null
+++ b/crawler4j-examples/deadlinksniffer/src/main/resources/logback.xml
@@ -0,0 +1,15 @@
+<configuration>
+
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%date{HH:mm:ss} %-5level [%thread] - [%logger{0}]- %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="STDOUT"/>
+    </root>
+
+</configuration>
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
index 67676afcb..1dc428eb5 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -18,6 +18,7 @@ package edu.uci.ics.crawler4j.parser;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -68,6 +69,7 @@ public static Element getElement(String name) {
 
     private final StringBuilder bodyText;
     private final List<ExtractedUrlAnchorPair> outgoingUrls;
+    private final List<ImageData> imageData;
 
     private ExtractedUrlAnchorPair curUrl = null;
     private boolean anchorFlag = false;
@@ -77,6 +79,7 @@ public HtmlContentHandler() {
         isWithinBodyElement = false;
         bodyText = new StringBuilder();
         outgoingUrls = new ArrayList<>();
+        imageData = new ArrayList<>();
     }
 
     @Override
@@ -94,7 +97,19 @@ public void startElement(String uri, String localName, String qName, Attributes
             String imgSrc = attributes.getValue("src");
             if (imgSrc != null) {
                 addToOutgoingUrls(imgSrc, localName);
+
+                Map<String, String> attrVals;
+                if (attributes.getLength() == 0) {
+                    attrVals = Collections.emptyMap();
+                } else {
+                    attrVals = new HashMap<>();
+                    for (int i = 0; i < attributes.getLength(); i++) {
+                        attrVals.put(attributes.getLocalName(i), attributes.getValue(i));
+                    }
+                }
+                imageData.add(new ImageData(imgSrc, attrVals));
             }
+
         } else if ((element == Element.IFRAME) || (element == Element.FRAME) ||
                    (element == Element.EMBED) || (element == Element.SCRIPT)) {
             String src = attributes.getValue("src");
@@ -209,4 +224,8 @@ public String getBaseUrl() {
     public Map<String, String> getMetaTags() {
         return metaTags;
     }
+
+    public List<ImageData> getImageData() {
+        return imageData;
+    }
 }
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java
index cac68fb75..39ffb7f07 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/HtmlParseData.java
@@ -17,6 +17,7 @@
 
 package edu.uci.ics.crawler4j.parser;
 
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -31,6 +32,7 @@ public class HtmlParseData implements ParseData {
     private Set<WebURL> outgoingUrls;
     private String contentCharset;
+    private List<ImageData> imageData;
 
     public String getHtml() {
         return html;
@@ -78,6 +80,10 @@ public void setOutgoingUrls(Set<WebURL> outgoingUrls) {
         this.outgoingUrls = outgoingUrls;
     }
 
+    public void setImageData(List<ImageData> imageData) {
+        this.imageData = imageData;
+    }
+
     @Override
     public String toString() {
         return text;
@@ -90,4 +96,8 @@ public void setContentCharset(String contentCharset) {
     public String getContentCharset() {
         return contentCharset;
     }
+
+    public List<ImageData> getImageData() {
+        return imageData;
+    }
 }
\ No newline at end of file
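With the new `getImageData()` accessor on `HtmlParseData`, a custom crawler can flag images without an `alt` attribute directly in `visit`. A minimal sketch of such a crawler; the class name `AltTextCrawler` is made up and error handling is omitted:

[source,java]
----
import java.util.List;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ImageData;

public class AltTextCrawler extends WebCrawler {
    @Override
    public void visit(Page page) {
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return;
        }
        List<ImageData> images = ((HtmlParseData) page.getParseData()).getImageData();
        if (images == null) {
            return;  // defensive: no image data collected for this page
        }
        for (ImageData image : images) {
            String alt = image.getAttrVals().get("alt");
            if (alt == null || alt.isEmpty()) {
                // logger is inherited from WebCrawler
                logger.info("image without alt text on {}: {}",
                        page.getWebURL().getURL(), image.getSrc());
            }
        }
    }
}
----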
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java
new file mode 100644
index 000000000..14f1859de
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ImageData.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.parser;
+
+import java.util.Map;
+
+/**
+ * Information about images on a page.
+ * Can be used to e.g. detect images without an 'alt' attribute.
+ *
+ * @author Mark Struberg
+ */
+public class ImageData {
+
+    private final String src;
+    private final Map<String, String> attrVals;
+
+    public ImageData(String src, Map<String, String> attrVals) {
+        this.src = src;
+        this.attrVals = attrVals;
+    }
+
+    public String getSrc() {
+        return src;
+    }
+
+    public Map<String, String> getAttrVals() {
+        return attrVals;
+    }
+}
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
index 48657bce6..ce9f1ed12 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
@@ -61,6 +61,8 @@ public HtmlParseData parse(Page page, String contextURL) throws ParseException {
         Set<WebURL> outgoingUrls = getOutgoingUrls(contextURL, contentHandler, contentCharset);
         parsedData.setOutgoingUrls(outgoingUrls);
 
+        parsedData.setImageData(contentHandler.getImageData());
+
         try {
             if (page.getContentCharset() == null) {
                 parsedData.setHtml(new String(page.getContentData()));
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
index bd64052bd..34e3c750c 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -38,6 +38,7 @@ public class WebURL implements Serializable {
     private int docid;
     private int parentDocid;
     private String parentUrl;
+    private String rootUrl;
     private short depth;
     private String domain;
     private String subDomain;
@@ -93,6 +94,8 @@ public void setURL(String url) {
         if (pathEndIdx >= 0) {
             path = path.substring(0, pathEndIdx);
         }
+
+        this.rootUrl = domainEndIdx > 0 ? url.substring(0, domainEndIdx) : url;
     }
 
     /**
@@ -121,6 +124,15 @@ public void setParentUrl(String parentUrl) {
         this.parentUrl = parentUrl;
     }
 
+    /**
+     * The root URL of the page.
+     * E.g. for the URL http://somesub.domain.org/myapp?idx=4 this would be
+     * http://somesub.domain.org/
+     */
+    public String getRootUrl() {
+        return rootUrl;
+    }
+
     /**
      * @return
      *      crawl depth at which this Url is first observed. Seed Urls
diff --git a/pom.xml b/pom.xml
index a5af11dd6..06f79abe7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -50,7 +50,11 @@
         <module>crawler4j</module>
         <module>crawler4j-examples/crawler4j-examples-base</module>
+
+        <module>crawler4j-examples/deadlinksniffer</module>