From 7ea4f122aefee35508badf33e41166cd3b700376 Mon Sep 17 00:00:00 2001 From: Bill13579 Date: Sun, 11 Mar 2018 21:18:57 +0900 Subject: [PATCH] Part 2 Source Code --- Project/src/main/Bot.java | 54 +++++++++++++++++++++++++++++++++++--- Project/src/main/Main.java | 2 +- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/Project/src/main/Bot.java b/Project/src/main/Bot.java index 8013440..112f83f 100644 --- a/Project/src/main/Bot.java +++ b/Project/src/main/Bot.java @@ -2,6 +2,12 @@ import java.io.*; import java.net.*; +import java.util.HashSet; +import java.util.Set; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.*; +import org.jsoup.select.*; public class Bot { @@ -12,12 +18,54 @@ public Bot(String start_url) { } public void start() { + crawl(this.start_url); + } + + private void crawl(String url) { + + String html = getHTML(url); + Set crawledURLs = new HashSet(); + Set pendingURLs = new HashSet(); + + Document doc = Jsoup.parse(html); + Elements elements = doc.select("a"); + + for (Element e: elements) { + String href = e.attr("href"); + href = processLink(href, url); + System.out.println(href); + } + System.out.println(processLink("../", url)); + + } + + private String processLink(String link, String base) { - String html = getHTML(this.start_url); - System.out.println(html); + try { + URL u = new URL(base); + if (link.startsWith("./")) { + link = link.substring(2, link.length()); + link = u.getProtocol() + "://" + u.getAuthority() + stripFilename(u.getPath()) + link; + } else if (link.startsWith("#")) { + link = base + link; + } else if (link.startsWith("javascript:")) { + link = null; + } else if (link.startsWith("../") || (!link.startsWith("http://") && !link.startsWith("https://"))) { + link = u.getProtocol() + "://" + u.getAuthority() + stripFilename(u.getPath()) + link; + } + return link; + } catch (Exception e) { + e.printStackTrace(); + return null; + } } + private String stripFilename(String path) { + int pos = path.lastIndexOf("/"); + return pos <= -1 ? path : path.substring(0, pos+1); + } + private String getHTML(String url) { URL u; @@ -46,4 +94,4 @@ private String getHTML(String url) { } -} +} \ No newline at end of file diff --git a/Project/src/main/Main.java b/Project/src/main/Main.java index 44008ab..8a049ac 100644 --- a/Project/src/main/Main.java +++ b/Project/src/main/Main.java @@ -4,7 +4,7 @@ public class Main { public static void main(String[] args) { - String start_url = "file:///data/Tutorials/Java%20Web%20Crawler/java-web-crawler/Test%20Page/index.html"; + String start_url = "http://192.168.3.138/java-web-crawler-test/index.html"; Bot bot = new Bot(start_url); bot.start();