Skip to content

Commit

Permalink
Part 2 Source Code
Browse files Browse the repository at this point in the history
  • Loading branch information
Bill13579 committed Mar 11, 2018
1 parent bdf5bb0 commit 7ea4f12
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 4 deletions.
54 changes: 51 additions & 3 deletions Project/src/main/Bot.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

import java.io.*;
import java.net.*;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.*;

public class Bot {

Expand All @@ -12,12 +18,54 @@ public Bot(String start_url) {
}

/**
 * Entry point of the bot: begins crawling at the URL stored in
 * {@code start_url}.
 */
public void start() {
    final String entryPoint = this.start_url;
    crawl(entryPoint);
}

/**
 * Downloads one page and prints every hyperlink found on it as an
 * absolute URL, one per line.
 *
 * <p>Links that {@code processLink} rejects (it returns {@code null} for
 * them) are skipped instead of being printed as the text "null".
 *
 * @param url address of the page to fetch and scan
 */
private void crawl(String url) {

    String html = getHTML(url);
    if (html == null) {
        // NOTE(review): getHTML's failure behaviour is not visible from this
        // method; Jsoup.parse rejects null input, so stop here rather than crash.
        return;
    }

    Document doc = Jsoup.parse(html);
    // Only anchors that actually carry an href attribute can point somewhere.
    Elements anchors = doc.select("a[href]");

    for (Element anchor : anchors) {
        String href = processLink(anchor.attr("href"), url);
        if (href != null) {
            System.out.println(href);
        }
    }
    // Debug probe kept from the original: shows how "../" resolves for this page.
    System.out.println(processLink("../", url));

}

/**
 * Turns an {@code href} value found on a page into an absolute URL.
 *
 * <p>Resolution is delegated to {@link URL#URL(URL, String)}, so relative
 * paths ({@code page.html}, {@code ./page.html}, {@code ../}), root-relative
 * paths ({@code /page.html}), protocol-relative references
 * ({@code //host/x}) and fragment-only references ({@code #top}) are all
 * resolved against {@code base}; already-absolute links are returned as
 * they are.
 *
 * @param link raw href value; may be {@code null}
 * @param base absolute URL of the page the link was found on
 * @return the absolute URL as a string, or {@code null} when the link is
 *         {@code null}, is a {@code javascript:} pseudo-link, or cannot be
 *         resolved into a URL
 */
private String processLink(String link, String base) {

    if (link == null) {
        return null;
    }

    String href = link.trim();
    String scriptScheme = "javascript:";
    // Script pseudo-links do not point at a document, so there is nothing to crawl.
    if (href.regionMatches(true, 0, scriptScheme, 0, scriptScheme.length())) {
        return null;
    }

    try {
        return new URL(new URL(base), href).toString();
    } catch (MalformedURLException e) {
        // Unsupported scheme or malformed base/link: report it and skip the link.
        System.err.println("Cannot resolve link '" + link + "' against '" + base + "': " + e.getMessage());
        return null;
    }

}

/**
 * Drops the final path segment, keeping everything up to and including
 * the last {@code '/'}.
 *
 * @param path URL path such as {@code /dir/page.html}
 * @return the directory part with its trailing slash (e.g. {@code /dir/}),
 *         or {@code path} unchanged when it contains no slash
 */
private String stripFilename(String path) {
    final int lastSlash = path.lastIndexOf('/');
    if (lastSlash < 0) {
        return path;
    }
    return path.substring(0, lastSlash + 1);
}

private String getHTML(String url) {

URL u;
Expand Down Expand Up @@ -46,4 +94,4 @@ private String getHTML(String url) {

}

}
}
2 changes: 1 addition & 1 deletion Project/src/main/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ public class Main {

public static void main(String[] args) {

String start_url = "file:///data/Tutorials/Java%20Web%20Crawler/java-web-crawler/Test%20Page/index.html";
String start_url = "http://192.168.3.138/java-web-crawler-test/index.html";

Bot bot = new Bot(start_url);
bot.start();
Expand Down

0 comments on commit 7ea4f12

Please sign in to comment.