-
Notifications
You must be signed in to change notification settings - Fork 5
/
pgcrawler.rb
executable file
·86 lines (68 loc) · 2.01 KB
/
pgcrawler.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env ruby
require 'net/http'
require 'rubygems'
require 'nokogiri'
require 'open-uri'
# This is the main Spider Class
# Simple recursive web spider: fetches a start page over HTTP, extracts
# every <a href> value with Nokogiri, and visits each discovered URL once.
class Spider
  attr_accessor :address, :port, :path

  # address - hostname of the target (no scheme, e.g. "example.com")
  # port    - TCP port to connect on (default 80)
  def initialize(address, port = 80)
    @urls = Array.new
    @visited_urls = Array.new
    self.address = address
    self.path = '/'
    # BUG FIX: the original hard-coded 80 here, silently ignoring the
    # port argument passed by the caller.
    self.port = port
  end

  # Opens the HTTP connection to the target, collects the initial URL set
  # from the landing page, and spiders each URL not yet visited.
  # Returns the array of visited URLs.
  def connect_to_address
    puts "[.] Collecting initial URLs from the target."
    # BUG FIX: pass the configured port through to Net::HTTP.
    connection = Net::HTTP.new(self.address, self.port)
    # BUG FIX: Net::HTTP#get returns a single response object on any
    # modern Ruby; the old two-element [headers, body] destructuring left
    # body == nil, so parse_source always saw an empty document.
    response = connection.get(self.path)
    if response.code == "200"
      # Parse the landing page for additional URLs (returns an array).
      urls = parse_source(response.body)
      if urls && !urls.empty?
        puts "[.] Collected #{urls.length} URLs."
        puts "[.] Spidering URLs."
        urls.each do |url|
          spider(url) if !@visited_urls.include?(url)
        end
      else
        puts "[-] No URLs found."
        exit!
      end
    else
      puts "[-] Make sure the target is a website."
    end
    return @visited_urls
  end

  # Extracts every anchor href from an HTML body and appends it to @urls.
  # Returns the accumulated @urls array.
  def parse_source(body)
    html_doc = Nokogiri::HTML(body)
    html_doc.xpath('//a/@href').each do |link|
      @urls << link.content
    end
    return @urls
  end

  # Fetches one URL in a background thread and recurses into the links it
  # contains. The URL is marked visited *before* fetching so recursive
  # calls cannot re-fetch it (the original marked it afterwards and never
  # checked the visited list inside the thread, looping forever on
  # cyclic links).
  def spider(url)
    return if @visited_urls.include?(url)
    @visited_urls << url
    puts "[.] Spidering #{url}."
    worker = Thread.new {
      begin
        # BUG FIX: Kernel#open no longer accepts URLs (removed in
        # Ruby 3.0); open-uri's URI.open must be called explicitly.
        if new_page = Nokogiri::HTML(URI.open(url))
          new_page.xpath('//a/@href').each do |link|
            @urls << link.content
            spider(link.content)
          end
        end
      rescue StandardError => e
        # Best-effort crawl: report unreachable/relative URLs and move on
        # instead of letting one bad link kill the thread silently.
        puts "[-] Failed to fetch #{url}: #{e.message}"
      end
    }
    # BUG FIX: join the worker so in-flight fetches are not killed when
    # the main thread exits (the original never joined its threads).
    worker.join
  end
end
# This controls execution flow.
# Entry point: builds a Spider for the host given on the command line,
# crawls it, and reports how many URLs were processed.
def main_function
  # Fail fast with a usage message instead of crashing inside Net::HTTP
  # when no target host is supplied.
  abort "Usage: #{$PROGRAM_NAME} <host>" if ARGV[0].nil? || ARGV[0].empty?
  # 1. Create a new instance of the Spider class and pass it the URL
  #    provided at runtime.
  new_spider = Spider.new(ARGV[0])
  visited = new_spider.connect_to_address
  # FIX: corrected "processesing" typo in the summary message.
  puts "[+] Finished processing #{visited.length} URLs from the target."
end
main_function