[AUDIT][WEBSITE] Find and Parse Sitemap
StanGirard committed Jun 29, 2020
1 parent 1be7a61 commit a2bbb1d
Showing 3 changed files with 143 additions and 0 deletions.
6 changes: 6 additions & 0 deletions toolkit/controller/audit/__init__.py
@@ -0,0 +1,6 @@
import toolkit.controller.audit.site_audit
import toolkit.controller.audit.page_audit




36 changes: 36 additions & 0 deletions toolkit/controller/audit/page_audit.py
@@ -0,0 +1,36 @@
from urllib.parse import urlparse
from toolkit.lib.http_tools import request_page
from bs4 import BeautifulSoup

class AuditPage:
    def __init__(self, url):
        # Split the URL into its components and fetch the page once.
        parsed_url = urlparse(url)
        self.domain = parsed_url.netloc
        self.scheme = parsed_url.scheme
        self.path = parsed_url.path
        self.request = request_page(self.generate_url())
        self.status_code = self.request.status_code
        self.headers = self.request.headers
        self.soup = BeautifulSoup(self.request.content, 'html.parser')


    def __str__(self):
        out = "--------------------\n"
        out += "Domain: " + self.domain + "\n"
        out += "Scheme: " + self.scheme + "\n"
        out += "Path: " + self.path + "\n"
        out += "Status Code: " + str(self.status_code) + "\n"
        out += "Headers: " + str(list(self.headers)) + "\n"
        return out

    def generate_url(self):
        # urlparse keeps the leading "/" on the path, so concatenating the
        # parts directly avoids producing a double slash after the domain.
        return self.scheme + "://" + self.domain + self.path






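A minimal usage sketch of AuditPage (hypothetical URL; it assumes toolkit.lib.http_tools.request_page returns a requests-style response object, which the attribute access above implies):

from toolkit.controller.audit.page_audit import AuditPage

page = AuditPage("https://example.com/about")
print(page)                      # domain, scheme, path, status code, header names
title = page.soup.find("title")  # the parsed DOM is exposed via .soup
print(title.string if title else "no <title> tag")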
101 changes: 101 additions & 0 deletions toolkit/controller/audit/site_audit.py
@@ -0,0 +1,101 @@
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from toolkit.lib.http_tools import request_page


class AuditWebsite:
def __init__(self, url):
parsed_url = urlparse(url)
self.domain = parsed_url.netloc
self.scheme = parsed_url.scheme
self.path = parsed_url.path
self.sitemap = []
self.robots = False
self.populate_request()
self.robots_finder()
self.populate_urls()

def populate_request(self):
self.request = request_page(self.generate_url())
self.status_code = self.request.status_code

    def robots_finder(self):
        # A 200 response on /robots.txt means the file exists; scan it for sitemaps.
        request = request_page(self.generate_url() + "/robots.txt")
        if request.status_code == 200:
            self.robots = True
            self.find_sitemap(request.text)

    def find_sitemap(self, robots):
        # Collect the targets of "Sitemap:" directives ("Sitemaps:" is also
        # tolerated). The directive name is matched case-insensitively, but
        # the URL keeps its original casing.
        self.sitemap = []
        for line in robots.split("\n"):
            parts = line.split()
            if len(parts) > 1 and parts[0].lower() in ("sitemap:", "sitemaps:"):
                self.sitemap.append(parts[1])
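    # Example robots.txt lines find_sitemap recognizes (hypothetical values):
    #   Sitemap: https://example.com/sitemap.xml
    #   Sitemap: https://example.com/sitemap-news.xml
    # Every other directive (User-agent, Disallow, ...) is ignored.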

    def populate_urls(self):
        # Flatten every discovered sitemap into one de-duplicated URL list.
        list_urls = []
        self.urls = []
        for sitemap_url in self.sitemap:
            for url in parse_sitemap(sitemap_url):
                if url not in list_urls:
                    list_urls.append(url)
        self.urls = list_urls


def generate_url(self):
return self.scheme + "://" + self.domain





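# For reference, the two XML shapes parse_sitemap handles (hypothetical,
# minimal examples; the namespace is the standard sitemaps.org schema):
#
#   Sitemap index, i.e. <sitemap> entries pointing at further sitemaps:
#     <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#       <sitemap><loc>https://example.com/sitemap-posts.xml</loc></sitemap>
#     </sitemapindex>
#
#   Plain sitemap, i.e. <url> entries naming individual pages:
#     <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#       <url><loc>https://example.com/page-1</loc></url>
#     </urlset>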
def parse_sitemap(url):
    resp = requests.get(url)
    # No valid response: return an empty list so callers can iterate safely.
    if resp.status_code != 200:
        return []

    # Parse the document as XML (BeautifulSoup's "xml" mode requires lxml).
    soup = BeautifulSoup(resp.content, "xml")

    # A plain sitemap holds <url> tags; a sitemap index holds <sitemap> tags.
    urls = soup.find_all("url")
    sitemaps = soup.find_all("sitemap")
    collected = []

    if not urls and not sitemaps:
        return []

    # Recursive call if this sitemap is an index of further sitemaps.
    for sitemap in sitemaps:
        loc = sitemap.find("loc")
        if loc and loc.string:
            collected += parse_sitemap(loc.string)

    # Extract the <loc> of every <url> entry.
    out = []
    for u in urls:
        loc = u.find("loc")
        out.append(loc.string if loc else "None")

    # Return a flat list of page URLs.
    return collected + out

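A rough end-to-end sketch (hypothetical URLs; it assumes the target site serves a robots.txt listing its sitemaps, and that lxml is installed for the XML parser):

from toolkit.controller.audit.site_audit import AuditWebsite, parse_sitemap

site = AuditWebsite("https://example.com")
print(site.robots)     # True if /robots.txt answered with HTTP 200
print(len(site.urls))  # page URLs collected from every listed sitemap

# parse_sitemap can also be pointed at a known sitemap directly;
# sitemap indexes are followed recursively.
urls = parse_sitemap("https://example.com/sitemap.xml")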