-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhtml_snippets.py
69 lines (53 loc) · 1.83 KB
/
html_snippets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import bleach
class _TextCollector(HTMLParser):
    '''
    HTMLParser subclass that records every text node it is fed and ignores tags
    '''

    def __init__(self):
        # Run the real base initialiser instead of only calling reset():
        # some parser versions set extra state there (e.g. convert_charrefs)
        # that feed() relies on. It is called explicitly rather than through
        # super() so it also works when the base is an old-style class.
        HTMLParser.__init__(self)
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

    # NOTE(review): on a parser that does not decode references itself (no
    # convert_charrefs, e.g. the Python 2 HTMLParser this file imports),
    # "&amp;"-style references are routed to handle_entityref/handle_charref,
    # which are not overridden here, so they are dropped from the output.
    # Confirm whether that is acceptable for the callers.


def strip_tags(html):
    '''
    Get an html string and return a text string without html tags

    Only the text nodes are kept, concatenated in document order.
    A falsy input (None or an empty string) returns an empty string.
    '''
    if not html:
        return ''
    collector = _TextCollector()
    collector.feed(html)
    # close() makes the parser flush text it was holding back while waiting
    # for more input (e.g. a trailing "AT&T"); without it that text is lost.
    collector.close()
    return ''.join(collector.chunks)
class _ImgSrcCollector(HTMLParser):
    '''
    HTMLParser subclass that records the src value of every img start tag
    '''

    def __init__(self):
        # Run the real base initialiser instead of only calling reset():
        # some parser versions set extra state there (e.g. convert_charrefs)
        # that feed() relies on. It is called explicitly rather than through
        # super() so it also works when the base is an old-style class.
        HTMLParser.__init__(self)
        self.sources = []

    def handle_starttag(self, tag, attrs):
        if tag != 'img':
            return
        # attrs is a sequence of (name, value) pairs. Take the first src that
        # has a non-empty value; an img without one contributes nothing.
        src = next(
            (value for name, value in attrs if name == 'src' and value),
            None,
        )
        if src is not None:
            self.sources.append(src)


def extract_img_src(html):
    '''
    Get an html string and return a list of img src attr

    The list follows document order and holds at most one entry per img tag.
    img tags whose src is missing or empty are skipped.
    A falsy input (None or an empty string) returns an empty list.
    '''
    if not html:
        return []
    collector = _ImgSrcCollector()
    collector.feed(html)
    # Let the parser process whatever it is still buffering before reading
    # the result.
    collector.close()
    return collector.sources
def secure_html(html):
    '''
    Get an html string and return it sanitised against a fixed whitelist

    The string is first passed through bleach.clean with the tag and
    attribute whitelists below, then through bleach.linkify with bleach's
    nofollow and target_blank callbacks.

    NOTE(review): bleach.clean is called with its default settings, so what
    happens to non-whitelisted markup (escaped vs. removed) is whatever the
    installed bleach version does by default - confirm against its docs.
    '''
    allowed_tags = [
        'strong', 'em', 'b', 'i', 'u', 'span', 'br', 'p', 'div', 'ul', 'ol',
        'li', 'h1', 'h2', 'h3', 'blockquote', 'img', 'a',
    ]
    # Per-tag attribute whitelist; tags not listed here get no entry.
    allowed_attributes = {
        'a': ['href', 'class'],
        'img': ['src', 'alt', 'width', 'height'],
    }

    cleaned = bleach.clean(html, tags=allowed_tags, attributes=allowed_attributes)

    # Callbacks are applied by linkify to the anchors it handles; presumably
    # they add rel="nofollow" and target="_blank" - see the bleach docs.
    link_callbacks = [bleach.callbacks.nofollow, bleach.callbacks.target_blank]
    return bleach.linkify(cleaned, callbacks=link_callbacks)
if __name__ == '__main__':
    # Manual smoke run of the three helpers when the module is executed
    # directly. print is written in the single-argument parenthesised form,
    # which prints the same value under both Python 2 and Python 3.
    print(strip_tags('<div>no tags</div>'))
    print(extract_img_src('<img src="/about" href="#"><img src="" href="#">'))
    print(secure_html('<script>alert("hacker")</script><a>hello</a>'))