Skip to content

Commit 9c34154

Browse files
authored
Merge pull request #1 from JuliaComputing/sp/correctws
preserve whitespace
2 parents 5895f49 + 494cbff commit 9c34154

File tree

2 files changed

+37
-7
lines changed

2 files changed

+37
-7
lines changed

src/HTMLSanitizer.jl

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,14 +14,17 @@ Sanitizes the HTML input according to `whitelist`.
1414
- `prettyprint`: Returns a prettier multiline string instead of a somewhat minified version.
1515
"""
1616
function sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIST, prettyprint = false)
17-
doc = parsehtml(input)
17+
input_preserve_ws = replace(input, r"(\s+)"s => s" 🐑\1🐑 ")
18+
doc = parsehtml(input_preserve_ws)
1819

1920
sanitize_bfs(doc.root, whitelist)
2021

2122
out = IOBuffer()
2223
print(out, doc.root, pretty = prettyprint)
2324

2425
out = String(take!(out))
26+
out = replace(out, r"\s?🐑(\s+)🐑\s?"s => s"\1")
27+
2528
if isfragment
2629
out = replace(out, r"^<HTML>" => "")
2730
out = replace(out, r"</HTML>$" => "")
@@ -30,7 +33,9 @@ function sanitize(input::AbstractString; isfragment = true, whitelist = WHITELIS
3033
end
3134
end
3235

33-
reparent!(node, parent) = node.parent = parent
36+
reparent!(_, _) = nothing
37+
38+
reparent!(node::HTMLElement, parent) = node.parent = parent
3439

3540
# HTMLText isn't mutable, so this does nothing. Will lead to inconsistencies, but ¯\_(ツ)_/¯.
3641
reparent!(node::HTMLText, parent) = nothing
@@ -70,7 +75,8 @@ function sanitize_element(el::HTMLElement{TAG}, whitelist) where TAG
7075
return Gumbo.HTMLText("")
7176
end
7277
@debug("Replacing `$(tag)` with its contents.")
73-
return sanitize_element.(el.children, Ref(whitelist))
78+
out = sanitize_element.(el.children, Ref(whitelist))
79+
return isempty(out) ? Gumbo.HTMLText("") : out
7480
end
7581

7682
el = sanitize_attributes(el, whitelist)

test/runtests.jl

Lines changed: 28 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ using Test
9494
<tfoot><tr><td>Sum</td></tr></tfoot>
9595
<tbody><tr><td>1</td></tr></tbody>
9696
</table>"""
97-
@test replace(orig, '\n' => "") == HTMLSanitizer.sanitize(orig)
97+
@test replace(orig, "\n" => "") == replace(HTMLSanitizer.sanitize(orig), "\n" => "")
9898
end
9999

100100
@testset "test_summary_tag_are_not_removed" begin
@@ -116,10 +116,34 @@ using Test
116116
<summary>Baz</summary>
117117
</details>
118118
Qux
119-
</details>
120-
"""
121-
@test replace(orig, r"[\s\n]" => "") == replace(HTMLSanitizer.sanitize(orig), r"[\s\n]" => "")
119+
</details>"""
120+
@test replace(orig, "\n" => "") == replace(HTMLSanitizer.sanitize(orig), "\n" => "")
122121
end
123122
end
124123

124+
@testset "preserve relevant whitespace" begin
125+
orig = """
126+
<!DOCTYPE html>
127+
<html>
128+
<head>
129+
<meta description="test page"></meta>
130+
</head>
131+
<body>
132+
<p>A simple test page.</p>
133+
<a></a>
134+
<a></a>
135+
<pre>
136+
<code>
137+
foo
138+
bar
139+
baz
140+
</code>
141+
</pre>
142+
</body>
143+
</html>
144+
"""
145+
expected = "<HTML>\n\n \n \n \n \n <p>A simple test page.</p>\n <a></a>\n <a></a>\n <pre>\n <code>\nfoo\nbar\nbaz\n </code>\n </pre>\n \n\n</HTML>"
146+
@test sanitize(orig, isfragment=false) == expected
147+
end
148+
125149
include("malicious_html.jl")

0 commit comments

Comments
 (0)