-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathXmlTreeBuilder.java
151 lines (133 loc) · 5.13 KB
/
XmlTreeBuilder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
package org.jsoup.parser;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
/**
 * Tree builder for XML input. Use the {@code XmlTreeBuilder} when a document should be parsed as plain XML, without
 * any of the HTML DOM rules being applied to it.
 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
 *
 * @author Jonathan Hedley
 */
public class XmlTreeBuilder extends TreeBuilder {

    /**
     * Supplies the parse settings this builder uses by default.
     *
     * @return {@link ParseSettings#preserveCase}
     */
    ParseSettings defaultSettings() {
        return ParseSettings.preserveCase;
    }

    /**
     * Prepares this builder for a new parse: runs the superclass initialisation, pushes the document onto the stack
     * and switches the document's output syntax to XML.
     *
     * @param input   source of the markup to parse
     * @param baseUri base URI handed to the superclass initialisation
     * @param parser  parser handed to the superclass initialisation
     */
    @Override
    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
        super.initialiseParse(input, baseUri, parser);
        // Unlike HtmlTreeBuilder (where the document is not on the stack), the document itself is the
        // bottom entry of the stack here.
        stack.add(doc);
        doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    }

    /**
     * Parses the given reader with a new {@link Parser} wrapped around this builder.
     *
     * @param input   source of the markup to parse
     * @param baseUri base URI for the parse
     * @return the parsed document
     */
    Document parse(Reader input, String baseUri) {
        Parser ownParser = new Parser(this);
        return parse(input, baseUri, ownParser);
    }

    /**
     * Parses the given string with a new {@link Parser} wrapped around this builder.
     *
     * @param input   markup to parse
     * @param baseUri base URI for the parse
     * @return the parsed document
     */
    Document parse(String input, String baseUri) {
        Reader reader = new StringReader(input);
        return parse(reader, baseUri, new Parser(this));
    }

    /**
     * Dispatches one token to the matching handler. Start tags, comments, character data and doctypes are
     * inserted as nodes; end tags pop the stack; EOF is a no-op. Any other token type is reported through
     * {@link Validate#fail(String)}.
     *
     * @param token token to handle
     * @return always {@code true} when the method returns normally
     */
    @Override
    protected boolean process(Token token) {
        // handled types: doctype, start tag, end tag, comment, character, eof
        switch (token.type) {
            case Doctype:
                insert(token.asDoctype());
                break;
            case StartTag:
                insert(token.asStartTag());
                break;
            case EndTag:
                popStackToClose(token.asEndTag());
                break;
            case Character:
                insert(token.asCharacter());
                break;
            case Comment:
                insert(token.asComment());
                break;
            case EOF:
                // nothing to do; some normalisation could be placed here if desired
                break;
            default:
                Validate.fail("Unexpected token type: " + token.type);
        }
        return true;
    }

    /**
     * Appends the node as a child of the current element.
     *
     * @param node node to append
     */
    private void insertNode(Node node) {
        Element parent = currentElement();
        parent.appendChild(node);
    }

    /**
     * Creates an element for the start tag and appends it to the current element. An element that is not
     * self-closing is pushed onto the stack so that following nodes become its children.
     *
     * @param startTag start tag token to turn into an element
     * @return the newly created element
     */
    Element insert(Token.StartTag startTag) {
        // todo: wonder if for xml parsing, should treat all tags as unknown? because it's not html.
        Tag tag = Tag.valueOf(startTag.name(), settings);
        Element element = new Element(tag, baseUri, settings.normalizeAttributes(startTag.attributes));
        insertNode(element);

        if (!startTag.isSelfClosing()) {
            stack.add(element);
        } else if (!tag.isKnownTag()) {
            // unknown tag: remember that it is self closing, for output (see the todo above)
            tag.setSelfClosing();
        }
        return element;
    }

    /**
     * Appends a comment node for the token. A bogus comment that is an XML declaration is appended as an
     * {@link XmlDeclaration} instead, when it can be converted to one.
     *
     * @param commentToken comment token to insert
     */
    void insert(Token.Comment commentToken) {
        Comment comment = new Comment(commentToken.getData());

        // XML declarations are emitted as bogus comments (which is right for html, but not for xml), so as a
        // bit of a hack the comment data is parsed as an element to pull the attributes out.
        if (commentToken.bogus && comment.isXmlDeclaration()) {
            XmlDeclaration declaration = comment.asXmlDeclaration();
            if (declaration != null) {
                insertNode(declaration);
                return;
            }
            // otherwise it could not be parsed as a declaration, so it stays a comment
        }
        insertNode(comment);
    }

    /**
     * Appends the token's character data as a {@link CDataNode} when the token is CDATA, otherwise as a
     * {@link TextNode}.
     *
     * @param token character token to insert
     */
    void insert(Token.Character token) {
        final String text = token.getData();
        final Node textNode;
        if (token.isCData()) {
            textNode = new CDataNode(text);
        } else {
            textNode = new TextNode(text);
        }
        insertNode(textNode);
    }

    /**
     * Appends a {@link DocumentType} node built from the doctype token's name (normalised through the current
     * settings), public identifier, system identifier and pub/sys key.
     *
     * @param d doctype token to insert
     */
    void insert(Token.Doctype d) {
        String name = settings.normalizeTag(d.getName());
        DocumentType doctype = new DocumentType(name, d.getPublicIdentifier(), d.getSystemIdentifier());
        doctype.setPubSysKey(d.getPubSysKey());
        insertNode(doctype);
    }

    /**
     * If the stack contains an element with this tag's name, pops the stack down to and including the occurrence
     * nearest the top. If no such element is found, the stack is left untouched.
     *
     * @param endTag tag to close
     */
    private void popStackToClose(Token.EndTag endTag) {
        final String targetName = settings.normalizeTag(endTag.tagName);

        // search from the top of the stack for the nearest element with a matching name
        int matchIndex = -1;
        for (int i = stack.size() - 1; i >= 0; i--) {
            if (stack.get(i).nodeName().equals(targetName)) {
                matchIndex = i;
                break;
            }
        }
        if (matchIndex < 0) {
            return; // not found, skip
        }

        // drop everything above the match, and the match itself
        for (int i = stack.size() - 1; i >= matchIndex; i--) {
            stack.remove(i);
        }
    }

    /**
     * Parses a fragment by initialising a fresh parse over the input, running the parser, and handing back the
     * child nodes of the resulting document.
     *
     * @param inputFragment markup to parse
     * @param baseUri       base URI for the parse
     * @param parser        parser to initialise the parse with
     * @return the child nodes of the parsed document
     */
    List<Node> parseFragment(String inputFragment, String baseUri, Parser parser) {
        Reader fragmentReader = new StringReader(inputFragment);
        initialiseParse(fragmentReader, baseUri, parser);
        runParser();
        return doc.childNodes();
    }

    /**
     * Parses a fragment; the {@code context} element is not used, and the call is forwarded to
     * {@link #parseFragment(String, String, Parser)}.
     *
     * @param inputFragment markup to parse
     * @param context       ignored by this builder
     * @param baseUri       base URI for the parse
     * @param parser        parser to initialise the parse with
     * @return the child nodes of the parsed document
     */
    List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser) {
        return parseFragment(inputFragment, baseUri, parser);
    }
}