-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDocument.java
598 lines (527 loc) · 19.9 KB
/
Document.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
package org.jsoup.nodes;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.List;
/**
 An HTML Document: the root element of a document tree. Besides the inherited element state it tracks the output
 settings, the parser, the quirks mode, the location, and whether the charset element is kept in sync.
@author Jonathan Hedley, [email protected] */
public class Document extends Element {
private OutputSettings outputSettings = new OutputSettings(); // serialization options; replaceable via outputSettings(OutputSettings)
private Parser parser; // the parser used to parse this document
private QuirksMode quirksMode = QuirksMode.noQuirks; // starts as noQuirks; changed via quirksMode(QuirksMode)
private String location; // set from the base URI handed to the constructor
private boolean updateMetaCharset = false; // when true, ensureMetaCharsetElement() rewrites the charset element
/**
 Construct a new Document with no content, rooted at a synthetic {@code #root} tag. The supplied base URI is also
 recorded as the document's {@link #location() location}.
 @param baseUri base URI of document
 @see org.jsoup.Jsoup#parse
 @see #createShell
 */
public Document(String baseUri) {
    super(Tag.valueOf("#root", ParseSettings.htmlDefault), baseUri);
    location = baseUri;
}
/**
 Create a valid, empty shell of a document, suitable for adding more elements to.
 <p>
 The shell's parser is left unset; call {@link #parser(Parser)} to configure one.
 @param baseUri baseUri of document; must not be null
 @return document with html, head, and body elements.
 */
public static Document createShell(String baseUri) {
    Validate.notNull(baseUri);

    Document doc = new Document(baseUri);
    // build the minimal skeleton: <html><head></head><body></body></html>
    Element html = doc.appendElement("html");
    html.appendElement("head");
    html.appendElement("body");

    return doc;
}
/**
 * Returns the URL from which this Document was parsed. When the starting URL was a redirect, this is the final
 * URL that the document was actually served from.
 * @return location
 */
public String location() {
    return this.location;
}
/**
 Locate the document's {@code head} element (the first element named {@code head} found when searching from the
 root).
 @return {@code head}, or {@code null} if the document contains no such element
 */
public Element head() {
    final String headTag = "head";
    return findFirstElementByTagName(headTag, this);
}
/**
 Locate the document's {@code body} element (the first element named {@code body} found when searching from the
 root).
 @return {@code body}, or {@code null} if the document contains no such element
 */
public Element body() {
    final String bodyTag = "body";
    return findFirstElementByTagName(bodyTag, this);
}
/**
 Read the text of the document's first {@code title} element, with whitespace normalised.
 @return Trimmed title, or empty string if none set.
 */
public String title() {
    Element titleEl = getElementsByTag("title").first();
    if (titleEl == null) {
        return "";
    }
    // title preserves whitespace for document output, so it is normalised here on the way out
    String normalised = StringUtil.normaliseWhitespace(titleEl.text());
    return normalised.trim();
}
/**
 Set the text of the document's {@code title} element. The first existing {@code title} is updated; when there is
 none, a new {@code title} is appended to {@code head}.
 <p>
 Note: if the document has no {@code head} element, {@link #head()} returns null and adding the title fails with
 a {@link NullPointerException}.
 @param title string to set as title; must not be null
 */
public void title(String title) {
    Validate.notNull(title);

    Element existing = getElementsByTag("title").first();
    // fall back to a fresh <title> inside <head> when none is present yet
    Element target = existing != null ? existing : head().appendElement("title");
    target.text(title);
}
/**
 Build a detached Element that carries this document's base uri. The element is not added to this document; the
 tag is resolved with {@code ParseSettings.preserveCase}.
 @param tagName element tag name (e.g. {@code a})
 @return new element
 */
public Element createElement(String tagName) {
    Tag tag = Tag.valueOf(tagName, ParseSettings.preserveCase);
    return new Element(tag, this.baseUri());
}
/**
 Normalise the document. This happens after the parse phase so generally does not need to be called.
 Ensures {@code html}, {@code head} and {@code body} elements exist, moves any non-blank text that sits outside
 the body into the body, folds duplicate {@code head} / {@code body} elements into one, and finally refreshes the
 charset element if that feature is enabled.
 @return this document after normalisation
 */
public Document normalise() {
    Element htmlEl = findFirstElementByTagName("html", this);
    if (htmlEl == null) {
        htmlEl = appendElement("html");
    }
    if (head() == null) {
        htmlEl.prependElement("head");
    }
    if (body() == null) {
        htmlEl.appendElement("body");
    }

    // Pull text nodes out of head, html, and root (in that order) and push them into body. Non-text nodes are
    // already taken care of. The inverse order keeps the text in its original sequence.
    Element[] textHolders = {head(), htmlEl, this};
    for (Element holder : textHolders) {
        normaliseTextNodes(holder);
    }

    String[] singletons = {"head", "body"};
    for (String tag : singletons) {
        normaliseStructure(tag, htmlEl);
    }

    ensureMetaCharsetElement();

    return this;
}
// Moves the non-blank text nodes that are direct children of the given element to the front of <body>, each
// followed by a single-space text node. Only direct children are examined (no recursion).
private void normaliseTextNodes(Element element) {
    List<Node> strays = new ArrayList<>();
    for (Node child : element.childNodes) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        TextNode textNode = (TextNode) child;
        if (!textNode.isBlank()) {
            strays.add(textNode);
        }
    }

    // walk backwards: every node is prepended, so reverse traversal preserves the original text order
    int idx = strays.size();
    while (--idx >= 0) {
        Node stray = strays.get(idx);
        element.removeChild(stray);
        body().prependChild(new TextNode(" "));
        body().prependChild(stray);
    }
}
// Collapses every element with the given tag (<head> or <body>) into the first one found: the children of the
// duplicates are appended to the first, the duplicates are removed, and the survivor is re-parented under <html>
// if it is not already there.
private void normaliseStructure(String tag, Element htmlEl) {
    Elements matches = this.getElementsByTag(tag);
    Element master = matches.first(); // always present: normalise() creates it when missing
    final int count = matches.size();

    if (count > 1) { // duplicates exist, so gather their children and drop them
        List<Node> orphans = new ArrayList<>();
        for (int i = 1; i < count; i++) {
            Node dupe = matches.get(i);
            orphans.addAll(dupe.ensureChildNodes());
            dupe.remove();
        }
        for (Node orphan : orphans) {
            master.appendChild(orphan);
        }
    }

    // make sure <html> is the parent
    boolean ownedByHtml = master.parent().equals(htmlEl);
    if (!ownedByHtml) {
        htmlEl.appendChild(master); // includes remove()
    }
}
// Depth-first search for the first node whose nodeName equals the given tag, starting with the supplied node
// itself. Used by the html, head and body finders. Returns null when nothing matches.
private Element findFirstElementByTagName(String tag, Node node) {
    if (node.nodeName().equals(tag)) {
        return (Element) node;
    }

    final int childCount = node.childNodeSize();
    int idx = 0;
    while (idx < childCount) {
        Element match = findFirstElementByTagName(tag, node.childNode(idx));
        if (match != null) {
            return match;
        }
        idx++;
    }
    return null;
}
@Override
public String outerHtml() {
    // a Document has no wrapper tag of its own, so its outer HTML is simply its inner HTML
    String inner = super.html();
    return inner;
}
/**
 Replace the content of this document's {@code body} with the given text. Any existing nodes within the body will
 be cleared. The text is applied to the body rather than the root so the document structure is not destroyed.
 @param text unencoded text
 @return this document
 */
@Override
public Element text(String text) {
    Element bodyEl = body();
    bodyEl.text(text);
    return this;
}
@Override
public String nodeName() {
    // fixed node name for document nodes
    final String name = "#document";
    return name;
}
/**
 * Sets the charset used in this document. This method is equivalent
 * to {@link OutputSettings#charset(java.nio.charset.Charset)
 * OutputSettings.charset(Charset)} but in addition it updates the
 * charset / encoding element within the document.
 *
 * <p>Calling this method switches on
 * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
 *
 * <p>If there's no element with charset / encoding information yet it will
 * be created. Obsolete charset / encoding definitions are removed!</p>
 *
 * <p><b>Elements used:</b></p>
 *
 * <ul>
 * <li><b>Html:</b> {@code <meta charset="CHARSET">}</li>
 * <li><b>Xml:</b> {@code <?xml version="1.0" encoding="CHARSET">}</li>
 * </ul>
 *
 * @param charset Charset
 *
 * @see #updateMetaCharsetElement(boolean)
 * @see OutputSettings#charset(java.nio.charset.Charset)
 */
public void charset(Charset charset) {
    // order matters: enable the update flag first, then store the charset, then sync the element
    this.updateMetaCharsetElement(true);
    this.outputSettings.charset(charset);
    this.ensureMetaCharsetElement();
}
/**
 * Returns the charset used in this document, as held by the output settings. This method is equivalent
 * to {@link OutputSettings#charset()}.
 *
 * @return Current Charset
 *
 * @see OutputSettings#charset()
 */
public Charset charset() {
    return this.outputSettings.charset();
}
/**
 * Sets whether the element with charset information in this document is
 * updated on changes through {@link #charset(java.nio.charset.Charset)
 * Document.charset(Charset)} or not.
 *
 * <p>If set to {@code false} <i>(default)</i> there are no elements
 * modified.</p>
 *
 * @param update {@code true} to have the element updated on charset
 * changes, {@code false} to leave it alone
 *
 * @see #charset(java.nio.charset.Charset)
 */
public void updateMetaCharsetElement(boolean update) {
    updateMetaCharset = update;
}
/**
 * Returns whether the element with charset information in this document is
 * updated on changes through {@link #charset(java.nio.charset.Charset)
 * Document.charset(Charset)} or not.
 *
 * @return {@code true} if the element is updated on charset
 * changes, {@code false} if not
 */
public boolean updateMetaCharsetElement() {
    return this.updateMetaCharset;
}
@Override
public Document clone() {
    Document copy = (Document) super.clone();
    // give the copy its own output settings so later changes do not leak between the two documents
    copy.outputSettings = outputSettings.clone();
    return copy;
}
/**
 * Ensures a meta charset (html) or xml declaration (xml) with the current
 * encoding used. This only applies with
 * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
 * {@code true}, otherwise this method does nothing.
 *
 * <ul>
 * <li>An existing element gets updated with the current charset</li>
 * <li>If there's no element yet it will be inserted (for html, only when a
 * {@code head} element exists; for xml, also when the document has no child
 * nodes at all)</li>
 * <li>Obsolete elements are removed</li>
 * </ul>
 *
 * <p><b>Elements used:</b></p>
 *
 * <ul>
 * <li><b>Html:</b> {@code <meta charset="CHARSET">}</li>
 * <li><b>Xml:</b> {@code <?xml version="1.0" encoding="CHARSET">}</li>
 * </ul>
 */
private void ensureMetaCharsetElement() {
    if (!updateMetaCharset) {
        return;
    }

    OutputSettings.Syntax syntax = outputSettings().syntax();

    if (syntax == OutputSettings.Syntax.html) {
        Element metaCharset = select("meta[charset]").first();

        if (metaCharset != null) {
            metaCharset.attr("charset", charset().displayName());
        } else {
            Element head = head();

            if (head != null) {
                head.appendElement("meta").attr("charset", charset().displayName());
            }
        }

        // Remove obsolete elements
        select("meta[name=charset]").remove();
    } else if (syntax == OutputSettings.Syntax.xml) {
        // A document without children has no first node to inspect; asking for index 0 would throw
        Node first = childNodeSize() > 0 ? childNode(0) : null;

        if (first instanceof XmlDeclaration && ((XmlDeclaration) first).name().equals("xml")) {
            // an xml declaration already leads the document: update it in place
            XmlDeclaration decl = (XmlDeclaration) first;
            decl.attr("encoding", charset().displayName());

            // NOTE(review): if attr() never returns null this check is always true - confirm
            final String version = decl.attr("version");
            if (version != null) {
                decl.attr("version", "1.0");
            }
        } else {
            // no leading xml declaration (empty document, other node type, or a differently named
            // declaration): insert a fresh one at the front
            XmlDeclaration decl = new XmlDeclaration("xml", false);
            decl.attr("version", "1.0");
            decl.attr("encoding", charset().displayName());

            prependChild(decl);
        }
    }
}
/**
 * A Document's output settings control the form of the text() and html() methods.
 */
public static class OutputSettings implements Cloneable {
    /**
     * The output serialization syntax.
     */
    public enum Syntax {html, xml}

    private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
    private Charset charset;
    // per-thread encoder; initialized by start of OuterHtmlVisitor via prepareEncoder()
    private ThreadLocal<CharsetEncoder> encoderThreadLocal = new ThreadLocal<>();
    Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8

    private boolean prettyPrint = true;
    private boolean outline = false;
    private int indentAmount = 1;
    private Syntax syntax = Syntax.html;

    /**
     * Create output settings with the defaults: UTF-8 charset, {@code base} escape mode, {@code html} syntax,
     * pretty printing on, outline off, indent amount 1.
     */
    public OutputSettings() {
        charset(Charset.forName("UTF8"));
    }

    /**
     * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
     * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
     * which uses the complete set of HTML named entities.
     * <p>
     * The default escape mode is <code>base</code>.
     * @return the document's current escape mode
     */
    public Entities.EscapeMode escapeMode() {
        return escapeMode;
    }

    /**
     * Set the document's escape mode, which determines how characters are escaped when the output character set
     * does not support a given character:- using either a named or a numbered escape.
     * @param escapeMode the new escape mode to use
     * @return the document's output settings, for chaining
     */
    public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
        this.escapeMode = escapeMode;
        return this;
    }

    /**
     * Get the document's current output charset, which is used to control which characters are escaped when
     * generating HTML (via the <code>html()</code> methods), and which are kept intact.
     * <p>
     * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
     * input charset. Otherwise, it defaults to UTF-8.
     * @return the document's current charset.
     */
    public Charset charset() {
        return charset;
    }

    /**
     * Update the document's output charset.
     * @param charset the new charset to use.
     * @return the document's output settings, for chaining
     */
    public OutputSettings charset(Charset charset) {
        this.charset = charset;
        return this;
    }

    /**
     * Update the document's output charset.
     * @param charset the new charset (by name) to use.
     * @return the document's output settings, for chaining
     */
    public OutputSettings charset(String charset) {
        charset(Charset.forName(charset));
        return this;
    }

    /**
     * Create a new encoder for the current charset, store it for the calling thread, and refresh
     * {@code coreCharset} to match.
     * @return the newly created encoder
     */
    CharsetEncoder prepareEncoder() {
        // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads
        CharsetEncoder encoder = charset.newEncoder();
        encoderThreadLocal.set(encoder);
        coreCharset = Entities.CoreCharset.byName(encoder.charset().name());
        return encoder;
    }

    /**
     * Get the calling thread's encoder, preparing one first if this thread has none yet.
     * @return the encoder for the calling thread
     */
    CharsetEncoder encoder() {
        CharsetEncoder encoder = encoderThreadLocal.get();
        return encoder != null ? encoder : prepareEncoder();
    }

    /**
     * Get the document's current output syntax.
     * @return current syntax
     */
    public Syntax syntax() {
        return syntax;
    }

    /**
     * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
     * {@code xml}, with self-closing tags.
     * @param syntax serialization syntax
     * @return the document's output settings, for chaining
     */
    public OutputSettings syntax(Syntax syntax) {
        this.syntax = syntax;
        return this;
    }

    /**
     * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
     * the output, and the output will generally look like the input.
     * @return if pretty printing is enabled.
     */
    public boolean prettyPrint() {
        return prettyPrint;
    }

    /**
     * Enable or disable pretty printing.
     * @param pretty new pretty print setting
     * @return this, for chaining
     */
    public OutputSettings prettyPrint(boolean pretty) {
        prettyPrint = pretty;
        return this;
    }

    /**
     * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
     * all tags as block.
     * @return if outline mode is enabled.
     */
    public boolean outline() {
        return outline;
    }

    /**
     * Enable or disable HTML outline mode.
     * @param outlineMode new outline setting
     * @return this, for chaining
     */
    public OutputSettings outline(boolean outlineMode) {
        outline = outlineMode;
        return this;
    }

    /**
     * Get the current tag indent amount, used when pretty printing.
     * @return the current indent amount
     */
    public int indentAmount() {
        return indentAmount;
    }

    /**
     * Set the indent amount for pretty printing
     * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
     * @return this, for chaining
     */
    public OutputSettings indentAmount(int indentAmount) {
        Validate.isTrue(indentAmount >= 0);
        this.indentAmount = indentAmount;
        return this;
    }

    /**
     * Create an independent copy of these settings. The copy has the same charset, escape mode, syntax and
     * formatting flags, but its own per-thread encoder storage.
     * @return the copied settings
     */
    @Override
    public OutputSettings clone() {
        OutputSettings clone;
        try {
            clone = (OutputSettings) super.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
        clone.charset(charset.name());
        // Object.clone() copies the ThreadLocal reference; give the copy its own storage so that an encoder
        // prepared for one instance is never handed back by the other instance's encoder()
        clone.encoderThreadLocal = new ThreadLocal<>();
        clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
        // indentAmount, prettyPrint are primitives so object.clone() will handle
        return clone;
    }
}
/**
 * Returns the output settings currently attached to this document. The live object is returned, not a copy.
 * @return the document's current output settings.
 */
public OutputSettings outputSettings() {
    return this.outputSettings;
}
/**
 * Replace the document's output settings.
 * @param outputSettings new output settings; must not be null
 * @return this document, for chaining.
 */
public Document outputSettings(OutputSettings outputSettings) {
    Validate.notNull(outputSettings);

    this.outputSettings = outputSettings;
    return this;
}
/**
 * The quirks modes a Document can be flagged with.
 */
public enum QuirksMode {
    noQuirks,
    quirks,
    limitedQuirks
}
/**
 * Get the quirks mode currently recorded for this document.
 * @return the current quirks mode
 */
public QuirksMode quirksMode() {
    return this.quirksMode;
}
/**
 * Record the quirks mode for this document.
 * @param quirksMode the quirks mode to store
 * @return this document, for chaining
 */
public Document quirksMode(QuirksMode quirksMode) {
    this.quirksMode = quirksMode;

    return this;
}
/**
 * Returns the parser associated with this document, i.e. the one that was used to parse it.
 * @return the parser, or {@code null} if none has been set
 */
public Parser parser() {
    return this.parser;
}
/**
 * Associate a parser with this document. This parser is then used when further parsing within this document
 * is required.
 * @param parser the configured parser to use when further parsing is required for this document.
 * @return this document, for chaining.
 */
public Document parser(Parser parser) {
    this.parser = parser;

    return this;
}
}