-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJsoup.java
293 lines (273 loc) · 11.1 KB
/
Jsoup.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
package org.jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.HttpConnection;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
/**
 * Main public entry point to jsoup. Every operation is exposed as a static
 * method: parsing HTML from strings, files, streams and URLs, opening
 * connections, and cleaning untrusted HTML against a whitelist.
 *
 * @author Jonathan Hedley
 */
public class Jsoup {
    /** Static-only facade; never instantiated. */
    private Jsoup() {
    }

    /**
     * Parse an HTML string into a Document. The parser builds a sensible,
     * balanced document tree out of any HTML.
     *
     * @param html HTML to parse
     * @param baseUri the URL the HTML was retrieved from; used to resolve
     * relative URLs to absolute URLs where they occur before the HTML
     * declares a {@code <base href>} tag.
     * @return the parsed document
     */
    public static Document parse(String html, String baseUri) {
        return Parser.parse(html, baseUri);
    }

    /**
     * Parse an HTML string into a Document with a caller-supplied Parser.
     * Use this to plug in an alternate parser, such as a simple XML
     * (non-HTML) parser.
     *
     * @param html HTML to parse
     * @param baseUri the URL the HTML was retrieved from; used to resolve
     * relative URLs to absolute URLs where they occur before the HTML
     * declares a {@code <base href>} tag.
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return the parsed document
     */
    public static Document parse(String html, String baseUri, Parser parser) {
        final Document document = parser.parseInput(html, baseUri);
        return document;
    }

    /**
     * Parse an HTML string into a Document without a base URI. Absolute URL
     * detection then relies on the HTML including a {@code <base href>} tag.
     *
     * @param html HTML to parse
     * @return the parsed document
     *
     * @see #parse(String, String)
     */
    public static Document parse(String html) {
        // An empty base URI stands for "none supplied".
        return parse(html, "");
    }

    /**
     * Create a new {@link Connection} to a URL, for fetching and parsing a
     * HTML page.
     * <p>
     * Use examples:
     * <ul>
     * <li><code>Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "jsoup").get();</code></li>
     * <li><code>Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();</code></li>
     * </ul>
     *
     * @param url URL to connect to. The protocol must be {@code http} or
     * {@code https}.
     * @return the connection. You can add data, cookies, and headers; set the
     * user-agent, referrer, method; and then execute.
     */
    public static Connection connect(String url) {
        final Connection connection = HttpConnection.connect(url);
        return connection;
    }

    /**
     * Load a file and parse its contents as HTML.
     *
     * @param in file to load HTML from
     * @param charsetName (optional) character set of the file contents. Pass
     * {@code null} to determine it from the {@code http-equiv} meta tag, if
     * present, or fall back to {@code UTF-8} (which is often safe to do).
     * @param baseUri the URL the HTML was retrieved from, to resolve relative
     * links against.
     * @return the parsed document
     *
     * @throws IOException if the file could not be found, or read, or if the
     * charsetName is invalid.
     */
    public static Document parse(File in, String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Load a file and parse its contents as HTML, using the file's absolute
     * path as the base URI to qualify relative URLs.
     *
     * @param in file to load HTML from
     * @param charsetName (optional) character set of the file contents. Pass
     * {@code null} to determine it from the {@code http-equiv} meta tag, if
     * present, or fall back to {@code UTF-8} (which is often safe to do).
     * @return the parsed document
     *
     * @throws IOException if the file could not be found, or read, or if the
     * charsetName is invalid.
     * @see #parse(File, String, String)
     */
    public static Document parse(File in, String charsetName) throws IOException {
        return parse(in, charsetName, in.getAbsolutePath());
    }

    /**
     * Read an input stream and parse its contents into a Document.
     *
     * @param in input stream to read. Make sure to close it after parsing.
     * @param charsetName (optional) character set of the stream contents.
     * Pass {@code null} to determine it from the {@code http-equiv} meta tag,
     * if present, or fall back to {@code UTF-8} (which is often safe to do).
     * @param baseUri the URL the HTML was retrieved from, to resolve relative
     * links against.
     * @return the parsed document
     *
     * @throws IOException if the stream could not be read, or if the
     * charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Read an input stream and parse its contents into a Document with a
     * caller-supplied Parser. Use this to plug in an alternate parser, such
     * as a simple XML (non-HTML) parser.
     *
     * @param in input stream to read. Make sure to close it after parsing.
     * @param charsetName (optional) character set of the stream contents.
     * Pass {@code null} to determine it from the {@code http-equiv} meta tag,
     * if present, or fall back to {@code UTF-8} (which is often safe to do).
     * @param baseUri the URL the HTML was retrieved from, to resolve relative
     * links against.
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return the parsed document
     *
     * @throws IOException if the stream could not be read, or if the
     * charsetName is invalid.
     */
    public static Document parse(InputStream in, String charsetName, String baseUri, Parser parser) throws IOException {
        return DataUtil.load(in, charsetName, baseUri, parser);
    }

    /**
     * Parse an HTML fragment, assuming it forms the {@code body} of the
     * HTML.
     *
     * @param bodyHtml body HTML fragment
     * @param baseUri URL to resolve relative URLs against.
     * @return a document wrapping the parsed fragment
     *
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        return Parser.parseBodyFragment(bodyHtml, baseUri);
    }

    /**
     * Parse an HTML fragment without a base URI, assuming it forms the
     * {@code body} of the HTML.
     *
     * @param bodyHtml body HTML fragment
     * @return a document wrapping the parsed fragment
     *
     * @see Document#body()
     */
    public static Document parseBodyFragment(String bodyHtml) {
        // An empty base URI stands for "none supplied".
        return parseBodyFragment(bodyHtml, "");
    }

    /**
     * Fetch a URL with a GET and parse the response as HTML. Provided for
     * compatibility; in most cases use {@link #connect(String)} instead.
     * <p>
     * The encoding character set is determined by the content-type header or
     * http-equiv meta tag, or falls back to {@code UTF-8}.
     *
     * @param url URL to fetch (with a GET). The protocol must be {@code http}
     * or {@code https}.
     * @param timeoutMillis connection and read timeout, in milliseconds. If
     * exceeded, IOException is thrown.
     * @return the parsed HTML.
     *
     * @throws java.net.MalformedURLException if the request URL is not a HTTP
     * or HTTPS URL, or is otherwise malformed
     * @throws HttpStatusException if the response is not OK and HTTP response
     * errors are not ignored
     * @throws UnsupportedMimeTypeException if the response mime type is not
     * supported and those errors are not ignored
     * @throws java.net.SocketTimeoutException if the connection times out
     * @throws IOException if a connection or read error occurs
     *
     * @see #connect(String)
     */
    public static Document parse(URL url, int timeoutMillis) throws IOException {
        final Connection connection = HttpConnection.connect(url);
        connection.timeout(timeoutMillis);
        return connection.get();
    }

    /**
     * Produce safe HTML from untrusted input HTML: the input is parsed as a
     * body fragment and filtered through a white-list of permitted tags and
     * attributes.
     *
     * @param bodyHtml input untrusted HTML (body fragment)
     * @param baseUri URL to resolve relative URLs against
     * @param whitelist white-list of permitted HTML elements
     * @return safe HTML (body fragment)
     *
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
        return cleanToDocument(bodyHtml, baseUri, whitelist).body().html();
    }

    /**
     * Produce safe HTML from untrusted input HTML, with no base URI: the
     * input is parsed as a body fragment and filtered through a white-list of
     * permitted tags and attributes.
     *
     * @param bodyHtml input untrusted HTML (body fragment)
     * @param whitelist white-list of permitted HTML elements
     * @return safe HTML (body fragment)
     *
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, Whitelist whitelist) {
        final String noBaseUri = "";
        return clean(bodyHtml, noBaseUri, whitelist);
    }

    /**
     * Produce safe HTML from untrusted input HTML, rendered with the given
     * output settings: the input is parsed and filtered through a white-list
     * of permitted tags and attributes.
     * <p>
     * The HTML is treated as a body fragment; it's expected the cleaned HTML
     * will be used within the body of an existing document. If you want to
     * clean full documents, use {@link Cleaner#clean(Document)} instead, and
     * add structural tags (<code>html, head, body</code> etc) to the whitelist.
     *
     * @param bodyHtml input untrusted HTML (body fragment)
     * @param baseUri URL to resolve relative URLs against
     * @param whitelist white-list of permitted HTML elements
     * @param outputSettings document output settings; use to control
     * pretty-printing and entity escape modes
     * @return safe HTML (body fragment)
     * @see Cleaner#clean(Document)
     */
    public static String clean(String bodyHtml, String baseUri, Whitelist whitelist, Document.OutputSettings outputSettings) {
        final Document sanitized = cleanToDocument(bodyHtml, baseUri, whitelist);
        // Settings are applied to the cleaned document before it is serialised.
        sanitized.outputSettings(outputSettings);
        return sanitized.body().html();
    }

    /**
     * Test whether the input body HTML contains only tags and attributes
     * allowed by the Whitelist. Useful for form validation.
     * <p>
     * The input HTML should still be run through the cleaner to set up
     * enforced attributes, and to tidy the output.
     * <p>
     * Assumes the HTML is a body fragment (i.e. will be used in an existing
     * HTML document body.)
     *
     * @param bodyHtml HTML to test
     * @param whitelist whitelist to test against
     * @return true if no tags or attributes were removed; false otherwise
     * @see #clean(String, org.jsoup.safety.Whitelist)
     */
    public static boolean isValid(String bodyHtml, Whitelist whitelist) {
        final Cleaner validator = new Cleaner(whitelist);
        return validator.isValidBodyHtml(bodyHtml);
    }

    /**
     * Shared step of the {@code clean} overloads: parse the untrusted input
     * as a body fragment, then run it through a Cleaner built from the
     * whitelist.
     *
     * @param bodyHtml input untrusted HTML (body fragment)
     * @param baseUri URL to resolve relative URLs against
     * @param whitelist white-list of permitted HTML elements
     * @return the document returned by the Cleaner
     */
    private static Document cleanToDocument(String bodyHtml, String baseUri, Whitelist whitelist) {
        final Document untrusted = parseBodyFragment(bodyHtml, baseUri);
        return new Cleaner(whitelist).clean(untrusted);
    }
}