2727
2828package org .apache .hc .core5 .net .uri .internal .uris ;
2929
30+ import static org .apache .hc .core5 .util .TextUtils .isHex ;
31+
3032import org .apache .hc .core5 .annotation .Contract ;
3133import org .apache .hc .core5 .annotation .Internal ;
3234import org .apache .hc .core5 .annotation .ThreadingBehavior ;
3941/**
4042 * Parser for {@link Rfc3986Uri}.
4143 *
42- * @since 5.4
44+ * @since 5.5
4345 */
4446@ Internal
4547@ Contract (threading = ThreadingBehavior .STATELESS )
@@ -80,29 +82,40 @@ public static Rfc3986Uri parse(final String s) {
8082 } else {
8183 hostStart = authStart ;
8284 }
83- if (hostStart >= authEnd ) {
84- throw new IllegalArgumentException ("Empty host in authority" );
85- }
8685
87- if (buf [hostStart ] == '[' ) {
88- final int rb = indexOf (buf , ']' , hostStart + 1 , authEnd );
89- if (rb < 0 ) {
90- throw new IllegalArgumentException ("Unclosed IPv6 literal" );
91- }
92- host = s .substring (hostStart , rb + 1 ); // keep literal verbatim
93- if (rb + 1 < authEnd && buf [rb + 1 ] == ':' ) {
94- port = Ports .parsePort (buf , rb + 2 , authEnd );
95- }
86+ // RFC 3986 allows empty host in authority (reg-name can be empty), e.g. file:///path
87+ if (hostStart >= authEnd ) {
88+ host = "" ;
89+ cur .updatePos (authEnd );
9690 } else {
97- final int colon = lastIndexOf (buf , ':' , hostStart , authEnd );
98- if (colon >= 0 ) {
99- host = Ascii .lowerAscii (s .substring (hostStart , colon ));
100- port = Ports .parsePort (buf , colon + 1 , authEnd );
91+ if (buf [hostStart ] == '[' ) {
92+ final int rb = indexOf (buf , ']' , hostStart + 1 , authEnd );
93+ if (rb < 0 ) {
94+ throw new IllegalArgumentException ("Unclosed IPv6 literal" );
95+ }
96+ host = Ascii .lowerAscii (s .substring (hostStart , rb + 1 )); // normalize consistently
97+ if (rb + 1 < authEnd && buf [rb + 1 ] == ':' ) {
98+ if (rb + 2 == authEnd ) {
99+ port = -1 ; // empty port is syntactically allowed
100+ } else {
101+ port = Ports .parsePort (buf , rb + 2 , authEnd );
102+ }
103+ }
101104 } else {
102- host = Ascii .lowerAscii (s .substring (hostStart , authEnd ));
105+ final int colon = lastIndexOf (buf , ':' , hostStart , authEnd );
106+ if (colon >= 0 ) {
107+ host = Ascii .lowerAscii (s .substring (hostStart , colon ));
108+ if (colon + 1 == authEnd ) {
109+ port = -1 ; // empty port is syntactically allowed
110+ } else {
111+ port = Ports .parsePort (buf , colon + 1 , authEnd );
112+ }
113+ } else {
114+ host = Ascii .lowerAscii (s .substring (hostStart , authEnd ));
115+ }
103116 }
117+ cur .updatePos (authEnd );
104118 }
105- cur .updatePos (authEnd );
106119 }
107120
108121 // ---- path ----
@@ -125,9 +138,259 @@ public static Rfc3986Uri parse(final String s) {
125138 cur .updatePos (buf .length );
126139 }
127140
128- return new Rfc3986Uri (s , scheme , userInfo , host , port , path , query , fragment );
141+ final Rfc3986Uri u = new Rfc3986Uri (s , scheme , userInfo , host , port , path , query , fragment );
142+
143+ // strict-by-default validation
144+ validateScheme (u .getScheme ());
145+ validateUserInfo (u .getUserInfo ());
146+ validateHost (u .getHost ());
147+ validatePath (u .getPath ());
148+ validateQuery (u .getQuery ());
149+ validateFragment (u .getFragment ());
150+
151+ return u ;
152+ }
153+
154+ private static void validateScheme (final String scheme ) {
155+ if (scheme == null ) {
156+ return ;
157+ }
158+ for (int i = 0 ; i < scheme .length (); i ++) {
159+ final char c = scheme .charAt (i );
160+ if (c > 0x7F ) {
161+ throw new IllegalArgumentException ("Non-ASCII character in scheme" );
162+ }
163+ }
164+ }
165+
166+ private static void validateUserInfo (final String userInfo ) {
167+ if (userInfo == null ) {
168+ return ;
169+ }
170+ validatePctEncoding (userInfo , "userinfo" );
171+ for (int i = 0 ; i < userInfo .length (); i ++) {
172+ final char c = userInfo .charAt (i );
173+ if (c == '%' ) {
174+ i += 2 ;
175+ continue ;
176+ }
177+ if (c > 0x7F ) {
178+ throw new IllegalArgumentException ("Non-ASCII character in userinfo" );
179+ }
180+ if (!isUnreserved (c ) && !isSubDelim (c ) && c != ':' ) {
181+ throw new IllegalArgumentException ("Illegal character in userinfo" );
182+ }
183+ }
184+ }
185+
186+ private static void validateHost (final String host ) {
187+ if (host == null ) {
188+ return ;
189+ }
190+ if (host .isEmpty ()) {
191+ return ; // allowed (reg-name is *)
192+ }
193+ if (host .charAt (0 ) == '[' ) {
194+ if (host .charAt (host .length () - 1 ) != ']' ) {
195+ throw new IllegalArgumentException ("Unclosed IP-literal" );
196+ }
197+ final String inside = host .substring (1 , host .length () - 1 );
198+ validateIpLiteral (inside );
199+ return ;
200+ }
201+ validatePctEncoding (host , "host" );
202+ for (int i = 0 ; i < host .length (); i ++) {
203+ final char c = host .charAt (i );
204+ if (c == '%' ) {
205+ i += 2 ;
206+ continue ;
207+ }
208+ if (c > 0x7F ) {
209+ throw new IllegalArgumentException ("Non-ASCII character in host" );
210+ }
211+ if (!isUnreserved (c ) && !isSubDelim (c )) {
212+ throw new IllegalArgumentException ("Illegal character in host" );
213+ }
214+ }
215+ }
216+
217+ private static void validatePath (final String path ) {
218+ if (path == null ) {
219+ return ;
220+ }
221+ validatePctEncoding (path , "path" );
222+ for (int i = 0 ; i < path .length (); i ++) {
223+ final char c = path .charAt (i );
224+ if (c == '%' ) {
225+ i += 2 ;
226+ continue ;
227+ }
228+ if (c > 0x7F ) {
229+ throw new IllegalArgumentException ("Non-ASCII character in path" );
230+ }
231+ if (c == '/' ) {
232+ continue ;
233+ }
234+ if (!isUnreserved (c ) && !isSubDelim (c ) && c != ':' && c != '@' ) {
235+ throw new IllegalArgumentException ("Illegal character in path" );
236+ }
237+ }
238+ }
239+
240+ private static void validateQuery (final String query ) {
241+ if (query == null ) {
242+ return ;
243+ }
244+ validatePctEncoding (query , "query" );
245+ for (int i = 0 ; i < query .length (); i ++) {
246+ final char c = query .charAt (i );
247+ if (c == '%' ) {
248+ i += 2 ;
249+ continue ;
250+ }
251+ if (c > 0x7F ) {
252+ throw new IllegalArgumentException ("Non-ASCII character in query" );
253+ }
254+ if (c == '/' || c == '?' ) {
255+ continue ;
256+ }
257+ if (!isUnreserved (c ) && !isSubDelim (c ) && c != ':' && c != '@' ) {
258+ throw new IllegalArgumentException ("Illegal character in query" );
259+ }
260+ }
261+ }
262+
263+ private static void validateFragment (final String fragment ) {
264+ if (fragment == null ) {
265+ return ;
266+ }
267+ validatePctEncoding (fragment , "fragment" );
268+ for (int i = 0 ; i < fragment .length (); i ++) {
269+ final char c = fragment .charAt (i );
270+ if (c == '%' ) {
271+ i += 2 ;
272+ continue ;
273+ }
274+ if (c > 0x7F ) {
275+ throw new IllegalArgumentException ("Non-ASCII character in fragment" );
276+ }
277+ if (c == '/' || c == '?' ) {
278+ continue ;
279+ }
280+ if (!isUnreserved (c ) && !isSubDelim (c ) && c != ':' && c != '@' ) {
281+ throw new IllegalArgumentException ("Illegal character in fragment" );
282+ }
283+ }
284+ }
285+
286+ private static void validatePctEncoding (final String s , final String component ) {
287+ for (int i = 0 ; i < s .length (); i ++) {
288+ final char c = s .charAt (i );
289+ if (c == '%' ) {
290+ if (i + 2 >= s .length ()) {
291+ throw new IllegalArgumentException ("Incomplete pct-encoding in " + component );
292+ }
293+ final char h1 = s .charAt (i + 1 );
294+ final char h2 = s .charAt (i + 2 );
295+ if (!isHex (h1 ) || !isHex (h2 )) {
296+ throw new IllegalArgumentException ("Invalid pct-encoding in " + component );
297+ }
298+ i += 2 ;
299+ } else if (c <= 0x1F || c == 0x7F ) {
300+ throw new IllegalArgumentException ("Control character in " + component );
301+ }
302+ }
303+ }
304+
305+ private static void validateIpLiteral (final String inside ) {
306+ if (inside .isEmpty ()) {
307+ throw new IllegalArgumentException ("Empty IP-literal" );
308+ }
309+ final char first = inside .charAt (0 );
310+ if (first == 'v' || first == 'V' ) {
311+ validateIpvFuture (inside );
312+ } else {
313+ validateIpv6Address (inside );
314+ }
315+ }
316+
317+ private static void validateIpvFuture (final String s ) {
318+ int i = 1 ;
319+ if (i >= s .length ()) {
320+ throw new IllegalArgumentException ("Invalid IPvFuture" );
321+ }
322+ int hexdigs = 0 ;
323+ while (i < s .length () && isHex (s .charAt (i ))) {
324+ hexdigs ++;
325+ i ++;
326+ }
327+ if (hexdigs == 0 || i >= s .length () || s .charAt (i ) != '.' ) {
328+ throw new IllegalArgumentException ("Invalid IPvFuture" );
329+ }
330+ i ++;
331+ if (i >= s .length ()) {
332+ throw new IllegalArgumentException ("Invalid IPvFuture" );
333+ }
334+ int tail = 0 ;
335+ while (i < s .length ()) {
336+ final char c = s .charAt (i );
337+ if (c > 0x7F ) {
338+ throw new IllegalArgumentException ("Non-ASCII character in IPvFuture" );
339+ }
340+ if (!isUnreserved (c ) && !isSubDelim (c ) && c != ':' ) {
341+ throw new IllegalArgumentException ("Illegal character in IPvFuture" );
342+ }
343+ tail ++;
344+ i ++;
345+ }
346+ if (tail == 0 ) {
347+ throw new IllegalArgumentException ("Invalid IPvFuture" );
348+ }
349+ }
350+
351+ private static void validateIpv6Address (final String s ) {
352+ if (s .indexOf (":::" ) >= 0 ) {
353+ throw new IllegalArgumentException ("Invalid IPv6 literal" );
354+ }
355+ final int dbl = s .indexOf ("::" );
356+ if (dbl >= 0 && s .indexOf ("::" , dbl + 2 ) >= 0 ) {
357+ throw new IllegalArgumentException ("Invalid IPv6 literal" );
358+ }
359+ for (int i = 0 ; i < s .length (); i ++) {
360+ final char c = s .charAt (i );
361+ if (c > 0x7F ) {
362+ throw new IllegalArgumentException ("Non-ASCII character in IPv6 literal" );
363+ }
364+ if (!(isHex (c ) || c == ':' || c == '.' )) {
365+ throw new IllegalArgumentException ("Illegal character in IPv6 literal" );
366+ }
367+ }
368+ // This is "strict enough" for RFC 3986 IP-literal validation without pulling in InetAddress.
369+ // It rejects obvious junk and enforces the '::' compression constraint.
370+ }
371+
372+ private static boolean isUnreserved (final char c ) {
373+ return Ascii .isAlpha (c ) || Ascii .isDigit (c ) || c == '-' || c == '.' || c == '_' || c == '~' ;
129374 }
130375
376+ private static boolean isSubDelim (final char c ) {
377+ switch (c ) {
378+ case '!' :
379+ case '$' :
380+ case '&' :
381+ case '\'' :
382+ case '(' :
383+ case ')' :
384+ case '*' :
385+ case '+' :
386+ case ',' :
387+ case ';' :
388+ case '=' :
389+ return true ;
390+ default :
391+ return false ;
392+ }
393+ }
131394
132395 private static int scanScheme (final char [] a , final int from , final int toExcl ) {
133396 int finalFrom = from ;
@@ -163,12 +426,7 @@ private static int scanUntil(final char[] a, final int from, final int toExcl, f
163426 }
164427
165428 private static int indexOf (final char [] a , final char ch , final int from , final int toExcl ) {
166- for (int i = from ; i < toExcl ; i ++) {
167- if (a [i ] == ch ) {
168- return i ;
169- }
170- }
171- return -1 ;
429+ return Ascii .indexOf (a , ch , from , toExcl );
172430 }
173431
174432 private static int lastIndexOf (final char [] a , final char ch , final int from , final int toExcl ) {
0 commit comments