2222import java .text .Normalizer ;
2323import java .util .ArrayList ;
2424import java .util .Arrays ;
25+ import java .util .Comparator ;
2526import java .util .LinkedList ;
2627import java .util .List ;
2728
@@ -83,150 +84,126 @@ public class ParsedIRI implements Cloneable, Serializable {
8384
8485 private static final long serialVersionUID = -5681843777254402303L ;
8586
86- private final String iri ;
87-
88- private int pos ;
89-
90- private String scheme ;
91-
92- private String userInfo ;
93-
94- private String host ;
95-
96- private int port = -1 ;
97-
98- private String path ;
99-
100- private String query ;
87+ private static final Comparator <int []> CMP = new Comparator <int []>() {
10188
102- private String fragment ;
89+ public int compare (int [] o1 , int [] o2 ) {
90+ return o1 [0 ] - o2 [0 ];
91+ }
92+ };
10393
104- private static int EOF = '\n' ;
94+ private static int EOF = 0 ;
10595
106- private static String [] iprivate = unicodeToString (
107- new String [] { "U+E000-F8FF" , "U+F0000-FFFFD" , "U+100000-10FFFD" });
96+ private static int [][] iprivate = {
97+ new int [] { 0xE000 , 0xF8FF },
98+ new int [] { 0xF0000 , 0xFFFFD },
99+ new int [] { 0x100000 , 0x10FFFD } };
108100
109- private static String [] ucschar = unicodeToString ( new String [] {
110- "U+00A0-D7FF" ,
111- "U+F900-FDCF" ,
112- "U+FDF0-FFEF" ,
113- "U+10000-1FFFD" ,
114- "U+20000-2FFFD" ,
115- "U+30000-3FFFD" ,
116- "U+40000-4FFFD" ,
117- "U+50000-5FFFD" ,
118- "U+60000-6FFFD" ,
119- "U+70000-7FFFD" ,
120- "U+80000-8FFFD" ,
121- "U+90000-9FFFD" ,
122- "U+A0000-AFFFD" ,
123- "U+B0000-BFFFD" ,
124- "U+C0000-CFFFD" ,
125- "U+D0000-DFFFD" ,
126- "U+E1000-EFFFD" }) ;
101+ private static int [][] ucschar = {
102+ new int [] { 0x00A0 , 0xD7FF } ,
103+ new int [] { 0xF900 , 0xFDCF } ,
104+ new int [] { 0xFDF0 , 0xFFEF } ,
105+ new int [] { 0x10000 , 0x1FFFD } ,
106+ new int [] { 0x20000 , 0x2FFFD } ,
107+ new int [] { 0x30000 , 0x3FFFD } ,
108+ new int [] { 0x40000 , 0x4FFFD } ,
109+ new int [] { 0x50000 , 0x5FFFD } ,
110+ new int [] { 0x60000 , 0x6FFFD } ,
111+ new int [] { 0x70000 , 0x7FFFD } ,
112+ new int [] { 0x80000 , 0x8FFFD } ,
113+ new int [] { 0x90000 , 0x9FFFD } ,
114+ new int [] { 0xA0000 , 0xAFFFD } ,
115+ new int [] { 0xB0000 , 0xBFFFD } ,
116+ new int [] { 0xC0000 , 0xCFFFD } ,
117+ new int [] { 0xD0000 , 0xDFFFD } ,
118+ new int [] { 0xE1000 , 0xEFFFD } } ;
127119
128- private static String [] ALPHA = { "A-Z" , "a-z" };
120+ private static int [][] ALPHA = { new int [] { 'A' , 'Z' }, new int [] { 'a' , 'z' } };
129121
130- private static String [] DIGIT = { "0-9" };
122+ private static int [][] DIGIT = { new int [] { '0' , '9' } };
131123
132- private static String [] HEXDIG = flatten ( union (DIGIT , new String [] { "A-F" , "a-f" }) );
124+ private static int [][] sub_delims = union ('!' , '$' , '&' , '\'' , '(' , ')' , '*' , '+' , ',' , ';' , '=' );
133125
134- private static String [] sub_delims = { "!" , "$" , "&" , "'" , "(" , ")" , "*" , "+" , "," , ";" , "=" } ;
126+ private static int [][] gen_delims = union ( ':' , '/' , '?' , '#' , '[' , ']' , '@' ) ;
135127
136- private static String [] gen_delims = { ":" , "/" , "?" , "#" , "[" , "]" , "@" } ;
128+ private static int [][] reserved = union ( gen_delims , sub_delims ) ;
137129
138- private static String [] reserved = union (gen_delims , sub_delims );
130+ private static int [][] unreserved_rfc3986 = union (ALPHA , DIGIT , '-' , '.' , '_' , '~' );
139131
140- private static String [] unreserved_rfc3986 = union (ALPHA , DIGIT , new String [] { "-" , "." , "_" , "~" } );
132+ private static int [][] unreserved = union (unreserved_rfc3986 , ucschar );
141133
142- private static String [] unreserved = union (unreserved_rfc3986 , ucschar );
134+ private static int [][] schar = union (ALPHA , DIGIT , '+' , '-' , '.' );
143135
144- private static String [] schar = union (ALPHA , DIGIT , new String [] { "+" , "-" , "." } );
136+ private static int [][] uchar = union (unreserved , sub_delims , ':' );
145137
146- private static String [] uchar = union (unreserved , sub_delims , new String [] { ":" } );
138+ private static int [][] hchar = union (unreserved , sub_delims );
147139
148- private static String [] hchar = union (unreserved , sub_delims );
140+ private static int [][] pchar = union (unreserved , sub_delims , ':' , '@' );
149141
150- private static String [] pchar = union (unreserved , sub_delims , new String [] { ":" , "@" } );
142+ private static int [][] qchar = union (pchar , iprivate , '/' , '?' );
151143
152- private static String [] qchar = union (pchar , iprivate , new String [] { "/" , "?" } );
144+ private static int [][] fchar = union (pchar , '/' , '?' );
153145
154- private static String [] fchar = union (pchar , new String [] { "/" , "?" });
146+ private static int [] HEXDIG = flatten (
147+ union (DIGIT , new int [][] { new int [] { 'A' , 'F' }, new int [] { 'a' , 'f' } }));
155148
156- private static String [] ascii = flatten (union (unreserved_rfc3986 , reserved , new String [] { "%" } ));
149+ private static int [] ascii = flatten (union (unreserved_rfc3986 , reserved , '%' ));
157150
158- private static String [] common = flatten (union ( unreserved_rfc3986 , reserved ,
159- new String [] { "%" , "<" , ">" , " \" " , " " , "{" , "}" , "|" , " \\ " , "^" , "`" } ));
151+ private static int [] common = flatten (
152+ union ( unreserved_rfc3986 , reserved , '%' , '<' , '>' , '"' , ' ' , '{' , '}' , '|' , '\\' , '^' , '`' ));
160153
161154 private static String [] common_pct = pctEncode (common );
162155
163- /**
164- * Decodes U+ 32bit hex values into 16bit characters with Java surrogates
165- */
166- private static String [] unicodeToString (String [] encodings ) {
167- StringBuilder sb = new StringBuilder (5 );
168- String [] decodings = new String [encodings .length ];
169- for (int i = 0 ; i < encodings .length ; i ++) {
170- String encoded = encodings [i ];
171- if (encoded .startsWith ("U+" )) {
172- int idx = encoded .indexOf ('-' );
173- int start = Integer .parseInt (encoded .substring (2 , idx ), 16 );
174- int end = Integer .parseInt (encoded .substring (idx + 1 ), 16 );
175- sb .setLength (0 );
176- sb .appendCodePoint (start ).append ('-' ).appendCodePoint (end );
177- decodings [i ] = sb .toString ();
156+ private static int [][] union (Object ... sets ) {
157+ List <int []> list = new ArrayList <>();
158+ for (Object set : sets ) {
159+ if (set instanceof int [][]) {
160+ int [][] ar = (int [][])set ;
161+ for (int i = 0 ; i < ar .length ; i ++) {
162+ list .add (ar [i ]);
163+ }
164+ }
165+ else if (set instanceof Character ) {
166+ char chr = (Character )set ;
167+ list .add (new int [] { chr , chr });
178168 }
179169 else {
180- decodings [ i ] = encoded ;
170+ assert false ;
181171 }
182172 }
183- return decodings ;
184- }
185-
186- private static String [] union (String []... src ) {
187- int len = 0 ;
188- for (String [] s : src ) {
189- len += s .length ;
190- }
191- if (len == 0 ) {
192- return new String [0 ];
193- }
194- String [] dest = Arrays .copyOf (src [0 ], len );
195- int destPos = src [0 ].length ;
196- for (int i = 1 ; i < src .length ; i ++) {
197- System .arraycopy (src [i ], 0 , dest , destPos , src [i ].length );
198- destPos += src [i ].length ;
199- }
200- Arrays .sort (dest );
173+ int [][] dest = list .toArray (new int [][] {});
174+ Arrays .sort (dest , CMP );
201175 return dest ;
202176 }
203177
204- private static String [] flatten (String ... arrays ) {
205- List <String > list = new ArrayList <>();
178+ private static int [] flatten (int [] ... arrays ) {
179+ List <Integer > list = new ArrayList <>();
206180 for (int i = 0 ; i < arrays .length ; i ++) {
207- String str = arrays [i ];
208- if (str .length () == 1 ) {
209- list .add (str ); // character
181+ int [] str = arrays [i ];
182+ if (str .length == 1 ) {
183+ list .add (str [ 0 ] ); // character
210184 }
211- else if (str .length () == 3 && str . charAt ( 1 ) == '-' ) {
212- for (char chr = str . charAt ( 0 ) , end = str . charAt ( 2 ) ; chr <= end ; chr ++) {
213- list .add (Character . toString ( chr ) ); // range
185+ else if (str .length == 2 ) {
186+ for (int chr = str [ 0 ] , end = str [ 1 ] ; chr <= end ; chr ++) {
187+ list .add (chr ); // range
214188 }
215189 }
216190 else {
217191 assert false ;
218192 }
219193 }
220- String [] chars = list .toArray (new String [list .size ()]);
194+ int [] chars = new int [list .size ()];
195+ for (int i = 0 ; i < chars .length ; i ++) {
196+ chars [i ] = list .get (i );
197+ }
221198 Arrays .sort (chars );
222199 return chars ;
223200 }
224201
225- private static String [] pctEncode (String [] unencoded ) {
202+ private static String [] pctEncode (int [] unencoded ) {
226203 CharsetEncoder encoder = Charset .forName ("UTF-8" ).newEncoder ();
227204 String [] result = new String [unencoded .length ];
228205 for (int i = 0 ; i < unencoded .length ; i ++) {
229- String ns = Normalizer . normalize (unencoded [i ], Normalizer . Form . NFC );
206+ String ns = new String ( Character . toChars (unencoded [i ]) );
230207 ByteBuffer bb = null ;
231208 try {
232209 bb = encoder .encode (CharBuffer .wrap (ns ));
@@ -238,8 +215,8 @@ private static String[] pctEncode(String[] unencoded) {
238215 while (bb .hasRemaining ()) {
239216 byte b = (byte )(bb .get () & 0xff );
240217 sb .append ('%' );
241- sb .append (HEXDIG [(b >> 4 ) & 0x0f ]);
242- sb .append (HEXDIG [(b >> 0 ) & 0x0f ]);
218+ sb .appendCodePoint (HEXDIG [(b >> 4 ) & 0x0f ]);
219+ sb .appendCodePoint (HEXDIG [(b >> 0 ) & 0x0f ]);
243220 }
244221 result [i ] = sb .toString ();
245222 }
@@ -301,6 +278,24 @@ public static ParsedIRI create(String str) {
301278 }
302279 }
303280
281+ private final String iri ;
282+
283+ private int pos ;
284+
285+ private String scheme ;
286+
287+ private String userInfo ;
288+
289+ private String host ;
290+
291+ private int port = -1 ;
292+
293+ private String path ;
294+
295+ private String query ;
296+
297+ private String fragment ;
298+
304299 /**
305300 * Constructs a ParsedIRI by parsing the given string.
306301 *
@@ -1003,7 +998,7 @@ private String parsePath()
1003998 return parsePctEncoded (fchar , '?' , '#' );
1004999 }
10051000
1006- private String parsePctEncoded (String [] set , int end1 , int end2 )
1001+ private String parsePctEncoded (int [] [] set , int end1 , int end2 )
10071002 throws URISyntaxException
10081003 {
10091004 int start = pos ;
@@ -1016,7 +1011,7 @@ else if (('a' <= chr && chr <= 'z') || ('A' <= chr && chr <= 'Z') || ('0' <= chr
10161011 advance (1 );
10171012 }
10181013 else if ('%' == chr ) {
1019- if (isMember (HEXDIG , peek (1 )) && isMember (HEXDIG , peek (2 ))) {
1014+ if (Arrays . binarySearch (HEXDIG , peek (1 )) >= 0 && Arrays . binarySearch (HEXDIG , peek (2 )) >= 0 ) {
10201015 advance (3 );
10211016 }
10221017 else {
@@ -1033,26 +1028,7 @@ else if (isMember(set, chr)) {
10331028 return iri .substring (start , pos );
10341029 }
10351030
1036- private boolean isMember (String [] set , int chr ) {
1037- int idx = Arrays .binarySearch (set , new String (Character .toChars (chr )));
1038- int i = idx < 0 ? Math .max (Math .min (-1 - idx , set .length - 1 ), 0 ) : idx ;
1039- return isMember (set [i ], chr ) || i > 0 && isMember (set [i - 1 ], chr );
1040- }
1041-
1042- private boolean isMember (String range , int chr ) {
1043- if (3 == range .codePointCount (0 , range .length ())) {
1044- int start = range .codePointAt (0 );
1045- assert '-' == range .charAt (range .offsetByCodePoints (0 , 1 ));
1046- int end = range .codePointAt (range .offsetByCodePoints (0 , 2 ));
1047- return start <= chr && chr <= end ;
1048- }
1049- else {
1050- assert 1 == range .codePointCount (0 , range .length ());
1051- return chr == range .codePointAt (0 );
1052- }
1053- }
1054-
1055- private String parseMember (String [] set , int end )
1031+ private String parseMember (int [][] set , int end )
10561032 throws URISyntaxException
10571033 {
10581034 int start = pos ;
@@ -1071,6 +1047,21 @@ else if (isMember(set, chr)) {
10711047 return iri .substring (start , pos );
10721048 }
10731049
1050+ private boolean isMember (int [][] set , int chr ) {
1051+ int idx = Arrays .binarySearch (set , new int [] { chr }, CMP );
1052+ if (idx >= 0 ) {
1053+ return true ; // lower range matched exactly
1054+ }
1055+ else if (idx == -1 ) {
1056+ return false ; // insertion point is 0, below lowest range
1057+ }
1058+ else {
1059+ int i = -idx - 2 ; // range just before insertion point
1060+ assert set [i ][0 ] <= chr && set [i ].length == 2 ;
1061+ return chr <= set [i ][1 ];
1062+ }
1063+ }
1064+
10741065 private int peek () {
10751066 if (pos < iri .length ()) {
10761067 return iri .codePointAt (pos );
@@ -1106,7 +1097,7 @@ private URISyntaxException error(String reason) {
11061097 private void appendAscii (StringBuilder sb , String input ) {
11071098 for (int c = 0 , n = input .codePointCount (0 , input .length ()); c < n ; c ++) {
11081099 int chr = input .codePointAt (input .offsetByCodePoints (0 , c ));
1109- if (isMember (ascii , chr )) {
1100+ if (Arrays . binarySearch (ascii , chr ) >= 0 ) {
11101101 sb .appendCodePoint (chr );
11111102 }
11121103 else {
@@ -1217,8 +1208,8 @@ private String[] listPctEncodings(String path) {
12171208
12181209 private String normalizePctEncoding (String encoded ) {
12191210 int cidx = Arrays .binarySearch (common_pct , encoded );
1220- if (cidx >= 0 && isMember (unreserved , common [cidx ]. codePointAt ( 0 ) )) {
1221- return common [cidx ]; // quickly decode unreserved encodings
1211+ if (cidx >= 0 && isMember (unreserved , common [cidx ])) {
1212+ return new String ( Character . toChars ( common [cidx ])) ; // quickly decode unreserved encodings
12221213 }
12231214 else if (cidx >= 0 ) {
12241215 return encoded ; // pass through reserved encodings
0 commit comments