@@ -104,26 +104,27 @@ public class ParsedIRI implements Cloneable, Serializable {
104104
105105 private static int EOF = '\n' ;
106106
107- private static String [] iprivate = { "\uE000 -\uF8FF " , "\uF000 0-\uFFFF D" , "\u1000 00-\u10FF FD" };
108-
109- private static String [] ucschar = {
110- "\u00A0 -\uD7FF " ,
111- "\uF900 -\uFDCF " ,
112- "\uFDF0 -\uFFEF " ,
113- "\u1000 0-\u1FFF D" ,
114- "\u2000 0-\u2FFF D" ,
115- "\u3000 0-\u3FFF D" ,
116- "\u4000 0-\u4FFF D" ,
117- "\u5000 0-\u5FFF D" ,
118- "\u6000 0-\u6FFF D" ,
119- "\u7000 0-\u7FFF D" ,
120- "\u8000 0-\u8FFF D" ,
121- "\u9000 0-\u9FFF D" ,
122- "\uA000 0-\uAFFF D" ,
123- "\uB000 0-\uBFFF D" ,
124- "\uC000 0-\uCFFF D" ,
125- "\uD000 0-\uDFFF D" ,
126- "\uE100 0-\uEFFF D" };
107+ private static String [] iprivate = unicodeToString (
108+ new String [] { "U+E000-F8FF" , "U+F0000-FFFFD" , "U+100000-10FFFD" });
109+
110+ private static String [] ucschar = unicodeToString (new String [] {
111+ "U+00A0-D7FF" ,
112+ "U+F900-FDCF" ,
113+ "U+FDF0-FFEF" ,
114+ "U+10000-1FFFD" ,
115+ "U+20000-2FFFD" ,
116+ "U+30000-3FFFD" ,
117+ "U+40000-4FFFD" ,
118+ "U+50000-5FFFD" ,
119+ "U+60000-6FFFD" ,
120+ "U+70000-7FFFD" ,
121+ "U+80000-8FFFD" ,
122+ "U+90000-9FFFD" ,
123+ "U+A0000-AFFFD" ,
124+ "U+B0000-BFFFD" ,
125+ "U+C0000-CFFFD" ,
126+ "U+D0000-DFFFD" ,
127+ "U+E1000-EFFFD" });
127128
128129 private static String [] ALPHA = { "A-Z" , "a-z" };
129130
@@ -158,6 +159,29 @@ public class ParsedIRI implements Cloneable, Serializable {
158159
159160 private static String [] common_pct = pctEncode (common );
160161
162+ /**
163+ * Decodes U+ 32bit hex values into 16bit characters with Java surrogates
164+ */
165+ private static String [] unicodeToString (String [] encodings ) {
166+ StringBuilder sb = new StringBuilder (5 );
167+ String [] decodings = new String [encodings .length ];
168+ for (int i = 0 ; i < encodings .length ; i ++) {
169+ String encoded = encodings [i ];
170+ if (encoded .startsWith ("U+" )) {
171+ int idx = encoded .indexOf ('-' );
172+ int start = Integer .parseInt (encoded .substring (2 , idx ), 16 );
173+ int end = Integer .parseInt (encoded .substring (idx + 1 ), 16 );
174+ sb .setLength (0 );
175+ sb .appendCodePoint (start ).append ('-' ).appendCodePoint (end );
176+ decodings [i ] = sb .toString ();
177+ }
178+ else {
179+ decodings [i ] = encoded ;
180+ }
181+ }
182+ return decodings ;
183+ }
184+
161185 private static String [] union (String []... src ) {
162186 int len = 0 ;
163187 for (String [] s : src ) {
@@ -183,24 +207,11 @@ private static String[] flatten(String... arrays) {
183207 if (str .length () == 1 ) {
184208 list .add (str ); // character
185209 }
186- else if (str .length () == 2 ) {
187- assert Character .isSurrogatePair (str .charAt (0 ), str .charAt (1 ));
188- list .add (str ); // character
189- }
190210 else if (str .length () == 3 && str .charAt (1 ) == '-' ) {
191211 for (char chr = str .charAt (0 ), end = str .charAt (2 ); chr <= end ; chr ++) {
192212 list .add (Character .toString (chr )); // range
193213 }
194214 }
195- else if (str .length () == 5 && str .charAt (2 ) == '-' ) {
196- assert Character .isSurrogatePair (str .charAt (0 ), str .charAt (1 ));
197- assert Character .isSurrogatePair (str .charAt (3 ), str .charAt (4 ));
198- int start = Character .toCodePoint (str .charAt (0 ), str .charAt (1 ));
199- int end = Character .toCodePoint (str .charAt (3 ), str .charAt (4 ));
200- for (int cp = start ; cp < end ; cp ++) {
201- list .add (new StringBuilder ().appendCodePoint (cp ).toString ());
202- }
203- }
204215 else {
205216 assert false ;
206217 }
@@ -582,7 +593,8 @@ public ParsedIRI normalize() {
582593 ParsedIRI normalized = new ParsedIRI (_scheme , _userInfo , _host , _port , _path , _query , _fragment );
583594 if (this .iri .equals (normalized .iri )) {
584595 return this ;
585- } else {
596+ }
597+ else {
586598 return normalized ;
587599 }
588600 }
@@ -887,7 +899,7 @@ else if (scheme != null && ':' == peek(0)) {
887899 fragment = parsePctEncoded (fchar );
888900 }
889901 if (pos != iri .length ()) {
890- throw error ("Unexpected trailing character" );
902+ throw error ("Unexpected character" );
891903 }
892904 }
893905
@@ -1031,25 +1043,15 @@ private boolean isMember(String[] set, int chr) {
10311043 }
10321044
10331045 private boolean isMember (String range , int chr ) {
1034- if (range .length () == 1 ) {
1035- return range .equals (Character .toString (Character .toChars (chr )[0 ]));
1036- }
1037- else if (range .length () == 2 ) {
1038- return range .equals (new String (Character .toChars (chr )));
1039- }
1040- else if (range .length () == 3 && range .charAt (1 ) == '-' ) {
1046+ if (3 == range .codePointCount (0 , range .length ())) {
10411047 int start = range .codePointAt (0 );
1042- int end = range .codePointAt (2 );
1043- return start <= chr && chr <= end ;
1044- }
1045- else if (range .length () == 5 && range .charAt (2 ) == '-' ) {
1046- int start = range .codePointAt (0 );
1047- int end = range .codePointAt (3 );
1048+ assert '-' == range .charAt (range .offsetByCodePoints (0 , 1 ));
1049+ int end = range .codePointAt (range .offsetByCodePoints (0 , 2 ));
10481050 return start <= chr && chr <= end ;
10491051 }
10501052 else {
1051- assert false ;
1052- return false ;
1053+ assert 1 == range . codePointCount ( 0 , range . length ()) ;
1054+ return chr == range . codePointAt ( 0 ) ;
10531055 }
10541056 }
10551057
@@ -1085,8 +1087,8 @@ private void advance(int ahead) {
10851087 }
10861088
10871089 private URISyntaxException error (String reason ) {
1088- int end = Math . min ( pos + 10 , iri .length () );
1089- return new URISyntaxException (iri , reason + ": \" " + iri . substring ( pos , end ) + " \" " , pos );
1090+ int cp = iri .codePointAt ( pos );
1091+ return new URISyntaxException (iri , reason + " U+" + Integer . toHexString ( cp ). toUpperCase () , pos );
10901092 }
10911093
10921094 private void appendAscii (StringBuilder sb , String input ) {
0 commit comments