Skip to content

Commit d0be10b

Browse files
author
James Leigh
committed
Issue #69: Encode unicode code points in hex string (for easy spec reference) and convert them into code point Strings on initialization
Signed-off-by: James Leigh <james.leigh@ontotext.com>
1 parent 2eed40a commit d0be10b

2 files changed

Lines changed: 62 additions & 53 deletions

File tree

core/util/src/main/java/org/eclipse/rdf4j/common/net/ParsedIRI.java

Lines changed: 54 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -104,26 +104,27 @@ public class ParsedIRI implements Cloneable, Serializable {
104104

105105
private static int EOF = '\n';
106106

107-
private static String[] iprivate = { "\uE000-\uF8FF", "\uF0000-\uFFFFD", "\u100000-\u10FFFD" };
108-
109-
private static String[] ucschar = {
110-
"\u00A0-\uD7FF",
111-
"\uF900-\uFDCF",
112-
"\uFDF0-\uFFEF",
113-
"\u10000-\u1FFFD",
114-
"\u20000-\u2FFFD",
115-
"\u30000-\u3FFFD",
116-
"\u40000-\u4FFFD",
117-
"\u50000-\u5FFFD",
118-
"\u60000-\u6FFFD",
119-
"\u70000-\u7FFFD",
120-
"\u80000-\u8FFFD",
121-
"\u90000-\u9FFFD",
122-
"\uA0000-\uAFFFD",
123-
"\uB0000-\uBFFFD",
124-
"\uC0000-\uCFFFD",
125-
"\uD0000-\uDFFFD",
126-
"\uE1000-\uEFFFD" };
107+
private static String[] iprivate = unicodeToString(
108+
new String[] { "U+E000-F8FF", "U+F0000-FFFFD", "U+100000-10FFFD" });
109+
110+
private static String[] ucschar = unicodeToString(new String[] {
111+
"U+00A0-D7FF",
112+
"U+F900-FDCF",
113+
"U+FDF0-FFEF",
114+
"U+10000-1FFFD",
115+
"U+20000-2FFFD",
116+
"U+30000-3FFFD",
117+
"U+40000-4FFFD",
118+
"U+50000-5FFFD",
119+
"U+60000-6FFFD",
120+
"U+70000-7FFFD",
121+
"U+80000-8FFFD",
122+
"U+90000-9FFFD",
123+
"U+A0000-AFFFD",
124+
"U+B0000-BFFFD",
125+
"U+C0000-CFFFD",
126+
"U+D0000-DFFFD",
127+
"U+E1000-EFFFD" });
127128

128129
private static String[] ALPHA = { "A-Z", "a-z" };
129130

@@ -158,6 +159,29 @@ public class ParsedIRI implements Cloneable, Serializable {
158159

159160
private static String[] common_pct = pctEncode(common);
160161

162+
/**
163+
* Decodes U+ 32bit hex values into 16bit characters with Java surrogates
164+
*/
165+
private static String[] unicodeToString(String[] encodings) {
166+
StringBuilder sb = new StringBuilder(5);
167+
String[] decodings = new String[encodings.length];
168+
for (int i = 0; i < encodings.length; i++) {
169+
String encoded = encodings[i];
170+
if (encoded.startsWith("U+")) {
171+
int idx = encoded.indexOf('-');
172+
int start = Integer.parseInt(encoded.substring(2, idx), 16);
173+
int end = Integer.parseInt(encoded.substring(idx + 1), 16);
174+
sb.setLength(0);
175+
sb.appendCodePoint(start).append('-').appendCodePoint(end);
176+
decodings[i] = sb.toString();
177+
}
178+
else {
179+
decodings[i] = encoded;
180+
}
181+
}
182+
return decodings;
183+
}
184+
161185
private static String[] union(String[]... src) {
162186
int len = 0;
163187
for (String[] s : src) {
@@ -183,24 +207,11 @@ private static String[] flatten(String... arrays) {
183207
if (str.length() == 1) {
184208
list.add(str); // character
185209
}
186-
else if (str.length() == 2) {
187-
assert Character.isSurrogatePair(str.charAt(0), str.charAt(1));
188-
list.add(str); // character
189-
}
190210
else if (str.length() == 3 && str.charAt(1) == '-') {
191211
for (char chr = str.charAt(0), end = str.charAt(2); chr <= end; chr++) {
192212
list.add(Character.toString(chr)); // range
193213
}
194214
}
195-
else if (str.length() == 5 && str.charAt(2) == '-') {
196-
assert Character.isSurrogatePair(str.charAt(0), str.charAt(1));
197-
assert Character.isSurrogatePair(str.charAt(3), str.charAt(4));
198-
int start = Character.toCodePoint(str.charAt(0), str.charAt(1));
199-
int end = Character.toCodePoint(str.charAt(3), str.charAt(4));
200-
for (int cp = start; cp < end; cp++) {
201-
list.add(new StringBuilder().appendCodePoint(cp).toString());
202-
}
203-
}
204215
else {
205216
assert false;
206217
}
@@ -582,7 +593,8 @@ public ParsedIRI normalize() {
582593
ParsedIRI normalized = new ParsedIRI(_scheme, _userInfo, _host, _port, _path, _query, _fragment);
583594
if (this.iri.equals(normalized.iri)) {
584595
return this;
585-
} else {
596+
}
597+
else {
586598
return normalized;
587599
}
588600
}
@@ -887,7 +899,7 @@ else if (scheme != null && ':' == peek(0)) {
887899
fragment = parsePctEncoded(fchar);
888900
}
889901
if (pos != iri.length()) {
890-
throw error("Unexpected trailing character");
902+
throw error("Unexpected character");
891903
}
892904
}
893905

@@ -1031,25 +1043,15 @@ private boolean isMember(String[] set, int chr) {
10311043
}
10321044

10331045
private boolean isMember(String range, int chr) {
1034-
if (range.length() == 1) {
1035-
return range.equals(Character.toString(Character.toChars(chr)[0]));
1036-
}
1037-
else if (range.length() == 2) {
1038-
return range.equals(new String(Character.toChars(chr)));
1039-
}
1040-
else if (range.length() == 3 && range.charAt(1) == '-') {
1046+
if (3 == range.codePointCount(0, range.length())) {
10411047
int start = range.codePointAt(0);
1042-
int end = range.codePointAt(2);
1043-
return start <= chr && chr <= end;
1044-
}
1045-
else if (range.length() == 5 && range.charAt(2) == '-') {
1046-
int start = range.codePointAt(0);
1047-
int end = range.codePointAt(3);
1048+
assert '-' == range.charAt(range.offsetByCodePoints(0, 1));
1049+
int end = range.codePointAt(range.offsetByCodePoints(0, 2));
10481050
return start <= chr && chr <= end;
10491051
}
10501052
else {
1051-
assert false;
1052-
return false;
1053+
assert 1 == range.codePointCount(0, range.length());
1054+
return chr == range.codePointAt(0);
10531055
}
10541056
}
10551057

@@ -1085,8 +1087,8 @@ private void advance(int ahead) {
10851087
}
10861088

10871089
private URISyntaxException error(String reason) {
1088-
int end = Math.min(pos + 10, iri.length());
1089-
return new URISyntaxException(iri, reason + ": \"" + iri.substring(pos, end) + "\"", pos);
1090+
int cp = iri.codePointAt(pos);
1091+
return new URISyntaxException(iri, reason + " U+" + Integer.toHexString(cp).toUpperCase(), pos);
10901092
}
10911093

10921094
private void appendAscii(StringBuilder sb, String input) {

core/util/src/test/java/org/eclipse/rdf4j/common/net/ParsedIRITest.java

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -683,6 +683,13 @@ public void testDurst()
683683
assertURI2IRI("http://www.example.org/D%FCrst", "http://www.example.org/D%FCrst");
684684
}
685685

686+
@Test
687+
public void testDeseret()
688+
throws URISyntaxException
689+
{
690+
assertURI2IRI("http://www.example.org/U+10400/%F0%90%90%80", "http://www.example.org/U+10400/\uD801\uDC00");
691+
}
692+
686693
@Test
687694
public void testPunycodeEncoding()
688695
throws URISyntaxException
@@ -696,8 +703,8 @@ public void testPunycodeEncoding()
696703
private void assertURI2IRI(String uri, String iri)
697704
throws URISyntaxException
698705
{
699-
assertEquals(iri, new ParsedIRI(uri).normalize().toString());
700706
assertEquals(uri, new ParsedIRI(iri).toASCIIString());
707+
assertEquals(iri, new ParsedIRI(uri).normalize().toString());
701708
}
702709

703710
@Test

0 commit comments

Comments
 (0)