Skip to content

Commit cb3ff65

Browse files
author
James Leigh
authored
Merge pull request #847 from jamesrdf/issues/#846-parsediri-perf
Fix #846: Use int[] instead of String for IRI parsing
2 parents 32bea03 + d3ea1c0 commit cb3ff65

1 file changed

Lines changed: 117 additions & 126 deletions

File tree

core/util/src/main/java/org/eclipse/rdf4j/common/net/ParsedIRI.java

Lines changed: 117 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import java.text.Normalizer;
2323
import java.util.ArrayList;
2424
import java.util.Arrays;
25+
import java.util.Comparator;
2526
import java.util.LinkedList;
2627
import java.util.List;
2728

@@ -83,150 +84,126 @@ public class ParsedIRI implements Cloneable, Serializable {
8384

8485
private static final long serialVersionUID = -5681843777254402303L;
8586

86-
private final String iri;
87-
88-
private int pos;
89-
90-
private String scheme;
91-
92-
private String userInfo;
93-
94-
private String host;
95-
96-
private int port = -1;
97-
98-
private String path;
99-
100-
private String query;
87+
private static final Comparator<int[]> CMP = new Comparator<int[]>() {
10188

102-
private String fragment;
89+
public int compare(int[] o1, int[] o2) {
90+
return o1[0] - o2[0];
91+
}
92+
};
10393

104-
private static int EOF = '\n';
94+
private static int EOF = 0;
10595

106-
private static String[] iprivate = unicodeToString(
107-
new String[] { "U+E000-F8FF", "U+F0000-FFFFD", "U+100000-10FFFD" });
96+
private static int[][] iprivate = {
97+
new int[] { 0xE000, 0xF8FF },
98+
new int[] { 0xF0000, 0xFFFFD },
99+
new int[] { 0x100000, 0x10FFFD } };
108100

109-
private static String[] ucschar = unicodeToString(new String[] {
110-
"U+00A0-D7FF",
111-
"U+F900-FDCF",
112-
"U+FDF0-FFEF",
113-
"U+10000-1FFFD",
114-
"U+20000-2FFFD",
115-
"U+30000-3FFFD",
116-
"U+40000-4FFFD",
117-
"U+50000-5FFFD",
118-
"U+60000-6FFFD",
119-
"U+70000-7FFFD",
120-
"U+80000-8FFFD",
121-
"U+90000-9FFFD",
122-
"U+A0000-AFFFD",
123-
"U+B0000-BFFFD",
124-
"U+C0000-CFFFD",
125-
"U+D0000-DFFFD",
126-
"U+E1000-EFFFD" });
101+
private static int[][] ucschar = {
102+
new int[] { 0x00A0, 0xD7FF },
103+
new int[] { 0xF900, 0xFDCF },
104+
new int[] { 0xFDF0, 0xFFEF },
105+
new int[] { 0x10000, 0x1FFFD },
106+
new int[] { 0x20000, 0x2FFFD },
107+
new int[] { 0x30000, 0x3FFFD },
108+
new int[] { 0x40000, 0x4FFFD },
109+
new int[] { 0x50000, 0x5FFFD },
110+
new int[] { 0x60000, 0x6FFFD },
111+
new int[] { 0x70000, 0x7FFFD },
112+
new int[] { 0x80000, 0x8FFFD },
113+
new int[] { 0x90000, 0x9FFFD },
114+
new int[] { 0xA0000, 0xAFFFD },
115+
new int[] { 0xB0000, 0xBFFFD },
116+
new int[] { 0xC0000, 0xCFFFD },
117+
new int[] { 0xD0000, 0xDFFFD },
118+
new int[] { 0xE1000, 0xEFFFD } };
127119

128-
private static String[] ALPHA = { "A-Z", "a-z" };
120+
private static int[][] ALPHA = { new int[] { 'A', 'Z' }, new int[] { 'a', 'z' } };
129121

130-
private static String[] DIGIT = { "0-9" };
122+
private static int[][] DIGIT = { new int[] { '0', '9' } };
131123

132-
private static String[] HEXDIG = flatten(union(DIGIT, new String[] { "A-F", "a-f" }));
124+
private static int[][] sub_delims = union('!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=');
133125

134-
private static String[] sub_delims = { "!", "$", "&", "'", "(", ")", "*", "+", ",", ";", "=" };
126+
private static int[][] gen_delims = union(':', '/', '?', '#', '[', ']', '@');
135127

136-
private static String[] gen_delims = { ":", "/", "?", "#", "[", "]", "@" };
128+
private static int[][] reserved = union(gen_delims, sub_delims);
137129

138-
private static String[] reserved = union(gen_delims, sub_delims);
130+
private static int[][] unreserved_rfc3986 = union(ALPHA, DIGIT, '-', '.', '_', '~');
139131

140-
private static String[] unreserved_rfc3986 = union(ALPHA, DIGIT, new String[] { "-", ".", "_", "~" });
132+
private static int[][] unreserved = union(unreserved_rfc3986, ucschar);
141133

142-
private static String[] unreserved = union(unreserved_rfc3986, ucschar);
134+
private static int[][] schar = union(ALPHA, DIGIT, '+', '-', '.');
143135

144-
private static String[] schar = union(ALPHA, DIGIT, new String[] { "+", "-", "." });
136+
private static int[][] uchar = union(unreserved, sub_delims, ':');
145137

146-
private static String[] uchar = union(unreserved, sub_delims, new String[] { ":" });
138+
private static int[][] hchar = union(unreserved, sub_delims);
147139

148-
private static String[] hchar = union(unreserved, sub_delims);
140+
private static int[][] pchar = union(unreserved, sub_delims, ':', '@');
149141

150-
private static String[] pchar = union(unreserved, sub_delims, new String[] { ":", "@" });
142+
private static int[][] qchar = union(pchar, iprivate, '/', '?');
151143

152-
private static String[] qchar = union(pchar, iprivate, new String[] { "/", "?" });
144+
private static int[][] fchar = union(pchar, '/', '?');
153145

154-
private static String[] fchar = union(pchar, new String[] { "/", "?" });
146+
private static int[] HEXDIG = flatten(
147+
union(DIGIT, new int[][] { new int[] { 'A', 'F' }, new int[] { 'a', 'f' } }));
155148

156-
private static String[] ascii = flatten(union(unreserved_rfc3986, reserved, new String[] { "%" }));
149+
private static int[] ascii = flatten(union(unreserved_rfc3986, reserved, '%'));
157150

158-
private static String[] common = flatten(union(unreserved_rfc3986, reserved,
159-
new String[] { "%", "<", ">", "\"", " ", "{", "}", "|", "\\", "^", "`" }));
151+
private static int[] common = flatten(
152+
union(unreserved_rfc3986, reserved, '%', '<', '>', '"', ' ', '{', '}', '|', '\\', '^', '`'));
160153

161154
private static String[] common_pct = pctEncode(common);
162155

163-
/**
164-
* Decodes U+ 32bit hex values into 16bit characters with Java surrogates
165-
*/
166-
private static String[] unicodeToString(String[] encodings) {
167-
StringBuilder sb = new StringBuilder(5);
168-
String[] decodings = new String[encodings.length];
169-
for (int i = 0; i < encodings.length; i++) {
170-
String encoded = encodings[i];
171-
if (encoded.startsWith("U+")) {
172-
int idx = encoded.indexOf('-');
173-
int start = Integer.parseInt(encoded.substring(2, idx), 16);
174-
int end = Integer.parseInt(encoded.substring(idx + 1), 16);
175-
sb.setLength(0);
176-
sb.appendCodePoint(start).append('-').appendCodePoint(end);
177-
decodings[i] = sb.toString();
156+
private static int[][] union(Object... sets) {
157+
List<int[]> list = new ArrayList<>();
158+
for (Object set : sets) {
159+
if (set instanceof int[][]) {
160+
int[][] ar = (int[][])set;
161+
for (int i = 0; i < ar.length; i++) {
162+
list.add(ar[i]);
163+
}
164+
}
165+
else if (set instanceof Character) {
166+
char chr = (Character)set;
167+
list.add(new int[] { chr, chr });
178168
}
179169
else {
180-
decodings[i] = encoded;
170+
assert false;
181171
}
182172
}
183-
return decodings;
184-
}
185-
186-
private static String[] union(String[]... src) {
187-
int len = 0;
188-
for (String[] s : src) {
189-
len += s.length;
190-
}
191-
if (len == 0) {
192-
return new String[0];
193-
}
194-
String[] dest = Arrays.copyOf(src[0], len);
195-
int destPos = src[0].length;
196-
for (int i = 1; i < src.length; i++) {
197-
System.arraycopy(src[i], 0, dest, destPos, src[i].length);
198-
destPos += src[i].length;
199-
}
200-
Arrays.sort(dest);
173+
int[][] dest = list.toArray(new int[][] {});
174+
Arrays.sort(dest, CMP);
201175
return dest;
202176
}
203177

204-
private static String[] flatten(String... arrays) {
205-
List<String> list = new ArrayList<>();
178+
private static int[] flatten(int[]... arrays) {
179+
List<Integer> list = new ArrayList<>();
206180
for (int i = 0; i < arrays.length; i++) {
207-
String str = arrays[i];
208-
if (str.length() == 1) {
209-
list.add(str); // character
181+
int[] str = arrays[i];
182+
if (str.length == 1) {
183+
list.add(str[0]); // character
210184
}
211-
else if (str.length() == 3 && str.charAt(1) == '-') {
212-
for (char chr = str.charAt(0), end = str.charAt(2); chr <= end; chr++) {
213-
list.add(Character.toString(chr)); // range
185+
else if (str.length == 2) {
186+
for (int chr = str[0], end = str[1]; chr <= end; chr++) {
187+
list.add(chr); // range
214188
}
215189
}
216190
else {
217191
assert false;
218192
}
219193
}
220-
String[] chars = list.toArray(new String[list.size()]);
194+
int[] chars = new int[list.size()];
195+
for (int i = 0; i < chars.length; i++) {
196+
chars[i] = list.get(i);
197+
}
221198
Arrays.sort(chars);
222199
return chars;
223200
}
224201

225-
private static String[] pctEncode(String[] unencoded) {
202+
private static String[] pctEncode(int[] unencoded) {
226203
CharsetEncoder encoder = Charset.forName("UTF-8").newEncoder();
227204
String[] result = new String[unencoded.length];
228205
for (int i = 0; i < unencoded.length; i++) {
229-
String ns = Normalizer.normalize(unencoded[i], Normalizer.Form.NFC);
206+
String ns = new String(Character.toChars(unencoded[i]));
230207
ByteBuffer bb = null;
231208
try {
232209
bb = encoder.encode(CharBuffer.wrap(ns));
@@ -238,8 +215,8 @@ private static String[] pctEncode(String[] unencoded) {
238215
while (bb.hasRemaining()) {
239216
byte b = (byte)(bb.get() & 0xff);
240217
sb.append('%');
241-
sb.append(HEXDIG[(b >> 4) & 0x0f]);
242-
sb.append(HEXDIG[(b >> 0) & 0x0f]);
218+
sb.appendCodePoint(HEXDIG[(b >> 4) & 0x0f]);
219+
sb.appendCodePoint(HEXDIG[(b >> 0) & 0x0f]);
243220
}
244221
result[i] = sb.toString();
245222
}
@@ -301,6 +278,24 @@ public static ParsedIRI create(String str) {
301278
}
302279
}
303280

281+
private final String iri;
282+
283+
private int pos;
284+
285+
private String scheme;
286+
287+
private String userInfo;
288+
289+
private String host;
290+
291+
private int port = -1;
292+
293+
private String path;
294+
295+
private String query;
296+
297+
private String fragment;
298+
304299
/**
305300
* Constructs a ParsedIRI by parsing the given string.
306301
*
@@ -1003,7 +998,7 @@ private String parsePath()
1003998
return parsePctEncoded(fchar, '?', '#');
1004999
}
10051000

1006-
private String parsePctEncoded(String[] set, int end1, int end2)
1001+
private String parsePctEncoded(int[][] set, int end1, int end2)
10071002
throws URISyntaxException
10081003
{
10091004
int start = pos;
@@ -1016,7 +1011,7 @@ else if (('a' <= chr && chr <= 'z') || ('A' <= chr && chr <= 'Z') || ('0' <= chr
10161011
advance(1);
10171012
}
10181013
else if ('%' == chr) {
1019-
if (isMember(HEXDIG, peek(1)) && isMember(HEXDIG, peek(2))) {
1014+
if (Arrays.binarySearch(HEXDIG, peek(1)) >= 0 && Arrays.binarySearch(HEXDIG, peek(2)) >= 0) {
10201015
advance(3);
10211016
}
10221017
else {
@@ -1033,26 +1028,7 @@ else if (isMember(set, chr)) {
10331028
return iri.substring(start, pos);
10341029
}
10351030

1036-
private boolean isMember(String[] set, int chr) {
1037-
int idx = Arrays.binarySearch(set, new String(Character.toChars(chr)));
1038-
int i = idx < 0 ? Math.max(Math.min(-1 - idx, set.length - 1), 0) : idx;
1039-
return isMember(set[i], chr) || i > 0 && isMember(set[i - 1], chr);
1040-
}
1041-
1042-
private boolean isMember(String range, int chr) {
1043-
if (3 == range.codePointCount(0, range.length())) {
1044-
int start = range.codePointAt(0);
1045-
assert '-' == range.charAt(range.offsetByCodePoints(0, 1));
1046-
int end = range.codePointAt(range.offsetByCodePoints(0, 2));
1047-
return start <= chr && chr <= end;
1048-
}
1049-
else {
1050-
assert 1 == range.codePointCount(0, range.length());
1051-
return chr == range.codePointAt(0);
1052-
}
1053-
}
1054-
1055-
private String parseMember(String[] set, int end)
1031+
private String parseMember(int[][] set, int end)
10561032
throws URISyntaxException
10571033
{
10581034
int start = pos;
@@ -1071,6 +1047,21 @@ else if (isMember(set, chr)) {
10711047
return iri.substring(start, pos);
10721048
}
10731049

1050+
private boolean isMember(int[][] set, int chr) {
1051+
int idx = Arrays.binarySearch(set, new int[] { chr }, CMP);
1052+
if (idx >= 0) {
1053+
return true; // lower range matched exactly
1054+
}
1055+
else if (idx == -1) {
1056+
return false; // insertion point is 0, below lowest range
1057+
}
1058+
else {
1059+
int i = -idx - 2; // range just before insertion point
1060+
assert set[i][0] <= chr && set[i].length == 2;
1061+
return chr <= set[i][1];
1062+
}
1063+
}
1064+
10741065
private int peek() {
10751066
if (pos < iri.length()) {
10761067
return iri.codePointAt(pos);
@@ -1106,7 +1097,7 @@ private URISyntaxException error(String reason) {
11061097
private void appendAscii(StringBuilder sb, String input) {
11071098
for (int c = 0, n = input.codePointCount(0, input.length()); c < n; c++) {
11081099
int chr = input.codePointAt(input.offsetByCodePoints(0, c));
1109-
if (isMember(ascii, chr)) {
1100+
if (Arrays.binarySearch(ascii, chr) >= 0) {
11101101
sb.appendCodePoint(chr);
11111102
}
11121103
else {
@@ -1217,8 +1208,8 @@ private String[] listPctEncodings(String path) {
12171208

12181209
private String normalizePctEncoding(String encoded) {
12191210
int cidx = Arrays.binarySearch(common_pct, encoded);
1220-
if (cidx >= 0 && isMember(unreserved, common[cidx].codePointAt(0))) {
1221-
return common[cidx]; // quickly decode unreserved encodings
1211+
if (cidx >= 0 && isMember(unreserved, common[cidx])) {
1212+
return new String(Character.toChars(common[cidx])); // quickly decode unreserved encodings
12221213
}
12231214
else if (cidx >= 0) {
12241215
return encoded; // pass through reserved encodings

0 commit comments

Comments
 (0)