Skip to content

Commit 2d51b28

Browse files
author
James Leigh
committed
Issue #69: Optimize IRI parser for common characters
Signed-off-by: James Leigh <james.leigh@ontotext.com>
1 parent 8170066 commit 2d51b28

2 files changed

Lines changed: 72 additions & 48 deletions

File tree

core/util/src/main/java/org/eclipse/rdf4j/common/net/ParsedIRI.java

Lines changed: 67 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,8 @@ public class ParsedIRI implements Cloneable, Serializable {
145145

146146
private static String[] uchar = union(unreserved, sub_delims, new String[] { ":" });
147147

148+
private static String[] hchar = union(unreserved, sub_delims);
149+
148150
private static String[] pchar = union(unreserved, sub_delims, new String[] { ":", "@" });
149151

150152
private static String[] qchar = union(pchar, iprivate, new String[] { "/", "?" });
@@ -856,43 +858,47 @@ private void parse()
856858
if ("jar".equalsIgnoreCase(scheme)) {
857859
scheme = scheme + ':' + parseScheme();
858860
}
859-
if ('/' == peek(0) && '/' == peek(1)) {
861+
int peek = peek();
862+
if ('/' == peek && '/' == peek(1)) {
860863
advance(2);
861-
userInfo = parseUserInfo();
864+
if (iri.indexOf('@') >= 0) {
865+
userInfo = parseUserInfo();
866+
}
862867
host = parseHost();
863-
if (':' == peek(0)) {
868+
if (':' == peek()) {
864869
advance(1);
865-
String p = parseMember(DIGIT);
870+
String p = parseMember(DIGIT, '/');
866871
if (p.length() > 0) {
867872
port = Integer.parseInt(p);
868873
}
869874
else {
870875
port = -1;
871876
}
872877
}
873-
if ('/' == peek(0) || '?' == peek(0) || '#' == peek(0) || EOF == peek(0)) {
878+
int next = peek();
879+
if ('/' == next || '?' == next || '#' == next || EOF == next) {
874880
path = parsePath();
875881
}
876882
else {
877883
error("absolute or empty path expected");
878884
}
879885
}
880-
else if ('/' == peek(0) || '?' == peek(0) || '#' == peek(0) || EOF == peek(0)) {
886+
else if ('/' == peek || '?' == peek || '#' == peek || EOF == peek) {
881887
path = parsePath();
882888
}
883-
else if ('%' == peek(0) || ':' != peek(0) && isMember(pchar, peek(0))) {
889+
else if ('%' == peek || ':' != peek && isMember(pchar, peek)) {
884890
path = parsePath();
885891
}
886-
else if (scheme != null && ':' == peek(0)) {
892+
else if (scheme != null && ':' == peek) {
887893
path = parsePath();
888894
}
889-
if ('?' == peek(0)) {
895+
if ('?' == peek()) {
890896
advance(1);
891-
query = parsePctEncoded(qchar);
897+
query = parsePctEncoded(qchar, '#', EOF);
892898
}
893-
if ('#' == peek(0)) {
899+
if ('#' == peek()) {
894900
advance(1);
895-
fragment = parsePctEncoded(fchar);
901+
fragment = parsePctEncoded(fchar, '#', EOF);
896902
}
897903
if (pos != iri.length()) {
898904
throw error("Unexpected character");
@@ -931,10 +937,10 @@ private String buildIRI(String scheme, String userInfo, String host, int port, S
931937
private String parseScheme()
932938
throws URISyntaxException
933939
{
934-
if (isMember(ALPHA, peek(0))) {
940+
if (isMember(ALPHA, peek())) {
935941
int start = pos;
936-
String scheme = parseMember(schar);
937-
if (':' == peek(0)) {
942+
String scheme = parseMember(schar, ':');
943+
if (':' == peek()) {
938944
advance(1);
939945
return scheme;
940946
}
@@ -949,8 +955,8 @@ private String parseUserInfo()
949955
throws URISyntaxException
950956
{
951957
int start = pos;
952-
String userinfo = parsePctEncoded(uchar);
953-
if ('@' == peek(0)) {
958+
String userinfo = parsePctEncoded(uchar, '@', '/');
959+
if ('@' == peek()) {
954960
advance(1);
955961
return userinfo;
956962
}
@@ -964,24 +970,24 @@ private String parseHost()
964970
throws URISyntaxException
965971
{
966972
int start = pos;
967-
if ('[' == peek(0)) {
973+
if ('[' == peek()) {
968974
advance(1); // IP-Literal
969-
parseMember(uchar);
970-
if (']' == peek(0)) {
975+
parseMember(uchar, ']');
976+
if (']' == peek()) {
971977
advance(1);
972978
return iri.substring(start, pos);
973979
}
974980
else {
975981
throw error("Invalid host IP address");
976982
}
977983
}
978-
else if (isMember(DIGIT, peek(0))) {
984+
else if (isMember(DIGIT, peek())) {
979985
for (int i = 0; i < 4; i++) {
980-
int octet = Integer.parseInt(parseMember(DIGIT));
986+
int octet = Integer.parseInt(parseMember(DIGIT, '.'));
981987
if (octet < 0 || octet > 255) {
982988
throw error("Invalid IPv4 address");
983989
}
984-
if ('.' == peek(0)) {
990+
if ('.' == peek()) {
985991
advance(1);
986992
}
987993
else {
@@ -991,43 +997,42 @@ else if (isMember(DIGIT, peek(0))) {
991997
return iri.substring(start, pos);
992998
}
993999
else {
994-
return parsePctEncoded(union(unreserved, sub_delims));
1000+
return parsePctEncoded(hchar, ':', '/');
9951001
}
9961002
}
9971003

9981004
private String parsePath()
9991005
throws URISyntaxException
10001006
{
1001-
int start = pos;
1002-
if ('/' != peek(0)) {
1003-
parsePctEncoded(pchar);
1004-
}
1005-
while ('/' == peek(0)) {
1006-
advance(1);
1007-
parsePctEncoded(pchar);
1008-
}
1009-
return iri.substring(start, pos);
1007+
return parsePctEncoded(fchar, '?', '#');
10101008
}
10111009

1012-
private String parsePctEncoded(String[] set)
1010+
private String parsePctEncoded(String[] set, int end1, int end2)
10131011
throws URISyntaxException
10141012
{
1015-
if ('%' != peek(0) && !isMember(set, peek(0))) {
1016-
return "";
1017-
}
10181013
int start = pos;
1019-
while ('%' == peek(0) || isMember(set, peek(0))) {
1020-
if ('%' == peek(0)) {
1014+
while (true) {
1015+
int chr = peek();
1016+
if (chr == EOF || chr == end1 || chr == end2) {
1017+
break; // optimize end character
1018+
}
1019+
else if (('a' <= chr && chr <= 'z') || ('A' <= chr && chr <= 'Z') || ('0' <= chr && chr <= '9')) {
1020+
advance(1);
1021+
}
1022+
else if ('%' == chr) {
10211023
if (isMember(HEXDIG, peek(1)) && isMember(HEXDIG, peek(2))) {
10221024
advance(3);
10231025
}
10241026
else {
1025-
throw error("Illegal Percent encoding");
1027+
throw error("Illegal percent encoding");
10261028
}
10271029
}
1028-
else {
1030+
else if (isMember(set, chr)) {
10291031
advance(1);
10301032
}
1033+
else {
1034+
break;
1035+
}
10311036
}
10321037
return iri.substring(start, pos);
10331038
}
@@ -1051,19 +1056,34 @@ private boolean isMember(String range, int chr) {
10511056
}
10521057
}
10531058

1054-
private String parseMember(String[] set)
1059+
private String parseMember(String[] set, int end)
10551060
throws URISyntaxException
10561061
{
1057-
if (!isMember(set, peek(0))) {
1058-
return "";
1059-
}
10601062
int start = pos;
1061-
while (isMember(set, peek(0))) {
1062-
advance(1);
1063+
while (true) {
1064+
int chr = peek();
1065+
if (chr == EOF || chr == end) {
1066+
break;
1067+
}
1068+
else if (isMember(set, chr)) {
1069+
advance(1);
1070+
}
1071+
else {
1072+
break;
1073+
}
10631074
}
10641075
return iri.substring(start, pos);
10651076
}
10661077

1078+
private int peek() {
1079+
if (pos < iri.length()) {
1080+
return iri.codePointAt(pos);
1081+
}
1082+
else {
1083+
return EOF;
1084+
}
1085+
}
1086+
10671087
private int peek(int ahead) {
10681088
if (pos + ahead < iri.length()) {
10691089
return iri.codePointAt(iri.offsetByCodePoints(pos, ahead));

core/util/src/test/java/org/eclipse/rdf4j/common/net/ParsedIRITest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
*******************************************************************************/
88
package org.eclipse.rdf4j.common.net;
99

10-
import static org.junit.Assert.*;
10+
import static org.junit.Assert.assertEquals;
11+
import static org.junit.Assert.assertFalse;
12+
import static org.junit.Assert.assertNull;
13+
import static org.junit.Assert.assertTrue;
14+
import static org.junit.Assert.fail;
1115

1216
import java.io.ByteArrayOutputStream;
1317
import java.io.UnsupportedEncodingException;

0 commit comments

Comments
 (0)