Skip to content

Commit bc2e6ab

Browse files
author
James Leigh
committed
Fix #62: Decode both \uXXXX and \UXXXXXXXX before processing SPARQL grammar
Signed-off-by: James Leigh <james.leigh@ontotext.com>
1 parent 7da044c commit bc2e6ab

6 files changed

Lines changed: 270 additions & 64 deletions

File tree

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/* Generated By:JavaCC: Do not edit this line. CharStream.java Version 5.0 */
2+
/* JavaCCOptions:STATIC=false,SUPPORT_CLASS_VISIBILITY_PUBLIC=true */
3+
package org.eclipse.rdf4j.query.parser.sparql.ast;
4+
5+
/**
6+
* This interface describes a character stream that maintains line and
7+
* column number positions of the characters. It also has the capability
8+
* to backup the stream to some extent. An implementation of this
9+
* interface is used in the TokenManager implementation generated by
10+
* JavaCCParser.
11+
*
12+
* All the methods except backup can be implemented in any fashion. backup
13+
* needs to be implemented correctly for the correct operation of the lexer.
14+
* Rest of the methods are all used to get information like line number,
15+
* column number and the String that constitutes a token and are not used
16+
* by the lexer. Hence their implementation won't affect the generated lexer's
17+
* operation.
18+
*/
19+
20+
public
21+
interface CharStream {
22+
23+
/**
24+
* Returns the next character from the selected input. The method
25+
* of selecting the input is the responsibility of the class
26+
* implementing this interface. Can throw any java.io.IOException.
27+
*/
28+
char readChar() throws java.io.IOException;
29+
30+
@Deprecated
31+
/**
32+
* Returns the column position of the character last read.
33+
* @deprecated
34+
* @see #getEndColumn
35+
*/
36+
int getColumn();
37+
38+
@Deprecated
39+
/**
40+
* Returns the line number of the character last read.
41+
* @deprecated
42+
* @see #getEndLine
43+
*/
44+
int getLine();
45+
46+
/**
47+
* Returns the column number of the last character for current token (being
48+
* matched after the last call to BeginToken).
49+
*/
50+
int getEndColumn();
51+
52+
/**
53+
* Returns the line number of the last character for current token (being
54+
* matched after the last call to BeginToken).
55+
*/
56+
int getEndLine();
57+
58+
/**
59+
* Returns the column number of the first character for current token (being
60+
* matched after the last call to BeginToken).
61+
*/
62+
int getBeginColumn();
63+
64+
/**
65+
* Returns the line number of the first character for current token (being
66+
* matched after the last call to BeginToken).
67+
*/
68+
int getBeginLine();
69+
70+
/**
71+
* Backs up the input stream by amount steps. Lexer calls this method if it
72+
* had already read some characters, but could not use them to match a
73+
* (longer) token. So, they will be used again as the prefix of the next
74+
* token and it is the implementation's responsibility to do this right.
75+
*/
76+
void backup(int amount);
77+
78+
/**
79+
* Returns the next character that marks the beginning of the next token.
80+
* All characters must remain in the buffer between two successive calls
81+
* to this method to implement backup correctly.
82+
*/
83+
char BeginToken() throws java.io.IOException;
84+
85+
/**
86+
* Returns a string made up of characters from the marked token beginning
87+
* to the current buffer position. Implementations have the choice of returning
88+
* anything that they want to. For example, for efficiency, one might decide
89+
* to just return null, which is a valid implementation.
90+
*/
91+
String GetImage();
92+
93+
/**
94+
* Returns an array of characters that make up the suffix of length 'len' for
95+
* the currently matched token. This is used to build up the matched string
96+
* for use in actions in the case of MORE. A simple and inefficient
97+
* implementation of this is as follows :
98+
*
99+
* {
100+
* String t = GetImage();
101+
* return t.substring(t.length() - len, t.length()).toCharArray();
102+
* }
103+
*/
104+
char[] GetSuffix(int len);
105+
106+
/**
107+
* The lexer calls this function to indicate that it is done with the stream
108+
* and hence implementations can free any resources held by this class.
109+
* Again, the body of this function can be just empty and it will not
110+
* affect the lexer's operation.
111+
*/
112+
void Done();
113+
114+
}
115+
/* JavaCC - OriginalChecksum=d5d02d7f2852c9b712f39bed41ca22b5 (do not edit this line) */

core/queryparser/sparql/src/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/SyntaxTreeBuilder.java

Lines changed: 19 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,10 @@ public class SyntaxTreeBuilder/*@bgen(jjtree)*/implements SyntaxTreeBuilderTreeC
2929
public static ASTQueryContainer parseQuery(String query)
3030
throws TokenMgrError, ParseException
3131
{
32-
SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new StringReader(query) );
32+
SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new UnicodeEscapeStream(new StringReader(query), 1) );
3333

3434
// Set size of tab to 1 to force tokenmanager to report correct column
3535
// index for substring splitting of service graph pattern.
36-
stb.jj_input_stream.setTabSize(1);
3736

3837
ASTQueryContainer container = stb.QueryContainer();
3938
container.setSourceString(query);
@@ -51,11 +50,10 @@ public static ASTQueryContainer parseQuery(String query)
5150
public static ASTUpdateSequence parseUpdateSequence(String sequence)
5251
throws TokenMgrError, ParseException
5352
{
54-
SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new StringReader(sequence) );
53+
SyntaxTreeBuilder stb = new SyntaxTreeBuilder( new UnicodeEscapeStream(new StringReader(sequence), 1) );
5554

5655
// Set size of tab to 1 to force tokenmanager to report correct column
5756
// index for substring splitting of service graph pattern.
58-
stb.jj_input_stream.setTabSize(1);
5957

6058
ASTUpdateSequence seq = stb.UpdateSequence();
6159
seq.setSourceString(sequence);
@@ -8159,6 +8157,11 @@ private boolean jj_2_7(int xla) {
81598157
finally { jj_save(6, xla); }
81608158
}
81618159

8160+
private boolean jj_3R_67() {
8161+
if (jj_scan_token(LBRACK)) return true;
8162+
return false;
8163+
}
8164+
81628165
private boolean jj_3R_61() {
81638166
Token xsp;
81648167
xsp = jj_scanpos;
@@ -8648,14 +8651,8 @@ private boolean jj_3R_84() {
86488651
return false;
86498652
}
86508653

8651-
private boolean jj_3R_67() {
8652-
if (jj_scan_token(LBRACK)) return true;
8653-
return false;
8654-
}
8655-
86568654
/** Generated Token Manager. */
86578655
public SyntaxTreeBuilderTokenManager token_source;
8658-
JavaCharStream jj_input_stream;
86598656
/** Current token. */
86608657
public Token token;
86618658
/** Next token. */
@@ -8701,41 +8698,9 @@ private static void jj_la1_init_5() {
87018698
private boolean jj_rescan = false;
87028699
private int jj_gc = 0;
87038700

8704-
/** Constructor with InputStream. */
8705-
public SyntaxTreeBuilder(java.io.InputStream stream) {
8706-
this(stream, null);
8707-
}
8708-
/** Constructor with InputStream and supplied encoding */
8709-
public SyntaxTreeBuilder(java.io.InputStream stream, String encoding) {
8710-
try { jj_input_stream = new JavaCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
8711-
token_source = new SyntaxTreeBuilderTokenManager(jj_input_stream);
8712-
token = new Token();
8713-
jj_ntk = -1;
8714-
jj_gen = 0;
8715-
for (int i = 0; i < 174; i++) jj_la1[i] = -1;
8716-
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
8717-
}
8718-
8719-
/** Reinitialise. */
8720-
public void ReInit(java.io.InputStream stream) {
8721-
ReInit(stream, null);
8722-
}
8723-
/** Reinitialise. */
8724-
public void ReInit(java.io.InputStream stream, String encoding) {
8725-
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
8726-
token_source.ReInit(jj_input_stream);
8727-
token = new Token();
8728-
jj_ntk = -1;
8729-
jjtree.reset();
8730-
jj_gen = 0;
8731-
for (int i = 0; i < 174; i++) jj_la1[i] = -1;
8732-
for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls();
8733-
}
8734-
8735-
/** Constructor. */
8736-
public SyntaxTreeBuilder(java.io.Reader stream) {
8737-
jj_input_stream = new JavaCharStream(stream, 1, 1);
8738-
token_source = new SyntaxTreeBuilderTokenManager(jj_input_stream);
8701+
/** Constructor with user supplied CharStream. */
8702+
public SyntaxTreeBuilder(CharStream stream) {
8703+
token_source = new SyntaxTreeBuilderTokenManager(stream);
87398704
token = new Token();
87408705
jj_ntk = -1;
87418706
jj_gen = 0;
@@ -8744,9 +8709,8 @@ public SyntaxTreeBuilder(java.io.Reader stream) {
87448709
}
87458710

87468711
/** Reinitialise. */
8747-
public void ReInit(java.io.Reader stream) {
8748-
jj_input_stream.ReInit(stream, 1, 1);
8749-
token_source.ReInit(jj_input_stream);
8712+
public void ReInit(CharStream stream) {
8713+
token_source.ReInit(stream);
87508714
token = new Token();
87518715
jj_ntk = -1;
87528716
jjtree.reset();
@@ -8865,18 +8829,21 @@ private void jj_add_error_token(int kind, int pos) {
88658829
for (int i = 0; i < jj_endpos; i++) {
88668830
jj_expentry[i] = jj_lasttokens[i];
88678831
}
8868-
jj_entries_loop: for (java.util.Iterator<?> it = jj_expentries.iterator(); it.hasNext();) {
8832+
boolean exists = false;
8833+
for (java.util.Iterator<?> it = jj_expentries.iterator(); it.hasNext();) {
8834+
exists = true;
88698835
int[] oldentry = (int[])(it.next());
88708836
if (oldentry.length == jj_expentry.length) {
88718837
for (int i = 0; i < jj_expentry.length; i++) {
88728838
if (oldentry[i] != jj_expentry[i]) {
8873-
continue jj_entries_loop;
8839+
exists = false;
8840+
break;
88748841
}
88758842
}
8876-
jj_expentries.add(jj_expentry);
8877-
break jj_entries_loop;
8843+
if (exists) break;
88788844
}
88798845
}
8846+
if (!exists) jj_expentries.add(jj_expentry);
88808847
if (pos != 0) jj_lasttokens[(jj_endpos = pos) - 1] = kind;
88818848
}
88828849
}

core/queryparser/sparql/src/main/java/org/eclipse/rdf4j/query/parser/sparql/ast/SyntaxTreeBuilderTokenManager.java

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3315,25 +3315,23 @@ private static final boolean jjCanMove_2(int hiByte, int i1, int i2, long l1, lo
33153315
static final long[] jjtoSpecial = {
33163316
0x8L, 0x0L, 0x0L,
33173317
};
3318-
protected JavaCharStream input_stream;
3318+
protected CharStream input_stream;
33193319
private final int[] jjrounds = new int[157];
33203320
private final int[] jjstateSet = new int[314];
33213321
protected char curChar;
33223322
/** Constructor. */
3323-
public SyntaxTreeBuilderTokenManager(JavaCharStream stream){
3324-
if (JavaCharStream.staticFlag)
3325-
throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer.");
3323+
public SyntaxTreeBuilderTokenManager(CharStream stream){
33263324
input_stream = stream;
33273325
}
33283326

33293327
/** Constructor. */
3330-
public SyntaxTreeBuilderTokenManager(JavaCharStream stream, int lexState){
3328+
public SyntaxTreeBuilderTokenManager(CharStream stream, int lexState){
33313329
this(stream);
33323330
SwitchTo(lexState);
33333331
}
33343332

33353333
/** Reinitialise parser. */
3336-
public void ReInit(JavaCharStream stream)
3334+
public void ReInit(CharStream stream)
33373335
{
33383336
jjmatchedPos = jjnewStateCnt = 0;
33393337
curLexState = defaultLexState;
@@ -3349,7 +3347,7 @@ private void ReInitRounds()
33493347
}
33503348

33513349
/** Reinitialise parser. */
3352-
public void ReInit(JavaCharStream stream, int lexState)
3350+
public void ReInit(CharStream stream, int lexState)
33533351
{
33543352
ReInit(stream);
33553353
SwitchTo(lexState);

0 commit comments

Comments
 (0)