Skip to content

Commit 08b898c

Browse files
authored
GH-5149 Lucene numdocs param (#5163)
2 parents f687e85 + 42a331d commit 08b898c

11 files changed

Lines changed: 294 additions & 26 deletions

File tree

core/sail/elasticsearch/src/main/java/org/eclipse/rdf4j/sail/elasticsearch/ElasticsearchIndex.java

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import java.util.HashMap;
1919
import java.util.HashSet;
2020
import java.util.Map;
21+
import java.util.Objects;
2122
import java.util.Properties;
2223
import java.util.Set;
2324

@@ -577,10 +578,11 @@ protected Iterable<? extends DocumentScore> query(Resource subject, QuerySpec sp
577578
}
578579

579580
SearchHits hits;
581+
int numDocs = Objects.requireNonNullElse(spec.getNumDocs(), -1);
580582
if (subject != null) {
581-
hits = search(subject, request, qb);
583+
hits = search(subject, request, qb, numDocs);
582584
} else {
583-
hits = search(request, qb);
585+
hits = search(request, qb, numDocs);
584586
}
585587
return Iterables.transform(hits, new Function<>() {
586588

@@ -600,11 +602,24 @@ public DocumentScore apply(SearchHit hit) {
600602
* @return search hits
601603
*/
602604
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query) {
605+
return search(resource, request, query, -1);
606+
}
607+
608+
/**
609+
* Evaluates the given query only for the given resource.
610+
*
611+
* @param resource
612+
* @param request
613+
* @param query
614+
* @param numDocs
615+
* @return search hits
616+
*/
617+
public SearchHits search(Resource resource, SearchRequestBuilder request, QueryBuilder query, int numDocs) {
603618
// rewrite the query
604619
QueryBuilder idQuery = QueryBuilders.termQuery(SearchFields.URI_FIELD_NAME,
605620
SearchFields.getResourceID(resource));
606621
QueryBuilder combinedQuery = QueryBuilders.boolQuery().must(idQuery).must(query);
607-
return search(request, combinedQuery);
622+
return search(request, combinedQuery, numDocs);
608623
}
609624

610625
@Override
@@ -712,10 +727,23 @@ private ShapeRelation toSpatialOp(String relation) {
712727
* Evaluates the given query and returns the results as a SearchHits instance.
713728
*/
714729
public SearchHits search(SearchRequestBuilder request, QueryBuilder query) {
730+
return search(request, query, -1);
731+
}
732+
733+
/**
734+
* Evaluates the given query and returns the results as a SearchHits instance.
735+
*/
736+
public SearchHits search(SearchRequestBuilder request, QueryBuilder query, int numDocs) {
715737
String[] types = getTypes();
716738
int nDocs;
717-
if (maxDocs > 0) {
718-
nDocs = maxDocs;
739+
if (numDocs > 0) {
740+
if (maxDocs > 0 && maxDocs < numDocs) {
741+
nDocs = maxDocs;
742+
} else {
743+
nDocs = numDocs;
744+
}
745+
} else if (defaultNumDocs > 0) {
746+
nDocs = defaultNumDocs;
719747
} else {
720748
long docCount = client.prepareSearch(indexName)
721749
.setTypes(types)

core/sail/lucene-api/src/main/java/org/eclipse/rdf4j/sail/lucene/AbstractSearchIndex.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ public abstract class AbstractSearchIndex implements SearchIndex {
6565
REJECTED_DATATYPES.add("http://www.w3.org/2001/XMLSchema#float");
6666
}
6767

68+
protected int defaultNumDocs;
6869
protected int maxDocs;
6970

7071
protected Set<String> wktFields = Collections.singleton(SearchFields.getPropertyField(GEO.AS_WKT));
@@ -75,8 +76,10 @@ public abstract class AbstractSearchIndex implements SearchIndex {
7576

7677
@Override
7778
public void initialize(Properties parameters) throws Exception {
78-
String maxDocParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
79-
maxDocs = (maxDocParam != null) ? Integer.parseInt(maxDocParam) : -1;
79+
String maxDocumentsParam = parameters.getProperty(LuceneSail.MAX_DOCUMENTS_KEY);
80+
maxDocs = (maxDocumentsParam != null) ? Integer.parseInt(maxDocumentsParam) : -1;
81+
String defaultNumDocsParam = parameters.getProperty(LuceneSail.DEFAULT_NUM_DOCS_KEY);
82+
defaultNumDocs = (defaultNumDocsParam != null) ? Integer.parseInt(defaultNumDocsParam) : defaultNumDocs;
8083

8184
String wktFieldParam = parameters.getProperty(LuceneSail.WKT_FIELDS);
8285
if (wktFieldParam != null) {

core/sail/lucene-api/src/main/java/org/eclipse/rdf4j/sail/lucene/LuceneSail.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,10 +290,17 @@ public class LuceneSail extends NotifyingSailWrapper {
290290
public static final String LUCENE_RAMDIR_KEY = "useramdir";
291291

292292
/**
293-
* Set the key "maxDocuments=&lt;n&gt;" as sail parameter to limit the maximum number of documents to return from a
294-
* search query. The default is to return all documents. NB: this may involve extra cost for some SearchIndex
293+
* Set the key "defaultNumDocs=&lt;n&gt;" as sail parameter to set the number of documents returned from a search
294+
* query that does not request a specific number. The default is to return all documents. NB: this may involve extra cost for some SearchIndex
295295
* implementations as they may have to determine this number.
296296
*/
297+
public static final String DEFAULT_NUM_DOCS_KEY = "defaultNumDocs";
298+
299+
/**
300+
* Set the key "maxDocuments=&lt;n&gt;" as sail parameter to cap the number of documents that a single search query
301+
* may request. The default is the value of the {@link #DEFAULT_NUM_DOCS_KEY}
302+
* parameter.
303+
*/
297304
public static final String MAX_DOCUMENTS_KEY = "maxDocuments";
298305

299306
/**

core/sail/lucene-api/src/main/java/org/eclipse/rdf4j/sail/lucene/LuceneSailSchema.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ public class LuceneSailSchema {
5252

5353
public static final IRI CONTEXT;
5454

55+
public static final IRI NUM_DOCS;
56+
5557
static {
5658
ValueFactory factory = SimpleValueFactory.getInstance(); // compatible with beta4:
5759
// creating a new factory
@@ -73,5 +75,6 @@ public class LuceneSailSchema {
7375
WITHIN_DISTANCE = factory.createIRI(NAMESPACE + "withinDistance");
7476
DISTANCE = factory.createIRI(NAMESPACE + "distance");
7577
CONTEXT = factory.createIRI(NAMESPACE + "context");
78+
NUM_DOCS = factory.createIRI(NAMESPACE + "numDocs");
7679
}
7780
}

core/sail/lucene-api/src/main/java/org/eclipse/rdf4j/sail/lucene/QuerySpec.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@
1616
import java.util.stream.Collectors;
1717

1818
import org.eclipse.rdf4j.model.IRI;
19+
import org.eclipse.rdf4j.model.Literal;
1920
import org.eclipse.rdf4j.model.Resource;
21+
import org.eclipse.rdf4j.model.Value;
2022
import org.eclipse.rdf4j.query.algebra.QueryModelNode;
2123
import org.eclipse.rdf4j.query.algebra.SingletonSet;
2224
import org.eclipse.rdf4j.query.algebra.StatementPattern;
@@ -67,21 +69,43 @@ private static void append(Var var, StringBuilder buffer) {
6769

6870
private final StatementPattern idPattern;
6971

72+
private final StatementPattern numDocsPattern;
73+
7074
private final Resource subject;
7175

7276
private final String matchesVarName;
7377

7478
private final String scoreVarName;
7579

80+
private final Integer numDocs;
81+
7682
public QuerySpec(StatementPattern matchesPattern, Collection<QueryParam> queryPatterns,
7783
StatementPattern scorePattern, StatementPattern typePattern,
7884
StatementPattern idPattern, Resource subject) {
85+
this(matchesPattern, queryPatterns, scorePattern, typePattern, idPattern, null, subject);
86+
}
87+
88+
public QuerySpec(StatementPattern matchesPattern, Collection<QueryParam> queryPatterns,
89+
StatementPattern scorePattern, StatementPattern typePattern,
90+
StatementPattern idPattern, StatementPattern numDocsPattern, Resource subject) {
7991
this.matchesPattern = matchesPattern;
8092
this.queryPatterns = queryPatterns;
8193
this.scorePattern = scorePattern;
8294
this.typePattern = typePattern;
8395
this.idPattern = idPattern;
96+
this.numDocsPattern = numDocsPattern;
8497
this.subject = subject;
98+
if (numDocsPattern != null) {
99+
Value val = numDocsPattern.getObjectVar().getValue();
100+
if (val != null && val.isLiteral()) {
101+
this.numDocs = ((Literal) val).intValue();
102+
} else {
103+
throw new IllegalArgumentException("numDocs should be constant literal value");
104+
}
105+
} else {
106+
this.numDocs = null;
107+
}
108+
85109
if (matchesPattern != null) {
86110
this.matchesVarName = matchesPattern.getSubjectVar().getName();
87111
} else {
@@ -101,9 +125,11 @@ public QuerySpec(String matchesVarName, String propertyVarName, String scoreVarN
101125
this.matchesPattern = null;
102126
this.scorePattern = null;
103127
this.typePattern = null;
128+
this.numDocsPattern = null;
104129
this.queryPatterns = Set.of();
105130
this.idPattern = null;
106131
this.subject = subject;
132+
this.numDocs = null;
107133
}
108134

109135
@Override
@@ -121,6 +147,7 @@ public QueryModelNode removeQueryPatterns() {
121147
replace(getScorePattern(), replacement);
122148
replace(getTypePattern(), replacement);
123149
replace(getIdPattern(), replacement);
150+
replace(getNumDocsPattern(), replacement);
124151

125152
final QueryModelNode placeholder = new SingletonSet();
126153

@@ -154,6 +181,10 @@ public StatementPattern getScorePattern() {
154181
return scorePattern;
155182
}
156183

184+
public StatementPattern getNumDocsPattern() {
185+
return numDocsPattern;
186+
}
187+
157188
/**
158189
* The variable name associated with the query score
159190
*
@@ -163,6 +194,10 @@ public String getScoreVariableName() {
163194
return scoreVarName;
164195
}
165196

197+
public Integer getNumDocs() {
198+
return numDocs;
199+
}
200+
166201
public StatementPattern getTypePattern() {
167202
return typePattern;
168203
}

core/sail/lucene-api/src/main/java/org/eclipse/rdf4j/sail/lucene/QuerySpecBuilder.java

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.INDEXID;
1616
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.LUCENE_QUERY;
1717
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.MATCHES;
18+
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.NUM_DOCS;
1819
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.PROPERTY;
1920
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.QUERY;
2021
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SCORE;
@@ -152,7 +153,7 @@ public void process(TupleExpr tupleExpr, BindingSet bindings, Collection<SearchQ
152153
}
153154

154155
// find the relevant outgoing patterns
155-
StatementPattern typePattern, propertyPattern, scorePattern, snippetPattern;
156+
StatementPattern typePattern, propertyPattern, scorePattern, snippetPattern, numDocsPattern;
156157
List<StatementPattern> queryPatterns;
157158

158159
try {
@@ -161,6 +162,7 @@ public void process(TupleExpr tupleExpr, BindingSet bindings, Collection<SearchQ
161162
propertyPattern = getPattern(matchesVar, filter.propertyPatterns);
162163
scorePattern = getPattern(matchesVar, filter.scorePatterns);
163164
snippetPattern = getPattern(matchesVar, filter.snippetPatterns);
165+
numDocsPattern = getPattern(matchesVar, filter.numDocsPatterns);
164166
} catch (IllegalArgumentException e) {
165167
failOrWarn(e);
166168
continue;
@@ -302,7 +304,8 @@ else if (propertyValue != null) {
302304
queryString, propertyURI, null));
303305
}
304306

305-
QuerySpec querySpec = new QuerySpec(matchesPattern, queries, scorePattern, typePattern, idPattern, subject);
307+
QuerySpec querySpec = new QuerySpec(matchesPattern, queries, scorePattern, typePattern, idPattern,
308+
numDocsPattern, subject);
306309

307310
if (querySpec.isEvaluable()) {
308311
// constant optimizer
@@ -341,6 +344,10 @@ else if (propertyValue != null) {
341344
funcCall.addArg(new ValueConstant(LuceneSailSchema.SNIPPET));
342345
funcCall.addResultVar(snippetVar);
343346
}
347+
if (numDocsPattern != null) {
348+
funcCall.addArg(new ValueConstant(LuceneSailSchema.NUM_DOCS));
349+
funcCall.addArg(numDocsPattern.getObjectVar());
350+
}
344351

345352
Join join = new Join();
346353
matchesPattern.replaceWith(join);
@@ -465,6 +472,8 @@ private static class PatternFilter extends AbstractQueryModelVisitor<RuntimeExce
465472

466473
public ArrayList<StatementPattern> boostPatterns = new ArrayList<>();
467474

475+
public ArrayList<StatementPattern> numDocsPatterns = new ArrayList<>();
476+
468477
/**
469478
* Method implementing the visitor pattern that gathers all statements using a predicate from the LuceneSail's
470479
* namespace.
@@ -487,6 +496,8 @@ public void meet(StatementPattern node) {
487496
idPatterns.add(node);
488497
} else if (BOOST.equals(predicate)) {
489498
boostPatterns.add(node);
499+
} else if (NUM_DOCS.equals(predicate)) {
500+
numDocsPatterns.add(node);
490501
} else if (TYPE.equals(predicate)) {
491502
Value object = node.getObjectVar().getValue();
492503
if (LUCENE_QUERY.equals(object)) {

core/sail/lucene-api/src/test/java/org/eclipse/rdf4j/sail/lucene/QuerySpecBuilderTest.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.BOOST;
1515
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.LUCENE_QUERY;
1616
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.MATCHES;
17+
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.NUM_DOCS;
1718
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.QUERY;
1819
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SCORE;
1920
import static org.eclipse.rdf4j.sail.lucene.LuceneSailSchema.SNIPPET;
@@ -55,6 +56,7 @@ public void testQueryInterpretation() {
5556
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
5657
"<" + QUERY + "> \"my Lucene query\"; " +
5758
"<" + SCORE + "> ?Score; " +
59+
"<" + NUM_DOCS + "> 76; " +
5860
"<" + SNIPPET + "> ?Snippet ]. } ";
5961
ParsedQuery query = parser.parseQuery(buffer, null);
6062
TupleExpr tupleExpr = query.getTupleExpr();
@@ -69,6 +71,8 @@ public void testQueryInterpretation() {
6971
assertEquals("Score", querySpec.getScorePattern().getObjectVar().getName());
7072
assertEquals("Snippet", param.getSnippetPattern().getObjectVar().getName());
7173
assertEquals(LUCENE_QUERY, querySpec.getTypePattern().getObjectVar().getValue());
74+
assertEquals(76, querySpec.getNumDocs());
75+
assertEquals(76, ((Literal) querySpec.getNumDocsPattern().getObjectVar().getValue()).intValue());
7276
assertEquals("my Lucene query", param.getQuery());
7377
assertNull(querySpec.getSubject());
7478
}
@@ -80,11 +84,13 @@ public void testMultipleQueriesInterpretation() {
8084
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
8185
"<" + QUERY + "> \"my Lucene query\"; " +
8286
"<" + SCORE + "> ?score1; " +
87+
"<" + NUM_DOCS + "> 86; " +
8388
"<" + SNIPPET + "> ?snippet1 ]. " +
8489
" ?sub2 <" + MATCHES + "> [ " +
8590
"<" + TYPE + "> <" + LUCENE_QUERY + ">; " +
8691
"<" + QUERY + "> \"second lucene query\"; " +
8792
"<" + SCORE + "> ?score2; " +
93+
"<" + NUM_DOCS + "> 13; " +
8894
"<" + SNIPPET + "> ?snippet2 ]. " +
8995
// and connect them both via any X in between, just as salt to make the
9096
// parser do something
@@ -103,6 +109,7 @@ public void testMultipleQueriesInterpretation() {
103109
// Matched the first
104110
assertEquals("sub1", querySpec.getMatchesPattern().getSubjectVar().getName());
105111
assertEquals(1, querySpec.getQueryPatterns().size());
112+
assertEquals(86, querySpec.getNumDocs());
106113
QuerySpec.QueryParam param = querySpec.getQueryPatterns().iterator().next();
107114
assertEquals("my Lucene query",
108115
((Literal) param.getQueryPattern().getObjectVar().getValue()).getLabel());
@@ -116,6 +123,7 @@ public void testMultipleQueriesInterpretation() {
116123
// and the second
117124
assertEquals("sub2", querySpec.getMatchesPattern().getSubjectVar().getName());
118125
assertEquals(1, querySpec.getQueryPatterns().size());
126+
assertEquals(13, querySpec.getNumDocs());
119127
QuerySpec.QueryParam param = querySpec.getQueryPatterns().iterator().next();
120128
assertEquals("second lucene query",
121129
((Literal) param.getQueryPattern().getObjectVar().getValue()).getLabel());

0 commit comments

Comments
 (0)