Skip to content

Commit 30941ca

Browse files
committed
Allow fast paged results.
1 parent ab7ba92 commit 30941ca

1 file changed

Lines changed: 26 additions & 10 deletions

File tree

src/pybool_ir/experiments/retrieval.py

Lines changed: 26 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,8 @@ def __init__(self, indexer: Indexer, collection: Collection,
8686
query_parser: QueryParser = PubmedQueryParser(),
8787
eval_measures: List[Measure] = None,
8888
run_path: Path = None, filter_topics: List[str] = None,
89-
ignore_dates: bool = False, date_field: str = "dp"):
89+
ignore_dates: bool = False, date_field: str = "dp",
90+
page_start=0, page_size=-1):
9091
super().__init__(indexer)
9192
self.ignore_dates = ignore_dates
9293
self.date_field = date_field
@@ -98,6 +99,9 @@ def __init__(self, indexer: Indexer, collection: Collection,
9899
filtered_qrels = list(filter(lambda x: x.query_id in filter_topics, collection.qrels))
99100
collection = Collection(collection.identifier, filtered_topics, filtered_qrels)
100101

102+
self.page_start = page_start
103+
self.page_size = page_size
104+
101105
# Timings for reproducibility and sanity checks.
102106
self.date_created = datetime.now()
103107
self.date_completed = None
@@ -162,19 +166,27 @@ def _retrieval(self) -> List[ScoredDoc]:
162166
for query_id, lucene_query in tqdm(self.queries.items(), desc="retrieval"):
163167
# Documents can remain un-scored for efficiency (?).
164168
hits = self.index.search(lucene_query, scored=False)
165-
for hit in hits:
169+
page_size = self.page_size
170+
if page_size > len(hits):
171+
page_size = hits
172+
173+
page_start = self.page_start
174+
if page_start > len(hits):
175+
page_start = -1
176+
177+
page_end = -1
178+
if self.page_start+page_size < len(hits):
179+
page_end = page_start+page_size
180+
181+
print(len(hits), page_start, page_end)
182+
183+
for hit in hits[page_start:page_end]:
166184
yield ScoredDoc(query_id, hit["id"], 0)
167185
self.date_completed = datetime.now()
168186

169187
def doc(self, pmid: str):
170188
hits = self.index.search(f"id:{pmid}")
171189
for hit in hits:
172-
# article: ix.PubmedArticle = ix.PubmedArticle.from_dict(hit.dict("mesh_heading_list",
173-
# "mesh_qualifier_list",
174-
# "mesh_major_heading_list",
175-
# "keyword_list",
176-
# "publication_type",
177-
# "supplementary_concept_list"))
178190
return hit
179191
return None
180192

@@ -244,7 +256,9 @@ def __repr__(self):
244256

245257
def AdHocExperiment(indexer: Indexer, raw_query: str = None, topic_id: str = "0",
246258
query_parser: QueryParser = PubmedQueryParser(),
247-
date_from="1900/01/01", date_to="3000/01/01", ignore_dates: bool = False, date_field: str = "dp") -> RetrievalExperiment:
259+
date_from="1900/01/01", date_to="3000/01/01",
260+
ignore_dates: bool = False, date_field: str = "dp",
261+
page_start=0, page_size=-1) -> RetrievalExperiment:
248262
"""
249263
Unlike the `RetrievalExperiment` class, which expects a `Collection` object, this class allows for ad-hoc queries to be run, for example:
250264
@@ -261,4 +275,6 @@ def AdHocExperiment(indexer: Indexer, raw_query: str = None, topic_id: str = "0"
261275
raw_query=raw_query,
262276
date_from=date_from,
263277
date_to=date_to)], [])
264-
return RetrievalExperiment(indexer, collection, query_parser, ignore_dates=ignore_dates, date_field=date_field)
278+
return RetrievalExperiment(indexer, collection, query_parser,
279+
ignore_dates=ignore_dates, date_field=date_field,
280+
page_start=page_start, page_size=page_size)

0 commit comments

Comments
 (0)