@@ -86,7 +86,8 @@ def __init__(self, indexer: Indexer, collection: Collection,
8686 query_parser : QueryParser = PubmedQueryParser (),
8787 eval_measures : List [Measure ] = None ,
8888 run_path : Path = None , filter_topics : List [str ] = None ,
89- ignore_dates : bool = False , date_field : str = "dp" ):
89+ ignore_dates : bool = False , date_field : str = "dp" ,
90+ page_start = 0 , page_size = - 1 ):
9091 super ().__init__ (indexer )
9192 self .ignore_dates = ignore_dates
9293 self .date_field = date_field
@@ -98,6 +99,9 @@ def __init__(self, indexer: Indexer, collection: Collection,
9899 filtered_qrels = list (filter (lambda x : x .query_id in filter_topics , collection .qrels ))
99100 collection = Collection (collection .identifier , filtered_topics , filtered_qrels )
100101
102+ self .page_start = page_start
103+ self .page_size = page_size
104+
101105 # Timings for reproducibility and sanity checks.
102106 self .date_created = datetime .now ()
103107 self .date_completed = None
@@ -162,19 +166,27 @@ def _retrieval(self) -> List[ScoredDoc]:
162166 for query_id , lucene_query in tqdm (self .queries .items (), desc = "retrieval" ):
163167 # Documents can remain un-scored for efficiency (?).
164168 hits = self .index .search (lucene_query , scored = False )
165- for hit in hits :
169+ page_size = self .page_size
170+ if page_size > len (hits ):
171+ page_size = hits
172+
173+ page_start = self .page_start
174+ if page_start > len (hits ):
175+ page_start = - 1
176+
177+ page_end = - 1
178+ if self .page_start + page_size < len (hits ):
179+ page_end = page_start + page_size
180+
181+ print (len (hits ), page_start , page_end )
182+
183+ for hit in hits [page_start :page_end ]:
166184 yield ScoredDoc (query_id , hit ["id" ], 0 )
167185 self .date_completed = datetime .now ()
168186
def doc(self, pmid: str):
    """Look up a single document in the index by its identifier.

    Runs the query ``id:<pmid>`` against ``self.index`` and returns the
    first hit produced by the search, or ``None`` when the search yields
    nothing.
    """
    matches = self.index.search(f"id:{pmid}")
    # Only the first match is of interest; fall back to None when empty.
    return next(iter(matches), None)
180192
@@ -244,7 +256,9 @@ def __repr__(self):
244256
245257def AdHocExperiment (indexer : Indexer , raw_query : str = None , topic_id : str = "0" ,
246258 query_parser : QueryParser = PubmedQueryParser (),
247- date_from = "1900/01/01" , date_to = "3000/01/01" , ignore_dates : bool = False , date_field : str = "dp" ) -> RetrievalExperiment :
259+ date_from = "1900/01/01" , date_to = "3000/01/01" ,
260+ ignore_dates : bool = False , date_field : str = "dp" ,
261+ page_start = 0 , page_size = - 1 ) -> RetrievalExperiment :
248262 """
249263 Unlike the `RetrievalExperiment` class, which expects a `Collection` object, this class allows for ad-hoc queries to be run, for example:
250264
@@ -261,4 +275,6 @@ def AdHocExperiment(indexer: Indexer, raw_query: str = None, topic_id: str = "0"
261275 raw_query = raw_query ,
262276 date_from = date_from ,
263277 date_to = date_to )], [])
264- return RetrievalExperiment (indexer , collection , query_parser , ignore_dates = ignore_dates , date_field = date_field )
278+ return RetrievalExperiment (indexer , collection , query_parser ,
279+ ignore_dates = ignore_dates , date_field = date_field ,
280+ page_start = page_start , page_size = page_size )
0 commit comments