Skip to content

Commit d829ef7

Browse files
committed
Initial indexing and query implementation for clinicaltrials.gov.
1 parent 97fc9cb commit d829ef7

5 files changed

Lines changed: 549 additions & 203 deletions

File tree

src/pybool_ir/cli/__main__.py

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from tqdm.auto import tqdm
1010

1111
import pybool_ir
12-
from pybool_ir.experiments.retrieval import AdHocExperiment
1312
from pybool_ir.index.generic import GenericSearcher
1413
from pybool_ir.query import GenericQueryParser
1514

@@ -190,6 +189,41 @@ def pubmed_index(baseline_path: Path, index_path: Path, store_fields: bool):
190189
ix.bulk_index(Path(baseline_path))
191190

192191

192+
193+
# `ctgov index` sub-command: bulk-indexes a baseline download into an index
# written at the given location.
#
# Documented with comments rather than a docstring on purpose: click shows a
# command function's docstring as its --help text, which would change the
# CLI output.
@ctgov.command("index")
@click.option(
    "-b",
    "--baseline",
    "baseline_path",
    type=click.Path(),
    multiple=False,
    required=True,
    help="location of baseline download",
)
@click.option(
    "-i",
    "--index",
    "index_path",
    type=click.Path(),
    multiple=False,
    required=True,
    help="location to write the lucene index",
)
@click.option(
    "-s",
    "--store",
    "store_fields",
    default=False,
    type=click.BOOL,
    multiple=False,
    required=False,
    help="whether to store fields or not",
)
def ctgov_index(baseline_path: Path, index_path: Path, store_fields: bool):
    # Imported lazily so the indexer module is only loaded when this
    # command is actually invoked.
    from pybool_ir.index.ctgov import ClinicalTrialsGovIndexer

    # Both option values are wrapped in Path() before being handed on.
    indexer = ClinicalTrialsGovIndexer(
        Path(index_path),
        store_fields=store_fields,
    )
    # The indexer is used as a context manager around the bulk load.
    with indexer as open_index:
        baseline_dir = Path(baseline_path)
        open_index.bulk_index(baseline_dir)
226+
193227
@ir_datasets.command("index")
194228
@click.option(
195229
"-c",
@@ -329,11 +363,6 @@ def validate(self, query):
329363
except Exception as e:
330364
raise ValidationError(message=str(e), cursor_position=-1)
331365

332-
with AdHocExperiment(PubmedIndexer(Path(index_path), store_fields=store_fields), raw_query="test",page_start=0,page_size=10) as ex:
333-
results = ex.run
334-
total_count = len(results)
335-
print(results)
336-
337366
with PubmedIndexer(Path(index_path), store_fields=store_fields) as ix:
338367
print(f"pybool_ir {pybool_ir.__version__}")
339368
print(f"loaded: {ix.index_path}")
@@ -343,6 +372,54 @@ def validate(self, query):
343372
lucene_query = parser.parse_lucene(raw_query)
344373
ix.search_fmt(lucene_query)
345374

375+
# `ctgov search` sub-command: opens the index at the given location and runs
# an interactive prompt loop. Each entered query is validated with the Essie
# parser, translated with parse_lucene(), and passed to the indexer's
# search_fmt().
#
# Documented with comments rather than a docstring on purpose: click shows a
# command function's docstring as its --help text.
@ctgov.command("search")
@click.option(
    "-i",
    "--index",
    "index_path",
    type=click.Path(),
    multiple=False,
    required=True,
    help="location to the lucene index",
)
@click.option(
    "-s",
    "--store",
    "store_fields",
    default=False,
    type=click.BOOL,
    multiple=False,
    required=False,
    help="whether to display stored fields or not",
)
def pubmed_search(index_path: Path, store_fields: bool):
    # NOTE(review): the name says "pubmed" but this command is registered on
    # the ctgov group and uses ClinicalTrialsGovIndexer. If this module also
    # defines a `pubmed_search` for the PubMed command, this definition
    # rebinds that module-level name. Consider renaming to `ctgov_search`
    # once it is confirmed that nothing imports the name.
    from pybool_ir.index.ctgov import ClinicalTrialsGovIndexer
    from pybool_ir.query.essie.parser import EssieQueryParser
    from prompt_toolkit import PromptSession
    from prompt_toolkit.validation import Validator
    from prompt_toolkit.validation import ValidationError

    parser = EssieQueryParser()

    class QueryValidator(Validator):
        """Reject input the Essie parser cannot parse."""

        def validate(self, query):
            try:
                parser._parse(query.text)
            except Exception as e:
                # Surface any parser failure as a prompt validation error,
                # keeping the original exception as the cause.
                raise ValidationError(message=str(e), cursor_position=-1) from e

    with ClinicalTrialsGovIndexer(Path(index_path), store_fields=store_fields) as ix:
        print(f"pybool_ir {pybool_ir.__version__}")
        print(f"loaded: {ix.index_path}")
        session = PromptSession()
        # The validator only closes over `parser`, so one instance can serve
        # every prompt.
        validator = QueryValidator()
        while True:
            try:
                raw_query = session.prompt("?>", validator=validator)
            except EOFError:
                # End-of-input (Ctrl-D) ends the session; leaving the loop
                # lets the `with` block exit normally.
                break
            lucene_query = parser.parse_lucene(raw_query)
            ix.search_fmt(lucene_query)
422+
346423

347424
@csur.command("process")
348425
@click.option(

0 commit comments

Comments
 (0)