@@ -97,27 +97,45 @@ def get_rules(parsers: List[str], regexes: Optional[io.IOBase]) -> Matchers:
9797 return rules
9898
9999
100+ def parse_item (item : str , all : list [str ] | None ) -> list [str ]:
101+ if item == '*' :
102+ assert all
103+ return all
104+ elif item .startswith ('{' ):
105+ assert item .endswith ('}' )
106+ return item [1 :- 1 ].split (',' )
107+ else :
108+ return [item ]
109+
def rules_to_parsers(args: argparse.Namespace) -> Iterator[tuple[str, str, int]]:
    """Expand every selector expression into unique (parser, cache, size) triplets.

    Each selector has the form `parser:cache:size`, where each field may be
    a literal, a `{a,b,c}` enumeration, or `*` (see `parse_item`).  Parsers
    that cannot cache, and configurations with a cache size of 0, are forced
    to the `'none'` cache.  Duplicates across selectors are emitted once, in
    first-seen order.
    """
    emitted: set[tuple[str, str, int]] = set()
    for selector in args.selector:
        base, cache, size = selector.split(':')
        for parser in parse_item(base, ['basic', 're2', 'regex', 'legacy']):
            if CACHEABLE[parser]:
                cache_impls = parse_item(cache, list(CACHES))
            else:
                cache_impls = ['none']
            for cache_impl in cache_impls:
                if cache_impl != 'none':
                    sizes = map(int, parse_item(size, None))
                else:
                    sizes = [0]
                for n in sizes:
                    # a zero-sized cache is equivalent to no cache at all
                    triplet = (parser, 'none' if n == 0 else cache_impl, n)
                    if triplet not in emitted:
                        emitted.add(triplet)
                        yield triplet
123+
100124def run_stdout (args : argparse .Namespace ) -> None :
101125 lines = list (map (sys .intern , args .file ))
102126 count = len (lines )
103127 uniques = len (set (lines ))
104128 print (f"{ args .file .name } : { count } lines, { uniques } unique ({ uniques / count :.0%} )" )
105129
106- rules = get_rules ( args . bases , args . regexes )
130+ parsers = list ( rules_to_parsers ( args ) )
107131
108- # width of the parser label
109- w = math .ceil (
110- 3
111- + max (map (len , args .bases ))
112- + max (map (len , args .caches ))
113- + max (map (math .log10 , args .cachesizes ))
132+ rules = get_rules ([* {p for p , _ , _ in parsers }], args .regexes )
133+
134+ w = max (
135+ math .ceil (3 + len (p ) + len (c ) + (s and math .log10 (s )))
136+ for p , c , s in parsers
114137 )
115- for p , c , n in (
116- (p , c , n )
117- for p in args .bases
118- for c in (args .caches if CACHEABLE [p ] and args .cachesizes != [0 ] else ["none" ])
119- for n in (args .cachesizes if c != "none" else [0 ])
120- ):
138+ for p , c , n in parsers :
121139 name = "-" .join (map (str , filter (None , (p , c != "none" and c , n ))))
122140 print (f"{ name :{w }} " , end = ": " , flush = True )
123141
@@ -133,22 +151,16 @@ def run_stdout(args: argparse.Namespace) -> None:
133151def run_csv (args : argparse .Namespace ) -> None :
134152 lines = list (map (sys .intern , args .file ))
135153 LEN = len (lines ) * 1000
136- rules = get_rules (args .bases , args .regexes )
137154
138- parsers = [
139- (p , c , n )
140- for p in args .bases
141- for c in (args .caches if CACHEABLE [p ] else ["none" ])
142- for n in (args .cachesizes if c != "none" else [0 ])
143- ]
155+ parsers = list (rules_to_parsers (args ))
144156 if not parsers :
145157 sys .exit ("No parser selected" )
146158
159+ rules = get_rules ([* {p for p , _ , _ in parsers }], args .regexes )
147160 columns = {"size" : "" }
148161 columns .update (
149162 (f"{ p } -{ c } " , p if c == "none" else f"{ p } -{ c } " )
150- for p in args .bases
151- for c in (args .caches if CACHEABLE [p ] else ["none" ])
163+ for p , c , _ in parsers
152164 )
153165 w = csv .DictWriter (
154166 sys .stdout ,
@@ -171,11 +183,13 @@ def run_csv(args: argparse.Namespace) -> None:
171183 # cache could be ignored as it should always be `"none"`
172184 for parser , cache , _ in ps :
173185 p = get_parser (parser , cache , 0 , rules )
174- zeroes [f"{ parser } -{ cache } " ] = run (p , lines ) // LEN
186+ zeroes [f"{ parser } -{ cache } " ] = run (p , lines ) // LEN
175187
176188 # special cases for configurations where we can't have
177189 # cachesize lines, write the template row out directly
178- if args .bases == ["legacy" ] or args .caches == ["none" ] or args .cachesizes == [0 ]:
190+ if all (p == 'legacy' for p , _ , _ in parsers )\
191+ or all (c == 'none' for _ , c , _ in parsers )\
192+ or all (s == 0 for _ , _ , s in parsers ):
179193 zeroes ["size" ] = 0
180194 w .writerow (zeroes )
181195 return
@@ -457,37 +471,14 @@ def __call__(
457471 with a first cell of value 0.""" ,
458472)
# Replaces the former --bases/--caches/--cachesizes cross-product options
# with a single generative positional selector.
bench.add_argument(
    "selector",
    nargs="*",
    default=["*:*:{10,20,50,100,200,500,1000,2000,5000}"],
    help=f"""A generative selector expression, composed of 3 parts: 1.
    the parser (base), 2. the cache implementation ({', '.join(CACHES)})
    and 3. the cache size. For parser and cache `*` stands in for
    "every value", a bracketed expression for an enumeration, and the
    selector can be repeated to explicitly list each configuration.""",
)
492483
493484hitrates = sub .add_parser (
0 commit comments