Skip to content

Commit c3c3d96

Browse files
committed
Switch bench to a more flexible selectors system
In checking things with sample 2, I've realised the current configuration system is too inflexible to allow easily running bench just once when some of the configurations are just not acceptable or sensible. A more flexible selector system — rather than 3 separate options being multiplied together — allows cleaner parser configurations, which makes running things easier.
1 parent a30a485 commit c3c3d96

1 file changed

Lines changed: 46 additions & 55 deletions

File tree

src/ua_parser/__main__.py

Lines changed: 46 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -97,27 +97,45 @@ def get_rules(parsers: List[str], regexes: Optional[io.IOBase]) -> Matchers:
9797
return rules
9898

9999

100+
def parse_item(item: str, all: list[str] | None) -> list[str]:
101+
if item == '*':
102+
assert all
103+
return all
104+
elif item.startswith('{'):
105+
assert item.endswith('}')
106+
return item[1:-1].split(',')
107+
else:
108+
return [item]
109+
110+
def rules_to_parsers(args: argparse.Namespace) -> Iterator[tuple[str, str, int]]:
    """Expand the command-line selectors into (parser, cache, size) triples.

    Each ``parser:cache:size`` selector is expanded component-wise via
    :func:`parse_item`; uncacheable parsers are forced to the ``none``
    cache, a ``none`` cache forces size 0, and a size of 0 forces the
    cache back to ``none``.  Duplicate triples (which selectors can
    easily generate) are yielded only once, in first-seen order.
    """
    emitted: set[tuple[str, str, int]] = set()
    for selector in args.selector:
        base, cache, size = selector.split(':')
        for p in parse_item(base, ['basic', 're2', 'regex', 'legacy']):
            # a parser that can't take a cache only gets the "none" cache
            caches = parse_item(cache, list(CACHES)) if CACHEABLE[p] else ['none']
            for c in caches:
                sizes = [int(v) for v in parse_item(size, None)] if c != 'none' else [0]
                for s in sizes:
                    # a zero size means "uncached", whatever the cache said
                    triplet = (p, 'none' if s == 0 else c, s)
                    if triplet not in emitted:
                        emitted.add(triplet)
                        yield triplet
123+
100124
def run_stdout(args: argparse.Namespace) -> None:
101125
lines = list(map(sys.intern, args.file))
102126
count = len(lines)
103127
uniques = len(set(lines))
104128
print(f"{args.file.name}: {count} lines, {uniques} unique ({uniques / count:.0%})")
105129

106-
rules = get_rules(args.bases, args.regexes)
130+
parsers = list(rules_to_parsers(args))
107131

108-
# width of the parser label
109-
w = math.ceil(
110-
3
111-
+ max(map(len, args.bases))
112-
+ max(map(len, args.caches))
113-
+ max(map(math.log10, args.cachesizes))
132+
rules = get_rules([*{p for p, _, _ in parsers}], args.regexes)
133+
134+
w = max(
135+
math.ceil(3 + len(p) + len(c) + (s and math.log10(s)))
136+
for p, c, s in parsers
114137
)
115-
for p, c, n in (
116-
(p, c, n)
117-
for p in args.bases
118-
for c in (args.caches if CACHEABLE[p] and args.cachesizes != [0] else ["none"])
119-
for n in (args.cachesizes if c != "none" else [0])
120-
):
138+
for p, c, n in parsers:
121139
name = "-".join(map(str, filter(None, (p, c != "none" and c, n))))
122140
print(f"{name:{w}}", end=": ", flush=True)
123141

@@ -133,22 +151,16 @@ def run_stdout(args: argparse.Namespace) -> None:
133151
def run_csv(args: argparse.Namespace) -> None:
134152
lines = list(map(sys.intern, args.file))
135153
LEN = len(lines) * 1000
136-
rules = get_rules(args.bases, args.regexes)
137154

138-
parsers = [
139-
(p, c, n)
140-
for p in args.bases
141-
for c in (args.caches if CACHEABLE[p] else ["none"])
142-
for n in (args.cachesizes if c != "none" else [0])
143-
]
155+
parsers = list(rules_to_parsers(args))
144156
if not parsers:
145157
sys.exit("No parser selected")
146158

159+
rules = get_rules([*{p for p, _, _ in parsers}], args.regexes)
147160
columns = {"size": ""}
148161
columns.update(
149162
(f"{p}-{c}", p if c == "none" else f"{p}-{c}")
150-
for p in args.bases
151-
for c in (args.caches if CACHEABLE[p] else ["none"])
163+
for p, c, _ in parsers
152164
)
153165
w = csv.DictWriter(
154166
sys.stdout,
@@ -171,11 +183,13 @@ def run_csv(args: argparse.Namespace) -> None:
171183
# cache could be ignored as it should always be `"none"`
172184
for parser, cache, _ in ps:
173185
p = get_parser(parser, cache, 0, rules)
174-
zeroes[f"{parser}-{cache}"] = run(p, lines) // LEN
186+
zeroes[f"{parser}-{cache}"] = run(p, lines) // LEN
175187

176188
# special cases for configurations where we can't have
177189
# cachesize lines, write the template row out directly
178-
if args.bases == ["legacy"] or args.caches == ["none"] or args.cachesizes == [0]:
190+
if all(p == 'legacy' for p, _, _ in parsers)\
191+
or all(c == 'none' for _, c, _ in parsers)\
192+
or all(s == 0 for _, _, s in parsers):
179193
zeroes["size"] = 0
180194
w.writerow(zeroes)
181195
return
@@ -457,37 +471,14 @@ def __call__(
457471
with a first cell of value 0.""",
458472
)
459473
bench.add_argument(
460-
"--bases",
461-
nargs="+",
462-
choices=["basic", "re2", "regex", "legacy"],
463-
default=["basic", "re2", "regex", "legacy"],
464-
help="""Base resolvers to benchmark. `basic` is a linear search
465-
through the regexes file, `re2` is a prefiltered regex set
466-
implemented in C++, `regex` is a prefiltered regex set implemented
467-
in Rust, `legacy` is the legacy API (essentially a basic resolver
468-
with a clearing cache of fixed 200 entries, but less layered so
469-
usually slightly faster than an equivalent basic-based resolver).""",
470-
)
471-
bench.add_argument(
472-
"--caches",
473-
nargs="+",
474-
choices=list(CACHES),
475-
default=list(CACHES),
476-
help="""Cache implementations to test. `clearing` completely
477-
clears the cache when full, `lru` uses a least-recently-eviction
478-
policy. `lru` is not thread-safe, so `lru-threadsafe` adds a mutex
479-
and measures *uncontended* locking overhead.""",
480-
)
481-
bench.add_argument(
482-
"--cachesizes",
483-
nargs="+",
484-
type=int,
485-
default=[10, 20, 50, 100, 200, 500, 1000, 2000, 5000],
486-
help="""Caches are a classic way to trade memory for performances.
487-
Different base resolvers and traffic patterns have different
488-
benefits from caches, this option allows testing the benefits of
489-
various cache sizes (and thus amounts of memory used) on the cache
490-
strategies. """,
474+
"selector",
475+
nargs="*",
476+
default=["*:*:{10,20,50,100,200,500,1000,2000,5000}"],
477+
help=f"""A generative selector expression, composed of 3 parts: 1.
478+
the parser (base), 2. the cache implementation ({', '.join(CACHES)})
479+
and 3. the cache size. For parser and cache `*` stands
480+
in for "every value", a bracketed expression for an enumeration, and
481+
the selector can be repeated to explicitly list each configuration """
491482
)
492483

493484
hitrates = sub.add_parser(

0 commit comments

Comments
 (0)