Skip to content

Commit 59950ff

Browse files
committed
Worked on search benchmarking and docs
1 parent f5b5384 commit 59950ff

29 files changed

Lines changed: 628 additions & 378 deletions

benchmarks/report.py

Lines changed: 244 additions & 77 deletions
Large diffs are not rendered by default.

benchmarks/search_benchmark.py

Lines changed: 71 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
from hed import HedString, QueryHandler # noqa: E402
2929
from hed.models.basic_search import find_matching # noqa: E402
30-
from hed.models.string_search import StringQueryHandler, search_series # noqa: E402
30+
from hed.models.string_search import StringQueryHandler, string_search # noqa: E402
3131

3232
from data_generator import DataGenerator # noqa: E402
3333

@@ -143,7 +143,7 @@ def _bench_basic(self, raw, query, cfg_label, q_label):
143143
med, _ = time_it(lambda: find_matching(series, query), self.n_runs)
144144
matches = int(find_matching(series, query).sum())
145145
return {
146-
"engine": "basic_search",
146+
"engine": "Basic search",
147147
"query_label": q_label,
148148
"config_label": cfg_label,
149149
"query": query,
@@ -166,7 +166,7 @@ def do_search():
166166
search_med, _ = time_it(do_search, self.n_runs)
167167
result = do_search()
168168
return {
169-
"engine": "QueryHandler",
169+
"engine": "Object search",
170170
"query_label": q_label,
171171
"config_label": cfg_label,
172172
"query": query,
@@ -181,8 +181,9 @@ def _bench_string_qh(self, raw, query, cfg_label, q_label, schema_lookup, suffix
181181
sqh = StringQueryHandler(query)
182182
search_med, _ = time_it(lambda: sqh.search(raw, schema_lookup=schema_lookup), self.n_runs)
183183
result = sqh.search(raw, schema_lookup=schema_lookup)
184+
label = "String search" if suffix == "no_lookup" else "String search (lookup)"
184185
return {
185-
"engine": f"StringQueryHandler_{suffix}",
186+
"engine": label,
186187
"query_label": q_label,
187188
"config_label": cfg_label,
188189
"query": query,
@@ -235,15 +236,15 @@ def run_all(self, series_configs):
235236
rec = self._bench_basic_series(series, bs_query, label, q_label, n_rows)
236237
records.append(rec)
237238

238-
# --- search_series (StringQueryHandler) no lookup ---
239+
# --- String search (StringQueryHandler) no lookup ---
239240
rec = self._bench_search_series(series, qh_query, label, q_label, n_rows, None, "no_lookup")
240241
records.append(rec)
241242

242-
# --- search_series (StringQueryHandler) with lookup ---
243+
# --- String search (StringQueryHandler) with lookup ---
243244
rec = self._bench_search_series(series, qh_query, label, q_label, n_rows, self.lookup, "with_lookup")
244245
records.append(rec)
245246

246-
# --- QueryHandler loop ---
247+
# --- Object search (QueryHandler loop) ---
247248
rec = self._bench_qh_loop(series, qh_query, label, q_label, n_rows)
248249
records.append(rec)
249250

@@ -253,7 +254,7 @@ def _bench_basic_series(self, series, query, cfg_label, q_label, n_rows):
253254
med, _ = time_it(lambda: find_matching(series, query), self.n_runs)
254255
matches = int(find_matching(series, query).sum())
255256
return {
256-
"engine": "basic_search",
257+
"engine": "Basic search",
257258
"query_label": q_label,
258259
"config_label": cfg_label,
259260
"n_rows": n_rows,
@@ -263,10 +264,12 @@ def _bench_basic_series(self, series, query, cfg_label, q_label, n_rows):
263264
}
264265

265266
def _bench_search_series(self, series, query, cfg_label, q_label, n_rows, lookup, suffix):
266-
med, _ = time_it(lambda: search_series(series, query, schema_lookup=lookup), self.n_runs)
267-
matches = int(search_series(series, query, schema_lookup=lookup).sum())
267+
strings = series.tolist()
268+
med, _ = time_it(lambda: string_search(strings, query, schema_lookup=lookup), self.n_runs)
269+
matches = sum(string_search(strings, query, schema_lookup=lookup))
270+
label = "String search" if suffix == "no_lookup" else "String search (lookup)"
268271
return {
269-
"engine": f"search_series_{suffix}",
272+
"engine": label,
270273
"query_label": q_label,
271274
"config_label": cfg_label,
272275
"n_rows": n_rows,
@@ -294,7 +297,7 @@ def do_all():
294297
if qh.search(hs):
295298
count += 1
296299
return {
297-
"engine": "QueryHandler_loop",
300+
"engine": "Object search",
298301
"query_label": q_label,
299302
"config_label": cfg_label,
300303
"n_rows": n_rows,
@@ -402,16 +405,35 @@ def sweep_query_complexity(self):
402405
return records
403406

404407
def sweep_schema_lookup(self):
405-
"""Compare StringQueryHandler with vs without schema_lookup."""
406-
raw = self.gen.make_string(n_tags=15, n_groups=3, depth=1)
407-
query = "Event"
408-
sqh = StringQueryHandler(query)
408+
"""Compare StringQueryHandler with vs without schema_lookup across query types.
409+
410+
Uses a fixed short-form string containing known descendants of Event and Action so
411+
the behavioural difference (which strings match) is deterministic.
412+
"""
413+
# Fixed short-form string with known Event and Action descendants.
414+
# Sensory-event, Agent-action, Data-feature are Event descendants;
415+
# Communicate, Clap-hands are Action descendants.
416+
raw = (
417+
"Sensory-event, Agent-action, Data-feature, Communicate, Clap-hands, "
418+
"Communicate-gesturally, Blue, High, (Red, Move), (Experiment-control, Frown)"
419+
)
420+
queries = [
421+
("Ancestor: Event", "Event"),
422+
("Ancestor: Action", "Action"),
423+
("Exact: Sensory-event", "Sensory-event"),
424+
("Compound: Event && Action", "Event && Action"),
425+
]
409426
records = []
410-
for with_lookup in [False, True]:
411-
lk = self.lookup if with_lookup else None
412-
label = "with_lookup" if with_lookup else "no_lookup"
413-
med, _ = time_it(lambda lk=lk: sqh.search(raw, schema_lookup=lk), self.n_runs)
414-
records.append({"factor": "schema_lookup", "level": label, "engine": "StringQueryHandler", "time": med})
427+
for q_label, query in queries:
428+
sqh = StringQueryHandler(query)
429+
for with_lookup in [False, True]:
430+
lk = self.lookup if with_lookup else None
431+
mode = "With lookup" if with_lookup else "No lookup"
432+
med, _ = time_it(lambda lk=lk, _sqh=sqh: _sqh.search(raw, schema_lookup=lk), self.n_runs)
433+
matches = len(sqh.search(raw, schema_lookup=lk))
434+
records.append(
435+
{"factor": "schema_lookup", "level": q_label, "engine": mode, "time": med, "matches": matches}
436+
)
415437
return records
416438

417439
def sweep_string_form(self):
@@ -440,18 +462,18 @@ def qh_search():
440462
qh.search(hs)
441463

442464
search_med, _ = time_it(qh_search, self.n_runs)
443-
records.append({"factor": "compile_vs_search", "level": "compile", "engine": "QueryHandler", "time": comp})
444-
records.append({"factor": "compile_vs_search", "level": "search", "engine": "QueryHandler", "time": search_med})
465+
records.append({"factor": "compile_vs_search", "level": "compile", "engine": "Object search", "time": comp})
466+
records.append(
467+
{"factor": "compile_vs_search", "level": "search", "engine": "Object search", "time": search_med}
468+
)
445469

446470
# StringQueryHandler
447471
comp2, _ = time_it(lambda: StringQueryHandler(query), self.n_runs)
448472
sqh = StringQueryHandler(query)
449473
search_med2, _ = time_it(lambda: sqh.search(raw, schema_lookup=self.lookup), self.n_runs)
474+
records.append({"factor": "compile_vs_search", "level": "compile", "engine": "String search", "time": comp2})
450475
records.append(
451-
{"factor": "compile_vs_search", "level": "compile", "engine": "StringQueryHandler", "time": comp2}
452-
)
453-
records.append(
454-
{"factor": "compile_vs_search", "level": "search", "engine": "StringQueryHandler", "time": search_med2}
476+
{"factor": "compile_vs_search", "level": "search", "engine": "String search", "time": search_med2}
455477
)
456478

457479
return records
@@ -534,7 +556,7 @@ def _bench_all_engines(self, raw, qh_query, bs_query=None):
534556
# basic_search
535557
if bs_query is not None:
536558
med, _ = time_it(lambda: find_matching(series1, bs_query), self.n_runs)
537-
yield "basic_search", med
559+
yield "Basic search", med
538560

539561
# QueryHandler
540562
qh = QueryHandler(qh_query)
@@ -544,31 +566,32 @@ def qh_search():
544566
qh.search(hs)
545567

546568
med, _ = time_it(qh_search, self.n_runs)
547-
yield "QueryHandler", med
569+
yield "Object search", med
548570

549571
# StringQueryHandler no lookup
550572
sqh = StringQueryHandler(qh_query)
551573
med, _ = time_it(lambda: sqh.search(raw, schema_lookup=None), self.n_runs)
552-
yield "SQH_no_lookup", med
574+
yield "String search", med
553575

554576
# StringQueryHandler with lookup
555577
med, _ = time_it(lambda: sqh.search(raw, schema_lookup=self.lookup), self.n_runs)
556-
yield "SQH_with_lookup", med
578+
yield "String search (lookup)", med
557579

558580
def _bench_series_engines(self, series, qh_query, bs_query, n_rows):
559581
"""Yield (engine_name, median_time) for series-level engines."""
560582
# basic_search
561583
if bs_query is not None:
562-
med, _ = time_it(lambda: find_matching(series, bs_query), max(3, self.n_runs // 2))
563-
yield "basic_search", med
584+
med, _ = time_it(lambda: find_matching(series, bs_query), self.n_runs)
585+
yield "Basic search", med
564586

565-
# search_series no lookup
566-
med, _ = time_it(lambda: search_series(series, qh_query, schema_lookup=None), max(3, self.n_runs // 2))
567-
yield "search_series_no_lookup", med
587+
# String search no lookup
588+
strings = series.tolist()
589+
med, _ = time_it(lambda: string_search(strings, qh_query, schema_lookup=None), self.n_runs)
590+
yield "String search", med
568591

569-
# search_series with lookup
570-
med, _ = time_it(lambda: search_series(series, qh_query, schema_lookup=self.lookup), max(3, self.n_runs // 2))
571-
yield "search_series_with_lookup", med
592+
# String search with lookup
593+
med, _ = time_it(lambda: string_search(strings, qh_query, schema_lookup=self.lookup), self.n_runs)
594+
yield "String search (lookup)", med
572595

573596
# QueryHandler loop
574597
qh = QueryHandler(qh_query)
@@ -580,8 +603,8 @@ def qh_loop():
580603
hs = HedString(s, schema)
581604
qh.search(hs)
582605

583-
med, _ = time_it(qh_loop, max(3, self.n_runs // 2))
584-
yield "QueryHandler_loop", med
606+
med, _ = time_it(qh_loop, self.n_runs)
607+
yield "Object search", med
585608

586609

587610
# ======================================================================
@@ -595,7 +618,7 @@ def run_full_benchmark(quick=False):
595618
gen = DataGenerator()
596619

597620
n_single = 10 if quick else 20
598-
n_series = 3 if quick else 5
621+
n_series = 3 if quick else 10
599622
n_sweep = 5 if quick else 10
600623

601624
# ------------------------------------------------------------------
@@ -675,20 +698,22 @@ def run_full_benchmark(quick=False):
675698
med, _ = time_it(lambda bs_query=bs_query: find_matching(real_series, bs_query), n_series)
676699
real_results.append(
677700
{
678-
"engine": "basic_search",
701+
"engine": "Basic search",
679702
"query_label": q_label,
680703
"total_time": med,
681704
"per_row": med / real_n,
682705
"n_rows": real_n,
683706
}
684707
)
685708

709+
real_strings = real_series.tolist()
686710
med, _ = time_it(
687-
lambda qh_query=qh_query: search_series(real_series, qh_query, schema_lookup=gen.lookup), n_series
711+
lambda qh_query=qh_query, _rs=real_strings: string_search(_rs, qh_query, schema_lookup=gen.lookup),
712+
n_series,
688713
)
689714
real_results.append(
690715
{
691-
"engine": "search_series",
716+
"engine": "String search",
692717
"query_label": q_label,
693718
"total_time": med,
694719
"per_row": med / real_n,
@@ -708,7 +733,7 @@ def qh_loop(qh=qh, schema=schema):
708733
med, _ = time_it(qh_loop, n_series)
709734
real_results.append(
710735
{
711-
"engine": "QueryHandler_loop",
736+
"engine": "Object search",
712737
"query_label": q_label,
713738
"total_time": med,
714739
"per_row": med / real_n,
-4.8 KB
Loading
-10.4 KB
Loading
-2.15 KB
Loading
64.4 KB
Loading
-18.7 KB
Loading
-1.57 KB
Loading
2.81 KB
Loading
-2.58 KB
Loading

0 commit comments

Comments
 (0)