Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4e8dbc1
feat(synth): deterministic synthetic cell-image generator for benchma…
timtreis Jun 17, 2026
1e560e1
refactor(synth): review hardening — single cell-extent, sturdier test…
timtreis Jun 17, 2026
3fa70d7
feat(bench): symbol-level PR target mapper (build step 2)
timtreis Jun 17, 2026
f877a81
revert(bench): drop the change-detection mapper; benchmark all expose…
timtreis Jun 17, 2026
4dbeb44
feat(bench): fixture/runner/comparator — benchmark all get_* head-vs-…
timtreis Jun 17, 2026
aecf082
feat(bench): two-job benchmark workflow + orchestration driver (build…
timtreis Jun 17, 2026
a710fbe
refactor(bench): review cleanup — leaner comments/docstrings + small …
timtreis Jun 17, 2026
057913d
test(bench): trim to a lean regression set
timtreis Jun 17, 2026
1cae1a0
style: ruff-format with current ruff (88-col line wraps)
timtreis Jun 17, 2026
76daa62
chore: stop tracking scratch tasks/ and .claude/ (added by mistake)
timtreis Jun 17, 2026
3e928a3
refactor(bench): report raw main/head timings, drop noise-band classi…
timtreis Jun 17, 2026
62e8cd6
Revert "Merge pull request #80 from afermg/feat/synth-bench-generator"
timtreis Jun 17, 2026
0c3ba6e
feat(bench): synthetic-image PR performance benchmark action
timtreis Jun 17, 2026
dd997e9
Revert "feat(bench): synthetic-image PR performance benchmark action"
timtreis Jun 17, 2026
4459ce9
demo: self-contained PR benchmark action (simplified)
timtreis Jun 17, 2026
10c3865
demo: move the whole benchmark into .github/scripts (no package module)
timtreis Jun 17, 2026
36e6037
demo: extend benchmark matrix to 4 sizes x 2 counts (256–2048)
timtreis Jun 17, 2026
f832814
demo: median per cell, 3 seeds x 3 counts, dynamic affected-threshold…
timtreis Jun 17, 2026
d5941f8
demo: drop 256px image size (unrealistically small)
timtreis Jun 17, 2026
68e5190
demo: shift matrix down to 256-1024 (drop slow 2048)
timtreis Jun 17, 2026
7ff9c18
demo: report regressions too — flag functions that moved >=1.05x eith…
timtreis Jun 17, 2026
941bc23
demo: slim benchmark.py — drop unused bits
timtreis Jun 17, 2026
c9e0baf
revert(granunlarity): it has an independent PR, was used as test
afermg Jun 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 227 additions & 0 deletions .github/scripts/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
#!/usr/bin/env python3
"""Self-contained PR benchmark — lives entirely in .github/scripts, nothing in the package.

Subcommands:
run --out FILE generate seeded synthetic inputs, time every cp_measure get_* -> JSON
compare --base F --head F [--md F] diff two run JSONs into a head-vs-main timing table

Run once per environment (PR head, main) on the SAME seeded inputs (pure-numpy generation is
deterministic), then compare. The driver installs each env; this script only needs cp_measure
importable plus numpy.
"""

import os

# Thread-count knobs of the numeric runtimes. They are set ahead of the numpy import below
# (hence the E402 suppressions) so the single-thread setting is already in the environment
# when those libraries load.
_THREAD_ENV_VARS = (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMBA_NUM_THREADS",
)
for _v in _THREAD_ENV_VARS:
    # Only fill in a default: a value the caller exported explicitly is left alone.
    if _v not in os.environ:
        os.environ[_v] = "1"

import argparse # noqa: E402
import json # noqa: E402
import signal # noqa: E402
import statistics # noqa: E402
from contextlib import contextmanager # noqa: E402
from pathlib import Path # noqa: E402
from time import perf_counter # noqa: E402

import numpy # noqa: E402

# Benchmark matrix: every size x count x seed combination is generated and timed once.
MATRIX = {
    "sizes": (256, 512, 1024),  # image edge length in pixels (images are square)
    "counts": (16, 64, 256),  # number of labelled objects per image
    "seeds": (0, 1, 2),  # RNG seeds for the synthetic intensity channels
}
BLOBS_PER_CHANNEL = 5  # Gaussian blobs summed into each synthetic channel
WARMUP = 1  # untimed calls made before measuring
REPS = 3  # timed calls per function and input
TIMEOUT = 120.0  # seconds; one budget shared by the warm-up and all timed reps
# report a function if any cell moves by this factor either way (faster or slower)
AFFECTED = 1.05


# --- synthetic generator: n ellipses on a grid + random Gaussian blobs per channel --------------
def generate(size: int, n: int, seed: int = 0):
    """Build one deterministic synthetic input: a label image plus two intensity channels.

    Args:
        size: Edge length of the square image, in pixels.
        n: Number of labelled objects. Objects are ellipses laid out on a near-square grid
            and numbered 1..n; ``0`` leaves the label image empty.
        seed: Seed for the random generator that places the Gaussian blobs.

    Returns:
        ``(labels, channels)`` where ``labels`` is an int32 array of shape ``(size, size)``
        and ``channels`` is a float32 array of shape ``(2, size, size)``.
    """
    rng = numpy.random.default_rng(seed)
    yy, xx = numpy.mgrid[0:size, 0:size]

    labels = numpy.zeros((size, size), numpy.int32)
    if n:
        # Grid that holds n cells: as square as possible, row-major numbering.
        cols = int(numpy.ceil(numpy.sqrt(n)))
        rows = int(numpy.ceil(n / cols))
        # Semi-axes are 35% of a grid cell, so neighbouring ellipses never touch.
        semi_y = 0.35 * size / rows
        semi_x = 0.35 * size / cols
        for index in range(n):
            row, col = divmod(index, cols)
            centre_y = (row + 0.5) * size / rows
            centre_x = (col + 0.5) * size / cols
            inside = ((yy - centre_y) / semi_y) ** 2 + ((xx - centre_x) / semi_x) ** 2 <= 1
            labels[inside] = index + 1

    planes = []
    for _channel in range(2):  # ch0 → core features, ch0+ch1 → colocalisation
        plane = numpy.zeros((size, size))
        for _blob in range(BLOBS_PER_CHANNEL):
            # Draw order (centre first, then width) is part of the seeded output.
            blob_y, blob_x = rng.uniform(0, size, 2)
            sigma = rng.uniform(size / 10, size / 5)
            squared_dist = (yy - blob_y) ** 2 + (xx - blob_x) ** 2
            plane += numpy.exp(-squared_dist / (2 * sigma * sigma))
        planes.append(plane.astype(numpy.float32))
    return labels, numpy.stack(planes)


# --- timing -------------------------------------------------------------------------------------
class _Timeout(Exception):
pass


def _raise_timeout(*_):
raise _Timeout()


@contextmanager
def _time_limit(seconds: float):
signal.signal(signal.SIGALRM, _raise_timeout)
signal.setitimer(signal.ITIMER_REAL, seconds)
try:
yield
finally:
signal.setitimer(signal.ITIMER_REAL, 0)


def _norm01(img):
img = img.astype("float64")
lo, hi = float(img.min()), float(img.max())
return (img - lo) / (hi - lo) if hi > lo else img - lo


def _functions():
    """List every benchmark target as ``(name, callable, arity)``.

    Entries of ``bulk.get_core_measurements()`` get arity 1 and entries of
    ``bulk.get_correlation_measurements()`` get arity 2, in that order. ``cp_measure`` is
    imported here rather than at module level, so the ``compare`` subcommand works without it.
    """
    from cp_measure import bulk

    registries = (
        (1, bulk.get_core_measurements()),
        (2, bulk.get_correlation_measurements()),
    )
    return [
        (name, fn, arity)
        for arity, registry in registries
        for name, fn in registry.items()
    ]


def _time(fn, args) -> dict:
    """Time ``fn(*args)`` and describe the outcome as a JSON-friendly dict.

    The function is called ``WARMUP`` times untimed and then ``REPS`` times timed, all under
    a single ``TIMEOUT`` budget.

    Returns:
        ``{"status": "ok", "reps": [seconds, ...]}`` on success,
        ``{"status": "timeout"}`` if the budget ran out, or
        ``{"status": "error", "error": "<Type>: <message>"}`` (capped at 200 characters) if
        the function raised.
    """
    durations = []
    try:
        with _time_limit(TIMEOUT):
            for _ in range(WARMUP):
                fn(*args)
            for _ in range(REPS):
                started = perf_counter()
                fn(*args)
                durations.append(perf_counter() - started)
    except _Timeout:
        return {"status": "timeout"}
    except Exception as exc:
        # A failing target must not stop the benchmark: record it and move on.
        message = f"{type(exc).__name__}: {exc}"
        return {"status": "error", "error": message[:200]}
    return {"status": "ok", "reps": durations}


def run(out_path: str):
    """Time every target on every input of ``MATRIX`` and write the results as JSON.

    The file holds ``{"cells": [...], "results": {...}}``: ``cells`` lists each generated
    input (key, size, object count) and ``results[name][key]`` is the ``_time`` outcome of
    one function on one input.

    Args:
        out_path: Destination of the JSON file.
    """
    funcs = _functions()
    cells = []
    results = {name: {} for name, _, _ in funcs}
    for size in MATRIX["sizes"]:
        for n in MATRIX["counts"]:
            for seed in MATRIX["seeds"]:
                labels, channels = generate(size, n, seed)
                first = _norm01(channels[0])
                second = _norm01(channels[1])
                key = f"s{size}_n{n}_seed{seed}"
                cells.append({"key": key, "size": size, "n_objects": n})
                # Arity-1 targets take (labels, image); the others take (image, image, labels).
                single_args = (labels, first)
                pair_args = (first, second, labels)
                for name, fn, arity in funcs:
                    if arity == 1:
                        results[name][key] = _time(fn, single_args)
                    else:
                        results[name][key] = _time(fn, pair_args)
    payload = {"cells": cells, "results": results}
    Path(out_path).write_text(json.dumps(payload, indent=2))


# --- compare ------------------------------------------------------------------------------------
def _median_ms(results_for_fn: dict, keys: list[str]):
"""Median (ms) over all ok rep times in a cell's seeds, or None."""
times = [
t
for k in keys
if results_for_fn.get(k, {}).get("status") == "ok"
for t in results_for_fn[k]["reps"]
]
return statistics.median(times) * 1e3 if times else None


def compare(base: dict, head: dict, commit: str = "") -> str:
    """Render the head-vs-main Markdown report from two ``run`` payloads.

    Inputs are grouped into cells by (size, object count), pooling the seeds. For each
    function and cell, ``speedup = main median / head median``. A function gets a table
    when any of its cells is at least ``AFFECTED`` times faster or slower. Cells that
    produced timings on ``main`` but none on the head (timeout or error) cannot have a
    speedup; they are listed in a separate closing section so that a PR which breaks a
    function is not reported as "no change".

    Args:
        base: Parsed JSON of the ``main`` run.
        head: Parsed JSON of the PR-head run; its ``cells`` define the grid.
        commit: Optional head commit SHA, shortened to 7 characters in the title.

    Returns:
        The report as a Markdown string.
    """
    groups: dict[tuple, list[str]] = {}
    for e in head["cells"]:
        groups.setdefault((e["size"], e["n_objects"]), []).append(e["key"])
    sizes = sorted({s for s, _ in groups})
    counts = sorted({n for _, n in groups})
    br, hr = base["results"], head["results"]

    ref = f"`{commit[:7]}`" if commit else "PR head"
    out = [
        f"### Benchmark — {ref} vs `main`",
        "",
        f"`speedup = main/head` (>1 faster, <1 slower) · median per cell · "
        f"showing functions that moved ≥{AFFECTED:.2f}× either way",
    ]

    lo = 1.0 / AFFECTED  # a cell at or below this is a regression worth reporting
    affected = []  # (function, {(size, count): speedup})
    broken: dict[str, list[str]] = {}  # function -> cells timed on main but not on head
    for fn in sorted(hr):
        base_fn, head_fn = br.get(fn, {}), hr[fn]
        grid, speedups = {}, []
        for size in sizes:
            for n in counts:
                keys = groups.get((size, n), [])
                m = _median_ms(base_fn, keys)
                h = _median_ms(head_fn, keys)
                ratio = (m / h) if (m and h) else None
                grid[(size, n)] = ratio
                if ratio:
                    speedups.append(ratio)
                elif m is not None and h is None:
                    # main has timings but no head run finished: surface the head statuses.
                    statuses = sorted(
                        {str(head_fn.get(k, {}).get("status", "missing")) for k in keys}
                    )
                    broken.setdefault(fn, []).append(
                        f"{size}/{n} ({', '.join(statuses)})"
                    )
        if speedups and (max(speedups) >= AFFECTED or min(speedups) <= lo):
            affected.append((fn, grid))

    if not affected:
        out += ["", f"_No function moved by ≥{AFFECTED:.2f}× (faster or slower)._"]

    for fn, grid in affected:
        out += [
            "",
            f"#### `{fn}`",
            "",
            "| size \\ objects | " + " | ".join(str(n) for n in counts) + " |",
            "|---" + "|--:" * len(counts) + "|",
        ]
        for size in sizes:
            row = [
                (f"{grid[(size, n)]:.2f}×" if grid.get((size, n)) else "—")
                for n in counts
            ]
            out.append(f"| **{size}** | " + " | ".join(row) + " |")

    if broken:
        out += [
            "",
            "#### Not timed on PR head (ran on `main`)",
            "",
            "Cells are `size/objects (head status)`.",
            "",
        ]
        out += [f"- `{fn}`: " + ", ".join(cells) for fn, cells in broken.items()]
    return "\n".join(out)


def main(argv=None) -> int:
    """Command-line entry point for the ``run`` and ``compare`` subcommands.

    ``run --out FILE`` benchmarks the importable ``cp_measure`` and writes the JSON result.
    ``compare --base F --head F [--commit SHA] [--md F]`` renders the comparison and either
    writes it to ``--md`` or prints it.

    Args:
        argv: Argument list without the program name; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` on success. Invalid arguments end the process through argparse instead.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is laid out by hand; keep its line breaks in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = p.add_subparsers(dest="cmd", required=True)
    r = sub.add_parser("run")
    r.add_argument("--out", required=True)
    c = sub.add_parser("compare")
    c.add_argument("--base", required=True)
    c.add_argument("--head", required=True)
    c.add_argument("--commit", default="")
    c.add_argument("--md")
    a = p.parse_args(argv)

    if a.cmd == "run":
        run(a.out)
        return 0

    md = compare(
        json.loads(Path(a.base).read_text(encoding="utf-8")),
        json.loads(Path(a.head).read_text(encoding="utf-8")),
        a.commit,
    )
    if a.md:
        # The report contains non-ASCII characters (—, ×, ≥, ·): never rely on the locale
        # default encoding when writing it.
        Path(a.md).write_text(md, encoding="utf-8")
    else:
        print(md)
    return 0


# Script entry point: run the CLI and hand main()'s return value to the shell as exit status.
if __name__ == "__main__":
    raise SystemExit(main())
32 changes: 32 additions & 0 deletions .github/scripts/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Install the PR head and main into separate venvs and run benchmark.py (from this checkout) in
# each, then compare. Each run regenerates the same seeded inputs, so nothing is shared on disk.
#
# Usage: run_benchmark.sh [out-dir] [head-commit-sha]
#   out-dir          receives head.json, main.json and table.md (default: bench-out, relative to
#                    the caller's working directory)
#   head-commit-sha  optional; only used to label the comparison table
set -euo pipefail

OUT="${1:-bench-out}"
COMMIT="${2:-}"
# Locate the checkout from this script's own path (<repo>/.github/scripts) instead of the
# caller's working directory, so the script can be launched from anywhere.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HEAD_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
WORK="$(mktemp -d)"
BENCH="$SCRIPT_DIR/benchmark.py"
mkdir -p "$OUT"
# Always drop the temporary worktree and scratch dir, also when a step below fails.
trap 'git -C "$HEAD_DIR" worktree remove --force "$WORK/main" 2>/dev/null || true; rm -rf "$WORK"' EXIT

# six is a centrosome runtime dep not declared in its metadata; install it into the bench venvs only.
echo "::group::PR head env"
uv venv "$WORK/venv-head"
uv pip install --python "$WORK/venv-head/bin/python" -e "$HEAD_DIR" six
"$WORK/venv-head/bin/python" "$BENCH" run --out "$OUT/head.json"
echo "::endgroup::"

echo "::group::main env"
git -C "$HEAD_DIR" fetch --no-tags --depth=1 origin main
git -C "$HEAD_DIR" worktree add --detach "$WORK/main" origin/main
uv venv "$WORK/venv-main"
uv pip install --python "$WORK/venv-main/bin/python" -e "$WORK/main" six
# Same benchmark.py as above (the PR's copy), only the installed package differs.
"$WORK/venv-main/bin/python" "$BENCH" run --out "$OUT/main.json"
echo "::endgroup::"

"$WORK/venv-head/bin/python" "$BENCH" compare \
  --base "$OUT/main.json" --head "$OUT/head.json" --commit "$COMMIT" --md "$OUT/table.md"
cat "$OUT/table.md"
45 changes: 45 additions & 0 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: benchmark

# Runs on every commit to a PR: times every public get_* on the PR head vs main and posts a
# sticky comment with the timing table. Self-contained — workflow, tooling and generator all live
# on the PR branch (pull_request runs the workflow from the head), so nothing is needed on main.

on:
  pull_request:

# Listing any permission sets every unlisted one to "none", so the read access that checkout
# needs is spelled out next to the write access used for the comment.
permissions:
  contents: read
  pull-requests: write

concurrency:
  group: bench-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  benchmark:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: astral-sh/setup-uv@v5
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Benchmark head vs main
        run: bash .github/scripts/run_benchmark.sh bench-out "${{ github.event.pull_request.head.sha }}"
      # The job summary works for every PR, including forks where commenting is not possible.
      - name: Add table to job summary
        run: cat bench-out/table.md >> "$GITHUB_STEP_SUMMARY"
      - name: Post sticky comment
        # PRs from forks run with a read-only token; skip instead of failing the job there.
        if: github.event.pull_request.head.repo.full_name == github.repository
        env:
          GH_TOKEN: ${{ github.token }}
          REPO: ${{ github.repository }}
          PR: ${{ github.event.pull_request.number }}
        run: |
          set -euo pipefail
          # The hidden marker on the first line is how a later run finds this comment again.
          { echo '<!-- cp-bench -->'; echo; cat bench-out/table.md; } > body.md
          CID="$(gh api "repos/$REPO/issues/$PR/comments?per_page=100" \
            --jq 'map(select(.body | startswith("<!-- cp-bench -->")))[0].id // empty')"
          if [ -n "$CID" ]; then
            gh api -X PATCH "repos/$REPO/issues/comments/$CID" -F body=@body.md >/dev/null
          else
            gh api -X POST "repos/$REPO/issues/$PR/comments" -F body=@body.md >/dev/null
          fi
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
__pycache__/
/.agent-shell/
.pre-commit-config.yaml

# local benchmark output
bench-out/
Loading