afermg · timtreis · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 6, 2026
diff --git a/src/cp_measure/bulk.py b/src/cp_measure/bulk.py
@@ -49,18 +49,25 @@ def _numba_registries() -> dict[str, dict[str, Callable]]:
     """Registries for the 'numba' accelerator.
 
     Composes the numba implementations (``intensity`` and the ``pearson`` /
-    ``manders_fold`` / ``rwc`` / ``overlap`` colocalization features) with the
-    numpy implementations of every other feature — a single global "numba"
+    ``manders_fold`` / ``rwc`` / ``costes`` / ``overlap`` colocalization features)
+    with the numpy implementations of every other feature — a single global "numba"
     selection still yields a full, working feature set, accelerated where a
     numba backend exists. This is explicit per-function composition, NOT an
     error-driven fallback.
 
+    The five correlation entries are kept per-group so the featurizer's per-group
+    selection (``_collect_correlation_features``) keeps working; each is now a thin,
+    gated wrapper over the shared ``get_correlation_all`` kernel. A caller wanting
+    several coloc features in ONE flatten+kernel pass (the efficient path) calls
+    ``cp_measure.core.numba.get_correlation_all(..., features=...)`` directly — it is
+    stateless, so fusion happens by requesting the set in one call, not by the
+    registry.
+
     Note: ``overlap`` is not in the numpy ``_CORRELATION`` registry, so the numba
-    correlation registry intentionally exposes one feature the numpy one does not
-    (the numba ``overlap`` backend exists and is cheap to surface). Adding
-    ``overlap`` to the numpy ``_CORRELATION`` for symmetry is a separate call.
+    correlation registry intentionally exposes one feature the numpy one does not.
     """
     from cp_measure.core.numba import (
+        get_correlation_costes as _numba_costes,
         get_correlation_manders_fold as _numba_manders_fold,
         get_correlation_overlap as _numba_overlap,
         get_correlation_pearson as _numba_pearson,
@@ -75,6 +82,7 @@ def _numba_registries() -> dict[str, dict[str, Callable]]:
             "pearson": _numba_pearson,
             "manders_fold": _numba_manders_fold,
             "rwc": _numba_rwc,
+            "costes": _numba_costes,
             "overlap": _numba_overlap,
         },
     }

diff --git a/src/cp_measure/core/numba/__init__.py b/src/cp_measure/core/numba/__init__.py
@@ -5,12 +5,14 @@
 ``numba`` extra; availability is gated by ``cp_measure._detect.HAS_NUMBA``.
 
 This backend accelerates ``intensity`` and the colocalization features
-``pearson``/``manders_fold``/``rwc``/``overlap``; the global "numba" accelerator
-composes them with the numpy implementations of every other feature (see
-``cp_measure.bulk``).
+``pearson``/``manders_fold``/``rwc``/``overlap``/``costes``; the global "numba"
+accelerator composes them with the numpy implementations of every other feature
+(see ``cp_measure.bulk``).
 """
 
 from cp_measure.core.numba.measurecolocalization import (
+    get_correlation_all,
+    get_correlation_costes,
     get_correlation_manders_fold,
     get_correlation_overlap,
     get_correlation_pearson,
@@ -19,6 +21,8 @@
 from cp_measure.core.numba.measureobjectintensity import get_intensity
 
 __all__ = [
+    "get_correlation_all",
+    "get_correlation_costes",
     "get_correlation_manders_fold",
     "get_correlation_overlap",
     "get_correlation_pearson",

diff --git a/src/cp_measure/core/numba/_costes.py b/src/cp_measure/core/numba/_costes.py
@@ -0,0 +1,278 @@
+"""Costes automated-threshold colocalization kernel (single-threaded, cached).
+
+Per-object port of ``measurecolocalization.get_correlation_costes_ind`` and its
+``bisection_costes`` / ``linear_costes`` search. Each object's two channels are
+the contiguous blocks ``g1[offsets[k]:offsets[k+1]]`` / ``g2[...]`` from
+:func:`cp_measure.primitives._segment_numba.flatten_pairs_grouped` (the same
+grouped layout the other coloc features use — no sort here).
+
+The reference's ``calculate_threshold`` call inside ``..._ind`` is dead (its
+outputs are never read), so it is not ported. ``thr`` likewise has no effect on
+costes. ``scale`` (``infer_scale``, dtype-keyed: float→1) is passed in host-side.
+
+Exactness: bit-exact vs the numpy reference on **float** pixels (the realistic
+input; ``scale==1``). Integer-dtype input diverges by design — the reference
+computes ``z = fi + si`` in that dtype and overflows (uint8/uint16), corrupting
+the regression; the float64 kernel here does not. See ``tasks/numba_costes_plan.md``.
+
+Serial: a plain object loop, no ``prange``/``nogil``.
+"""
+
+import numpy as np
+from numba import njit
+
+
+@njit(cache=True, error_model="numpy")
+def _regression_ab(g1, g2, lo, hi):
+    """Costes orthogonal-regression line ``si ≈ a·fi + b`` over non-zero pixels.
+
+    ``non_zero = (fi > 0) | (si > 0)``; ``a`` from the covariance recovered via
+    ``cov = 0.5(var(fi+si) - var(fi) - var(si))`` (all ``ddof=1``), matching the
+    reference exactly.
+    """
+    cnt = 0
+    s1 = 0.0
+    s2 = 0.0
+    for i in range(lo, hi):
+        if g1[i] > 0 or g2[i] > 0:
+            cnt += 1
+            s1 += g1[i]
+            s2 += g2[i]
+    m1 = s1 / cnt
+    m2 = s2 / cnt
+    vx = 0.0
+    vy = 0.0
+    vz = 0.0
+    for i in range(lo, hi):
+        if g1[i] > 0 or g2[i] > 0:
+            d1 = g1[i] - m1
+            d2 = g2[i] - m2
+            dz = (g1[i] + g2[i]) - (m1 + m2)
+            vx += d1 * d1
+            vy += d2 * d2
+            vz += dz * dz
+    denom_n = cnt - 1
+    xvar = vx / denom_n
+    yvar = vy / denom_n
+    zvar = vz / denom_n
+    covar = 0.5 * (zvar - (xvar + yvar))
+    yx = yvar - xvar
+    a = (yx + np.sqrt(yx * yx + 4.0 * covar * covar)) / (2.0 * covar)
+    b = m2 - a * m1
+    return a, b
+
+
+@njit(cache=True, error_model="numpy")
+def _count_combt(g1, g2, lo, hi, thr_fi, thr_si):
+    """``count_nonzero((fi < thr_fi) | (si < thr_si))`` over the object's block."""
+    cnt = 0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            cnt += 1
+    return cnt
+
+
+@njit(cache=True, error_model="numpy")
+def _pearson_combt(g1, g2, lo, hi, thr_fi, thr_si):
+    """Pearson r over the ``(fi < thr_fi) | (si < thr_si)`` subset.
+
+    Mirrors ``scipy.stats.pearsonr``'s operation order — centre, normalise each
+    vector by its L2 norm, accumulate the normalised products, clamp to [-1, 1] —
+    to stay as close as possible to the value the reference branches on. Pass
+    ``thr_fi = thr_si = inf`` for the full-block correlation.
+    """
+    cnt = 0
+    s1 = 0.0
+    s2 = 0.0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            cnt += 1
+            s1 += g1[i]
+            s2 += g2[i]
+    if cnt == 0:
+        return np.nan
+    m1 = s1 / cnt
+    m2 = s2 / cnt
+    nx = 0.0
+    ny = 0.0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            d1 = g1[i] - m1
+            d2 = g2[i] - m2
+            nx += d1 * d1
+            ny += d2 * d2
+    normx = np.sqrt(nx)
+    normy = np.sqrt(ny)
+    r = 0.0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            r += ((g1[i] - m1) / normx) * ((g2[i] - m2) / normy)
+    if r > 1.0:
+        r = 1.0
+    elif r < -1.0:
+        r = -1.0
+    return r
+
+
+@njit(cache=True, error_model="numpy")
+def _count_pearson_combt(g1, g2, lo, hi, thr_fi, thr_si):
+    """``(cnt, r)`` in one shot: the subset count AND its Pearson r.
+
+    The standalone ``_count_combt`` recomputes exactly the count that
+    ``_pearson_combt``'s first pass already produces, so the bisection paid two
+    count passes per visited threshold. This fuses them: pass 1 yields ``cnt``
+    (+ sums); ``r`` is computed (passes 2-3, identical to ``_pearson_combt``) only
+    when ``cnt > 2`` (else returned as ``nan`` and unused). Bit-identical to the
+    separate calls (same predicate, same accumulation order).
+    """
+    cnt = 0
+    s1 = 0.0
+    s2 = 0.0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            cnt += 1
+            s1 += g1[i]
+            s2 += g2[i]
+    if cnt <= 2:
+        return cnt, np.nan
+    m1 = s1 / cnt
+    m2 = s2 / cnt
+    nx = 0.0
+    ny = 0.0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            d1 = g1[i] - m1
+            d2 = g2[i] - m2
+            nx += d1 * d1
+            ny += d2 * d2
+    normx = np.sqrt(nx)
+    normy = np.sqrt(ny)
+    r = 0.0
+    for i in range(lo, hi):
+        if g1[i] < thr_fi or g2[i] < thr_si:
+            r += ((g1[i] - m1) / normx) * ((g2[i] - m2) / normy)
+    if r > 1.0:
+        r = 1.0
+    elif r < -1.0:
+        r = -1.0
+    return cnt, r
+
+
+@njit(cache=True, error_model="numpy")
+def _bisection(g1, g2, lo, hi, a, b, scale):
+    """``bisection_costes`` (M_FASTER): narrowing-window search for the threshold."""
+    left = 1.0
+    right = float(scale)
+    mid = np.floor((right - left) / 1.2) + left
+    lastmid = 0.0
+    valid = 1.0
+    while lastmid != mid:
+        thr_fi = mid / scale
+        thr_si = a * thr_fi + b
+        cnt, r = _count_pearson_combt(g1, g2, lo, hi, thr_fi, thr_si)
+        if cnt <= 2:
+            left = mid - 1
+        else:
+            if r < 0:
+                left = mid - 1
+            elif r >= 0:
+                right = mid + 1
+                valid = mid
+            # NaN (constant subset): neither bound moves; loop exits next step.
+        lastmid = mid
+        if right - left > 6:
+            mid = np.floor((right - left) / 1.2) + left
+        else:
+            mid = np.floor((right - left) / 2.0) + left
+    thr_fi_c = (valid - 1) / scale
+    thr_si_c = a * thr_fi_c + b
+    return thr_fi_c, thr_si_c
+
+
+@njit(cache=True, error_model="numpy")
+def _linear(g1, g2, lo, hi, a, b, scale, accurate):
+    """``linear_costes`` (M_FAST / M_ACCURATE): step the threshold down to R≈0."""
+    i_step = 1.0 / scale
+    fi_max = -np.inf
+    si_max = -np.inf
+    for i in range(lo, hi):
+        if g1[i] > fi_max:
+            fi_max = g1[i]
+        if g2[i] > si_max:
+            si_max = g2[i]
+    img_max = fi_max if fi_max > si_max else si_max
+    cur = i_step * (np.floor(img_max / i_step) + 1)
+    thr_fi_c = cur
+    thr_si_c = a * cur + b
+    while cur > fi_max and (a * cur + b) > si_max:
+        cur -= i_step
+    r = 0.0
+    num_true = -1  # sentinel (reference's None: forces recompute on first pass)
+    while cur > i_step:
+        thr_fi_c = cur
+        thr_si_c = a * cur + b
+        cnt = _count_combt(g1, g2, lo, hi, thr_fi_c, thr_si_c)
+        if cnt < 2:  # reference: scipy.stats.pearsonr raises -> break
+            break
+        if cnt != num_true:
+            r = _pearson_combt(g1, g2, lo, hi, thr_fi_c, thr_si_c)
+            num_true = cnt
+        if r <= 0:
+            break
+        elif accurate or cur < i_step * 10:
+            cur -= i_step
+        elif r > 0.45:
+            cur -= i_step * 10
+        elif r > 0.35:
+            cur -= i_step * 5
+        elif r > 0.25:
+            cur -= i_step * 2
+        else:
+            cur -= i_step
+    return thr_fi_c, thr_si_c
+
+
+@njit(cache=True, error_model="numpy")
+def costes_per_object(g1, g2, offsets, n, scale, mode):
+    """Costes C1/C2 per object over grouped value blocks.
+
+    ``mode``: 0 = bisection (M_FASTER), 1 = linear M_FAST, 2 = linear M_ACCURATE.
+    Returns ``(C1[n], C2[n])``. An object with no pixel above the Costes threshold
+    yields ``0.0`` (matching the reference's fringe-case default).
+    """
+    C1 = np.zeros(n)
+    C2 = np.zeros(n)
+    for k in range(n):
+        lo = offsets[k]
+        hi = offsets[k + 1]
+        if hi - lo == 0:
+            continue
+        a, b = _regression_ab(g1, g2, lo, hi)
+        if mode == 0:
+            thr_fi_c, thr_si_c = _bisection(g1, g2, lo, hi, a, b, scale)
+        else:
+            thr_fi_c, thr_si_c = _linear(g1, g2, lo, hi, a, b, scale, mode == 2)
+
+        sum_ge_fi = 0.0
+        sum_ge_si = 0.0
+        sum_fi_c = 0.0
+        sum_si_c = 0.0
+        n_comb = 0
+        for i in range(lo, hi):
+            v1 = g1[i]
+            v2 = g2[i]
+            if v1 >= thr_fi_c:
+                sum_ge_fi += v1
+            if v2 >= thr_si_c:
+                sum_ge_si += v2
+            if v1 > thr_fi_c and v2 > thr_si_c:
+                n_comb += 1
+                sum_fi_c += v1
+                sum_si_c += v2
+        # tot_* (denominators) are read only when n_comb > 0, which guarantees a
+        # pixel strictly above each threshold, hence sum_ge_* covers it — so the
+        # reference's `if any(x > thr)` guard on the totals is always true here.
+        if n_comb > 0:
+            C1[k] = sum_fi_c / sum_ge_fi
+            C2[k] = sum_si_c / sum_ge_si
+    return C1, C2