Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/cp_measure/bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,25 @@ def _numba_registries() -> dict[str, dict[str, Callable]]:
"""Registries for the 'numba' accelerator.

Composes the numba implementations (``intensity`` and the ``pearson`` /
``manders_fold`` / ``rwc`` / ``overlap`` colocalization features) with the
numpy implementations of every other feature — a single global "numba"
``manders_fold`` / ``rwc`` / ``costes`` / ``overlap`` colocalization features)
with the numpy implementations of every other feature — a single global "numba"
selection still yields a full, working feature set, accelerated where a
numba backend exists. This is explicit per-function composition, NOT an
error-driven fallback.

The five correlation entries are kept per-group so the featurizer's per-group
selection (``_collect_correlation_features``) keeps working; each is now a thin,
gated wrapper over the shared ``get_correlation_all`` kernel. A caller wanting
several coloc features in ONE flatten+kernel pass (the efficient path) calls
``cp_measure.core.numba.get_correlation_all(..., features=...)`` directly — it is
stateless, so fusion happens by requesting the set in one call, not by the
registry.

Note: ``overlap`` is not in the numpy ``_CORRELATION`` registry, so the numba
correlation registry intentionally exposes one feature the numpy one does not
(the numba ``overlap`` backend exists and is cheap to surface). Adding
``overlap`` to the numpy ``_CORRELATION`` for symmetry is a separate call.
correlation registry intentionally exposes one feature the numpy one does not.
"""
from cp_measure.core.numba import (
get_correlation_costes as _numba_costes,
get_correlation_manders_fold as _numba_manders_fold,
get_correlation_overlap as _numba_overlap,
get_correlation_pearson as _numba_pearson,
Expand All @@ -75,6 +82,7 @@ def _numba_registries() -> dict[str, dict[str, Callable]]:
"pearson": _numba_pearson,
"manders_fold": _numba_manders_fold,
"rwc": _numba_rwc,
"costes": _numba_costes,
"overlap": _numba_overlap,
},
}
Expand Down
10 changes: 7 additions & 3 deletions src/cp_measure/core/numba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
``numba`` extra; availability is gated by ``cp_measure._detect.HAS_NUMBA``.

This backend accelerates ``intensity`` and the colocalization features
``pearson``/``manders_fold``/``rwc``/``overlap``; the global "numba" accelerator
composes them with the numpy implementations of every other feature (see
``cp_measure.bulk``).
``pearson``/``manders_fold``/``rwc``/``overlap``/``costes``; the global "numba"
accelerator composes them with the numpy implementations of every other feature
(see ``cp_measure.bulk``).
"""

from cp_measure.core.numba.measurecolocalization import (
get_correlation_all,
get_correlation_costes,
get_correlation_manders_fold,
get_correlation_overlap,
get_correlation_pearson,
Expand All @@ -19,6 +21,8 @@
from cp_measure.core.numba.measureobjectintensity import get_intensity

__all__ = [
"get_correlation_all",
"get_correlation_costes",
"get_correlation_manders_fold",
"get_correlation_overlap",
"get_correlation_pearson",
Expand Down
278 changes: 278 additions & 0 deletions src/cp_measure/core/numba/_costes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
"""Costes automated-threshold colocalization kernel (single-threaded, cached).

Per-object port of ``measurecolocalization.get_correlation_costes_ind`` and its
``bisection_costes`` / ``linear_costes`` search. Each object's two channels are
the contiguous blocks ``g1[offsets[k]:offsets[k+1]]`` / ``g2[...]`` from
:func:`cp_measure.primitives._segment_numba.flatten_pairs_grouped` (the same
grouped layout the other coloc features use — no sort here).

The reference's ``calculate_threshold`` call inside ``..._ind`` is dead (its
outputs are never read), so it is not ported. ``thr`` likewise has no effect on
costes. ``scale`` (``infer_scale``, dtype-keyed: float→1) is passed in host-side.

Exactness: bit-exact vs the numpy reference on **float** pixels (the realistic
input; ``scale==1``). Integer-dtype input diverges by design — the reference
computes ``z = fi + si`` in that dtype and overflows (uint8/uint16), corrupting
the regression; the float64 kernel here does not. See ``tasks/numba_costes_plan.md``.

Serial: a plain object loop, no ``prange``/``nogil``.
"""

import numpy as np
from numba import njit


@njit(cache=True, error_model="numpy")
def _regression_ab(g1, g2, lo, hi):
"""Costes orthogonal-regression line ``si ≈ a·fi + b`` over non-zero pixels.

``non_zero = (fi > 0) | (si > 0)``; ``a`` from the covariance recovered via
``cov = 0.5(var(fi+si) - var(fi) - var(si))`` (all ``ddof=1``), matching the
reference exactly.
"""
cnt = 0
s1 = 0.0
s2 = 0.0
for i in range(lo, hi):
if g1[i] > 0 or g2[i] > 0:
cnt += 1
s1 += g1[i]
s2 += g2[i]
m1 = s1 / cnt
m2 = s2 / cnt
vx = 0.0
vy = 0.0
vz = 0.0
for i in range(lo, hi):
if g1[i] > 0 or g2[i] > 0:
d1 = g1[i] - m1
d2 = g2[i] - m2
dz = (g1[i] + g2[i]) - (m1 + m2)
vx += d1 * d1
vy += d2 * d2
vz += dz * dz
denom_n = cnt - 1
xvar = vx / denom_n
yvar = vy / denom_n
zvar = vz / denom_n
covar = 0.5 * (zvar - (xvar + yvar))
yx = yvar - xvar
a = (yx + np.sqrt(yx * yx + 4.0 * covar * covar)) / (2.0 * covar)
b = m2 - a * m1
return a, b


@njit(cache=True, error_model="numpy")
def _count_combt(g1, g2, lo, hi, thr_fi, thr_si):
"""``count_nonzero((fi < thr_fi) | (si < thr_si))`` over the object's block."""
cnt = 0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
cnt += 1
return cnt


@njit(cache=True, error_model="numpy")
def _pearson_combt(g1, g2, lo, hi, thr_fi, thr_si):
"""Pearson r over the ``(fi < thr_fi) | (si < thr_si)`` subset.

Mirrors ``scipy.stats.pearsonr``'s operation order — centre, normalise each
vector by its L2 norm, accumulate the normalised products, clamp to [-1, 1] —
to stay as close as possible to the value the reference branches on. Pass
``thr_fi = thr_si = inf`` for the full-block correlation.
"""
cnt = 0
s1 = 0.0
s2 = 0.0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
cnt += 1
s1 += g1[i]
s2 += g2[i]
if cnt == 0:
return np.nan
m1 = s1 / cnt
m2 = s2 / cnt
nx = 0.0
ny = 0.0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
d1 = g1[i] - m1
d2 = g2[i] - m2
nx += d1 * d1
ny += d2 * d2
normx = np.sqrt(nx)
normy = np.sqrt(ny)
r = 0.0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
r += ((g1[i] - m1) / normx) * ((g2[i] - m2) / normy)
if r > 1.0:
r = 1.0
elif r < -1.0:
r = -1.0
return r


@njit(cache=True, error_model="numpy")
def _count_pearson_combt(g1, g2, lo, hi, thr_fi, thr_si):
"""``(cnt, r)`` in one shot: the subset count AND its Pearson r.

The standalone ``_count_combt`` recomputes exactly the count that
``_pearson_combt``'s first pass already produces, so the bisection paid two
count passes per visited threshold. This fuses them: pass 1 yields ``cnt``
(+ sums); ``r`` is computed (passes 2-3, identical to ``_pearson_combt``) only
when ``cnt > 2`` (else returned as ``nan`` and unused). Bit-identical to the
separate calls (same predicate, same accumulation order).
"""
cnt = 0
s1 = 0.0
s2 = 0.0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
cnt += 1
s1 += g1[i]
s2 += g2[i]
if cnt <= 2:
return cnt, np.nan
m1 = s1 / cnt
m2 = s2 / cnt
nx = 0.0
ny = 0.0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
d1 = g1[i] - m1
d2 = g2[i] - m2
nx += d1 * d1
ny += d2 * d2
normx = np.sqrt(nx)
normy = np.sqrt(ny)
r = 0.0
for i in range(lo, hi):
if g1[i] < thr_fi or g2[i] < thr_si:
r += ((g1[i] - m1) / normx) * ((g2[i] - m2) / normy)
if r > 1.0:
r = 1.0
elif r < -1.0:
r = -1.0
return cnt, r


@njit(cache=True, error_model="numpy")
def _bisection(g1, g2, lo, hi, a, b, scale):
"""``bisection_costes`` (M_FASTER): narrowing-window search for the threshold."""
left = 1.0
right = float(scale)
mid = np.floor((right - left) / 1.2) + left
lastmid = 0.0
valid = 1.0
while lastmid != mid:
thr_fi = mid / scale
thr_si = a * thr_fi + b
cnt, r = _count_pearson_combt(g1, g2, lo, hi, thr_fi, thr_si)
if cnt <= 2:
left = mid - 1
else:
if r < 0:
left = mid - 1
elif r >= 0:
right = mid + 1
valid = mid
# NaN (constant subset): neither bound moves; loop exits next step.
lastmid = mid
if right - left > 6:
mid = np.floor((right - left) / 1.2) + left
else:
mid = np.floor((right - left) / 2.0) + left
thr_fi_c = (valid - 1) / scale
thr_si_c = a * thr_fi_c + b
return thr_fi_c, thr_si_c


@njit(cache=True, error_model="numpy")
def _linear(g1, g2, lo, hi, a, b, scale, accurate):
"""``linear_costes`` (M_FAST / M_ACCURATE): step the threshold down to R≈0."""
i_step = 1.0 / scale
fi_max = -np.inf
si_max = -np.inf
for i in range(lo, hi):
if g1[i] > fi_max:
fi_max = g1[i]
if g2[i] > si_max:
si_max = g2[i]
img_max = fi_max if fi_max > si_max else si_max
cur = i_step * (np.floor(img_max / i_step) + 1)
thr_fi_c = cur
thr_si_c = a * cur + b
while cur > fi_max and (a * cur + b) > si_max:
cur -= i_step
r = 0.0
num_true = -1 # sentinel (reference's None: forces recompute on first pass)
while cur > i_step:
thr_fi_c = cur
thr_si_c = a * cur + b
cnt = _count_combt(g1, g2, lo, hi, thr_fi_c, thr_si_c)
if cnt < 2: # reference: scipy.stats.pearsonr raises -> break
break
if cnt != num_true:
r = _pearson_combt(g1, g2, lo, hi, thr_fi_c, thr_si_c)
num_true = cnt
if r <= 0:
break
elif accurate or cur < i_step * 10:
cur -= i_step
elif r > 0.45:
cur -= i_step * 10
elif r > 0.35:
cur -= i_step * 5
elif r > 0.25:
cur -= i_step * 2
else:
cur -= i_step
return thr_fi_c, thr_si_c


@njit(cache=True, error_model="numpy")
def costes_per_object(g1, g2, offsets, n, scale, mode):
"""Costes C1/C2 per object over grouped value blocks.

``mode``: 0 = bisection (M_FASTER), 1 = linear M_FAST, 2 = linear M_ACCURATE.
Returns ``(C1[n], C2[n])``. An object with no pixel above the Costes threshold
yields ``0.0`` (matching the reference's fringe-case default).
"""
C1 = np.zeros(n)
C2 = np.zeros(n)
for k in range(n):
lo = offsets[k]
hi = offsets[k + 1]
if hi - lo == 0:
continue
a, b = _regression_ab(g1, g2, lo, hi)
if mode == 0:
thr_fi_c, thr_si_c = _bisection(g1, g2, lo, hi, a, b, scale)
else:
thr_fi_c, thr_si_c = _linear(g1, g2, lo, hi, a, b, scale, mode == 2)

sum_ge_fi = 0.0
sum_ge_si = 0.0
sum_fi_c = 0.0
sum_si_c = 0.0
n_comb = 0
for i in range(lo, hi):
v1 = g1[i]
v2 = g2[i]
if v1 >= thr_fi_c:
sum_ge_fi += v1
if v2 >= thr_si_c:
sum_ge_si += v2
if v1 > thr_fi_c and v2 > thr_si_c:
n_comb += 1
sum_fi_c += v1
sum_si_c += v2
# tot_* (denominators) are read only when n_comb > 0, which guarantees a
# pixel strictly above each threshold, hence sum_ge_* covers it — so the
# reference's `if any(x > thr)` guard on the totals is always true here.
if n_comb > 0:
C1[k] = sum_fi_c / sum_ge_fi
C2[k] = sum_si_c / sum_ge_si
return C1, C2
Loading