From 7d130020a746e53ddecbf6286c6487b78ab5c215 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Fri, 27 Mar 2026 14:14:06 +0100 Subject: [PATCH 1/4] feat: add return_as parameter to featurize() for pandas/pyarrow/anndata output Add a `return_as` parameter to `featurize()` that supports four output formats: "tuple" (default, backward-compatible), "pandas" (DataFrame), "pyarrow" (Table with schema metadata), and "anndata" (AnnData with structured obs/var/uns). All three optional formats are lazy-imported to avoid hard dependencies. Co-Authored-By: Claude Opus 4.6 (1M context) --- pyproject.toml | 6 + src/cp_measure/_converters.py | 117 ++++++++++++++++++ src/cp_measure/featurizer.py | 182 ++++++++++++++++++++++++---- test/test_return_as.py | 221 ++++++++++++++++++++++++++++++++++ 4 files changed, 501 insertions(+), 25 deletions(-) create mode 100644 src/cp_measure/_converters.py create mode 100644 test/test_return_as.py diff --git a/pyproject.toml b/pyproject.toml index d0512da..956a739 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,12 @@ dependencies = [ "mahotas<2.0.0,>=1.4.13", ] +[project.optional-dependencies] +pandas = ["pandas>=1.5"] +pyarrow = ["pyarrow>=12.0"] +anndata = ["anndata>=0.9", "pandas>=1.5"] +all = ["cp_measure[pandas,pyarrow,anndata]"] + [project.urls] Homepage = "https://github.com/afermg/cp_measure" diff --git a/src/cp_measure/_converters.py b/src/cp_measure/_converters.py new file mode 100644 index 0000000..4881818 --- /dev/null +++ b/src/cp_measure/_converters.py @@ -0,0 +1,117 @@ +"""Output format converters for :func:`cp_measure.featurizer.featurize`. + +Each converter lazily imports its optional dependency and raises a helpful +:class:`ImportError` when the package is missing. +""" + +from __future__ import annotations + +import json + +import numpy as np + + +def _lazy_import(module_name: str, extra: str): + """Import *module_name* or raise with install instructions.""" + import importlib + + try: + return importlib.import_module(module_name) + except ImportError: + raise ImportError( + f"{module_name} is required for return_as='{extra}'. " + f"Install it with: pip install cp_measure[{extra}]" + ) from None + + +def _unpack_rows(rows: list[tuple]) -> tuple[list, list, list]: + """Extract image_id, object_type, label lists from row tuples.""" + if not rows: + return [], [], [] + image_ids, object_types, labels = zip(*rows) + return list(image_ids), list(object_types), list(labels) + + +def _to_pandas(*, data, columns, rows, **_kwargs): + pd = _lazy_import("pandas", "pandas") + + df = pd.DataFrame(data, columns=columns) + image_ids, object_types, labels = _unpack_rows(rows) + df.insert(0, "image_id", image_ids) + df.insert(1, "object_type", object_types) + df.insert(2, "label", labels) + return df + + +def _to_pyarrow(*, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs): + pa = _lazy_import("pyarrow", "pyarrow") + + image_ids, object_types, labels = _unpack_rows(rows) + arrays = { + "image_id": image_ids, + "object_type": object_types, + "label": labels, + } + for i, col in enumerate(columns): + arrays[col] = data[:, i] + + table = pa.table(arrays) + + # Attach per-column metadata to feature columns in the schema. + fields = [] + for i, field in enumerate(table.schema): + if i < 3: + fields.append(field) + else: + meta = col_meta[i - 3] + fields.append( + field.with_metadata( + {k: str(v).encode() for k, v in meta.items() if v is not None} + ) + ) + schema = pa.schema( + fields, + metadata={ + b"cp_measure_config": json.dumps(config).encode(), + b"channels": json.dumps(channels).encode(), + b"objects": json.dumps(objects).encode(), + b"is_3d": json.dumps(is_3d).encode(), + }, + ) + return table.cast(schema) + + +def _to_anndata(*, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs): + ad = _lazy_import("anndata", "anndata") + pd = _lazy_import("pandas", "anndata") + + obs = pd.DataFrame(rows, columns=["image_id", "object_type", "label"]) + if obs["image_id"].iloc[0] is not None: + obs.index = [f"{r[0]}_{r[1]}_{r[2]}" for r in rows] + else: + obs.index = [f"{r[1]}_{r[2]}" for r in rows] + obs.index = obs.index.astype(str) + + var = pd.DataFrame(col_meta) + var.index = columns + + uns = { + "config": config, + "channels": channels, + "objects": objects, + "is_3d": is_3d, + } + + return ad.AnnData(X=data.astype(np.float32, copy=False), obs=obs, var=var, uns=uns) + + +_CONVERTERS = { + "pandas": _to_pandas, + "pyarrow": _to_pyarrow, + "anndata": _to_anndata, +} + + +def convert(fmt: str, **kwargs): + """Dispatch to the appropriate converter.""" + return _CONVERTERS[fmt](**kwargs) diff --git a/src/cp_measure/featurizer.py b/src/cp_measure/featurizer.py index f5fc3b2..fdedada 100644 --- a/src/cp_measure/featurizer.py +++ b/src/cp_measure/featurizer.py @@ -17,9 +17,15 @@ import itertools import warnings +from typing import TYPE_CHECKING, Literal, overload import numpy as np +if TYPE_CHECKING: + import anndata as ad + import pandas as pd + import pyarrow as pa + # Feature groups that only support 2D spatial data. _2D_ONLY = {"radial_distribution", "radial_zernikes", "zernike", "feret"} @@ -166,13 +172,58 @@ def make_featurizer_config( } +@overload +def featurize( + image: np.ndarray, + masks: np.ndarray, + config: dict | None = ..., + *, + image_id: str | int | None = ..., + return_as: Literal["tuple"] = ..., +) -> tuple[np.ndarray, list[str], list[tuple]]: ... + + +@overload +def featurize( + image: np.ndarray, + masks: np.ndarray, + config: dict | None = ..., + *, + image_id: str | int | None = ..., + return_as: Literal["pandas"], +) -> pd.DataFrame: ... + + +@overload +def featurize( + image: np.ndarray, + masks: np.ndarray, + config: dict | None = ..., + *, + image_id: str | int | None = ..., + return_as: Literal["pyarrow"], +) -> pa.Table: ... + + +@overload +def featurize( + image: np.ndarray, + masks: np.ndarray, + config: dict | None = ..., + *, + image_id: str | int | None = ..., + return_as: Literal["anndata"], +) -> ad.AnnData: ... + + def featurize( image: np.ndarray, masks: np.ndarray, config: dict | None = None, *, image_id: str | int | None = None, -) -> tuple[np.ndarray, list[str], list[tuple]]: + return_as: Literal["tuple", "pandas", "pyarrow", "anndata"] = "tuple", +): """Compute all configured features for the given image and masks. Parameters @@ -190,18 +241,35 @@ def featurize( If ``None``, all features are enabled with default parameters. image_id : str | int | None, optional Identifier for this image, stored in each row tuple. + return_as : str, optional + Output format. One of ``"tuple"`` (default), ``"pandas"``, + ``"pyarrow"``, or ``"anndata"``. Non-tuple formats require the + corresponding package to be installed (e.g. + ``pip install cp_measure[anndata]``). Returns ------- - data : numpy.ndarray - 2-D float array of shape ``(n_rows, n_features)``. - columns : list[str] - Feature column names. Shape features are bare names (e.g. - ``"Area"``), per-channel features are ``"{feature}__{channel}"``, - and correlation features are ``"{feature}__{ch1}__{ch2}"``. - rows : list[tuple] - One ``(image_id, object_name, label)`` tuple per row. + tuple or pd.DataFrame or pa.Table or anndata.AnnData + When ``return_as="tuple"`` (default): ``(data, columns, rows)`` + where *data* is a 2-D float array, *columns* is a list of + feature names, and *rows* is a list of + ``(image_id, object_name, label)`` tuples. + + When ``return_as="pandas"``: a DataFrame with feature columns + plus ``image_id``, ``object_type``, and ``label`` columns. + + When ``return_as="pyarrow"``: a PyArrow Table with per-column + metadata in the schema. + + When ``return_as="anndata"``: an AnnData object with features + in ``X``, object metadata in ``obs``, feature metadata in + ``var``, and configuration in ``uns``. """ + _valid_return_as = {"tuple", "pandas", "pyarrow", "anndata"} + if return_as not in _valid_return_as: + raise ValueError( + f"return_as must be one of {_valid_return_as!r}, got {return_as!r}" + ) if config is None: config = make_featurizer_config() channels, objects = _resolve_names(config, image.shape[0]) @@ -220,10 +288,12 @@ def featurize( # Shape features are purely geometric and ignore pixel values. dummy_pixels = None + collect_meta = return_as != "tuple" all_rows: list[tuple] = [] all_blocks: list[np.ndarray] = [] columns: list[str] | None = None + col_meta: list[dict] | None = None for mask_idx, object_name in enumerate(objects): mask = masks[mask_idx] @@ -233,27 +303,51 @@ def featurize( continue results: dict[str, np.ndarray] = {} - - for func, params in shape_feats: - results.update(func(mask, dummy_pixels, **params)) + building_meta = collect_meta and columns is None + meta_entries: list[dict] = [] if building_meta else [] + + for func, params, group_name in shape_feats: + raw = func(mask, dummy_pixels, **params) + results.update(raw) + if building_meta: + for key in raw: + meta_entries.append( + _meta_entry(group_name, "shape", key) + ) for ch_idx, ch_name in enumerate(channels): pixels = image[ch_idx] - for func, params in channel_feats: - for key, values in func(mask, pixels, **params).items(): + for func, params, group_name in channel_feats: + raw = func(mask, pixels, **params) + for key, values in raw.items(): results[f"{key}__{ch_name}"] = values + if building_meta: + meta_entries.append( + _meta_entry(group_name, "channel", key, channel=ch_name) + ) n_ch = len(channels) - for func, params, symmetric in corr_feats: + for func, params, symmetric, group_name in corr_feats: iter_fn = itertools.combinations if symmetric else itertools.permutations for ch_i, ch_j in iter_fn(range(n_ch), 2): - for key, values in func( + raw = func( pixels_1=image[ch_i], pixels_2=image[ch_j], masks=mask, **params, - ).items(): + ) + for key, values in raw.items(): results[f"{key}__{channels[ch_i]}__{channels[ch_j]}"] = values + if building_meta: + meta_entries.append( + _meta_entry( + group_name, + "correlation", + key, + channel=channels[ch_i], + channel_2=channels[ch_j], + ) + ) # Build column list from the first non-empty mask. # Order-sensitive comparison is safe: all measurement functions @@ -263,6 +357,8 @@ def featurize( col_names = list(results.keys()) if columns is None: columns = col_names + if building_meta: + col_meta = meta_entries elif col_names != columns: raise RuntimeError( f"feature keys for object {object_name!r} differ from " @@ -280,7 +376,23 @@ def featurize( raise ValueError("all masks have no labels (all zeros)") data = np.vstack(all_blocks) - return data, columns, all_rows + + if return_as == "tuple": + return data, columns, all_rows + + from cp_measure._converters import convert + + return convert( + return_as, + data=data, + columns=columns, + rows=all_rows, + col_meta=col_meta, + config=config, + channels=channels, + objects=objects, + is_3d=is_3d, + ) # --------------------------------------------------------------------------- @@ -288,6 +400,19 @@ def featurize( # --------------------------------------------------------------------------- +def _meta_entry( + group: str, ftype: str, name: str, *, channel: str | None = None, channel_2: str | None = None +) -> dict: + """Build a per-column metadata dict for ``var`` / schema metadata.""" + return { + "feature_group": group, + "feature_type": ftype, + "feature_name": name, + "channel": channel, + "channel_2": channel_2, + } + + def _resolve_channels(n_channels: int) -> list[str]: """Generate default channel names, zero-padded when n >= 10.""" width = len(str(n_channels - 1)) if n_channels >= 10 else 1 @@ -351,7 +476,10 @@ def _validate( def _collect_channel_features( config: dict, core_funcs: dict, *, skipped: set[str] ) -> list[tuple]: - """Collect enabled per-channel feature functions and their params.""" + """Collect enabled per-channel feature functions and their params. + + Each element is ``(func, params, group_name)``. + """ feats: list[tuple] = [] for name in ( "intensity", @@ -361,20 +489,23 @@ def _collect_channel_features( "radial_zernikes", ): if config[name] and name not in skipped: - feats.append((core_funcs[name], config[f"{name}_params"])) + feats.append((core_funcs[name], config[f"{name}_params"], name)) return feats def _collect_shape_features( config: dict, core_funcs: dict, *, skipped: set[str] ) -> list[tuple]: - """Collect enabled shape feature functions and their params.""" + """Collect enabled shape feature functions and their params. + + Each element is ``(func, params, group_name)``. + """ feats: list[tuple] = [] for name in ("sizeshape", "zernike"): if config[name] and name not in skipped: - feats.append((core_funcs[name], config[f"{name}_params"])) + feats.append((core_funcs[name], config[f"{name}_params"], name)) if config["feret"] and "feret" not in skipped: - feats.append((core_funcs["feret"], {})) + feats.append((core_funcs["feret"], {}, "feret")) return feats @@ -393,7 +524,8 @@ def _collect_correlation_features( """Collect enabled correlation feature functions. The third element of each tuple indicates whether the metric is - symmetric (combinations) or asymmetric (permutations). + symmetric (combinations) or asymmetric (permutations). The fourth + element is the feature group name. """ if n_channels < 2: has_corr = any( @@ -430,5 +562,5 @@ def _collect_correlation_features( for cfg_key, func_key, params_key, symmetric in specs: if config[cfg_key]: params = config[params_key] if params_key else {} - feats.append((corr_funcs[func_key], params, symmetric)) + feats.append((corr_funcs[func_key], params, symmetric, cfg_key)) return feats diff --git a/test/test_return_as.py b/test/test_return_as.py new file mode 100644 index 0000000..6ec8f66 --- /dev/null +++ b/test/test_return_as.py @@ -0,0 +1,221 @@ +"""Tests for the return_as parameter of featurize().""" + +import numpy as np +import pytest + +from cp_measure.featurizer import featurize, make_featurizer_config + +from conftest import ALL_OFF + + +@pytest.fixture() +def config_2ch(): + return make_featurizer_config( + ["DNA", "ER"], + objects=["nuclei"], + **{**ALL_OFF, "intensity": True, "sizeshape": True, "correlation_pearson": True}, + ) + + +@pytest.fixture() +def config_2ch_multi(): + return make_featurizer_config( + ["DNA", "ER"], + objects=["nuclei", "cells"], + **{**ALL_OFF, "intensity": True, "sizeshape": True, "correlation_pearson": True}, + ) + + +class TestReturnAsValidation: + def test_invalid_return_as(self, image_2d_2ch, mask_2d, config_2ch): + with pytest.raises(ValueError, match="return_as must be one of"): + featurize(image_2d_2ch, mask_2d, config_2ch, return_as="invalid") + + def test_tuple_default(self, image_2d_2ch, mask_2d, config_2ch): + result = featurize(image_2d_2ch, mask_2d, config_2ch) + assert isinstance(result, tuple) + assert len(result) == 3 + + +class TestReturnAsPandas: + pd = pytest.importorskip("pandas") + + def test_returns_dataframe(self, image_2d_2ch, mask_2d, config_2ch): + df = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pandas") + assert isinstance(df, self.pd.DataFrame) + + def test_metadata_columns(self, image_2d_2ch, mask_2d, config_2ch): + df = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pandas") + assert "image_id" in df.columns + assert "object_type" in df.columns + assert "label" in df.columns + assert df.columns[0] == "image_id" + assert df.columns[1] == "object_type" + assert df.columns[2] == "label" + + def test_row_count(self, image_2d_2ch, mask_2d, config_2ch): + data, _, rows = featurize(image_2d_2ch, mask_2d, config_2ch) + df = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pandas") + assert len(df) == len(rows) + assert len(df) == data.shape[0] + + def test_feature_columns_match(self, image_2d_2ch, mask_2d, config_2ch): + _, columns, _ = featurize(image_2d_2ch, mask_2d, config_2ch) + df = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pandas") + feature_cols = df.columns[3:] + assert list(feature_cols) == columns + + def test_values_match_tuple(self, image_2d_2ch, mask_2d, config_2ch): + data, columns, _ = featurize(image_2d_2ch, mask_2d, config_2ch) + df = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pandas") + np.testing.assert_array_equal(df[columns].values, data) + + def test_object_type_values(self, image_2d_2ch, mask_2d, config_2ch): + df = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pandas") + assert (df["object_type"] == "nuclei").all() + + def test_multi_mask(self, image_2d_2ch, masks_2d_multi, config_2ch_multi): + df = featurize(image_2d_2ch, masks_2d_multi, config_2ch_multi, return_as="pandas") + assert set(df["object_type"]) == {"nuclei", "cells"} + assert len(df[df["object_type"] == "nuclei"]) == 2 + assert len(df[df["object_type"] == "cells"]) == 3 + + def test_image_id(self, image_2d_2ch, mask_2d, config_2ch): + df = featurize( + image_2d_2ch, mask_2d, config_2ch, image_id="plate1", return_as="pandas" + ) + assert (df["image_id"] == "plate1").all() + + +class TestReturnAsPyArrow: + pa = pytest.importorskip("pyarrow") + + def test_returns_table(self, image_2d_2ch, mask_2d, config_2ch): + table = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pyarrow") + assert isinstance(table, self.pa.Table) + + def test_metadata_columns(self, image_2d_2ch, mask_2d, config_2ch): + table = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pyarrow") + names = table.column_names + assert names[0] == "image_id" + assert names[1] == "object_type" + assert names[2] == "label" + + def test_row_count(self, image_2d_2ch, mask_2d, config_2ch): + data, _, _ = featurize(image_2d_2ch, mask_2d, config_2ch) + table = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pyarrow") + assert table.num_rows == data.shape[0] + + def test_schema_metadata(self, image_2d_2ch, mask_2d, config_2ch): + table = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pyarrow") + meta = table.schema.metadata + assert b"cp_measure_config" in meta + assert b"channels" in meta + assert b"is_3d" in meta + + def test_column_metadata(self, image_2d_2ch, mask_2d, config_2ch): + table = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pyarrow") + # Feature columns (index >= 3) should have metadata + field = table.schema.field(3) + assert field.metadata is not None + assert b"feature_group" in field.metadata + + def test_feature_columns_match(self, image_2d_2ch, mask_2d, config_2ch): + _, columns, _ = featurize(image_2d_2ch, mask_2d, config_2ch) + table = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="pyarrow") + assert table.column_names[3:] == columns + + +class TestReturnAsAnnData: + ad = pytest.importorskip("anndata") + + def test_returns_anndata(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert isinstance(adata, self.ad.AnnData) + + def test_x_shape(self, image_2d_2ch, mask_2d, config_2ch): + data, columns, rows = featurize(image_2d_2ch, mask_2d, config_2ch) + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert adata.X.shape == data.shape + assert adata.n_obs == len(rows) + assert adata.n_vars == len(columns) + + def test_x_values_match(self, image_2d_2ch, mask_2d, config_2ch): + data, _, _ = featurize(image_2d_2ch, mask_2d, config_2ch) + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + np.testing.assert_array_almost_equal(adata.X, data.astype(np.float32)) + + def test_obs_columns(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert "image_id" in adata.obs.columns + assert "object_type" in adata.obs.columns + assert "label" in adata.obs.columns + + def test_obs_values(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert (adata.obs["object_type"] == "nuclei").all() + assert list(adata.obs["label"]) == [1, 2] + + def test_obs_names_with_image_id(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize( + image_2d_2ch, mask_2d, config_2ch, image_id="img1", return_as="anndata" + ) + assert adata.obs_names[0] == "img1_nuclei_1" + + def test_obs_names_without_image_id(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert adata.obs_names[0] == "nuclei_1" + + def test_var_columns(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + for col in ("feature_group", "feature_type", "feature_name", "channel", "channel_2"): + assert col in adata.var.columns + + def test_var_names(self, image_2d_2ch, mask_2d, config_2ch): + _, columns, _ = featurize(image_2d_2ch, mask_2d, config_2ch) + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert list(adata.var_names) == columns + + def test_var_shape_features(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + shape_vars = adata.var[adata.var["feature_type"] == "shape"] + assert len(shape_vars) > 0 + assert shape_vars["channel"].isna().all() + assert (shape_vars["feature_group"] == "sizeshape").all() + + def test_var_channel_features(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + ch_vars = adata.var[adata.var["feature_type"] == "channel"] + assert len(ch_vars) > 0 + assert set(ch_vars["channel"].dropna()) == {"DNA", "ER"} + + def test_var_correlation_features(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + corr_vars = adata.var[adata.var["feature_type"] == "correlation"] + assert len(corr_vars) > 0 + assert corr_vars["channel"].notna().all() + assert corr_vars["channel_2"].notna().all() + + def test_uns_keys(self, image_2d_2ch, mask_2d, config_2ch): + adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") + assert "config" in adata.uns + assert "channels" in adata.uns + assert "objects" in adata.uns + assert "is_3d" in adata.uns + assert adata.uns["channels"] == ["DNA", "ER"] + assert adata.uns["objects"] == ["nuclei"] + assert adata.uns["is_3d"] is False + + def test_multi_mask(self, image_2d_2ch, masks_2d_multi, config_2ch_multi): + adata = featurize( + image_2d_2ch, masks_2d_multi, config_2ch_multi, return_as="anndata" + ) + assert adata.n_obs == 5 # 2 nuclei + 3 cells + assert set(adata.obs["object_type"]) == {"nuclei", "cells"} + + def test_3d_uns_flag(self, image_3d_2ch, mask_3d): + config = make_featurizer_config( + ["DNA", "ER"], **{**ALL_OFF, "intensity": True, "sizeshape": True} + ) + adata = featurize(image_3d_2ch, mask_3d, config, return_as="anndata") + assert adata.uns["is_3d"] is True From bee765513fa0f6cc296d8b468693926b99602aeb Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Tue, 7 Apr 2026 22:32:33 +0200 Subject: [PATCH 2/4] fix: address review findings for return_as PR - Use raw tuple check instead of DataFrame iloc for None vs NaN safety in _to_anndata - Add warning when 2D-only features are silently skipped on volumetric data - Remove redundant no-op ternary in meta_entries initialization - Remove unnecessary comment in _to_pyarrow Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cp_measure/_converters.py | 3 +-- src/cp_measure/featurizer.py | 11 +++++++++-- test/test_featurizer.py | 3 ++- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/cp_measure/_converters.py b/src/cp_measure/_converters.py index 4881818..78209fb 100644 --- a/src/cp_measure/_converters.py +++ b/src/cp_measure/_converters.py @@ -57,7 +57,6 @@ def _to_pyarrow(*, data, columns, rows, col_meta, config, channels, objects, is_ table = pa.table(arrays) - # Attach per-column metadata to feature columns in the schema. fields = [] for i, field in enumerate(table.schema): if i < 3: @@ -86,7 +85,7 @@ def _to_anndata(*, data, columns, rows, col_meta, config, channels, objects, is_ pd = _lazy_import("pandas", "anndata") obs = pd.DataFrame(rows, columns=["image_id", "object_type", "label"]) - if obs["image_id"].iloc[0] is not None: + if rows[0][0] is not None: obs.index = [f"{r[0]}_{r[1]}_{r[2]}" for r in rows] else: obs.index = [f"{r[1]}_{r[2]}" for r in rows] diff --git a/src/cp_measure/featurizer.py b/src/cp_measure/featurizer.py index fdedada..b16d65d 100644 --- a/src/cp_measure/featurizer.py +++ b/src/cp_measure/featurizer.py @@ -304,7 +304,7 @@ def featurize( results: dict[str, np.ndarray] = {} building_meta = collect_meta and columns is None - meta_entries: list[dict] = [] if building_meta else [] + meta_entries: list[dict] = [] for func, params, group_name in shape_feats: raw = func(mask, dummy_pixels, **params) @@ -513,7 +513,14 @@ def _warn_and_filter_2d_only(config: dict, is_3d: bool) -> set[str]: """Return the set of 2D-only feature names to skip, warning if any.""" if not is_3d: return set() - return {name for name in _2D_ONLY if config.get(name, False)} + skipped = {name for name in _2D_ONLY if config.get(name, False)} + if skipped: + warnings.warn( + f"Skipping 2D-only features for volumetric data: {sorted(skipped)}", + UserWarning, + stacklevel=2, + ) + return skipped def _collect_correlation_features( diff --git a/test/test_featurizer.py b/test/test_featurizer.py index fc638ff..96aa8dd 100644 --- a/test/test_featurizer.py +++ b/test/test_featurizer.py @@ -93,7 +93,8 @@ def test_3d_skips_2d_only_features(self, image_3d_2ch, mask_3d): ["DNA", "ER"], **{**ALL_OFF, "intensity": True, "sizeshape": True, "zernike": True}, ) - data, columns, rows = featurize(image_3d_2ch, mask_3d, config) + with pytest.warns(UserWarning, match="Skipping 2D-only features"): + data, columns, rows = featurize(image_3d_2ch, mask_3d, config) assert data.shape[0] == 2 assert not any("Zernike" in c for c in columns) From 288c4897dae51a171c7616beecbc4bd3e4d74b7b Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Tue, 7 Apr 2026 22:40:20 +0200 Subject: [PATCH 3/4] docs: add return_as output examples to README Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ca42f3f..a44fa4a 100644 --- a/README.md +++ b/README.md @@ -94,15 +94,31 @@ data, columns, rows = featurize(image, masks, config) Volumetric `(C, Z, H, W)` data is supported. The featurizer automatically skips 2D-only features (`radial_distribution`, `radial_zernikes`, `zernike`, `feret`). All other features (`intensity`, `sizeshape`, `texture`, `granularity`, correlations) work for both 2D and 3D. -The output is plain numpy + lists, so converting to a DataFrame is straightforward: +The default output is plain numpy + lists. Use `return_as` to get structured output directly: ```python notest -import pandas as pd -row_names = [f"{img}__{obj}__{label}" for img, obj, label in rows] -df = pd.DataFrame(data, index=row_names, columns=columns) +# pip install cp_measure[pandas] +df = featurize(image, masks, config, return_as="pandas") +# image_id object_type label Area BoundingBoxArea ... +# None object 1 2500.0 2500.0 +# None object 2 2500.0 2500.0 +# Shape: (2, 123) — 3 metadata columns + 120 feature columns + +# pip install cp_measure[anndata] +adata = featurize(image, masks, config, return_as="anndata") +# AnnData object with n_obs x n_vars = 2 x 120 +# obs: 'image_id', 'object_type', 'label' +# var: 'feature_group', 'feature_type', 'feature_name', 'channel', 'channel_2' +# uns: 'config', 'channels', 'objects', 'is_3d' + +# pip install cp_measure[pyarrow] +table = featurize(image, masks, config, return_as="pyarrow") +# pyarrow.Table with 2 rows, 123 columns +# Per-column metadata in schema fields, table-level metadata: +# 'cp_measure_config', 'channels', 'objects', 'is_3d' ``` -Note: DataFrame libraries must be installed independently, to keep the dependency tree low. +These are optional dependencies — install only what you need (or `pip install cp_measure[all]`). ## Important notes From 97675a3043302e2bf133ca6bdf1982cb83809279 Mon Sep 17 00:00:00 2001 From: Tim Treis Date: Tue, 7 Apr 2026 22:41:54 +0200 Subject: [PATCH 4/4] style: run ruff formatter on new files Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cp_measure/_converters.py | 8 ++++++-- src/cp_measure/featurizer.py | 11 +++++++---- test/test_return_as.py | 26 ++++++++++++++++++++++---- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/cp_measure/_converters.py b/src/cp_measure/_converters.py index 78209fb..9154f87 100644 --- a/src/cp_measure/_converters.py +++ b/src/cp_measure/_converters.py @@ -43,7 +43,9 @@ def _to_pandas(*, data, columns, rows, **_kwargs): return df -def _to_pyarrow(*, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs): +def _to_pyarrow( + *, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs +): pa = _lazy_import("pyarrow", "pyarrow") image_ids, object_types, labels = _unpack_rows(rows) @@ -80,7 +82,9 @@ def _to_pyarrow(*, data, columns, rows, col_meta, config, channels, objects, is_ return table.cast(schema) -def _to_anndata(*, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs): +def _to_anndata( + *, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs +): ad = _lazy_import("anndata", "anndata") pd = _lazy_import("pandas", "anndata") diff --git a/src/cp_measure/featurizer.py b/src/cp_measure/featurizer.py index b16d65d..ed12a46 100644 --- a/src/cp_measure/featurizer.py +++ b/src/cp_measure/featurizer.py @@ -311,9 +311,7 @@ def featurize( results.update(raw) if building_meta: for key in raw: - meta_entries.append( - _meta_entry(group_name, "shape", key) - ) + meta_entries.append(_meta_entry(group_name, "shape", key)) for ch_idx, ch_name in enumerate(channels): pixels = image[ch_idx] @@ -401,7 +399,12 @@ def featurize( def _meta_entry( - group: str, ftype: str, name: str, *, channel: str | None = None, channel_2: str | None = None + group: str, + ftype: str, + name: str, + *, + channel: str | None = None, + channel_2: str | None = None, ) -> dict: """Build a per-column metadata dict for ``var`` / schema metadata.""" return { diff --git a/test/test_return_as.py b/test/test_return_as.py index 6ec8f66..28f0073 100644 --- a/test/test_return_as.py +++ b/test/test_return_as.py @@ -13,7 +13,12 @@ def config_2ch(): return make_featurizer_config( ["DNA", "ER"], objects=["nuclei"], - **{**ALL_OFF, "intensity": True, "sizeshape": True, "correlation_pearson": True}, + **{ + **ALL_OFF, + "intensity": True, + "sizeshape": True, + "correlation_pearson": True, + }, ) @@ -22,7 +27,12 @@ def config_2ch_multi(): return make_featurizer_config( ["DNA", "ER"], objects=["nuclei", "cells"], - **{**ALL_OFF, "intensity": True, "sizeshape": True, "correlation_pearson": True}, + **{ + **ALL_OFF, + "intensity": True, + "sizeshape": True, + "correlation_pearson": True, + }, ) @@ -75,7 +85,9 @@ def test_object_type_values(self, image_2d_2ch, mask_2d, config_2ch): assert (df["object_type"] == "nuclei").all() def test_multi_mask(self, image_2d_2ch, masks_2d_multi, config_2ch_multi): - df = featurize(image_2d_2ch, masks_2d_multi, config_2ch_multi, return_as="pandas") + df = featurize( + image_2d_2ch, masks_2d_multi, config_2ch_multi, return_as="pandas" + ) assert set(df["object_type"]) == {"nuclei", "cells"} assert len(df[df["object_type"] == "nuclei"]) == 2 assert len(df[df["object_type"] == "cells"]) == 3 @@ -168,7 +180,13 @@ def test_obs_names_without_image_id(self, image_2d_2ch, mask_2d, config_2ch): def test_var_columns(self, image_2d_2ch, mask_2d, config_2ch): adata = featurize(image_2d_2ch, mask_2d, config_2ch, return_as="anndata") - for col in ("feature_group", "feature_type", "feature_name", "channel", "channel_2"): + for col in ( + "feature_group", + "feature_type", + "feature_name", + "channel", + "channel_2", + ): assert col in adata.var.columns def test_var_names(self, image_2d_2ch, mask_2d, config_2ch):