Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,31 @@ data, columns, rows = featurize(image, masks, config)

Volumetric `(C, Z, H, W)` data is supported. The featurizer automatically skips 2D-only features (`radial_distribution`, `radial_zernikes`, `zernike`, `feret`). All other features (`intensity`, `sizeshape`, `texture`, `granularity`, correlations) work for both 2D and 3D.

The output is plain numpy + lists, so converting to a DataFrame is straightforward:
The default output is plain numpy + lists. Use `return_as` to get structured output directly:

```python notest
import pandas as pd
row_names = [f"{img}__{obj}__{label}" for img, obj, label in rows]
df = pd.DataFrame(data, index=row_names, columns=columns)
# pip install cp_measure[pandas]
df = featurize(image, masks, config, return_as="pandas")
# image_id object_type label Area BoundingBoxArea ...
# None object 1 2500.0 2500.0
# None object 2 2500.0 2500.0
# Shape: (2, 123) — 3 metadata columns + 120 feature columns

# pip install cp_measure[anndata]
adata = featurize(image, masks, config, return_as="anndata")
# AnnData object with n_obs x n_vars = 2 x 120
# obs: 'image_id', 'object_type', 'label'
# var: 'feature_group', 'feature_type', 'feature_name', 'channel', 'channel_2'
# uns: 'config', 'channels', 'objects', 'is_3d'

# pip install cp_measure[pyarrow]
table = featurize(image, masks, config, return_as="pyarrow")
# pyarrow.Table with 2 rows, 123 columns
# Per-column metadata in schema fields, table-level metadata:
# 'cp_measure_config', 'channels', 'objects', 'is_3d'
```

Note: DataFrame libraries must be installed independently, to keep the dependency tree low.
These are optional dependencies — install only what you need (or `pip install cp_measure[all]`).

## Important notes

Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ dependencies = [
"mahotas<2.0.0,>=1.4.13",
]

[project.optional-dependencies]
pandas = ["pandas>=1.5"]
pyarrow = ["pyarrow>=12.0"]
anndata = ["anndata>=0.9", "pandas>=1.5"]
all = ["cp_measure[pandas,pyarrow,anndata]"]

[project.urls]
Homepage = "https://github.com/afermg/cp_measure"

Expand Down
120 changes: 120 additions & 0 deletions src/cp_measure/_converters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""Output format converters for :func:`cp_measure.featurizer.featurize`.

Each converter lazily imports its optional dependency and raises a helpful
:class:`ImportError` when the package is missing.
"""

from __future__ import annotations

import json

import numpy as np


def _lazy_import(module_name: str, extra: str):
    """Return the imported module *module_name*.

    The import happens at call time so that optional dependencies are only
    needed when the matching output format is actually requested.

    Parameters
    ----------
    module_name
        Dotted name of the module to import.
    extra
        Name of the ``return_as`` value / pip extra quoted in the error
        message.

    Raises
    ------
    ImportError
        If the module cannot be imported. The message names the pip extra
        to install; the original exception context is suppressed.
    """
    from importlib import import_module

    try:
        module = import_module(module_name)
    except ImportError:
        hint = (
            f"{module_name} is required for return_as='{extra}'. "
            f"Install it with: pip install cp_measure[{extra}]"
        )
        # ``from None`` hides the low-level traceback so users only see the hint.
        raise ImportError(hint) from None
    return module


def _unpack_rows(rows: list[tuple]) -> tuple[list, list, list]:
    """Transpose row tuples into three parallel lists.

    Each row is expected to be an ``(image_id, object_type, label)`` triple.
    Returns ``(image_ids, object_types, labels)``; an empty *rows* yields
    three empty lists.
    """
    if rows:
        # zip(*rows) transposes the triples into three column tuples.
        image_ids, object_types, labels = map(list, zip(*rows))
        return image_ids, object_types, labels
    return [], [], []


def _to_pandas(*, data, columns, rows, **_kwargs):
    """Return the feature matrix as a ``pandas.DataFrame``.

    The frame holds *data* under the feature names in *columns*, preceded by
    three metadata columns (``image_id``, ``object_type``, ``label``) taken
    from *rows*. Extra keyword arguments are accepted and ignored so every
    converter can be called with the same keyword set.

    Raises
    ------
    ImportError
        If pandas is not installed.
    """
    pd = _lazy_import("pandas", "pandas")

    frame = pd.DataFrame(data, columns=columns)
    meta_names = ("image_id", "object_type", "label")
    # Insert at positions 0, 1, 2 so the metadata leads the feature columns.
    for position, (name, values) in enumerate(zip(meta_names, _unpack_rows(rows))):
        frame.insert(position, name, values)
    return frame


def _to_pyarrow(
    *, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs
):
    """Return the feature matrix as a ``pyarrow.Table``.

    The table has three leading metadata columns (``image_id``,
    ``object_type``, ``label``) followed by one column per entry of
    *columns*, filled from the matching column of *data*.

    Each feature field carries the non-``None`` entries of its *col_meta*
    record as field-level metadata (values stringified and encoded to
    bytes). The JSON-encoded *config*, *channels*, *objects* and *is_3d* are
    stored as table-level schema metadata.

    Raises
    ------
    ImportError
        If pyarrow is not installed.
    """
    pa = _lazy_import("pyarrow", "pyarrow")

    image_ids, object_types, labels = _unpack_rows(rows)
    feature_arrays = {name: data[:, idx] for idx, name in enumerate(columns)}
    arrays = {
        "image_id": image_ids,
        "object_type": object_types,
        "label": labels,
        **feature_arrays,
    }
    table = pa.table(arrays)

    all_fields = list(table.schema)
    # The first three fields are the row-identifier columns; they get no
    # per-column metadata.
    fields = all_fields[:3]
    for pos, field in enumerate(all_fields[3:]):
        encoded = {
            key: str(value).encode()
            for key, value in col_meta[pos].items()
            if value is not None
        }
        fields.append(field.with_metadata(encoded))

    table_metadata = {
        b"cp_measure_config": json.dumps(config).encode(),
        b"channels": json.dumps(channels).encode(),
        b"objects": json.dumps(objects).encode(),
        b"is_3d": json.dumps(is_3d).encode(),
    }
    annotated_schema = pa.schema(fields, metadata=table_metadata)
    # Casting to a schema with identical types attaches the metadata.
    return table.cast(annotated_schema)


def _to_anndata(
    *, data, columns, rows, col_meta, config, channels, objects, is_3d, **_kwargs
):
    """Return the feature matrix as an ``anndata.AnnData`` object.

    Layout of the result:

    * ``X``   -- *data* as ``float32`` (no copy if it already is ``float32``).
    * ``obs`` -- one row per entry of *rows* with columns ``image_id``,
      ``object_type`` and ``label``. The index is
      ``"{image_id}_{object_type}_{label}"`` when the first row has a
      non-``None`` image id, otherwise ``"{object_type}_{label}"``.
    * ``var`` -- a frame built from *col_meta*, indexed by *columns*.
    * ``uns`` -- ``config``, ``channels``, ``objects`` and ``is_3d``.

    An empty *rows* (no objects measured) yields an ``obs`` frame with an
    empty index instead of raising ``IndexError``.

    Raises
    ------
    ImportError
        If anndata or pandas is not installed.
    """
    ad = _lazy_import("anndata", "anndata")
    pd = _lazy_import("pandas", "anndata")

    obs = pd.DataFrame(rows, columns=["image_id", "object_type", "label"])
    # Check ``rows`` is non-empty before peeking at the first row: a result
    # with no objects must not crash on ``rows[0]``.
    if rows and rows[0][0] is not None:
        obs_names = [f"{r[0]}_{r[1]}_{r[2]}" for r in rows]
    else:
        obs_names = [f"{r[1]}_{r[2]}" for r in rows]
    obs.index = obs_names
    obs.index = obs.index.astype(str)

    var = pd.DataFrame(col_meta)
    var.index = columns

    uns = {
        "config": config,
        "channels": channels,
        "objects": objects,
        "is_3d": is_3d,
    }

    return ad.AnnData(X=data.astype(np.float32, copy=False), obs=obs, var=var, uns=uns)


# Registry mapping each supported output-format name to its converter.
_CONVERTERS = {
    "pandas": _to_pandas,
    "pyarrow": _to_pyarrow,
    "anndata": _to_anndata,
}


def convert(fmt: str, **kwargs):
    """Dispatch to the converter registered for *fmt*.

    Parameters
    ----------
    fmt
        Output format name; must be a key of ``_CONVERTERS``.
    **kwargs
        Forwarded unchanged to the selected converter.

    Returns
    -------
    Whatever the selected converter returns.

    Raises
    ------
    KeyError
        If *fmt* is not a registered format. The message lists the
        supported names.
    """
    # Only the lookup is guarded, so a KeyError raised inside a converter
    # propagates untouched.
    try:
        converter = _CONVERTERS[fmt]
    except KeyError:
        supported = ", ".join(sorted(_CONVERTERS))
        raise KeyError(
            f"Unknown output format {fmt!r}; supported formats: {supported}"
        ) from None
    return converter(**kwargs)
Loading
Loading