Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ nanobind_add_module(_core
src/bindings/flow.cxx
src/bindings/graph.cxx
src/bindings/ground_truth.cxx
src/bindings/label_multiset.cxx
src/bindings/segmentation.cxx
src/bindings/transformation.cxx
src/bindings/util.cxx
Expand Down
96 changes: 96 additions & 0 deletions MIGRATION_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,102 @@ Important differences:
API yet; the implementation is structured so these filters/extra edges can
be added later.

### Label Multiset

The label-multiset data structure stores, for each spatial block of a label
volume, a histogram of `(label, count)` pairs over the underlying voxels,
with identical histograms across blocks deduplicated into shared storage.
It is used by Paintera to build multi-resolution label pyramids.

`nifty.tools` exposes three functions / classes — `readSubset`,
`downsampleMultiset`, and `MultisetMerger` — operating on five flat arrays
(`offsets`, `entry_sizes`, `entry_offsets`, `ids`, `counts`). `bioimage-cpp`
keeps the same algorithm and storage layout but wraps it in a
`LabelMultiset` dataclass plus a level-0 bootstrap helper.

Nifty:

```python
import nifty.tools as nt

# nifty does not provide a "from labels" helper; level-0 multisets are
# typically constructed by the caller (e.g. by writing histograms manually
# to N5 chunks).
blocking = nt.blocking([0, 0, 0], list(labels.shape), [2, 2, 2])
argmax, new_offsets, new_ids, new_counts = nt.downsampleMultiset(
blocking, offsets, entry_sizes, entry_offsets, ids, counts,
restrict_set=-1,
)

ids, counts = nt.readSubset(offsets, sizes, ids, counts, True)

merger = nt.MultisetMerger(unique_offsets, entry_sizes, ids, counts)
merger.update(unique_offsets, entry_sizes, ids, counts, offsets)
```

bioimage-cpp:

```python
import bioimage_cpp as bic
from bioimage_cpp.label_multiset import (
LabelMultiset,
MultisetMerger,
downsample_multiset,
multiset_from_labels,
read_subset,
)

# Build the level-0 multiset directly from a label volume.
ms0 = multiset_from_labels(labels, block_shape=(1, 1, 1))

# Downsample one level. `blocking.roi_end` must match the input's spatial
# extent (i.e. the shape used to build ms0).
blocking = bic.utils.Blocking([0, 0, 0], list(labels.shape), [2, 2, 2])
ms1 = downsample_multiset(ms0, blocking, restrict_set=-1)

# Merge entries from a list of (offset, size) ranges into one histogram.
ids, counts = read_subset(offsets, sizes, ms1.ids, ms1.counts)

# Deduplicating merger — constructor takes one offset per unique entry.
merger = MultisetMerger.from_multiset(ms1)
merger.update(unique_offsets, entry_sizes, ids, counts, offsets)
```

Name and API changes:

| nifty-style name | bioimage-cpp name |
| --- | --- |
| `readSubset` | `read_subset` |
| `downsampleMultiset` | `downsample_multiset` |
| `MultisetMerger` | `MultisetMerger` |
| `MultisetMerger.get_ids()` | `MultisetMerger.ids` (property) |
| `MultisetMerger.get_counts()` | `MultisetMerger.counts` |
| `restrict_set` (keyword) | `restrict_set` (keyword, same default `-1`) |

Notes:

- A `LabelMultiset` carries all five arrays (`offsets`, `entry_offsets`,
`entry_sizes`, `ids`, `counts`) plus `argmax`. Nifty's
`downsampleMultiset` returns only four of them and leaves the caller to
reconstruct `entry_offsets` / `entry_sizes`; `bioimage-cpp` returns them
directly so multi-level downsample chains compose without bookkeeping.
- `multiset_from_labels(labels, block_shape)` builds the level-0 multiset
from a `uint32` or `uint64` label volume in one call. There is no nifty
equivalent.
- `MultisetMerger.__init__` takes one offset per unique entry (length
`n_unique`), matching nifty's contract. Use
`MultisetMerger.from_multiset(ms)` to construct one directly from a
`LabelMultiset`.
- Count dtype is `uint32` (nifty uses `int32`). Convert at the boundary
if you are reading nifty-written data.
- 2D and 3D blockings are both supported. The bindings instantiate `uint64`
ids, `uint32` counts, and `uint64` offsets; wider dtype matrices can be
added on demand.
- The `LabelMultisetWrapper` z5/N5 reader from
`nifty/tools/label_multiset_wrapper.hxx` is intentionally **not** ported
— I/O stays out of the C++ core. Read/write Paintera-format chunks with
`zarr`/`numpy` in Python if needed.

### Lifted Multicut

Nifty exposes lifted multicut through a separate objective + solver hierarchy.
Expand Down
229 changes: 229 additions & 0 deletions development/label_multiset/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
"""Benchmark the bioimage-cpp label_multiset implementation against nifty.

Runs:
- multiset_from_labels (bioimage-cpp only — nifty has no direct equivalent;
upstream code typically writes the level-0 multiset out manually)
- downsampleMultiset
- readSubset
- MultisetMerger.update

on a deterministic 3D label volume. Prints a comparison table and verifies
that the results agree numerically.

Run with:
python development/label_multiset/benchmark.py
"""
from __future__ import annotations

import time
from dataclasses import dataclass
from typing import Callable, Tuple

import numpy as np

from bioimage_cpp._core import Blocking as BicBlocking
from bioimage_cpp.label_multiset import (
MultisetMerger,
downsample_multiset,
multiset_from_labels,
read_subset,
)

try:
import nifty.tools as nt

HAVE_NIFTY = True
except ImportError:
HAVE_NIFTY = False


SHAPE = (256, 256, 256)
N_LABELS = 2000
COARSEN = 2 # voxels per coarse cell (smaller -> more variety per downsample block)
DOWN_BLOCK = (2, 2, 2)
N_SUBSET_QUERIES = 5000
SUBSET_RANGE = 64
N_REPEATS = 3


def make_labels(seed: int = 0) -> np.ndarray:
rng = np.random.default_rng(seed)
coarse_shape = tuple(s // COARSEN for s in SHAPE)
coarse = rng.integers(0, N_LABELS, size=coarse_shape, dtype=np.uint64)
return np.kron(coarse, np.ones((COARSEN,) * len(SHAPE), dtype=np.uint64))


@dataclass
class TimedResult:
seconds: float
value: object


def time_best(fn: Callable[[], object], repeats: int = N_REPEATS) -> TimedResult:
times = []
value = None
for _ in range(repeats):
t0 = time.perf_counter()
value = fn()
t1 = time.perf_counter()
times.append(t1 - t0)
return TimedResult(min(times), value)


def fmt_row(name: str, ours: float, theirs: float | None) -> str:
if theirs is None:
return f" {name:<32} ours: {ours*1000:8.2f} ms nifty: (n/a)"
ratio = ours / theirs if theirs > 0 else float("inf")
return (
f" {name:<32} ours: {ours*1000:8.2f} ms nifty: {theirs*1000:8.2f} ms"
f" ratio: {ratio:5.2f}x"
)


def bench_from_labels(labels: np.ndarray) -> TimedResult:
return time_best(lambda: multiset_from_labels(labels, (1, 1, 1)))


def bench_downsample_ours(ms) -> Tuple[TimedResult, object]:
blocking = BicBlocking([0, 0, 0], list(SHAPE), list(DOWN_BLOCK))
res = time_best(lambda: downsample_multiset(ms, blocking))
return res, blocking


def bench_downsample_nifty(ms) -> TimedResult | None:
if not HAVE_NIFTY:
return None
n_blocking = nt.blocking(
roiBegin=[0, 0, 0], roiEnd=list(SHAPE), blockShape=list(DOWN_BLOCK)
)
n_offsets = ms.offsets.astype(np.uint64)
n_entry_sizes = ms.entry_sizes.astype(np.uint64)
n_entry_offsets = ms.entry_offsets.astype(np.uint64)
n_ids = ms.ids.astype(np.uint64)
n_counts = ms.counts.astype(np.int32)

return time_best(
lambda: nt.downsampleMultiset(
n_blocking, n_offsets, n_entry_sizes, n_entry_offsets,
n_ids, n_counts, restrict_set=-1,
)
)


def bench_read_subset(ms) -> Tuple[TimedResult, TimedResult | None]:
rng = np.random.default_rng(1)
# Random sub-multisets: pick N_SUBSET_QUERIES random spatial positions and
# gather their (offset, size) ranges.
n_spatial = ms.n_spatial
positions = rng.integers(0, n_spatial, size=N_SUBSET_QUERIES)
range_size = SUBSET_RANGE # how many entries to merge per query
offsets_list = []
sizes_list = []
for p in positions:
start = max(0, p - range_size // 2)
end = min(n_spatial, start + range_size)
offsets_list.append(ms.offsets[start:end].astype(np.uint64))
sizes_list.append(
ms.entry_sizes[ms.entry_offsets[start:end]].astype(np.uint64)
)
flat_offsets = np.concatenate(offsets_list)
flat_sizes = np.concatenate(sizes_list)

ours = time_best(lambda: read_subset(flat_offsets, flat_sizes, ms.ids, ms.counts))

if HAVE_NIFTY:
ids_int32 = ms.ids.astype(np.uint64)
counts_int32 = ms.counts.astype(np.int32)
theirs = time_best(
lambda: nt.readSubset(
flat_offsets, flat_sizes, ids_int32, counts_int32, True
)
)
else:
theirs = None
return ours, theirs


def bench_merger(ms_downsampled) -> Tuple[TimedResult, TimedResult | None]:
# Build a merger from the downsampled multiset, then update with itself.
# The constructor expects one offset per unique entry (length n_unique).
entry_sizes = ms_downsampled.entry_sizes.astype(np.uint64)
ids = ms_downsampled.ids.astype(np.uint64)
counts = ms_downsampled.counts.astype(np.uint32)
counts_i32 = ms_downsampled.counts.astype(np.int32)

unique_off = np.array(
[int(ms_downsampled.offsets[
np.where(ms_downsampled.entry_offsets == e)[0][0]])
for e in range(ms_downsampled.n_entries)],
dtype=np.uint64,
)

def run_ours():
m = MultisetMerger(unique_off, entry_sizes, ids, counts)
spatial = ms_downsampled.entry_offsets.astype(np.uint64).copy()
m.update(unique_off, entry_sizes, ids, counts, spatial)
return m

ours = time_best(run_ours, repeats=N_REPEATS)

if HAVE_NIFTY:
def run_nifty():
m = nt.MultisetMerger(unique_off, entry_sizes, ids, counts_i32)
spatial = ms_downsampled.entry_offsets.astype(np.uint64).copy()
m.update(unique_off, entry_sizes, ids, counts_i32, spatial)
return m
theirs = time_best(run_nifty, repeats=N_REPEATS)
else:
theirs = None
return ours, theirs


def main() -> None:
print(f"shape={SHAPE} labels={N_LABELS} downsample={DOWN_BLOCK} repeats={N_REPEATS}")
print(f"nifty available: {HAVE_NIFTY}")
print()
print("Generating label volume...")
labels = make_labels()
print(f" unique labels in volume: {len(np.unique(labels))}")
print()

print("Benchmarks (best of N):")
t_from = bench_from_labels(labels)
print(fmt_row("multiset_from_labels (1,1,1)", t_from.seconds, None))
ms0 = t_from.value

t_down, blocking = bench_downsample_ours(ms0)
t_down_nifty = bench_downsample_nifty(ms0)
print(fmt_row(
f"downsample_multiset {DOWN_BLOCK}",
t_down.seconds,
t_down_nifty.seconds if t_down_nifty else None,
))
ms1 = t_down.value
print(f" -> level-1: n_spatial={ms1.n_spatial} n_entries={ms1.n_entries}")

t_read_ours, t_read_nifty = bench_read_subset(ms0)
print(fmt_row(
f"read_subset (x{N_SUBSET_QUERIES})",
t_read_ours.seconds,
t_read_nifty.seconds if t_read_nifty else None,
))

t_merger_ours, t_merger_nifty = bench_merger(ms1)
print(fmt_row(
"MultisetMerger.update",
t_merger_ours.seconds,
t_merger_nifty.seconds if t_merger_nifty else None,
))

# Correctness cross-check on downsample.
if HAVE_NIFTY and t_down_nifty is not None:
bic_argmax = ms1.argmax
n_argmax = t_down_nifty.value[0]
assert bic_argmax.tolist() == n_argmax.tolist(), "argmax mismatch vs nifty!"
print("\nargmax(downsample) cross-check vs nifty: OK")


if __name__ == "__main__":
main()
Loading
Loading