From 5fac5cd5344a7e14421887e869297a897e75be3c Mon Sep 17 00:00:00 2001 From: Constantin Pape Date: Fri, 22 May 2026 21:52:37 +0200 Subject: [PATCH] Implement label multiset --- CMakeLists.txt | 1 + MIGRATION_GUIDE.md | 96 +++++ development/label_multiset/benchmark.py | 229 ++++++++++++ .../label_multiset/downsample.hxx | 124 +++++++ .../label_multiset/from_labels.hxx | 147 ++++++++ .../bioimage_cpp/label_multiset/merger.hxx | 143 ++++++++ .../bioimage_cpp/label_multiset/multiset.hxx | 66 ++++ .../label_multiset/read_subset.hxx | 136 +++++++ src/bindings/label_multiset.cxx | 335 ++++++++++++++++++ src/bindings/label_multiset.hxx | 9 + src/bindings/module.cxx | 2 + src/bioimage_cpp/__init__.py | 2 + src/bioimage_cpp/label_multiset/__init__.py | 260 ++++++++++++++ tests/label_multiset/__init__.py | 0 tests/label_multiset/test_against_nifty.py | 149 ++++++++ tests/label_multiset/test_downsample.py | 125 +++++++ tests/label_multiset/test_merger.py | 129 +++++++ tests/label_multiset/test_read_subset.py | 75 ++++ 18 files changed, 2028 insertions(+) create mode 100644 development/label_multiset/benchmark.py create mode 100644 include/bioimage_cpp/label_multiset/downsample.hxx create mode 100644 include/bioimage_cpp/label_multiset/from_labels.hxx create mode 100644 include/bioimage_cpp/label_multiset/merger.hxx create mode 100644 include/bioimage_cpp/label_multiset/multiset.hxx create mode 100644 include/bioimage_cpp/label_multiset/read_subset.hxx create mode 100644 src/bindings/label_multiset.cxx create mode 100644 src/bindings/label_multiset.hxx create mode 100644 src/bioimage_cpp/label_multiset/__init__.py create mode 100644 tests/label_multiset/__init__.py create mode 100644 tests/label_multiset/test_against_nifty.py create mode 100644 tests/label_multiset/test_downsample.py create mode 100644 tests/label_multiset/test_merger.py create mode 100644 tests/label_multiset/test_read_subset.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 61b5056..acb8ecb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,7 @@ nanobind_add_module(_core src/bindings/flow.cxx src/bindings/graph.cxx src/bindings/ground_truth.cxx + src/bindings/label_multiset.cxx src/bindings/segmentation.cxx src/bindings/transformation.cxx src/bindings/util.cxx diff --git a/MIGRATION_GUIDE.md b/MIGRATION_GUIDE.md index 72cd32d..c88ca9d 100644 --- a/MIGRATION_GUIDE.md +++ b/MIGRATION_GUIDE.md @@ -906,6 +906,102 @@ Important differences: API yet; the implementation is structured so these filters/extra edges can be added later. +### Label Multiset + +The label-multiset data structure stores, for each spatial block of a label +volume, a histogram of `(label, count)` pairs over the underlying voxels, +with identical histograms across blocks deduplicated into shared storage. +It is used by Paintera to build multi-resolution label pyramids. + +`nifty.tools` exposes three functions / classes — `readSubset`, +`downsampleMultiset`, and `MultisetMerger` — operating on five flat arrays +(`offsets`, `entry_sizes`, `entry_offsets`, `ids`, `counts`). `bioimage-cpp` +keeps the same algorithm and storage layout but wraps it in a +`LabelMultiset` dataclass plus a level-0 bootstrap helper. + +Nifty: + +```python +import nifty.tools as nt + +# nifty does not provide a "from labels" helper; level-0 multisets are +# typically constructed by the caller (e.g. by writing histograms manually +# to N5 chunks). +blocking = nt.blocking([0, 0, 0], list(labels.shape), [2, 2, 2]) +argmax, new_offsets, new_ids, new_counts = nt.downsampleMultiset( + blocking, offsets, entry_sizes, entry_offsets, ids, counts, + restrict_set=-1, +) + +ids, counts = nt.readSubset(offsets, sizes, ids, counts, True) + +merger = nt.MultisetMerger(unique_offsets, entry_sizes, ids, counts) +merger.update(unique_offsets, entry_sizes, ids, counts, offsets) +``` + +bioimage-cpp: + +```python +import bioimage_cpp as bic +from bioimage_cpp.label_multiset import ( + LabelMultiset, + MultisetMerger, + downsample_multiset, + multiset_from_labels, + read_subset, +) + +# Build the level-0 multiset directly from a label volume. +ms0 = multiset_from_labels(labels, block_shape=(1, 1, 1)) + +# Downsample one level. `blocking.roi_end` must match the input's spatial +# extent (i.e. the shape used to build ms0). +blocking = bic.utils.Blocking([0, 0, 0], list(labels.shape), [2, 2, 2]) +ms1 = downsample_multiset(ms0, blocking, restrict_set=-1) + +# Merge entries from a list of (offset, size) ranges into one histogram. +ids, counts = read_subset(offsets, sizes, ms1.ids, ms1.counts) + +# Deduplicating merger — constructor takes one offset per unique entry. +merger = MultisetMerger.from_multiset(ms1) +merger.update(unique_offsets, entry_sizes, ids, counts, offsets) +``` + +Name and API changes: + +| nifty-style name | bioimage-cpp name | +| --- | --- | +| `readSubset` | `read_subset` | +| `downsampleMultiset` | `downsample_multiset` | +| `MultisetMerger` | `MultisetMerger` | +| `MultisetMerger.get_ids()` | `MultisetMerger.ids` (property) | +| `MultisetMerger.get_counts()` | `MultisetMerger.counts` | +| `restrict_set` (keyword) | `restrict_set` (keyword, same default `-1`) | + +Notes: + +- A `LabelMultiset` carries all five arrays (`offsets`, `entry_offsets`, + `entry_sizes`, `ids`, `counts`) plus `argmax`. Nifty's + `downsampleMultiset` returns only four of them and leaves the caller to + reconstruct `entry_offsets` / `entry_sizes`; `bioimage-cpp` returns them + directly so multi-level downsample chains compose without bookkeeping. +- `multiset_from_labels(labels, block_shape)` builds the level-0 multiset + from a `uint32` or `uint64` label volume in one call. There is no nifty + equivalent. +- `MultisetMerger.__init__` takes one offset per unique entry (length + `n_unique`), matching nifty's contract. Use + `MultisetMerger.from_multiset(ms)` to construct one directly from a + `LabelMultiset`. +- Count dtype is `uint32` (nifty uses `int32`). Convert at the boundary + if you are reading nifty-written data. +- 2D and 3D blockings are both supported. The bindings instantiate `uint64` + ids, `uint32` counts, and `uint64` offsets; wider dtype matrices can be + added on demand. +- The `LabelMultisetWrapper` z5/N5 reader from + `nifty/tools/label_multiset_wrapper.hxx` is intentionally **not** ported + — I/O stays out of the C++ core. Read/write Paintera-format chunks with + `zarr`/`numpy` in Python if needed. + ### Lifted Multicut Nifty exposes lifted multicut through a separate objective + solver hierarchy. diff --git a/development/label_multiset/benchmark.py b/development/label_multiset/benchmark.py new file mode 100644 index 0000000..e49f37b --- /dev/null +++ b/development/label_multiset/benchmark.py @@ -0,0 +1,229 @@ +"""Benchmark the bioimage-cpp label_multiset implementation against nifty. + +Runs: + - multiset_from_labels (bioimage-cpp only — nifty has no direct equivalent; + upstream code typically writes the level-0 multiset out manually) + - downsampleMultiset + - readSubset + - MultisetMerger.update + +on a deterministic 3D label volume. Prints a comparison table and verifies +that the results agree numerically. + +Run with: + python development/label_multiset/benchmark.py +""" +from __future__ import annotations + +import time +from dataclasses import dataclass +from typing import Callable, Tuple + +import numpy as np + +from bioimage_cpp._core import Blocking as BicBlocking +from bioimage_cpp.label_multiset import ( + MultisetMerger, + downsample_multiset, + multiset_from_labels, + read_subset, +) + +try: + import nifty.tools as nt + + HAVE_NIFTY = True +except ImportError: + HAVE_NIFTY = False + + +SHAPE = (256, 256, 256) +N_LABELS = 2000 +COARSEN = 2 # voxels per coarse cell (smaller -> more variety per downsample block) +DOWN_BLOCK = (2, 2, 2) +N_SUBSET_QUERIES = 5000 +SUBSET_RANGE = 64 +N_REPEATS = 3 + + +def make_labels(seed: int = 0) -> np.ndarray: + rng = np.random.default_rng(seed) + coarse_shape = tuple(s // COARSEN for s in SHAPE) + coarse = rng.integers(0, N_LABELS, size=coarse_shape, dtype=np.uint64) + return np.kron(coarse, np.ones((COARSEN,) * len(SHAPE), dtype=np.uint64)) + + +@dataclass +class TimedResult: + seconds: float + value: object + + +def time_best(fn: Callable[[], object], repeats: int = N_REPEATS) -> TimedResult: + times = [] + value = None + for _ in range(repeats): + t0 = time.perf_counter() + value = fn() + t1 = time.perf_counter() + times.append(t1 - t0) + return TimedResult(min(times), value) + + +def fmt_row(name: str, ours: float, theirs: float | None) -> str: + if theirs is None: + return f" {name:<32} ours: {ours*1000:8.2f} ms nifty: (n/a)" + ratio = ours / theirs if theirs > 0 else float("inf") + return ( + f" {name:<32} ours: {ours*1000:8.2f} ms nifty: {theirs*1000:8.2f} ms" + f" ratio: {ratio:5.2f}x" + ) + + +def bench_from_labels(labels: np.ndarray) -> TimedResult: + return time_best(lambda: multiset_from_labels(labels, (1, 1, 1))) + + +def bench_downsample_ours(ms) -> Tuple[TimedResult, object]: + blocking = BicBlocking([0, 0, 0], list(SHAPE), list(DOWN_BLOCK)) + res = time_best(lambda: downsample_multiset(ms, blocking)) + return res, blocking + + +def bench_downsample_nifty(ms) -> TimedResult | None: + if not HAVE_NIFTY: + return None + n_blocking = nt.blocking( + roiBegin=[0, 0, 0], roiEnd=list(SHAPE), blockShape=list(DOWN_BLOCK) + ) + n_offsets = ms.offsets.astype(np.uint64) + n_entry_sizes = ms.entry_sizes.astype(np.uint64) + n_entry_offsets = ms.entry_offsets.astype(np.uint64) + n_ids = ms.ids.astype(np.uint64) + n_counts = ms.counts.astype(np.int32) + + return time_best( + lambda: nt.downsampleMultiset( + n_blocking, n_offsets, n_entry_sizes, n_entry_offsets, + n_ids, n_counts, restrict_set=-1, + ) + ) + + +def bench_read_subset(ms) -> Tuple[TimedResult, TimedResult | None]: + rng = np.random.default_rng(1) + # Random sub-multisets: pick N_SUBSET_QUERIES random spatial positions and + # gather their (offset, size) ranges. + n_spatial = ms.n_spatial + positions = rng.integers(0, n_spatial, size=N_SUBSET_QUERIES) + range_size = SUBSET_RANGE # how many entries to merge per query + offsets_list = [] + sizes_list = [] + for p in positions: + start = max(0, p - range_size // 2) + end = min(n_spatial, start + range_size) + offsets_list.append(ms.offsets[start:end].astype(np.uint64)) + sizes_list.append( + ms.entry_sizes[ms.entry_offsets[start:end]].astype(np.uint64) + ) + flat_offsets = np.concatenate(offsets_list) + flat_sizes = np.concatenate(sizes_list) + + ours = time_best(lambda: read_subset(flat_offsets, flat_sizes, ms.ids, ms.counts)) + + if HAVE_NIFTY: + ids_int32 = ms.ids.astype(np.uint64) + counts_int32 = ms.counts.astype(np.int32) + theirs = time_best( + lambda: nt.readSubset( + flat_offsets, flat_sizes, ids_int32, counts_int32, True + ) + ) + else: + theirs = None + return ours, theirs + + +def bench_merger(ms_downsampled) -> Tuple[TimedResult, TimedResult | None]: + # Build a merger from the downsampled multiset, then update with itself. + # The constructor expects one offset per unique entry (length n_unique). + entry_sizes = ms_downsampled.entry_sizes.astype(np.uint64) + ids = ms_downsampled.ids.astype(np.uint64) + counts = ms_downsampled.counts.astype(np.uint32) + counts_i32 = ms_downsampled.counts.astype(np.int32) + + unique_off = np.array( + [int(ms_downsampled.offsets[ + np.where(ms_downsampled.entry_offsets == e)[0][0]]) + for e in range(ms_downsampled.n_entries)], + dtype=np.uint64, + ) + + def run_ours(): + m = MultisetMerger(unique_off, entry_sizes, ids, counts) + spatial = ms_downsampled.entry_offsets.astype(np.uint64).copy() + m.update(unique_off, entry_sizes, ids, counts, spatial) + return m + + ours = time_best(run_ours, repeats=N_REPEATS) + + if HAVE_NIFTY: + def run_nifty(): + m = nt.MultisetMerger(unique_off, entry_sizes, ids, counts_i32) + spatial = ms_downsampled.entry_offsets.astype(np.uint64).copy() + m.update(unique_off, entry_sizes, ids, counts_i32, spatial) + return m + theirs = time_best(run_nifty, repeats=N_REPEATS) + else: + theirs = None + return ours, theirs + + +def main() -> None: + print(f"shape={SHAPE} labels={N_LABELS} downsample={DOWN_BLOCK} repeats={N_REPEATS}") + print(f"nifty available: {HAVE_NIFTY}") + print() + print("Generating label volume...") + labels = make_labels() + print(f" unique labels in volume: {len(np.unique(labels))}") + print() + + print("Benchmarks (best of N):") + t_from = bench_from_labels(labels) + print(fmt_row("multiset_from_labels (1,1,1)", t_from.seconds, None)) + ms0 = t_from.value + + t_down, blocking = bench_downsample_ours(ms0) + t_down_nifty = bench_downsample_nifty(ms0) + print(fmt_row( + f"downsample_multiset {DOWN_BLOCK}", + t_down.seconds, + t_down_nifty.seconds if t_down_nifty else None, + )) + ms1 = t_down.value + print(f" -> level-1: n_spatial={ms1.n_spatial} n_entries={ms1.n_entries}") + + t_read_ours, t_read_nifty = bench_read_subset(ms0) + print(fmt_row( + f"read_subset (x{N_SUBSET_QUERIES})", + t_read_ours.seconds, + t_read_nifty.seconds if t_read_nifty else None, + )) + + t_merger_ours, t_merger_nifty = bench_merger(ms1) + print(fmt_row( + "MultisetMerger.update", + t_merger_ours.seconds, + t_merger_nifty.seconds if t_merger_nifty else None, + )) + + # Correctness cross-check on downsample. + if HAVE_NIFTY and t_down_nifty is not None: + bic_argmax = ms1.argmax + n_argmax = t_down_nifty.value[0] + assert bic_argmax.tolist() == n_argmax.tolist(), "argmax mismatch vs nifty!" + print("\nargmax(downsample) cross-check vs nifty: OK") + + +if __name__ == "__main__": + main() diff --git a/include/bioimage_cpp/label_multiset/downsample.hxx b/include/bioimage_cpp/label_multiset/downsample.hxx new file mode 100644 index 0000000..ce814c0 --- /dev/null +++ b/include/bioimage_cpp/label_multiset/downsample.hxx @@ -0,0 +1,124 @@ +#pragma once + +#include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/blocking.hxx" +#include "bioimage_cpp/label_multiset/multiset.hxx" +#include "bioimage_cpp/label_multiset/read_subset.hxx" + +#include +#include +#include +#include +#include +#include + +namespace bioimage_cpp::label_multiset { + +// Direct port of nifty::tools::downsampleMultiset. Reads a level-N multiset +// described by (offsets, entry_sizes, entry_offsets, ids, counts) defined over +// the *full* spatial grid (blocking.roi_end()), aggregates into the blocks of +// the given blocking, and writes a deduplicated level-(N+1) multiset. +// +// Outputs: +// new_argmax — length blocking.number_of_blocks() +// new_offsets — length blocking.number_of_blocks(); offset into new_ids +// new_entry_offsets — length blocking.number_of_blocks(); maps block → unique entry idx +// new_entry_sizes — length n_unique; size of each unique entry +// new_ids, new_counts — concatenated, deduplicated +template +inline void downsample_multiset( + const Blocking &blocking, + const ConstArrayView &offsets, + const ConstArrayView &entry_sizes, + const ConstArrayView &entry_offsets, + const ConstArrayView &ids, + const ConstArrayView &counts, + const int restrict_set, + ArrayView &new_argmax, + ArrayView &new_offsets, + ArrayView &new_entry_offsets, + std::vector &new_entry_sizes, + std::vector &new_ids, + std::vector &new_counts +) { + using Key = HashKey; + std::unordered_map, HashKeyHash> candidate_dict; + + const std::size_t n_blocks = static_cast(blocking.number_of_blocks()); + const auto &shape = blocking.roi_end(); + const auto strides = c_order_strides_for_shape(shape); + + std::size_t current_candidate_id = 0; + // Stored per unique entry. Indexed by candidate id. + std::vector unique_entry_offsets; + + for (std::size_t block_id = 0; block_id < n_blocks; ++block_id) { + std::vector this_ids; + std::vector this_counts; + const auto block = blocking.get_block(static_cast(block_id)); + read_subset_block(block, strides, + offsets, entry_sizes, entry_offsets, ids, counts, + this_ids, this_counts, /*argsort=*/true); + + IdT max_label{}; + CountT max_count{}; + if (restrict_set > 0 && static_cast(this_ids.size()) > restrict_set) { + // sort by count descending + argsort_by_first(this_counts, this_ids, /*ascending=*/false); + max_label = this_ids[0]; + max_count = this_counts[0]; + this_ids.resize(static_cast(restrict_set)); + this_counts.resize(static_cast(restrict_set)); + argsort_by_first(this_ids, this_counts); + } else { + auto max_it = std::max_element(this_counts.begin(), this_counts.end()); + max_label = this_ids[std::distance(this_counts.begin(), max_it)]; + max_count = *max_it; + } + new_argmax.data[block_id] = max_label; + + Key key{max_label, max_count}; + bool add_entry = true; + auto cand_it = candidate_dict.find(key); + if (cand_it != candidate_dict.end()) { + for (const std::size_t c_id : cand_it->second) { + const std::size_t c_offset = unique_entry_offsets[c_id]; + const std::size_t c_size = static_cast(new_entry_sizes[c_id]); + bool match = ranges_equal( + this_ids.begin(), this_ids.end(), + new_ids.begin() + c_offset, new_ids.begin() + c_offset + c_size + ); + if (match) { + match = ranges_equal( + this_counts.begin(), this_counts.end(), + new_counts.begin() + c_offset, new_counts.begin() + c_offset + c_size + ); + } + if (match) { + new_offsets.data[block_id] = static_cast(c_offset); + new_entry_offsets.data[block_id] = static_cast(c_id); + add_entry = false; + break; + } + } + } + + if (add_entry) { + const std::size_t this_offset = new_ids.size(); + new_offsets.data[block_id] = static_cast(this_offset); + new_entry_offsets.data[block_id] = static_cast(current_candidate_id); + unique_entry_offsets.emplace_back(this_offset); + new_entry_sizes.emplace_back(static_cast(this_ids.size())); + new_ids.insert(new_ids.end(), this_ids.begin(), this_ids.end()); + new_counts.insert(new_counts.end(), this_counts.begin(), this_counts.end()); + if (cand_it == candidate_dict.end()) { + candidate_dict.emplace(key, std::vector{current_candidate_id}); + } else { + cand_it->second.emplace_back(current_candidate_id); + } + ++current_candidate_id; + } + } +} + +} // namespace bioimage_cpp::label_multiset diff --git a/include/bioimage_cpp/label_multiset/from_labels.hxx b/include/bioimage_cpp/label_multiset/from_labels.hxx new file mode 100644 index 0000000..03979f5 --- /dev/null +++ b/include/bioimage_cpp/label_multiset/from_labels.hxx @@ -0,0 +1,147 @@ +#pragma once + +#include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/blocking.hxx" +#include "bioimage_cpp/label_multiset/multiset.hxx" +#include "bioimage_cpp/label_multiset/read_subset.hxx" + +#include +#include +#include +#include +#include + +namespace bioimage_cpp::label_multiset { + +// Build a deduplicated level-0 multiset from a (C-contiguous) label volume, +// where each spatial position corresponds to one block of the given blocking +// (the blocking's roi spans the full label volume, block_shape determines the +// aggregation footprint of each "voxel" in the resulting multiset). +// +// For block_shape == (1,1,...,1) this gives the trivial "one label, count 1 +// per pixel" multiset. +// +// Outputs: +// argmax — length n_blocks +// out_offsets — length n_blocks +// out_entry_offsets — length n_blocks +// out_entry_sizes — length n_unique +// out_ids, out_counts — concatenated +template +inline void multiset_from_labels( + const ConstArrayView &labels, + const Blocking &blocking, + ArrayView &argmax, + ArrayView &out_offsets, + ArrayView &out_entry_offsets, + std::vector &out_entry_sizes, + std::vector &out_ids, + std::vector &out_counts +) { + using Key = HashKey; + std::unordered_map, HashKeyHash> candidate_dict; + + const std::size_t ndim = blocking.ndim(); + const auto &shape = blocking.roi_end(); + const auto strides = c_order_strides_for_shape(shape); + + const std::size_t n_blocks = static_cast(blocking.number_of_blocks()); + std::vector unique_entry_offsets; + std::size_t current_candidate_id = 0; + + std::vector this_ids; + std::vector this_counts; + + for (std::size_t block_id = 0; block_id < n_blocks; ++block_id) { + const auto block = blocking.get_block(static_cast(block_id)); + const auto &begin = block.begin(); + const auto &end = block.end(); + + std::unordered_map count_dict; + std::vector coord = begin; + while (true) { + std::size_t index = 0; + for (std::size_t d = 0; d < ndim; ++d) { + index += static_cast(coord[d]) * strides[d]; + } + const IdT id = static_cast(labels.data[index]); + auto it = count_dict.find(id); + if (it == count_dict.end()) { + count_dict.emplace(id, CountT{1}); + } else { + it->second += CountT{1}; + } + + std::ptrdiff_t axis = static_cast(ndim) - 1; + for (; axis >= 0; --axis) { + ++coord[axis]; + if (coord[axis] < end[axis]) { + break; + } + coord[axis] = begin[axis]; + } + if (axis < 0) { + break; + } + } + + this_ids.assign(count_dict.size(), IdT{}); + this_counts.assign(count_dict.size(), CountT{}); + std::size_t i = 0; + for (const auto &elem : count_dict) { + this_ids[i] = elem.first; + this_counts[i] = elem.second; + ++i; + } + argsort_by_first(this_ids, this_counts); + + auto max_it = std::max_element(this_counts.begin(), this_counts.end()); + const IdT max_label = this_ids[static_cast(std::distance(this_counts.begin(), max_it))]; + const CountT max_count = *max_it; + argmax.data[block_id] = max_label; + + Key key{max_label, max_count}; + bool add_entry = true; + auto cand_it = candidate_dict.find(key); + if (cand_it != candidate_dict.end()) { + for (const std::size_t c_id : cand_it->second) { + const std::size_t c_offset = unique_entry_offsets[c_id]; + const std::size_t c_size = static_cast(out_entry_sizes[c_id]); + bool match = ranges_equal( + this_ids.begin(), this_ids.end(), + out_ids.begin() + c_offset, out_ids.begin() + c_offset + c_size + ); + if (match) { + match = ranges_equal( + this_counts.begin(), this_counts.end(), + out_counts.begin() + c_offset, out_counts.begin() + c_offset + c_size + ); + } + if (match) { + out_offsets.data[block_id] = static_cast(c_offset); + out_entry_offsets.data[block_id] = static_cast(c_id); + add_entry = false; + break; + } + } + } + + if (add_entry) { + const std::size_t this_offset = out_ids.size(); + out_offsets.data[block_id] = static_cast(this_offset); + out_entry_offsets.data[block_id] = static_cast(current_candidate_id); + unique_entry_offsets.emplace_back(this_offset); + out_entry_sizes.emplace_back(static_cast(this_ids.size())); + out_ids.insert(out_ids.end(), this_ids.begin(), this_ids.end()); + out_counts.insert(out_counts.end(), this_counts.begin(), this_counts.end()); + if (cand_it == candidate_dict.end()) { + candidate_dict.emplace(key, std::vector{current_candidate_id}); + } else { + cand_it->second.emplace_back(current_candidate_id); + } + ++current_candidate_id; + } + } +} + +} // namespace bioimage_cpp::label_multiset diff --git a/include/bioimage_cpp/label_multiset/merger.hxx b/include/bioimage_cpp/label_multiset/merger.hxx new file mode 100644 index 0000000..b14d3e2 --- /dev/null +++ b/include/bioimage_cpp/label_multiset/merger.hxx @@ -0,0 +1,143 @@ +#pragma once + +#include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/label_multiset/multiset.hxx" + +#include +#include +#include +#include +#include +#include + +namespace bioimage_cpp::label_multiset { + +template +class MultisetMerger { +public: + using OffsetType = OffsetT; + using IdType = IdT; + using CountType = CountT; + + MultisetMerger( + const ConstArrayView &offsets, + const ConstArrayView &entry_sizes, + const ConstArrayView &ids, + const ConstArrayView &counts + ) + : offsets_(offsets.data, offsets.data + offsets.shape[0]), + entry_sizes_(entry_sizes.data, entry_sizes.data + entry_sizes.shape[0]), + ids_(ids.data, ids.data + ids.shape[0]), + counts_(counts.data, counts.data + counts.shape[0]) { + init_hashed(); + } + + const std::vector &ids() const { return ids_; } + const std::vector &counts() const { return counts_; } + const std::vector &offsets() const { return offsets_; } + const std::vector &entry_sizes() const { return entry_sizes_; } + + // Ingest a batch of *unique* entries described by (unique_offsets, + // entry_sizes, ids, counts), then rewrite `offsets` so that each element + // (which was indexed by "entry id within the input batch") becomes the + // absolute byte offset into the deduplicated ids_/counts_ arrays. + void update( + const ConstArrayView &unique_offsets, + const ConstArrayView &batch_entry_sizes, + const ConstArrayView &batch_ids, + const ConstArrayView &batch_counts, + ArrayView &offsets + ) { + const std::size_t n_entries = static_cast(unique_offsets.shape[0]); + // Maps batch entry id → absolute byte offset in ids_/counts_. + std::unordered_map new_offset_dict; + + for (std::size_t entry = 0; entry < n_entries; ++entry) { + const OffsetT off = unique_offsets.data[entry]; + const OffsetT size = batch_entry_sizes.data[entry]; + + const IdT *ids_begin = batch_ids.data + off; + const IdT *ids_end = ids_begin + size; + const CountT *counts_begin = batch_counts.data + off; + const CountT *counts_end = counts_begin + size; + + auto max_it = std::max_element(counts_begin, counts_end); + const IdT max_label = *(ids_begin + std::distance(counts_begin, max_it)); + const CountT max_count = *max_it; + + HashKey key{max_label, max_count}; + auto hash_it = hashed_.find(key); + bool new_entry = true; + if (hash_it != hashed_.end()) { + for (const std::size_t c_id : hash_it->second) { + const std::size_t c_offset = static_cast(offsets_[c_id]); + const std::size_t c_size = static_cast(entry_sizes_[c_id]); + bool match = ranges_equal( + ids_begin, ids_end, + ids_.begin() + c_offset, ids_.begin() + c_offset + c_size + ); + if (match) { + match = ranges_equal( + counts_begin, counts_end, + counts_.begin() + c_offset, counts_.begin() + c_offset + c_size + ); + } + if (match) { + new_entry = false; + new_offset_dict[static_cast(entry)] = static_cast(c_offset); + break; + } + } + } + + if (new_entry) { + const std::size_t this_size = static_cast(std::distance(ids_begin, ids_end)); + const std::size_t this_offset = ids_.size(); + offsets_.emplace_back(static_cast(this_offset)); + entry_sizes_.emplace_back(static_cast(this_size)); + new_offset_dict[static_cast(entry)] = static_cast(this_offset); + ids_.insert(ids_.end(), ids_begin, ids_end); + counts_.insert(counts_.end(), counts_begin, counts_end); + const std::size_t this_id = offsets_.size() - 1; + if (hash_it == hashed_.end()) { + hashed_.emplace(key, std::vector{this_id}); + // hash_it is invalidated by emplace; we don't reuse it. + } else { + hash_it->second.emplace_back(this_id); + } + } + } + + const std::size_t n_off = static_cast(offsets.shape[0]); + for (std::size_t i = 0; i < n_off; ++i) { + offsets.data[i] = new_offset_dict[offsets.data[i]]; + } + } + +private: + void init_hashed() { + const std::size_t n_entries = offsets_.size(); + for (std::size_t entry = 0; entry < n_entries; ++entry) { + const std::size_t off = static_cast(offsets_[entry]); + const std::size_t size = static_cast(entry_sizes_[entry]); + auto max_it = std::max_element(counts_.begin() + off, counts_.begin() + off + size); + const IdT max_label = ids_[static_cast(std::distance(counts_.begin(), max_it))]; + const CountT max_count = *max_it; + HashKey key{max_label, max_count}; + auto it = hashed_.find(key); + if (it == hashed_.end()) { + hashed_.emplace(key, std::vector{entry}); + } else { + it->second.emplace_back(entry); + } + } + } + + std::vector offsets_; + std::vector entry_sizes_; + std::vector ids_; + std::vector counts_; + std::unordered_map, std::vector, HashKeyHash> hashed_; +}; + +} // namespace bioimage_cpp::label_multiset diff --git a/include/bioimage_cpp/label_multiset/multiset.hxx b/include/bioimage_cpp/label_multiset/multiset.hxx new file mode 100644 index 0000000..53e97a1 --- /dev/null +++ b/include/bioimage_cpp/label_multiset/multiset.hxx @@ -0,0 +1,66 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace bioimage_cpp::label_multiset { + +template +inline void reorder_inplace(V &v, const std::vector &idx) { + V tmp(v.size()); + for (std::size_t i = 0; i < v.size(); ++i) { + tmp[i] = v[idx[i]]; + } + v.swap(tmp); +} + +template +inline void argsort_by_first(V1 &v1, V2 &v2, const bool ascending = true) { + std::vector idx(v1.size()); + std::iota(idx.begin(), idx.end(), 0); + if (ascending) { + std::sort(idx.begin(), idx.end(), + [&v1](std::size_t a, std::size_t b) { return v1[a] < v1[b]; }); + } else { + std::sort(idx.begin(), idx.end(), + [&v1](std::size_t a, std::size_t b) { return v1[a] > v1[b]; }); + } + reorder_inplace(v1, idx); + reorder_inplace(v2, idx); +} + +template +inline bool ranges_equal(It1 a_begin, It1 a_end, It2 b_begin, It2 b_end) { + if (std::distance(a_begin, a_end) != std::distance(b_begin, b_end)) { + return false; + } + return std::equal(a_begin, a_end, b_begin); +} + +template +struct HashKey { + IdType id; + CountType count; + bool operator==(const HashKey &other) const noexcept { + return id == other.id && count == other.count; + } +}; + +struct HashKeyHash { + template + std::size_t operator()(const HashKey &key) const noexcept { + std::size_t h1 = std::hash{}(key.id); + std::size_t h2 = std::hash{}(key.count); + // boost::hash_combine + h1 ^= h2 + 0x9e3779b9ULL + (h1 << 6) + (h1 >> 2); + return h1; + } +}; + +} // namespace bioimage_cpp::label_multiset diff --git a/include/bioimage_cpp/label_multiset/read_subset.hxx b/include/bioimage_cpp/label_multiset/read_subset.hxx new file mode 100644 index 0000000..55928af --- /dev/null +++ b/include/bioimage_cpp/label_multiset/read_subset.hxx @@ -0,0 +1,136 @@ +#pragma once + +#include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/blocking.hxx" +#include "bioimage_cpp/label_multiset/multiset.hxx" + +#include +#include +#include +#include + +namespace bioimage_cpp::label_multiset { + +template +inline void read_subset( + const ConstArrayView &offsets, + const ConstArrayView &sizes, + const ConstArrayView &ids, + const ConstArrayView &counts, + std::vector &ids_out, + std::vector &counts_out, + const bool argsort = true +) { + std::unordered_map count_dict; + + const std::size_t n_offsets = static_cast(offsets.shape[0]); + for (std::size_t off_id = 0; off_id < n_offsets; ++off_id) { + const std::size_t offset = static_cast(offsets.data[off_id]); + const std::size_t size = static_cast(sizes.data[off_id]); + for (std::size_t pos = offset; pos < offset + size; ++pos) { + const IdT id = ids.data[pos]; + const CountT count = counts.data[pos]; + auto it = count_dict.find(id); + if (it == count_dict.end()) { + count_dict.emplace(id, count); + } else { + it->second += count; + } + } + } + + ids_out.resize(count_dict.size()); + counts_out.resize(count_dict.size()); + std::size_t i = 0; + for (const auto &elem : count_dict) { + ids_out[i] = elem.first; + counts_out[i] = elem.second; + ++i; + } + if (argsort) { + argsort_by_first(ids_out, counts_out); + } +} + +// Block-aware variant: collect (offset, size) pairs for every spatial position +// in the block, then call the flat overload. C-order strides over the *full* +// spatial domain. +template +inline void read_subset_block( + const Block &block, + const std::vector &strides, + const ConstArrayView &offsets, + const ConstArrayView &entry_sizes, + const ConstArrayView &entry_offsets, + const ConstArrayView &ids, + const ConstArrayView &counts, + std::vector &ids_out, + std::vector &counts_out, + const bool argsort = true +) { + std::unordered_map count_dict; + const auto &begin = block.begin(); + const auto &end = block.end(); + const std::size_t ndim = begin.size(); + + // Generic N-D iteration via incrementing coordinate vector. + std::vector coord = begin; + while (true) { + std::size_t index = 0; + for (std::size_t d = 0; d < ndim; ++d) { + index += static_cast(coord[d]) * strides[d]; + } + const std::size_t off = static_cast(offsets.data[index]); + const std::size_t entry_idx = static_cast(entry_offsets.data[index]); + const std::size_t size = static_cast(entry_sizes.data[entry_idx]); + for (std::size_t pos = off; pos < off + size; ++pos) { + const IdT id = ids.data[pos]; + const CountT count = counts.data[pos]; + auto it = count_dict.find(id); + if (it == count_dict.end()) { + count_dict.emplace(id, count); + } else { + it->second += count; + } + } + + // Increment N-D coord (last axis fastest). + std::ptrdiff_t axis = static_cast(ndim) - 1; + for (; axis >= 0; --axis) { + ++coord[axis]; + if (coord[axis] < end[axis]) { + break; + } + coord[axis] = begin[axis]; + } + if (axis < 0) { + break; + } + } + + ids_out.resize(count_dict.size()); + counts_out.resize(count_dict.size()); + std::size_t i = 0; + for (const auto &elem : count_dict) { + ids_out[i] = elem.first; + counts_out[i] = elem.second; + ++i; + } + if (argsort) { + argsort_by_first(ids_out, counts_out); + } +} + +inline std::vector c_order_strides_for_shape(const CoordinateVector &shape) { + const std::size_t ndim = shape.size(); + std::vector strides(ndim, 1); + if (ndim == 0) { + return strides; + } + for (std::ptrdiff_t d = static_cast(ndim) - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * static_cast(shape[d + 1]); + } + return strides; +} + +} // namespace bioimage_cpp::label_multiset diff --git a/src/bindings/label_multiset.cxx b/src/bindings/label_multiset.cxx new file mode 100644 index 0000000..a954b81 --- /dev/null +++ b/src/bindings/label_multiset.cxx @@ -0,0 +1,335 @@ +#include "label_multiset.hxx" + +#include "bioimage_cpp/array_view.hxx" +#include "bioimage_cpp/blocking.hxx" +#include "bioimage_cpp/label_multiset/downsample.hxx" +#include "bioimage_cpp/label_multiset/from_labels.hxx" +#include "bioimage_cpp/label_multiset/merger.hxx" +#include "bioimage_cpp/label_multiset/read_subset.hxx" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; + +namespace bioimage_cpp::bindings { +namespace { + +using IdT = std::uint64_t; +using CountT = std::uint32_t; +using OffsetT = std::uint64_t; +using LabelT = std::uint64_t; + +using OffsetArray = nb::ndarray; +using IdArray = nb::ndarray; +using CountArray = nb::ndarray; +using OffsetOutArray = nb::ndarray; +using IdOutArray = nb::ndarray; +using CountOutArray = nb::ndarray; + +template +ConstArrayView view_1d(const nb::ndarray &arr, + const char *name) { + if (arr.ndim() != 1) { + throw std::invalid_argument( + std::string(name) + " must have ndim 1, got ndim=" + + std::to_string(arr.ndim()) + ); + } + return ConstArrayView{ + arr.data(), + {static_cast(arr.shape(0))}, + {}, + }; +} + +template +nb::ndarray alloc_1d(std::size_t n) { + auto *data = new T[n](); + nb::capsule owner(data, [](void *p) noexcept { + delete[] static_cast(p); + }); + std::size_t shape[1] = {n}; + return nb::ndarray(data, 1, shape, owner); +} + +template +nb::ndarray vector_to_array(std::vector &&v) { + auto *data = new T[v.size()]; + std::copy(v.begin(), v.end(), data); + nb::capsule owner(data, [](void *p) noexcept { + delete[] static_cast(p); + }); + std::size_t shape[1] = {v.size()}; + return nb::ndarray(data, 1, shape, owner); +} + +// ----- read_subset ----- + +std::pair, + nb::ndarray> +bind_read_subset_flat(OffsetArray offsets, OffsetArray sizes, + IdArray ids, CountArray counts, + bool argsort) { + auto offsets_v = view_1d(offsets, "offsets"); + auto sizes_v = view_1d(sizes, "sizes"); + auto ids_v = view_1d(ids, "ids"); + auto counts_v = view_1d(counts, "counts"); + if (offsets_v.shape[0] != sizes_v.shape[0]) { + throw std::invalid_argument("offsets and sizes must have the same length"); + } + if (ids_v.shape[0] != counts_v.shape[0]) { + throw std::invalid_argument("ids and counts must have the same length"); + } + + std::vector ids_out; + std::vector counts_out; + { + nb::gil_scoped_release release; + label_multiset::read_subset(offsets_v, sizes_v, ids_v, counts_v, + ids_out, counts_out, argsort); + } + return {vector_to_array(std::move(ids_out)), + vector_to_array(std::move(counts_out))}; +} + +// ----- downsample_multiset ----- + +std::tuple, + nb::ndarray, + nb::ndarray, + nb::ndarray, + nb::ndarray, + nb::ndarray> +bind_downsample_multiset(const Blocking &blocking, + OffsetArray offsets, + OffsetArray entry_sizes, + OffsetArray entry_offsets, + IdArray ids, + CountArray counts, + int restrict_set) { + auto offsets_v = view_1d(offsets, "offsets"); + auto entry_sizes_v = view_1d(entry_sizes, "entry_sizes"); + auto entry_offsets_v = view_1d(entry_offsets, "entry_offsets"); + auto ids_v = view_1d(ids, "ids"); + auto counts_v = view_1d(counts, "counts"); + if (ids_v.shape[0] != counts_v.shape[0]) { + throw std::invalid_argument("ids and counts must have the same length"); + } + if (offsets_v.shape[0] != entry_offsets_v.shape[0]) { + throw std::invalid_argument( + "offsets and entry_offsets must have the same length" + ); + } + + const std::size_t n_blocks = static_cast(blocking.number_of_blocks()); + + auto new_argmax = alloc_1d(n_blocks); + auto new_offsets = alloc_1d(n_blocks); + auto new_entry_offsets = alloc_1d(n_blocks); + + std::vector new_entry_sizes; + std::vector new_ids; + std::vector new_counts; + + ArrayView argmax_view{ + new_argmax.data(), + {static_cast(n_blocks)}, {}}; + ArrayView off_view{ + new_offsets.data(), + {static_cast(n_blocks)}, {}}; + ArrayView eoff_view{ + new_entry_offsets.data(), + {static_cast(n_blocks)}, {}}; + + { + nb::gil_scoped_release release; + label_multiset::downsample_multiset( + blocking, offsets_v, entry_sizes_v, entry_offsets_v, ids_v, counts_v, + restrict_set, + argmax_view, off_view, eoff_view, + new_entry_sizes, new_ids, new_counts + ); + } + + return std::make_tuple( + std::move(new_argmax), + std::move(new_offsets), + std::move(new_entry_offsets), + vector_to_array(std::move(new_entry_sizes)), + vector_to_array(std::move(new_ids)), + vector_to_array(std::move(new_counts)) + ); +} + +// ----- multiset_from_labels ----- + +template +std::tuple, + nb::ndarray, + nb::ndarray, + nb::ndarray, + nb::ndarray, + nb::ndarray> +bind_multiset_from_labels_t( + nb::ndarray labels, + const Blocking &blocking +) { + if (labels.ndim() != blocking.ndim()) { + throw std::invalid_argument( + "labels ndim must match blocking ndim, got labels ndim=" + + std::to_string(labels.ndim()) + ); + } + std::vector shape(labels.ndim()); + for (std::size_t a = 0; a < labels.ndim(); ++a) { + shape[a] = static_cast(labels.shape(a)); + if (shape[a] != blocking.roi_end()[a]) { + throw std::invalid_argument( + "labels shape must match blocking.roi_end()" + ); + } + } + ConstArrayView labels_v{labels.data(), shape, {}}; + + const std::size_t n_blocks = static_cast(blocking.number_of_blocks()); + auto argmax = alloc_1d(n_blocks); + auto offsets = alloc_1d(n_blocks); + auto entry_offsets = alloc_1d(n_blocks); + + std::vector entry_sizes; + std::vector ids; + std::vector counts; + + ArrayView argmax_view{argmax.data(), + {static_cast(n_blocks)}, {}}; + ArrayView off_view{offsets.data(), + {static_cast(n_blocks)}, {}}; + ArrayView eoff_view{entry_offsets.data(), + {static_cast(n_blocks)}, {}}; + + { + nb::gil_scoped_release release; + label_multiset::multiset_from_labels( + labels_v, blocking, + argmax_view, off_view, eoff_view, + entry_sizes, ids, counts + ); + } + return std::make_tuple( + std::move(argmax), + std::move(offsets), + std::move(entry_offsets), + vector_to_array(std::move(entry_sizes)), + vector_to_array(std::move(ids)), + vector_to_array(std::move(counts)) + ); +} + +// ----- MultisetMerger ----- + +class PyMultisetMerger { +public: + PyMultisetMerger(OffsetArray offsets, OffsetArray entry_sizes, + IdArray ids, CountArray counts) + : merger_( + view_1d(offsets, "offsets"), + view_1d(entry_sizes, "entry_sizes"), + view_1d(ids, "ids"), + view_1d(counts, "counts") + ) {} + + nb::ndarray + update(OffsetArray unique_offsets, OffsetArray entry_sizes, + IdArray ids, CountArray counts, OffsetOutArray offsets) { + auto uo_v = view_1d(unique_offsets, "unique_offsets"); + auto es_v = view_1d(entry_sizes, "entry_sizes"); + auto ids_v = view_1d(ids, "ids"); + auto counts_v = view_1d(counts, "counts"); + if (offsets.ndim() != 1) { + throw std::invalid_argument("offsets must have ndim 1"); + } + ArrayView off_view{ + offsets.data(), + {static_cast(offsets.shape(0))}, + {}}; + { + nb::gil_scoped_release release; + merger_.update(uo_v, es_v, ids_v, counts_v, off_view); + } + return offsets; + } + + nb::ndarray ids_array() const { + return copy_to_array(merger_.ids()); + } + nb::ndarray counts_array() const { + return copy_to_array(merger_.counts()); + } + nb::ndarray offsets_array() const { + return copy_to_array(merger_.offsets()); + } + nb::ndarray entry_sizes_array() const { + return copy_to_array(merger_.entry_sizes()); + } + +private: + template + static nb::ndarray copy_to_array(const std::vector &v) { + auto *data = new T[v.size()]; + std::copy(v.begin(), v.end(), data); + nb::capsule owner(data, [](void *p) noexcept { + delete[] static_cast(p); + }); + std::size_t shape[1] = {v.size()}; + return nb::ndarray(data, 1, shape, owner); + } + + label_multiset::MultisetMerger merger_; +}; + +} // namespace + +void bind_label_multiset(nb::module_ &m) { + m.def("_read_subset", &bind_read_subset_flat, + nb::arg("offsets"), nb::arg("sizes"), + nb::arg("ids"), nb::arg("counts"), + nb::arg("argsort") = true); + + m.def("_downsample_multiset", &bind_downsample_multiset, + nb::arg("blocking"), + nb::arg("offsets"), nb::arg("entry_sizes"), nb::arg("entry_offsets"), + nb::arg("ids"), nb::arg("counts"), + nb::arg("restrict_set") = -1); + + m.def("_multiset_from_labels_u32", &bind_multiset_from_labels_t, + nb::arg("labels"), nb::arg("blocking")); + m.def("_multiset_from_labels_u64", &bind_multiset_from_labels_t, + nb::arg("labels"), nb::arg("blocking")); + + nb::class_(m, "_MultisetMerger") + .def(nb::init(), + nb::arg("offsets"), nb::arg("entry_sizes"), + nb::arg("ids"), nb::arg("counts")) + .def("update", &PyMultisetMerger::update, + nb::arg("unique_offsets"), nb::arg("entry_sizes"), + nb::arg("ids"), nb::arg("counts"), nb::arg("offsets")) + .def("get_ids", &PyMultisetMerger::ids_array) + .def("get_counts", &PyMultisetMerger::counts_array) + .def("get_offsets", &PyMultisetMerger::offsets_array) + .def("get_entry_sizes", &PyMultisetMerger::entry_sizes_array); +} + +} // namespace bioimage_cpp::bindings diff --git a/src/bindings/label_multiset.hxx b/src/bindings/label_multiset.hxx new file mode 100644 index 0000000..b670320 --- /dev/null +++ b/src/bindings/label_multiset.hxx @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace bioimage_cpp::bindings { + +void bind_label_multiset(nanobind::module_ &m); + +} // namespace bioimage_cpp::bindings diff --git a/src/bindings/module.cxx b/src/bindings/module.cxx index 5281816..909524f 100644 --- a/src/bindings/module.cxx +++ b/src/bindings/module.cxx @@ -5,6 +5,7 @@ #include "flow.hxx" #include "graph.hxx" #include "ground_truth.hxx" +#include "label_multiset.hxx" #include "segmentation.hxx" #include "transformation.hxx" #include "util.hxx" @@ -23,6 +24,7 @@ NB_MODULE(_core, m) { bioimage_cpp::bindings::bind_flow(m); bioimage_cpp::bindings::bind_graph(m); bioimage_cpp::bindings::bind_ground_truth(m); + bioimage_cpp::bindings::bind_label_multiset(m); bioimage_cpp::bindings::bind_segmentation(m); bioimage_cpp::bindings::bind_transformation(m); bioimage_cpp::bindings::bind_util(m); diff --git a/src/bioimage_cpp/__init__.py b/src/bioimage_cpp/__init__.py index 3a66762..aa9343e 100644 --- a/src/bioimage_cpp/__init__.py +++ b/src/bioimage_cpp/__init__.py @@ -66,6 +66,7 @@ from . import filters from . import flow from . import graph +from . import label_multiset from . import segmentation from . import transformation from . import utils @@ -77,6 +78,7 @@ "filters", "flow", "graph", + "label_multiset", "segmentation", "transformation", "utils", diff --git a/src/bioimage_cpp/label_multiset/__init__.py b/src/bioimage_cpp/label_multiset/__init__.py new file mode 100644 index 0000000..807f6a6 --- /dev/null +++ b/src/bioimage_cpp/label_multiset/__init__.py @@ -0,0 +1,260 @@ +"""Label multiset data structure. + +A label multiset is a sparse, deduplicated representation of label +distributions over a grid of spatial blocks. For each spatial position +(block) it stores a histogram of the labels in the corresponding fine +region as a `(ids, counts)` pair, with identical histograms across +different blocks pointing to the same shared storage. + +This is a re-implementation of the label-multiset utilities from +`nifty.tools.label_multiset` with no external C++ dependencies. + +Storage layout (mirrors nifty): + +- ``offsets`` length ``n_spatial``: spatial position to byte offset into ``ids`` / ``counts`` +- ``entry_offsets`` length ``n_spatial``: spatial position to unique-entry index +- ``entry_sizes`` length ``n_unique``: number of ``(id, count)`` pairs per entry +- ``ids`` length ``total_elems``: concatenated label ids (sorted within each entry) +- ``counts`` length ``total_elems``: counts aligned with ``ids`` +- ``argmax`` length ``n_spatial``: argmax label per spatial position +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np + +from .. import _core +from .._core import Blocking + +_ID_DTYPE = np.dtype(np.uint64) +_COUNT_DTYPE = np.dtype(np.uint32) +_OFFSET_DTYPE = np.dtype(np.uint64) + + +@dataclass +class LabelMultiset: + """A deduplicated label-histogram representation over a spatial grid.""" + + argmax: np.ndarray # shape (n_spatial,), dtype uint64 + offsets: np.ndarray # shape (n_spatial,), dtype uint64 + entry_offsets: np.ndarray # shape (n_spatial,), dtype uint64 + entry_sizes: np.ndarray # shape (n_unique,), dtype uint64 + ids: np.ndarray # shape (total_elems,), dtype uint64 + counts: np.ndarray # shape (total_elems,), dtype uint32 + + @property + def n_spatial(self) -> int: + return int(self.offsets.shape[0]) + + @property + def n_entries(self) -> int: + return int(self.entry_sizes.shape[0]) + + def entry(self, spatial_index: int) -> Tuple[np.ndarray, np.ndarray]: + """Return ``(ids, counts)`` for the multiset at ``spatial_index``.""" + off = int(self.offsets[spatial_index]) + entry_idx = int(self.entry_offsets[spatial_index]) + size = int(self.entry_sizes[entry_idx]) + return self.ids[off : off + size], self.counts[off : off + size] + + def __getitem__(self, spatial_index: int) -> Tuple[np.ndarray, np.ndarray]: + return self.entry(spatial_index) + + +def _as_blocking( + shape: Tuple[int, ...], + block_shape: Tuple[int, ...], +) -> Blocking: + if len(shape) != len(block_shape): + raise ValueError( + f"shape and block_shape must have the same length, got " + f"{len(shape)} and {len(block_shape)}" + ) + roi_begin = [0] * len(shape) + return Blocking(roi_begin, list(shape), list(block_shape)) + + +def multiset_from_labels( + labels: np.ndarray, + block_shape: Tuple[int, ...], +) -> LabelMultiset: + """Build a level-0 label multiset by aggregating ``labels`` over blocks. + + For each block of shape ``block_shape``, computes the label histogram of + the contained voxels. Identical histograms are deduplicated. + """ + labels = np.ascontiguousarray(labels) + if labels.dtype == np.uint32: + fn = _core._multiset_from_labels_u32 + elif labels.dtype == np.uint64: + fn = _core._multiset_from_labels_u64 + else: + raise TypeError( + f"labels must have dtype uint32 or uint64, got {labels.dtype}" + ) + blocking = _as_blocking(tuple(labels.shape), tuple(block_shape)) + argmax, offsets, entry_offsets, entry_sizes, ids, counts = fn(labels, blocking) + return LabelMultiset( + argmax=argmax, + offsets=offsets, + entry_offsets=entry_offsets, + entry_sizes=entry_sizes, + ids=ids, + counts=counts, + ) + + +def downsample_multiset( + multiset: LabelMultiset, + blocking: Blocking, + restrict_set: int = -1, +) -> LabelMultiset: + """Downsample ``multiset`` by aggregating its entries into ``blocking``'s blocks. + + ``blocking`` must be defined over the same spatial extent as the input + multiset (i.e. ``prod(blocking.roi_end) == multiset.n_spatial``). + """ + argmax, new_offsets, new_entry_offsets, new_entry_sizes, new_ids, new_counts = ( + _core._downsample_multiset( + blocking, + multiset.offsets, + multiset.entry_sizes, + multiset.entry_offsets, + multiset.ids, + multiset.counts, + restrict_set, + ) + ) + return LabelMultiset( + argmax=argmax, + offsets=new_offsets, + entry_offsets=new_entry_offsets, + entry_sizes=new_entry_sizes, + ids=new_ids, + counts=new_counts, + ) + + +def read_subset( + offsets: np.ndarray, + sizes: np.ndarray, + ids: np.ndarray, + counts: np.ndarray, + argsort: bool = True, +) -> Tuple[np.ndarray, np.ndarray]: + """Merge multisets located at the given ``(offset, size)`` ranges. + + Returns the summed ``(ids, counts)``, sorted by id if ``argsort``. + """ + offsets = np.ascontiguousarray(offsets, dtype=_OFFSET_DTYPE) + sizes = np.ascontiguousarray(sizes, dtype=_OFFSET_DTYPE) + ids = np.ascontiguousarray(ids, dtype=_ID_DTYPE) + counts = np.ascontiguousarray(counts, dtype=_COUNT_DTYPE) + return _core._read_subset(offsets, sizes, ids, counts, argsort) + + +def _unique_offsets_of(multiset: "LabelMultiset") -> np.ndarray: + """For each unique entry of ``multiset``, return the byte offset into ids/counts.""" + n = multiset.n_entries + out = np.empty(n, dtype=_OFFSET_DTYPE) + for e in range(n): + out[e] = multiset.offsets[np.where(multiset.entry_offsets == e)[0][0]] + return out + + +class MultisetMerger: + """Stateful deduplicating merger for multisets produced in batches. + + The constructor expects one offset *per unique entry* (i.e. arrays of + length ``n_unique``, not ``n_spatial``). Use :meth:`from_multiset` to + build one straight from a :class:`LabelMultiset`. + + Call :meth:`update` with subsequent batches; each call extends the + internal storage with any genuinely new entries and rewrites the + passed-in ``offsets`` array so each spatial position points at its + final deduplicated byte offset. + """ + + def __init__( + self, + unique_offsets: np.ndarray, + entry_sizes: np.ndarray, + ids: np.ndarray, + counts: np.ndarray, + ) -> None: + unique_offsets = np.ascontiguousarray(unique_offsets, dtype=_OFFSET_DTYPE) + entry_sizes = np.ascontiguousarray(entry_sizes, dtype=_OFFSET_DTYPE) + if unique_offsets.shape != entry_sizes.shape: + raise ValueError( + "unique_offsets and entry_sizes must have the same length " + "(one entry each). Use MultisetMerger.from_multiset() if you " + "have a LabelMultiset instead." + ) + self._impl = _core._MultisetMerger( + unique_offsets, + entry_sizes, + np.ascontiguousarray(ids, dtype=_ID_DTYPE), + np.ascontiguousarray(counts, dtype=_COUNT_DTYPE), + ) + + @classmethod + def from_multiset(cls, multiset: "LabelMultiset") -> "MultisetMerger": + """Build a merger seeded with the unique entries of ``multiset``.""" + return cls( + _unique_offsets_of(multiset), + multiset.entry_sizes, + multiset.ids, + multiset.counts, + ) + + def update( + self, + unique_offsets: np.ndarray, + entry_sizes: np.ndarray, + ids: np.ndarray, + counts: np.ndarray, + offsets: np.ndarray, + ) -> np.ndarray: + """Ingest a batch of entries and rewrite ``offsets`` in-place. + + ``offsets`` is mutated in-place and also returned. + """ + if offsets.dtype != _OFFSET_DTYPE or not offsets.flags["C_CONTIGUOUS"]: + raise TypeError( + "offsets must be a contiguous uint64 array (it is modified in place)" + ) + return self._impl.update( + np.ascontiguousarray(unique_offsets, dtype=_OFFSET_DTYPE), + np.ascontiguousarray(entry_sizes, dtype=_OFFSET_DTYPE), + np.ascontiguousarray(ids, dtype=_ID_DTYPE), + np.ascontiguousarray(counts, dtype=_COUNT_DTYPE), + offsets, + ) + + @property + def ids(self) -> np.ndarray: + return self._impl.get_ids() + + @property + def counts(self) -> np.ndarray: + return self._impl.get_counts() + + @property + def offsets(self) -> np.ndarray: + return self._impl.get_offsets() + + @property + def entry_sizes(self) -> np.ndarray: + return self._impl.get_entry_sizes() + + +__all__ = [ + "LabelMultiset", + "MultisetMerger", + "downsample_multiset", + "multiset_from_labels", + "read_subset", +] diff --git a/tests/label_multiset/__init__.py b/tests/label_multiset/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/label_multiset/test_against_nifty.py b/tests/label_multiset/test_against_nifty.py new file mode 100644 index 0000000..9ae7485 --- /dev/null +++ b/tests/label_multiset/test_against_nifty.py @@ -0,0 +1,149 @@ +"""Cross-check against nifty.tools.label_multiset (skipped if nifty missing).""" +import numpy as np +import pytest + +nifty_tools = pytest.importorskip("nifty.tools") + +from bioimage_cpp._core import Blocking as BicBlocking +from bioimage_cpp.label_multiset import ( + MultisetMerger, + downsample_multiset, + multiset_from_labels, + read_subset, +) + + +def _bic_to_nifty_counts(counts: np.ndarray) -> np.ndarray: + return counts.astype(np.int32) + + +def _entries(ids: np.ndarray, counts: np.ndarray, offsets: np.ndarray, + entry_sizes: np.ndarray, entry_offsets: np.ndarray): + """Yield per-spatial-position (ids_tuple, counts_tuple) for comparison.""" + n = offsets.shape[0] + out = [] + for s in range(n): + off = int(offsets[s]) + eidx = int(entry_offsets[s]) + sz = int(entry_sizes[eidx]) + out.append( + (tuple(int(x) for x in ids[off : off + sz]), + tuple(int(x) for x in counts[off : off + sz])) + ) + return out + + +def _entries_from_ms(ms): + return _entries(ms.ids, ms.counts, ms.offsets, ms.entry_sizes, ms.entry_offsets) + + +def test_read_subset_matches_nifty(): + rng = np.random.default_rng(0) + ids = rng.integers(0, 10, size=20, dtype=np.uint64) + counts = rng.integers(1, 5, size=20, dtype=np.uint32) + offsets = np.array([0, 5, 10, 15], dtype=np.uint64) + sizes = np.array([5, 5, 5, 5], dtype=np.uint64) + + bic_ids, bic_counts = read_subset(offsets, sizes, ids, counts) + + n_ids, n_counts = nifty_tools.readSubset( + offsets.astype(np.uint64), + sizes.astype(np.uint64), + ids.astype(np.uint64), + _bic_to_nifty_counts(counts), + True, + ) + # Both should be sorted by id and equal element-wise (up to count dtype). + assert bic_ids.tolist() == n_ids.tolist() + assert bic_counts.tolist() == n_counts.astype(np.uint32).tolist() + + +def test_downsample_matches_nifty_3d(): + rng = np.random.default_rng(1) + labels = rng.integers(0, 8, size=(4, 4, 4), dtype=np.uint64) + + # Build the level-0 multiset *both ways* and downsample with both. + ms0 = multiset_from_labels(labels, (1, 1, 1)) + + # nifty's downsampleMultiset signature. + n_blocking = nifty_tools.blocking( + roiBegin=[0, 0, 0], roiEnd=list(labels.shape), blockShape=[2, 2, 2] + ) + bic_blocking = BicBlocking([0, 0, 0], list(labels.shape), [2, 2, 2]) + + # Convert our level-0 to nifty types for input. + n_offsets = ms0.offsets.astype(np.uint64) + n_entry_sizes = ms0.entry_sizes.astype(np.uint64) + n_entry_offsets = ms0.entry_offsets.astype(np.uint64) + n_ids = ms0.ids.astype(np.uint64) + n_counts = _bic_to_nifty_counts(ms0.counts) + + n_argmax, n_new_offsets, n_new_ids, n_new_counts = nifty_tools.downsampleMultiset( + n_blocking, n_offsets, n_entry_sizes, n_entry_offsets, n_ids, n_counts, + restrict_set=-1, + ) + + bic = downsample_multiset(ms0, bic_blocking) + + # Argmax should match. + assert bic.argmax.tolist() == n_argmax.tolist() + + # Reconstruct entries from nifty's output. nifty returns flat + # (new_offsets pointing into new_ids/new_counts) but not new_entry_sizes — + # since nifty's downsample reorders/uniqifies offsets implicitly, we + # rebuild sizes via np.unique on offsets. + unique_off, inverse = np.unique(n_new_offsets, return_inverse=True) + # Sizes are the gaps between consecutive unique offsets, with the last + # entry extending to len(new_ids). + sizes_arr = np.diff( + np.concatenate([unique_off, np.array([n_new_ids.shape[0]], dtype=unique_off.dtype)]) + ) + nifty_entries = _entries( + n_new_ids, + n_new_counts.astype(np.uint32), + n_new_offsets.astype(np.uint64), + sizes_arr.astype(np.uint64), + inverse.astype(np.uint64), + ) + bic_entries = _entries_from_ms(bic) + assert bic_entries == nifty_entries + + +def test_multiset_merger_matches_nifty(): + rng = np.random.default_rng(2) + labels = rng.integers(0, 4, size=(4, 4), dtype=np.uint64) + ms = multiset_from_labels(labels, (2, 2)) + + unique_offsets = np.array( + [int(ms.offsets[np.where(ms.entry_offsets == e)[0][0]]) + for e in range(ms.n_entries)], + dtype=np.uint64, + ) + + # Both constructors expect (unique_offsets, entry_sizes, ids, counts). + bic_merger = MultisetMerger.from_multiset(ms) + n_merger = nifty_tools.MultisetMerger( + unique_offsets, + ms.entry_sizes.astype(np.uint64), + ms.ids.astype(np.uint64), + _bic_to_nifty_counts(ms.counts), + ) + + # Ingest the same multiset as a batch — everything should be deduped. + spatial_offsets_bic = ms.entry_offsets.astype(np.uint64).copy() + spatial_offsets_n = ms.entry_offsets.astype(np.uint64).copy() + + bic_merger.update( + unique_offsets, ms.entry_sizes.astype(np.uint64), + ms.ids, ms.counts, spatial_offsets_bic, + ) + n_merger.update( + unique_offsets, ms.entry_sizes.astype(np.uint64), + ms.ids.astype(np.uint64), _bic_to_nifty_counts(ms.counts), + spatial_offsets_n, + ) + + assert bic_merger.ids.tolist() == ms.ids.tolist() + assert bic_merger.counts.tolist() == ms.counts.tolist() + assert n_merger.get_ids().tolist() == ms.ids.tolist() + assert spatial_offsets_bic.tolist() == spatial_offsets_n.tolist() diff --git a/tests/label_multiset/test_downsample.py b/tests/label_multiset/test_downsample.py new file mode 100644 index 0000000..a0242b1 --- /dev/null +++ b/tests/label_multiset/test_downsample.py @@ -0,0 +1,125 @@ +import numpy as np +import pytest + +from bioimage_cpp._core import Blocking +from bioimage_cpp.label_multiset import ( + LabelMultiset, + downsample_multiset, + multiset_from_labels, +) + + +def _python_block_histograms(labels: np.ndarray, block_shape): + """Compute per-block (sorted_ids, counts) histograms in pure Python.""" + shape = labels.shape + ndim = labels.ndim + blocks_per_axis = [ + (shape[a] + block_shape[a] - 1) // block_shape[a] for a in range(ndim) + ] + blocks = [] + if ndim == 2: + for by in range(blocks_per_axis[0]): + for bx in range(blocks_per_axis[1]): + y0, y1 = by * block_shape[0], min((by + 1) * block_shape[0], shape[0]) + x0, x1 = bx * block_shape[1], min((bx + 1) * block_shape[1], shape[1]) + vals, counts = np.unique(labels[y0:y1, x0:x1], return_counts=True) + blocks.append((tuple(int(v) for v in vals), tuple(int(c) for c in counts))) + elif ndim == 3: + for bz in range(blocks_per_axis[0]): + for by in range(blocks_per_axis[1]): + for bx in range(blocks_per_axis[2]): + z0 = bz * block_shape[0]; z1 = min((bz + 1) * block_shape[0], shape[0]) + y0 = by * block_shape[1]; y1 = min((by + 1) * block_shape[1], shape[1]) + x0 = bx * block_shape[2]; x1 = min((bx + 1) * block_shape[2], shape[2]) + vals, counts = np.unique( + labels[z0:z1, y0:y1, x0:x1], return_counts=True + ) + blocks.append( + (tuple(int(v) for v in vals), tuple(int(c) for c in counts)) + ) + else: + raise ValueError("only 2D and 3D in test") + return blocks + + +def _entries_as_dict(ms: LabelMultiset): + return [ + (tuple(int(i) for i in ms.entry(s)[0]), tuple(int(c) for c in ms.entry(s)[1])) + for s in range(ms.n_spatial) + ] + + +@pytest.mark.parametrize("seed", [0, 1, 2]) +def test_multiset_from_labels_2d_matches_python(seed): + rng = np.random.default_rng(seed) + labels = rng.integers(0, 7, size=(6, 8), dtype=np.uint64) + block_shape = (2, 2) + + ms = multiset_from_labels(labels, block_shape) + expected = _python_block_histograms(labels, block_shape) + actual = _entries_as_dict(ms) + assert actual == expected + + +def test_multiset_from_labels_3d_matches_python(): + rng = np.random.default_rng(42) + labels = rng.integers(0, 5, size=(4, 6, 8), dtype=np.uint32) + block_shape = (2, 2, 2) + + ms = multiset_from_labels(labels, block_shape) + expected = _python_block_histograms(labels, block_shape) + actual = _entries_as_dict(ms) + assert actual == expected + + +def test_downsample_equivalent_to_block_aggregation(): + # Building level-0 with block_shape (1,1) and then downsampling by (2,2) + # must give the same result as building directly with block_shape (2,2). + rng = np.random.default_rng(7) + labels = rng.integers(0, 4, size=(8, 6), dtype=np.uint64) + ms0 = multiset_from_labels(labels, (1, 1)) + blocking = Blocking([0, 0], list(labels.shape), [2, 2]) + ms1 = downsample_multiset(ms0, blocking) + ms_direct = multiset_from_labels(labels, (2, 2)) + + assert _entries_as_dict(ms1) == _entries_as_dict(ms_direct) + assert ms1.argmax.tolist() == ms_direct.argmax.tolist() + + +def test_downsample_3d_two_levels(): + rng = np.random.default_rng(11) + labels = rng.integers(0, 6, size=(8, 8, 8), dtype=np.uint64) + ms0 = multiset_from_labels(labels, (1, 1, 1)) + + b1 = Blocking([0, 0, 0], [8, 8, 8], [2, 2, 2]) + ms1 = downsample_multiset(ms0, b1) + expected1 = _python_block_histograms(labels, (2, 2, 2)) + assert _entries_as_dict(ms1) == expected1 + + b2 = Blocking([0, 0, 0], [4, 4, 4], [2, 2, 2]) + ms2 = downsample_multiset(ms1, b2) + expected2 = _python_block_histograms(labels, (4, 4, 4)) + assert _entries_as_dict(ms2) == expected2 + + +def test_downsample_dedup_reduces_entries(): + # All-zero volume — every block is identical, so n_entries must collapse to 1. + labels = np.zeros((8, 8), dtype=np.uint64) + ms = multiset_from_labels(labels, (2, 2)) + assert ms.n_spatial == 16 + assert ms.n_entries == 1 + assert ms.argmax.tolist() == [0] * 16 + + +def test_downsample_restrict_set_keeps_top_k(): + # 4x4 block with labels [0]*15 + [1]; restrict_set=1 should keep only label 0. + labels = np.zeros((4, 4), dtype=np.uint64) + labels[3, 3] = 1 + ms_full = multiset_from_labels(labels, (1, 1)) + b = Blocking([0, 0], [4, 4], [4, 4]) + ms_top1 = downsample_multiset(ms_full, b, restrict_set=1) + assert ms_top1.n_spatial == 1 + ids, counts = ms_top1.entry(0) + assert ids.tolist() == [0] + assert counts.tolist() == [15] + assert ms_top1.argmax.tolist() == [0] diff --git a/tests/label_multiset/test_merger.py b/tests/label_multiset/test_merger.py new file mode 100644 index 0000000..58e568e --- /dev/null +++ b/tests/label_multiset/test_merger.py @@ -0,0 +1,129 @@ +import numpy as np + +from bioimage_cpp._core import Blocking +from bioimage_cpp.label_multiset import ( + MultisetMerger, + downsample_multiset, + multiset_from_labels, +) + + +def _unique_offsets(ms): + """For each unique entry, return the offset where it lives in ms.ids.""" + return np.array( + [int(ms.offsets[np.where(ms.entry_offsets == e)[0][0]]) + for e in range(ms.n_entries)], + dtype=np.uint64, + ) + + +def test_merger_init_roundtrips_inputs(): + offsets = np.array([0, 2], dtype=np.uint64) + entry_sizes = np.array([2, 1], dtype=np.uint64) + ids = np.array([1, 2, 5], dtype=np.uint64) + counts = np.array([3, 4, 7], dtype=np.uint32) + + m = MultisetMerger(offsets, entry_sizes, ids, counts) + assert m.ids.tolist() == [1, 2, 5] + assert m.counts.tolist() == [3, 4, 7] + assert m.offsets.tolist() == [0, 2] + assert m.entry_sizes.tolist() == [2, 1] + + +def test_merger_deduplicates_identical_entry(): + # Existing storage has entry A: ids=[1,2], counts=[3,4] + offsets = np.array([0], dtype=np.uint64) + entry_sizes = np.array([2], dtype=np.uint64) + ids = np.array([1, 2], dtype=np.uint64) + counts = np.array([3, 4], dtype=np.uint32) + m = MultisetMerger(offsets, entry_sizes, ids, counts) + + # New batch: one entry identical to A, one genuinely new. + batch_unique_offsets = np.array([0, 2], dtype=np.uint64) + batch_entry_sizes = np.array([2, 1], dtype=np.uint64) + batch_ids = np.array([1, 2, 9], dtype=np.uint64) + batch_counts = np.array([3, 4, 11], dtype=np.uint32) + spatial_offsets = np.array([0, 1], dtype=np.uint64) + + rewritten = m.update( + batch_unique_offsets, batch_entry_sizes, batch_ids, batch_counts, + spatial_offsets, + ) + assert rewritten.tolist() == [0, 2] + assert m.ids.tolist() == [1, 2, 9] + assert m.counts.tolist() == [3, 4, 11] + assert m.offsets.tolist() == [0, 2] + assert m.entry_sizes.tolist() == [2, 1] + + +def test_merger_update_with_identical_entries_is_idempotent(): + # Building a merger from a multiset and feeding the same multiset back as + # an update should leave ids/counts unchanged and rewrite spatial offsets + # to point at the original byte offsets. + rng = np.random.default_rng(3) + labels = rng.integers(0, 4, size=(4, 4), dtype=np.uint64) + ms = multiset_from_labels(labels, (2, 2)) + + m = MultisetMerger.from_multiset(ms) + spatial_in = ms.entry_offsets.astype(np.uint64).copy() + rewritten = m.update( + _unique_offsets(ms), + ms.entry_sizes.astype(np.uint64), + ms.ids, ms.counts, + spatial_in, + ) + assert m.ids.tolist() == ms.ids.tolist() + assert m.counts.tolist() == ms.counts.tolist() + assert rewritten.tolist() == ms.offsets.tolist() + + +def test_merger_from_multiset_with_real_dedup(): + # Build a multiset that actually deduplicates (all-zero volume → 1 entry, + # many spatial positions). Ensure from_multiset() correctly extracts the + # unique offsets and the merger handles the size mismatch (n_spatial >> + # n_unique). + labels = np.zeros((8, 8), dtype=np.uint64) + ms = multiset_from_labels(labels, (2, 2)) + assert ms.n_spatial == 16 + assert ms.n_entries == 1 + + m = MultisetMerger.from_multiset(ms) + assert m.entry_sizes.tolist() == [1] + assert m.ids.tolist() == [0] + assert m.counts.tolist() == [4] + + # Update with the same multiset — must remain idempotent. + spatial = ms.entry_offsets.astype(np.uint64).copy() + rewritten = m.update( + _unique_offsets(ms), + ms.entry_sizes.astype(np.uint64), + ms.ids, ms.counts, + spatial, + ) + assert m.entry_sizes.tolist() == [1] + assert rewritten.tolist() == [0] * 16 + + +def test_merger_grows_on_disjoint_update(): + # Two non-overlapping multisets fed in sequence — every entry from the + # second must be appended. + offsets_a = np.array([0, 1], dtype=np.uint64) + entry_sizes_a = np.array([1, 1], dtype=np.uint64) + ids_a = np.array([10, 20], dtype=np.uint64) + counts_a = np.array([5, 7], dtype=np.uint32) + + m = MultisetMerger(offsets_a, entry_sizes_a, ids_a, counts_a) + + unique_offsets_b = np.array([0, 1], dtype=np.uint64) + entry_sizes_b = np.array([1, 1], dtype=np.uint64) + ids_b = np.array([30, 40], dtype=np.uint64) + counts_b = np.array([9, 11], dtype=np.uint32) + spatial_b = np.array([0, 1], dtype=np.uint64) + + rewritten = m.update( + unique_offsets_b, entry_sizes_b, ids_b, counts_b, spatial_b, + ) + assert m.ids.tolist() == [10, 20, 30, 40] + assert m.counts.tolist() == [5, 7, 9, 11] + assert m.offsets.tolist() == [0, 1, 2, 3] + assert rewritten.tolist() == [2, 3] diff --git a/tests/label_multiset/test_read_subset.py b/tests/label_multiset/test_read_subset.py new file mode 100644 index 0000000..fad1e61 --- /dev/null +++ b/tests/label_multiset/test_read_subset.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +from bioimage_cpp.label_multiset import read_subset + + +def test_read_subset_basic_merge(): + # Two ranges over a flat ids/counts buffer. + # range 0: ids=[1,2], counts=[3,4] + # range 1: ids=[2,5], counts=[7,1] + # merged: {1: 3, 2: 11, 5: 1} + ids = np.array([1, 2, 2, 5], dtype=np.uint64) + counts = np.array([3, 4, 7, 1], dtype=np.uint32) + offsets = np.array([0, 2], dtype=np.uint64) + sizes = np.array([2, 2], dtype=np.uint64) + + out_ids, out_counts = read_subset(offsets, sizes, ids, counts) + assert out_ids.tolist() == [1, 2, 5] + assert out_counts.tolist() == [3, 11, 1] + + +def test_read_subset_single_range(): + ids = np.array([7, 7, 9], dtype=np.uint64) + counts = np.array([2, 5, 3], dtype=np.uint32) + offsets = np.array([0], dtype=np.uint64) + sizes = np.array([3], dtype=np.uint64) + + out_ids, out_counts = read_subset(offsets, sizes, ids, counts) + assert out_ids.tolist() == [7, 9] + assert out_counts.tolist() == [7, 3] + + +def test_read_subset_no_argsort_returns_same_elements(): + ids = np.array([1, 2, 5], dtype=np.uint64) + counts = np.array([3, 4, 1], dtype=np.uint32) + offsets = np.array([0], dtype=np.uint64) + sizes = np.array([3], dtype=np.uint64) + + sorted_ids, sorted_counts = read_subset(offsets, sizes, ids, counts, argsort=True) + unsorted_ids, unsorted_counts = read_subset( + offsets, sizes, ids, counts, argsort=False + ) + # Same multiset regardless of order. + assert sorted(sorted_ids.tolist()) == sorted(unsorted_ids.tolist()) + assert sum(sorted_counts.tolist()) == sum(unsorted_counts.tolist()) + + +def test_read_subset_empty_range_list(): + ids = np.array([1, 2], dtype=np.uint64) + counts = np.array([3, 4], dtype=np.uint32) + offsets = np.array([], dtype=np.uint64) + sizes = np.array([], dtype=np.uint64) + + out_ids, out_counts = read_subset(offsets, sizes, ids, counts) + assert out_ids.shape == (0,) + assert out_counts.shape == (0,) + + +def test_read_subset_dtype_promotion(): + # Int Python lists / wrong dtypes should be safely coerced. + out_ids, out_counts = read_subset([0, 2], [2, 1], [1, 2, 3], [4, 5, 6]) + assert out_ids.tolist() == [1, 2, 3] + assert out_counts.tolist() == [4, 5, 6] + + +def test_read_subset_overlapping_ranges_double_count(): + # Same range listed twice should double the counts. + ids = np.array([1, 2], dtype=np.uint64) + counts = np.array([3, 4], dtype=np.uint32) + offsets = np.array([0, 0], dtype=np.uint64) + sizes = np.array([2, 2], dtype=np.uint64) + + out_ids, out_counts = read_subset(offsets, sizes, ids, counts) + assert out_ids.tolist() == [1, 2] + assert out_counts.tolist() == [6, 8]