From 5fac5cd5344a7e14421887e869297a897e75be3c Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Fri, 22 May 2026 21:52:37 +0200
Subject: [PATCH] Implement label multiset

---
 CMakeLists.txt                                |   1 +
 MIGRATION_GUIDE.md                            |  96 +++++
 development/label_multiset/benchmark.py       | 229 ++++++++++++
 .../label_multiset/downsample.hxx             | 124 +++++++
 .../label_multiset/from_labels.hxx            | 147 ++++++++
 .../bioimage_cpp/label_multiset/merger.hxx    | 143 ++++++++
 .../bioimage_cpp/label_multiset/multiset.hxx  |  66 ++++
 .../label_multiset/read_subset.hxx            | 136 +++++++
 src/bindings/label_multiset.cxx               | 335 ++++++++++++++++++
 src/bindings/label_multiset.hxx               |   9 +
 src/bindings/module.cxx                       |   2 +
 src/bioimage_cpp/__init__.py                  |   2 +
 src/bioimage_cpp/label_multiset/__init__.py   | 260 ++++++++++++++
 tests/label_multiset/__init__.py              |   0
 tests/label_multiset/test_against_nifty.py    | 149 ++++++++
 tests/label_multiset/test_downsample.py       | 125 +++++++
 tests/label_multiset/test_merger.py           | 129 +++++++
 tests/label_multiset/test_read_subset.py      |  75 ++++
 18 files changed, 2028 insertions(+)
 create mode 100644 development/label_multiset/benchmark.py
 create mode 100644 include/bioimage_cpp/label_multiset/downsample.hxx
 create mode 100644 include/bioimage_cpp/label_multiset/from_labels.hxx
 create mode 100644 include/bioimage_cpp/label_multiset/merger.hxx
 create mode 100644 include/bioimage_cpp/label_multiset/multiset.hxx
 create mode 100644 include/bioimage_cpp/label_multiset/read_subset.hxx
 create mode 100644 src/bindings/label_multiset.cxx
 create mode 100644 src/bindings/label_multiset.hxx
 create mode 100644 src/bioimage_cpp/label_multiset/__init__.py
 create mode 100644 tests/label_multiset/__init__.py
 create mode 100644 tests/label_multiset/test_against_nifty.py
 create mode 100644 tests/label_multiset/test_downsample.py
 create mode 100644 tests/label_multiset/test_merger.py
 create mode 100644 tests/label_multiset/test_read_subset.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 61b5056..acb8ecb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ nanobind_add_module(_core
     src/bindings/flow.cxx
     src/bindings/graph.cxx
     src/bindings/ground_truth.cxx
+    src/bindings/label_multiset.cxx
     src/bindings/segmentation.cxx
     src/bindings/transformation.cxx
     src/bindings/util.cxx
diff --git a/MIGRATION_GUIDE.md b/MIGRATION_GUIDE.md
index 72cd32d..c88ca9d 100644
--- a/MIGRATION_GUIDE.md
+++ b/MIGRATION_GUIDE.md
@@ -906,6 +906,102 @@ Important differences:
   API yet; the implementation is structured so these filters/extra edges can
   be added later.
 
+### Label Multiset
+
+The label-multiset data structure stores, for each spatial block of a label
+volume, a histogram of `(label, count)` pairs over the underlying voxels,
+with identical histograms across blocks deduplicated into shared storage.
+It is used by Paintera to build multi-resolution label pyramids.
+
+`nifty.tools` exposes three functions / classes — `readSubset`,
+`downsampleMultiset`, and `MultisetMerger` — operating on five flat arrays
+(`offsets`, `entry_sizes`, `entry_offsets`, `ids`, `counts`). `bioimage-cpp`
+keeps the same algorithm and storage layout but wraps it in a
+`LabelMultiset` dataclass plus a level-0 bootstrap helper.
+
+Nifty:
+
+```python
+import nifty.tools as nt
+
+# nifty does not provide a "from labels" helper; level-0 multisets are
+# typically constructed by the caller (e.g. by writing histograms manually
+# to N5 chunks).
+blocking = nt.blocking([0, 0, 0], list(labels.shape), [2, 2, 2])
+argmax, new_offsets, new_ids, new_counts = nt.downsampleMultiset(
+    blocking, offsets, entry_sizes, entry_offsets, ids, counts,
+    restrict_set=-1,
+)
+
+ids, counts = nt.readSubset(offsets, sizes, ids, counts, True)
+
+merger = nt.MultisetMerger(unique_offsets, entry_sizes, ids, counts)
+merger.update(unique_offsets, entry_sizes, ids, counts, offsets)
+```
+
+bioimage-cpp:
+
+```python
+import bioimage_cpp as bic
+from bioimage_cpp.label_multiset import (
+    LabelMultiset,
+    MultisetMerger,
+    downsample_multiset,
+    multiset_from_labels,
+    read_subset,
+)
+
+# Build the level-0 multiset directly from a label volume.
+ms0 = multiset_from_labels(labels, block_shape=(1, 1, 1))
+
+# Downsample one level. `blocking.roi_end` must match the input's spatial
+# extent (i.e. the shape used to build ms0).
+blocking = bic.utils.Blocking([0, 0, 0], list(labels.shape), [2, 2, 2])
+ms1 = downsample_multiset(ms0, blocking, restrict_set=-1)
+
+# Merge entries from a list of (offset, size) ranges into one histogram.
+ids, counts = read_subset(offsets, sizes, ms1.ids, ms1.counts)
+
+# Deduplicating merger — constructor takes one offset per unique entry.
+merger = MultisetMerger.from_multiset(ms1)
+merger.update(unique_offsets, entry_sizes, ids, counts, offsets)
+```
+
+Name and API changes:
+
+| nifty-style name | bioimage-cpp name |
+| --- | --- |
+| `readSubset` | `read_subset` |
+| `downsampleMultiset` | `downsample_multiset` |
+| `MultisetMerger` | `MultisetMerger` |
+| `MultisetMerger.get_ids()` | `MultisetMerger.ids` (property) |
+| `MultisetMerger.get_counts()` | `MultisetMerger.counts` |
+| `restrict_set` (keyword) | `restrict_set` (keyword, same default `-1`) |
+
+Notes:
+
+- A `LabelMultiset` carries all five arrays (`offsets`, `entry_offsets`,
+  `entry_sizes`, `ids`, `counts`) plus `argmax`. Nifty's
+  `downsampleMultiset` returns only four of them and leaves the caller to
+  reconstruct `entry_offsets` / `entry_sizes`; `bioimage-cpp` returns them
+  directly so multi-level downsample chains compose without bookkeeping.
+- `multiset_from_labels(labels, block_shape)` builds the level-0 multiset
+  from a `uint32` or `uint64` label volume in one call. There is no nifty
+  equivalent.
+- `MultisetMerger.__init__` takes one offset per unique entry (length
+  `n_unique`), matching nifty's contract. Use
+  `MultisetMerger.from_multiset(ms)` to construct one directly from a
+  `LabelMultiset`.
+- Count dtype is `uint32` (nifty uses `int32`). Convert at the boundary
+  if you are reading nifty-written data.
+- 2D and 3D blockings are both supported. The bindings instantiate `uint64`
+  ids, `uint32` counts, and `uint64` offsets; wider dtype matrices can be
+  added on demand.
+- The `LabelMultisetWrapper` z5/N5 reader from
+  `nifty/tools/label_multiset_wrapper.hxx` is intentionally **not** ported
+  — I/O stays out of the C++ core. Read/write Paintera-format chunks with
+  `zarr`/`numpy` in Python if needed.
+
 ### Lifted Multicut
 
 Nifty exposes lifted multicut through a separate objective + solver hierarchy.
diff --git a/development/label_multiset/benchmark.py b/development/label_multiset/benchmark.py
new file mode 100644
index 0000000..e49f37b
--- /dev/null
+++ b/development/label_multiset/benchmark.py
@@ -0,0 +1,229 @@
+"""Benchmark the bioimage-cpp label_multiset implementation against nifty.
+
+Runs:
+  - multiset_from_labels (bioimage-cpp only — nifty has no direct equivalent;
+    upstream code typically writes the level-0 multiset out manually)
+  - downsampleMultiset
+  - readSubset
+  - MultisetMerger.update
+
+on a deterministic 3D label volume. Prints a comparison table and verifies
+that the results agree numerically.
+
+Run with:
+    python development/label_multiset/benchmark.py
+"""
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+from typing import Callable, Tuple
+
+import numpy as np
+
+from bioimage_cpp._core import Blocking as BicBlocking
+from bioimage_cpp.label_multiset import (
+    MultisetMerger,
+    downsample_multiset,
+    multiset_from_labels,
+    read_subset,
+)
+
+try:
+    import nifty.tools as nt
+
+    HAVE_NIFTY = True
+except ImportError:
+    HAVE_NIFTY = False
+
+
+SHAPE = (256, 256, 256)
+N_LABELS = 2000
+COARSEN = 2  # voxels per coarse cell (smaller -> more variety per downsample block)
+DOWN_BLOCK = (2, 2, 2)
+N_SUBSET_QUERIES = 5000
+SUBSET_RANGE = 64
+N_REPEATS = 3
+
+
+def make_labels(seed: int = 0) -> np.ndarray:
+    rng = np.random.default_rng(seed)
+    coarse_shape = tuple(s // COARSEN for s in SHAPE)
+    coarse = rng.integers(0, N_LABELS, size=coarse_shape, dtype=np.uint64)
+    return np.kron(coarse, np.ones((COARSEN,) * len(SHAPE), dtype=np.uint64))
+
+
+@dataclass
+class TimedResult:
+    seconds: float
+    value: object
+
+
+def time_best(fn: Callable[[], object], repeats: int = N_REPEATS) -> TimedResult:
+    times = []
+    value = None
+    for _ in range(repeats):
+        t0 = time.perf_counter()
+        value = fn()
+        t1 = time.perf_counter()
+        times.append(t1 - t0)
+    return TimedResult(min(times), value)
+
+
+def fmt_row(name: str, ours: float, theirs: float | None) -> str:
+    if theirs is None:
+        return f"  {name:<32} ours: {ours*1000:8.2f} ms   nifty: (n/a)"
+    ratio = ours / theirs if theirs > 0 else float("inf")
+    return (
+        f"  {name:<32} ours: {ours*1000:8.2f} ms   nifty: {theirs*1000:8.2f} ms"
+        f"   ratio: {ratio:5.2f}x"
+    )
+
+
+def bench_from_labels(labels: np.ndarray) -> TimedResult:
+    return time_best(lambda: multiset_from_labels(labels, (1, 1, 1)))
+
+
+def bench_downsample_ours(ms) -> Tuple[TimedResult, object]:
+    blocking = BicBlocking([0, 0, 0], list(SHAPE), list(DOWN_BLOCK))
+    res = time_best(lambda: downsample_multiset(ms, blocking))
+    return res, blocking
+
+
+def bench_downsample_nifty(ms) -> TimedResult | None:
+    if not HAVE_NIFTY:
+        return None
+    n_blocking = nt.blocking(
+        roiBegin=[0, 0, 0], roiEnd=list(SHAPE), blockShape=list(DOWN_BLOCK)
+    )
+    n_offsets = ms.offsets.astype(np.uint64)
+    n_entry_sizes = ms.entry_sizes.astype(np.uint64)
+    n_entry_offsets = ms.entry_offsets.astype(np.uint64)
+    n_ids = ms.ids.astype(np.uint64)
+    n_counts = ms.counts.astype(np.int32)
+
+    return time_best(
+        lambda: nt.downsampleMultiset(
+            n_blocking, n_offsets, n_entry_sizes, n_entry_offsets,
+            n_ids, n_counts, restrict_set=-1,
+        )
+    )
+
+
+def bench_read_subset(ms) -> Tuple[TimedResult, TimedResult | None]:
+    rng = np.random.default_rng(1)
+    # Random sub-multisets: pick N_SUBSET_QUERIES random spatial positions and
+    # gather their (offset, size) ranges.
+    n_spatial = ms.n_spatial
+    positions = rng.integers(0, n_spatial, size=N_SUBSET_QUERIES)
+    range_size = SUBSET_RANGE  # how many entries to merge per query
+    offsets_list = []
+    sizes_list = []
+    for p in positions:
+        start = max(0, p - range_size // 2)
+        end = min(n_spatial, start + range_size)
+        offsets_list.append(ms.offsets[start:end].astype(np.uint64))
+        sizes_list.append(
+            ms.entry_sizes[ms.entry_offsets[start:end]].astype(np.uint64)
+        )
+    flat_offsets = np.concatenate(offsets_list)
+    flat_sizes = np.concatenate(sizes_list)
+
+    ours = time_best(lambda: read_subset(flat_offsets, flat_sizes, ms.ids, ms.counts))
+
+    if HAVE_NIFTY:
+        ids_int32 = ms.ids.astype(np.uint64)
+        counts_int32 = ms.counts.astype(np.int32)
+        theirs = time_best(
+            lambda: nt.readSubset(
+                flat_offsets, flat_sizes, ids_int32, counts_int32, True
+            )
+        )
+    else:
+        theirs = None
+    return ours, theirs
+
+
+def bench_merger(ms_downsampled) -> Tuple[TimedResult, TimedResult | None]:
+    # Build a merger from the downsampled multiset, then update with itself.
+    # The constructor expects one offset per unique entry (length n_unique).
+    entry_sizes = ms_downsampled.entry_sizes.astype(np.uint64)
+    ids = ms_downsampled.ids.astype(np.uint64)
+    counts = ms_downsampled.counts.astype(np.uint32)
+    counts_i32 = ms_downsampled.counts.astype(np.int32)
+
+    unique_off = np.array(
+        [int(ms_downsampled.offsets[
+            np.where(ms_downsampled.entry_offsets == e)[0][0]])
+         for e in range(ms_downsampled.n_entries)],
+        dtype=np.uint64,
+    )
+
+    def run_ours():
+        m = MultisetMerger(unique_off, entry_sizes, ids, counts)
+        spatial = ms_downsampled.entry_offsets.astype(np.uint64).copy()
+        m.update(unique_off, entry_sizes, ids, counts, spatial)
+        return m
+
+    ours = time_best(run_ours, repeats=N_REPEATS)
+
+    if HAVE_NIFTY:
+        def run_nifty():
+            m = nt.MultisetMerger(unique_off, entry_sizes, ids, counts_i32)
+            spatial = ms_downsampled.entry_offsets.astype(np.uint64).copy()
+            m.update(unique_off, entry_sizes, ids, counts_i32, spatial)
+            return m
+        theirs = time_best(run_nifty, repeats=N_REPEATS)
+    else:
+        theirs = None
+    return ours, theirs
+
+
+def main() -> None:
+    print(f"shape={SHAPE}  labels={N_LABELS}  downsample={DOWN_BLOCK}  repeats={N_REPEATS}")
+    print(f"nifty available: {HAVE_NIFTY}")
+    print()
+    print("Generating label volume...")
+    labels = make_labels()
+    print(f"  unique labels in volume: {len(np.unique(labels))}")
+    print()
+
+    print("Benchmarks (best of N):")
+    t_from = bench_from_labels(labels)
+    print(fmt_row("multiset_from_labels (1,1,1)", t_from.seconds, None))
+    ms0 = t_from.value
+
+    t_down, blocking = bench_downsample_ours(ms0)
+    t_down_nifty = bench_downsample_nifty(ms0)
+    print(fmt_row(
+        f"downsample_multiset {DOWN_BLOCK}",
+        t_down.seconds,
+        t_down_nifty.seconds if t_down_nifty else None,
+    ))
+    ms1 = t_down.value
+    print(f"    -> level-1: n_spatial={ms1.n_spatial}  n_entries={ms1.n_entries}")
+
+    t_read_ours, t_read_nifty = bench_read_subset(ms0)
+    print(fmt_row(
+        f"read_subset (x{N_SUBSET_QUERIES})",
+        t_read_ours.seconds,
+        t_read_nifty.seconds if t_read_nifty else None,
+    ))
+
+    t_merger_ours, t_merger_nifty = bench_merger(ms1)
+    print(fmt_row(
+        "MultisetMerger.update",
+        t_merger_ours.seconds,
+        t_merger_nifty.seconds if t_merger_nifty else None,
+    ))
+
+    # Correctness cross-check on downsample.
+    if HAVE_NIFTY and t_down_nifty is not None:
+        bic_argmax = ms1.argmax
+        n_argmax = t_down_nifty.value[0]
+        assert bic_argmax.tolist() == n_argmax.tolist(), "argmax mismatch vs nifty!"
+        print("\nargmax(downsample) cross-check vs nifty: OK")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/include/bioimage_cpp/label_multiset/downsample.hxx b/include/bioimage_cpp/label_multiset/downsample.hxx
new file mode 100644
index 0000000..ce814c0
--- /dev/null
+++ b/include/bioimage_cpp/label_multiset/downsample.hxx
@@ -0,0 +1,124 @@
+#pragma once
+
+#include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/blocking.hxx"
+#include "bioimage_cpp/label_multiset/multiset.hxx"
+#include "bioimage_cpp/label_multiset/read_subset.hxx"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <unordered_map>
+#include <vector>
+
+namespace bioimage_cpp::label_multiset {
+
+// Direct port of nifty::tools::downsampleMultiset. Reads a level-N multiset
+// described by (offsets, entry_sizes, entry_offsets, ids, counts) defined over
+// the *full* spatial grid (blocking.roi_end()), aggregates into the blocks of
+// the given blocking, and writes a deduplicated level-(N+1) multiset.
+//
+// Outputs:
+//   new_argmax        — length blocking.number_of_blocks()
+//   new_offsets       — length blocking.number_of_blocks(); offset into new_ids
+//   new_entry_offsets — length blocking.number_of_blocks(); maps block → unique entry idx
+//   new_entry_sizes   — length n_unique; size of each unique entry
+//   new_ids, new_counts — concatenated, deduplicated
+template <class OffsetT, class IdT, class CountT>
+inline void downsample_multiset(
+    const Blocking &blocking,
+    const ConstArrayView<OffsetT> &offsets,
+    const ConstArrayView<OffsetT> &entry_sizes,
+    const ConstArrayView<OffsetT> &entry_offsets,
+    const ConstArrayView<IdT> &ids,
+    const ConstArrayView<CountT> &counts,
+    const int restrict_set,
+    ArrayView<IdT> &new_argmax,
+    ArrayView<OffsetT> &new_offsets,
+    ArrayView<OffsetT> &new_entry_offsets,
+    std::vector<OffsetT> &new_entry_sizes,
+    std::vector<IdT> &new_ids,
+    std::vector<CountT> &new_counts
+) {
+    using Key = HashKey<IdT, CountT>;
+    std::unordered_map<Key, std::vector<std::size_t>, HashKeyHash> candidate_dict;
+
+    const std::size_t n_blocks = static_cast<std::size_t>(blocking.number_of_blocks());
+    const auto &shape = blocking.roi_end();
+    const auto strides = c_order_strides_for_shape(shape);
+
+    std::size_t current_candidate_id = 0;
+    // Stored per unique entry. Indexed by candidate id.
+    std::vector<std::size_t> unique_entry_offsets;
+
+    for (std::size_t block_id = 0; block_id < n_blocks; ++block_id) {
+        std::vector<IdT> this_ids;
+        std::vector<CountT> this_counts;
+        const auto block = blocking.get_block(static_cast<std::uint64_t>(block_id));
+        read_subset_block(block, strides,
+                          offsets, entry_sizes, entry_offsets, ids, counts,
+                          this_ids, this_counts, /*argsort=*/true);
+
+        IdT max_label{};
+        CountT max_count{};
+        if (restrict_set > 0 && static_cast<int>(this_ids.size()) > restrict_set) {
+            // sort by count descending
+            argsort_by_first(this_counts, this_ids, /*ascending=*/false);
+            max_label = this_ids[0];
+            max_count = this_counts[0];
+            this_ids.resize(static_cast<std::size_t>(restrict_set));
+            this_counts.resize(static_cast<std::size_t>(restrict_set));
+            argsort_by_first(this_ids, this_counts);
+        } else {
+            auto max_it = std::max_element(this_counts.begin(), this_counts.end());
+            max_label = this_ids[std::distance(this_counts.begin(), max_it)];
+            max_count = *max_it;
+        }
+        new_argmax.data[block_id] = max_label;
+
+        Key key{max_label, max_count};
+        bool add_entry = true;
+        auto cand_it = candidate_dict.find(key);
+        if (cand_it != candidate_dict.end()) {
+            for (const std::size_t c_id : cand_it->second) {
+                const std::size_t c_offset = unique_entry_offsets[c_id];
+                const std::size_t c_size = static_cast<std::size_t>(new_entry_sizes[c_id]);
+                bool match = ranges_equal(
+                    this_ids.begin(), this_ids.end(),
+                    new_ids.begin() + c_offset, new_ids.begin() + c_offset + c_size
+                );
+                if (match) {
+                    match = ranges_equal(
+                        this_counts.begin(), this_counts.end(),
+                        new_counts.begin() + c_offset, new_counts.begin() + c_offset + c_size
+                    );
+                }
+                if (match) {
+                    new_offsets.data[block_id] = static_cast<OffsetT>(c_offset);
+                    new_entry_offsets.data[block_id] = static_cast<OffsetT>(c_id);
+                    add_entry = false;
+                    break;
+                }
+            }
+        }
+
+        if (add_entry) {
+            const std::size_t this_offset = new_ids.size();
+            new_offsets.data[block_id] = static_cast<OffsetT>(this_offset);
+            new_entry_offsets.data[block_id] = static_cast<OffsetT>(current_candidate_id);
+            unique_entry_offsets.emplace_back(this_offset);
+            new_entry_sizes.emplace_back(static_cast<OffsetT>(this_ids.size()));
+            new_ids.insert(new_ids.end(), this_ids.begin(), this_ids.end());
+            new_counts.insert(new_counts.end(), this_counts.begin(), this_counts.end());
+            if (cand_it == candidate_dict.end()) {
+                candidate_dict.emplace(key, std::vector<std::size_t>{current_candidate_id});
+            } else {
+                cand_it->second.emplace_back(current_candidate_id);
+            }
+            ++current_candidate_id;
+        }
+    }
+}
+
+} // namespace bioimage_cpp::label_multiset
diff --git a/include/bioimage_cpp/label_multiset/from_labels.hxx b/include/bioimage_cpp/label_multiset/from_labels.hxx
new file mode 100644
index 0000000..03979f5
--- /dev/null
+++ b/include/bioimage_cpp/label_multiset/from_labels.hxx
@@ -0,0 +1,147 @@
+#pragma once
+
+#include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/blocking.hxx"
+#include "bioimage_cpp/label_multiset/multiset.hxx"
+#include "bioimage_cpp/label_multiset/read_subset.hxx"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+namespace bioimage_cpp::label_multiset {
+
+// Build a deduplicated level-0 multiset from a (C-contiguous) label volume,
+// where each spatial position corresponds to one block of the given blocking
+// (the blocking's roi spans the full label volume, block_shape determines the
+// aggregation footprint of each "voxel" in the resulting multiset).
+//
+// For block_shape == (1,1,...,1) this gives the trivial "one label, count 1
+// per pixel" multiset.
+//
+// Outputs:
+//   argmax            — length n_blocks
+//   out_offsets       — length n_blocks
+//   out_entry_offsets — length n_blocks
+//   out_entry_sizes   — length n_unique
+//   out_ids, out_counts — concatenated
+template <class LabelT, class OffsetT, class IdT, class CountT>
+inline void multiset_from_labels(
+    const ConstArrayView<LabelT> &labels,
+    const Blocking &blocking,
+    ArrayView<IdT> &argmax,
+    ArrayView<OffsetT> &out_offsets,
+    ArrayView<OffsetT> &out_entry_offsets,
+    std::vector<OffsetT> &out_entry_sizes,
+    std::vector<IdT> &out_ids,
+    std::vector<CountT> &out_counts
+) {
+    using Key = HashKey<IdT, CountT>;
+    std::unordered_map<Key, std::vector<std::size_t>, HashKeyHash> candidate_dict;
+
+    const std::size_t ndim = blocking.ndim();
+    const auto &shape = blocking.roi_end();
+    const auto strides = c_order_strides_for_shape(shape);
+
+    const std::size_t n_blocks = static_cast<std::size_t>(blocking.number_of_blocks());
+    std::vector<std::size_t> unique_entry_offsets;
+    std::size_t current_candidate_id = 0;
+
+    std::vector<IdT> this_ids;
+    std::vector<CountT> this_counts;
+
+    for (std::size_t block_id = 0; block_id < n_blocks; ++block_id) {
+        const auto block = blocking.get_block(static_cast<std::uint64_t>(block_id));
+        const auto &begin = block.begin();
+        const auto &end = block.end();
+
+        std::unordered_map<IdT, CountT> count_dict;
+        std::vector<Coordinate> coord = begin;
+        while (true) {
+            std::size_t index = 0;
+            for (std::size_t d = 0; d < ndim; ++d) {
+                index += static_cast<std::size_t>(coord[d]) * strides[d];
+            }
+            const IdT id = static_cast<IdT>(labels.data[index]);
+            auto it = count_dict.find(id);
+            if (it == count_dict.end()) {
+                count_dict.emplace(id, CountT{1});
+            } else {
+                it->second += CountT{1};
+            }
+
+            std::ptrdiff_t axis = static_cast<std::ptrdiff_t>(ndim) - 1;
+            for (; axis >= 0; --axis) {
+                ++coord[axis];
+                if (coord[axis] < end[axis]) {
+                    break;
+                }
+                coord[axis] = begin[axis];
+            }
+            if (axis < 0) {
+                break;
+            }
+        }
+
+        this_ids.assign(count_dict.size(), IdT{});
+        this_counts.assign(count_dict.size(), CountT{});
+        std::size_t i = 0;
+        for (const auto &elem : count_dict) {
+            this_ids[i] = elem.first;
+            this_counts[i] = elem.second;
+            ++i;
+        }
+        argsort_by_first(this_ids, this_counts);
+
+        auto max_it = std::max_element(this_counts.begin(), this_counts.end());
+        const IdT max_label = this_ids[static_cast<std::size_t>(std::distance(this_counts.begin(), max_it))];
+        const CountT max_count = *max_it;
+        argmax.data[block_id] = max_label;
+
+        Key key{max_label, max_count};
+        bool add_entry = true;
+        auto cand_it = candidate_dict.find(key);
+        if (cand_it != candidate_dict.end()) {
+            for (const std::size_t c_id : cand_it->second) {
+                const std::size_t c_offset = unique_entry_offsets[c_id];
+                const std::size_t c_size = static_cast<std::size_t>(out_entry_sizes[c_id]);
+                bool match = ranges_equal(
+                    this_ids.begin(), this_ids.end(),
+                    out_ids.begin() + c_offset, out_ids.begin() + c_offset + c_size
+                );
+                if (match) {
+                    match = ranges_equal(
+                        this_counts.begin(), this_counts.end(),
+                        out_counts.begin() + c_offset, out_counts.begin() + c_offset + c_size
+                    );
+                }
+                if (match) {
+                    out_offsets.data[block_id] = static_cast<OffsetT>(c_offset);
+                    out_entry_offsets.data[block_id] = static_cast<OffsetT>(c_id);
+                    add_entry = false;
+                    break;
+                }
+            }
+        }
+
+        if (add_entry) {
+            const std::size_t this_offset = out_ids.size();
+            out_offsets.data[block_id] = static_cast<OffsetT>(this_offset);
+            out_entry_offsets.data[block_id] = static_cast<OffsetT>(current_candidate_id);
+            unique_entry_offsets.emplace_back(this_offset);
+            out_entry_sizes.emplace_back(static_cast<OffsetT>(this_ids.size()));
+            out_ids.insert(out_ids.end(), this_ids.begin(), this_ids.end());
+            out_counts.insert(out_counts.end(), this_counts.begin(), this_counts.end());
+            if (cand_it == candidate_dict.end()) {
+                candidate_dict.emplace(key, std::vector<std::size_t>{current_candidate_id});
+            } else {
+                cand_it->second.emplace_back(current_candidate_id);
+            }
+            ++current_candidate_id;
+        }
+    }
+}
+
+} // namespace bioimage_cpp::label_multiset
diff --git a/include/bioimage_cpp/label_multiset/merger.hxx b/include/bioimage_cpp/label_multiset/merger.hxx
new file mode 100644
index 0000000..b14d3e2
--- /dev/null
+++ b/include/bioimage_cpp/label_multiset/merger.hxx
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/label_multiset/multiset.hxx"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <unordered_map>
+#include <vector>
+
+namespace bioimage_cpp::label_multiset {
+
+template <class OffsetT, class IdT, class CountT>
+class MultisetMerger {
+public:
+    using OffsetType = OffsetT;
+    using IdType = IdT;
+    using CountType = CountT;
+
+    MultisetMerger(
+        const ConstArrayView<OffsetT> &offsets,
+        const ConstArrayView<OffsetT> &entry_sizes,
+        const ConstArrayView<IdT> &ids,
+        const ConstArrayView<CountT> &counts
+    )
+        : offsets_(offsets.data, offsets.data + offsets.shape[0]),
+          entry_sizes_(entry_sizes.data, entry_sizes.data + entry_sizes.shape[0]),
+          ids_(ids.data, ids.data + ids.shape[0]),
+          counts_(counts.data, counts.data + counts.shape[0]) {
+        init_hashed();
+    }
+
+    const std::vector<IdT> &ids() const { return ids_; }
+    const std::vector<CountT> &counts() const { return counts_; }
+    const std::vector<OffsetT> &offsets() const { return offsets_; }
+    const std::vector<OffsetT> &entry_sizes() const { return entry_sizes_; }
+
+    // Ingest a batch of *unique* entries described by (unique_offsets,
+    // entry_sizes, ids, counts), then rewrite `offsets` so that each element
+    // (which was indexed by "entry id within the input batch") becomes the
+    // absolute byte offset into the deduplicated ids_/counts_ arrays.
+    void update(
+        const ConstArrayView<OffsetT> &unique_offsets,
+        const ConstArrayView<OffsetT> &batch_entry_sizes,
+        const ConstArrayView<IdT> &batch_ids,
+        const ConstArrayView<CountT> &batch_counts,
+        ArrayView<OffsetT> &offsets
+    ) {
+        const std::size_t n_entries = static_cast<std::size_t>(unique_offsets.shape[0]);
+        // Maps batch entry id → absolute byte offset in ids_/counts_.
+        std::unordered_map<OffsetT, OffsetT> new_offset_dict;
+
+        for (std::size_t entry = 0; entry < n_entries; ++entry) {
+            const OffsetT off = unique_offsets.data[entry];
+            const OffsetT size = batch_entry_sizes.data[entry];
+
+            const IdT *ids_begin = batch_ids.data + off;
+            const IdT *ids_end = ids_begin + size;
+            const CountT *counts_begin = batch_counts.data + off;
+            const CountT *counts_end = counts_begin + size;
+
+            auto max_it = std::max_element(counts_begin, counts_end);
+            const IdT max_label = *(ids_begin + std::distance(counts_begin, max_it));
+            const CountT max_count = *max_it;
+
+            HashKey<IdT, CountT> key{max_label, max_count};
+            auto hash_it = hashed_.find(key);
+            bool new_entry = true;
+            if (hash_it != hashed_.end()) {
+                for (const std::size_t c_id : hash_it->second) {
+                    const std::size_t c_offset = static_cast<std::size_t>(offsets_[c_id]);
+                    const std::size_t c_size = static_cast<std::size_t>(entry_sizes_[c_id]);
+                    bool match = ranges_equal(
+                        ids_begin, ids_end,
+                        ids_.begin() + c_offset, ids_.begin() + c_offset + c_size
+                    );
+                    if (match) {
+                        match = ranges_equal(
+                            counts_begin, counts_end,
+                            counts_.begin() + c_offset, counts_.begin() + c_offset + c_size
+                        );
+                    }
+                    if (match) {
+                        new_entry = false;
+                        new_offset_dict[static_cast<OffsetT>(entry)] = static_cast<OffsetT>(c_offset);
+                        break;
+                    }
+                }
+            }
+
+            if (new_entry) {
+                const std::size_t this_size = static_cast<std::size_t>(std::distance(ids_begin, ids_end));
+                const std::size_t this_offset = ids_.size();
+                offsets_.emplace_back(static_cast<OffsetT>(this_offset));
+                entry_sizes_.emplace_back(static_cast<OffsetT>(this_size));
+                new_offset_dict[static_cast<OffsetT>(entry)] = static_cast<OffsetT>(this_offset);
+                ids_.insert(ids_.end(), ids_begin, ids_end);
+                counts_.insert(counts_.end(), counts_begin, counts_end);
+                const std::size_t this_id = offsets_.size() - 1;
+                if (hash_it == hashed_.end()) {
+                    hashed_.emplace(key, std::vector<std::size_t>{this_id});
+                    // hash_it is invalidated by emplace; we don't reuse it.
+                } else {
+                    hash_it->second.emplace_back(this_id);
+                }
+            }
+        }
+
+        const std::size_t n_off = static_cast<std::size_t>(offsets.shape[0]);
+        for (std::size_t i = 0; i < n_off; ++i) {
+            offsets.data[i] = new_offset_dict[offsets.data[i]];
+        }
+    }
+
+private:
+    void init_hashed() {
+        const std::size_t n_entries = offsets_.size();
+        for (std::size_t entry = 0; entry < n_entries; ++entry) {
+            const std::size_t off = static_cast<std::size_t>(offsets_[entry]);
+            const std::size_t size = static_cast<std::size_t>(entry_sizes_[entry]);
+            auto max_it = std::max_element(counts_.begin() + off, counts_.begin() + off + size);
+            const IdT max_label = ids_[static_cast<std::size_t>(std::distance(counts_.begin(), max_it))];
+            const CountT max_count = *max_it;
+            HashKey<IdT, CountT> key{max_label, max_count};
+            auto it = hashed_.find(key);
+            if (it == hashed_.end()) {
+                hashed_.emplace(key, std::vector<std::size_t>{entry});
+            } else {
+                it->second.emplace_back(entry);
+            }
+        }
+    }
+
+    std::vector<OffsetT> offsets_;
+    std::vector<OffsetT> entry_sizes_;
+    std::vector<IdT> ids_;
+    std::vector<CountT> counts_;
+    std::unordered_map<HashKey<IdT, CountT>, std::vector<std::size_t>, HashKeyHash> hashed_;
+};
+
+} // namespace bioimage_cpp::label_multiset
diff --git a/include/bioimage_cpp/label_multiset/multiset.hxx b/include/bioimage_cpp/label_multiset/multiset.hxx
new file mode 100644
index 0000000..53e97a1
--- /dev/null
+++ b/include/bioimage_cpp/label_multiset/multiset.hxx
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+namespace bioimage_cpp::label_multiset {
+
+template <class V>
+inline void reorder_inplace(V &v, const std::vector<std::size_t> &idx) {
+    V tmp(v.size());
+    for (std::size_t i = 0; i < v.size(); ++i) {
+        tmp[i] = v[idx[i]];
+    }
+    v.swap(tmp);
+}
+
+template <class V1, class V2>
+inline void argsort_by_first(V1 &v1, V2 &v2, const bool ascending = true) {
+    std::vector<std::size_t> idx(v1.size());
+    std::iota(idx.begin(), idx.end(), 0);
+    if (ascending) {
+        std::sort(idx.begin(), idx.end(),
+                  [&v1](std::size_t a, std::size_t b) { return v1[a] < v1[b]; });
+    } else {
+        std::sort(idx.begin(), idx.end(),
+                  [&v1](std::size_t a, std::size_t b) { return v1[a] > v1[b]; });
+    }
+    reorder_inplace(v1, idx);
+    reorder_inplace(v2, idx);
+}
+
+template <class It1, class It2>
+inline bool ranges_equal(It1 a_begin, It1 a_end, It2 b_begin, It2 b_end) {
+    if (std::distance(a_begin, a_end) != std::distance(b_begin, b_end)) {
+        return false;
+    }
+    return std::equal(a_begin, a_end, b_begin);
+}
+
+template <class IdType, class CountType>
+struct HashKey {
+    IdType id;
+    CountType count;
+    bool operator==(const HashKey &other) const noexcept {
+        return id == other.id && count == other.count;
+    }
+};
+
+struct HashKeyHash {
+    template <class IdType, class CountType>
+    std::size_t operator()(const HashKey<IdType, CountType> &key) const noexcept {
+        std::size_t h1 = std::hash<IdType>{}(key.id);
+        std::size_t h2 = std::hash<CountType>{}(key.count);
+        // boost::hash_combine
+        h1 ^= h2 + 0x9e3779b9ULL + (h1 << 6) + (h1 >> 2);
+        return h1;
+    }
+};
+
+} // namespace bioimage_cpp::label_multiset
diff --git a/include/bioimage_cpp/label_multiset/read_subset.hxx b/include/bioimage_cpp/label_multiset/read_subset.hxx
new file mode 100644
index 0000000..55928af
--- /dev/null
+++ b/include/bioimage_cpp/label_multiset/read_subset.hxx
@@ -0,0 +1,136 @@
+#pragma once
+
+#include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/blocking.hxx"
+#include "bioimage_cpp/label_multiset/multiset.hxx"
+
+#include <cstddef>
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+namespace bioimage_cpp::label_multiset {
+
+template <class OffsetT, class IdT, class CountT>
+inline void read_subset(
+    const ConstArrayView<OffsetT> &offsets,
+    const ConstArrayView<OffsetT> &sizes,
+    const ConstArrayView<IdT> &ids,
+    const ConstArrayView<CountT> &counts,
+    std::vector<IdT> &ids_out,
+    std::vector<CountT> &counts_out,
+    const bool argsort = true
+) {
+    std::unordered_map<IdT, CountT> count_dict;
+
+    const std::size_t n_offsets = static_cast<std::size_t>(offsets.shape[0]);
+    for (std::size_t off_id = 0; off_id < n_offsets; ++off_id) {
+        const std::size_t offset = static_cast<std::size_t>(offsets.data[off_id]);
+        const std::size_t size = static_cast<std::size_t>(sizes.data[off_id]);
+        for (std::size_t pos = offset; pos < offset + size; ++pos) {
+            const IdT id = ids.data[pos];
+            const CountT count = counts.data[pos];
+            auto it = count_dict.find(id);
+            if (it == count_dict.end()) {
+                count_dict.emplace(id, count);
+            } else {
+                it->second += count;
+            }
+        }
+    }
+
+    ids_out.resize(count_dict.size());
+    counts_out.resize(count_dict.size());
+    std::size_t i = 0;
+    for (const auto &elem : count_dict) {
+        ids_out[i] = elem.first;
+        counts_out[i] = elem.second;
+        ++i;
+    }
+    if (argsort) {
+        argsort_by_first(ids_out, counts_out);
+    }
+}
+
+// Block-aware variant: collect (offset, size) pairs for every spatial position
+// in the block, then call the flat overload. C-order strides over the *full*
+// spatial domain.
+template <class OffsetT, class IdT, class CountT>
+inline void read_subset_block(
+    const Block &block,
+    const std::vector<std::size_t> &strides,
+    const ConstArrayView<OffsetT> &offsets,
+    const ConstArrayView<OffsetT> &entry_sizes,
+    const ConstArrayView<OffsetT> &entry_offsets,
+    const ConstArrayView<IdT> &ids,
+    const ConstArrayView<CountT> &counts,
+    std::vector<IdT> &ids_out,
+    std::vector<CountT> &counts_out,
+    const bool argsort = true
+) {
+    std::unordered_map<IdT, CountT> count_dict;
+    const auto &begin = block.begin();
+    const auto &end = block.end();
+    const std::size_t ndim = begin.size();
+
+    // Generic N-D iteration via incrementing coordinate vector.
+    std::vector<Coordinate> coord = begin;
+    while (true) {
+        std::size_t index = 0;
+        for (std::size_t d = 0; d < ndim; ++d) {
+            index += static_cast<std::size_t>(coord[d]) * strides[d];
+        }
+        const std::size_t off = static_cast<std::size_t>(offsets.data[index]);
+        const std::size_t entry_idx = static_cast<std::size_t>(entry_offsets.data[index]);
+        const std::size_t size = static_cast<std::size_t>(entry_sizes.data[entry_idx]);
+        for (std::size_t pos = off; pos < off + size; ++pos) {
+            const IdT id = ids.data[pos];
+            const CountT count = counts.data[pos];
+            auto it = count_dict.find(id);
+            if (it == count_dict.end()) {
+                count_dict.emplace(id, count);
+            } else {
+                it->second += count;
+            }
+        }
+
+        // Increment N-D coord (last axis fastest).
+        std::ptrdiff_t axis = static_cast<std::ptrdiff_t>(ndim) - 1;
+        for (; axis >= 0; --axis) {
+            ++coord[axis];
+            if (coord[axis] < end[axis]) {
+                break;
+            }
+            coord[axis] = begin[axis];
+        }
+        if (axis < 0) {
+            break;
+        }
+    }
+
+    ids_out.resize(count_dict.size());
+    counts_out.resize(count_dict.size());
+    std::size_t i = 0;
+    for (const auto &elem : count_dict) {
+        ids_out[i] = elem.first;
+        counts_out[i] = elem.second;
+        ++i;
+    }
+    if (argsort) {
+        argsort_by_first(ids_out, counts_out);
+    }
+}
+
+inline std::vector<std::size_t> c_order_strides_for_shape(const CoordinateVector &shape) {
+    const std::size_t ndim = shape.size();
+    std::vector<std::size_t> strides(ndim, 1);
+    if (ndim == 0) {
+        return strides;
+    }
+    for (std::ptrdiff_t d = static_cast<std::ptrdiff_t>(ndim) - 2; d >= 0; --d) {
+        strides[d] = strides[d + 1] * static_cast<std::size_t>(shape[d + 1]);
+    }
+    return strides;
+}
+
+} // namespace bioimage_cpp::label_multiset
diff --git a/src/bindings/label_multiset.cxx b/src/bindings/label_multiset.cxx
new file mode 100644
index 0000000..a954b81
--- /dev/null
+++ b/src/bindings/label_multiset.cxx
@@ -0,0 +1,335 @@
+#include "label_multiset.hxx"
+
+#include "bioimage_cpp/array_view.hxx"
+#include "bioimage_cpp/blocking.hxx"
+#include "bioimage_cpp/label_multiset/downsample.hxx"
+#include "bioimage_cpp/label_multiset/from_labels.hxx"
+#include "bioimage_cpp/label_multiset/merger.hxx"
+#include "bioimage_cpp/label_multiset/read_subset.hxx"
+
+#include <nanobind/ndarray.h>
+#include <nanobind/stl/pair.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/vector.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+namespace nb = nanobind;
+
+namespace bioimage_cpp::bindings {
+namespace {
+
+using IdT = std::uint64_t;
+using CountT = std::uint32_t;
+using OffsetT = std::uint64_t;
+using LabelT = std::uint64_t;
+
+using OffsetArray = nb::ndarray<nb::numpy, const OffsetT, nb::c_contig>;
+using IdArray = nb::ndarray<nb::numpy, const IdT, nb::c_contig>;
+using CountArray = nb::ndarray<nb::numpy, const CountT, nb::c_contig>;
+using OffsetOutArray = nb::ndarray<nb::numpy, OffsetT, nb::c_contig>;
+using IdOutArray = nb::ndarray<nb::numpy, IdT, nb::c_contig>;
+using CountOutArray = nb::ndarray<nb::numpy, CountT, nb::c_contig>;
+
+template <class T>
+ConstArrayView<T> view_1d(const nb::ndarray<nb::numpy, const T, nb::c_contig> &arr,
+                          const char *name) {
+    if (arr.ndim() != 1) {
+        throw std::invalid_argument(
+            std::string(name) + " must have ndim 1, got ndim=" +
+            std::to_string(arr.ndim())
+        );
+    }
+    return ConstArrayView<T>{
+        arr.data(),
+        {static_cast<std::ptrdiff_t>(arr.shape(0))},
+        {},
+    };
+}
+
+template <class T>
+nb::ndarray<nb::numpy, T, nb::c_contig> alloc_1d(std::size_t n) {
+    auto *data = new T[n]();
+    nb::capsule owner(data, [](void *p) noexcept {
+        delete[] static_cast<T *>(p);
+    });
+    std::size_t shape[1] = {n};
+    return nb::ndarray<nb::numpy, T, nb::c_contig>(data, 1, shape, owner);
+}
+
+template <class T>
+nb::ndarray<nb::numpy, T, nb::c_contig> vector_to_array(std::vector<T> &&v) {
+    auto *data = new T[v.size()];
+    std::copy(v.begin(), v.end(), data);
+    nb::capsule owner(data, [](void *p) noexcept {
+        delete[] static_cast<T *>(p);
+    });
+    std::size_t shape[1] = {v.size()};
+    return nb::ndarray<nb::numpy, T, nb::c_contig>(data, 1, shape, owner);
+}
+
+// ----- read_subset -----
+
+std::pair<nb::ndarray<nb::numpy, IdT, nb::c_contig>,
+          nb::ndarray<nb::numpy, CountT, nb::c_contig>>
+bind_read_subset_flat(OffsetArray offsets, OffsetArray sizes,
+                      IdArray ids, CountArray counts,
+                      bool argsort) {
+    auto offsets_v = view_1d(offsets, "offsets");
+    auto sizes_v = view_1d(sizes, "sizes");
+    auto ids_v = view_1d(ids, "ids");
+    auto counts_v = view_1d(counts, "counts");
+    if (offsets_v.shape[0] != sizes_v.shape[0]) {
+        throw std::invalid_argument("offsets and sizes must have the same length");
+    }
+    if (ids_v.shape[0] != counts_v.shape[0]) {
+        throw std::invalid_argument("ids and counts must have the same length");
+    }
+
+    std::vector<IdT> ids_out;
+    std::vector<CountT> counts_out;
+    {
+        nb::gil_scoped_release release;
+        label_multiset::read_subset(offsets_v, sizes_v, ids_v, counts_v,
+                                    ids_out, counts_out, argsort);
+    }
+    return {vector_to_array(std::move(ids_out)),
+            vector_to_array(std::move(counts_out))};
+}
+
+// ----- downsample_multiset -----
+
+std::tuple<nb::ndarray<nb::numpy, IdT, nb::c_contig>,
+           nb::ndarray<nb::numpy, OffsetT, nb::c_contig>,
+           nb::ndarray<nb::numpy, OffsetT, nb::c_contig>,
+           nb::ndarray<nb::numpy, OffsetT, nb::c_contig>,
+           nb::ndarray<nb::numpy, IdT, nb::c_contig>,
+           nb::ndarray<nb::numpy, CountT, nb::c_contig>>
+bind_downsample_multiset(const Blocking &blocking,
+                         OffsetArray offsets,
+                         OffsetArray entry_sizes,
+                         OffsetArray entry_offsets,
+                         IdArray ids,
+                         CountArray counts,
+                         int restrict_set) {
+    auto offsets_v = view_1d(offsets, "offsets");
+    auto entry_sizes_v = view_1d(entry_sizes, "entry_sizes");
+    auto entry_offsets_v = view_1d(entry_offsets, "entry_offsets");
+    auto ids_v = view_1d(ids, "ids");
+    auto counts_v = view_1d(counts, "counts");
+    if (ids_v.shape[0] != counts_v.shape[0]) {
+        throw std::invalid_argument("ids and counts must have the same length");
+    }
+    if (offsets_v.shape[0] != entry_offsets_v.shape[0]) {
+        throw std::invalid_argument(
+            "offsets and entry_offsets must have the same length"
+        );
+    }
+
+    const std::size_t n_blocks = static_cast<std::size_t>(blocking.number_of_blocks());
+
+    auto new_argmax = alloc_1d<IdT>(n_blocks);
+    auto new_offsets = alloc_1d<OffsetT>(n_blocks);
+    auto new_entry_offsets = alloc_1d<OffsetT>(n_blocks);
+
+    std::vector<OffsetT> new_entry_sizes;
+    std::vector<IdT> new_ids;
+    std::vector<CountT> new_counts;
+
+    ArrayView<IdT> argmax_view{
+        new_argmax.data(),
+        {static_cast<std::ptrdiff_t>(n_blocks)}, {}};
+    ArrayView<OffsetT> off_view{
+        new_offsets.data(),
+        {static_cast<std::ptrdiff_t>(n_blocks)}, {}};
+    ArrayView<OffsetT> eoff_view{
+        new_entry_offsets.data(),
+        {static_cast<std::ptrdiff_t>(n_blocks)}, {}};
+
+    {
+        nb::gil_scoped_release release;
+        label_multiset::downsample_multiset(
+            blocking, offsets_v, entry_sizes_v, entry_offsets_v, ids_v, counts_v,
+            restrict_set,
+            argmax_view, off_view, eoff_view,
+            new_entry_sizes, new_ids, new_counts
+        );
+    }
+
+    return std::make_tuple(
+        std::move(new_argmax),
+        std::move(new_offsets),
+        std::move(new_entry_offsets),
+        vector_to_array(std::move(new_entry_sizes)),
+        vector_to_array(std::move(new_ids)),
+        vector_to_array(std::move(new_counts))
+    );
+}
+
+// ----- multiset_from_labels -----
+
+template <class L>
+std::tuple<nb::ndarray<nb::numpy, IdT, nb::c_contig>,
+           nb::ndarray<nb::numpy, OffsetT, nb::c_contig>,
+           nb::ndarray<nb::numpy, OffsetT, nb::c_contig>,
+           nb::ndarray<nb::numpy, OffsetT, nb::c_contig>,
+           nb::ndarray<nb::numpy, IdT, nb::c_contig>,
+           nb::ndarray<nb::numpy, CountT, nb::c_contig>>
+bind_multiset_from_labels_t(
+    nb::ndarray<nb::numpy, const L, nb::c_contig> labels,
+    const Blocking &blocking
+) {
+    if (labels.ndim() != blocking.ndim()) {
+        throw std::invalid_argument(
+            "labels ndim must match blocking ndim, got labels ndim=" +
+            std::to_string(labels.ndim())
+        );
+    }
+    std::vector<std::ptrdiff_t> shape(labels.ndim());
+    for (std::size_t a = 0; a < labels.ndim(); ++a) {
+        shape[a] = static_cast<std::ptrdiff_t>(labels.shape(a));
+        if (shape[a] != blocking.roi_end()[a]) {
+            throw std::invalid_argument(
+                "labels shape must match blocking.roi_end()"
+            );
+        }
+    }
+    ConstArrayView<L> labels_v{labels.data(), shape, {}};
+
+    const std::size_t n_blocks = static_cast<std::size_t>(blocking.number_of_blocks());
+    auto argmax = alloc_1d<IdT>(n_blocks);
+    auto offsets = alloc_1d<OffsetT>(n_blocks);
+    auto entry_offsets = alloc_1d<OffsetT>(n_blocks);
+
+    std::vector<OffsetT> entry_sizes;
+    std::vector<IdT> ids;
+    std::vector<CountT> counts;
+
+    ArrayView<IdT> argmax_view{argmax.data(),
+        {static_cast<std::ptrdiff_t>(n_blocks)}, {}};
+    ArrayView<OffsetT> off_view{offsets.data(),
+        {static_cast<std::ptrdiff_t>(n_blocks)}, {}};
+    ArrayView<OffsetT> eoff_view{entry_offsets.data(),
+        {static_cast<std::ptrdiff_t>(n_blocks)}, {}};
+
+    {
+        nb::gil_scoped_release release;
+        label_multiset::multiset_from_labels<L, OffsetT, IdT, CountT>(
+            labels_v, blocking,
+            argmax_view, off_view, eoff_view,
+            entry_sizes, ids, counts
+        );
+    }
+    return std::make_tuple(
+        std::move(argmax),
+        std::move(offsets),
+        std::move(entry_offsets),
+        vector_to_array(std::move(entry_sizes)),
+        vector_to_array(std::move(ids)),
+        vector_to_array(std::move(counts))
+    );
+}
+
+// ----- MultisetMerger -----
+
+class PyMultisetMerger {
+public:
+    PyMultisetMerger(OffsetArray offsets, OffsetArray entry_sizes,
+                     IdArray ids, CountArray counts)
+        : merger_(
+              view_1d(offsets, "offsets"),
+              view_1d(entry_sizes, "entry_sizes"),
+              view_1d(ids, "ids"),
+              view_1d(counts, "counts")
+          ) {}
+
+    nb::ndarray<nb::numpy, OffsetT, nb::c_contig>
+    update(OffsetArray unique_offsets, OffsetArray entry_sizes,
+           IdArray ids, CountArray counts, OffsetOutArray offsets) {
+        auto uo_v = view_1d(unique_offsets, "unique_offsets");
+        auto es_v = view_1d(entry_sizes, "entry_sizes");
+        auto ids_v = view_1d(ids, "ids");
+        auto counts_v = view_1d(counts, "counts");
+        if (offsets.ndim() != 1) {
+            throw std::invalid_argument("offsets must have ndim 1");
+        }
+        ArrayView<OffsetT> off_view{
+            offsets.data(),
+            {static_cast<std::ptrdiff_t>(offsets.shape(0))},
+            {}};
+        {
+            nb::gil_scoped_release release;
+            merger_.update(uo_v, es_v, ids_v, counts_v, off_view);
+        }
+        return offsets;
+    }
+
+    nb::ndarray<nb::numpy, IdT, nb::c_contig> ids_array() const {
+        return copy_to_array<IdT>(merger_.ids());
+    }
+    nb::ndarray<nb::numpy, CountT, nb::c_contig> counts_array() const {
+        return copy_to_array<CountT>(merger_.counts());
+    }
+    nb::ndarray<nb::numpy, OffsetT, nb::c_contig> offsets_array() const {
+        return copy_to_array<OffsetT>(merger_.offsets());
+    }
+    nb::ndarray<nb::numpy, OffsetT, nb::c_contig> entry_sizes_array() const {
+        return copy_to_array<OffsetT>(merger_.entry_sizes());
+    }
+
+private:
+    template <class T>
+    static nb::ndarray<nb::numpy, T, nb::c_contig> copy_to_array(const std::vector<T> &v) {
+        auto *data = new T[v.size()];
+        std::copy(v.begin(), v.end(), data);
+        nb::capsule owner(data, [](void *p) noexcept {
+            delete[] static_cast<T *>(p);
+        });
+        std::size_t shape[1] = {v.size()};
+        return nb::ndarray<nb::numpy, T, nb::c_contig>(data, 1, shape, owner);
+    }
+
+    label_multiset::MultisetMerger<OffsetT, IdT, CountT> merger_;
+};
+
+} // namespace
+
+void bind_label_multiset(nb::module_ &m) {
+    m.def("_read_subset", &bind_read_subset_flat,
+          nb::arg("offsets"), nb::arg("sizes"),
+          nb::arg("ids"), nb::arg("counts"),
+          nb::arg("argsort") = true);
+
+    m.def("_downsample_multiset", &bind_downsample_multiset,
+          nb::arg("blocking"),
+          nb::arg("offsets"), nb::arg("entry_sizes"), nb::arg("entry_offsets"),
+          nb::arg("ids"), nb::arg("counts"),
+          nb::arg("restrict_set") = -1);
+
+    m.def("_multiset_from_labels_u32", &bind_multiset_from_labels_t<std::uint32_t>,
+          nb::arg("labels"), nb::arg("blocking"));
+    m.def("_multiset_from_labels_u64", &bind_multiset_from_labels_t<std::uint64_t>,
+          nb::arg("labels"), nb::arg("blocking"));
+
+    nb::class_<PyMultisetMerger>(m, "_MultisetMerger")
+        .def(nb::init<OffsetArray, OffsetArray, IdArray, CountArray>(),
+             nb::arg("offsets"), nb::arg("entry_sizes"),
+             nb::arg("ids"), nb::arg("counts"))
+        .def("update", &PyMultisetMerger::update,
+             nb::arg("unique_offsets"), nb::arg("entry_sizes"),
+             nb::arg("ids"), nb::arg("counts"), nb::arg("offsets"))
+        .def("get_ids", &PyMultisetMerger::ids_array)
+        .def("get_counts", &PyMultisetMerger::counts_array)
+        .def("get_offsets", &PyMultisetMerger::offsets_array)
+        .def("get_entry_sizes", &PyMultisetMerger::entry_sizes_array);
+}
+
+} // namespace bioimage_cpp::bindings
diff --git a/src/bindings/label_multiset.hxx b/src/bindings/label_multiset.hxx
new file mode 100644
index 0000000..b670320
--- /dev/null
+++ b/src/bindings/label_multiset.hxx
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace bioimage_cpp::bindings {
+
+void bind_label_multiset(nanobind::module_ &m);
+
+} // namespace bioimage_cpp::bindings
diff --git a/src/bindings/module.cxx b/src/bindings/module.cxx
index 5281816..909524f 100644
--- a/src/bindings/module.cxx
+++ b/src/bindings/module.cxx
@@ -5,6 +5,7 @@
 #include "flow.hxx"
 #include "graph.hxx"
 #include "ground_truth.hxx"
+#include "label_multiset.hxx"
 #include "segmentation.hxx"
 #include "transformation.hxx"
 #include "util.hxx"
@@ -23,6 +24,7 @@ NB_MODULE(_core, m) {
     bioimage_cpp::bindings::bind_flow(m);
     bioimage_cpp::bindings::bind_graph(m);
     bioimage_cpp::bindings::bind_ground_truth(m);
+    bioimage_cpp::bindings::bind_label_multiset(m);
     bioimage_cpp::bindings::bind_segmentation(m);
     bioimage_cpp::bindings::bind_transformation(m);
     bioimage_cpp::bindings::bind_util(m);
diff --git a/src/bioimage_cpp/__init__.py b/src/bioimage_cpp/__init__.py
index 3a66762..aa9343e 100644
--- a/src/bioimage_cpp/__init__.py
+++ b/src/bioimage_cpp/__init__.py
@@ -66,6 +66,7 @@
 from . import filters
 from . import flow
 from . import graph
+from . import label_multiset
 from . import segmentation
 from . import transformation
 from . import utils
@@ -77,6 +78,7 @@
     "filters",
     "flow",
     "graph",
+    "label_multiset",
     "segmentation",
     "transformation",
     "utils",
diff --git a/src/bioimage_cpp/label_multiset/__init__.py b/src/bioimage_cpp/label_multiset/__init__.py
new file mode 100644
index 0000000..807f6a6
--- /dev/null
+++ b/src/bioimage_cpp/label_multiset/__init__.py
@@ -0,0 +1,260 @@
+"""Label multiset data structure.
+
+A label multiset is a sparse, deduplicated representation of label
+distributions over a grid of spatial blocks. For each spatial position
+(block) it stores a histogram of the labels in the corresponding fine
+region as a `(ids, counts)` pair, with identical histograms across
+different blocks pointing to the same shared storage.
+
+This is a re-implementation of the label-multiset utilities from
+`nifty.tools.label_multiset` with no external C++ dependencies.
+
+Storage layout (mirrors nifty):
+
+- ``offsets``         length ``n_spatial``: spatial position to byte offset into ``ids`` / ``counts``
+- ``entry_offsets``   length ``n_spatial``: spatial position to unique-entry index
+- ``entry_sizes``     length ``n_unique``: number of ``(id, count)`` pairs per entry
+- ``ids``             length ``total_elems``: concatenated label ids (sorted within each entry)
+- ``counts``          length ``total_elems``: counts aligned with ``ids``
+- ``argmax``          length ``n_spatial``: argmax label per spatial position
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+from .. import _core
+from .._core import Blocking
+
+_ID_DTYPE = np.dtype(np.uint64)
+_COUNT_DTYPE = np.dtype(np.uint32)
+_OFFSET_DTYPE = np.dtype(np.uint64)
+
+
+@dataclass
+class LabelMultiset:
+    """A deduplicated label-histogram representation over a spatial grid."""
+
+    argmax: np.ndarray  # shape (n_spatial,), dtype uint64
+    offsets: np.ndarray  # shape (n_spatial,), dtype uint64
+    entry_offsets: np.ndarray  # shape (n_spatial,), dtype uint64
+    entry_sizes: np.ndarray  # shape (n_unique,), dtype uint64
+    ids: np.ndarray  # shape (total_elems,), dtype uint64
+    counts: np.ndarray  # shape (total_elems,), dtype uint32
+
+    @property
+    def n_spatial(self) -> int:
+        return int(self.offsets.shape[0])
+
+    @property
+    def n_entries(self) -> int:
+        return int(self.entry_sizes.shape[0])
+
+    def entry(self, spatial_index: int) -> Tuple[np.ndarray, np.ndarray]:
+        """Return ``(ids, counts)`` for the multiset at ``spatial_index``."""
+        off = int(self.offsets[spatial_index])
+        entry_idx = int(self.entry_offsets[spatial_index])
+        size = int(self.entry_sizes[entry_idx])
+        return self.ids[off : off + size], self.counts[off : off + size]
+
+    def __getitem__(self, spatial_index: int) -> Tuple[np.ndarray, np.ndarray]:
+        return self.entry(spatial_index)
+
+
+def _as_blocking(
+    shape: Tuple[int, ...],
+    block_shape: Tuple[int, ...],
+) -> Blocking:
+    if len(shape) != len(block_shape):
+        raise ValueError(
+            f"shape and block_shape must have the same length, got "
+            f"{len(shape)} and {len(block_shape)}"
+        )
+    roi_begin = [0] * len(shape)
+    return Blocking(roi_begin, list(shape), list(block_shape))
+
+
+def multiset_from_labels(
+    labels: np.ndarray,
+    block_shape: Tuple[int, ...],
+) -> LabelMultiset:
+    """Build a level-0 label multiset by aggregating ``labels`` over blocks.
+
+    For each block of shape ``block_shape``, computes the label histogram of
+    the contained voxels. Identical histograms are deduplicated.
+    """
+    labels = np.ascontiguousarray(labels)
+    if labels.dtype == np.uint32:
+        fn = _core._multiset_from_labels_u32
+    elif labels.dtype == np.uint64:
+        fn = _core._multiset_from_labels_u64
+    else:
+        raise TypeError(
+            f"labels must have dtype uint32 or uint64, got {labels.dtype}"
+        )
+    blocking = _as_blocking(tuple(labels.shape), tuple(block_shape))
+    argmax, offsets, entry_offsets, entry_sizes, ids, counts = fn(labels, blocking)
+    return LabelMultiset(
+        argmax=argmax,
+        offsets=offsets,
+        entry_offsets=entry_offsets,
+        entry_sizes=entry_sizes,
+        ids=ids,
+        counts=counts,
+    )
+
+
+def downsample_multiset(
+    multiset: LabelMultiset,
+    blocking: Blocking,
+    restrict_set: int = -1,
+) -> LabelMultiset:
+    """Downsample ``multiset`` by aggregating its entries into ``blocking``'s blocks.
+
+    ``blocking`` must be defined over the same spatial extent as the input
+    multiset (i.e. ``prod(blocking.roi_end) == multiset.n_spatial``).
+    """
+    argmax, new_offsets, new_entry_offsets, new_entry_sizes, new_ids, new_counts = (
+        _core._downsample_multiset(
+            blocking,
+            multiset.offsets,
+            multiset.entry_sizes,
+            multiset.entry_offsets,
+            multiset.ids,
+            multiset.counts,
+            restrict_set,
+        )
+    )
+    return LabelMultiset(
+        argmax=argmax,
+        offsets=new_offsets,
+        entry_offsets=new_entry_offsets,
+        entry_sizes=new_entry_sizes,
+        ids=new_ids,
+        counts=new_counts,
+    )
+
+
+def read_subset(
+    offsets: np.ndarray,
+    sizes: np.ndarray,
+    ids: np.ndarray,
+    counts: np.ndarray,
+    argsort: bool = True,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Merge multisets located at the given ``(offset, size)`` ranges.
+
+    Returns the summed ``(ids, counts)``, sorted by id if ``argsort``.
+    """
+    offsets = np.ascontiguousarray(offsets, dtype=_OFFSET_DTYPE)
+    sizes = np.ascontiguousarray(sizes, dtype=_OFFSET_DTYPE)
+    ids = np.ascontiguousarray(ids, dtype=_ID_DTYPE)
+    counts = np.ascontiguousarray(counts, dtype=_COUNT_DTYPE)
+    return _core._read_subset(offsets, sizes, ids, counts, argsort)
+
+
+def _unique_offsets_of(multiset: "LabelMultiset") -> np.ndarray:
+    """For each unique entry of ``multiset``, return the byte offset into ids/counts."""
+    n = multiset.n_entries
+    out = np.empty(n, dtype=_OFFSET_DTYPE)
+    for e in range(n):
+        out[e] = multiset.offsets[np.where(multiset.entry_offsets == e)[0][0]]
+    return out
+
+
+class MultisetMerger:
+    """Stateful deduplicating merger for multisets produced in batches.
+
+    The constructor expects one offset *per unique entry* (i.e. arrays of
+    length ``n_unique``, not ``n_spatial``). Use :meth:`from_multiset` to
+    build one straight from a :class:`LabelMultiset`.
+
+    Call :meth:`update` with subsequent batches; each call extends the
+    internal storage with any genuinely new entries and rewrites the
+    passed-in ``offsets`` array so each spatial position points at its
+    final deduplicated byte offset.
+    """
+
+    def __init__(
+        self,
+        unique_offsets: np.ndarray,
+        entry_sizes: np.ndarray,
+        ids: np.ndarray,
+        counts: np.ndarray,
+    ) -> None:
+        unique_offsets = np.ascontiguousarray(unique_offsets, dtype=_OFFSET_DTYPE)
+        entry_sizes = np.ascontiguousarray(entry_sizes, dtype=_OFFSET_DTYPE)
+        if unique_offsets.shape != entry_sizes.shape:
+            raise ValueError(
+                "unique_offsets and entry_sizes must have the same length "
+                "(one entry each). Use MultisetMerger.from_multiset() if you "
+                "have a LabelMultiset instead."
+            )
+        self._impl = _core._MultisetMerger(
+            unique_offsets,
+            entry_sizes,
+            np.ascontiguousarray(ids, dtype=_ID_DTYPE),
+            np.ascontiguousarray(counts, dtype=_COUNT_DTYPE),
+        )
+
+    @classmethod
+    def from_multiset(cls, multiset: "LabelMultiset") -> "MultisetMerger":
+        """Build a merger seeded with the unique entries of ``multiset``."""
+        return cls(
+            _unique_offsets_of(multiset),
+            multiset.entry_sizes,
+            multiset.ids,
+            multiset.counts,
+        )
+
+    def update(
+        self,
+        unique_offsets: np.ndarray,
+        entry_sizes: np.ndarray,
+        ids: np.ndarray,
+        counts: np.ndarray,
+        offsets: np.ndarray,
+    ) -> np.ndarray:
+        """Ingest a batch of entries and rewrite ``offsets`` in-place.
+
+        ``offsets`` is mutated in-place and also returned.
+        """
+        if offsets.dtype != _OFFSET_DTYPE or not offsets.flags["C_CONTIGUOUS"]:
+            raise TypeError(
+                "offsets must be a contiguous uint64 array (it is modified in place)"
+            )
+        return self._impl.update(
+            np.ascontiguousarray(unique_offsets, dtype=_OFFSET_DTYPE),
+            np.ascontiguousarray(entry_sizes, dtype=_OFFSET_DTYPE),
+            np.ascontiguousarray(ids, dtype=_ID_DTYPE),
+            np.ascontiguousarray(counts, dtype=_COUNT_DTYPE),
+            offsets,
+        )
+
+    @property
+    def ids(self) -> np.ndarray:
+        return self._impl.get_ids()
+
+    @property
+    def counts(self) -> np.ndarray:
+        return self._impl.get_counts()
+
+    @property
+    def offsets(self) -> np.ndarray:
+        return self._impl.get_offsets()
+
+    @property
+    def entry_sizes(self) -> np.ndarray:
+        return self._impl.get_entry_sizes()
+
+
+__all__ = [
+    "LabelMultiset",
+    "MultisetMerger",
+    "downsample_multiset",
+    "multiset_from_labels",
+    "read_subset",
+]
diff --git a/tests/label_multiset/__init__.py b/tests/label_multiset/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/label_multiset/test_against_nifty.py b/tests/label_multiset/test_against_nifty.py
new file mode 100644
index 0000000..9ae7485
--- /dev/null
+++ b/tests/label_multiset/test_against_nifty.py
@@ -0,0 +1,149 @@
+"""Cross-check against nifty.tools.label_multiset (skipped if nifty missing)."""
+import numpy as np
+import pytest
+
+nifty_tools = pytest.importorskip("nifty.tools")
+
+from bioimage_cpp._core import Blocking as BicBlocking
+from bioimage_cpp.label_multiset import (
+    MultisetMerger,
+    downsample_multiset,
+    multiset_from_labels,
+    read_subset,
+)
+
+
+def _bic_to_nifty_counts(counts: np.ndarray) -> np.ndarray:
+    return counts.astype(np.int32)
+
+
+def _entries(ids: np.ndarray, counts: np.ndarray, offsets: np.ndarray,
+             entry_sizes: np.ndarray, entry_offsets: np.ndarray):
+    """Yield per-spatial-position (ids_tuple, counts_tuple) for comparison."""
+    n = offsets.shape[0]
+    out = []
+    for s in range(n):
+        off = int(offsets[s])
+        eidx = int(entry_offsets[s])
+        sz = int(entry_sizes[eidx])
+        out.append(
+            (tuple(int(x) for x in ids[off : off + sz]),
+             tuple(int(x) for x in counts[off : off + sz]))
+        )
+    return out
+
+
+def _entries_from_ms(ms):
+    return _entries(ms.ids, ms.counts, ms.offsets, ms.entry_sizes, ms.entry_offsets)
+
+
+def test_read_subset_matches_nifty():
+    rng = np.random.default_rng(0)
+    ids = rng.integers(0, 10, size=20, dtype=np.uint64)
+    counts = rng.integers(1, 5, size=20, dtype=np.uint32)
+    offsets = np.array([0, 5, 10, 15], dtype=np.uint64)
+    sizes = np.array([5, 5, 5, 5], dtype=np.uint64)
+
+    bic_ids, bic_counts = read_subset(offsets, sizes, ids, counts)
+
+    n_ids, n_counts = nifty_tools.readSubset(
+        offsets.astype(np.uint64),
+        sizes.astype(np.uint64),
+        ids.astype(np.uint64),
+        _bic_to_nifty_counts(counts),
+        True,
+    )
+    # Both should be sorted by id and equal element-wise (up to count dtype).
+    assert bic_ids.tolist() == n_ids.tolist()
+    assert bic_counts.tolist() == n_counts.astype(np.uint32).tolist()
+
+
+def test_downsample_matches_nifty_3d():
+    rng = np.random.default_rng(1)
+    labels = rng.integers(0, 8, size=(4, 4, 4), dtype=np.uint64)
+
+    # Build the level-0 multiset *both ways* and downsample with both.
+    ms0 = multiset_from_labels(labels, (1, 1, 1))
+
+    # nifty's downsampleMultiset signature.
+    n_blocking = nifty_tools.blocking(
+        roiBegin=[0, 0, 0], roiEnd=list(labels.shape), blockShape=[2, 2, 2]
+    )
+    bic_blocking = BicBlocking([0, 0, 0], list(labels.shape), [2, 2, 2])
+
+    # Convert our level-0 to nifty types for input.
+    n_offsets = ms0.offsets.astype(np.uint64)
+    n_entry_sizes = ms0.entry_sizes.astype(np.uint64)
+    n_entry_offsets = ms0.entry_offsets.astype(np.uint64)
+    n_ids = ms0.ids.astype(np.uint64)
+    n_counts = _bic_to_nifty_counts(ms0.counts)
+
+    n_argmax, n_new_offsets, n_new_ids, n_new_counts = nifty_tools.downsampleMultiset(
+        n_blocking, n_offsets, n_entry_sizes, n_entry_offsets, n_ids, n_counts,
+        restrict_set=-1,
+    )
+
+    bic = downsample_multiset(ms0, bic_blocking)
+
+    # Argmax should match.
+    assert bic.argmax.tolist() == n_argmax.tolist()
+
+    # Reconstruct entries from nifty's output. nifty returns flat
+    # (new_offsets pointing into new_ids/new_counts) but not new_entry_sizes —
+    # since nifty's downsample reorders/uniqifies offsets implicitly, we
+    # rebuild sizes via np.unique on offsets.
+    unique_off, inverse = np.unique(n_new_offsets, return_inverse=True)
+    # Sizes are the gaps between consecutive unique offsets, with the last
+    # entry extending to len(new_ids).
+    sizes_arr = np.diff(
+        np.concatenate([unique_off, np.array([n_new_ids.shape[0]], dtype=unique_off.dtype)])
+    )
+    nifty_entries = _entries(
+        n_new_ids,
+        n_new_counts.astype(np.uint32),
+        n_new_offsets.astype(np.uint64),
+        sizes_arr.astype(np.uint64),
+        inverse.astype(np.uint64),
+    )
+    bic_entries = _entries_from_ms(bic)
+    assert bic_entries == nifty_entries
+
+
+def test_multiset_merger_matches_nifty():
+    rng = np.random.default_rng(2)
+    labels = rng.integers(0, 4, size=(4, 4), dtype=np.uint64)
+    ms = multiset_from_labels(labels, (2, 2))
+
+    unique_offsets = np.array(
+        [int(ms.offsets[np.where(ms.entry_offsets == e)[0][0]])
+         for e in range(ms.n_entries)],
+        dtype=np.uint64,
+    )
+
+    # Both constructors expect (unique_offsets, entry_sizes, ids, counts).
+    bic_merger = MultisetMerger.from_multiset(ms)
+    n_merger = nifty_tools.MultisetMerger(
+        unique_offsets,
+        ms.entry_sizes.astype(np.uint64),
+        ms.ids.astype(np.uint64),
+        _bic_to_nifty_counts(ms.counts),
+    )
+
+    # Ingest the same multiset as a batch — everything should be deduped.
+    spatial_offsets_bic = ms.entry_offsets.astype(np.uint64).copy()
+    spatial_offsets_n = ms.entry_offsets.astype(np.uint64).copy()
+
+    bic_merger.update(
+        unique_offsets, ms.entry_sizes.astype(np.uint64),
+        ms.ids, ms.counts, spatial_offsets_bic,
+    )
+    n_merger.update(
+        unique_offsets, ms.entry_sizes.astype(np.uint64),
+        ms.ids.astype(np.uint64), _bic_to_nifty_counts(ms.counts),
+        spatial_offsets_n,
+    )
+
+    assert bic_merger.ids.tolist() == ms.ids.tolist()
+    assert bic_merger.counts.tolist() == ms.counts.tolist()
+    assert n_merger.get_ids().tolist() == ms.ids.tolist()
+    assert spatial_offsets_bic.tolist() == spatial_offsets_n.tolist()
diff --git a/tests/label_multiset/test_downsample.py b/tests/label_multiset/test_downsample.py
new file mode 100644
index 0000000..a0242b1
--- /dev/null
+++ b/tests/label_multiset/test_downsample.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pytest
+
+from bioimage_cpp._core import Blocking
+from bioimage_cpp.label_multiset import (
+    LabelMultiset,
+    downsample_multiset,
+    multiset_from_labels,
+)
+
+
+def _python_block_histograms(labels: np.ndarray, block_shape):
+    """Compute per-block (sorted_ids, counts) histograms in pure Python."""
+    shape = labels.shape
+    ndim = labels.ndim
+    blocks_per_axis = [
+        (shape[a] + block_shape[a] - 1) // block_shape[a] for a in range(ndim)
+    ]
+    blocks = []
+    if ndim == 2:
+        for by in range(blocks_per_axis[0]):
+            for bx in range(blocks_per_axis[1]):
+                y0, y1 = by * block_shape[0], min((by + 1) * block_shape[0], shape[0])
+                x0, x1 = bx * block_shape[1], min((bx + 1) * block_shape[1], shape[1])
+                vals, counts = np.unique(labels[y0:y1, x0:x1], return_counts=True)
+                blocks.append((tuple(int(v) for v in vals), tuple(int(c) for c in counts)))
+    elif ndim == 3:
+        for bz in range(blocks_per_axis[0]):
+            for by in range(blocks_per_axis[1]):
+                for bx in range(blocks_per_axis[2]):
+                    z0 = bz * block_shape[0]; z1 = min((bz + 1) * block_shape[0], shape[0])
+                    y0 = by * block_shape[1]; y1 = min((by + 1) * block_shape[1], shape[1])
+                    x0 = bx * block_shape[2]; x1 = min((bx + 1) * block_shape[2], shape[2])
+                    vals, counts = np.unique(
+                        labels[z0:z1, y0:y1, x0:x1], return_counts=True
+                    )
+                    blocks.append(
+                        (tuple(int(v) for v in vals), tuple(int(c) for c in counts))
+                    )
+    else:
+        raise ValueError("only 2D and 3D in test")
+    return blocks
+
+
+def _entries_as_dict(ms: LabelMultiset):
+    return [
+        (tuple(int(i) for i in ms.entry(s)[0]), tuple(int(c) for c in ms.entry(s)[1]))
+        for s in range(ms.n_spatial)
+    ]
+
+
+@pytest.mark.parametrize("seed", [0, 1, 2])
+def test_multiset_from_labels_2d_matches_python(seed):
+    rng = np.random.default_rng(seed)
+    labels = rng.integers(0, 7, size=(6, 8), dtype=np.uint64)
+    block_shape = (2, 2)
+
+    ms = multiset_from_labels(labels, block_shape)
+    expected = _python_block_histograms(labels, block_shape)
+    actual = _entries_as_dict(ms)
+    assert actual == expected
+
+
+def test_multiset_from_labels_3d_matches_python():
+    rng = np.random.default_rng(42)
+    labels = rng.integers(0, 5, size=(4, 6, 8), dtype=np.uint32)
+    block_shape = (2, 2, 2)
+
+    ms = multiset_from_labels(labels, block_shape)
+    expected = _python_block_histograms(labels, block_shape)
+    actual = _entries_as_dict(ms)
+    assert actual == expected
+
+
+def test_downsample_equivalent_to_block_aggregation():
+    # Building level-0 with block_shape (1,1) and then downsampling by (2,2)
+    # must give the same result as building directly with block_shape (2,2).
+    rng = np.random.default_rng(7)
+    labels = rng.integers(0, 4, size=(8, 6), dtype=np.uint64)
+    ms0 = multiset_from_labels(labels, (1, 1))
+    blocking = Blocking([0, 0], list(labels.shape), [2, 2])
+    ms1 = downsample_multiset(ms0, blocking)
+    ms_direct = multiset_from_labels(labels, (2, 2))
+
+    assert _entries_as_dict(ms1) == _entries_as_dict(ms_direct)
+    assert ms1.argmax.tolist() == ms_direct.argmax.tolist()
+
+
+def test_downsample_3d_two_levels():
+    rng = np.random.default_rng(11)
+    labels = rng.integers(0, 6, size=(8, 8, 8), dtype=np.uint64)
+    ms0 = multiset_from_labels(labels, (1, 1, 1))
+
+    b1 = Blocking([0, 0, 0], [8, 8, 8], [2, 2, 2])
+    ms1 = downsample_multiset(ms0, b1)
+    expected1 = _python_block_histograms(labels, (2, 2, 2))
+    assert _entries_as_dict(ms1) == expected1
+
+    b2 = Blocking([0, 0, 0], [4, 4, 4], [2, 2, 2])
+    ms2 = downsample_multiset(ms1, b2)
+    expected2 = _python_block_histograms(labels, (4, 4, 4))
+    assert _entries_as_dict(ms2) == expected2
+
+
+def test_downsample_dedup_reduces_entries():
+    # All-zero volume — every block is identical, so n_entries must collapse to 1.
+    labels = np.zeros((8, 8), dtype=np.uint64)
+    ms = multiset_from_labels(labels, (2, 2))
+    assert ms.n_spatial == 16
+    assert ms.n_entries == 1
+    assert ms.argmax.tolist() == [0] * 16
+
+
+def test_downsample_restrict_set_keeps_top_k():
+    # 4x4 block with labels [0]*15 + [1]; restrict_set=1 should keep only label 0.
+    labels = np.zeros((4, 4), dtype=np.uint64)
+    labels[3, 3] = 1
+    ms_full = multiset_from_labels(labels, (1, 1))
+    b = Blocking([0, 0], [4, 4], [4, 4])
+    ms_top1 = downsample_multiset(ms_full, b, restrict_set=1)
+    assert ms_top1.n_spatial == 1
+    ids, counts = ms_top1.entry(0)
+    assert ids.tolist() == [0]
+    assert counts.tolist() == [15]
+    assert ms_top1.argmax.tolist() == [0]
diff --git a/tests/label_multiset/test_merger.py b/tests/label_multiset/test_merger.py
new file mode 100644
index 0000000..58e568e
--- /dev/null
+++ b/tests/label_multiset/test_merger.py
@@ -0,0 +1,129 @@
+import numpy as np
+
+from bioimage_cpp._core import Blocking
+from bioimage_cpp.label_multiset import (
+    MultisetMerger,
+    downsample_multiset,
+    multiset_from_labels,
+)
+
+
+def _unique_offsets(ms):
+    """For each unique entry, return the offset where it lives in ms.ids."""
+    return np.array(
+        [int(ms.offsets[np.where(ms.entry_offsets == e)[0][0]])
+         for e in range(ms.n_entries)],
+        dtype=np.uint64,
+    )
+
+
+def test_merger_init_roundtrips_inputs():
+    offsets = np.array([0, 2], dtype=np.uint64)
+    entry_sizes = np.array([2, 1], dtype=np.uint64)
+    ids = np.array([1, 2, 5], dtype=np.uint64)
+    counts = np.array([3, 4, 7], dtype=np.uint32)
+
+    m = MultisetMerger(offsets, entry_sizes, ids, counts)
+    assert m.ids.tolist() == [1, 2, 5]
+    assert m.counts.tolist() == [3, 4, 7]
+    assert m.offsets.tolist() == [0, 2]
+    assert m.entry_sizes.tolist() == [2, 1]
+
+
+def test_merger_deduplicates_identical_entry():
+    # Existing storage has entry A: ids=[1,2], counts=[3,4]
+    offsets = np.array([0], dtype=np.uint64)
+    entry_sizes = np.array([2], dtype=np.uint64)
+    ids = np.array([1, 2], dtype=np.uint64)
+    counts = np.array([3, 4], dtype=np.uint32)
+    m = MultisetMerger(offsets, entry_sizes, ids, counts)
+
+    # New batch: one entry identical to A, one genuinely new.
+    batch_unique_offsets = np.array([0, 2], dtype=np.uint64)
+    batch_entry_sizes = np.array([2, 1], dtype=np.uint64)
+    batch_ids = np.array([1, 2, 9], dtype=np.uint64)
+    batch_counts = np.array([3, 4, 11], dtype=np.uint32)
+    spatial_offsets = np.array([0, 1], dtype=np.uint64)
+
+    rewritten = m.update(
+        batch_unique_offsets, batch_entry_sizes, batch_ids, batch_counts,
+        spatial_offsets,
+    )
+    assert rewritten.tolist() == [0, 2]
+    assert m.ids.tolist() == [1, 2, 9]
+    assert m.counts.tolist() == [3, 4, 11]
+    assert m.offsets.tolist() == [0, 2]
+    assert m.entry_sizes.tolist() == [2, 1]
+
+
+def test_merger_update_with_identical_entries_is_idempotent():
+    # Building a merger from a multiset and feeding the same multiset back as
+    # an update should leave ids/counts unchanged and rewrite spatial offsets
+    # to point at the original byte offsets.
+    rng = np.random.default_rng(3)
+    labels = rng.integers(0, 4, size=(4, 4), dtype=np.uint64)
+    ms = multiset_from_labels(labels, (2, 2))
+
+    m = MultisetMerger.from_multiset(ms)
+    spatial_in = ms.entry_offsets.astype(np.uint64).copy()
+    rewritten = m.update(
+        _unique_offsets(ms),
+        ms.entry_sizes.astype(np.uint64),
+        ms.ids, ms.counts,
+        spatial_in,
+    )
+    assert m.ids.tolist() == ms.ids.tolist()
+    assert m.counts.tolist() == ms.counts.tolist()
+    assert rewritten.tolist() == ms.offsets.tolist()
+
+
+def test_merger_from_multiset_with_real_dedup():
+    # Build a multiset that actually deduplicates (all-zero volume → 1 entry,
+    # many spatial positions). Ensure from_multiset() correctly extracts the
+    # unique offsets and the merger handles the size mismatch (n_spatial >>
+    # n_unique).
+    labels = np.zeros((8, 8), dtype=np.uint64)
+    ms = multiset_from_labels(labels, (2, 2))
+    assert ms.n_spatial == 16
+    assert ms.n_entries == 1
+
+    m = MultisetMerger.from_multiset(ms)
+    assert m.entry_sizes.tolist() == [1]
+    assert m.ids.tolist() == [0]
+    assert m.counts.tolist() == [4]
+
+    # Update with the same multiset — must remain idempotent.
+    spatial = ms.entry_offsets.astype(np.uint64).copy()
+    rewritten = m.update(
+        _unique_offsets(ms),
+        ms.entry_sizes.astype(np.uint64),
+        ms.ids, ms.counts,
+        spatial,
+    )
+    assert m.entry_sizes.tolist() == [1]
+    assert rewritten.tolist() == [0] * 16
+
+
+def test_merger_grows_on_disjoint_update():
+    # Two non-overlapping multisets fed in sequence — every entry from the
+    # second must be appended.
+    offsets_a = np.array([0, 1], dtype=np.uint64)
+    entry_sizes_a = np.array([1, 1], dtype=np.uint64)
+    ids_a = np.array([10, 20], dtype=np.uint64)
+    counts_a = np.array([5, 7], dtype=np.uint32)
+
+    m = MultisetMerger(offsets_a, entry_sizes_a, ids_a, counts_a)
+
+    unique_offsets_b = np.array([0, 1], dtype=np.uint64)
+    entry_sizes_b = np.array([1, 1], dtype=np.uint64)
+    ids_b = np.array([30, 40], dtype=np.uint64)
+    counts_b = np.array([9, 11], dtype=np.uint32)
+    spatial_b = np.array([0, 1], dtype=np.uint64)
+
+    rewritten = m.update(
+        unique_offsets_b, entry_sizes_b, ids_b, counts_b, spatial_b,
+    )
+    assert m.ids.tolist() == [10, 20, 30, 40]
+    assert m.counts.tolist() == [5, 7, 9, 11]
+    assert m.offsets.tolist() == [0, 1, 2, 3]
+    assert rewritten.tolist() == [2, 3]
diff --git a/tests/label_multiset/test_read_subset.py b/tests/label_multiset/test_read_subset.py
new file mode 100644
index 0000000..fad1e61
--- /dev/null
+++ b/tests/label_multiset/test_read_subset.py
@@ -0,0 +1,75 @@
+import numpy as np
+import pytest
+
+from bioimage_cpp.label_multiset import read_subset
+
+
+def test_read_subset_basic_merge():
+    # Two ranges over a flat ids/counts buffer.
+    #  range 0: ids=[1,2], counts=[3,4]
+    #  range 1: ids=[2,5], counts=[7,1]
+    # merged: {1: 3, 2: 11, 5: 1}
+    ids = np.array([1, 2, 2, 5], dtype=np.uint64)
+    counts = np.array([3, 4, 7, 1], dtype=np.uint32)
+    offsets = np.array([0, 2], dtype=np.uint64)
+    sizes = np.array([2, 2], dtype=np.uint64)
+
+    out_ids, out_counts = read_subset(offsets, sizes, ids, counts)
+    assert out_ids.tolist() == [1, 2, 5]
+    assert out_counts.tolist() == [3, 11, 1]
+
+
+def test_read_subset_single_range():
+    ids = np.array([7, 7, 9], dtype=np.uint64)
+    counts = np.array([2, 5, 3], dtype=np.uint32)
+    offsets = np.array([0], dtype=np.uint64)
+    sizes = np.array([3], dtype=np.uint64)
+
+    out_ids, out_counts = read_subset(offsets, sizes, ids, counts)
+    assert out_ids.tolist() == [7, 9]
+    assert out_counts.tolist() == [7, 3]
+
+
+def test_read_subset_no_argsort_returns_same_elements():
+    ids = np.array([1, 2, 5], dtype=np.uint64)
+    counts = np.array([3, 4, 1], dtype=np.uint32)
+    offsets = np.array([0], dtype=np.uint64)
+    sizes = np.array([3], dtype=np.uint64)
+
+    sorted_ids, sorted_counts = read_subset(offsets, sizes, ids, counts, argsort=True)
+    unsorted_ids, unsorted_counts = read_subset(
+        offsets, sizes, ids, counts, argsort=False
+    )
+    # Same multiset regardless of order.
+    assert sorted(sorted_ids.tolist()) == sorted(unsorted_ids.tolist())
+    assert sum(sorted_counts.tolist()) == sum(unsorted_counts.tolist())
+
+
+def test_read_subset_empty_range_list():
+    ids = np.array([1, 2], dtype=np.uint64)
+    counts = np.array([3, 4], dtype=np.uint32)
+    offsets = np.array([], dtype=np.uint64)
+    sizes = np.array([], dtype=np.uint64)
+
+    out_ids, out_counts = read_subset(offsets, sizes, ids, counts)
+    assert out_ids.shape == (0,)
+    assert out_counts.shape == (0,)
+
+
+def test_read_subset_dtype_promotion():
+    # Int Python lists / wrong dtypes should be safely coerced.
+    out_ids, out_counts = read_subset([0, 2], [2, 1], [1, 2, 3], [4, 5, 6])
+    assert out_ids.tolist() == [1, 2, 3]
+    assert out_counts.tolist() == [4, 5, 6]
+
+
+def test_read_subset_overlapping_ranges_double_count():
+    # Same range listed twice should double the counts.
+    ids = np.array([1, 2], dtype=np.uint64)
+    counts = np.array([3, 4], dtype=np.uint32)
+    offsets = np.array([0, 0], dtype=np.uint64)
+    sizes = np.array([2, 2], dtype=np.uint64)
+
+    out_ids, out_counts = read_subset(offsets, sizes, ids, counts)
+    assert out_ids.tolist() == [1, 2]
+    assert out_counts.tolist() == [6, 8]