diff --git a/src/hardlink/aware.rs b/src/hardlink/aware.rs index 36dedbc0..688a85ec 100644 --- a/src/hardlink/aware.rs +++ b/src/hardlink/aware.rs @@ -82,8 +82,9 @@ where })); let ino = InodeNumber::get(stats); + let dev = stats.dev(); self.record - .add(ino, size, links, path) + .add(ino, dev, size, links, path) .map_err(ReportHardlinksError::AddToRecord) } } diff --git a/src/hardlink/hardlink_list.rs b/src/hardlink/hardlink_list.rs index c955de8e..02efe145 100644 --- a/src/hardlink/hardlink_list.rs +++ b/src/hardlink/hardlink_list.rs @@ -20,6 +20,20 @@ use pipe_trait::Pipe; #[cfg(any(unix, test))] use std::path::Path; +/// Internal key used to uniquely identify an inode across all filesystems. +/// +/// Hardlinks cannot span filesystems, so including the device number prevents +/// false deduplication of files from different filesystems that happen to share +/// the same inode number. Both du-dust and dua-cli track `(device, inode)` pairs +/// for the same reason. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +struct InodeKey { + /// Device number of the filesystem the inode belongs to. + dev: u64, + /// Inode number within the device. + ino: InodeNumber, +} + /// Map value in [`HardlinkList`]. #[derive(Debug, Clone)] struct Value { @@ -38,8 +52,8 @@ struct Value { /// [`Reflection`] which implement these traits. #[derive(Debug, SmartDefault, Clone)] pub struct HardlinkList( - /// Map an inode number to its size, number of links, and detected paths. - DashMap>, + /// Map an inode key (device + inode number) to its size, number of links, and detected paths. + DashMap>, ); impl HardlinkList { @@ -112,13 +126,15 @@ where pub(crate) fn add( &self, ino: InodeNumber, + dev: u64, size: Size, links: u64, path: &Path, ) -> Result<(), AddError> { + let key = InodeKey { dev, ino }; let mut assertions = Ok(()); self.0 - .entry(ino) + .entry(key) .and_modify(|recorded| { if size != recorded.size { assertions = Err(AddError::SizeConflict(SizeConflictError { diff --git a/src/hardlink/hardlink_list/iter.rs b/src/hardlink/hardlink_list/iter.rs index 4b2c1b3c..69d0f10c 100644 --- a/src/hardlink/hardlink_list/iter.rs +++ b/src/hardlink/hardlink_list/iter.rs @@ -1,4 +1,4 @@ -use super::{HardlinkList, Value}; +use super::{HardlinkList, InodeKey, Value}; use crate::{hardlink::LinkPathList, inode::InodeNumber}; use dashmap::{iter::Iter as DashIter, mapref::multiple::RefMulti}; use pipe_trait::Pipe; @@ -7,7 +7,7 @@ use pipe_trait::Pipe; #[derive(derive_more::Debug)] #[debug(bound())] #[debug("Iter(..)")] -pub struct Iter<'a, Size>(DashIter<'a, InodeNumber, Value>); +pub struct Iter<'a, Size>(DashIter<'a, InodeKey, Value>); impl HardlinkList { /// Iterate over the recorded entries. @@ -20,7 +20,7 @@ impl HardlinkList { #[derive(derive_more::Debug)] #[debug(bound())] #[debug("Item(..)")] -pub struct Item<'a, Size>(RefMulti<'a, InodeNumber, Value>); +pub struct Item<'a, Size>(RefMulti<'a, InodeKey, Value>); impl<'a, Size> Iterator for Iter<'a, Size> { type Item = Item<'a, Size>; @@ -33,7 +33,7 @@ impl<'a, Size> Item<'a, Size> { /// The inode number of the file. #[inline] pub fn ino(&self) -> InodeNumber { - *self.0.key() + self.0.key().ino } /// Size of the file. diff --git a/src/hardlink/hardlink_list/reflection.rs b/src/hardlink/hardlink_list/reflection.rs index c190041b..e19ca084 100644 --- a/src/hardlink/hardlink_list/reflection.rs +++ b/src/hardlink/hardlink_list/reflection.rs @@ -1,4 +1,4 @@ -use super::{HardlinkList, Value}; +use super::{HardlinkList, InodeKey, Value}; use crate::{hardlink::LinkPathListReflection, inode::InodeNumber}; use dashmap::DashMap; use derive_more::{Display, Error, Into, IntoIterator}; @@ -12,8 +12,14 @@ use serde::{Deserialize, Serialize}; /// internal content. /// /// **Guarantees:** -/// * Every inode number is unique. -/// * The internal list is always sorted by inode numbers. +/// * Every `(device, inode)` pair is unique within the scope of a single scan, but inode +/// numbers alone are **not** guaranteed to be unique: when scanning multiple filesystems, +/// two unrelated files on different devices can share the same inode number and will each +/// produce a separate entry. The reflection stores only the inode number (the JSON format +/// does not carry device information), so round-tripping a multi-filesystem scan through +/// JSON is an unsupported edge case. +/// * The internal list is always sorted by inode numbers (and by device number as a +/// tie-breaker when two entries share the same inode number). /// /// **Equality:** `Reflection` implements `PartialEq` and `Eq` traits. /// @@ -95,10 +101,17 @@ impl From>> for Reflection { impl From> for Reflection { fn from(HardlinkList(list): HardlinkList) -> Self { - list.into_iter() - .map(|(ino, value)| ReflectionEntry::new(ino, value)) + // Collect to a vec, sort by (ino, dev) for a stable, deterministic order, then + // strip dev before wrapping. Sorting here (with dev still available) avoids the + // nondeterminism that would arise from an unstable sort on ino alone when two + // entries from different filesystems share the same inode number. + let mut pairs: Vec<(InodeKey, Value)> = list.into_iter().collect(); + pairs.sort_unstable_by_key(|(key, _)| (u64::from(key.ino), key.dev)); + pairs + .into_iter() + .map(|(key, value)| ReflectionEntry::new(key.ino, value)) .collect::>() - .pipe(Reflection::from) + .pipe(Reflection) } } @@ -119,7 +132,15 @@ impl TryFrom> for HardlinkList { for entry in entries { let (ino, value) = entry.dissolve(); - if map.insert(ino, value).is_some() { + // Device number is unknown when loading from a reflection (e.g. JSON input); + // use dev=0 as a placeholder. This means that when reloading JSON output that + // was produced by scanning multiple filesystems, files from different devices + // sharing the same inode number cannot be distinguished and therefore cannot + // all be represented. Such duplicates cause a ConversionError::DuplicatedInode + // and are treated as an unsupported edge case, since the JSON format does not + // carry device information. + let key = InodeKey { dev: 0, ino }; + if map.insert(key, value).is_some() { return ino.pipe(ConversionError::DuplicatedInode).pipe(Err); } } diff --git a/src/hardlink/hardlink_list/test.rs b/src/hardlink/hardlink_list/test.rs index 8e6878d2..b98845a6 100644 --- a/src/hardlink/hardlink_list/test.rs +++ b/src/hardlink/hardlink_list/test.rs @@ -3,21 +3,22 @@ use crate::size::Bytes; use pipe_trait::Pipe; use pretty_assertions::{assert_eq, assert_ne}; -const TABLE: &[(u64, u64, u64, &str)] = &[ - (241, 3652, 1, "a"), - (569, 2210, 1, "b"), - (110, 2350, 3, "c"), - (110, 2350, 3, "c1"), - (778, 1110, 1, "d"), - (274, 6060, 2, "e"), - (274, 6060, 2, "e1"), - (883, 4530, 1, "f"), +const TABLE: &[(u64, u64, u64, u64, &str)] = &[ + // dev, ino, size, links, path + (0, 241, 3652, 1, "a"), + (0, 569, 2210, 1, "b"), + (0, 110, 2350, 3, "c"), + (0, 110, 2350, 3, "c1"), + (0, 778, 1110, 1, "d"), + (0, 274, 6060, 2, "e"), + (0, 274, 6060, 2, "e1"), + (0, 883, 4530, 1, "f"), ]; fn add(list: HardlinkList) -> HardlinkList { let values = TABLE[ROW]; - let (ino, size, links, path) = values; - if let Err(error) = list.add(ino.into(), size.into(), links, path.as_ref()) { + let (dev, ino, size, links, path) = values; + if let Err(error) = list.add(ino.into(), dev, size.into(), links, path.as_ref()) { panic!("Failed to add {values:?} (index: {ROW}) to the list: {error}"); } list @@ -119,10 +120,10 @@ fn insertion_difference_cause_inequality() { #[test] fn detect_size_change() { let list = HardlinkList::::new(); - list.add(123.into(), 100.into(), 1, "a".as_ref()) + list.add(123.into(), 0, 100.into(), 1, "a".as_ref()) .expect("add the first path"); let actual = list - .add(123.into(), 110.into(), 1, "b".as_ref()) + .add(123.into(), 0, 110.into(), 1, "b".as_ref()) .expect_err("add the second path"); let expected = AddError::SizeConflict(SizeConflictError { ino: 123.into(), @@ -135,10 +136,10 @@ fn detect_size_change() { #[test] fn detect_number_of_links_change() { let list = HardlinkList::::new(); - list.add(123.into(), 100.into(), 1, "a".as_ref()) + list.add(123.into(), 0, 100.into(), 1, "a".as_ref()) .expect("add the first path"); let actual = list - .add(123.into(), 100.into(), 2, "b".as_ref()) + .add(123.into(), 0, 100.into(), 2, "b".as_ref()) .expect_err("add the second path"); let expected = AddError::NumberOfLinksConflict(NumberOfLinksConflictError { ino: 123.into(), @@ -147,3 +148,38 @@ fn detect_number_of_links_change() { }); assert_eq!(actual, expected); } + +/// Files on different devices may share the same inode number, but they are +/// unrelated — hardlinks cannot span filesystem boundaries. Verify that two +/// files with the same inode number but different device numbers produce +/// separate entries in the list (i.e. the device number is actually used in +/// the deduplication key). +#[test] +fn same_ino_on_different_devices_are_treated_separately() { + let list = HardlinkList::::new(); + + // dev=1, ino=100 — first filesystem + list.add(100.into(), 1, 50.into(), 2, "dev1/file_a".as_ref()) + .expect("add dev1/file_a"); + list.add(100.into(), 1, 50.into(), 2, "dev1/file_b".as_ref()) + .expect("add dev1/file_b (same dev+ino → same inode group)"); + + // dev=2, ino=100 — second filesystem, coincidentally same inode number + list.add(100.into(), 2, 80.into(), 2, "dev2/file_c".as_ref()) + .expect("add dev2/file_c (different dev → separate inode group)"); + list.add(100.into(), 2, 80.into(), 2, "dev2/file_d".as_ref()) + .expect("add dev2/file_d (same dev+ino → same inode group as file_c)"); + + // Each device should produce its own entry, so the list should have 2 entries. + assert_eq!(list.len(), 2, "expected one entry per (dev, ino) pair"); + + let reflection = list.into_reflection(); + // Both entries expose ino=100 in the reflection (device is not part of the + // public JSON format), so there are still 2 entries in the vector. + assert_eq!(reflection.len(), 2); + + // Paths are grouped per (dev, ino): each group has exactly 2 paths. + for entry in reflection.iter() { + assert_eq!(entry.paths.len(), 2); + } +}