From 79d5e0245638a7b436e100d7c6bd30bdfcf4e297 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 06:41:53 -0700 Subject: [PATCH 01/23] Implement minimal multi-process support (fork/exec/waitpid/pipe) Add single-host multi-process support for the Linux userland platform, enabling piped command execution (e.g., echo hello | cat) within a single host process. All forks use vfork semantics: parent suspends while child runs in shared address space, child detaches to its own VA partition on exec. Key changes: - ProcessRegistry for process lifecycle (create, exit, waitpid) - AddressSpaceProvider trait + VA partition allocator (128x1TiB) - GlobalState/ProcessState split (per-process PageManager) - do_fork with vfork semantics and VforkDone futex signaling - Exec detach to new address space before loading new binary - Fork-aware FD close (Arc refcount prevents cross-process destruction) - FD cleanup on process exit for proper pipe EOF detection - vfork syscall handler, Wait4 syscall dispatch - PIE-only children (dynamic ELF load hint via pm.addr_min()) Tested with fork+exec+waitpid and pipe-between-two-children (echo|cat). No regressions in existing test suite. 
--- litebox/src/fd/mod.rs | 73 ++- litebox/src/lib.rs | 1 + litebox/src/litebox.rs | 9 + litebox/src/mm/linux.rs | 68 +- litebox/src/mm/mod.rs | 27 +- litebox/src/platform/address_space.rs | 92 +++ litebox/src/platform/mock.rs | 6 + litebox/src/platform/mod.rs | 9 +- litebox/src/process/mod.rs | 598 ++++++++++++++++++ litebox/src/shim.rs | 5 + litebox_common_linux/src/lib.rs | 39 ++ litebox_platform_linux_kernel/src/lib.rs | 11 +- litebox_platform_linux_userland/src/lib.rs | 113 ++++ litebox_runner_linux_userland/src/lib.rs | 13 +- .../tests/common/mod.rs | 18 +- .../tests/multiprocess/cat_stdin.c | 14 + .../tests/multiprocess/echo_hello.c | 11 + .../tests/multiprocess/exit_with.c | 10 + .../tests/multiprocess/fork_exec_wait.c | 45 ++ .../tests/multiprocess/pipe_fork.c | 80 +++ litebox_runner_linux_userland/tests/run.rs | 64 ++ litebox_shim_linux/src/lib.rs | 55 +- litebox_shim_linux/src/loader/elf.rs | 9 +- litebox_shim_linux/src/loader/mod.rs | 7 +- litebox_shim_linux/src/syscalls/file.rs | 35 + litebox_shim_linux/src/syscalls/mm.rs | 14 +- litebox_shim_linux/src/syscalls/process.rs | 335 +++++++++- 27 files changed, 1694 insertions(+), 67 deletions(-) create mode 100644 litebox/src/platform/address_space.rs create mode 100644 litebox/src/process/mod.rs create mode 100644 litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/echo_hello.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/exit_with.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index af267b5b4..addafe215 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -36,6 +36,29 @@ impl Descriptors { Self { entries: vec![] } } + /// Clone the entire descriptor table for fork. 
+ /// + /// Each entry in the new table shares the same underlying `DescriptorEntry` + /// (via `Arc::clone`), matching the semantics of both POSIX fork (shared + /// file descriptions) and NT handle inheritance. Per-FD metadata is **not** + /// cloned; each slot in the child starts with a fresh `AnyMap`. + /// + /// Calls `on_dup()` on each entry to notify subsystems of the new reference. + pub(crate) fn clone_table(&self) -> Self { + let entries = self + .entries + .iter() + .map(|slot| { + slot.as_ref().map(|ind| { + let cloned = IndividualEntry::new(Arc::clone(&ind.x)); + cloned.x.read().entry.on_dup(); + cloned + }) + }) + .collect(); + Self { entries } + } + /// Insert `entry` into the descriptor table, returning an `OwnedFd` to this entry. #[expect( clippy::missing_panics_doc, @@ -95,6 +118,7 @@ impl Descriptors { let new_ind_entry = IndividualEntry::new(Arc::clone( &self.entries[fd.x.as_usize()?].as_ref().unwrap().x, )); + new_ind_entry.x.read().entry.on_dup(); let old = self.entries[idx].replace(new_ind_entry); assert!(old.is_none()); Some(TypedFd { @@ -116,6 +140,7 @@ impl Descriptors { let Some(old) = self.entries[fd.x.as_usize()?].take() else { unreachable!(); }; + old.x.read().entry.on_close(); fd.x.mark_as_closed(); Arc::into_inner(old.x) .map(RwLock::into_inner) @@ -143,6 +168,7 @@ impl Descriptors { if Arc::strong_count(&old.x) == 1 { // Unique, so we can just return it if allowed. if can_close_immediately(old.x.read().as_subsystem::()) { + old.x.read().entry.on_close(); fd.x.mark_as_closed(); let entry = Arc::into_inner(old.x) .map(RwLock::into_inner) @@ -156,6 +182,7 @@ impl Descriptors { Some(CloseResult::Deferred) } } else { + old.x.read().entry.on_close(); fd.x.mark_as_closed(); // Shared, so we need to duplicate it. let old = self.entries[idx].replace(old); @@ -676,6 +703,28 @@ impl RawDescriptorStorage { self.stored_fds.get(fd).is_some_and(Option::is_some) } + /// Clone the entire raw descriptor storage for fork. 
+ /// + /// Each slot in the new storage shares the same underlying `OwnedFd` + /// (via `Arc::clone`), matching POSIX fork semantics where the child + /// inherits copies of the parent's file descriptor table that refer to + /// the same open file descriptions. + #[must_use] + pub fn clone_for_fork(&self) -> Self { + Self { + stored_fds: self + .stored_fds + .iter() + .map(|slot| { + slot.as_ref().map(|stored| StoredFd { + x: Arc::clone(&stored.x), + subsystem_entry_type_id: stored.subsystem_entry_type_id, + }) + }) + .collect(), + } + } + /// Returns an iterator over raw integer indices that are currently alive (i.e., occupied). pub fn iter_alive(&self) -> impl Iterator + '_ { self.stored_fds @@ -762,7 +811,29 @@ pub trait FdEnabledSubsystem: Sized { } /// A per-FD entry stored in the descriptor table for a specific [`FdEnabledSubsystem`] -pub trait FdEnabledSubsystemEntry: Send + Sync + core::any::Any {} +/// +/// # Hook contract +/// +/// `on_dup` and `on_close` are called while a read lock is held on the +/// containing `DescriptorEntry`. Implementations must use interior mutability +/// (e.g., atomics) and must **not** attempt to acquire a write lock on the +/// same entry, or deadlock will result. +/// +/// The initial `insert()` does **not** call `on_dup()`; subsystems should +/// initialize any reference count to 1 in their constructor. +pub trait FdEnabledSubsystemEntry: Send + Sync + core::any::Any { + /// Called when a new reference to this entry is created (dup, fork). + /// + /// Subsystems that track reference counts (e.g., pipe write-ends for + /// EOF detection) should increment their count here. + fn on_dup(&self) {} + + /// Called when a reference to this entry is dropped (close). + /// + /// This is called for every close, even when other references remain. + /// Subsystems should decrement their reference count here. 
+ fn on_close(&self) {} +} /// Possible errors from [`RawDescriptorStorage::fd_from_raw_integer`] and /// [`RawDescriptorStorage::fd_consume_raw_integer`]. diff --git a/litebox/src/lib.rs b/litebox/src/lib.rs index f3d80997a..144a00bbb 100644 --- a/litebox/src/lib.rs +++ b/litebox/src/lib.rs @@ -24,6 +24,7 @@ pub mod net; pub mod path; pub mod pipes; pub mod platform; +pub mod process; pub mod shim; pub mod sync; pub mod tls; diff --git a/litebox/src/litebox.rs b/litebox/src/litebox.rs index 2fb209c22..0de125ba7 100644 --- a/litebox/src/litebox.rs +++ b/litebox/src/litebox.rs @@ -7,6 +7,7 @@ use alloc::sync::Arc; use crate::{ fd::Descriptors, + process::ProcessRegistry, sync::{RawSyncPrimitivesProvider, RwLock}, }; @@ -65,6 +66,7 @@ impl LiteBox { crate::sync::lock_tracing::LockTracker::init(platform); let descriptors = RwLock::new(Descriptors::new_from_litebox_creation()); + let process_registry = ProcessRegistry::new(); litebox_util_log::trace!("LiteBox instance initialized"); @@ -72,6 +74,7 @@ impl LiteBox { x: Arc::new(LiteBoxX { platform, descriptors, + process_registry, }), } } @@ -106,10 +109,16 @@ impl LiteBox { ) -> impl core::ops::DerefMut> + use<'_, Platform> { self.x.descriptors.write() } + + /// Access the process registry. + pub fn process_registry(&self) -> &ProcessRegistry { + &self.x.process_registry + } } /// The actual body of [`LiteBox`], containing any components that might be shared. pub(crate) struct LiteBoxX { pub(crate) platform: &'static Platform, descriptors: RwLock>, + process_registry: ProcessRegistry, } diff --git a/litebox/src/mm/linux.rs b/litebox/src/mm/linux.rs index f33094971..72fc72452 100644 --- a/litebox/src/mm/linux.rs +++ b/litebox/src/mm/linux.rs @@ -304,30 +304,53 @@ pub(super) struct Vmem + 'static, const pub(super) brk: usize, /// Virtual memory areas. vmas: RangeMap, + /// Minimum valid address for this address space. + pub(super) addr_min: usize, + /// Maximum valid address (exclusive) for this address space. 
+ pub(super) addr_max: usize, } impl + 'static, const ALIGN: usize> Vmem { pub(super) const STACK_GUARD_GAP: usize = 256 << 12; - /// Create a new [`Vmem`] instance with the given memory [backend](PageManagementProvider). + /// Create a new [`Vmem`] instance using the platform's default address range. pub(super) fn new(platform: &'static Platform) -> Self { + Self::new_with_range(platform, Platform::TASK_ADDR_MIN..Platform::TASK_ADDR_MAX) + } + + /// Create a new [`Vmem`] instance scoped to the given VA range. + /// + /// Used for multi-process support where each process gets a VA partition. + pub(super) fn new_with_range( + platform: &'static Platform, + range: core::ops::Range, + ) -> Self { + assert!( + range.start.is_multiple_of(ALIGN) && range.end.is_multiple_of(ALIGN), + "Vmem: address range must be aligned to {ALIGN} bytes" + ); let mut vmem = Self { vmas: RangeMap::new(), brk: 0, platform, + addr_min: range.start, + addr_max: range.end, }; for each in platform.reserved_pages() { - assert!( - each.start % ALIGN == 0 && each.end % ALIGN == 0, - "Vmem: reserved range is not aligned to {ALIGN} bytes" - ); - vmem.vmas.insert( - each.start..each.end, - VmArea { - flags: VmFlags::empty(), - is_file_backed: false, - }, - ); + // Only insert reserved pages that fall within our range + if each.start >= range.start && each.end <= range.end { + assert!( + each.start % ALIGN == 0 && each.end % ALIGN == 0, + "Vmem: reserved range is not aligned to {ALIGN} bytes" + ); + vmem.vmas.insert( + each.start..each.end, + VmArea { + flags: VmFlags::empty(), + is_file_backed: false, + }, + ); + } } vmem } @@ -453,10 +476,10 @@ impl + 'static, const ALIGN: usize> Vmem fixed_address_behavior: FixedAddressBehavior, ) -> Result, AllocationError> { let (start, end) = (suggested_range.start, suggested_range.end); - if start < Platform::TASK_ADDR_MIN { + if start < self.addr_min { return Err(AllocationError::BelowMinAddress); } - if end > Platform::TASK_ADDR_MAX { + if end > self.addr_max 
{ return Err(AllocationError::AboveMaxAddress); } let platform_fixed_address_behavior = match fixed_address_behavior { @@ -518,8 +541,8 @@ impl + 'static, const ALIGN: usize> Vmem let new_start = ret.as_usize(); let new_end = new_start + suggested_range.len(); self.vmas.insert(new_start..new_end, vma); - debug_assert!(new_start >= Platform::TASK_ADDR_MIN); - debug_assert!(new_end <= Platform::TASK_ADDR_MAX); + debug_assert!(new_start >= self.addr_min); + debug_assert!(new_end <= self.addr_max); Ok(ret) } @@ -890,11 +913,11 @@ impl + 'static, const ALIGN: usize> Vmem fixed_addr: bool, ) -> Option { let size = length.as_usize(); - if size > Platform::TASK_ADDR_MAX { + if size > self.addr_max.saturating_sub(self.addr_min) { return None; } if let Some(suggested_address) = suggested_address { - if (Platform::TASK_ADDR_MAX - size) < suggested_address.0 { + if (self.addr_max - size) < suggested_address.0 { return None; } if fixed_addr @@ -912,12 +935,9 @@ impl + 'static, const ALIGN: usize> Vmem // top down // 1. check [last_end, TASK_SIZE_MAX) - let (low_limit, high_limit) = ( - Platform::TASK_ADDR_MIN, - Platform::TASK_ADDR_MAX - length.as_usize(), - ); - debug_assert!(Platform::TASK_ADDR_MIN % ALIGN == 0); - debug_assert!(Platform::TASK_ADDR_MAX % ALIGN == 0); + let (low_limit, high_limit) = (self.addr_min, self.addr_max - length.as_usize()); + debug_assert!(self.addr_min.is_multiple_of(ALIGN)); + debug_assert!(self.addr_max.is_multiple_of(ALIGN)); let last_end = self.vmas.last_range_value().map_or(low_limit, |r| r.0.end); if last_end <= high_limit { return Some(high_limit); diff --git a/litebox/src/mm/mod.rs b/litebox/src/mm/mod.rs index a46b3c855..a19f1abdd 100644 --- a/litebox/src/mm/mod.rs +++ b/litebox/src/mm/mod.rs @@ -46,6 +46,19 @@ where Self { vmem } } + /// Create a new `PageManager` scoped to a specific VA range. + /// + /// Used for multi-process support where each process gets a VA partition. 
+ pub fn new_with_range(litebox: &LiteBox, range: core::ops::Range) -> Self { + let vmem = RwLock::new(linux::Vmem::new_with_range(litebox.x.platform, range)); + Self { vmem } + } + + /// Returns the minimum address of this process's virtual address range. + pub fn addr_min(&self) -> usize { + self.vmem.read().addr_min + } + /// Create a mapping with the given flags. /// /// `suggested_new_address` is the hint address for where to create the pages if it is not `None`. @@ -672,15 +685,21 @@ where error_code: u64, ) -> Result<(), PageFaultError> { let fault_addr = fault_addr & !(ALIGN - 1); - if !(Platform::TASK_ADDR_MIN..Platform::TASK_ADDR_MAX).contains(&fault_addr) { - return Err(PageFaultError::AccessError("Invalid address")); + // Read address bounds from vmem to avoid using Platform constants directly + { + let vmem = self.vmem.read(); + if !(vmem.addr_min..vmem.addr_max).contains(&fault_addr) { + return Err(PageFaultError::AccessError("Invalid address")); + } } let mut vmem = self.vmem.write(); + let addr_min = vmem.addr_min; + let addr_max = vmem.addr_max; // Find the range closest to the fault address let (start, vma) = { let (r, vma) = vmem - .overlapping(fault_addr..Platform::TASK_ADDR_MAX) + .overlapping(fault_addr..addr_max) .next() .ok_or(PageFaultError::AccessError("no mapping"))?; (r.start, *vma) @@ -692,7 +711,7 @@ where } if !vmem - .overlapping(Platform::TASK_ADDR_MIN..fault_addr) + .overlapping(addr_min..fault_addr) .next_back() .is_none_or(|(prev_range, prev_vma)| { // Enforce gap between stack and other preceding non-stack mappings. diff --git a/litebox/src/platform/address_space.rs b/litebox/src/platform/address_space.rs new file mode 100644 index 000000000..2c0e9d429 --- /dev/null +++ b/litebox/src/platform/address_space.rs @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Address space management for multi-process support. +//! +//! 
Platforms implement [`AddressSpaceProvider`] to manage isolated or shared +//! memory regions for guest processes. + +use core::fmt::Debug; +use core::hash::Hash; +use core::ops::Range; +use thiserror::Error; + +/// Platform-wide property: are address spaces isolated or shared? +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AddressSpaceKind { + /// Each address space has independent memory (e.g., kernel page tables, + /// separate host processes). The platform handles memory isolation; + /// the shim does not need to manage CoW. + Isolated, + /// Address spaces share the same host memory (e.g., VA partitions in a + /// single userland process). The shim is responsible for copy-on-write + /// or other memory separation. + SharedMemory, +} + +/// Errors from address space operations. +#[derive(Error, Debug)] +pub enum AddressSpaceError { + #[error("no space available for a new address space")] + NoSpace, + #[error("the address space ID is invalid")] + InvalidId, + #[error("address space operations are not supported by this platform")] + NotSupported, +} + +/// Address space management for multi-process support. +/// +/// Platforms implement this trait to create, destroy, and switch between +/// address spaces. Each address space represents an isolated (or partitioned) +/// memory region for a guest process. +pub trait AddressSpaceProvider { + /// An opaque identifier for an address space. + type AddressSpaceId: Copy + Eq + Send + Sync + Hash + Debug + 'static; + + /// Platform-wide: are address spaces isolated or shared? + const ADDRESS_SPACE_KIND: AddressSpaceKind; + + /// Create a new address space. + fn create_address_space(&self) -> Result { + Err(AddressSpaceError::NotSupported) + } + + /// Destroy an address space, releasing all resources. + fn destroy_address_space(&self, _id: Self::AddressSpaceId) -> Result<(), AddressSpaceError> { + Err(AddressSpaceError::NotSupported) + } + + /// Make `id` the active address space for the current thread. 
+ /// + /// Activation is thread-local: each thread independently tracks its + /// active address space. On kernel platforms this switches page tables. + /// On userland platforms this may be a no-op. + fn activate_address_space(&self, _id: Self::AddressSpaceId) -> Result<(), AddressSpaceError> { + Err(AddressSpaceError::NotSupported) + } + + /// Execute `f` with the given address space active, then restore the + /// previously active address space. + fn with_address_space( + &self, + _id: Self::AddressSpaceId, + f: impl FnOnce() -> R, + ) -> Result { + let _ = f; + Err(AddressSpaceError::NotSupported) + } + + /// Return the VA range available to the given address space. + /// + /// Primarily meaningful for [`AddressSpaceKind::SharedMemory`] platforms + /// (e.g., userland VA partitions) where the shim needs to scope memory + /// operations (mmap, brk, etc.) to the correct region. Platforms with + /// hardware-isolated address spaces typically return `NotSupported`. + fn address_space_range( + &self, + _id: Self::AddressSpaceId, + ) -> Result, AddressSpaceError> { + Err(AddressSpaceError::NotSupported) + } +} diff --git a/litebox/src/platform/mock.rs b/litebox/src/platform/mock.rs index 4bcb936eb..bfac9a849 100644 --- a/litebox/src/platform/mock.rs +++ b/litebox/src/platform/mock.rs @@ -340,3 +340,9 @@ unsafe impl ThreadLocalStorageProvider for MockPlatform { MOCK_TLS.replace(value) } } + +impl AddressSpaceProvider for MockPlatform { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: address_space::AddressSpaceKind = + address_space::AddressSpaceKind::SharedMemory; +} diff --git a/litebox/src/platform/mod.rs b/litebox/src/platform/mod.rs index 2a0b6a9df..f2aea2756 100644 --- a/litebox/src/platform/mod.rs +++ b/litebox/src/platform/mod.rs @@ -7,6 +7,7 @@ //! trait is merely a collection of subtraits that could be composed independently from various //! other crates that implement them upon various types. 
+pub mod address_space; pub mod common_providers; pub mod page_mgmt; pub mod trivial_providers; @@ -18,6 +19,7 @@ use either::Either; use thiserror::Error; use zerocopy::{FromBytes, IntoBytes}; +pub use address_space::AddressSpaceProvider; pub use page_mgmt::PageManagementProvider; /// A provider of a platform upon which LiteBox can execute. @@ -26,7 +28,12 @@ pub use page_mgmt::PageManagementProvider; /// provided by it. _However_, most of the provided APIs within the provider act upon an `&self` to /// allow storage of any useful "globals" within it necessary. pub trait Provider: - RawMutexProvider + IPInterfaceProvider + TimeProvider + PunchthroughProvider + RawPointerProvider + RawMutexProvider + + IPInterfaceProvider + + TimeProvider + + PunchthroughProvider + + RawPointerProvider + + AddressSpaceProvider { } diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs new file mode 100644 index 000000000..eb9ed4fc6 --- /dev/null +++ b/litebox/src/process/mod.rs @@ -0,0 +1,598 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Process identity and lifecycle management. +//! +//! This module provides a platform-agnostic process registry for tracking +//! parent-child relationships, exit status, and process lifecycle. OS-specific +//! semantics (POSIX process groups/sessions, NT job objects, etc.) belong in +//! the shim layer, not here. + +use alloc::sync::Arc; +use alloc::vec::Vec; +use core::sync::atomic::{AtomicBool, Ordering}; +use hashbrown::HashMap; +use thiserror::Error; + +use crate::event::{Events, observer::Observer}; +use crate::platform::RawMutex as RawMutexTrait; +use crate::sync::{Mutex, RawSyncPrimitivesProvider}; + +/// Process identifier. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ProcessId(u32); + +impl ProcessId { + /// The first process created in every LiteBox instance. + pub const INIT: Self = Self(1); + + /// Create a new `ProcessId` from a raw value. 
+ /// Returns `None` if `raw` is 0 (invalid). + pub fn new(raw: u32) -> Option { + if raw == 0 { None } else { Some(Self(raw)) } + } + + /// Get the raw `u32` value of this process ID. + pub fn as_u32(self) -> u32 { + self.0 + } +} + +/// Per-process state tracked by the core. +pub struct ProcessContext { + pub id: ProcessId, + /// Parent process. `None` only for the init process. + pub parent: Option, + pub state: ProcessState, + /// Child processes. + children: Vec, +} + +/// Whether a process is running or has exited. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProcessState { + Running, + /// The process has exited. The `u32` is opaque to the core; + /// shims assign platform-specific meaning (POSIX: wait status encoding; + /// NT: NTSTATUS / DWORD exit code, etc.). + Exited(u32), +} + +/// Returned by [`ProcessRegistry::exit_process`] so the shim can notify the +/// parent through whatever mechanism is appropriate (SIGCHLD, handle +/// signaling, etc.). +pub struct ExitNotification { + pub parent_pid: ProcessId, + pub child_pid: ProcessId, + pub exit_status: u32, +} + +/// Errors from [`ProcessRegistry::create_process`]. +#[derive(Error, Debug)] +pub enum CreateProcessError { + #[error("the specified parent PID does not exist in the registry")] + NoSuchParent, + #[error("a root (init) process already exists")] + InitAlreadyExists, +} + +/// Shared handle for observing a process's exit. +/// +/// `exited` becomes `true` when the process exits. `observer` is notified +/// with readiness events so shims can integrate with their event loop. +pub struct ProcessExitObserver { + /// Whether the process has exited. + pub exited: Arc, + /// Observer that receives [`Events::IN`] when the process exits. + pub observer: Arc>, +} + +/// An exit notification subject that observers can register on. 
+pub struct ExitSubject { + observers: Mutex>>>, + notified: AtomicBool, +} + +impl ExitSubject { + fn new() -> Self { + Self { + observers: Mutex::new(Vec::new()), + notified: AtomicBool::new(false), + } + } + + /// Register an observer to be notified when the process exits. + pub fn register_observer(&self, observer: alloc::sync::Weak>) { + let fire_immediately; + { + let mut observers = self.observers.lock(); + fire_immediately = self.notified.load(Ordering::Acquire); + observers.push(observer.clone()); + } + // Fire outside the lock to avoid deadlock if on_events re-enters. + if fire_immediately && let Some(obs) = observer.upgrade() { + obs.on_events(&Events::IN); + } + } + + /// Notify all registered observers. Collects them under the lock, then + /// fires outside the lock to prevent deadlocks. + fn notify(&self) { + let snapshot: Vec<_>; + { + let observers = self.observers.lock(); + // Set notified under the lock so register_observer sees a + // consistent state: once notified is true the observer list + // is finalized for this notification round. + self.notified.store(true, Ordering::Release); + snapshot = observers + .iter() + .filter_map(alloc::sync::Weak::upgrade) + .collect(); + } + for obs in snapshot { + obs.on_events(&Events::IN); + } + } +} + +/// Internal per-process entry in the registry. +struct ProcessEntry { + context: ProcessContext, + exit_observer: Arc>, + exited_flag: Arc, +} + +/// A registry of processes with parent-child lifecycle management. +/// +/// `ProcessRegistry` is parameterized on a platform type for mutex support. +pub struct ProcessRegistry { + inner: Mutex>, + /// Futex-like primitive: value is `exit_epoch`. Woken on every child exit + /// so that blocking `wait_for_any_child_exit` can unblock. + exit_event: ::RawMutex, +} + +struct RegistryInner { + processes: HashMap>, + next_pid: u32, + /// Counter incremented on every process exit. 
Used with `exit_event` futex + /// so that `wait_for_any_child_exit` can block efficiently. + exit_epoch: u32, +} + +#[allow( + clippy::missing_panics_doc, + clippy::must_use_candidate, + clippy::new_without_default, + clippy::result_unit_err +)] +impl ProcessRegistry { + /// Create a new, empty process registry. + pub fn new() -> Self { + Self { + inner: Mutex::new(RegistryInner { + processes: HashMap::new(), + next_pid: 1, + exit_epoch: 0, + }), + exit_event: ::RawMutex::INIT, + } + } + + /// Allocate a PID and register a new process. + /// + /// `parent` is the parent process ID. Pass `None` to create the init + /// process (PID 1). Only one init process is allowed. + pub fn create_process( + &self, + parent: Option, + ) -> Result { + let mut inner = self.inner.lock(); + let pid = match parent { + None => { + // Creating init process + let pid = ProcessId::INIT; + if inner.processes.contains_key(&pid) { + return Err(CreateProcessError::InitAlreadyExists); + } + // Ensure next_pid is past init + if inner.next_pid <= pid.as_u32() { + inner.next_pid = pid.as_u32() + 1; + } + pid + } + Some(parent_pid) => { + if !inner.processes.contains_key(&parent_pid) { + return Err(CreateProcessError::NoSuchParent); + } + let raw = inner.next_pid; + inner.next_pid = raw.checked_add(1).expect("PID space exhausted"); + let pid = ProcessId(raw); + // Register as child of parent + inner + .processes + .get_mut(&parent_pid) + .unwrap() + .context + .children + .push(pid); + pid + } + }; + + let exited_flag = Arc::new(AtomicBool::new(false)); + let exit_observer = Arc::new(ExitSubject::new()); + + inner.processes.insert( + pid, + ProcessEntry { + context: ProcessContext { + id: pid, + parent, + state: ProcessState::Running, + children: Vec::new(), + }, + exit_observer, + exited_flag, + }, + ); + + Ok(pid) + } + + /// Remove a process that was created but never started (e.g., child setup + /// failed after PID allocation). 
+ /// + /// # Panics + /// + /// Panics if the process has children, is not in `Running` state, or does + /// not exist. + pub fn abort_process(&self, id: ProcessId) { + let mut inner = self.inner.lock(); + let entry = inner + .processes + .remove(&id) + .expect("abort_process: no such process"); + assert!( + matches!(entry.context.state, ProcessState::Running), + "abort_process: process not running" + ); + assert!( + entry.context.children.is_empty(), + "abort_process: process has children" + ); + // Remove from parent's children list + if let Some(parent_pid) = entry.context.parent + && let Some(parent) = inner.processes.get_mut(&parent_pid) + { + parent.context.children.retain(|&c| c != id); + } + } + + /// Record that a process has exited with the given status. + /// + /// For each orphaned child, calls `orphan_handler` so the shim can decide + /// the reparenting policy (e.g., POSIX reparents to init). + /// + /// Returns `Some(ExitNotification)` if the parent is still alive, + /// `None` otherwise. 
+ pub fn exit_process( + &self, + id: ProcessId, + status: u32, + mut orphan_handler: impl FnMut(ProcessId), + ) -> Option { + let (children, exit_observer, notification); + { + let mut inner = self.inner.lock(); + let entry = inner + .processes + .get_mut(&id) + .expect("exit_process: no such process"); + assert!( + matches!(entry.context.state, ProcessState::Running), + "exit_process: process already exited" + ); + entry.context.state = ProcessState::Exited(status); + entry.exited_flag.store(true, Ordering::Release); + exit_observer = Arc::clone(&entry.exit_observer); + children = entry.context.children.clone(); + + // Check if parent is alive + let parent_pid = entry.context.parent; + notification = parent_pid.and_then(|ppid| { + let parent = inner.processes.get(&ppid)?; + if matches!(parent.context.state, ProcessState::Running) { + Some(ExitNotification { + parent_pid: ppid, + child_pid: id, + exit_status: status, + }) + } else { + None + } + }); + + // Bump the exit epoch so blocking waiters unblock. + inner.exit_epoch = inner.exit_epoch.wrapping_add(1); + } + // Wake any threads blocked in wait_for_any_child_exit. + self.exit_event + .underlying_atomic() + .fetch_add(1, Ordering::Release); + self.exit_event.wake_all(); + // All callbacks run outside the lock to prevent deadlocks. + exit_observer.notify(); + for child_pid in children { + orphan_handler(child_pid); + } + notification + } + + /// Read process context through a closure. + /// Returns `None` if the process does not exist. + pub fn with_context( + &self, + id: ProcessId, + f: impl FnOnce(&ProcessContext) -> R, + ) -> Option { + let inner = self.inner.lock(); + inner.processes.get(&id).map(|e| f(&e.context)) + } + + /// Returns `true` if the process exists and is in `Running` state. 
+ pub fn is_alive(&self, id: ProcessId) -> bool { + let inner = self.inner.lock(); + inner + .processes + .get(&id) + .is_some_and(|e| matches!(e.context.state, ProcessState::Running)) + } + + /// Get the parent PID of a process. + pub fn get_parent(&self, id: ProcessId) -> Option { + let inner = self.inner.lock(); + inner.processes.get(&id).and_then(|e| e.context.parent) + } + + /// Get the child PIDs of a process. + pub fn get_children(&self, id: ProcessId) -> Option> { + let inner = self.inner.lock(); + inner.processes.get(&id).map(|e| e.context.children.clone()) + } + + /// Total number of processes in the registry (running + exited). + pub fn process_count(&self) -> usize { + let inner = self.inner.lock(); + inner.processes.len() + } + + /// Remove an exited process from the table. + /// + /// # Panics + /// + /// Panics if the process is still running or does not exist. + pub fn remove_process(&self, id: ProcessId) { + let mut inner = self.inner.lock(); + let entry = inner + .processes + .remove(&id) + .expect("remove_process: no such process"); + assert!( + matches!(entry.context.state, ProcessState::Exited(_)), + "remove_process: process still running" + ); + // Remove from parent's children list + if let Some(parent_pid) = entry.context.parent + && let Some(parent) = inner.processes.get_mut(&parent_pid) + { + parent.context.children.retain(|&c| c != id); + } + } + + /// Non-blocking check for an exited child of `parent`. + /// + /// `target` selects which children to consider: + /// - `> 0`: only the child with that specific PID + /// - `-1`: any child + /// - `0` / other negative: not yet supported (returns `Err(())`) + /// + /// If a matching exited child is found, it is reaped (removed from the + /// registry) and `Ok(Some((child_pid, exit_status)))` is returned. + /// Returns `Ok(None)` if no matching exited child exists. + /// Returns `Err(())` if `parent` is unknown or has no children matching `target` + /// (i.e., ECHILD condition). 
+ pub fn try_wait(&self, parent: ProcessId, target: i32) -> Result, ()> { + let mut inner = self.inner.lock(); + let parent_entry = inner.processes.get(&parent).ok_or(())?; + let children = parent_entry.context.children.clone(); + + if children.is_empty() { + return Err(()); // ECHILD + } + + // Find a matching exited child + let found = match target { + t if t > 0 => { + let target_pid = ProcessId(t.cast_unsigned()); + // Verify it's actually a child of parent + if !children.contains(&target_pid) { + return Err(()); // ECHILD — not our child + } + let entry = inner.processes.get(&target_pid); + match entry { + Some(e) if matches!(e.context.state, ProcessState::Exited(_)) => { + if let ProcessState::Exited(status) = e.context.state { + Some((target_pid, status)) + } else { + None + } + } + _ => None, + } + } + -1 => { + // Any child + let mut result = None; + for &child_pid in &children { + if let Some(entry) = inner.processes.get(&child_pid) + && let ProcessState::Exited(status) = entry.context.state + { + result = Some((child_pid, status)); + break; + } + } + result + } + _ => return Err(()), // process groups not supported + }; + + // Reap the child if found + if let Some((child_pid, _)) = found { + // Remove child from registry + let _entry = inner.processes.remove(&child_pid); + // Remove from parent's children list + if let Some(parent) = inner.processes.get_mut(&parent) { + parent.context.children.retain(|&c| c != child_pid); + } + } + + Ok(found) + } + + /// Block until the exit counter advances past the value sampled on entry, + /// i.e., until some process exits after this call. Used by blocking wait4. + pub fn wait_for_any_child_exit(&self) { + let epoch = self.exit_event.underlying_atomic().load(Ordering::Acquire); + // Block until the epoch changes (i.e., a new exit has been recorded). + let _ = self.exit_event.block(epoch); + } + + /// Get exit observers for all children of `parent` matching `target`. + /// Used by blocking wait to know when to re-check. 
+    pub fn child_exit_observers(
+        &self,
+        parent: ProcessId,
+        target: i32,
+    ) -> Vec> {
+        let inner = self.inner.lock();
+        let Some(parent_entry) = inner.processes.get(&parent) else {
+            return Vec::new();
+        };
+        let children = &parent_entry.context.children;
+        let pids: Vec = match target {
+            t if t > 0 => {
+                let pid = ProcessId(t.cast_unsigned());
+                if children.contains(&pid) {
+                    alloc::vec![pid]
+                } else {
+                    Vec::new()
+                }
+            }
+            -1 => children.clone(),
+            _ => Vec::new(),
+        };
+        pids.iter()
+            .filter_map(|pid| {
+                let entry = inner.processes.get(pid)?;
+                Some(ProcessExitObserver {
+                    exited: Arc::clone(&entry.exited_flag),
+                    observer: Arc::clone(&entry.exit_observer),
+                })
+            })
+            .collect()
+    }
+
+    /// Obtain a shared exit-observation handle for the given process.
+    pub fn exit_observer(&self, id: ProcessId) -> Option> {
+        let inner = self.inner.lock();
+        let entry = inner.processes.get(&id)?;
+        Some(ProcessExitObserver {
+            exited: Arc::clone(&entry.exited_flag),
+            observer: Arc::clone(&entry.exit_observer),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::platform::mock::MockPlatform;
+
+    type Registry = ProcessRegistry;
+
+    fn new_registry() -> Registry {
+        Registry::new()
+    }
+
+    #[test]
+    fn test_create_init_process() {
+        let registry = new_registry();
+        let pid = registry.create_process(None).unwrap();
+        assert_eq!(pid, ProcessId::INIT);
+        assert!(registry.is_alive(pid));
+        assert_eq!(registry.get_parent(pid), None);
+    }
+
+    #[test]
+    fn test_create_child_process() {
+        let registry = new_registry();
+        let init = registry.create_process(None).unwrap();
+        let child = registry.create_process(Some(init)).unwrap();
+        assert_ne!(init, child);
+        assert!(registry.is_alive(child));
+        assert_eq!(registry.get_parent(child), Some(init));
+        assert_eq!(registry.get_children(init), Some(alloc::vec![child]));
+    }
+
+    #[test]
+    fn test_exit_process() {
+        let registry = new_registry();
+        let init = registry.create_process(None).unwrap();
+        let child = registry.create_process(Some(init)).unwrap();
+
+        let notif = registry.exit_process(child, 42, |_| {});
+        assert!(notif.is_some());
+        let notif = notif.unwrap();
+        assert_eq!(notif.parent_pid, init);
+        assert_eq!(notif.child_pid, child);
+        assert_eq!(notif.exit_status, 42);
+
+        assert!(!registry.is_alive(child));
+        registry.remove_process(child);
+        assert_eq!(registry.get_children(init), Some(alloc::vec![]));
+    }
+
+    #[test]
+    fn test_abort_process() {
+        let registry = new_registry();
+        let init = registry.create_process(None).unwrap();
+        let child = registry.create_process(Some(init)).unwrap();
+        registry.abort_process(child);
+        assert_eq!(registry.get_children(init), Some(alloc::vec![]));
+    }
+
+    #[test]
+    fn test_duplicate_init_rejected() {
+        let registry = new_registry();
+        registry.create_process(None).unwrap();
+        assert!(matches!(
+            registry.create_process(None),
+            Err(CreateProcessError::InitAlreadyExists)
+        ));
+    }
+
+    #[test]
+    fn test_exit_observer() {
+        let registry = new_registry();
+        let init = registry.create_process(None).unwrap();
+        let child = registry.create_process(Some(init)).unwrap();
+
+        let observer = registry.exit_observer(child).unwrap();
+        assert!(!observer.exited.load(Ordering::Acquire));
+
+        registry.exit_process(child, 0, |_| {});
+        assert!(observer.exited.load(Ordering::Acquire));
+    }
+}
diff --git a/litebox/src/shim.rs b/litebox/src/shim.rs
index 82a800339..752644d68 100644
--- a/litebox/src/shim.rs
+++ b/litebox/src/shim.rs
@@ -28,6 +28,11 @@ pub trait EnterShim {
     /// FUTURE: use a single per-architecture type for all shims and platforms.
     type ExecutionContext;

+    /// The process ID for this thread's process, if multi-process is supported.
+    fn process_id(&self) -> Option {
+        None
+    }
+
     /// Initialize a new thread. Must be called by the platform exactly once
     /// before running the thread in the guest for the first time.
/// diff --git a/litebox_common_linux/src/lib.rs b/litebox_common_linux/src/lib.rs index 578bd51a9..64e17c02c 100644 --- a/litebox_common_linux/src/lib.rs +++ b/litebox_common_linux/src/lib.rs @@ -2146,6 +2146,12 @@ pub enum SyscallRequest { new_value: Platform::RawConstPointer, old_value: Option>, }, + Wait4 { + pid: i32, + wstatus: Option>, + options: i32, + // rusage is ignored for now + }, } impl SyscallRequest { @@ -2544,6 +2550,24 @@ impl SyscallRequest { }, Sysno::eventfd2 => sys_req!(Eventfd2 { initval, flags }), Sysno::getrandom => sys_req!(GetRandom { buf:*,count,flags }), + Sysno::vfork => { + // vfork is equivalent to clone(CLONE_VM | CLONE_VFORK | SIGCHLD) + // with no new stack (child runs on parent's stack). + let args = CloneArgs { + flags: CloneFlags::VM | CloneFlags::VFORK, + stack: 0, + parent_tid: 0, + child_tid: 0, + tls: 0, + pidfd: 0, + exit_signal: 17, // SIGCHLD + stack_size: 0, + set_tid: 0, + set_tid_size: 0, + cgroup: 0, + }; + SyscallRequest::Clone { args } + } Sysno::clone => { let args = CloneArgs { // The upper 32 bits are clone3-specific. The low 8 bits are the exit signal. @@ -2603,6 +2627,21 @@ impl SyscallRequest { Sysno::umask => sys_req!(Umask { mask }), Sysno::alarm => sys_req!(Alarm { seconds }), Sysno::setitimer => sys_req!(SetITimer { which:?, new_value:*, old_value:* }), + Sysno::wait4 => { + let pid: i32 = ctx.sys_req_arg(0); + let wstatus: Platform::RawMutPointer = ctx.sys_req_ptr(1); + let options: i32 = ctx.sys_req_arg(2); + // arg3 is rusage, ignored for now + SyscallRequest::Wait4 { + pid, + wstatus: if wstatus.as_usize() == 0 { + None + } else { + Some(wstatus) + }, + options, + } + } // Noisy unsupported syscalls. 
Sysno::statx | Sysno::io_uring_setup | Sysno::rseq | Sysno::statfs => { return Err(errno::Errno::ENOSYS); diff --git a/litebox_platform_linux_kernel/src/lib.rs b/litebox_platform_linux_kernel/src/lib.rs index cc207d3da..020a73bf7 100644 --- a/litebox_platform_linux_kernel/src/lib.rs +++ b/litebox_platform_linux_kernel/src/lib.rs @@ -13,8 +13,9 @@ use litebox::mm::linux::PageRange; use litebox::platform::RawPointerProvider; use litebox::platform::page_mgmt::FixedAddressBehavior; use litebox::platform::{ - IPInterfaceProvider, ImmediatelyWokenUp, PageManagementProvider, Provider, Punchthrough, - PunchthroughProvider, PunchthroughToken, RawMutexProvider, TimeProvider, UnblockedOrTimedOut, + AddressSpaceProvider, IPInterfaceProvider, ImmediatelyWokenUp, PageManagementProvider, + Provider, Punchthrough, PunchthroughProvider, PunchthroughToken, RawMutexProvider, + TimeProvider, UnblockedOrTimedOut, }; use litebox_common_linux::PunchthroughSyscall; use litebox_common_linux::errno::Errno; @@ -84,6 +85,12 @@ impl<'a, Host: HostInterface> PunchthroughToken for LinuxPunchthroughToken<'a, H impl Provider for LinuxKernel {} +impl AddressSpaceProvider for LinuxKernel { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: litebox::platform::address_space::AddressSpaceKind = + litebox::platform::address_space::AddressSpaceKind::Isolated; +} + // TODO: implement pointer validation to ensure the pointers are in user space. type UserConstPtr = litebox::platform::common_providers::userspace_pointers::UserConstPtr< litebox::platform::common_providers::userspace_pointers::NoValidation, diff --git a/litebox_platform_linux_userland/src/lib.rs b/litebox_platform_linux_userland/src/lib.rs index a1ccf7fdc..dcf2f0ee4 100644 --- a/litebox_platform_linux_userland/src/lib.rs +++ b/litebox_platform_linux_userland/src/lib.rs @@ -105,6 +105,8 @@ pub struct LinuxUserland { /// is persistent across multiple process executions, however, it is ephemeral across true /// reboots. 
boot_id: std::sync::OnceLock>, + /// VA partition allocator for multi-process support. + va_partitions: VaPartitionAllocator, } impl core::fmt::Debug for LinuxUserland { @@ -236,6 +238,7 @@ impl LinuxUserland { reserved_pages, cow_regions: std::sync::RwLock::new(std::collections::BTreeMap::new()), boot_id: std::sync::OnceLock::new(), + va_partitions: VaPartitionAllocator::new(), }; Box::leak(Box::new(platform)) } @@ -416,6 +419,32 @@ impl LinuxUserland { impl litebox::platform::Provider for LinuxUserland {} +impl litebox::platform::AddressSpaceProvider for LinuxUserland { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: litebox::platform::address_space::AddressSpaceKind = + litebox::platform::address_space::AddressSpaceKind::SharedMemory; + + fn create_address_space( + &self, + ) -> Result { + self.va_partitions.allocate() + } + + fn destroy_address_space( + &self, + id: Self::AddressSpaceId, + ) -> Result<(), litebox::platform::address_space::AddressSpaceError> { + self.va_partitions.release(id) + } + + fn address_space_range( + &self, + id: Self::AddressSpaceId, + ) -> Result, litebox::platform::address_space::AddressSpaceError> { + self.va_partitions.range(id) + } +} + impl litebox::platform::SignalProvider for LinuxUserland { type Signal = litebox_common_linux::signal::Signal; @@ -2272,6 +2301,90 @@ impl litebox::mm::linux::VmemPageFaultHandler for LinuxUserland { } } +// --------------------------------------------------------------------------- +// VA Partition Allocator +// --------------------------------------------------------------------------- + +/// Allocates 1 TiB VA partitions from the 47-bit userland address space. +/// +/// The usable VA range `0x1_0000..0x7FFF_FFFF_F000` (~128 TiB) is divided +/// into 1 TiB slots. Slot 0 covers `0x0..0x100_0000_0000` (though only +/// `0x1_0000..` is usable), slot 1 covers `0x100_0000_0000..0x200_0000_0000`, +/// and so on. A simple bitmap tracks which slots are allocated. 
+struct VaPartitionAllocator { + /// Bitmap of allocated partitions. Bit N = partition N is allocated. + /// 128 bits covers all 128 possible 1-TiB partitions in 47-bit VA. + allocated: std::sync::Mutex, +} + +impl VaPartitionAllocator { + /// Size of each VA partition: 1 TiB. + const PARTITION_SIZE: usize = 1 << 40; // 0x100_0000_0000 + /// Maximum partition index (exclusive). 128 TiB / 1 TiB = 128. + const MAX_PARTITIONS: u32 = 128; + /// First allocatable partition. Partition 0 is reserved for the init + /// process (its range is the platform default `TASK_ADDR_MIN..TASK_ADDR_MAX`). + const FIRST_ALLOC: u32 = 1; + + fn new() -> Self { + // Mark partition 0 as pre-allocated (init process). + Self { + allocated: std::sync::Mutex::new(1), + } + } + + fn allocate(&self) -> Result { + let mut bitmap = self.allocated.lock().unwrap(); + for i in Self::FIRST_ALLOC..Self::MAX_PARTITIONS { + if *bitmap & (1u128 << i) == 0 { + *bitmap |= 1u128 << i; + return Ok(i); + } + } + Err(litebox::platform::address_space::AddressSpaceError::NoSpace) + } + + fn release(&self, id: u32) -> Result<(), litebox::platform::address_space::AddressSpaceError> { + if !(Self::FIRST_ALLOC..Self::MAX_PARTITIONS).contains(&id) { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + let mut bitmap = self.allocated.lock().unwrap(); + if *bitmap & (1u128 << id) == 0 { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + *bitmap &= !(1u128 << id); + Ok(()) + } + + fn range( + &self, + id: u32, + ) -> Result, litebox::platform::address_space::AddressSpaceError> { + if id >= Self::MAX_PARTITIONS { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + // Verify partition is allocated + { + let bitmap = self.allocated.lock().unwrap(); + if *bitmap & (1u128 << id) == 0 { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + } + let start = (id as usize) * Self::PARTITION_SIZE; + let end 
= start + Self::PARTITION_SIZE; + // Clamp to usable VA space + let start = start.max(0x1_0000); // TASK_ADDR_MIN + let end = end.min(0x7FFF_FFFF_F000); // TASK_ADDR_MAX + // Align to page boundary + let start = (start + 0xFFF) & !0xFFF; + let end = end & !0xFFF; + if start >= end { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + Ok(start..end) + } +} + #[cfg(test)] mod tests { use core::sync::atomic::AtomicU32; diff --git a/litebox_runner_linux_userland/src/lib.rs b/litebox_runner_linux_userland/src/lib.rs index 90b24878b..7e170951f 100644 --- a/litebox_runner_linux_userland/src/lib.rs +++ b/litebox_runner_linux_userland/src/lib.rs @@ -380,13 +380,12 @@ pub fn run(cli_args: CliArgs) -> Result<()> { envp }; - let program = shim.load_program( - initial_file_system, - platform.init_task(), - prog_path, - argv, - envp, - )?; + let mut task_params = platform.init_task(); + // Use deterministic guest PIDs starting from 1 (init process). + task_params.pid = 1; + task_params.ppid = 0; + + let program = shim.load_program(initial_file_system, task_params, prog_path, argv, envp)?; #[cfg(feature = "lock_tracing")] litebox::sync::start_recording(); diff --git a/litebox_runner_linux_userland/tests/common/mod.rs b/litebox_runner_linux_userland/tests/common/mod.rs index 60d760aa9..20e47be48 100644 --- a/litebox_runner_linux_userland/tests/common/mod.rs +++ b/litebox_runner_linux_userland/tests/common/mod.rs @@ -98,12 +98,28 @@ fn find_rewriter_source_files() -> Vec { /// Compile C code into an executable with caching pub fn compile(src_path: &str, unique_name: &str, exec_or_lib: bool, nolibc: bool) -> PathBuf { + compile_inner(src_path, unique_name, exec_or_lib, nolibc, false) +} + +pub fn compile_static_pie(src_path: &str, unique_name: &str) -> PathBuf { + compile_inner(src_path, unique_name, true, false, true) +} + +fn compile_inner( + src_path: &str, + unique_name: &str, + exec_or_lib: bool, + nolibc: bool, + static_pie: bool, +) -> PathBuf 
{ let dir_path = std::env::var("OUT_DIR").unwrap(); let path = std::path::Path::new(dir_path.as_str()).join(unique_name); let output = path.to_str().unwrap(); let mut args = vec!["-o", output, src_path]; - if exec_or_lib { + if static_pie { + args.extend_from_slice(&["-static-pie", "-fpie"]); + } else if exec_or_lib { args.push("-static"); } if nolibc { diff --git a/litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c b/litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c new file mode 100644 index 000000000..1f7c4e827 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: read stdin and write to stdout until EOF, then exit. +#include + +int main(void) { + char buf[256]; + ssize_t n; + while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { + write(STDOUT_FILENO, buf, n); + } + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/echo_hello.c b/litebox_runner_linux_userland/tests/multiprocess/echo_hello.c new file mode 100644 index 000000000..ccbb2269f --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/echo_hello.c @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: write "hello\n" to stdout and exit. +#include + +int main(void) { + const char msg[] = "hello\n"; + write(STDOUT_FILENO, msg, sizeof(msg) - 1); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/exit_with.c b/litebox_runner_linux_userland/tests/multiprocess/exit_with.c new file mode 100644 index 000000000..b57faac8c --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/exit_with.c @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: exit with the status given as argv[1]. 
+#include
+
+int main(int argc, char *argv[]) {
+    if (argc < 2) return 1;
+    return atoi(argv[1]);
+}
diff --git a/litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c b/litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c
new file mode 100644
index 000000000..20e306fed
--- /dev/null
+++ b/litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c
@@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+// Test: vfork + exec a helper program, wait for it, verify exit status.
+// Uses vfork() explicitly since our fork implementation has vfork semantics.
+// The child must only call execve/_exit (no library calls).
+
+#include
+#include
+#include
+#include
+
+int main(int argc, char *argv[]) {
+    if (argc < 2) {
+        fprintf(stderr, "usage: fork_exec_wait \n");
+        return 1;
+    }
+    const char *helper = argv[1];
+
+    pid_t pid = vfork();
+    if (pid < 0) {
+        perror("vfork");
+        return 1;
+    }
+    if (pid == 0) {
+        // Child: exec the helper with exit code 42.
+        execl(helper, helper, "42", (char *)NULL);
+        // If exec fails, _exit immediately.
+        _exit(127);
+    }
+    // Parent
+    int wstatus;
+    pid_t waited = waitpid(pid, &wstatus, 0);
+    if (waited != pid) {
+        fprintf(stderr, "waitpid returned %d, expected %d\n", waited, pid);
+        return 1;
+    }
+    if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 42) {
+        fprintf(stderr, "unexpected exit status: 0x%x (WIFEXITED=%d, WEXITSTATUS=%d)\n",
+                wstatus, WIFEXITED(wstatus), WEXITSTATUS(wstatus));
+        return 1;
+    }
+    printf("fork_exec_wait: OK\n");
+    return 0;
+}
diff --git a/litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c b/litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c
new file mode 100644
index 000000000..d37a263f7
--- /dev/null
+++ b/litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c
@@ -0,0 +1,80 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+// Test: pipe between two forked children (simulates `echo hello | cat`).
+// Uses vfork + exec so children detach and run concurrently.
+//
+// Child 1: dup2 pipe write end to stdout, exec "echo_hello" helper
+// Child 2: dup2 pipe read end to stdin, exec "cat_stdin" helper
+// Parent: close pipe ends, wait for both children.
+
+#include
+#include
+#include
+#include
+
+int main(int argc, char *argv[]) {
+    if (argc < 3) {
+        fprintf(stderr, "usage: pipe_fork \n");
+        return 1;
+    }
+    const char *echo_path = argv[1];
+    const char *cat_path = argv[2];
+
+    int pipefd[2];
+    if (pipe(pipefd) < 0) {
+        perror("pipe");
+        return 1;
+    }
+
+    // Fork child 1: writer (echo_hello)
+    pid_t writer = vfork();
+    if (writer < 0) {
+        perror("vfork writer");
+        return 1;
+    }
+    if (writer == 0) {
+        // Redirect stdout to pipe write end
+        dup2(pipefd[1], STDOUT_FILENO);
+        close(pipefd[0]);
+        close(pipefd[1]);
+        execl(echo_path, echo_path, (char *)NULL);
+        _exit(127);
+    }
+
+    // Fork child 2: reader (cat_stdin)
+    pid_t reader = vfork();
+    if (reader < 0) {
+        perror("vfork reader");
+        return 1;
+    }
+    if (reader == 0) {
+        // Redirect stdin to pipe read end
+        dup2(pipefd[0], STDIN_FILENO);
+        close(pipefd[0]);
+        close(pipefd[1]);
+        execl(cat_path, cat_path, (char *)NULL);
+        _exit(127);
+    }
+
+    // Parent: close both pipe ends so children get proper EOF
+    close(pipefd[0]);
+    close(pipefd[1]);
+
+    // Wait for both children
+    int wstatus;
+    for (int i = 0; i < 2; i++) {
+        pid_t w = wait(&wstatus);
+        if (w < 0) {
+            perror("wait");
+            return 1;
+        }
+        if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) {
+            fprintf(stderr, "child %d exited with status 0x%x\n", w, wstatus);
+            return 1;
+        }
+    }
+
+    printf("pipe_fork: OK\n");
+    return 0;
+}
diff --git a/litebox_runner_linux_userland/tests/run.rs b/litebox_runner_linux_userland/tests/run.rs
index f9e3c6b7a..d548e34ce 100644
--- a/litebox_runner_linux_userland/tests/run.rs
+++ b/litebox_runner_linux_userland/tests/run.rs
@@ -604,3 +604,67 @@ fn test_shebang() {
         "shebang test failed, output: {output_str}"
     );
 }
+
+// Multi-process tests (fork, pipe, waitpid)
+
+#[test]
+fn test_fork_exec_wait() {
+    // Compile the main test program (static non-PIE, runs as init process).
+    let main_target = common::compile(
+        "./tests/multiprocess/fork_exec_wait.c",
+        "fork_exec_wait",
+        true,
+        false,
+    );
+    // Compile the helper as static-pie so it can load in any VA partition.
+    let helper_target = common::compile_static_pie("./tests/multiprocess/exit_with.c", "exit_with");
+
+    // Build a runner with the helper binary added to the guest filesystem.
+    let mut runner = Runner::new(&main_target, "fork_exec_wait");
+    runner.with_fs_path(|out_dir| {
+        // Rewrite and place the helper binary in the guest filesystem.
+        let guest_helper = out_dir.join("out/exit_with");
+        let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]);
+        assert!(success, "failed to rewrite exit_with helper");
+    });
+    // Pass the guest path to the helper as an argument.
+    runner.arg("/out/exit_with");
+    let output = runner.output();
+    let output_str = String::from_utf8_lossy(&output);
+    assert!(
+        output_str.contains("fork_exec_wait: OK"),
+        "fork_exec_wait test failed, output: {output_str}"
+    );
+}
+
+#[test]
+fn test_pipe_fork() {
+    // Compile main test (static non-PIE, runs as init)
+    let main_target = common::compile("./tests/multiprocess/pipe_fork.c", "pipe_fork", true, false);
+    // Compile helpers as static-pie (loaded in child VA partitions)
+    let echo_target = common::compile_static_pie("./tests/multiprocess/echo_hello.c", "echo_hello");
+    let cat_target = common::compile_static_pie("./tests/multiprocess/cat_stdin.c", "cat_stdin");
+
+    let mut runner = Runner::new(&main_target, "pipe_fork");
+    runner.with_fs_path(|out_dir| {
+        let guest_echo = out_dir.join("out/echo_hello");
+        let success = common::rewrite_with_cache(&echo_target, &guest_echo, &[]);
+        assert!(success, "failed to rewrite echo_hello helper");
+
+        let guest_cat = out_dir.join("out/cat_stdin");
+        let success = common::rewrite_with_cache(&cat_target, &guest_cat, &[]);
+        assert!(success, "failed to rewrite cat_stdin helper");
+    });
+    runner.arg("/out/echo_hello");
+    runner.arg("/out/cat_stdin");
+    let output = runner.output();
+    let output_str = String::from_utf8_lossy(&output);
+    assert!(
+        output_str.contains("hello"),
+        "pipe_fork test failed — expected 'hello' in output, got: {output_str}"
+    );
+    assert!(
+        output_str.contains("pipe_fork: OK"),
+        "pipe_fork test failed, output: {output_str}"
+    );
+}
diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs
index c1c3d1b47..3f05a2fa4 100644
--- a/litebox_shim_linux/src/lib.rs
+++ b/litebox_shim_linux/src/lib.rs
@@ -101,7 +101,8 @@ impl litebox::shim::EnterShim for LinuxShimEntrypoints {
         if info.kernel_mode && info.exception == litebox::shim::Exception::PAGE_FAULT {
             if unsafe {
                 self.task
-                    .global
+                    .process
+                    .borrow()
                     .pm
                     .handle_page_fault(info.cr2, info.error_code.into())
             }
@@ -183,12 +184,20 @@ impl LinuxShimBuilder {
     }

     /// Build the shim.
+    ///
+    /// # Panics
+    /// Panics if the init process cannot be created in the process registry.
     pub fn build(self) -> LinuxShim {
         let mut net = Network::new(&self.litebox);
         net.set_platform_interaction(litebox::net::PlatformInteraction::Manual);
+        let process_registry = litebox::process::ProcessRegistry::new();
+        // Register the init process (PID 1).
+        process_registry
+            .create_process(None)
+            .expect("failed to create init process");
+
         let global = Arc::new(GlobalState {
             platform: self.platform,
-            pm: PageManager::new(&self.litebox),
             futex_manager: FutexManager::new(),
             pipes: Pipes::new(&self.litebox),
             net: litebox::sync::Mutex::new(net),
@@ -197,15 +206,19 @@ impl LinuxShimBuilder {
             next_thread_id: 2.into(), // start from 2, as 1 is used by the main thread
             litebox: self.litebox,
             unix_addr_table: litebox::sync::RwLock::new(syscalls::unix::UnixAddrTable::new()),
+            process_registry,
         });
-        LinuxShim(global)
+        let init_process = Arc::new(ProcessState {
+            pm: PageManager::new(&global.litebox),
+        });
+        LinuxShim(global, init_process)
     }
 }

-pub struct LinuxShim(Arc>);
+pub struct LinuxShim(Arc>, Arc);
 impl Clone for LinuxShim {
     fn clone(&self) -> Self {
-        Self(self.0.clone())
+        Self(self.0.clone(), self.1.clone())
     }
 }

@@ -238,6 +251,7 @@ impl LinuxShim {
             _not_send: core::marker::PhantomData,
             task: Task {
                 global: self.0.clone(),
+                process: RefCell::new(self.1.clone()),
                 thread: syscalls::process::ThreadState::new_process(pid),
                 wait_state: wait::WaitState::new(self.0.platform),
                 pid,
@@ -254,6 +268,7 @@ impl LinuxShim {
                 fs: Arc::new(syscalls::file::FsState::new()).into(),
                 files: files.into(),
                 signals: syscalls::signal::SignalState::new_process(),
+                fork_context: RefCell::new(None),
             },
         };

@@ -274,9 +289,9 @@ impl LinuxShim {
         })
     }

-    /// Get the global page manager
+    /// Get the page manager for the initial process.
pub fn page_manager(&self) -> &PageManager { - &self.0.pm + &self.1.pm } /// Perform queued network interactions with the outside world. @@ -991,6 +1006,11 @@ impl Task { SyscallRequest::Tgkill { tgid, tid, sig } => self.sys_tgkill(tgid, tid, sig), SyscallRequest::Sigaltstack { ss, old_ss } => self.sys_sigaltstack(ss, old_ss, ctx), SyscallRequest::Alarm { seconds } => syscall!(sys_alarm(seconds)), + SyscallRequest::Wait4 { + pid, + wstatus, + options, + } => self.sys_wait4(pid, wstatus, options), _ => { log_unsupported!("{request:?}"); Err(Errno::ENOSYS) @@ -999,14 +1019,12 @@ impl Task { } } -/// Global shim state, shared across all tasks. +/// Global shim state, shared across all tasks and all processes. struct GlobalState { /// The platform instance used throughout the shim. platform: &'static Platform, /// The LiteBox instance used throughout the shim. litebox: litebox::LiteBox, - /// The page manager for managing virtual memory. - pm: litebox::mm::PageManager, /// The futex manager for handling futex operations. futex_manager: FutexManager, /// The anonymous pipe implementation. @@ -1022,10 +1040,19 @@ struct GlobalState { next_thread_id: core::sync::atomic::AtomicI32, /// UNIX domain socket address table unix_addr_table: litebox::sync::RwLock>, + /// Process registry for tracking parent-child relationships and exit status. + process_registry: litebox::process::ProcessRegistry, +} + +/// Per-process state, shared among threads of the same process. +struct ProcessState { + /// The page manager for this process's virtual memory / address space. + pm: litebox::mm::PageManager, } struct Task { global: Arc>, + process: RefCell>, wait_state: wait::WaitState, thread: syscalls::process::ThreadState, /// Process ID @@ -1045,6 +1072,8 @@ struct Task { files: RefCell>>, /// Signal state signals: syscalls::signal::SignalState, + /// Fork context: present on vfork children, used to signal parent on exec/exit. 
+    fork_context: RefCell>,
 }

 impl Drop for Task {
@@ -1071,6 +1100,9 @@ mod test_utils {
             Task {
                 wait_state: wait::WaitState::new(self.platform),
                 thread: syscalls::process::ThreadState::new_process(pid),
+                process: RefCell::new(Arc::new(ProcessState {
+                    pm: PageManager::new(&self.litebox),
+                })),
                 pid,
                 ppid: 0,
                 tid: pid,
@@ -1084,6 +1116,7 @@ mod test_utils {
                 fs: Arc::new(syscalls::file::FsState::new()).into(),
                 files: files.into(),
                 signals: syscalls::signal::SignalState::new_process(),
+                fork_context: RefCell::new(None),
                 global: self,
             }
         }
@@ -1099,6 +1132,7 @@ mod test_utils {
             let task = Task {
                 wait_state: wait::WaitState::new(self.global.platform),
                 global: self.global.clone(),
+                process: self.process.clone(),
                 thread: self.thread.new_thread(tid)?,
                 pid: self.pid,
                 ppid: self.ppid,
@@ -1108,6 +1142,7 @@ mod test_utils {
                 fs: self.fs.clone(),
                 files: self.files.clone(),
                 signals: self.signals.clone_for_new_task(),
+                fork_context: RefCell::new(None),
             };
             Some(task)
         }
diff --git a/litebox_shim_linux/src/loader/elf.rs b/litebox_shim_linux/src/loader/elf.rs
index 0d62030a8..3d7d40d4b 100644
--- a/litebox_shim_linux/src/loader/elf.rs
+++ b/litebox_shim_linux/src/loader/elf.rs
@@ -72,13 +72,15 @@ impl litebox_common_linux::loader::MapMemory for ElfFile<'_, FS> {
     type Error = Errno;

     fn reserve(&mut self, len: usize, align: usize) -> Result {
+        // Compute a hint address within this process's VA partition.
+        let hint = self.task.process.borrow().pm.addr_min() + super::PIE_LOAD_OFFSET;
         // Allocate a mapping large enough that even if it's maximally misaligned we can
         // still fit `len` bytes.
let mapping_len = len + (align.max(PAGE_SIZE) - PAGE_SIZE); let mapping_ptr = self .task .sys_mmap( - super::DEFAULT_LOW_ADDR, + hint, mapping_len, litebox_common_linux::ProtFlags::PROT_NONE, litebox_common_linux::MapFlags::MAP_ANONYMOUS @@ -202,6 +204,7 @@ impl<'a, FS: ShimFS> ElfLoader<'a, FS> { mut aux: AuxVec, ) -> Result { let global = &self.main.file.task.global; + let process = self.main.file.task.process.borrow(); // Load the main ELF file first so that it gets privileged addresses. let info = self @@ -220,7 +223,7 @@ impl<'a, FS: ShimFS> ElfLoader<'a, FS> { None }; - global.pm.set_initial_brk(info.brk); + process.pm.set_initial_brk(info.brk); aux.insert(AuxKey::AT_PAGESZ, PAGE_SIZE); aux.insert(AuxKey::AT_PHDR, info.phdrs_addr); aux.insert(AuxKey::AT_PHENT, info.phent_size()); @@ -236,7 +239,7 @@ impl<'a, FS: ShimFS> ElfLoader<'a, FS> { let sp = unsafe { let length = litebox::mm::linux::NonZeroPageSize::new(super::DEFAULT_STACK_SIZE) .expect("DEFAULT_STACK_SIZE is not page-aligned"); - global + process .pm .create_stack_pages(None, length, CreatePagesFlags::empty()) .map_err(ElfLoaderError::MappingError)? diff --git a/litebox_shim_linux/src/loader/mod.rs b/litebox_shim_linux/src/loader/mod.rs index a7e370cd3..924d747ac 100644 --- a/litebox_shim_linux/src/loader/mod.rs +++ b/litebox_shim_linux/src/loader/mod.rs @@ -10,6 +10,7 @@ mod stack; pub(crate) const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB -/// A default low address is used for the binary (which grows upwards) to avoid -/// conflicts with the kernel's memory mappings (which grows downwards). -pub(crate) const DEFAULT_LOW_ADDR: usize = 0x1000_0000; +/// Offset added to the process's `addr_min` when computing the PIE load hint. +/// This places binaries low in the partition (growing upwards), leaving +/// room for top-down allocations (stack, mmap) at the high end. 
+pub(crate) const PIE_LOAD_OFFSET: usize = 0x1000_0000; // 256 MiB diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index 7b900243d..d7ff52620 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -76,6 +76,21 @@ impl FilesState { } } + /// Clone the file descriptor table for fork. + /// + /// The child gets its own `RawDescriptorStorage` (so close/dup in the + /// child does not affect the parent's FD numbering), but the underlying + /// open file descriptions are shared via Arc. + pub(crate) fn clone_for_fork(&self) -> Self { + Self { + fs: self.fs.clone(), + raw_descriptor_store: litebox::sync::RwLock::new( + self.raw_descriptor_store.read().clone_for_fork(), + ), + max_fd: AtomicUsize::new(self.max_fd.load(Ordering::Relaxed)), + } + } + pub(crate) fn set_max_fd(&self, max_fd: usize) { self.max_fd.store(max_fd, Ordering::Relaxed); } @@ -519,6 +534,11 @@ impl Task { match rds.fd_consume_raw_integer(raw_fd) { Ok(fd) => { drop(rds); + // If another process (fork) still holds a reference to this FD, + // just drop our reference without closing the underlying entry. 
+ if alloc::sync::Arc::strong_count(&fd) > 1 { + return Ok(()); + } return files.fs.close(&fd).map_err(Errno::from); } Err(litebox::fd::ErrRawIntFd::NotFound) => { @@ -530,14 +550,23 @@ impl Task { } if let Ok(fd) = rds.fd_consume_raw_integer(raw_fd) { drop(rds); + if alloc::sync::Arc::strong_count(&fd) > 1 { + return Ok(()); + } return self.global.close_socket(&self.wait_cx(), fd); } if let Ok(fd) = rds.fd_consume_raw_integer(raw_fd) { drop(rds); + if alloc::sync::Arc::strong_count(&fd) > 1 { + return Ok(()); + } return self.global.pipes.close(&fd).map_err(Errno::from); } if let Ok(fd) = rds.fd_consume_raw_integer::(raw_fd) { drop(rds); + if alloc::sync::Arc::strong_count(&fd) > 1 { + return Ok(()); + } let entry = { let mut dt = self.global.litebox.descriptor_table_mut(); dt.remove(&fd) @@ -547,6 +576,9 @@ impl Task { } if let Ok(fd) = rds.fd_consume_raw_integer::>(raw_fd) { drop(rds); + if alloc::sync::Arc::strong_count(&fd) > 1 { + return Ok(()); + } let entry = { let mut dt = self.global.litebox.descriptor_table_mut(); dt.remove(&fd) @@ -556,6 +588,9 @@ impl Task { } if let Ok(fd) = rds.fd_consume_raw_integer::>(raw_fd) { drop(rds); + if alloc::sync::Arc::strong_count(&fd) > 1 { + return Ok(()); + } let entry = { let mut dt = self.global.litebox.descriptor_table_mut(); dt.remove(&fd) diff --git a/litebox_shim_linux/src/syscalls/mm.rs b/litebox_shim_linux/src/syscalls/mm.rs index ce6c3513c..501b4645a 100644 --- a/litebox_shim_linux/src/syscalls/mm.rs +++ b/litebox_shim_linux/src/syscalls/mm.rs @@ -45,7 +45,7 @@ impl Task { op: impl FnOnce(MutPtr) -> Result, ) -> Result, MappingError> { litebox_common_linux::mm::do_mmap( - &self.global.pm, + &self.process.borrow().pm, suggested_addr, len, prot, @@ -175,7 +175,7 @@ impl Task { // SAFETY: ptr is the freshly CoW-mapped region of exactly `len` bytes with // `permissions`. 
unsafe { - self.global.pm.register_existing_mapping( + self.process.borrow().pm.register_existing_mapping( range, permissions, true, @@ -303,7 +303,7 @@ impl Task { /// Handle syscall `munmap` #[inline] pub(crate) fn sys_munmap(&self, addr: crate::MutPtr, len: usize) -> Result<(), Errno> { - litebox_common_linux::mm::sys_munmap(&self.global.pm, addr, len) + litebox_common_linux::mm::sys_munmap(&self.process.borrow().pm, addr, len) } /// Handle syscall `mprotect` @@ -314,7 +314,7 @@ impl Task { len: usize, prot: ProtFlags, ) -> Result<(), Errno> { - litebox_common_linux::mm::sys_mprotect(&self.global.pm, addr, len, prot) + litebox_common_linux::mm::sys_mprotect(&self.process.borrow().pm, addr, len, prot) } #[inline] @@ -327,7 +327,7 @@ impl Task { new_addr: usize, ) -> Result, Errno> { litebox_common_linux::mm::sys_mremap( - &self.global.pm, + &self.process.borrow().pm, old_addr, old_size, new_size, @@ -339,7 +339,7 @@ impl Task { /// Handle syscall `brk` #[inline] pub(crate) fn sys_brk(&self, addr: MutPtr) -> Result { - litebox_common_linux::mm::sys_brk(&self.global.pm, addr) + litebox_common_linux::mm::sys_brk(&self.process.borrow().pm, addr) } /// Handle syscall `madvise` @@ -350,7 +350,7 @@ impl Task { len: usize, advice: litebox_common_linux::MadviseBehavior, ) -> Result<(), Errno> { - litebox_common_linux::mm::sys_madvise(&self.global.pm, addr, len, advice) + litebox_common_linux::mm::sys_madvise(&self.process.borrow().pm, addr, len, advice) } } diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 69a41b70b..bb2f5fd26 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -8,7 +8,7 @@ use alloc::boxed::Box; use alloc::collections::btree_map::BTreeMap; use alloc::sync::Arc; use alloc::vec::Vec; -use core::cell::Cell; +use core::cell::{Cell, RefCell}; use core::mem::offset_of; use core::ops::Range; use core::sync::atomic::{AtomicBool, Ordering}; @@ -19,7 
+19,7 @@ use litebox::platform::ThreadProvider; use litebox::platform::{Instant as _, SystemTime as _, TimeProvider}; use litebox::platform::{ PunchthroughProvider as _, PunchthroughToken as _, RawConstPointer as _, RawMutex as _, - ThreadLocalStorageProvider as _, + RawMutexProvider, ThreadLocalStorageProvider as _, }; use litebox::platform::{RawMutPointer as _, TimerHandle, TimerProvider}; use litebox::sync::Mutex; @@ -29,6 +29,49 @@ use litebox_common_linux::{ }; use litebox_platform_multiplex::Platform; +/// One-shot signal from a vfork child to the parent, indicating the child +/// has called `execve` or `_exit` and the parent may resume. +/// +/// On userland, all forks are treated as vfork: the parent is suspended +/// while the child runs in the shared address space. When the child performs +/// exec (detaching to its own VA partition) or exits, it signals the parent +/// via this structure. +pub(crate) struct VforkDone { + /// 0 = not done, 1 = done. + futex: ::RawMutex, +} + +impl VforkDone { + fn new() -> Self { + Self { + futex: ::RawMutex::INIT, + } + } + + /// Signal that the child is done (called by child on exec or exit). + pub(crate) fn signal(&self) { + self.futex.underlying_atomic().store(1, Ordering::Release); + self.futex.wake_all(); + } + + /// Block until the child signals done (called by parent after spawning child). + fn wait(&self) { + loop { + if self.futex.underlying_atomic().load(Ordering::Acquire) != 0 { + return; + } + let _ = self.futex.block(0); + } + } +} + +/// Context carried by a fork child task so that exec and exit know +/// to signal the parent and (on exec) detach to a new address space. +pub(crate) struct ForkContext { + /// Signaled on exec or exit to wake the parent. + pub(crate) vfork_done: Arc, +} + /// Process-management-related state on [`Task`]. 
pub(crate) struct ThreadState { init_state: Cell, @@ -322,6 +365,12 @@ enum ThreadInitState { tls: Option, set_child_tid: Option>, }, + /// A fork child: starts with parent's register state, return value 0. + NewForkChild { + /// The guest FS base (TLS pointer) inherited from the parent. + #[cfg(target_arch = "x86_64")] + guest_fsbase: usize, + }, } /// Credentials of a process @@ -500,6 +549,19 @@ impl Task { pub(crate) fn prepare_for_exit(&mut self) { self.thread.detach_from_process(); + // Close all file descriptors when the process leader exits. + // Only the process leader (pid == tid) closes FDs, not worker threads + // (which share the same FD table and may exit during exec). + if self.pid == self.tid { + let files = self.files.borrow(); + let live_fds: alloc::vec::Vec = + files.raw_descriptor_store.read().iter_alive().collect(); + drop(files); + for fd in live_fds { + let _ = self.do_close(fd); + } + } + if let Some(clear_child_tid) = self.thread.clear_child_tid.take() { // Clear the child TID if requested // TODO: if we are the last thread, we don't need to clear it @@ -515,6 +577,33 @@ impl Task { if let Some(robust_list) = self.thread.robust_list.take() { let _ = wake_robust_list(robust_list); } + + // If this is the process leader (pid == tid) and it's exiting, + // record the exit in the process registry so waitpid can collect it. + // This must happen BEFORE signaling vfork_done, so that the parent + // can immediately waitpid after being unblocked. + if self.pid == self.tid + && let Some(process_id) = litebox::process::ProcessId::new(self.pid.cast_unsigned()) + { + // Get the exit status from the process thread group. 
+ let exit_status = self.thread.process.inner.lock().exit_status; + let wait_status = match exit_status { + ExitStatus::Exit(code) => (u32::from(code.cast_unsigned()) & 0xff) << 8, + ExitStatus::Signal(sig) => sig.as_i32().cast_unsigned() & 0x7f, + }; + let _ = self + .global + .process_registry + .exit_process(process_id, wait_status, |_orphan| { + // TODO: reparent orphans to init + }); + } + + // If this is a vfork child that never exec'd, signal the parent. + // Done after exit recording so parent's waitpid sees the exit. + if let Some(fc) = self.fork_context.borrow_mut().take() { + fc.vfork_done.signal(); + } } pub(crate) fn sys_exit(&self, status: i32) { @@ -527,6 +616,46 @@ impl Task { // Tear down occurs similarly to `sys_exit`. self.exit_group(ExitStatus::Exit(status.truncate())); } + + /// Handle syscall `wait4`: wait for a child process (`rusage` is not taken by this handler). + pub(crate) fn sys_wait4( + &self, + pid: i32, + wstatus: Option>, + options: i32, + ) -> Result { + const WNOHANG: i32 = 1; + + let parent_pid = + litebox::process::ProcessId::new(self.pid.cast_unsigned()).ok_or(Errno::ESRCH)?; + + loop { + match self.global.process_registry.try_wait(parent_pid, pid) { + Err(()) => { + // No matching children at all — ECHILD. + return Err(Errno::ECHILD); + } + Ok(Some((child_pid, status))) => { + // Reaped a child. + if let Some(wstatus) = wstatus { + let _ = wstatus.write_at_offset(0, status.cast_signed()); + } + return Ok(child_pid.as_u32() as usize); + } + Ok(None) => { + // Children exist but none exited yet. + if options & WNOHANG != 0 { + return Ok(0); + } + // Wait via the registry's `wait_for_any_child_exit`, then loop + // and retry `try_wait`. A proper implementation would use + // ExitSubject observers, but this suffices for the minimal + // multi-process support. + self.global.process_registry.wait_for_any_child_exit(); + } + } + } + } } /// A descriptor for thread-local storage (TLS). 
@@ -577,12 +706,155 @@ impl Task { /// Creates a new thread or process. /// - /// Note we currently only support creating threads with the VM, FS, and FILES flags set. + /// If `CLONE_THREAD` is set, creates a new thread in the current process. + /// Otherwise, treats the clone as a fork (vfork semantics: parent is + /// suspended until child calls exec or exits). fn do_clone( &self, ctx: &litebox_common_linux::PtRegs, args: &litebox_common_linux::CloneArgs, clone3: bool, + ) -> Result { + let litebox_common_linux::CloneArgs { mut flags, .. } = *args; + + // `CLONE_DETACHED` is ignored but has been reserved for reuse with + // `clone3` or in combination with `CLONE_PIDFD`. + if !clone3 && !flags.contains(CloneFlags::PIDFD) { + flags.remove(CloneFlags::DETACHED); + } + + if !flags.contains(CloneFlags::THREAD) { + // This is a fork (or vfork). Route to fork path. + return self.do_fork(ctx, args); + } + + // Thread clone path — requires VM, THREAD, SIGHAND, FILES. + self.do_thread_clone(ctx, args, clone3) + } + + /// Fork: create a new child process with vfork semantics. + /// + /// On userland, all forks are treated as vfork: the parent is suspended + /// while the child runs in the parent's shared address space. When the + /// child calls `execve` (detaching to its own VA partition) or `_exit`, + /// the parent is woken. + /// + /// The child gets: + /// - A new PID/TID + /// - Its own cloned FD table (close in child doesn't affect parent) + /// - The parent's ProcessState (shared memory, shared PageManager) + /// - A `ForkContext` so exec/exit can signal the parent + fn do_fork( + &self, + ctx: &litebox_common_linux::PtRegs, + args: &litebox_common_linux::CloneArgs, + ) -> Result { + const MAX_SIGNAL_NUMBER: u64 = 64; + let litebox_common_linux::CloneArgs { + exit_signal, + set_tid, + set_tid_size, + cgroup, + .. 
+ } = *args; + + if cgroup != 0 { + log_unsupported!("fork with cgroup"); + return Err(Errno::EINVAL); + } + if set_tid != 0 || set_tid_size != 0 { + log_unsupported!("fork with set_tid"); + return Err(Errno::EINVAL); + } + + // Validate exit_signal (typically SIGCHLD for fork). + if exit_signal > MAX_SIGNAL_NUMBER { + return Err(Errno::EINVAL); + } + + // Register the child process in the process registry. + let parent_process_id = + litebox::process::ProcessId::new(self.pid.cast_unsigned()).expect("parent PID is 0"); + let child_process_id = self + .global + .process_registry + .create_process(Some(parent_process_id)) + .map_err(|_| Errno::EAGAIN)?; + let child_pid = child_process_id.as_u32().cast_signed(); + + // Advance the thread ID counter past the child PID to avoid collisions. + let _ = self + .global + .next_thread_id + .fetch_max(child_pid + 1, Ordering::Relaxed); + + // Clone the FD table for the child. + let child_files = Arc::new(self.files.borrow().clone_for_fork()); + + // Capture the parent's guest FS base for the child. + #[cfg(target_arch = "x86_64")] + let guest_fsbase = { + let punchthrough = litebox_common_linux::PunchthroughSyscall::GetFsBase; + let token = self + .global + .platform + .get_punchthrough_token_for(punchthrough) + .expect("Failed to get punchthrough token for GET_FS"); + token.execute().unwrap() + }; + + // Create the vfork synchronization. + let vfork_done = Arc::new(VforkDone::new()); + + // Build the child task. The child shares the parent's ProcessState + // (and thus PageManager / address space) until it execs. 
+ let child_thread = ThreadState::new_process(child_pid); + child_thread.init_state.set(ThreadInitState::NewForkChild { + #[cfg(target_arch = "x86_64")] + guest_fsbase, + }); + + let child_task = Task { + global: self.global.clone(), + process: RefCell::new(self.process.borrow().clone()), // shared address space + wait_state: crate::wait::WaitState::new(self.global.platform), + thread: child_thread, + pid: child_pid, + ppid: self.pid, + tid: child_pid, + credentials: self.credentials.clone(), + comm: self.comm.clone(), + fs: RefCell::new((*self.fs.borrow()).clone()), + files: RefCell::new(child_files), + signals: self.signals.clone_for_new_task(), + fork_context: RefCell::new(Some(ForkContext { + vfork_done: vfork_done.clone(), + })), + }; + + // Spawn the child as a new host thread. + let r = unsafe { + self.global + .platform + .spawn_thread(ctx, Box::new(NewThreadArgs { task: child_task })) + }; + if let Err(err) = r { + litebox_util_log::error!(err:% = err; "failed to spawn fork child"); + return Err(Errno::ENOMEM); + } + + // Parent blocks here until child execs or exits. + vfork_done.wait(); + + Ok(usize::try_from(child_pid).unwrap()) + } + + /// Creates a new thread within the current process. + fn do_thread_clone( + &self, + ctx: &litebox_common_linux::PtRegs, + args: &litebox_common_linux::CloneArgs, + clone3: bool, ) -> Result { const MAX_SIGNAL_NUMBER: u64 = 64; @@ -717,6 +989,7 @@ impl Task { Box::new(NewThreadArgs { task: Task { global: self.global.clone(), + process: RefCell::new(self.process.borrow().clone()), wait_state: crate::wait::WaitState::new(self.global.platform), thread, pid: self.pid, @@ -727,6 +1000,7 @@ impl Task { fs: fs.into(), files: self.files.clone(), // TODO: !CLONE_FILES support signals: self.signals.clone_for_new_task(), + fork_context: RefCell::new(None), }, }), ) @@ -1386,6 +1660,29 @@ impl Task { Err(Errno::ELOOP) } + /// Detach from the parent's shared address space to a new VA partition. 
+ /// + /// Called during exec of a vfork child. Creates a new address space via the + /// platform, builds a new `ProcessState` with a `PageManager` scoped to that + /// partition's VA range, and replaces `self.process`. + fn detach_to_new_address_space(&self) { + use litebox::platform::AddressSpaceProvider; + + let platform = self.global.platform; + let as_id = platform + .create_address_space() + .expect("failed to create address space for fork child"); + let range = platform + .address_space_range(as_id) + .expect("failed to get address space range"); + + let new_process = Arc::new(crate::ProcessState { + pm: litebox::mm::PageManager::new_with_range(&self.global.litebox, range), + }); + + *self.process.borrow_mut() = new_process; + } + /// Handle syscall `execve`. pub(crate) fn sys_execve( &self, @@ -1467,9 +1764,20 @@ impl Task { self.signals.reset_for_exec(); + // If this is a vfork child, detach to a new address space before + // releasing memory (so we don't destroy the parent's mappings). + let vfork_done = self + .fork_context + .borrow_mut() + .take() + .map(|fc| fc.vfork_done); + if vfork_done.is_some() { + self.detach_to_new_address_space(); + } + // Don't release reserved mappings. let release = |_r: Range, vm: VmFlags| !vm.is_empty(); - unsafe { self.global.pm.release_memory(release) } + unsafe { self.process.borrow().pm.release_memory(release) } .expect("failed to release memory mappings"); litebox_platform_multiplex::Platform::clear_guest_thread_local_storage(); @@ -1477,6 +1785,12 @@ impl Task { self.load_program(loader, argv_vec, envp_vec) .expect("TODO: terminate the process cleanly"); + // Signal the parent that the vfork child has exec'd and detached. + // The parent's address space is intact; it can safely resume. 
+ if let Some(vd) = vfork_done { + vd.signal(); + } + self.init_thread_context(ctx); Ok(0) } @@ -1575,6 +1889,19 @@ impl Task { let _ = child_tid_ptr.write_at_offset(0, self.tid); } } + ThreadInitState::NewForkChild { + #[cfg(target_arch = "x86_64")] + guest_fsbase, + } => { + // Fork child: return 0 from the fork syscall. + #[cfg(target_arch = "x86_64")] + { + ctx.rax = 0; + // Restore the parent's guest FS base (TLS) on this new host thread. + self.sys_arch_prctl(ArchPrctlArg::SetFs(guest_fsbase)) + .expect("failed to set guest fsbase for fork child"); + } + } } } } From 2d4f009cb34f943fdb8dcf18938048e23d878133 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:05:50 -0700 Subject: [PATCH 02/23] Phase 5-6: Fork-aware FD lifecycle, cross-process signals, SIGPIPE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fork-aware FD refcounting: fork_refcount on IndividualEntry tracks cross-process FD sharing; clone_for_fork creates independent OwnedFds - Cross-process signal mailbox: BTreeMap-based per-PID mailboxes for delivering signals (SIGCHLD, kill) between processes - Drain mailbox on return to userspace (prepare_to_run_guest, check_for_interrupt) — not just in waitpid - SIGPIPE delivery on EPIPE for write, writev, sendto, sendmsg - SignalState::clone_for_fork: deep-clone handlers, fresh shared_pending - siginfo_chld: correctly decodes wait_status for CLD_EXITED vs CLD_KILLED - Cross-process kill() routes to target's signal mailbox --- litebox/src/fd/mod.rs | 128 ++++++++++++++---- litebox/src/net/mod.rs | 3 + litebox_shim_linux/src/lib.rs | 74 ++++++++++ litebox_shim_linux/src/syscalls/file.rs | 45 +++--- litebox_shim_linux/src/syscalls/net.rs | 26 ++-- litebox_shim_linux/src/syscalls/process.rs | 45 ++++-- litebox_shim_linux/src/syscalls/signal/mod.rs | 124 ++++++++++++++++- litebox_shim_linux/src/syscalls/unix.rs | 7 +- litebox_shim_linux/src/wait.rs | 2 + 9 files changed, 370 insertions(+), 84 
deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index addafe215..25c5a67a5 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -50,7 +50,8 @@ impl Descriptors { .iter() .map(|slot| { slot.as_ref().map(|ind| { - let cloned = IndividualEntry::new(Arc::clone(&ind.x)); + let mut cloned = IndividualEntry::new(Arc::clone(&ind.x)); + cloned.fork_refcount = 1; cloned.x.read().entry.on_dup(); cloned }) @@ -59,6 +60,25 @@ impl Descriptors { Self { entries } } + /// Increment the fork reference count for each of the given descriptor slot indices. + /// + /// This must be called during fork, paired with [`RawDescriptorStorage::clone_for_fork`], + /// so that each forked slot index is properly tracked. When a process closes an FD via + /// [`Self::remove`], the fork_refcount is decremented; the entry is only truly removed + /// when fork_refcount reaches 0. + #[expect( + clippy::missing_panics_doc, + reason = "panics only on invariant violation (slot must exist during fork)" + )] + pub fn increment_fork_refcounts(&mut self, slot_indices: &[usize]) { + for &idx in slot_indices { + let entry = self.entries[idx] + .as_mut() + .expect("fork: descriptor slot must exist"); + entry.fork_refcount += 1; + } + } + /// Insert `entry` into the descriptor table, returning an `OwnedFd` to this entry. #[expect( clippy::missing_panics_doc, @@ -130,18 +150,31 @@ impl Descriptors { /// Removes the entry at `fd`, closing out the file descriptor. /// /// Returns the descriptor entry if it is unique (i.e., it was not duplicated, or all duplicates - /// have been cleared out). + /// have been cleared out) AND no other process holds a fork reference to this slot. /// /// If the `fd` was already closed out, then (obviously) it does not return an entry. 
+ #[expect( + clippy::missing_panics_doc, + reason = "panics only on invariant violation" + )] pub fn remove( &mut self, fd: &TypedFd, ) -> Option { - let Some(old) = self.entries[fd.x.as_usize()?].take() else { - unreachable!(); - }; - old.x.read().entry.on_close(); + let idx = fd.x.as_usize()?; + let entry = self.entries[idx].as_mut().unwrap(); + entry.x.read().entry.on_close(); fd.x.mark_as_closed(); + + assert!(entry.fork_refcount > 0); + entry.fork_refcount -= 1; + if entry.fork_refcount > 0 { + // Another process still references this slot — don't remove the entry. + return None; + } + + // Last fork reference — truly vacate the slot. + let old = self.entries[idx].take().unwrap(); Arc::into_inner(old.x) .map(RwLock::into_inner) .map(DescriptorEntry::into_subsystem_entry::) @@ -162,9 +195,19 @@ impl Descriptors { can_close_immediately: F, ) -> Option> { let idx = fd.x.as_usize()?; - let Some(old) = self.entries[idx].take() else { - unreachable!(); - }; + let entry = self.entries[idx].as_mut().unwrap(); + + // If another process holds a fork reference, just decrement and don't truly close. + assert!(entry.fork_refcount > 0); + if entry.fork_refcount > 1 { + entry.x.read().entry.on_close(); + fd.x.mark_as_closed(); + entry.fork_refcount -= 1; + return Some(CloseResult::ForkDecremented); + } + + // fork_refcount == 1: this is the last process. Proceed with normal close logic. + let old = self.entries[idx].take().unwrap(); if Arc::strong_count(&old.x) == 1 { // Unique, so we can just return it if allowed. if can_close_immediately(old.x.read().as_subsystem::()) { @@ -184,7 +227,7 @@ impl Descriptors { } else { old.x.read().entry.on_close(); fd.x.mark_as_closed(); - // Shared, so we need to duplicate it. + // Shared (via dup), so we need to duplicate it. 
let old = self.entries[idx].replace(old); assert!(old.is_none()); Some(CloseResult::Duplicated(TypedFd { @@ -215,23 +258,26 @@ impl Descriptors { ) -> Vec { // Each FD corresponds to an `IndividualEntry`, which has an Arc to a `DescriptorEntry`. If // we have the same number of FDs as matching to the strong-count of a descriptor entry, + // AND the slot has fork_refcount == 1 (no other process references it), // then it must be the case that we have everything needed to close the entries out. let removable_entries: Vec<*const RwLock<_, _>> = { - let mut strong_count_and_count = HashMap::<*const _, (usize, usize)>::new(); + let mut strong_count_and_count = HashMap::<*const _, (usize, usize, bool)>::new(); for fd in fds.iter() { let entry = &self.entries[fd.x.as_usize().unwrap()]; // It would not be "incorrect" to see a closed out entry, but as it currently stands, I // believe that we'll only see alive entries, so this `unwrap` is confirming that; if we // need to expand it out, we'd simply have a `continue` here. let entry = entry.as_ref().unwrap(); - strong_count_and_count + let has_fork_refs = entry.fork_refcount > 1; + let record = strong_count_and_count .entry(Arc::as_ptr(&entry.x)) - .or_insert((Arc::strong_count(&entry.x), 0)) - .1 += 1; + .or_insert((Arc::strong_count(&entry.x), 0, false)); + record.1 += 1; + record.2 |= has_fork_refs; } strong_count_and_count .into_iter() - .filter(|(_ptr, (sc, c))| sc == c) + .filter(|(_ptr, (sc, c, has_fork))| sc == c && !has_fork) .map(|(ptr, _)| ptr) .collect() }; @@ -569,6 +615,9 @@ pub(crate) enum CloseResult { Duplicated(TypedFd), /// The FD was unique but couldn't be closed immediately (e.g., due to pending data) Deferred, + /// Another process still holds a fork reference to this slot. The fork_refcount + /// was decremented and the FD was marked closed; no further action needed. + ForkDecremented, } /// Safe(r) conversions between safely-typed file descriptors and unsafely-typed integers. 
@@ -705,24 +754,39 @@ impl RawDescriptorStorage { /// Clone the entire raw descriptor storage for fork. /// - /// Each slot in the new storage shares the same underlying `OwnedFd` - /// (via `Arc::clone`), matching POSIX fork semantics where the child - /// inherits copies of the parent's file descriptor table that refer to - /// the same open file descriptions. + /// Each slot in the new storage gets a **new, independent** `OwnedFd` + /// (with the same raw index as the parent's), avoiding shared `AtomicBool` + /// poisoning when either process closes the FD independently. + /// + /// Returns `(cloned_storage, slot_indices)` where `slot_indices` is the + /// list of descriptor table slot indices that were cloned. The caller MUST + /// call [`Descriptors::increment_fork_refcounts`] with these indices so that + /// the descriptor table knows multiple processes reference these slots. #[must_use] - pub fn clone_for_fork(&self) -> Self { - Self { - stored_fds: self - .stored_fds - .iter() - .map(|slot| { - slot.as_ref().map(|stored| StoredFd { - x: Arc::clone(&stored.x), + #[expect( + clippy::missing_panics_doc, + reason = "panics only if FD is closed during fork (invariant violation)" + )] + pub fn clone_for_fork(&self) -> (Self, Vec) { + let mut slot_indices = Vec::new(); + let stored_fds = self + .stored_fds + .iter() + .map(|slot| { + slot.as_ref().map(|stored| { + let raw = stored + .x + .as_usize() + .expect("FD should not be closed during fork"); + slot_indices.push(raw); + StoredFd { + x: Arc::new(OwnedFd::new(raw)), subsystem_entry_type_id: stored.subsystem_entry_type_id, - }) + } }) - .collect(), - } + }) + .collect(); + (Self { stored_fds }, slot_indices) } /// Returns an iterator over raw integer indices that are currently alive (i.e., occupied). @@ -858,6 +922,9 @@ pub enum MetadataError { struct IndividualEntry { x: Arc>, metadata: AnyMap, + /// Number of processes referencing this slot (incremented on fork, decremented on close). 
+ /// Starts at 1 when created or duplicated. When this reaches 0, the slot is truly vacated. + fork_refcount: usize, } impl core::ops::Deref for IndividualEntry { type Target = Arc>; @@ -870,6 +937,7 @@ impl IndividualEntry { Self { x, metadata: AnyMap::new(), + fork_refcount: 1, } } } diff --git a/litebox/src/net/mod.rs b/litebox/src/net/mod.rs index 954162620..c7416a929 100644 --- a/litebox/src/net/mod.rs +++ b/litebox/src/net/mod.rs @@ -875,6 +875,9 @@ where // We attempt to queue it for future closure and then just return. self.queued_for_closure.push(dup_fd); } + super::fd::CloseResult::ForkDecremented => { + // Another process still holds a fork reference. Our close is done. + } super::fd::CloseResult::Deferred => { let Some(()) = dt.with_entry_mut(fd, |entry| entry.entry.consider_closed = true) else { diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index 3f05a2fa4..0909e8e2a 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -17,6 +17,7 @@ extern crate alloc; use alloc::vec; use alloc::vec::Vec; +use alloc::collections::vec_deque::VecDeque; use alloc::sync::Arc; use core::cell::{Cell, RefCell}; use litebox::{ @@ -207,6 +208,7 @@ impl LinuxShimBuilder { litebox: self.litebox, unix_addr_table: litebox::sync::RwLock::new(syscalls::unix::UnixAddrTable::new()), process_registry, + signal_mailboxes: litebox::sync::Mutex::new(alloc::collections::BTreeMap::new()), }); let init_process = Arc::new(ProcessState { pm: PageManager::new(&global.litebox), @@ -269,6 +271,7 @@ impl LinuxShim { files: files.into(), signals: syscalls::signal::SignalState::new_process(), fork_context: RefCell::new(None), + signal_mailbox: self.0.register_signal_mailbox(pid), }, }; @@ -1042,6 +1045,65 @@ struct GlobalState { unix_addr_table: litebox::sync::RwLock>, /// Process registry for tracking parent-child relationships and exit status. 
process_registry: litebox::process::ProcessRegistry, + /// Cross-process signal mailboxes, keyed by PID. + /// Used for delivering signals (e.g., SIGCHLD) between processes. + #[allow(clippy::type_complexity)] + signal_mailboxes: litebox::sync::Mutex< + Platform, + alloc::collections::BTreeMap< + i32, + Arc< + litebox::sync::Mutex< + Platform, + VecDeque<( + litebox_common_linux::signal::Signal, + litebox_common_linux::signal::Siginfo, + )>, + >, + >, + >, + >, +} + +impl GlobalState { + /// Register a signal mailbox for a process. + fn register_signal_mailbox( + &self, + pid: i32, + ) -> Arc< + litebox::sync::Mutex< + Platform, + VecDeque<( + litebox_common_linux::signal::Signal, + litebox_common_linux::signal::Siginfo, + )>, + >, + > { + let mailbox = Arc::new(litebox::sync::Mutex::new(VecDeque::new())); + self.signal_mailboxes.lock().insert(pid, mailbox.clone()); + mailbox + } + + /// Deregister a signal mailbox for a process. + fn deregister_signal_mailbox(&self, pid: i32) { + self.signal_mailboxes.lock().remove(&pid); + } + + /// Send a signal to a process by PID. Returns true if the target mailbox exists. + fn send_signal_to_process( + &self, + target_pid: i32, + signal: litebox_common_linux::signal::Signal, + siginfo: litebox_common_linux::signal::Siginfo, + ) -> bool { + let mailboxes = self.signal_mailboxes.lock(); + if let Some(mailbox) = mailboxes.get(&target_pid) { + mailbox.lock().push_back((signal, siginfo)); + true + } else { + false + } + } } /// Per-process state, shared among threads of the same process. @@ -1074,6 +1136,16 @@ struct Task { signals: syscalls::signal::SignalState, /// Fork context: present on vfork children, used to signal parent on exec/exit. fork_context: RefCell>, + /// Cross-process signal mailbox for this process (shared with GlobalState). 
+ signal_mailbox: Arc< + litebox::sync::Mutex< + Platform, + VecDeque<( + litebox_common_linux::signal::Signal, + litebox_common_linux::signal::Siginfo, + )>, + >, + >, } impl Drop for Task { @@ -1117,6 +1189,7 @@ mod test_utils { files: files.into(), signals: syscalls::signal::SignalState::new_process(), fork_context: RefCell::new(None), + signal_mailbox: self.register_signal_mailbox(pid), global: self, } } @@ -1143,6 +1216,7 @@ mod test_utils { files: self.files.clone(), signals: self.signals.clone_for_new_task(), fork_context: RefCell::new(None), + signal_mailbox: self.signal_mailbox.clone(), }; Some(task) } diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index d7ff52620..cee1d5745 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -78,15 +78,22 @@ impl FilesState { /// Clone the file descriptor table for fork. /// - /// The child gets its own `RawDescriptorStorage` (so close/dup in the - /// child does not affect the parent's FD numbering), but the underlying - /// open file descriptions are shared via Arc. - pub(crate) fn clone_for_fork(&self) -> Self { + /// The child gets its own `RawDescriptorStorage` with independent `OwnedFd` + /// instances (so close in the child does not poison the parent's FDs). + /// The underlying open file descriptions are shared via Arc in the global + /// descriptor table, tracked by `fork_refcount`. + /// + /// The caller must provide a mutable reference to the global descriptor table + /// so that fork_refcounts can be incremented atomically with the clone. 
+ pub(crate) fn clone_for_fork( + &self, + descriptors: &mut litebox::fd::Descriptors, + ) -> Self { + let (cloned_rds, slot_indices) = self.raw_descriptor_store.read().clone_for_fork(); + descriptors.increment_fork_refcounts(&slot_indices); Self { fs: self.fs.clone(), - raw_descriptor_store: litebox::sync::RwLock::new( - self.raw_descriptor_store.read().clone_for_fork(), - ), + raw_descriptor_store: litebox::sync::RwLock::new(cloned_rds), max_fd: AtomicUsize::new(self.max_fd.load(Ordering::Relaxed)), } } @@ -443,7 +450,7 @@ impl Task { ) .flatten(); if let Err(Errno::EPIPE) = res { - unimplemented!("send SIGPIPE to the current task"); + self.raise_sigpipe(); } res } @@ -534,11 +541,6 @@ impl Task { match rds.fd_consume_raw_integer(raw_fd) { Ok(fd) => { drop(rds); - // If another process (fork) still holds a reference to this FD, - // just drop our reference without closing the underlying entry. - if alloc::sync::Arc::strong_count(&fd) > 1 { - return Ok(()); - } return files.fs.close(&fd).map_err(Errno::from); } Err(litebox::fd::ErrRawIntFd::NotFound) => { @@ -550,23 +552,14 @@ impl Task { } if let Ok(fd) = rds.fd_consume_raw_integer(raw_fd) { drop(rds); - if alloc::sync::Arc::strong_count(&fd) > 1 { - return Ok(()); - } return self.global.close_socket(&self.wait_cx(), fd); } if let Ok(fd) = rds.fd_consume_raw_integer(raw_fd) { drop(rds); - if alloc::sync::Arc::strong_count(&fd) > 1 { - return Ok(()); - } return self.global.pipes.close(&fd).map_err(Errno::from); } if let Ok(fd) = rds.fd_consume_raw_integer::(raw_fd) { drop(rds); - if alloc::sync::Arc::strong_count(&fd) > 1 { - return Ok(()); - } let entry = { let mut dt = self.global.litebox.descriptor_table_mut(); dt.remove(&fd) @@ -576,9 +569,6 @@ impl Task { } if let Ok(fd) = rds.fd_consume_raw_integer::>(raw_fd) { drop(rds); - if alloc::sync::Arc::strong_count(&fd) > 1 { - return Ok(()); - } let entry = { let mut dt = self.global.litebox.descriptor_table_mut(); dt.remove(&fd) @@ -588,9 +578,6 @@ impl Task { } 
if let Ok(fd) = rds.fd_consume_raw_integer::>(raw_fd) { drop(rds); - if alloc::sync::Arc::strong_count(&fd) > 1 { - return Ok(()); - } let entry = { let mut dt = self.global.litebox.descriptor_table_mut(); dt.remove(&fd) @@ -737,7 +724,7 @@ impl Task { ) .flatten(); if let Err(Errno::EPIPE) = res { - unimplemented!("send SIGPIPE to the current task"); + self.raise_sigpipe(); } res } diff --git a/litebox_shim_linux/src/syscalls/net.rs b/litebox_shim_linux/src/syscalls/net.rs index d17574814..2ec296c5e 100644 --- a/litebox_shim_linux/src/syscalls/net.rs +++ b/litebox_shim_linux/src/syscalls/net.rs @@ -767,11 +767,7 @@ impl GlobalState { }, ) .map_err(Errno::from); - if let Err(Errno::EPIPE) = ret - && !flags.contains(SendFlags::NOSIGNAL) - { - unimplemented!("send signal SIGPIPE on EPIPE"); - } + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) ret } @@ -1340,7 +1336,7 @@ impl Task { flags: SendFlags, sockaddr: Option, ) -> Result { - self.files.borrow().with_socket( + let ret = self.files.borrow().with_socket( &self.global, sockfd, |fd| { @@ -1358,7 +1354,13 @@ impl Task { .transpose()?; file.sendto(self, buf, flags, addr) }, - ) + ); + if let Err(Errno::EPIPE) = ret { + if !flags.contains(SendFlags::NOSIGNAL) { + self.raise_sigpipe(); + } + } + ret } /// Handle syscall `sendmsg` @@ -1399,7 +1401,7 @@ impl Task { .msg_iov .to_owned_slice(msg.msg_iovlen) .ok_or(Errno::EFAULT)?; - self.files.borrow().with_socket( + let ret = self.files.borrow().with_socket( &self.global, sockfd, |fd| { @@ -1440,7 +1442,13 @@ impl Task { } Ok(total_sent) }, - ) + ); + if let Err(Errno::EPIPE) = ret { + if !flags.contains(SendFlags::NOSIGNAL) { + self.raise_sigpipe(); + } + } + ret } /// Handle syscall `recvfrom` diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index bb2f5fd26..a4d082f77 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -591,12 +591,31 @@ impl 
Task { ExitStatus::Exit(code) => (u32::from(code.cast_unsigned()) & 0xff) << 8, ExitStatus::Signal(sig) => sig.as_i32().cast_unsigned() & 0x7f, }; - let _ = self - .global - .process_registry - .exit_process(process_id, wait_status, |_orphan| { - // TODO: reparent orphans to init - }); + let notification = + self.global + .process_registry + .exit_process(process_id, wait_status, |_orphan| { + // TODO: reparent orphans to init + }); + + // Deliver SIGCHLD to the parent process. + if let Some(notif) = notification { + use litebox_common_linux::signal::Signal; + let siginfo = super::signal::siginfo_chld( + notif.child_pid.as_u32().cast_signed(), + notif.exit_status, + ); + self.global.send_signal_to_process( + notif.parent_pid.as_u32().cast_signed(), + Signal::SIGCHLD, + siginfo, + ); + } + } + + // Deregister the signal mailbox for this process. + if self.pid == self.tid { + self.global.deregister_signal_mailbox(self.pid); } // If this is a vfork child that never exec'd, signal the parent. @@ -630,6 +649,9 @@ impl Task { litebox::process::ProcessId::new(self.pid.cast_unsigned()).ok_or(Errno::ESRCH)?; loop { + // Pick up any cross-process signals (e.g., SIGCHLD from exiting children). + self.drain_cross_process_signals(); + match self.global.process_registry.try_wait(parent_pid, pid) { Err(()) => { // No matching children at all — ECHILD. @@ -788,8 +810,11 @@ impl Task { .next_thread_id .fetch_max(child_pid + 1, Ordering::Relaxed); - // Clone the FD table for the child. - let child_files = Arc::new(self.files.borrow().clone_for_fork()); + // Clone the FD table for the child, incrementing fork_refcounts in the global descriptor table. + let child_files = { + let mut dt = self.global.litebox.descriptor_table_mut(); + Arc::new(self.files.borrow().clone_for_fork(&mut dt)) + }; // Capture the parent's guest FS base for the child. 
#[cfg(target_arch = "x86_64")] @@ -826,10 +851,11 @@ impl Task { comm: self.comm.clone(), fs: RefCell::new((*self.fs.borrow()).clone()), files: RefCell::new(child_files), - signals: self.signals.clone_for_new_task(), + signals: self.signals.clone_for_fork(), fork_context: RefCell::new(Some(ForkContext { vfork_done: vfork_done.clone(), })), + signal_mailbox: self.global.register_signal_mailbox(child_pid), }; // Spawn the child as a new host thread. @@ -1001,6 +1027,7 @@ impl Task { files: self.files.clone(), // TODO: !CLONE_FILES support signals: self.signals.clone_for_new_task(), fork_context: RefCell::new(None), + signal_mailbox: self.signal_mailbox.clone(), // share parent's mailbox }, }), ) diff --git a/litebox_shim_linux/src/syscalls/signal/mod.rs b/litebox_shim_linux/src/syscalls/signal/mod.rs index af5764700..aea23240d 100644 --- a/litebox_shim_linux/src/syscalls/signal/mod.rs +++ b/litebox_shim_linux/src/syscalls/signal/mod.rs @@ -91,6 +91,45 @@ impl SignalState { } } + /// Clone signal state for a fork child (new process). 
+ /// + /// Unlike `clone_for_new_task` (for threads within the same process), fork creates + /// a new process that gets: + /// - Independent signal handlers (deep-cloned, not shared) + /// - Fresh process-wide pending signals (new process, no inherited pending) + /// - Parent's blocked signal mask (inherited) + /// - Fresh per-thread pending (as with new_task) + /// - Fresh altstack + pub fn clone_for_fork(&self) -> Self { + // Deep-clone handlers: copy the inner data into a new Arc + let parent_handlers = self.handlers.borrow(); + let cloned_handlers_inner = parent_handlers.inner.lock().clone(); + let new_handlers = Arc::new(SignalHandlers { + inner: Mutex::new(cloned_handlers_inner), + }); + + Self { + pending: RefCell::new(PendingSignals::new()), + shared_pending: Arc::new(Mutex::new(PendingSignals::new())), + blocked: Cell::new(self.blocked.get()), + handlers: RefCell::new(new_handlers), + altstack: SigAltStack { + flags: SsFlags::DISABLE, + sp: 0, + size: 0, + #[cfg(target_arch = "x86_64")] + __pad: 0, + } + .into(), + last_exception: Cell::new(litebox::shim::ExceptionInfo { + exception: litebox::shim::Exception(0), + error_code: 0, + cr2: 0, + kernel_mode: false, + }), + } + } + /// Resets signal state for an `execve` call. pub(crate) fn reset_for_exec(&self) { let mut handlers = self.handlers.borrow_mut(); @@ -293,6 +332,38 @@ pub(crate) fn siginfo_kill(signal: Signal) -> Siginfo { } } +/// Creates a `Siginfo` for SIGCHLD when a child process exits. +/// `wait_status` is the wait-encoded status: `(code & 0xff) << 8` for normal exit, +/// `sig & 0x7f` for signal death. 
+pub(crate) fn siginfo_chld(child_pid: i32, wait_status: u32) -> Siginfo { + const CLD_EXITED: i32 = 1; + const CLD_KILLED: i32 = 2; + + // Decode wait_status to determine si_code and si_status + let (code, si_status) = if wait_status.trailing_zeros() >= 7 { + // Normal exit: status is in bits 15..8 + (CLD_EXITED, (wait_status >> 8) & 0xff) + } else { + // Killed by signal: signal number is in bits 6..0 + (CLD_KILLED, wait_status & 0x7f) + }; + + // Build sigchld data: { pid: i32, uid: u32, status: i32, utime: i64, stime: i64 } + let mut data = SiginfoData::new_zeroed(); + // Layout: pid at offset 0, uid at offset 4, status at offset 8 + data.pad[0] = child_pid.cast_unsigned(); + data.pad[1] = 0; // uid + data.pad[2] = si_status; + Siginfo { + signo: Signal::SIGCHLD.as_i32(), + errno: 0, + code, + #[cfg(target_arch = "x86_64")] + __pad: 0, + data, + } +} + impl SignalState { /// Updates the blocked signal mask. fn set_signal_mask(&self, mask: SigSet) { @@ -518,10 +589,34 @@ impl Task { fn do_kill(&self, pid: Option, tid: Option, signal: i32) -> Result { let signal = Signal::try_from(signal)?; if pid.is_none_or(|pid| pid == self.pid) && tid.is_none_or(|tid| tid == self.tid) { + // Signal to self self.send_signal(signal, siginfo_kill(signal)); Ok(0) + } else if tid.is_none() + && let Some(target_pid) = pid + { + // Process-directed signal to another process. 
+ // pid > 0: send to specific process + // pid == 0: send to own process group (TODO: process groups) + // pid == -1: send to all processes (TODO) + // pid < -1: send to process group |pid| (TODO: process groups) + if target_pid > 0 { + if self + .global + .send_signal_to_process(target_pid, signal, siginfo_kill(signal)) + { + Ok(0) + } else { + Err(Errno::ESRCH) + } + } else { + log_unsupported!( + "sys_kill with pid={target_pid} (process groups not yet supported)" + ); + Err(Errno::ESRCH) + } } else { - log_unsupported!("sys_{{t|tg}}kill with remote pid/tid"); + log_unsupported!("sys_tgkill with remote pid/tid"); Err(Errno::ESRCH) } } @@ -676,6 +771,23 @@ impl Task { .push(&self.process().limits, signal, siginfo); } + /// Raise SIGPIPE on the current task (used when write/send gets EPIPE). + pub(crate) fn raise_sigpipe(&self) { + use zerocopy::FromZeros; + let data = SiginfoData::new_zeroed(); + self.send_signal( + Signal::SIGPIPE, + Siginfo { + signo: Signal::SIGPIPE.as_i32(), + errno: 0, + code: SI_KERNEL, + #[cfg(target_arch = "x86_64")] + __pad: 0, + data, + }, + ); + } + /// Sends a process-directed signal (stored in shared_pending). pub(crate) fn send_shared_signal(&self, signal: Signal, siginfo: Siginfo) { if self.is_signal_ignored(signal) { @@ -687,6 +799,16 @@ impl Task { .push(&self.process().limits, signal, siginfo); } + /// Drain cross-process signals from the mailbox into local shared_pending. + /// Call this periodically (e.g., on wait/syscall return) to pick up signals + /// from other processes (e.g., SIGCHLD from exiting children). + pub(crate) fn drain_cross_process_signals(&self) { + let mut mailbox = self.signal_mailbox.lock(); + while let Some((signal, siginfo)) = mailbox.pop_front() { + self.send_shared_signal(signal, siginfo); + } + } + /// Forces a signal to be delivered on next call to `check_for_signals`. 
fn force_signal(&self, signal: Signal, force_exit: bool) { let siginfo = Siginfo { diff --git a/litebox_shim_linux/src/syscalls/unix.rs b/litebox_shim_linux/src/syscalls/unix.rs index 6aec592d8..23411ac30 100644 --- a/litebox_shim_linux/src/syscalls/unix.rs +++ b/litebox_shim_linux/src/syscalls/unix.rs @@ -1249,12 +1249,7 @@ impl UnixSocket { datagram.sendto(task, timeout, buf, is_nonblocking, addr) } }; - if let Err(Errno::EPIPE) = ret - && !flags.contains(SendFlags::NOSIGNAL) - { - // TODO: send SIGPIPE signal - unimplemented!("send SIGPIPE on EPIPE"); - } + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) ret } diff --git a/litebox_shim_linux/src/wait.rs b/litebox_shim_linux/src/wait.rs index 7ab43cee6..f2eb518d8 100644 --- a/litebox_shim_linux/src/wait.rs +++ b/litebox_shim_linux/src/wait.rs @@ -42,6 +42,7 @@ impl Task { self.queue_signals(signal); }); self.check_alarm_deadline(); + self.drain_cross_process_signals(); self.process_signals(ctx); !self.is_exiting() }) @@ -55,6 +56,7 @@ impl litebox::event::wait::CheckForInterrupt for Task { self.queue_signals(sig); }); self.check_alarm_deadline(); + self.drain_cross_process_signals(); self.is_exiting() || self.has_pending_signals() } } From a4168b69a173086821fe1cf5cbedbed544245416 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:28:48 -0700 Subject: [PATCH 03/23] Close-on-exec: close FD_CLOEXEC file descriptors during execve Add raw_fds_matching_metadata() to RawDescriptorStorage to correctly resolve raw FD numbers (not slot indices) matching per-FD metadata. Called in sys_execve before detach_to_new_address_space. 
--- litebox/src/fd/mod.rs | 46 ++++++++++++++++++++++ litebox_shim_linux/src/syscalls/process.rs | 18 +++++++++ 2 files changed, 64 insertions(+) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index 25c5a67a5..0bf2b6500 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -587,6 +587,26 @@ impl Descriptors { .metadata .insert(metadata) } + + /// Returns the indices of all live entries whose per-FD metadata of type `T` satisfies `pred`. + /// + /// **Important**: These are slot indices into `Descriptors.entries`, NOT raw FD numbers. + /// To get raw FD numbers matching metadata, use + /// [`RawDescriptorStorage::raw_fds_matching_metadata`] instead. + pub fn indices_matching_metadata( + &self, + pred: impl Fn(&T) -> bool, + ) -> alloc::vec::Vec { + self.entries + .iter() + .enumerate() + .filter_map(|(idx, slot)| { + let entry = slot.as_ref()?; + let matches = entry.metadata.get::().is_some_and(&pred); + matches.then_some(idx) + }) + .collect() + } } /// A handle to a descriptor entry (via [`Descriptors::entry_handle`]) that can be used without @@ -796,6 +816,32 @@ impl RawDescriptorStorage { .enumerate() .filter_map(|(i, slot)| slot.as_ref().map(|_| i)) } + + /// Returns raw FD numbers whose corresponding `Descriptors` slot has per-FD metadata + /// of type `T` satisfying `pred`. + /// + /// This resolves the raw FD → slot index mapping correctly, unlike + /// [`Descriptors::indices_matching_metadata`] which returns slot indices. 
+ pub fn raw_fds_matching_metadata< + Platform: RawSyncPrimitivesProvider, + T: core::any::Any + Send + Sync, + >( + &self, + descriptors: &Descriptors, + pred: impl Fn(&T) -> bool, + ) -> alloc::vec::Vec { + self.stored_fds + .iter() + .enumerate() + .filter_map(|(raw_fd, slot)| { + let stored = slot.as_ref()?; + let slot_idx = stored.x.as_usize()?; + let entry = descriptors.entries.get(slot_idx)?.as_ref()?; + let matches = entry.metadata.get::().is_some_and(&pred); + matches.then_some(raw_fd) + }) + .collect() + } } macro_rules! multi_subsystem_generic { diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index a4d082f77..25fe747d4 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1791,6 +1791,24 @@ impl Task { self.signals.reset_for_exec(); + // Close FDs marked with FD_CLOEXEC (POSIX requirement). + { + let files = self.files.borrow(); + let rds = files.raw_descriptor_store.read(); + let dt = self.global.litebox.descriptor_table(); + let cloexec_fds = rds + .raw_fds_matching_metadata::<_, litebox_common_linux::FileDescriptorFlags>( + &dt, + |flags| flags.contains(litebox_common_linux::FileDescriptorFlags::FD_CLOEXEC), + ); + drop(dt); + drop(rds); + drop(files); + for raw_fd in cloexec_fds { + let _ = self.do_close(raw_fd); + } + } + // If this is a vfork child, detach to a new address space before // releasing memory (so we don't destroy the parent's mappings). let vfork_done = self From 7d09afbcab579414c122e6568a6de90ffa16d28d Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:32:06 -0700 Subject: [PATCH 04/23] Orphan reparenting: reparent children to init (pid 1) on parent exit ProcessRegistry::reparent() updates parent/child relationships and returns zombie status so caller can deliver SIGCHLD to new parent. Orphan handler in prepare_for_exit sends SIGCHLD to init for zombies. 
--- litebox/src/process/mod.rs | 37 ++++++++++++++++++++++ litebox_shim_linux/src/syscalls/process.rs | 21 ++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index eb9ed4fc6..9a8b3c30c 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -330,6 +330,43 @@ impl ProcessRegistry { notification } + /// Reparent a child process to a new parent. + /// + /// Updates the child's parent field, removes it from the old parent's children list, + /// and adds it to the new parent's children list. + /// + /// Returns `Some(exit_status)` if the child is already a zombie (exited but not reaped), + /// so the caller can deliver SIGCHLD to the new parent. + pub fn reparent(&self, child: ProcessId, new_parent: ProcessId) -> Option { + let mut inner = self.inner.lock(); + + // Read child info first. + let (old_parent, exit_status) = { + let Some(entry) = inner.processes.get_mut(&child) else { + return None; + }; + let old_parent = entry.context.parent.replace(new_parent); + let exit_status = match entry.context.state { + ProcessState::Exited(status) => Some(status), + _ => None, + }; + (old_parent, exit_status) + }; + + // Remove from old parent's children list. + if let Some(old_pid) = old_parent { + if let Some(old_entry) = inner.processes.get_mut(&old_pid) { + old_entry.context.children.retain(|&c| c != child); + } + } + + // Add to new parent's children list. + if let Some(parent_entry) = inner.processes.get_mut(&new_parent) { + parent_entry.context.children.push(child); + } + exit_status + } + /// Read process context through a closure. /// Returns `None` if the process does not exist. 
pub fn with_context( diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 25fe747d4..38d502684 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -594,8 +594,25 @@ impl Task { let notification = self.global .process_registry - .exit_process(process_id, wait_status, |_orphan| { - // TODO: reparent orphans to init + .exit_process(process_id, wait_status, |orphan| { + // Reparent orphaned children to init (pid 1). + if let Some(init_pid) = litebox::process::ProcessId::new(1) { + let zombie_status = + self.global.process_registry.reparent(orphan, init_pid); + // If the orphan is already a zombie, notify init with SIGCHLD. + if let Some(exit_status) = zombie_status { + use litebox_common_linux::signal::Signal; + let siginfo = super::signal::siginfo_chld( + orphan.as_u32().cast_signed(), + exit_status, + ); + self.global.send_signal_to_process( + 1, // init pid + Signal::SIGCHLD, + siginfo, + ); + } + } }); // Deliver SIGCHLD to the parent process. 
From e5554f4a5eba1a7a711dcd6a63a498e0c74c7255 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:38:06 -0700 Subject: [PATCH 05/23] Process groups: setpgid/getpgid/getpgrp/setsid syscalls, kill to pgid - pgid/sid fields in ProcessContext, inherited by fork children - setpgid with self-or-child constraint - setsid checks process group leader (pgid == pid), not session leader - kill(0, sig) sends to own process group - kill(-pgid, sig) sends to specific process group - pids_in_group() collects running processes in a group --- litebox/src/process/mod.rs | 55 ++++++++++++++++++ litebox_common_linux/src/lib.rs | 13 +++++ litebox_shim_linux/src/lib.rs | 9 +++ litebox_shim_linux/src/syscalls/process.rs | 58 +++++++++++++++++++ litebox_shim_linux/src/syscalls/signal/mod.rs | 32 +++++++++- 5 files changed, 164 insertions(+), 3 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 9a8b3c30c..1d8172dcd 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -44,10 +44,21 @@ pub struct ProcessContext { /// Parent process. `None` only for the init process. pub parent: Option, pub state: ProcessState, + /// Process group ID. Defaults to the process's own PID. + pub pgid: ProcessId, + /// Session ID. Defaults to the process's own PID for the init process. + pub sid: ProcessId, /// Child processes. children: Vec, } +impl ProcessContext { + /// Get the list of child processes. + pub fn children(&self) -> &[ProcessId] { + &self.children + } +} + /// Whether a process is running or has exited. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ProcessState { @@ -224,6 +235,15 @@ impl ProcessRegistry { let exited_flag = Arc::new(AtomicBool::new(false)); let exit_observer = Arc::new(ExitSubject::new()); + // Determine pgid and sid: init gets its own, children inherit from parent. 
+ let (pgid, sid) = match parent { + None => (pid, pid), + Some(parent_pid) => { + let parent_entry = inner.processes.get(&parent_pid).unwrap(); + (parent_entry.context.pgid, parent_entry.context.sid) + } + }; + inner.processes.insert( pid, ProcessEntry { @@ -231,6 +251,8 @@ impl ProcessRegistry { id: pid, parent, state: ProcessState::Running, + pgid, + sid, children: Vec::new(), }, exit_observer, @@ -367,6 +389,39 @@ impl ProcessRegistry { exit_status } + /// Set the process group ID for a process. + /// Returns `Ok(())` on success, `Err(())` if the process does not exist. + pub fn set_pgid(&self, id: ProcessId, pgid: ProcessId) -> Result<(), ()> { + let mut inner = self.inner.lock(); + let entry = inner.processes.get_mut(&id).ok_or(())?; + entry.context.pgid = pgid; + Ok(()) + } + + /// Create a new session: set both pgid and sid to the process's own PID. + /// Returns `Err(())` if the process doesn't exist or is already a process group leader. + pub fn setsid(&self, id: ProcessId) -> Result<(), ()> { + let mut inner = self.inner.lock(); + let entry = inner.processes.get_mut(&id).ok_or(())?; + if entry.context.pgid == id { + return Err(()); // already a process group leader + } + entry.context.pgid = id; + entry.context.sid = id; + Ok(()) + } + + /// Collect all process IDs in the given process group. + pub fn pids_in_group(&self, pgid: ProcessId) -> alloc::vec::Vec { + let inner = self.inner.lock(); + inner + .processes + .values() + .filter(|e| e.context.pgid == pgid && matches!(e.context.state, ProcessState::Running)) + .map(|e| e.context.id) + .collect() + } + /// Read process context through a closure. /// Returns `None` if the process does not exist. 
pub fn with_context( diff --git a/litebox_common_linux/src/lib.rs b/litebox_common_linux/src/lib.rs index 64e17c02c..922c300b2 100644 --- a/litebox_common_linux/src/lib.rs +++ b/litebox_common_linux/src/lib.rs @@ -2102,6 +2102,15 @@ pub enum SyscallRequest { }, Getpid, Getppid, + Setpgid { + pid: i32, + pgid: i32, + }, + Getpgid { + pid: i32, + }, + Getpgrp, + Setsid, Getuid, Geteuid, Getgid, @@ -2420,6 +2429,10 @@ impl SyscallRequest { Sysno::prlimit64 => sys_req!(Prlimit { pid, resource:?, new_limit:*, old_limit:* }), Sysno::getpid => SyscallRequest::Getpid, Sysno::getppid => SyscallRequest::Getppid, + Sysno::setpgid => sys_req!(Setpgid { pid, pgid }), + Sysno::getpgid => sys_req!(Getpgid { pid }), + Sysno::getpgrp => SyscallRequest::Getpgrp, + Sysno::setsid => SyscallRequest::Setsid, Sysno::getuid => SyscallRequest::Getuid, Sysno::getgid => SyscallRequest::Getgid, Sysno::geteuid => SyscallRequest::Geteuid, diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index 0909e8e2a..68f4f4918 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -966,6 +966,15 @@ impl Task { } SyscallRequest::Getpid => Ok(self.sys_getpid().reinterpret_as_unsigned() as usize), SyscallRequest::Getppid => Ok(self.sys_getppid().reinterpret_as_unsigned() as usize), + SyscallRequest::Setpgid { pid, pgid } => { + self.sys_setpgid(pid, pgid)?; + Ok(0) + } + SyscallRequest::Getpgid { pid } => { + Ok(self.sys_getpgid(pid)?.reinterpret_as_unsigned() as usize) + } + SyscallRequest::Getpgrp => Ok(self.sys_getpgrp()?.reinterpret_as_unsigned() as usize), + SyscallRequest::Setsid => Ok(self.sys_setsid()?.reinterpret_as_unsigned() as usize), SyscallRequest::Getuid => Ok(self.sys_getuid() as usize), SyscallRequest::Getgid => Ok(self.sys_getgid() as usize), SyscallRequest::Geteuid => Ok(self.sys_geteuid() as usize), diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 38d502684..14e418f9e 100644 --- 
a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1501,6 +1501,64 @@ impl Task { self.ppid } + /// Handle syscall `setpgid`. + pub(crate) fn sys_setpgid(&self, pid: i32, pgid: i32) -> Result<(), Errno> { + let target_pid = if pid == 0 { self.pid } else { pid }; + let target_pgid = if pgid == 0 { target_pid } else { pgid }; + let Some(target) = litebox::process::ProcessId::new(target_pid.cast_unsigned()) else { + return Err(Errno::ESRCH); + }; + let Some(pg) = litebox::process::ProcessId::new(target_pgid.cast_unsigned()) else { + return Err(Errno::EINVAL); + }; + // POSIX: can only setpgid on self or a child process. + let my_pid = + litebox::process::ProcessId::new(self.pid.cast_unsigned()).ok_or(Errno::ESRCH)?; + if target != my_pid { + let is_child = self + .global + .process_registry + .with_context(my_pid, |ctx| ctx.children().contains(&target)) + .unwrap_or(false); + if !is_child { + return Err(Errno::ESRCH); + } + } + self.global + .process_registry + .set_pgid(target, pg) + .map_err(|()| Errno::ESRCH) + } + + /// Handle syscall `getpgid`. + pub(crate) fn sys_getpgid(&self, pid: i32) -> Result { + let target_pid = if pid == 0 { self.pid } else { pid }; + let Some(target) = litebox::process::ProcessId::new(target_pid.cast_unsigned()) else { + return Err(Errno::ESRCH); + }; + self.global + .process_registry + .with_context(target, |ctx| ctx.pgid.as_u32().cast_signed()) + .ok_or(Errno::ESRCH) + } + + /// Handle syscall `getpgrp` (equivalent to getpgid(0)). + pub(crate) fn sys_getpgrp(&self) -> Result { + self.sys_getpgid(0) + } + + /// Handle syscall `setsid`. + pub(crate) fn sys_setsid(&self) -> Result { + let Some(pid) = litebox::process::ProcessId::new(self.pid.cast_unsigned()) else { + return Err(Errno::EPERM); + }; + self.global + .process_registry + .setsid(pid) + .map_err(|()| Errno::EPERM)?; + Ok(self.pid) + } + /// Handle syscall `getuid`. 
pub(crate) fn sys_getuid(&self) -> u32 { self.credentials.uid diff --git a/litebox_shim_linux/src/syscalls/signal/mod.rs b/litebox_shim_linux/src/syscalls/signal/mod.rs index aea23240d..b9d3f6558 100644 --- a/litebox_shim_linux/src/syscalls/signal/mod.rs +++ b/litebox_shim_linux/src/syscalls/signal/mod.rs @@ -609,10 +609,36 @@ impl Task { } else { Err(Errno::ESRCH) } + } else if target_pid == 0 || target_pid < -1 { + // kill(0, sig) -> own process group + // kill(-pgid, sig) -> specific process group + let pgid_raw = if target_pid == 0 { + // Get own pgid + let my_pid = litebox::process::ProcessId::new(self.pid.cast_unsigned()) + .ok_or(Errno::ESRCH)?; + self.global + .process_registry + .with_context(my_pid, |ctx| ctx.pgid) + .ok_or(Errno::ESRCH)? + } else { + litebox::process::ProcessId::new((-target_pid).cast_unsigned()) + .ok_or(Errno::ESRCH)? + }; + let pids = self.global.process_registry.pids_in_group(pgid_raw); + if pids.is_empty() { + return Err(Errno::ESRCH); + } + for pid in pids { + self.global.send_signal_to_process( + pid.as_u32().cast_signed(), + signal, + siginfo_kill(signal), + ); + } + Ok(0) } else { - log_unsupported!( - "sys_kill with pid={target_pid} (process groups not yet supported)" - ); + // pid == -1: send to all processes (not supported) + log_unsupported!("sys_kill with pid=-1 (broadcast) not yet supported"); Err(Errno::ESRCH) } } else { From 21aada165c6b8ec57526178f1cf0fa65df6c4412 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:39:21 -0700 Subject: [PATCH 06/23] execve PATH resolution: search $PATH for binaries without '/' resolve_path_lookup() extracts PATH from envp, tries each directory, falls back to /usr/bin:/bin if PATH not set. 
--- litebox_shim_linux/src/syscalls/process.rs | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 14e418f9e..36faa2ffa 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1718,6 +1718,40 @@ fn parse_shebang(buf: &[u8]) -> Option<(&str, Option<&str>)> { } impl Task { + /// Search $PATH for a binary name (no '/' in path). + /// Returns the full path if found, or ENOENT. + fn resolve_path_lookup( + &self, + name: &str, + envp: &[alloc::ffi::CString], + ) -> Result { + let path_env = envp + .iter() + .find_map(|e| { + let s = e.to_str().ok()?; + s.strip_prefix("PATH=") + }) + .unwrap_or("/usr/bin:/bin"); + + for dir in path_env.split(':') { + let candidate = if dir.is_empty() { + alloc::format!("./{name}") + } else { + alloc::format!("{dir}/{name}") + }; + // Check if the file exists by trying to open it. + if let Ok(fd) = self.sys_open( + candidate.as_str(), + litebox::fs::OFlags::RDONLY, + litebox::fs::Mode::empty(), + ) { + let _ = self.do_close(fd as usize); + return Ok(candidate); + } + } + Err(Errno::ENOENT) + } + /// Resolve shebang (`#!`) chains for the given path and argv if the file starts with a shebang line. /// Otherwise, returns the original path and argv. pub(crate) fn resolve_shebang( @@ -1844,6 +1878,13 @@ impl Task { let (path, argv_vec) = self.resolve_shebang(alloc::string::String::from(path), argv_vec)?; + // PATH resolution: if path doesn't contain '/', search $PATH. + let path = if !path.contains('/') { + self.resolve_path_lookup(&path, &envp_vec)? + } else { + path + }; + let loader = crate::loader::elf::ElfLoader::new(self, &path)?; // After this point, the old program is torn down and failures must terminate the process. 
From a8074f111926e3a708436724ef752374e973063b Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:41:05 -0700 Subject: [PATCH 07/23] Multi-threaded fork guard: reject fork when process has >1 threads Returns ENOSYS with log_unsupported if do_fork is called from a multi-threaded process, preventing undefined behavior. --- litebox_shim_linux/src/syscalls/process.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 36faa2ffa..d35c11020 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -811,6 +811,19 @@ impl Task { return Err(Errno::EINVAL); } + // Guard: fork from a multi-threaded process is not supported. + { + let inner = self.thread.process.inner.lock(); + if inner.threads.len() > 1 { + log_unsupported!( + "fork from multi-threaded process (pid={}, {} threads) is not supported", + self.pid, + inner.threads.len() + ); + return Err(Errno::ENOSYS); + } + } + // Register the child process in the process registry. let parent_process_id = litebox::process::ProcessId::new(self.pid.cast_unsigned()).expect("parent PID is 0"); From 64db6ea17c64b0386052f7094c2cc1871d0b038b Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:42:05 -0700 Subject: [PATCH 08/23] waitpid: support specific PID, process group, and own-group waits try_wait now handles pid==0 (own process group), pid<-1 (specific process group), in addition to existing pid>0 and pid==-1. 
--- litebox/src/process/mod.rs | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 1d8172dcd..29d80552d 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -537,7 +537,37 @@ impl ProcessRegistry { } result } - _ => return Err(()), // process groups not supported + 0 => { + // Wait for any child in the caller's process group. + let caller_pgid = parent_entry.context.pgid; + let mut result = None; + for &child_pid in &children { + if let Some(entry) = inner.processes.get(&child_pid) + && entry.context.pgid == caller_pgid + && let ProcessState::Exited(status) = entry.context.state + { + result = Some((child_pid, status)); + break; + } + } + result + } + t if t < -1 => { + // Wait for any child in process group |t|. + let pgid = ProcessId((-t).cast_unsigned()); + let mut result = None; + for &child_pid in &children { + if let Some(entry) = inner.processes.get(&child_pid) + && entry.context.pgid == pgid + && let ProcessState::Exited(status) = entry.context.state + { + result = Some((child_pid, status)); + break; + } + } + result + } + _ => return Err(()), }; // Reap the child if found From fa161fcf73231ecddacef1eab7f7a2a50c7bd903 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 09:49:33 -0700 Subject: [PATCH 09/23] Pipe readv/writev: implement pipe paths for scatter/gather I/O readv: check pipe FD once before the iov loop, then use a dedicated pipe read path that avoids the double-mutable-borrow of kernel_buffer that would occur inside run_on_raw_fd closures. writev: route pipe FDs through write_to_iovec inside run_on_raw_fd. 
--- litebox_shim_linux/src/syscalls/file.rs | 120 ++++++++++++++++-------- 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index cee1d5745..be4c5c1ef 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -611,47 +611,78 @@ impl Task { let iovs: &[IoReadVec>] = &iovec.to_owned_slice(iovcnt).ok_or(Errno::EFAULT)?; let files = self.files.borrow(); let mut total_read = 0; - let mut kernel_buffer = vec![ - 0u8; - iovs.iter() - .map(|i| i.iov_len) - .max() - .unwrap_or_default() - .min(super::super::MAX_KERNEL_BUF_SIZE) - ]; - for iov in iovs { - if iov.iov_len == 0 { - continue; + + // Check once whether this FD is a pipe to avoid per-iov lock acquisition. + let pipe_fd = { + let rds = files.raw_descriptor_store.read(); + rds.fd_from_raw_integer::>(raw_fd) + .ok() + }; + + if let Some(pipe_fd) = pipe_fd { + // Pipe-specific readv path: avoids borrow conflict with kernel_buffer. 
+ for iov in iovs { + if iov.iov_len == 0 { + continue; + } + let Ok(_iov_len) = isize::try_from(iov.iov_len) else { + return Err(Errno::EINVAL); + }; + let mut pipe_buf = vec![0u8; iov.iov_len.min(super::super::MAX_KERNEL_BUF_SIZE)]; + let n = self + .global + .pipes + .read(&self.wait_cx(), &pipe_fd, &mut pipe_buf) + .map_err(Errno::from)?; + iov.iov_base + .copy_from_slice(0, &pipe_buf[..n]) + .ok_or(Errno::EFAULT)?; + total_read += n; + if n < iov.iov_len { + break; + } } - let Ok(_iov_len) = isize::try_from(iov.iov_len) else { - return Err(Errno::EINVAL); - }; - // TODO: The data transfers performed by readv() and writev() are atomic: the data - // written by writev() is written as a single block that is not intermingled with - // output from writes in other processes - let size = files - .run_on_raw_fd( - raw_fd, - |fd| { - files - .fs - .read(fd, &mut kernel_buffer, None) - .map_err(Errno::from) - }, - |_fd| todo!("net"), - |_fd| todo!("pipes"), - |_fd| todo!("eventfd"), - |_fd| Err(Errno::EINVAL), - |_fd| todo!("unix"), - ) - .flatten()?; - iov.iov_base - .copy_from_slice(0, &kernel_buffer[..size]) - .ok_or(Errno::EFAULT)?; - total_read += size; - if size < iov.iov_len { - // Okay to transfer fewer bytes than requested - break; + } else { + let mut kernel_buffer = vec![ + 0u8; + iovs.iter() + .map(|i| i.iov_len) + .max() + .unwrap_or_default() + .min(super::super::MAX_KERNEL_BUF_SIZE) + ]; + for iov in iovs { + if iov.iov_len == 0 { + continue; + } + let Ok(_iov_len) = isize::try_from(iov.iov_len) else { + return Err(Errno::EINVAL); + }; + // TODO: The data transfers performed by readv() and writev() are atomic + let size = files + .run_on_raw_fd( + raw_fd, + |fd| { + files + .fs + .read(fd, &mut kernel_buffer, None) + .map_err(Errno::from) + }, + |_fd| todo!("net"), + |_fd| unreachable!(), // pipes handled above + |_fd| todo!("eventfd"), + |_fd| Err(Errno::EINVAL), + |_fd| todo!("unix"), + ) + .flatten()?; + iov.iov_base + .copy_from_slice(0, 
&kernel_buffer[..size]) + .ok_or(Errno::EFAULT)?; + total_read += size; + if size < iov.iov_len { + // Okay to transfer fewer bytes than requested + break; + } } } Ok(total_read) @@ -717,7 +748,14 @@ impl Task { ) }) }, - |_fd| todo!("pipes"), + |fd| { + write_to_iovec(iovs, |buf| { + self.global + .pipes + .write(&self.wait_cx(), fd, buf) + .map_err(Errno::from) + }) + }, |_fd| todo!("eventfd"), |_fd| Err(Errno::EINVAL), |_fd| todo!("unix"), From 8e74f79e24edf84fa9c8d90375393d6ff1bf031d Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 10:14:08 -0700 Subject: [PATCH 10/23] Tests for kill/signal, waitpid WNOHANG, and PATH resolution Add three integration tests: - test_kill_signal: vfork+exec sleeper, kill with SIGKILL, verify WIFSIGNALED - test_waitpid_wnohang: poll with WNOHANG until child exits - test_exec_path_lookup: execve bare name triggers PATH search in shim Fix: move PATH resolution before shebang resolution in sys_execve. Previously, resolve_shebang tried to open the bare name (e.g. 'exit_with') which failed with ENOENT before PATH lookup could run. 
--- .../tests/multiprocess/kill_test.c | 50 +++++++++++++ .../tests/multiprocess/path_exec_test.c | 42 +++++++++++ .../tests/multiprocess/sleeper.c | 11 +++ .../tests/multiprocess/wnohang_test.c | 57 +++++++++++++++ litebox_runner_linux_userland/tests/run.rs | 71 +++++++++++++++++++ litebox_shim_linux/src/syscalls/process.rs | 8 +-- 6 files changed, 235 insertions(+), 4 deletions(-) create mode 100644 litebox_runner_linux_userland/tests/multiprocess/kill_test.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/sleeper.c create mode 100644 litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c diff --git a/litebox_runner_linux_userland/tests/multiprocess/kill_test.c b/litebox_runner_linux_userland/tests/multiprocess/kill_test.c new file mode 100644 index 000000000..6724ca8f9 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/kill_test.c @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: vfork + exec a sleeper, kill it with SIGKILL, verify WIFSIGNALED. + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "usage: kill_test \n"); + return 1; + } + const char *sleeper = argv[1]; + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + execl(sleeper, sleeper, (char *)NULL); + _exit(127); + } + + // Give the child a moment to start, then kill it. + // (In practice the child is already running after vfork returns to parent.) 
+ if (kill(pid, SIGKILL) < 0) { + perror("kill"); + return 1; + } + + int wstatus; + pid_t w = waitpid(pid, &wstatus, 0); + if (w != pid) { + fprintf(stderr, "waitpid returned %d, expected %d\n", w, pid); + return 1; + } + if (!WIFSIGNALED(wstatus) || WTERMSIG(wstatus) != SIGKILL) { + fprintf(stderr, "unexpected status: 0x%x (WIFSIGNALED=%d, WTERMSIG=%d)\n", + wstatus, WIFSIGNALED(wstatus), WTERMSIG(wstatus)); + return 1; + } + + printf("kill_test: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c b/litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c new file mode 100644 index 000000000..5863cf4b5 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: execve with a bare name (no '/') triggers PATH resolution in the shim. + +#include +#include +#include +#include +#include +#include + +int main(void) { + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + // Bare name — no '/' — triggers PATH lookup in the shim's sys_execve. 
+ char *const args[] = {"exit_with", "99", NULL}; + char *const envp[] = {"PATH=/out", NULL}; + execve("exit_with", args, envp); + _exit(127); + } + + int wstatus; + pid_t w = waitpid(pid, &wstatus, 0); + if (w != pid) { + fprintf(stderr, "waitpid returned %d, expected %d\n", w, pid); + return 1; + } + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 99) { + fprintf(stderr, "unexpected status: 0x%x (WIFEXITED=%d, WEXITSTATUS=%d)\n", + wstatus, WIFEXITED(wstatus), WEXITSTATUS(wstatus)); + return 1; + } + + printf("path_exec_test: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/sleeper.c b/litebox_runner_linux_userland/tests/multiprocess/sleeper.c new file mode 100644 index 000000000..33e3e82fa --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/sleeper.c @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: block forever until killed by a signal. + +#include + +int main(void) { + pause(); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c b/litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c new file mode 100644 index 000000000..1715b7be8 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: waitpid with WNOHANG — poll until child exits. + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "usage: wnohang_test \n"); + return 1; + } + const char *helper = argv[1]; + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + execl(helper, helper, "7", (char *)NULL); + _exit(127); + } + + // Poll with WNOHANG until child exits. 
+ int wstatus; + int attempts = 0; + pid_t w; + while (1) { + w = waitpid(pid, &wstatus, WNOHANG); + if (w < 0) { + perror("waitpid"); + return 1; + } + if (w == pid) { + break; // Child exited. + } + // w == 0 means child still running; keep polling. + attempts++; + if (attempts > 1000000) { + fprintf(stderr, "child did not exit after %d polls\n", attempts); + return 1; + } + } + + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 7) { + fprintf(stderr, "unexpected status: 0x%x (WIFEXITED=%d, WEXITSTATUS=%d)\n", + wstatus, WIFEXITED(wstatus), WEXITSTATUS(wstatus)); + return 1; + } + + printf("wnohang_test: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/run.rs b/litebox_runner_linux_userland/tests/run.rs index d548e34ce..aec32458f 100644 --- a/litebox_runner_linux_userland/tests/run.rs +++ b/litebox_runner_linux_userland/tests/run.rs @@ -668,3 +668,74 @@ fn test_pipe_fork() { "pipe_fork test failed, output: {output_str}" ); } + +#[test] +fn test_kill_signal() { + let main_target = common::compile("./tests/multiprocess/kill_test.c", "kill_test", true, false); + let helper_target = common::compile_static_pie("./tests/multiprocess/sleeper.c", "sleeper"); + + let mut runner = Runner::new(&main_target, "kill_test"); + runner.with_fs_path(|out_dir| { + let guest_helper = out_dir.join("out/sleeper"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite sleeper helper"); + }); + runner.arg("/out/sleeper"); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("kill_test: OK"), + "kill_test failed, output: {output_str}" + ); +} + +#[test] +fn test_waitpid_wnohang() { + let main_target = common::compile( + "./tests/multiprocess/wnohang_test.c", + "wnohang_test", + true, + false, + ); + let helper_target = + common::compile_static_pie("./tests/multiprocess/exit_with.c", "exit_with_wnohang"); + + let mut runner = 
Runner::new(&main_target, "wnohang_test"); + runner.with_fs_path(|out_dir| { + let guest_helper = out_dir.join("out/exit_with"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite exit_with helper"); + }); + runner.arg("/out/exit_with"); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("wnohang_test: OK"), + "wnohang_test failed, output: {output_str}" + ); +} + +#[test] +fn test_exec_path_lookup() { + let main_target = common::compile( + "./tests/multiprocess/path_exec_test.c", + "path_exec_test", + true, + false, + ); + let helper_target = + common::compile_static_pie("./tests/multiprocess/exit_with.c", "exit_with_path"); + + let mut runner = Runner::new(&main_target, "path_exec_test"); + runner.with_fs_path(|out_dir| { + let guest_helper = out_dir.join("out/exit_with"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite exit_with helper"); + }); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("path_exec_test: OK"), + "path_exec_test failed, output: {output_str}" + ); +} diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index d35c11020..0b4dcc795 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1889,15 +1889,15 @@ impl Task { copy_vector(envp, "envp")? }; - let (path, argv_vec) = self.resolve_shebang(alloc::string::String::from(path), argv_vec)?; - // PATH resolution: if path doesn't contain '/', search $PATH. let path = if !path.contains('/') { - self.resolve_path_lookup(&path, &envp_vec)? + self.resolve_path_lookup(path, &envp_vec)? 
} else { - path + alloc::string::String::from(path) }; + let (path, argv_vec) = self.resolve_shebang(path, argv_vec)?; + let loader = crate::loader::elf::ElfLoader::new(self, &path)?; // After this point, the old program is torn down and failures must terminate the process. From 711012247a08eb8a8f6d242ec92b3eae067fdec6 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 11:31:58 -0700 Subject: [PATCH 11/23] Fix review issues: fork bomb protection, signal mailbox cap, error handling hardening - Add process count limit (128) to prevent fork bombs - Cap signal mailbox at 256 entries (drop oldest on overflow) - Return ECHILD for process group waits with no matching children - detach_to_new_address_space returns Result instead of panicking on VA exhaustion - PID overflow returns CreateProcessError instead of panicking - exit_process is idempotent (no assert on double-exit) - Fork failure cleanup: remove zombie registry entry after spawn failure - Store address_space_id on ProcessState for future VA partition reclamation - Defer VA partition reclamation (128 partitions sufficient for minimal version) --- litebox/src/process/mod.rs | 53 +++++++++++++++++----- litebox_shim_linux/src/lib.rs | 16 ++++++- litebox_shim_linux/src/syscalls/process.rs | 26 +++++++++-- 3 files changed, 78 insertions(+), 17 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 29d80552d..63c714e5c 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -85,6 +85,10 @@ pub enum CreateProcessError { NoSuchParent, #[error("a root (init) process already exists")] InitAlreadyExists, + #[error("too many processes (limit: {0})")] + TooManyProcesses(usize), + #[error("PID space exhausted")] + PidSpaceExhausted, } /// Shared handle for observing a process's exit. @@ -170,6 +174,8 @@ struct RegistryInner { /// Counter incremented on every process exit. Used with `exit_event` futex /// so that `wait_for_any_child_exit` can block efficiently. 
exit_epoch: u32, + /// Maximum number of processes allowed. 0 means unlimited. + max_processes: usize, } #[allow( @@ -181,11 +187,18 @@ struct RegistryInner { impl ProcessRegistry { /// Create a new, empty process registry. pub fn new() -> Self { + Self::with_max_processes(0) + } + + /// Create a new process registry with a maximum process count. + /// Pass 0 for unlimited. + pub fn with_max_processes(max_processes: usize) -> Self { Self { inner: Mutex::new(RegistryInner { processes: HashMap::new(), next_pid: 1, exit_epoch: 0, + max_processes, }), exit_event: ::RawMutex::INIT, } @@ -217,8 +230,14 @@ impl ProcessRegistry { if !inner.processes.contains_key(&parent_pid) { return Err(CreateProcessError::NoSuchParent); } + // Enforce process count limit + if inner.max_processes > 0 && inner.processes.len() >= inner.max_processes { + return Err(CreateProcessError::TooManyProcesses(inner.max_processes)); + } let raw = inner.next_pid; - inner.next_pid = raw.checked_add(1).expect("PID space exhausted"); + inner.next_pid = raw + .checked_add(1) + .ok_or(CreateProcessError::PidSpaceExhausted)?; let pid = ProcessId(raw); // Register as child of parent inner @@ -312,10 +331,10 @@ impl ProcessRegistry { .processes .get_mut(&id) .expect("exit_process: no such process"); - assert!( - matches!(entry.context.state, ProcessState::Running), - "exit_process: process already exited" - ); + // Idempotent: if already exited, return None without re-notifying. + if matches!(entry.context.state, ProcessState::Exited(_)) { + return None; + } entry.context.state = ProcessState::Exited(status); entry.exited_flag.store(true, Ordering::Release); exit_observer = Arc::clone(&entry.exit_observer); @@ -540,31 +559,43 @@ impl ProcessRegistry { 0 => { // Wait for any child in the caller's process group. 
let caller_pgid = parent_entry.context.pgid; + let mut any_match = false; let mut result = None; for &child_pid in &children { if let Some(entry) = inner.processes.get(&child_pid) && entry.context.pgid == caller_pgid - && let ProcessState::Exited(status) = entry.context.state { - result = Some((child_pid, status)); - break; + any_match = true; + if let ProcessState::Exited(status) = entry.context.state { + result = Some((child_pid, status)); + break; + } } } + if !any_match { + return Err(()); // ECHILD — no children in this group + } result } t if t < -1 => { // Wait for any child in process group |t|. let pgid = ProcessId((-t).cast_unsigned()); + let mut any_match = false; let mut result = None; for &child_pid in &children { if let Some(entry) = inner.processes.get(&child_pid) && entry.context.pgid == pgid - && let ProcessState::Exited(status) = entry.context.state { - result = Some((child_pid, status)); - break; + any_match = true; + if let ProcessState::Exited(status) = entry.context.state { + result = Some((child_pid, status)); + break; + } } } + if !any_match { + return Err(()); // ECHILD — no children in this group + } result } _ => return Err(()), diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index 68f4f4918..c47b90264 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -191,7 +191,7 @@ impl LinuxShimBuilder { pub fn build(self) -> LinuxShim { let mut net = Network::new(&self.litebox); net.set_platform_interaction(litebox::net::PlatformInteraction::Manual); - let process_registry = litebox::process::ProcessRegistry::new(); + let process_registry = litebox::process::ProcessRegistry::with_max_processes(128); // Register the init process (PID 1). 
process_registry .create_process(None) @@ -212,6 +212,7 @@ impl LinuxShimBuilder { }); let init_process = Arc::new(ProcessState { pm: PageManager::new(&global.litebox), + address_space_id: None, }); LinuxShim(global, init_process) } @@ -1107,7 +1108,13 @@ impl GlobalState { ) -> bool { let mailboxes = self.signal_mailboxes.lock(); if let Some(mailbox) = mailboxes.get(&target_pid) { - mailbox.lock().push_back((signal, siginfo)); + let mut mbox = mailbox.lock(); + // Cap mailbox size to prevent unbounded memory growth. + const MAX_MAILBOX_SIZE: usize = 256; + if mbox.len() >= MAX_MAILBOX_SIZE { + mbox.pop_front(); + } + mbox.push_back((signal, siginfo)); true } else { false @@ -1119,6 +1126,10 @@ impl GlobalState { struct ProcessState { /// The page manager for this process's virtual memory / address space. pm: litebox::mm::PageManager, + /// Address space ID for child processes that have exec'd into their own + /// VA partition. `None` for the init process (which uses the default + /// platform address space) and for vfork children that haven't exec'd yet. + address_space_id: Option<::AddressSpaceId>, } struct Task { @@ -1183,6 +1194,7 @@ mod test_utils { thread: syscalls::process::ThreadState::new_process(pid), process: RefCell::new(Arc::new(ProcessState { pm: PageManager::new(&self.litebox), + address_space_id: None, })), pid, ppid: 0, diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 0b4dcc795..dae0ea836 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -635,6 +635,15 @@ impl Task { self.global.deregister_signal_mailbox(self.pid); } + // NOTE: VA partition reclamation is deferred for future work. + // We cannot release the partition here because: + // 1. The thread is still executing (unwinding through host code) + // 2. Guest pages are still mapped in the partition's VA range + // 3. 
Releasing the ID would allow another fork to reuse the same range + // With 128 partitions, this is acceptable for the minimal implementation. + // Future: move cleanup to zombie reaping (waitpid/remove_process) and + // unmap all guest pages before releasing the partition ID. + // If this is a vfork child that never exec'd, signal the parent. // Done after exit recording so parent's waitpid sees the exit. if let Some(fc) = self.fork_context.borrow_mut().take() { @@ -896,6 +905,13 @@ impl Task { }; if let Err(err) = r { litebox_util_log::error!(err:% = err; "failed to spawn fork child"); + // The child_task was dropped by spawn_thread's failure path, which + // triggered prepare_for_exit (closing FDs, recording exit, deregistering + // mailbox). Clean up the zombie registry entry so the parent doesn't + // see a phantom child from a failed fork. + self.global + .process_registry + .remove_process(child_process_id); return Err(Errno::ENOMEM); } @@ -1814,22 +1830,24 @@ impl Task { /// Called during exec of a vfork child. Creates a new address space via the /// platform, builds a new `ProcessState` with a `PageManager` scoped to that /// partition's VA range, and replaces `self.process`. - fn detach_to_new_address_space(&self) { + fn detach_to_new_address_space(&self) -> Result<(), Errno> { use litebox::platform::AddressSpaceProvider; let platform = self.global.platform; let as_id = platform .create_address_space() - .expect("failed to create address space for fork child"); + .map_err(|_| Errno::ENOMEM)?; let range = platform .address_space_range(as_id) - .expect("failed to get address space range"); + .map_err(|_| Errno::ENOMEM)?; let new_process = Arc::new(crate::ProcessState { pm: litebox::mm::PageManager::new_with_range(&self.global.litebox, range), + address_space_id: Some(as_id), }); *self.process.borrow_mut() = new_process; + Ok(()) } /// Handle syscall `execve`. 
@@ -1946,7 +1964,7 @@ impl Task { .take() .map(|fc| fc.vfork_done); if vfork_done.is_some() { - self.detach_to_new_address_space(); + self.detach_to_new_address_space()?; } // Don't release reserved mappings. From 8ac8ef57a4910689c03b1f311603bd3ea1bdff9a Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 11:44:14 -0700 Subject: [PATCH 12/23] Fix review round 2: wait4 race, PID/TID collision, error handling - Fix CRITICAL wait4 futex race: snapshot exit epoch BEFORE try_wait to prevent missed wakeups when child exits between check and block - Fix PID/TID namespace collision: advance_next_pid after thread creation, saturating_add for child_pid+1 overflow - Remove duplicate unused ProcessRegistry from LiteBox core - Remove dead exit_epoch field from RegistryInner - Fix siginfo_chld: use (wait_status & 0x7f)==0 instead of fragile trailing_zeros heuristic for normal exit detection - Replace unwrap/expect with Result propagation in fork punchthrough - Fix try_wait doc comment to match actual return type semantics --- litebox/src/litebox.rs | 9 ---- litebox/src/process/mod.rs | 43 +++++++++++++------ litebox_shim_linux/src/syscalls/process.rs | 27 ++++++++---- litebox_shim_linux/src/syscalls/signal/mod.rs | 5 ++- 4 files changed, 50 insertions(+), 34 deletions(-) diff --git a/litebox/src/litebox.rs b/litebox/src/litebox.rs index 0de125ba7..2fb209c22 100644 --- a/litebox/src/litebox.rs +++ b/litebox/src/litebox.rs @@ -7,7 +7,6 @@ use alloc::sync::Arc; use crate::{ fd::Descriptors, - process::ProcessRegistry, sync::{RawSyncPrimitivesProvider, RwLock}, }; @@ -66,7 +65,6 @@ impl LiteBox { crate::sync::lock_tracing::LockTracker::init(platform); let descriptors = RwLock::new(Descriptors::new_from_litebox_creation()); - let process_registry = ProcessRegistry::new(); litebox_util_log::trace!("LiteBox instance initialized"); @@ -74,7 +72,6 @@ impl LiteBox { x: Arc::new(LiteBoxX { platform, descriptors, - process_registry, }), } } @@ -109,16 +106,10 @@ impl LiteBox { 
) -> impl core::ops::DerefMut> + use<'_, Platform> { self.x.descriptors.write() } - - /// Access the process registry. - pub fn process_registry(&self) -> &ProcessRegistry { - &self.x.process_registry - } } /// The actual body of [`LiteBox`], containing any components that might be shared. pub(crate) struct LiteBoxX { pub(crate) platform: &'static Platform, descriptors: RwLock>, - process_registry: ProcessRegistry, } diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 63c714e5c..32c1c09fc 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -163,17 +163,14 @@ struct ProcessEntry { /// `ProcessRegistry` is parameterized on a platform type for mutex support. pub struct ProcessRegistry { inner: Mutex>, - /// Futex-like primitive: value is `exit_epoch`. Woken on every child exit - /// so that blocking `wait_for_any_child_exit` can unblock. + /// Futex-like primitive: incremented on every child exit so that + /// blocking `wait_for_child_exit_since` can unblock. exit_event: ::RawMutex, } struct RegistryInner { processes: HashMap>, next_pid: u32, - /// Counter incremented on every process exit. Used with `exit_event` futex - /// so that `wait_for_any_child_exit` can block efficiently. - exit_epoch: u32, /// Maximum number of processes allowed. 0 means unlimited. max_processes: usize, } @@ -197,7 +194,6 @@ impl ProcessRegistry { inner: Mutex::new(RegistryInner { processes: HashMap::new(), next_pid: 1, - exit_epoch: 0, max_processes, }), exit_event: ::RawMutex::INIT, @@ -282,6 +278,15 @@ impl ProcessRegistry { Ok(pid) } + /// Ensure the next PID will be at least `min_pid`. + /// Used to keep PIDs and TIDs in disjoint ranges when they share a namespace. + pub fn advance_next_pid(&self, min_pid: u32) { + let mut inner = self.inner.lock(); + if inner.next_pid < min_pid { + inner.next_pid = min_pid; + } + } + /// Remove a process that was created but never started (e.g., child setup /// failed after PID allocation). 
/// @@ -354,9 +359,6 @@ impl ProcessRegistry { None } }); - - // Bump the exit epoch so blocking waiters unblock. - inner.exit_epoch = inner.exit_epoch.wrapping_add(1); } // Wake any threads blocked in wait_for_any_child_exit. self.exit_event @@ -507,12 +509,13 @@ impl ProcessRegistry { /// `target` selects which children to consider: /// - `> 0`: only the child with that specific PID /// - `-1`: any child - /// - `0` / other negative: not yet supported (returns `None`) + /// - `0`: any child in the caller's process group + /// - `< -1`: any child in process group `|target|` /// /// If a matching exited child is found, it is reaped (removed from the - /// registry) and `Some((child_pid, exit_status))` is returned. - /// Returns `None` if no matching exited child exists. - /// Returns `Some(Err(()))` if the parent has no children matching `target` + /// registry) and `Ok(Some((child_pid, exit_status)))` is returned. + /// Returns `Ok(None)` if matching children exist but none have exited yet. + /// Returns `Err(())` if the parent has no children matching `target` /// (i.e., ECHILD condition). pub fn try_wait(&self, parent: ProcessId, target: i32) -> Result, ()> { let mut inner = self.inner.lock(); @@ -614,11 +617,23 @@ impl ProcessRegistry { Ok(found) } + /// Snapshot the current exit epoch. Used with `wait_for_child_exit_since` + /// to implement the standard futex pattern: snapshot, check, block-on-snapshot. + pub fn exit_epoch(&self) -> u32 { + self.exit_event.underlying_atomic().load(Ordering::Acquire) + } + + /// Block until a child exit occurs after the given epoch snapshot. + /// The caller should call `exit_epoch()` BEFORE `try_wait()`, then + /// pass the snapshot here if `try_wait` returned `Ok(None)`. + pub fn wait_for_child_exit_since(&self, epoch: u32) { + let _ = self.exit_event.block(epoch); + } + /// Block until any child exit occurs (or return immediately if one has /// happened since the last call). Used by blocking wait4. 
pub fn wait_for_any_child_exit(&self) { let epoch = self.exit_event.underlying_atomic().load(Ordering::Acquire); - // Block until the epoch changes (i.e., a new exit has been recorded). let _ = self.exit_event.block(epoch); } diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index dae0ea836..6ff670e8c 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -678,6 +678,10 @@ impl Task { // Pick up any cross-process signals (e.g., SIGCHLD from exiting children). self.drain_cross_process_signals(); + // Snapshot the exit epoch BEFORE try_wait to avoid a race where a + // child exits between try_wait and the blocking call. + let epoch = self.global.process_registry.exit_epoch(); + match self.global.process_registry.try_wait(parent_pid, pid) { Err(()) => { // No matching children at all — ECHILD. @@ -695,11 +699,11 @@ impl Task { if options & WNOHANG != 0 { return Ok(0); } - // Block: sleep briefly and retry. This is a simple poll loop. - // A proper implementation would use ExitSubject observers, - // but for the minimal multi-process support this suffices. - // Block until some child exits. - self.global.process_registry.wait_for_any_child_exit(); + // Block until some child exits (using epoch snapshot from + // before try_wait to avoid missed wakeups). + self.global + .process_registry + .wait_for_child_exit_since(epoch); } } } @@ -843,11 +847,12 @@ impl Task { .map_err(|_| Errno::EAGAIN)?; let child_pid = child_process_id.as_u32().cast_signed(); - // Advance the thread ID counter past the child PID to avoid collisions. + // Advance the thread ID counter past the child PID to avoid collisions + // between PIDs and TIDs. Use saturating_add to prevent overflow. 
let _ = self .global .next_thread_id - .fetch_max(child_pid + 1, Ordering::Relaxed); + .fetch_max(child_pid.saturating_add(1), Ordering::Relaxed); // Clone the FD table for the child, incrementing fork_refcounts in the global descriptor table. let child_files = { @@ -863,8 +868,8 @@ impl Task { .global .platform .get_punchthrough_token_for(punchthrough) - .expect("Failed to get punchthrough token for GET_FS"); - token.execute().unwrap() + .ok_or(Errno::ENOSYS)?; + token.execute().map_err(|_| Errno::EFAULT)? }; // Create the vfork synchronization. @@ -1033,6 +1038,10 @@ impl Task { }; let child_tid = self.global.next_thread_id.fetch_add(1, Ordering::Relaxed); + // Keep PID counter in sync with TID counter to avoid namespace collisions. + self.global + .process_registry + .advance_next_pid(child_tid.saturating_add(1).cast_unsigned()); if let Some(parent_tid_ptr) = set_parent_tid { let _ = parent_tid_ptr.write_at_offset(0, child_tid); } diff --git a/litebox_shim_linux/src/syscalls/signal/mod.rs b/litebox_shim_linux/src/syscalls/signal/mod.rs index b9d3f6558..4dc806a98 100644 --- a/litebox_shim_linux/src/syscalls/signal/mod.rs +++ b/litebox_shim_linux/src/syscalls/signal/mod.rs @@ -339,8 +339,9 @@ pub(crate) fn siginfo_chld(child_pid: i32, wait_status: u32) -> Siginfo { const CLD_EXITED: i32 = 1; const CLD_KILLED: i32 = 2; - // Decode wait_status to determine si_code and si_status - let (code, si_status) = if wait_status.trailing_zeros() >= 7 { + // Decode wait_status: bits 6..0 == 0 means normal exit (status in bits 15..8), + // otherwise killed by signal (signal number in bits 6..0). 
+ let (code, si_status) = if (wait_status & 0x7f) == 0 { // Normal exit: status is in bits 15..8 (CLD_EXITED, (wait_status >> 8) & 0xff) } else { From 6acd1a624316017055d34fa4ad7e441b5990037e Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 12:01:19 -0700 Subject: [PATCH 13/23] Fix review round 3: vfork_done on detach failure, double cloexec, lock ordering - Fix HIGH: signal vfork_done before returning error when detach_to_new_address_space fails during exec, preventing parent hang - Fix MEDIUM: remove redundant close-on-exec pass in sys_execve (close_on_exec() already handles it; second pass could double-decrement fork_refcount) - Fix MEDIUM: add debug_assert that reaped zombie has no children in try_wait (children should have been reparented during exit_process) - Fix MEDIUM: use unsigned_abs() instead of (-t).cast_unsigned() in try_wait to avoid i32::MIN overflow - Fix MEDIUM: clone Arc from signal_mailboxes map and drop outer lock before acquiring per-mailbox lock to prevent nested lock acquisition - Fix LOW: simplify redundant match arm in try_wait (t > 0 case) --- litebox/src/process/mod.rs | 24 ++++++++++--------- litebox_shim_linux/src/lib.rs | 9 ++++++-- litebox_shim_linux/src/syscalls/process.rs | 27 +++++++--------------- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 32c1c09fc..2400320a9 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -534,14 +534,9 @@ impl ProcessRegistry { if !children.contains(&target_pid) { return Err(()); // ECHILD — not our child } - let entry = inner.processes.get(&target_pid); - match entry { - Some(e) if matches!(e.context.state, ProcessState::Exited(_)) => { - if let ProcessState::Exited(status) = e.context.state { - Some((target_pid, status)) - } else { - None - } + match inner.processes.get(&target_pid) { + Some(e) if let ProcessState::Exited(status) = e.context.state => { + Some((target_pid, status)) } 
_ => None, } @@ -582,7 +577,7 @@ impl ProcessRegistry { } t if t < -1 => { // Wait for any child in process group |t|. - let pgid = ProcessId((-t).cast_unsigned()); + let pgid = ProcessId(t.unsigned_abs()); let mut any_match = false; let mut result = None; for &child_pid in &children { @@ -606,8 +601,15 @@ impl ProcessRegistry { // Reap the child if found if let Some((child_pid, _)) = found { - // Remove child from registry - let _entry = inner.processes.remove(&child_pid); + // Remove child from registry. Its children should have been + // reparented during exit_process. + let entry = inner.processes.remove(&child_pid); + debug_assert!( + entry + .as_ref() + .map_or(true, |e| e.context.children.is_empty()), + "reaped zombie still has children" + ); // Remove from parent's children list if let Some(parent) = inner.processes.get_mut(&parent) { parent.context.children.retain(|&c| c != child_pid); diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index c47b90264..b3bc11055 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -1106,8 +1106,13 @@ impl GlobalState { signal: litebox_common_linux::signal::Signal, siginfo: litebox_common_linux::signal::Siginfo, ) -> bool { - let mailboxes = self.signal_mailboxes.lock(); - if let Some(mailbox) = mailboxes.get(&target_pid) { + // Clone the Arc and drop the outer lock before acquiring the mailbox + // lock to prevent nested lock acquisition (deadlock risk). + let mailbox = { + let mailboxes = self.signal_mailboxes.lock(); + mailboxes.get(&target_pid).cloned() + }; + if let Some(mailbox) = mailbox { let mut mbox = mailbox.lock(); // Cap mailbox size to prevent unbounded memory growth. 
const MAX_MAILBOX_SIZE: usize = 256; diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 6ff670e8c..5ff0b9cd8 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1947,24 +1947,6 @@ impl Task { self.signals.reset_for_exec(); - // Close FDs marked with FD_CLOEXEC (POSIX requirement). - { - let files = self.files.borrow(); - let rds = files.raw_descriptor_store.read(); - let dt = self.global.litebox.descriptor_table(); - let cloexec_fds = rds - .raw_fds_matching_metadata::<_, litebox_common_linux::FileDescriptorFlags>( - &dt, - |flags| flags.contains(litebox_common_linux::FileDescriptorFlags::FD_CLOEXEC), - ); - drop(dt); - drop(rds); - drop(files); - for raw_fd in cloexec_fds { - let _ = self.do_close(raw_fd); - } - } - // If this is a vfork child, detach to a new address space before // releasing memory (so we don't destroy the parent's mappings). let vfork_done = self @@ -1973,7 +1955,14 @@ impl Task { .take() .map(|fc| fc.vfork_done); if vfork_done.is_some() { - self.detach_to_new_address_space()?; + if let Err(e) = self.detach_to_new_address_space() { + // Signal the parent before returning error — otherwise parent + // hangs forever waiting on vfork_done. + if let Some(vd) = vfork_done { + vd.signal(); + } + return Err(e); + } } // Don't release reserved mappings. From f28ad450fc811b8dae57038bbdfc1e95fce214ce Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 12:20:15 -0700 Subject: [PATCH 14/23] fix: resolve all clippy errors for CI (-Dwarnings) Fix clippy lints across litebox core, shim, and runner crates: - similar_names: allow on create_process and sys_setpgid - question_mark: use ? 
operator in reparent - match_wildcard_for_single_variants: explicit ProcessState::Running - collapsible_if: use let-chains (edition 2024) - unnecessary_map_or: use is_none_or - verbose_bit_mask: use trailing_zeros - items_after_statements: move const before let bindings - unnecessary boolean not: invert if/else branches - dead_code: allow on compile_static_pie (unused in loader test) --- litebox/src/process/mod.rs | 19 +++++++-------- .../tests/common/mod.rs | 1 + litebox_shim_linux/src/lib.rs | 3 ++- litebox_shim_linux/src/syscalls/net.rs | 16 ++++++------- litebox_shim_linux/src/syscalls/process.rs | 23 +++++++++---------- litebox_shim_linux/src/syscalls/signal/mod.rs | 2 +- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 2400320a9..229b117d4 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -204,6 +204,7 @@ impl ProcessRegistry { /// /// `parent` is the parent process ID. Pass `None` to create the init /// process (PID 1). Only one init process is allowed. + #[allow(clippy::similar_names)] pub fn create_process( &self, parent: Option, @@ -385,22 +386,20 @@ impl ProcessRegistry { // Read child info first. let (old_parent, exit_status) = { - let Some(entry) = inner.processes.get_mut(&child) else { - return None; - }; + let entry = inner.processes.get_mut(&child)?; let old_parent = entry.context.parent.replace(new_parent); let exit_status = match entry.context.state { ProcessState::Exited(status) => Some(status), - _ => None, + ProcessState::Running => None, }; (old_parent, exit_status) }; // Remove from old parent's children list. 
- if let Some(old_pid) = old_parent { - if let Some(old_entry) = inner.processes.get_mut(&old_pid) { - old_entry.context.children.retain(|&c| c != child); - } + if let Some(old_pid) = old_parent + && let Some(old_entry) = inner.processes.get_mut(&old_pid) + { + old_entry.context.children.retain(|&c| c != child); } // Add to new parent's children list. @@ -605,9 +604,7 @@ impl ProcessRegistry { // reparented during exit_process. let entry = inner.processes.remove(&child_pid); debug_assert!( - entry - .as_ref() - .map_or(true, |e| e.context.children.is_empty()), + entry.as_ref().is_none_or(|e| e.context.children.is_empty()), "reaped zombie still has children" ); // Remove from parent's children list diff --git a/litebox_runner_linux_userland/tests/common/mod.rs b/litebox_runner_linux_userland/tests/common/mod.rs index 20e47be48..5c2582127 100644 --- a/litebox_runner_linux_userland/tests/common/mod.rs +++ b/litebox_runner_linux_userland/tests/common/mod.rs @@ -101,6 +101,7 @@ pub fn compile(src_path: &str, unique_name: &str, exec_or_lib: bool, nolibc: boo compile_inner(src_path, unique_name, exec_or_lib, nolibc, false) } +#[allow(dead_code)] pub fn compile_static_pie(src_path: &str, unique_name: &str) -> PathBuf { compile_inner(src_path, unique_name, true, false, true) } diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index b3bc11055..7718fed77 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -1106,6 +1106,7 @@ impl GlobalState { signal: litebox_common_linux::signal::Signal, siginfo: litebox_common_linux::signal::Siginfo, ) -> bool { + const MAX_MAILBOX_SIZE: usize = 256; // Clone the Arc and drop the outer lock before acquiring the mailbox // lock to prevent nested lock acquisition (deadlock risk). let mailbox = { @@ -1115,7 +1116,6 @@ impl GlobalState { if let Some(mailbox) = mailbox { let mut mbox = mailbox.lock(); // Cap mailbox size to prevent unbounded memory growth. 
- const MAX_MAILBOX_SIZE: usize = 256; if mbox.len() >= MAX_MAILBOX_SIZE { mbox.pop_front(); } @@ -1134,6 +1134,7 @@ struct ProcessState { /// Address space ID for child processes that have exec'd into their own /// VA partition. `None` for the init process (which uses the default /// platform address space) and for vfork children that haven't exec'd yet. + #[allow(dead_code)] address_space_id: Option<::AddressSpaceId>, } diff --git a/litebox_shim_linux/src/syscalls/net.rs b/litebox_shim_linux/src/syscalls/net.rs index 2ec296c5e..9bfcc2c59 100644 --- a/litebox_shim_linux/src/syscalls/net.rs +++ b/litebox_shim_linux/src/syscalls/net.rs @@ -1355,10 +1355,10 @@ impl Task { file.sendto(self, buf, flags, addr) }, ); - if let Err(Errno::EPIPE) = ret { - if !flags.contains(SendFlags::NOSIGNAL) { - self.raise_sigpipe(); - } + if let Err(Errno::EPIPE) = ret + && !flags.contains(SendFlags::NOSIGNAL) + { + self.raise_sigpipe(); } ret } @@ -1443,10 +1443,10 @@ impl Task { Ok(total_sent) }, ); - if let Err(Errno::EPIPE) = ret { - if !flags.contains(SendFlags::NOSIGNAL) { - self.raise_sigpipe(); - } + if let Err(Errno::EPIPE) = ret + && !flags.contains(SendFlags::NOSIGNAL) + { + self.raise_sigpipe(); } ret } diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 5ff0b9cd8..0d762b8b0 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1540,6 +1540,7 @@ impl Task { } /// Handle syscall `setpgid`. + #[allow(clippy::similar_names)] pub(crate) fn sys_setpgid(&self, pid: i32, pgid: i32) -> Result<(), Errno> { let target_pid = if pid == 0 { self.pid } else { pid }; let target_pgid = if pgid == 0 { target_pid } else { pgid }; @@ -1917,10 +1918,10 @@ impl Task { }; // PATH resolution: if path doesn't contain '/', search $PATH. - let path = if !path.contains('/') { - self.resolve_path_lookup(path, &envp_vec)? 
- } else { + let path = if path.contains('/') { alloc::string::String::from(path) + } else { + self.resolve_path_lookup(path, &envp_vec)? }; let (path, argv_vec) = self.resolve_shebang(path, argv_vec)?; @@ -1954,15 +1955,13 @@ impl Task { .borrow_mut() .take() .map(|fc| fc.vfork_done); - if vfork_done.is_some() { - if let Err(e) = self.detach_to_new_address_space() { - // Signal the parent before returning error — otherwise parent - // hangs forever waiting on vfork_done. - if let Some(vd) = vfork_done { - vd.signal(); - } - return Err(e); - } + if let Some(ref vd) = vfork_done + && let Err(e) = self.detach_to_new_address_space() + { + // Signal the parent before returning error — otherwise parent + // hangs forever waiting on vfork_done. + vd.signal(); + return Err(e); } // Don't release reserved mappings. diff --git a/litebox_shim_linux/src/syscalls/signal/mod.rs b/litebox_shim_linux/src/syscalls/signal/mod.rs index 4dc806a98..f78d97357 100644 --- a/litebox_shim_linux/src/syscalls/signal/mod.rs +++ b/litebox_shim_linux/src/syscalls/signal/mod.rs @@ -341,7 +341,7 @@ pub(crate) fn siginfo_chld(child_pid: i32, wait_status: u32) -> Siginfo { // Decode wait_status: bits 6..0 == 0 means normal exit (status in bits 15..8), // otherwise killed by signal (signal number in bits 6..0). 
- let (code, si_status) = if (wait_status & 0x7f) == 0 { + let (code, si_status) = if wait_status.trailing_zeros() >= 7 { // Normal exit: status is in bits 15..8 (CLD_EXITED, (wait_status >> 8) & 0xff) } else { From 5ac9dd4652117409e9f6fe31e8240cf6d1ba353f Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 12:23:43 -0700 Subject: [PATCH 15/23] =?UTF-8?q?fix:=20CI=20failures=20=E2=80=94=20fmt,?= =?UTF-8?q?=20if-let=20match=20guard,=20Windows=20AddressSpaceProvider?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix cargo fmt formatting in process.rs - Replace unstable if-let match guard with nested match (E0658 on SNP/LVBS) - Add stub AddressSpaceProvider impl for WindowsUserland platform --- litebox/src/process/mod.rs | 9 +++++---- litebox_platform_windows_userland/src/lib.rs | 6 ++++++ litebox_shim_linux/src/syscalls/process.rs | 4 +--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 229b117d4..40992b865 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -534,10 +534,11 @@ impl ProcessRegistry { return Err(()); // ECHILD — not our child } match inner.processes.get(&target_pid) { - Some(e) if let ProcessState::Exited(status) = e.context.state => { - Some((target_pid, status)) - } - _ => None, + Some(e) => match e.context.state { + ProcessState::Exited(status) => Some((target_pid, status)), + ProcessState::Running => None, + }, + None => None, } } -1 => { diff --git a/litebox_platform_windows_userland/src/lib.rs b/litebox_platform_windows_userland/src/lib.rs index 396282544..4427e7445 100644 --- a/litebox_platform_windows_userland/src/lib.rs +++ b/litebox_platform_windows_userland/src/lib.rs @@ -333,6 +333,12 @@ impl WindowsUserland { impl litebox::platform::Provider for WindowsUserland {} +impl litebox::platform::AddressSpaceProvider for WindowsUserland { + type AddressSpaceId = u32; + const 
ADDRESS_SPACE_KIND: litebox::platform::AddressSpaceKind = + litebox::platform::AddressSpaceKind::SharedMemory; +} + impl litebox::platform::SignalProvider for WindowsUserland { type Signal = litebox_common_linux::signal::Signal; diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 0d762b8b0..5dbb9b42a 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -1844,9 +1844,7 @@ impl Task { use litebox::platform::AddressSpaceProvider; let platform = self.global.platform; - let as_id = platform - .create_address_space() - .map_err(|_| Errno::ENOMEM)?; + let as_id = platform.create_address_space().map_err(|_| Errno::ENOMEM)?; let range = platform .address_space_range(as_id) .map_err(|_| Errno::ENOMEM)?; From c525826f10870879ae6c8bd40b17f524bed871cd Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 12:34:17 -0700 Subject: [PATCH 16/23] =?UTF-8?q?fix:=20CI=20failures=20=E2=80=94=20Window?= =?UTF-8?q?s=20AddressSpaceKind=20path,=20let=5Fneedless=5Freturn,=20test?= =?UTF-8?q?=20PID=20mismatch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use full path for AddressSpaceKind in Windows platform impl - Remove needless let bindings in net.rs and unix.rs (clippy::let_and_return) - Fix test_syscall_rewriter: override PID to 1 to match process registry - Fix cargo fmt formatting --- litebox_platform_windows_userland/src/lib.rs | 4 ++-- litebox_runner_linux_userland/tests/loader.rs | 6 +++++- litebox_shim_linux/src/syscalls/net.rs | 7 +++---- litebox_shim_linux/src/syscalls/unix.rs | 7 +++---- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/litebox_platform_windows_userland/src/lib.rs b/litebox_platform_windows_userland/src/lib.rs index 4427e7445..85ac7a56b 100644 --- a/litebox_platform_windows_userland/src/lib.rs +++ b/litebox_platform_windows_userland/src/lib.rs @@ -335,8 +335,8 @@ impl 
litebox::platform::Provider for WindowsUserland {} impl litebox::platform::AddressSpaceProvider for WindowsUserland { type AddressSpaceId = u32; - const ADDRESS_SPACE_KIND: litebox::platform::AddressSpaceKind = - litebox::platform::AddressSpaceKind::SharedMemory; + const ADDRESS_SPACE_KIND: litebox::platform::address_space::AddressSpaceKind = + litebox::platform::address_space::AddressSpaceKind::SharedMemory; } impl litebox::platform::SignalProvider for WindowsUserland { diff --git a/litebox_runner_linux_userland/tests/loader.rs b/litebox_runner_linux_userland/tests/loader.rs index c96168f50..e7d929137 100644 --- a/litebox_runner_linux_userland/tests/loader.rs +++ b/litebox_runner_linux_userland/tests/loader.rs @@ -100,8 +100,12 @@ impl TestLauncher { ]; let fs = std::sync::Arc::new(self.fs); let shim = self.shim_builder.build(); + let mut task = self.platform.init_task(); + // Use deterministic guest PID 1 (init) to match process registry. + task.pid = 1; + task.ppid = 0; let program = shim - .load_program(fs, self.platform.init_task(), executable_path, argv, envp) + .load_program(fs, task, executable_path, argv, envp) .unwrap(); unsafe { litebox_platform_linux_userland::run_thread( diff --git a/litebox_shim_linux/src/syscalls/net.rs b/litebox_shim_linux/src/syscalls/net.rs index 9bfcc2c59..63af1b111 100644 --- a/litebox_shim_linux/src/syscalls/net.rs +++ b/litebox_shim_linux/src/syscalls/net.rs @@ -751,8 +751,8 @@ impl GlobalState { let is_nonblock = self.get_status(fd).contains(OFlags::NONBLOCK) || flags.contains(SendFlags::DONTWAIT); - let ret = cx - .with_timeout(timeout) + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) + cx.with_timeout(timeout) .wait_on_events( is_nonblock, Events::OUT, @@ -766,9 +766,8 @@ impl GlobalState { Err(e) => Err(TryOpError::Other(Errno::from(e))), }, ) - .map_err(Errno::from); + .map_err(Errno::from) // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) - ret } /// Receive data via socket channel 
(lock-free path). diff --git a/litebox_shim_linux/src/syscalls/unix.rs b/litebox_shim_linux/src/syscalls/unix.rs index 23411ac30..522783892 100644 --- a/litebox_shim_linux/src/syscalls/unix.rs +++ b/litebox_shim_linux/src/syscalls/unix.rs @@ -1241,16 +1241,15 @@ impl UnixSocket { let is_nonblocking = flags.contains(SendFlags::DONTWAIT) || self.get_status().contains(OFlags::NONBLOCK); let timeout = self.options.lock().send_timeout; - let ret = match &self.inner { + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) + match &self.inner { UnixSocketInner::Stream(stream) => { stream.sendto(&task.wait_cx(), timeout, buf, is_nonblocking, addr) } UnixSocketInner::Datagram(datagram) => { datagram.sendto(task, timeout, buf, is_nonblocking, addr) } - }; - // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) - ret + } } pub(super) fn recvfrom( From 83a32d285176d056c0eec0a4816d5ca871bd7fda Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Sun, 26 Apr 2026 12:41:47 -0700 Subject: [PATCH 17/23] fix: make exit_process graceful when PID not in registry, fix Windows test PID - exit_process returns None instead of panicking when process not found - Override PID to 1 in Windows runner test helper (matches process registry) - Fixes test_stdio, test_syscall_rewriter, and Windows loader test panics --- litebox/src/process/mod.rs | 5 +---- .../tests/common/mod.rs | 6 +++++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs index 40992b865..1615e0155 100644 --- a/litebox/src/process/mod.rs +++ b/litebox/src/process/mod.rs @@ -333,10 +333,7 @@ impl ProcessRegistry { let (children, exit_observer, notification); { let mut inner = self.inner.lock(); - let entry = inner - .processes - .get_mut(&id) - .expect("exit_process: no such process"); + let entry = inner.processes.get_mut(&id)?; // Idempotent: if already exited, return None without re-notifying. 
if matches!(entry.context.state, ProcessState::Exited(_)) { return None; diff --git a/litebox_runner_linux_on_windows_userland/tests/common/mod.rs b/litebox_runner_linux_on_windows_userland/tests/common/mod.rs index 977150767..6bad0d8bf 100644 --- a/litebox_runner_linux_on_windows_userland/tests/common/mod.rs +++ b/litebox_runner_linux_on_windows_userland/tests/common/mod.rs @@ -83,8 +83,12 @@ impl TestLauncher { ]; let envp = vec![CString::new("PATH=/bin").unwrap()]; let shim = self.shim_builder.build(); + let mut task = self.platform.init_task(); + // Use deterministic guest PID 1 (init) to match process registry. + task.pid = 1; + task.ppid = 0; let program = shim - .load_program(fs, self.platform.init_task(), executable_path, argv, envp) + .load_program(fs, task, executable_path, argv, envp) .unwrap(); unsafe { litebox_platform_windows_userland::run_thread( From 4efc636a234b6c14a29c4bfbea2d67bb34089adb Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Fri, 1 May 2026 16:01:03 -0700 Subject: [PATCH 18/23] =?UTF-8?q?refactor:=20make=20fd/mod.rs=20OS-agnosti?= =?UTF-8?q?c=20=E2=80=94=20remove=20POSIX-specific=20naming=20and=20unused?= =?UTF-8?q?=20clone=5Ftable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - fork_refcount → process_refcount - on_dup → on_ref_added - ForkDecremented → SharedDecremented - clone_for_fork → clone_for_child_selective(Option<&[usize]>) None = inherit all (bulk), Some = selective (NT-style) - increment_fork_refcounts → increment_process_refcounts - Remove unused Descriptors::clone_table --- litebox/src/fd/mod.rs | 110 +++++++++------------ litebox/src/net/mod.rs | 4 +- litebox_shim_linux/src/syscalls/file.rs | 11 ++- litebox_shim_linux/src/syscalls/process.rs | 2 +- 4 files changed, 58 insertions(+), 69 deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index 0bf2b6500..ead7fc430 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -36,46 +36,22 @@ impl 
Descriptors { Self { entries: vec![] } } - /// Clone the entire descriptor table for fork. + /// Increment the process reference count for each of the given descriptor slot indices. /// - /// Each entry in the new table shares the same underlying `DescriptorEntry` - /// (via `Arc::clone`), matching the semantics of both POSIX fork (shared - /// file descriptions) and NT handle inheritance. Per-FD metadata is **not** - /// cloned; each slot in the child starts with a fresh `AnyMap`. - /// - /// Calls `on_dup()` on each entry to notify subsystems of the new reference. - pub(crate) fn clone_table(&self) -> Self { - let entries = self - .entries - .iter() - .map(|slot| { - slot.as_ref().map(|ind| { - let mut cloned = IndividualEntry::new(Arc::clone(&ind.x)); - cloned.fork_refcount = 1; - cloned.x.read().entry.on_dup(); - cloned - }) - }) - .collect(); - Self { entries } - } - - /// Increment the fork reference count for each of the given descriptor slot indices. - /// - /// This must be called during fork, paired with [`RawDescriptorStorage::clone_for_fork`], - /// so that each forked slot index is properly tracked. When a process closes an FD via - /// [`Self::remove`], the fork_refcount is decremented; the entry is only truly removed - /// when fork_refcount reaches 0. + /// This must be called during child process creation, paired with + /// [`RawDescriptorStorage::clone_for_child_selective`], so that each inherited slot index is properly + /// tracked. When a process closes an FD via [`Self::remove`], the process_refcount is + /// decremented; the entry is only truly removed when process_refcount reaches 0. 
#[expect( clippy::missing_panics_doc, - reason = "panics only on invariant violation (slot must exist during fork)" + reason = "panics only on invariant violation (slot must exist during child creation)" )] - pub fn increment_fork_refcounts(&mut self, slot_indices: &[usize]) { + pub fn increment_process_refcounts(&mut self, slot_indices: &[usize]) { for &idx in slot_indices { let entry = self.entries[idx] .as_mut() - .expect("fork: descriptor slot must exist"); - entry.fork_refcount += 1; + .expect("child creation: descriptor slot must exist"); + entry.process_refcount += 1; } } @@ -138,7 +114,7 @@ impl Descriptors { let new_ind_entry = IndividualEntry::new(Arc::clone( &self.entries[fd.x.as_usize()?].as_ref().unwrap().x, )); - new_ind_entry.x.read().entry.on_dup(); + new_ind_entry.x.read().entry.on_ref_added(); let old = self.entries[idx].replace(new_ind_entry); assert!(old.is_none()); Some(TypedFd { @@ -166,9 +142,9 @@ impl Descriptors { entry.x.read().entry.on_close(); fd.x.mark_as_closed(); - assert!(entry.fork_refcount > 0); - entry.fork_refcount -= 1; - if entry.fork_refcount > 0 { + assert!(entry.process_refcount > 0); + entry.process_refcount -= 1; + if entry.process_refcount > 0 { // Another process still references this slot — don't remove the entry. return None; } @@ -198,15 +174,15 @@ impl Descriptors { let entry = self.entries[idx].as_mut().unwrap(); // If another process holds a fork reference, just decrement and don't truly close. - assert!(entry.fork_refcount > 0); - if entry.fork_refcount > 1 { + assert!(entry.process_refcount > 0); + if entry.process_refcount > 1 { entry.x.read().entry.on_close(); fd.x.mark_as_closed(); - entry.fork_refcount -= 1; - return Some(CloseResult::ForkDecremented); + entry.process_refcount -= 1; + return Some(CloseResult::SharedDecremented); } - // fork_refcount == 1: this is the last process. Proceed with normal close logic. + // process_refcount == 1: this is the last process. Proceed with normal close logic. 
let old = self.entries[idx].take().unwrap(); if Arc::strong_count(&old.x) == 1 { // Unique, so we can just return it if allowed. @@ -258,7 +234,7 @@ impl Descriptors { ) -> Vec { // Each FD corresponds to an `IndividualEntry`, which has an Arc to a `DescriptorEntry`. If // we have the same number of FDs as matching to the strong-count of a descriptor entry, - // AND the slot has fork_refcount == 1 (no other process references it), + // AND the slot has process_refcount == 1 (no other process references it), // then it must be the case that we have everything needed to close the entries out. let removable_entries: Vec<*const RwLock<_, _>> = { let mut strong_count_and_count = HashMap::<*const _, (usize, usize, bool)>::new(); @@ -268,12 +244,12 @@ impl Descriptors { // believe that we'll only see alive entries, so this `unwrap` is confirming that; if we // need to expand it out, we'd simply have a `continue` here. let entry = entry.as_ref().unwrap(); - let has_fork_refs = entry.fork_refcount > 1; + let has_shared_refs = entry.process_refcount > 1; let record = strong_count_and_count .entry(Arc::as_ptr(&entry.x)) .or_insert((Arc::strong_count(&entry.x), 0, false)); record.1 += 1; - record.2 |= has_fork_refs; + record.2 |= has_shared_refs; } strong_count_and_count .into_iter() @@ -635,9 +611,9 @@ pub(crate) enum CloseResult { Duplicated(TypedFd), /// The FD was unique but couldn't be closed immediately (e.g., due to pending data) Deferred, - /// Another process still holds a fork reference to this slot. The fork_refcount + /// Another process still holds a reference to this slot. The process_refcount /// was decremented and the FD was marked closed; no further action needed. - ForkDecremented, + SharedDecremented, } /// Safe(r) conversions between safely-typed file descriptors and unsafely-typed integers. @@ -772,37 +748,47 @@ impl RawDescriptorStorage { self.stored_fds.get(fd).is_some_and(Option::is_some) } - /// Clone the entire raw descriptor storage for fork. 
+ /// Clone this storage for a child process, optionally selecting which raw FD + /// indices to inherit. + /// + /// - `None` — inherit all open FDs (bulk inheritance). + /// - `Some(fds)` — inherit only the listed raw FD indices (selective + /// inheritance). FD indices not present in the slice are skipped. + /// - `Some(&[])` — inherit nothing (child gets an empty FD table). /// /// Each slot in the new storage gets a **new, independent** `OwnedFd` /// (with the same raw index as the parent's), avoiding shared `AtomicBool` /// poisoning when either process closes the FD independently. /// /// Returns `(cloned_storage, slot_indices)` where `slot_indices` is the - /// list of descriptor table slot indices that were cloned. The caller MUST - /// call [`Descriptors::increment_fork_refcounts`] with these indices so that + /// list of descriptor-table slot indices that were inherited. The caller MUST + /// call [`Descriptors::increment_process_refcounts`] with these indices so that /// the descriptor table knows multiple processes reference these slots. 
#[must_use] #[expect( clippy::missing_panics_doc, - reason = "panics only if FD is closed during fork (invariant violation)" + reason = "panics only if FD is closed during child creation (invariant violation)" )] - pub fn clone_for_fork(&self) -> (Self, Vec) { + pub fn clone_for_child_selective(&self, inherit: Option<&[usize]>) -> (Self, Vec) { let mut slot_indices = Vec::new(); let stored_fds = self .stored_fds .iter() - .map(|slot| { - slot.as_ref().map(|stored| { + .enumerate() + .map(|(fd_index, slot)| { + slot.as_ref().and_then(|stored| { + if inherit.is_some_and(|fds| !fds.contains(&fd_index)) { + return None; + } let raw = stored .x .as_usize() - .expect("FD should not be closed during fork"); + .expect("FD should not be closed during child creation"); slot_indices.push(raw); - StoredFd { + Some(StoredFd { x: Arc::new(OwnedFd::new(raw)), subsystem_entry_type_id: stored.subsystem_entry_type_id, - } + }) }) }) .collect(); @@ -924,19 +910,19 @@ pub trait FdEnabledSubsystem: Sized { /// /// # Hook contract /// -/// `on_dup` and `on_close` are called while a read lock is held on the +/// `on_ref_added` and `on_close` are called while a read lock is held on the /// containing `DescriptorEntry`. Implementations must use interior mutability /// (e.g., atomics) and must **not** attempt to acquire a write lock on the /// same entry, or deadlock will result. /// -/// The initial `insert()` does **not** call `on_dup()`; subsystems should +/// The initial `insert()` does **not** call `on_ref_added()`; subsystems should /// initialize any reference count to 1 in their constructor. pub trait FdEnabledSubsystemEntry: Send + Sync + core::any::Any { /// Called when a new reference to this entry is created (dup, fork). /// /// Subsystems that track reference counts (e.g., pipe write-ends for /// EOF detection) should increment their count here. - fn on_dup(&self) {} + fn on_ref_added(&self) {} /// Called when a reference to this entry is dropped (close). 
/// @@ -970,7 +956,7 @@ struct IndividualEntry { metadata: AnyMap, /// Number of processes referencing this slot (incremented on fork, decremented on close). /// Starts at 1 when created or duplicated. When this reaches 0, the slot is truly vacated. - fork_refcount: usize, + process_refcount: usize, } impl core::ops::Deref for IndividualEntry { type Target = Arc>; @@ -983,7 +969,7 @@ impl IndividualEntry { Self { x, metadata: AnyMap::new(), - fork_refcount: 1, + process_refcount: 1, } } } diff --git a/litebox/src/net/mod.rs b/litebox/src/net/mod.rs index c7416a929..5ab39a0f5 100644 --- a/litebox/src/net/mod.rs +++ b/litebox/src/net/mod.rs @@ -875,8 +875,8 @@ where // We attempt to queue it for future closure and then just return. self.queued_for_closure.push(dup_fd); } - super::fd::CloseResult::ForkDecremented => { - // Another process still holds a fork reference. Our close is done. + super::fd::CloseResult::SharedDecremented => { + // Another process still holds a reference. Our close is done. } super::fd::CloseResult::Deferred => { let Some(()) = dt.with_entry_mut(fd, |entry| entry.entry.consider_closed = true) diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index be4c5c1ef..06f1138d7 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -81,16 +81,19 @@ impl FilesState { /// The child gets its own `RawDescriptorStorage` with independent `OwnedFd` /// instances (so close in the child does not poison the parent's FDs). /// The underlying open file descriptions are shared via Arc in the global - /// descriptor table, tracked by `fork_refcount`. + /// descriptor table, tracked by `process_refcount`. /// /// The caller must provide a mutable reference to the global descriptor table - /// so that fork_refcounts can be incremented atomically with the clone. + /// so that process_refcounts can be incremented atomically with the clone. 
pub(crate) fn clone_for_fork( &self, descriptors: &mut litebox::fd::Descriptors, ) -> Self { - let (cloned_rds, slot_indices) = self.raw_descriptor_store.read().clone_for_fork(); - descriptors.increment_fork_refcounts(&slot_indices); + let (cloned_rds, slot_indices) = self + .raw_descriptor_store + .read() + .clone_for_child_selective(None); + descriptors.increment_process_refcounts(&slot_indices); Self { fs: self.fs.clone(), raw_descriptor_store: litebox::sync::RwLock::new(cloned_rds), diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 5dbb9b42a..036ce0fb9 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -854,7 +854,7 @@ impl Task { .next_thread_id .fetch_max(child_pid.saturating_add(1), Ordering::Relaxed); - // Clone the FD table for the child, incrementing fork_refcounts in the global descriptor table. + // Clone the FD table for the child, incrementing process_refcounts in the global descriptor table. let child_files = { let mut dt = self.global.litebox.descriptor_table_mut(); Arc::new(self.files.borrow().clone_for_fork(&mut dt)) From 5fc0566e58e7c4bca58eb2b23ce3e1408b082309 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Fri, 1 May 2026 16:12:34 -0700 Subject: [PATCH 19/23] refactor: merge increment_process_refcounts into clone_storage_for_child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single Descriptors::clone_storage_for_child method combines FD storage cloning with refcount bookkeeping — impossible to misuse by forgetting to increment refcounts after cloning. 
--- litebox/src/fd/mod.rs | 33 +++++++++++++++++-------- litebox_shim_linux/src/syscalls/file.rs | 10 ++------ 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index ead7fc430..13b0aebe5 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -36,23 +36,35 @@ impl Descriptors { Self { entries: vec![] } } - /// Increment the process reference count for each of the given descriptor slot indices. + /// Clone a [`RawDescriptorStorage`] for a child process, optionally selecting + /// which raw FD indices to inherit, and increment the process reference counts + /// for all inherited slots. /// - /// This must be called during child process creation, paired with - /// [`RawDescriptorStorage::clone_for_child_selective`], so that each inherited slot index is properly - /// tracked. When a process closes an FD via [`Self::remove`], the process_refcount is - /// decremented; the entry is only truly removed when process_refcount reaches 0. + /// - `inherit = None` — inherit all open FDs (bulk inheritance). + /// - `inherit = Some(fds)` — inherit only the listed raw FD indices (selective + /// inheritance). Indices not present in the slice are skipped. + /// - `inherit = Some(&[])` — inherit nothing (child gets an empty FD table). + /// + /// This combines [`RawDescriptorStorage::clone_for_child_selective`] with + /// process refcount bookkeeping into a single atomic operation that cannot be + /// misused (the caller cannot forget to increment refcounts). 
#[expect( clippy::missing_panics_doc, reason = "panics only on invariant violation (slot must exist during child creation)" )] - pub fn increment_process_refcounts(&mut self, slot_indices: &[usize]) { - for &idx in slot_indices { + pub fn clone_storage_for_child( + &mut self, + storage: &RawDescriptorStorage, + inherit: Option<&[usize]>, + ) -> RawDescriptorStorage { + let (cloned, slot_indices) = storage.clone_for_child_selective(inherit); + for &idx in &slot_indices { let entry = self.entries[idx] .as_mut() .expect("child creation: descriptor slot must exist"); entry.process_refcount += 1; } + cloned } /// Insert `entry` into the descriptor table, returning an `OwnedFd` to this entry. @@ -761,9 +773,10 @@ impl RawDescriptorStorage { /// poisoning when either process closes the FD independently. /// /// Returns `(cloned_storage, slot_indices)` where `slot_indices` is the - /// list of descriptor-table slot indices that were inherited. The caller MUST - /// call [`Descriptors::increment_process_refcounts`] with these indices so that - /// the descriptor table knows multiple processes reference these slots. + /// list of descriptor-table slot indices that were inherited. + /// + /// Prefer using [`Descriptors::clone_storage_for_child`] which combines this + /// with refcount bookkeeping into a single operation. #[must_use] #[expect( clippy::missing_panics_doc, diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index 06f1138d7..4fef21d29 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -82,18 +82,12 @@ impl FilesState { /// instances (so close in the child does not poison the parent's FDs). /// The underlying open file descriptions are shared via Arc in the global /// descriptor table, tracked by `process_refcount`. 
- /// - /// The caller must provide a mutable reference to the global descriptor table - /// so that process_refcounts can be incremented atomically with the clone. pub(crate) fn clone_for_fork( &self, descriptors: &mut litebox::fd::Descriptors, ) -> Self { - let (cloned_rds, slot_indices) = self - .raw_descriptor_store - .read() - .clone_for_child_selective(None); - descriptors.increment_process_refcounts(&slot_indices); + let cloned_rds = + descriptors.clone_storage_for_child(&self.raw_descriptor_store.read(), None); Self { fs: self.fs.clone(), raw_descriptor_store: litebox::sync::RwLock::new(cloned_rds), From 2d8c427163e67e38fa2dc8410b1251303c8b7e1c Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Fri, 1 May 2026 16:14:52 -0700 Subject: [PATCH 20/23] refactor: inline clone_for_child_selective into clone_storage_for_child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Single method on Descriptors handles cloning + refcount increment in one pass. Removes the intermediate clone_for_child_selective from RawDescriptorStorage — no need for two methods when there is one caller. --- litebox/src/fd/mod.rs | 82 +++++++++++++------------------------------ 1 file changed, 24 insertions(+), 58 deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index 13b0aebe5..7adddc8cd 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -45,9 +45,9 @@ impl Descriptors { /// inheritance). Indices not present in the slice are skipped. /// - `inherit = Some(&[])` — inherit nothing (child gets an empty FD table). /// - /// This combines [`RawDescriptorStorage::clone_for_child_selective`] with - /// process refcount bookkeeping into a single atomic operation that cannot be - /// misused (the caller cannot forget to increment refcounts). 
+ /// Each slot in the new storage gets a **new, independent** `OwnedFd` + /// (with the same raw index as the parent's), avoiding shared `AtomicBool` + /// poisoning when either process closes the FD independently. #[expect( clippy::missing_panics_doc, reason = "panics only on invariant violation (slot must exist during child creation)" @@ -57,14 +57,28 @@ impl Descriptors { storage: &RawDescriptorStorage, inherit: Option<&[usize]>, ) -> RawDescriptorStorage { - let (cloned, slot_indices) = storage.clone_for_child_selective(inherit); - for &idx in &slot_indices { - let entry = self.entries[idx] - .as_mut() - .expect("child creation: descriptor slot must exist"); - entry.process_refcount += 1; + let mut stored_fds = Vec::with_capacity(storage.stored_fds.len()); + for (fd_index, slot) in storage.stored_fds.iter().enumerate() { + let cloned = slot.as_ref().and_then(|stored| { + if inherit.is_some_and(|fds| !fds.contains(&fd_index)) { + return None; + } + let raw = stored + .x + .as_usize() + .expect("FD should not be closed during child creation"); + let entry = self.entries[raw] + .as_mut() + .expect("child creation: descriptor slot must exist"); + entry.process_refcount += 1; + Some(StoredFd { + x: Arc::new(OwnedFd::new(raw)), + subsystem_entry_type_id: stored.subsystem_entry_type_id, + }) + }); + stored_fds.push(cloned); } - cloned + RawDescriptorStorage { stored_fds } } /// Insert `entry` into the descriptor table, returning an `OwnedFd` to this entry. @@ -760,54 +774,6 @@ impl RawDescriptorStorage { self.stored_fds.get(fd).is_some_and(Option::is_some) } - /// Clone this storage for a child process, optionally selecting which raw FD - /// indices to inherit. - /// - /// - `None` — inherit all open FDs (bulk inheritance). - /// - `Some(fds)` — inherit only the listed raw FD indices (selective - /// inheritance). FD indices not present in the slice are skipped. - /// - `Some(&[])` — inherit nothing (child gets an empty FD table). 
- /// - /// Each slot in the new storage gets a **new, independent** `OwnedFd` - /// (with the same raw index as the parent's), avoiding shared `AtomicBool` - /// poisoning when either process closes the FD independently. - /// - /// Returns `(cloned_storage, slot_indices)` where `slot_indices` is the - /// list of descriptor-table slot indices that were inherited. - /// - /// Prefer using [`Descriptors::clone_storage_for_child`] which combines this - /// with refcount bookkeeping into a single operation. - #[must_use] - #[expect( - clippy::missing_panics_doc, - reason = "panics only if FD is closed during child creation (invariant violation)" - )] - pub fn clone_for_child_selective(&self, inherit: Option<&[usize]>) -> (Self, Vec) { - let mut slot_indices = Vec::new(); - let stored_fds = self - .stored_fds - .iter() - .enumerate() - .map(|(fd_index, slot)| { - slot.as_ref().and_then(|stored| { - if inherit.is_some_and(|fds| !fds.contains(&fd_index)) { - return None; - } - let raw = stored - .x - .as_usize() - .expect("FD should not be closed during child creation"); - slot_indices.push(raw); - Some(StoredFd { - x: Arc::new(OwnedFd::new(raw)), - subsystem_entry_type_id: stored.subsystem_entry_type_id, - }) - }) - }) - .collect(); - (Self { stored_fds }, slot_indices) - } - /// Returns an iterator over raw integer indices that are currently alive (i.e., occupied). pub fn iter_alive(&self) -> impl Iterator + '_ { self.stored_fds From b7250cf1b1ad35ae8271700506e07eec341e4f37 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Fri, 1 May 2026 16:23:22 -0700 Subject: [PATCH 21/23] refactor: move clone_for_child to RawDescriptorStorage where it belongs The method naturally belongs on the object being cloned (the per-process FD table), not on Descriptors. Takes &mut Descriptors as a parameter for refcount bookkeeping. 
--- litebox/src/fd/mod.rs | 91 +++++++++++++------------ litebox/src/fd/tests.rs | 2 +- litebox_shim_linux/src/syscalls/file.rs | 6 +- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index 7adddc8cd..30c464e2f 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -36,51 +36,6 @@ impl Descriptors { Self { entries: vec![] } } - /// Clone a [`RawDescriptorStorage`] for a child process, optionally selecting - /// which raw FD indices to inherit, and increment the process reference counts - /// for all inherited slots. - /// - /// - `inherit = None` — inherit all open FDs (bulk inheritance). - /// - `inherit = Some(fds)` — inherit only the listed raw FD indices (selective - /// inheritance). Indices not present in the slice are skipped. - /// - `inherit = Some(&[])` — inherit nothing (child gets an empty FD table). - /// - /// Each slot in the new storage gets a **new, independent** `OwnedFd` - /// (with the same raw index as the parent's), avoiding shared `AtomicBool` - /// poisoning when either process closes the FD independently. 
- #[expect( - clippy::missing_panics_doc, - reason = "panics only on invariant violation (slot must exist during child creation)" - )] - pub fn clone_storage_for_child( - &mut self, - storage: &RawDescriptorStorage, - inherit: Option<&[usize]>, - ) -> RawDescriptorStorage { - let mut stored_fds = Vec::with_capacity(storage.stored_fds.len()); - for (fd_index, slot) in storage.stored_fds.iter().enumerate() { - let cloned = slot.as_ref().and_then(|stored| { - if inherit.is_some_and(|fds| !fds.contains(&fd_index)) { - return None; - } - let raw = stored - .x - .as_usize() - .expect("FD should not be closed during child creation"); - let entry = self.entries[raw] - .as_mut() - .expect("child creation: descriptor slot must exist"); - entry.process_refcount += 1; - Some(StoredFd { - x: Arc::new(OwnedFd::new(raw)), - subsystem_entry_type_id: stored.subsystem_entry_type_id, - }) - }); - stored_fds.push(cloned); - } - RawDescriptorStorage { stored_fds } - } - /// Insert `entry` into the descriptor table, returning an `OwnedFd` to this entry. #[expect( clippy::missing_panics_doc, @@ -774,6 +729,52 @@ impl RawDescriptorStorage { self.stored_fds.get(fd).is_some_and(Option::is_some) } + /// Clone this FD table for a child process, optionally selecting which raw FD + /// indices to inherit, and increment the process reference counts in the + /// provided [`Descriptors`] for all inherited slots. + /// + /// - `inherit = None` — inherit all open FDs (bulk inheritance). + /// - `inherit = Some(fds)` — inherit only the listed raw FD indices (selective + /// inheritance). Indices not present in the slice are skipped. + /// - `inherit = Some(&[])` — inherit nothing (child gets an empty FD table). + /// + /// Each slot in the new storage gets a **new, independent** `OwnedFd` + /// (with the same raw index as the parent's), avoiding shared `AtomicBool` + /// poisoning when either process closes the FD independently. 
+ #[must_use] + #[expect( + clippy::missing_panics_doc, + reason = "panics only on invariant violation (slot must exist during child creation)" + )] + pub fn clone_for_child( + &self, + descriptors: &mut Descriptors, + inherit: Option<&[usize]>, + ) -> Self { + let mut stored_fds = Vec::with_capacity(self.stored_fds.len()); + for (fd_index, slot) in self.stored_fds.iter().enumerate() { + let cloned = slot.as_ref().and_then(|stored| { + if inherit.is_some_and(|fds| !fds.contains(&fd_index)) { + return None; + } + let raw = stored + .x + .as_usize() + .expect("FD should not be closed during child creation"); + let entry = descriptors.entries[raw] + .as_mut() + .expect("child creation: descriptor slot must exist"); + entry.process_refcount += 1; + Some(StoredFd { + x: Arc::new(OwnedFd::new(raw)), + subsystem_entry_type_id: stored.subsystem_entry_type_id, + }) + }); + stored_fds.push(cloned); + } + Self { stored_fds } + } + /// Returns an iterator over raw integer indices that are currently alive (i.e., occupied). 
pub fn iter_alive(&self) -> impl Iterator + '_ { self.stored_fds diff --git a/litebox/src/fd/tests.rs b/litebox/src/fd/tests.rs index 04a482b44..ac92d252d 100644 --- a/litebox/src/fd/tests.rs +++ b/litebox/src/fd/tests.rs @@ -6,10 +6,10 @@ use alloc::string::ToString as _; use alloc::vec; use alloc::vec::Vec; -use crate::LiteBox; use crate::fd::FdEnabledSubsystemEntry; use crate::fd::{ErrRawIntFd, FdEnabledSubsystem, TypedFd}; use crate::platform::mock::MockPlatform; +use crate::LiteBox; struct MockSubsystem; impl FdEnabledSubsystem for MockSubsystem { diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index 4fef21d29..8a52f86f8 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -86,8 +86,10 @@ impl FilesState { &self, descriptors: &mut litebox::fd::Descriptors, ) -> Self { - let cloned_rds = - descriptors.clone_storage_for_child(&self.raw_descriptor_store.read(), None); + let cloned_rds = self + .raw_descriptor_store + .read() + .clone_for_child(descriptors, None); Self { fs: self.fs.clone(), raw_descriptor_store: litebox::sync::RwLock::new(cloned_rds), From e3451bb7f52df96272dc7d1c2c341018a933ded3 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Fri, 1 May 2026 16:28:58 -0700 Subject: [PATCH 22/23] refactor: rename on_close to on_ref_removed for symmetry with on_ref_added --- litebox/src/fd/mod.rs | 14 +++++++------- litebox/src/fd/tests.rs | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index 30c464e2f..75e30a022 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -120,7 +120,7 @@ impl Descriptors { ) -> Option { let idx = fd.x.as_usize()?; let entry = self.entries[idx].as_mut().unwrap(); - entry.x.read().entry.on_close(); + entry.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); assert!(entry.process_refcount > 0); @@ -157,7 +157,7 @@ impl Descriptors { // If another process holds a 
fork reference, just decrement and don't truly close. assert!(entry.process_refcount > 0); if entry.process_refcount > 1 { - entry.x.read().entry.on_close(); + entry.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); entry.process_refcount -= 1; return Some(CloseResult::SharedDecremented); @@ -168,7 +168,7 @@ impl Descriptors { if Arc::strong_count(&old.x) == 1 { // Unique, so we can just return it if allowed. if can_close_immediately(old.x.read().as_subsystem::()) { - old.x.read().entry.on_close(); + old.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); let entry = Arc::into_inner(old.x) .map(RwLock::into_inner) @@ -182,7 +182,7 @@ impl Descriptors { Some(CloseResult::Deferred) } } else { - old.x.read().entry.on_close(); + old.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); // Shared (via dup), so we need to duplicate it. let old = self.entries[idx].replace(old); @@ -890,7 +890,7 @@ pub trait FdEnabledSubsystem: Sized { /// /// # Hook contract /// -/// `on_ref_added` and `on_close` are called while a read lock is held on the +/// `on_ref_added` and `on_ref_removed` are called while a read lock is held on the /// containing `DescriptorEntry`. Implementations must use interior mutability /// (e.g., atomics) and must **not** attempt to acquire a write lock on the /// same entry, or deadlock will result. @@ -904,11 +904,11 @@ pub trait FdEnabledSubsystemEntry: Send + Sync + core::any::Any { /// EOF detection) should increment their count here. fn on_ref_added(&self) {} - /// Called when a reference to this entry is dropped (close). + /// Called when a reference to this entry is removed (close). /// /// This is called for every close, even when other references remain. /// Subsystems should decrement their reference count here. 
- fn on_close(&self) {} + fn on_ref_removed(&self) {} } /// Possible errors from [`RawDescriptorStorage::fd_from_raw_integer`] and diff --git a/litebox/src/fd/tests.rs b/litebox/src/fd/tests.rs index ac92d252d..04a482b44 100644 --- a/litebox/src/fd/tests.rs +++ b/litebox/src/fd/tests.rs @@ -6,10 +6,10 @@ use alloc::string::ToString as _; use alloc::vec; use alloc::vec::Vec; +use crate::LiteBox; use crate::fd::FdEnabledSubsystemEntry; use crate::fd::{ErrRawIntFd, FdEnabledSubsystem, TypedFd}; use crate::platform::mock::MockPlatform; -use crate::LiteBox; struct MockSubsystem; impl FdEnabledSubsystem for MockSubsystem { From 88c41b02f50a553e6be36823ba058486dfe547e1 Mon Sep 17 00:00:00 2001 From: Weidong Cui Date: Fri, 1 May 2026 16:36:07 -0700 Subject: [PATCH 23/23] refactor: remove dead on_ref_added/on_ref_removed hooks from FdEnabledSubsystemEntry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No subsystem overrides these hooks — they were all default no-ops. process_refcount on IndividualEntry handles cross-process sharing, and Arc::strong_count handles within-process dup sharing. The hooks added complexity without value; they can be re-added if a subsystem actually needs them. 
--- litebox/src/fd/mod.rs | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index 75e30a022..a9677319b 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -95,7 +95,6 @@ impl Descriptors { let new_ind_entry = IndividualEntry::new(Arc::clone( &self.entries[fd.x.as_usize()?].as_ref().unwrap().x, )); - new_ind_entry.x.read().entry.on_ref_added(); let old = self.entries[idx].replace(new_ind_entry); assert!(old.is_none()); Some(TypedFd { @@ -120,7 +119,6 @@ impl Descriptors { ) -> Option { let idx = fd.x.as_usize()?; let entry = self.entries[idx].as_mut().unwrap(); - entry.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); assert!(entry.process_refcount > 0); @@ -157,7 +155,6 @@ impl Descriptors { // If another process holds a fork reference, just decrement and don't truly close. assert!(entry.process_refcount > 0); if entry.process_refcount > 1 { - entry.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); entry.process_refcount -= 1; return Some(CloseResult::SharedDecremented); @@ -168,7 +165,6 @@ impl Descriptors { if Arc::strong_count(&old.x) == 1 { // Unique, so we can just return it if allowed. if can_close_immediately(old.x.read().as_subsystem::()) { - old.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); let entry = Arc::into_inner(old.x) .map(RwLock::into_inner) @@ -182,7 +178,6 @@ impl Descriptors { Some(CloseResult::Deferred) } } else { - old.x.read().entry.on_ref_removed(); fd.x.mark_as_closed(); // Shared (via dup), so we need to duplicate it. let old = self.entries[idx].replace(old); @@ -887,29 +882,7 @@ pub trait FdEnabledSubsystem: Sized { } /// A per-FD entry stored in the descriptor table for a specific [`FdEnabledSubsystem`] -/// -/// # Hook contract -/// -/// `on_ref_added` and `on_ref_removed` are called while a read lock is held on the -/// containing `DescriptorEntry`. 
Implementations must use interior mutability -/// (e.g., atomics) and must **not** attempt to acquire a write lock on the -/// same entry, or deadlock will result. -/// -/// The initial `insert()` does **not** call `on_ref_added()`; subsystems should -/// initialize any reference count to 1 in their constructor. -pub trait FdEnabledSubsystemEntry: Send + Sync + core::any::Any { - /// Called when a new reference to this entry is created (dup, fork). - /// - /// Subsystems that track reference counts (e.g., pipe write-ends for - /// EOF detection) should increment their count here. - fn on_ref_added(&self) {} - - /// Called when a reference to this entry is removed (close). - /// - /// This is called for every close, even when other references remain. - /// Subsystems should decrement their reference count here. - fn on_ref_removed(&self) {} -} +pub trait FdEnabledSubsystemEntry: Send + Sync + core::any::Any {} /// Possible errors from [`RawDescriptorStorage::fd_from_raw_integer`] and /// [`RawDescriptorStorage::fd_consume_raw_integer`].