diff --git a/litebox/src/fd/mod.rs b/litebox/src/fd/mod.rs index af267b5b4..a9677319b 100644 --- a/litebox/src/fd/mod.rs +++ b/litebox/src/fd/mod.rs @@ -106,17 +106,30 @@ impl Descriptors { /// Removes the entry at `fd`, closing out the file descriptor. /// /// Returns the descriptor entry if it is unique (i.e., it was not duplicated, or all duplicates - /// have been cleared out). + /// have been cleared out) AND no other process holds a fork reference to this slot. /// /// If the `fd` was already closed out, then (obviously) it does not return an entry. + #[expect( + clippy::missing_panics_doc, + reason = "panics only on invariant violation" + )] pub fn remove( &mut self, fd: &TypedFd, ) -> Option { - let Some(old) = self.entries[fd.x.as_usize()?].take() else { - unreachable!(); - }; + let idx = fd.x.as_usize()?; + let entry = self.entries[idx].as_mut().unwrap(); fd.x.mark_as_closed(); + + assert!(entry.process_refcount > 0); + entry.process_refcount -= 1; + if entry.process_refcount > 0 { + // Another process still references this slot — don't remove the entry. + return None; + } + + // Last fork reference — truly vacate the slot. + let old = self.entries[idx].take().unwrap(); Arc::into_inner(old.x) .map(RwLock::into_inner) .map(DescriptorEntry::into_subsystem_entry::) @@ -137,9 +150,18 @@ impl Descriptors { can_close_immediately: F, ) -> Option> { let idx = fd.x.as_usize()?; - let Some(old) = self.entries[idx].take() else { - unreachable!(); - }; + let entry = self.entries[idx].as_mut().unwrap(); + + // If another process holds a fork reference, just decrement and don't truly close. + assert!(entry.process_refcount > 0); + if entry.process_refcount > 1 { + fd.x.mark_as_closed(); + entry.process_refcount -= 1; + return Some(CloseResult::SharedDecremented); + } + + // process_refcount == 1: this is the last process. Proceed with normal close logic. + let old = self.entries[idx].take().unwrap(); if Arc::strong_count(&old.x) == 1 { // Unique, so we can just return it if allowed. if can_close_immediately(old.x.read().as_subsystem::()) { @@ -157,7 +179,7 @@ impl Descriptors { } } else { fd.x.mark_as_closed(); - // Shared, so we need to duplicate it. + // Shared (via dup), so we need to duplicate it. let old = self.entries[idx].replace(old); assert!(old.is_none()); Some(CloseResult::Duplicated(TypedFd { @@ -188,23 +210,26 @@ impl Descriptors { ) -> Vec { // Each FD corresponds to an `IndividualEntry`, which has an Arc to a `DescriptorEntry`. If // we have the same number of FDs as matching to the strong-count of a descriptor entry, + // AND the slot has process_refcount == 1 (no other process references it), // then it must be the case that we have everything needed to close the entries out. let removable_entries: Vec<*const RwLock<_, _>> = { - let mut strong_count_and_count = HashMap::<*const _, (usize, usize)>::new(); + let mut strong_count_and_count = HashMap::<*const _, (usize, usize, bool)>::new(); for fd in fds.iter() { let entry = &self.entries[fd.x.as_usize().unwrap()]; // It would not be "incorrect" to see a closed out entry, but as it currently stands, I // believe that we'll only see alive entries, so this `unwrap` is confirming that; if we // need to expand it out, we'd simply have a `continue` here. let entry = entry.as_ref().unwrap(); - strong_count_and_count + let has_shared_refs = entry.process_refcount > 1; + let record = strong_count_and_count .entry(Arc::as_ptr(&entry.x)) - .or_insert((Arc::strong_count(&entry.x), 0)) - .1 += 1; + .or_insert((Arc::strong_count(&entry.x), 0, false)); + record.1 += 1; + record.2 |= has_shared_refs; } strong_count_and_count .into_iter() - .filter(|(_ptr, (sc, c))| sc == c) + .filter(|(_ptr, (sc, c, has_fork))| sc == c && !has_fork) .map(|(ptr, _)| ptr) .collect() }; @@ -514,6 +539,26 @@ impl Descriptors { .metadata .insert(metadata) } + + /// Returns the indices of all live entries whose per-FD metadata of type `T` satisfies `pred`. + /// + /// **Important**: These are slot indices into `Descriptors.entries`, NOT raw FD numbers. + /// To get raw FD numbers matching metadata, use + /// [`RawDescriptorStorage::raw_fds_matching_metadata`] instead. + pub fn indices_matching_metadata( + &self, + pred: impl Fn(&T) -> bool, + ) -> alloc::vec::Vec { + self.entries + .iter() + .enumerate() + .filter_map(|(idx, slot)| { + let entry = slot.as_ref()?; + let matches = entry.metadata.get::().is_some_and(&pred); + matches.then_some(idx) + }) + .collect() + } } /// A handle to a descriptor entry (via [`Descriptors::entry_handle`]) that can be used without @@ -542,6 +587,9 @@ pub(crate) enum CloseResult { Duplicated(TypedFd), /// The FD was unique but couldn't be closed immediately (e.g., due to pending data) Deferred, + /// Another process still holds a reference to this slot. The process_refcount + /// was decremented and the FD was marked closed; no further action needed. + SharedDecremented, } /// Safe(r) conversions between safely-typed file descriptors and unsafely-typed integers. @@ -676,6 +724,52 @@ impl RawDescriptorStorage { self.stored_fds.get(fd).is_some_and(Option::is_some) } + /// Clone this FD table for a child process, optionally selecting which raw FD + /// indices to inherit, and increment the process reference counts in the + /// provided [`Descriptors`] for all inherited slots. + /// + /// - `inherit = None` — inherit all open FDs (bulk inheritance). + /// - `inherit = Some(fds)` — inherit only the listed raw FD indices (selective + /// inheritance). Indices not present in the slice are skipped. + /// - `inherit = Some(&[])` — inherit nothing (child gets an empty FD table). + /// + /// Each slot in the new storage gets a **new, independent** `OwnedFd` + /// (with the same raw index as the parent's), avoiding shared `AtomicBool` + /// poisoning when either process closes the FD independently. + #[must_use] + #[expect( + clippy::missing_panics_doc, + reason = "panics only on invariant violation (slot must exist during child creation)" + )] + pub fn clone_for_child( + &self, + descriptors: &mut Descriptors, + inherit: Option<&[usize]>, + ) -> Self { + let mut stored_fds = Vec::with_capacity(self.stored_fds.len()); + for (fd_index, slot) in self.stored_fds.iter().enumerate() { + let cloned = slot.as_ref().and_then(|stored| { + if inherit.is_some_and(|fds| !fds.contains(&fd_index)) { + return None; + } + let raw = stored + .x + .as_usize() + .expect("FD should not be closed during child creation"); + let entry = descriptors.entries[raw] + .as_mut() + .expect("child creation: descriptor slot must exist"); + entry.process_refcount += 1; + Some(StoredFd { + x: Arc::new(OwnedFd::new(raw)), + subsystem_entry_type_id: stored.subsystem_entry_type_id, + }) + }); + stored_fds.push(cloned); + } + Self { stored_fds } + } + /// Returns an iterator over raw integer indices that are currently alive (i.e., occupied). pub fn iter_alive(&self) -> impl Iterator + '_ { self.stored_fds @@ -683,6 +777,32 @@ impl RawDescriptorStorage { .enumerate() .filter_map(|(i, slot)| slot.as_ref().map(|_| i)) } + + /// Returns raw FD numbers whose corresponding `Descriptors` slot has per-FD metadata + /// of type `T` satisfying `pred`. + /// + /// This resolves the raw FD → slot index mapping correctly, unlike + /// [`Descriptors::indices_matching_metadata`] which returns slot indices. + pub fn raw_fds_matching_metadata< + Platform: RawSyncPrimitivesProvider, + T: core::any::Any + Send + Sync, + >( + &self, + descriptors: &Descriptors, + pred: impl Fn(&T) -> bool, + ) -> alloc::vec::Vec { + self.stored_fds + .iter() + .enumerate() + .filter_map(|(raw_fd, slot)| { + let stored = slot.as_ref()?; + let slot_idx = stored.x.as_usize()?; + let entry = descriptors.entries.get(slot_idx)?.as_ref()?; + let matches = entry.metadata.get::().is_some_and(&pred); + matches.then_some(raw_fd) + }) + .collect() + } } macro_rules! multi_subsystem_generic { @@ -787,6 +907,9 @@ pub enum MetadataError { struct IndividualEntry { x: Arc>, metadata: AnyMap, + /// Number of processes referencing this slot (incremented on fork, decremented on close). + /// Starts at 1 when created or duplicated. When this reaches 0, the slot is truly vacated. + process_refcount: usize, } impl core::ops::Deref for IndividualEntry { type Target = Arc>; @@ -799,6 +922,7 @@ impl IndividualEntry { Self { x, metadata: AnyMap::new(), + process_refcount: 1, } } } diff --git a/litebox/src/lib.rs b/litebox/src/lib.rs index f3d80997a..144a00bbb 100644 --- a/litebox/src/lib.rs +++ b/litebox/src/lib.rs @@ -24,6 +24,7 @@ pub mod net; pub mod path; pub mod pipes; pub mod platform; +pub mod process; pub mod shim; pub mod sync; pub mod tls; diff --git a/litebox/src/mm/linux.rs b/litebox/src/mm/linux.rs index f33094971..72fc72452 100644 --- a/litebox/src/mm/linux.rs +++ b/litebox/src/mm/linux.rs @@ -304,30 +304,53 @@ pub(super) struct Vmem + 'static, const pub(super) brk: usize, /// Virtual memory areas. vmas: RangeMap, + /// Minimum valid address for this address space. + pub(super) addr_min: usize, + /// Maximum valid address (exclusive) for this address space. + pub(super) addr_max: usize, } impl + 'static, const ALIGN: usize> Vmem { pub(super) const STACK_GUARD_GAP: usize = 256 << 12; - /// Create a new [`Vmem`] instance with the given memory [backend](PageManagementProvider). + /// Create a new [`Vmem`] instance using the platform's default address range. pub(super) fn new(platform: &'static Platform) -> Self { + Self::new_with_range(platform, Platform::TASK_ADDR_MIN..Platform::TASK_ADDR_MAX) + } + + /// Create a new [`Vmem`] instance scoped to the given VA range. + /// + /// Used for multi-process support where each process gets a VA partition. + pub(super) fn new_with_range( + platform: &'static Platform, + range: core::ops::Range, + ) -> Self { + assert!( + range.start.is_multiple_of(ALIGN) && range.end.is_multiple_of(ALIGN), + "Vmem: address range must be aligned to {ALIGN} bytes" + ); let mut vmem = Self { vmas: RangeMap::new(), brk: 0, platform, + addr_min: range.start, + addr_max: range.end, }; for each in platform.reserved_pages() { - assert!( - each.start % ALIGN == 0 && each.end % ALIGN == 0, - "Vmem: reserved range is not aligned to {ALIGN} bytes" - ); - vmem.vmas.insert( - each.start..each.end, - VmArea { - flags: VmFlags::empty(), - is_file_backed: false, - }, - ); + // Only insert reserved pages that fall within our range + if each.start >= range.start && each.end <= range.end { + assert!( + each.start % ALIGN == 0 && each.end % ALIGN == 0, + "Vmem: reserved range is not aligned to {ALIGN} bytes" + ); + vmem.vmas.insert( + each.start..each.end, + VmArea { + flags: VmFlags::empty(), + is_file_backed: false, + }, + ); + } } vmem } @@ -453,10 +476,10 @@ impl + 'static, const ALIGN: usize> Vmem fixed_address_behavior: FixedAddressBehavior, ) -> Result, AllocationError> { let (start, end) = (suggested_range.start, suggested_range.end); - if start < Platform::TASK_ADDR_MIN { + if start < self.addr_min { return Err(AllocationError::BelowMinAddress); } - if end > Platform::TASK_ADDR_MAX { + if end > self.addr_max { return Err(AllocationError::AboveMaxAddress); } let platform_fixed_address_behavior = match fixed_address_behavior { @@ -518,8 +541,8 @@ impl + 'static, const ALIGN: usize> Vmem let new_start = ret.as_usize(); let new_end = new_start + suggested_range.len(); self.vmas.insert(new_start..new_end, vma); - debug_assert!(new_start >= Platform::TASK_ADDR_MIN); - debug_assert!(new_end <= Platform::TASK_ADDR_MAX); + debug_assert!(new_start >= self.addr_min); + debug_assert!(new_end <= self.addr_max); Ok(ret) } @@ -890,11 +913,11 @@ impl + 'static, const ALIGN: usize> Vmem fixed_addr: bool, ) -> Option { let size = length.as_usize(); - if size > Platform::TASK_ADDR_MAX { + if size > self.addr_max.saturating_sub(self.addr_min) { return None; } if let Some(suggested_address) = suggested_address { - if (Platform::TASK_ADDR_MAX - size) < suggested_address.0 { + if (self.addr_max - size) < suggested_address.0 { return None; } if fixed_addr @@ -912,12 +935,9 @@ impl + 'static, const ALIGN: usize> Vmem // top down // 1. check [last_end, TASK_SIZE_MAX) - let (low_limit, high_limit) = ( - Platform::TASK_ADDR_MIN, - Platform::TASK_ADDR_MAX - length.as_usize(), - ); - debug_assert!(Platform::TASK_ADDR_MIN % ALIGN == 0); - debug_assert!(Platform::TASK_ADDR_MAX % ALIGN == 0); + let (low_limit, high_limit) = (self.addr_min, self.addr_max - length.as_usize()); + debug_assert!(self.addr_min.is_multiple_of(ALIGN)); + debug_assert!(self.addr_max.is_multiple_of(ALIGN)); let last_end = self.vmas.last_range_value().map_or(low_limit, |r| r.0.end); if last_end <= high_limit { return Some(high_limit); diff --git a/litebox/src/mm/mod.rs b/litebox/src/mm/mod.rs index a46b3c855..a19f1abdd 100644 --- a/litebox/src/mm/mod.rs +++ b/litebox/src/mm/mod.rs @@ -46,6 +46,19 @@ where Self { vmem } } + /// Create a new `PageManager` scoped to a specific VA range. + /// + /// Used for multi-process support where each process gets a VA partition. + pub fn new_with_range(litebox: &LiteBox, range: core::ops::Range) -> Self { + let vmem = RwLock::new(linux::Vmem::new_with_range(litebox.x.platform, range)); + Self { vmem } + } + + /// Returns the minimum address of this process's virtual address range. + pub fn addr_min(&self) -> usize { + self.vmem.read().addr_min + } + /// Create a mapping with the given flags. /// /// `suggested_new_address` is the hint address for where to create the pages if it is not `None`. @@ -672,15 +685,21 @@ where error_code: u64, ) -> Result<(), PageFaultError> { let fault_addr = fault_addr & !(ALIGN - 1); - if !(Platform::TASK_ADDR_MIN..Platform::TASK_ADDR_MAX).contains(&fault_addr) { - return Err(PageFaultError::AccessError("Invalid address")); + // Read address bounds from vmem to avoid using Platform constants directly + { + let vmem = self.vmem.read(); + if !(vmem.addr_min..vmem.addr_max).contains(&fault_addr) { + return Err(PageFaultError::AccessError("Invalid address")); + } } let mut vmem = self.vmem.write(); + let addr_min = vmem.addr_min; + let addr_max = vmem.addr_max; // Find the range closest to the fault address let (start, vma) = { let (r, vma) = vmem - .overlapping(fault_addr..Platform::TASK_ADDR_MAX) + .overlapping(fault_addr..addr_max) .next() .ok_or(PageFaultError::AccessError("no mapping"))?; (r.start, *vma) @@ -692,7 +711,7 @@ where } if !vmem - .overlapping(Platform::TASK_ADDR_MIN..fault_addr) + .overlapping(addr_min..fault_addr) .next_back() .is_none_or(|(prev_range, prev_vma)| { // Enforce gap between stack and other preceding non-stack mappings. diff --git a/litebox/src/net/mod.rs b/litebox/src/net/mod.rs index 954162620..5ab39a0f5 100644 --- a/litebox/src/net/mod.rs +++ b/litebox/src/net/mod.rs @@ -875,6 +875,9 @@ where // We attempt to queue it for future closure and then just return. self.queued_for_closure.push(dup_fd); } + super::fd::CloseResult::SharedDecremented => { + // Another process still holds a reference. Our close is done. + } super::fd::CloseResult::Deferred => { let Some(()) = dt.with_entry_mut(fd, |entry| entry.entry.consider_closed = true) else { diff --git a/litebox/src/platform/address_space.rs b/litebox/src/platform/address_space.rs new file mode 100644 index 000000000..2c0e9d429 --- /dev/null +++ b/litebox/src/platform/address_space.rs @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Address space management for multi-process support. +//! +//! Platforms implement [`AddressSpaceProvider`] to manage isolated or shared +//! memory regions for guest processes. + +use core::fmt::Debug; +use core::hash::Hash; +use core::ops::Range; +use thiserror::Error; + +/// Platform-wide property: are address spaces isolated or shared? +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AddressSpaceKind { + /// Each address space has independent memory (e.g., kernel page tables, + /// separate host processes). The platform handles memory isolation; + /// the shim does not need to manage CoW. + Isolated, + /// Address spaces share the same host memory (e.g., VA partitions in a + /// single userland process). The shim is responsible for copy-on-write + /// or other memory separation. + SharedMemory, +} + +/// Errors from address space operations. +#[derive(Error, Debug)] +pub enum AddressSpaceError { + #[error("no space available for a new address space")] + NoSpace, + #[error("the address space ID is invalid")] + InvalidId, + #[error("address space operations are not supported by this platform")] + NotSupported, +} + +/// Address space management for multi-process support. +/// +/// Platforms implement this trait to create, destroy, and switch between +/// address spaces. Each address space represents an isolated (or partitioned) +/// memory region for a guest process. +pub trait AddressSpaceProvider { + /// An opaque identifier for an address space. + type AddressSpaceId: Copy + Eq + Send + Sync + Hash + Debug + 'static; + + /// Platform-wide: are address spaces isolated or shared? + const ADDRESS_SPACE_KIND: AddressSpaceKind; + + /// Create a new address space. + fn create_address_space(&self) -> Result { + Err(AddressSpaceError::NotSupported) + } + + /// Destroy an address space, releasing all resources. + fn destroy_address_space(&self, _id: Self::AddressSpaceId) -> Result<(), AddressSpaceError> { + Err(AddressSpaceError::NotSupported) + } + + /// Make `id` the active address space for the current thread. + /// + /// Activation is thread-local: each thread independently tracks its + /// active address space. On kernel platforms this switches page tables. + /// On userland platforms this may be a no-op. + fn activate_address_space(&self, _id: Self::AddressSpaceId) -> Result<(), AddressSpaceError> { + Err(AddressSpaceError::NotSupported) + } + + /// Execute `f` with the given address space active, then restore the + /// previously active address space. + fn with_address_space( + &self, + _id: Self::AddressSpaceId, + f: impl FnOnce() -> R, + ) -> Result { + let _ = f; + Err(AddressSpaceError::NotSupported) + } + + /// Return the VA range available to the given address space. + /// + /// Primarily meaningful for [`AddressSpaceKind::SharedMemory`] platforms + /// (e.g., userland VA partitions) where the shim needs to scope memory + /// operations (mmap, brk, etc.) to the correct region. Platforms with + /// hardware-isolated address spaces typically return `NotSupported`. + fn address_space_range( + &self, + _id: Self::AddressSpaceId, + ) -> Result, AddressSpaceError> { + Err(AddressSpaceError::NotSupported) + } +} diff --git a/litebox/src/platform/mock.rs b/litebox/src/platform/mock.rs index 4bcb936eb..bfac9a849 100644 --- a/litebox/src/platform/mock.rs +++ b/litebox/src/platform/mock.rs @@ -340,3 +340,9 @@ unsafe impl ThreadLocalStorageProvider for MockPlatform { MOCK_TLS.replace(value) } } + +impl AddressSpaceProvider for MockPlatform { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: address_space::AddressSpaceKind = + address_space::AddressSpaceKind::SharedMemory; +} diff --git a/litebox/src/platform/mod.rs b/litebox/src/platform/mod.rs index 2a0b6a9df..f2aea2756 100644 --- a/litebox/src/platform/mod.rs +++ b/litebox/src/platform/mod.rs @@ -7,6 +7,7 @@ //! trait is merely a collection of subtraits that could be composed independently from various //! other crates that implement them upon various types. +pub mod address_space; pub mod common_providers; pub mod page_mgmt; pub mod trivial_providers; @@ -18,6 +19,7 @@ use either::Either; use thiserror::Error; use zerocopy::{FromBytes, IntoBytes}; +pub use address_space::AddressSpaceProvider; pub use page_mgmt::PageManagementProvider; /// A provider of a platform upon which LiteBox can execute. @@ -26,7 +28,12 @@ pub use page_mgmt::PageManagementProvider; /// provided by it. _However_, most of the provided APIs within the provider act upon an `&self` to /// allow storage of any useful "globals" within it necessary. pub trait Provider: - RawMutexProvider + IPInterfaceProvider + TimeProvider + PunchthroughProvider + RawPointerProvider + RawMutexProvider + + IPInterfaceProvider + + TimeProvider + + PunchthroughProvider + + RawPointerProvider + + AddressSpaceProvider { } diff --git a/litebox/src/process/mod.rs b/litebox/src/process/mod.rs new file mode 100644 index 000000000..1615e0155 --- /dev/null +++ b/litebox/src/process/mod.rs @@ -0,0 +1,763 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//! Process identity and lifecycle management. +//! +//! This module provides a platform-agnostic process registry for tracking +//! parent-child relationships, exit status, and process lifecycle. OS-specific +//! semantics (POSIX process groups/sessions, NT job objects, etc.) belong in +//! the shim layer, not here. + +use alloc::sync::Arc; +use alloc::vec::Vec; +use core::sync::atomic::{AtomicBool, Ordering}; +use hashbrown::HashMap; +use thiserror::Error; + +use crate::event::{Events, observer::Observer}; +use crate::platform::RawMutex as RawMutexTrait; +use crate::sync::{Mutex, RawSyncPrimitivesProvider}; + +/// Process identifier. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ProcessId(u32); + +impl ProcessId { + /// The first process created in every LiteBox instance. + pub const INIT: Self = Self(1); + + /// Create a new `ProcessId` from a raw value. + /// Returns `None` if `raw` is 0 (invalid). + pub fn new(raw: u32) -> Option { + if raw == 0 { None } else { Some(Self(raw)) } + } + + /// Get the raw `u32` value of this process ID. + pub fn as_u32(self) -> u32 { + self.0 + } +} + +/// Per-process state tracked by the core. +pub struct ProcessContext { + pub id: ProcessId, + /// Parent process. `None` only for the init process. + pub parent: Option, + pub state: ProcessState, + /// Process group ID. Defaults to the process's own PID. + pub pgid: ProcessId, + /// Session ID. Defaults to the process's own PID for the init process. + pub sid: ProcessId, + /// Child processes. + children: Vec, +} + +impl ProcessContext { + /// Get the list of child processes. + pub fn children(&self) -> &[ProcessId] { + &self.children + } +} + +/// Whether a process is running or has exited. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProcessState { + Running, + /// The process has exited. The `u32` is opaque to the core; + /// shims assign platform-specific meaning (POSIX: wait status encoding; + /// NT: NTSTATUS / DWORD exit code, etc.). + Exited(u32), +} + +/// Returned by [`ProcessRegistry::exit_process`] so the shim can notify the +/// parent through whatever mechanism is appropriate (SIGCHLD, handle +/// signaling, etc.). +pub struct ExitNotification { + pub parent_pid: ProcessId, + pub child_pid: ProcessId, + pub exit_status: u32, +} + +/// Errors from [`ProcessRegistry::create_process`]. +#[derive(Error, Debug)] +pub enum CreateProcessError { + #[error("the specified parent PID does not exist in the registry")] + NoSuchParent, + #[error("a root (init) process already exists")] + InitAlreadyExists, + #[error("too many processes (limit: {0})")] + TooManyProcesses(usize), + #[error("PID space exhausted")] + PidSpaceExhausted, +} + +/// Shared handle for observing a process's exit. +/// +/// `exited` becomes `true` when the process exits. `observer` is notified +/// with readiness events so shims can integrate with their event loop. +pub struct ProcessExitObserver { + /// Whether the process has exited. + pub exited: Arc, + /// Observer that receives [`Events::IN`] when the process exits. + pub observer: Arc>, +} + +/// An exit notification subject that observers can register on. +pub struct ExitSubject { + observers: Mutex>>>, + notified: AtomicBool, +} + +impl ExitSubject { + fn new() -> Self { + Self { + observers: Mutex::new(Vec::new()), + notified: AtomicBool::new(false), + } + } + + /// Register an observer to be notified when the process exits. + pub fn register_observer(&self, observer: alloc::sync::Weak>) { + let fire_immediately; + { + let mut observers = self.observers.lock(); + fire_immediately = self.notified.load(Ordering::Acquire); + observers.push(observer.clone()); + } + // Fire outside the lock to avoid deadlock if on_events re-enters. + if fire_immediately && let Some(obs) = observer.upgrade() { + obs.on_events(&Events::IN); + } + } + + /// Notify all registered observers. Collects them under the lock, then + /// fires outside the lock to prevent deadlocks. + fn notify(&self) { + let snapshot: Vec<_>; + { + let observers = self.observers.lock(); + // Set notified under the lock so register_observer sees a + // consistent state: once notified is true the observer list + // is finalized for this notification round. + self.notified.store(true, Ordering::Release); + snapshot = observers + .iter() + .filter_map(alloc::sync::Weak::upgrade) + .collect(); + } + for obs in snapshot { + obs.on_events(&Events::IN); + } + } +} + +/// Internal per-process entry in the registry. +struct ProcessEntry { + context: ProcessContext, + exit_observer: Arc>, + exited_flag: Arc, +} + +/// A registry of processes with parent-child lifecycle management. +/// +/// `ProcessRegistry` is parameterized on a platform type for mutex support. +pub struct ProcessRegistry { + inner: Mutex>, + /// Futex-like primitive: incremented on every child exit so that + /// blocking `wait_for_child_exit_since` can unblock. + exit_event: ::RawMutex, +} + +struct RegistryInner { + processes: HashMap>, + next_pid: u32, + /// Maximum number of processes allowed. 0 means unlimited. + max_processes: usize, +} + +#[allow( + clippy::missing_panics_doc, + clippy::must_use_candidate, + clippy::new_without_default, + clippy::result_unit_err +)] +impl ProcessRegistry { + /// Create a new, empty process registry. + pub fn new() -> Self { + Self::with_max_processes(0) + } + + /// Create a new process registry with a maximum process count. + /// Pass 0 for unlimited. + pub fn with_max_processes(max_processes: usize) -> Self { + Self { + inner: Mutex::new(RegistryInner { + processes: HashMap::new(), + next_pid: 1, + max_processes, + }), + exit_event: ::RawMutex::INIT, + } + } + + /// Allocate a PID and register a new process. + /// + /// `parent` is the parent process ID. Pass `None` to create the init + /// process (PID 1). Only one init process is allowed. + #[allow(clippy::similar_names)] + pub fn create_process( + &self, + parent: Option, + ) -> Result { + let mut inner = self.inner.lock(); + let pid = match parent { + None => { + // Creating init process + let pid = ProcessId::INIT; + if inner.processes.contains_key(&pid) { + return Err(CreateProcessError::InitAlreadyExists); + } + // Ensure next_pid is past init + if inner.next_pid <= pid.as_u32() { + inner.next_pid = pid.as_u32() + 1; + } + pid + } + Some(parent_pid) => { + if !inner.processes.contains_key(&parent_pid) { + return Err(CreateProcessError::NoSuchParent); + } + // Enforce process count limit + if inner.max_processes > 0 && inner.processes.len() >= inner.max_processes { + return Err(CreateProcessError::TooManyProcesses(inner.max_processes)); + } + let raw = inner.next_pid; + inner.next_pid = raw + .checked_add(1) + .ok_or(CreateProcessError::PidSpaceExhausted)?; + let pid = ProcessId(raw); + // Register as child of parent + inner + .processes + .get_mut(&parent_pid) + .unwrap() + .context + .children + .push(pid); + pid + } + }; + + let exited_flag = Arc::new(AtomicBool::new(false)); + let exit_observer = Arc::new(ExitSubject::new()); + + // Determine pgid and sid: init gets its own, children inherit from parent. + let (pgid, sid) = match parent { + None => (pid, pid), + Some(parent_pid) => { + let parent_entry = inner.processes.get(&parent_pid).unwrap(); + (parent_entry.context.pgid, parent_entry.context.sid) + } + }; + + inner.processes.insert( + pid, + ProcessEntry { + context: ProcessContext { + id: pid, + parent, + state: ProcessState::Running, + pgid, + sid, + children: Vec::new(), + }, + exit_observer, + exited_flag, + }, + ); + + Ok(pid) + } + + /// Ensure the next PID will be at least `min_pid`. + /// Used to keep PIDs and TIDs in disjoint ranges when they share a namespace. + pub fn advance_next_pid(&self, min_pid: u32) { + let mut inner = self.inner.lock(); + if inner.next_pid < min_pid { + inner.next_pid = min_pid; + } + } + + /// Remove a process that was created but never started (e.g., child setup + /// failed after PID allocation). + /// + /// # Panics + /// + /// Panics if the process has children, is not in `Running` state, or does + /// not exist. + pub fn abort_process(&self, id: ProcessId) { + let mut inner = self.inner.lock(); + let entry = inner + .processes + .remove(&id) + .expect("abort_process: no such process"); + assert!( + matches!(entry.context.state, ProcessState::Running), + "abort_process: process not running" + ); + assert!( + entry.context.children.is_empty(), + "abort_process: process has children" + ); + // Remove from parent's children list + if let Some(parent_pid) = entry.context.parent + && let Some(parent) = inner.processes.get_mut(&parent_pid) + { + parent.context.children.retain(|&c| c != id); + } + } + + /// Record that a process has exited with the given status. + /// + /// For each orphaned child, calls `orphan_handler` so the shim can decide + /// the reparenting policy (e.g., POSIX reparents to init). + /// + /// Returns `Some(ExitNotification)` if the parent is still alive, + /// `None` otherwise. + pub fn exit_process( + &self, + id: ProcessId, + status: u32, + mut orphan_handler: impl FnMut(ProcessId), + ) -> Option { + let (children, exit_observer, notification); + { + let mut inner = self.inner.lock(); + let entry = inner.processes.get_mut(&id)?; + // Idempotent: if already exited, return None without re-notifying. + if matches!(entry.context.state, ProcessState::Exited(_)) { + return None; + } + entry.context.state = ProcessState::Exited(status); + entry.exited_flag.store(true, Ordering::Release); + exit_observer = Arc::clone(&entry.exit_observer); + children = entry.context.children.clone(); + + // Check if parent is alive + let parent_pid = entry.context.parent; + notification = parent_pid.and_then(|ppid| { + let parent = inner.processes.get(&ppid)?; + if matches!(parent.context.state, ProcessState::Running) { + Some(ExitNotification { + parent_pid: ppid, + child_pid: id, + exit_status: status, + }) + } else { + None + } + }); + } + // Wake any threads blocked in wait_for_any_child_exit. + self.exit_event + .underlying_atomic() + .fetch_add(1, Ordering::Release); + self.exit_event.wake_all(); + // All callbacks run outside the lock to prevent deadlocks. + exit_observer.notify(); + for child_pid in children { + orphan_handler(child_pid); + } + notification + } + + /// Reparent a child process to a new parent. + /// + /// Updates the child's parent field, removes it from the old parent's children list, + /// and adds it to the new parent's children list. + /// + /// Returns `Some(exit_status)` if the child is already a zombie (exited but not reaped), + /// so the caller can deliver SIGCHLD to the new parent. + pub fn reparent(&self, child: ProcessId, new_parent: ProcessId) -> Option { + let mut inner = self.inner.lock(); + + // Read child info first. + let (old_parent, exit_status) = { + let entry = inner.processes.get_mut(&child)?; + let old_parent = entry.context.parent.replace(new_parent); + let exit_status = match entry.context.state { + ProcessState::Exited(status) => Some(status), + ProcessState::Running => None, + }; + (old_parent, exit_status) + }; + + // Remove from old parent's children list. + if let Some(old_pid) = old_parent + && let Some(old_entry) = inner.processes.get_mut(&old_pid) + { + old_entry.context.children.retain(|&c| c != child); + } + + // Add to new parent's children list. + if let Some(parent_entry) = inner.processes.get_mut(&new_parent) { + parent_entry.context.children.push(child); + } + exit_status + } + + /// Set the process group ID for a process. + /// Returns `Ok(())` on success, `Err(())` if the process does not exist. + pub fn set_pgid(&self, id: ProcessId, pgid: ProcessId) -> Result<(), ()> { + let mut inner = self.inner.lock(); + let entry = inner.processes.get_mut(&id).ok_or(())?; + entry.context.pgid = pgid; + Ok(()) + } + + /// Create a new session: set both pgid and sid to the process's own PID. + /// Returns `Err(())` if the process doesn't exist or is already a process group leader. + pub fn setsid(&self, id: ProcessId) -> Result<(), ()> { + let mut inner = self.inner.lock(); + let entry = inner.processes.get_mut(&id).ok_or(())?; + if entry.context.pgid == id { + return Err(()); // already a process group leader + } + entry.context.pgid = id; + entry.context.sid = id; + Ok(()) + } + + /// Collect all process IDs in the given process group. + pub fn pids_in_group(&self, pgid: ProcessId) -> alloc::vec::Vec { + let inner = self.inner.lock(); + inner + .processes + .values() + .filter(|e| e.context.pgid == pgid && matches!(e.context.state, ProcessState::Running)) + .map(|e| e.context.id) + .collect() + } + + /// Read process context through a closure. + /// Returns `None` if the process does not exist. + pub fn with_context( + &self, + id: ProcessId, + f: impl FnOnce(&ProcessContext) -> R, + ) -> Option { + let inner = self.inner.lock(); + inner.processes.get(&id).map(|e| f(&e.context)) + } + + /// Returns `true` if the process exists and is in `Running` state. + pub fn is_alive(&self, id: ProcessId) -> bool { + let inner = self.inner.lock(); + inner + .processes + .get(&id) + .is_some_and(|e| matches!(e.context.state, ProcessState::Running)) + } + + /// Get the parent PID of a process. + pub fn get_parent(&self, id: ProcessId) -> Option { + let inner = self.inner.lock(); + inner.processes.get(&id).and_then(|e| e.context.parent) + } + + /// Get the child PIDs of a process. + pub fn get_children(&self, id: ProcessId) -> Option> { + let inner = self.inner.lock(); + inner.processes.get(&id).map(|e| e.context.children.clone()) + } + + /// Total number of processes in the registry (running + exited). + pub fn process_count(&self) -> usize { + let inner = self.inner.lock(); + inner.processes.len() + } + + /// Remove an exited process from the table. + /// + /// # Panics + /// + /// Panics if the process is still running or does not exist. + pub fn remove_process(&self, id: ProcessId) { + let mut inner = self.inner.lock(); + let entry = inner + .processes + .remove(&id) + .expect("remove_process: no such process"); + assert!( + matches!(entry.context.state, ProcessState::Exited(_)), + "remove_process: process still running" + ); + // Remove from parent's children list + if let Some(parent_pid) = entry.context.parent + && let Some(parent) = inner.processes.get_mut(&parent_pid) + { + parent.context.children.retain(|&c| c != id); + } + } + + /// Non-blocking check for an exited child of `parent`. + /// + /// `target` selects which children to consider: + /// - `> 0`: only the child with that specific PID + /// - `-1`: any child + /// - `0`: any child in the caller's process group + /// - `< -1`: any child in process group `|target|` + /// + /// If a matching exited child is found, it is reaped (removed from the + /// registry) and `Ok(Some((child_pid, exit_status)))` is returned. + /// Returns `Ok(None)` if matching children exist but none have exited yet. + /// Returns `Err(())` if the parent has no children matching `target` + /// (i.e., ECHILD condition). + pub fn try_wait(&self, parent: ProcessId, target: i32) -> Result, ()> { + let mut inner = self.inner.lock(); + let parent_entry = inner.processes.get(&parent).ok_or(())?; + let children = parent_entry.context.children.clone(); + + if children.is_empty() { + return Err(()); // ECHILD + } + + // Find a matching exited child + let found = match target { + t if t > 0 => { + let target_pid = ProcessId(t.cast_unsigned()); + // Verify it's actually a child of parent + if !children.contains(&target_pid) { + return Err(()); // ECHILD — not our child + } + match inner.processes.get(&target_pid) { + Some(e) => match e.context.state { + ProcessState::Exited(status) => Some((target_pid, status)), + ProcessState::Running => None, + }, + None => None, + } + } + -1 => { + // Any child + let mut result = None; + for &child_pid in &children { + if let Some(entry) = inner.processes.get(&child_pid) + && let ProcessState::Exited(status) = entry.context.state + { + result = Some((child_pid, status)); + break; + } + } + result + } + 0 => { + // Wait for any child in the caller's process group. + let caller_pgid = parent_entry.context.pgid; + let mut any_match = false; + let mut result = None; + for &child_pid in &children { + if let Some(entry) = inner.processes.get(&child_pid) + && entry.context.pgid == caller_pgid + { + any_match = true; + if let ProcessState::Exited(status) = entry.context.state { + result = Some((child_pid, status)); + break; + } + } + } + if !any_match { + return Err(()); // ECHILD — no children in this group + } + result + } + t if t < -1 => { + // Wait for any child in process group |t|. + let pgid = ProcessId(t.unsigned_abs()); + let mut any_match = false; + let mut result = None; + for &child_pid in &children { + if let Some(entry) = inner.processes.get(&child_pid) + && entry.context.pgid == pgid + { + any_match = true; + if let ProcessState::Exited(status) = entry.context.state { + result = Some((child_pid, status)); + break; + } + } + } + if !any_match { + return Err(()); // ECHILD — no children in this group + } + result + } + _ => return Err(()), + }; + + // Reap the child if found + if let Some((child_pid, _)) = found { + // Remove child from registry. Its children should have been + // reparented during exit_process. + let entry = inner.processes.remove(&child_pid); + debug_assert!( + entry.as_ref().is_none_or(|e| e.context.children.is_empty()), + "reaped zombie still has children" + ); + // Remove from parent's children list + if let Some(parent) = inner.processes.get_mut(&parent) { + parent.context.children.retain(|&c| c != child_pid); + } + } + + Ok(found) + } + + /// Snapshot the current exit epoch. Used with `wait_for_child_exit_since` + /// to implement the standard futex pattern: snapshot, check, block-on-snapshot. + pub fn exit_epoch(&self) -> u32 { + self.exit_event.underlying_atomic().load(Ordering::Acquire) + } + + /// Block until a child exit occurs after the given epoch snapshot. + /// The caller should call `exit_epoch()` BEFORE `try_wait()`, then + /// pass the snapshot here if `try_wait` returned `Ok(None)`. + pub fn wait_for_child_exit_since(&self, epoch: u32) { + let _ = self.exit_event.block(epoch); + } + + /// Block until any child exit occurs (or return immediately if one has + /// happened since the last call). Used by blocking wait4. + pub fn wait_for_any_child_exit(&self) { + let epoch = self.exit_event.underlying_atomic().load(Ordering::Acquire); + let _ = self.exit_event.block(epoch); + } + + /// Get exit observers for all children of `parent` matching `target`. + /// Used by blocking wait to know when to re-check. + pub fn child_exit_observers( + &self, + parent: ProcessId, + target: i32, + ) -> Vec> { + let inner = self.inner.lock(); + let Some(parent_entry) = inner.processes.get(&parent) else { + return Vec::new(); + }; + let children = &parent_entry.context.children; + let pids: Vec = match target { + t if t > 0 => { + let pid = ProcessId(t.cast_unsigned()); + if children.contains(&pid) { + alloc::vec![pid] + } else { + Vec::new() + } + } + -1 => children.clone(), + _ => Vec::new(), + }; + pids.iter() + .filter_map(|pid| { + let entry = inner.processes.get(pid)?; + Some(ProcessExitObserver { + exited: Arc::clone(&entry.exited_flag), + observer: Arc::clone(&entry.exit_observer), + }) + }) + .collect() + } + + /// Obtain a shared exit-observation handle for the given process. + pub fn exit_observer(&self, id: ProcessId) -> Option> { + let inner = self.inner.lock(); + let entry = inner.processes.get(&id)?; + Some(ProcessExitObserver { + exited: Arc::clone(&entry.exited_flag), + observer: Arc::clone(&entry.exit_observer), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::platform::mock::MockPlatform; + + type Registry = ProcessRegistry; + + fn new_registry() -> Registry { + Registry::new() + } + + #[test] + fn test_create_init_process() { + let registry = new_registry(); + let pid = registry.create_process(None).unwrap(); + assert_eq!(pid, ProcessId::INIT); + assert!(registry.is_alive(pid)); + assert_eq!(registry.get_parent(pid), None); + } + + #[test] + fn test_create_child_process() { + let registry = new_registry(); + let init = registry.create_process(None).unwrap(); + let child = registry.create_process(Some(init)).unwrap(); + assert_ne!(init, child); + assert!(registry.is_alive(child)); + assert_eq!(registry.get_parent(child), Some(init)); + assert_eq!(registry.get_children(init), Some(alloc::vec![child])); + } + + #[test] + fn test_exit_process() { + let registry = new_registry(); + let init = registry.create_process(None).unwrap(); + let child = registry.create_process(Some(init)).unwrap(); + + let notif = registry.exit_process(child, 42, |_| {}); + assert!(notif.is_some()); + let notif = notif.unwrap(); + assert_eq!(notif.parent_pid, init); + assert_eq!(notif.child_pid, child); + assert_eq!(notif.exit_status, 42); + + assert!(!registry.is_alive(child)); + registry.remove_process(child); + assert_eq!(registry.get_children(init), Some(alloc::vec![])); + } + + #[test] + fn test_abort_process() { + let registry = new_registry(); + let init = registry.create_process(None).unwrap(); + let child = registry.create_process(Some(init)).unwrap(); + registry.abort_process(child); + assert_eq!(registry.get_children(init), Some(alloc::vec![])); + } + + #[test] + fn test_duplicate_init_rejected() { + let registry = new_registry(); + registry.create_process(None).unwrap(); + assert!(matches!( + registry.create_process(None), + Err(CreateProcessError::InitAlreadyExists) + )); + } + + #[test] + fn test_exit_observer() { + let registry = new_registry(); + let init = registry.create_process(None).unwrap(); + let child = registry.create_process(Some(init)).unwrap(); + + let observer = registry.exit_observer(child).unwrap(); + assert!(!observer.exited.load(Ordering::Acquire)); + + registry.exit_process(child, 0, |_| {}); + assert!(observer.exited.load(Ordering::Acquire)); + } +} diff --git a/litebox/src/shim.rs b/litebox/src/shim.rs index 82a800339..752644d68 100644 --- a/litebox/src/shim.rs +++ b/litebox/src/shim.rs @@ -28,6 +28,11 @@ pub trait EnterShim { /// FUTURE: use a single per-architecture type for all shims and platforms. type ExecutionContext; + /// The process ID for this thread's process, if multi-process is supported. + fn process_id(&self) -> Option { + None + } + /// Initialize a new thread. Must be called by the platform exactly once /// before running the thread in the guest for the first time. /// diff --git a/litebox_common_linux/src/lib.rs b/litebox_common_linux/src/lib.rs index 578bd51a9..922c300b2 100644 --- a/litebox_common_linux/src/lib.rs +++ b/litebox_common_linux/src/lib.rs @@ -2102,6 +2102,15 @@ pub enum SyscallRequest { }, Getpid, Getppid, + Setpgid { + pid: i32, + pgid: i32, + }, + Getpgid { + pid: i32, + }, + Getpgrp, + Setsid, Getuid, Geteuid, Getgid, @@ -2146,6 +2155,12 @@ pub enum SyscallRequest { new_value: Platform::RawConstPointer, old_value: Option>, }, + Wait4 { + pid: i32, + wstatus: Option>, + options: i32, + // rusage is ignored for now + }, } impl SyscallRequest { @@ -2414,6 +2429,10 @@ impl SyscallRequest { Sysno::prlimit64 => sys_req!(Prlimit { pid, resource:?, new_limit:*, old_limit:* }), Sysno::getpid => SyscallRequest::Getpid, Sysno::getppid => SyscallRequest::Getppid, + Sysno::setpgid => sys_req!(Setpgid { pid, pgid }), + Sysno::getpgid => sys_req!(Getpgid { pid }), + Sysno::getpgrp => SyscallRequest::Getpgrp, + Sysno::setsid => SyscallRequest::Setsid, Sysno::getuid => SyscallRequest::Getuid, Sysno::getgid => SyscallRequest::Getgid, Sysno::geteuid => SyscallRequest::Geteuid, @@ -2544,6 +2563,24 @@ impl SyscallRequest { }, Sysno::eventfd2 => sys_req!(Eventfd2 { initval, flags }), Sysno::getrandom => sys_req!(GetRandom { buf:*,count,flags }), + Sysno::vfork => { + // vfork is equivalent to clone(CLONE_VM | CLONE_VFORK | SIGCHLD) + // with no new stack (child runs on parent's stack). + let args = CloneArgs { + flags: CloneFlags::VM | CloneFlags::VFORK, + stack: 0, + parent_tid: 0, + child_tid: 0, + tls: 0, + pidfd: 0, + exit_signal: 17, // SIGCHLD + stack_size: 0, + set_tid: 0, + set_tid_size: 0, + cgroup: 0, + }; + SyscallRequest::Clone { args } + } Sysno::clone => { let args = CloneArgs { // The upper 32 bits are clone3-specific. The low 8 bits are the exit signal. @@ -2603,6 +2640,21 @@ impl SyscallRequest { Sysno::umask => sys_req!(Umask { mask }), Sysno::alarm => sys_req!(Alarm { seconds }), Sysno::setitimer => sys_req!(SetITimer { which:?, new_value:*, old_value:* }), + Sysno::wait4 => { + let pid: i32 = ctx.sys_req_arg(0); + let wstatus: Platform::RawMutPointer = ctx.sys_req_ptr(1); + let options: i32 = ctx.sys_req_arg(2); + // arg3 is rusage, ignored for now + SyscallRequest::Wait4 { + pid, + wstatus: if wstatus.as_usize() == 0 { + None + } else { + Some(wstatus) + }, + options, + } + } // Noisy unsupported syscalls. Sysno::statx | Sysno::io_uring_setup | Sysno::rseq | Sysno::statfs => { return Err(errno::Errno::ENOSYS); diff --git a/litebox_platform_linux_kernel/src/lib.rs b/litebox_platform_linux_kernel/src/lib.rs index cc207d3da..020a73bf7 100644 --- a/litebox_platform_linux_kernel/src/lib.rs +++ b/litebox_platform_linux_kernel/src/lib.rs @@ -13,8 +13,9 @@ use litebox::mm::linux::PageRange; use litebox::platform::RawPointerProvider; use litebox::platform::page_mgmt::FixedAddressBehavior; use litebox::platform::{ - IPInterfaceProvider, ImmediatelyWokenUp, PageManagementProvider, Provider, Punchthrough, - PunchthroughProvider, PunchthroughToken, RawMutexProvider, TimeProvider, UnblockedOrTimedOut, + AddressSpaceProvider, IPInterfaceProvider, ImmediatelyWokenUp, PageManagementProvider, + Provider, Punchthrough, PunchthroughProvider, PunchthroughToken, RawMutexProvider, + TimeProvider, UnblockedOrTimedOut, }; use litebox_common_linux::PunchthroughSyscall; use litebox_common_linux::errno::Errno; @@ -84,6 +85,12 @@ impl<'a, Host: HostInterface> PunchthroughToken for LinuxPunchthroughToken<'a, H impl Provider for LinuxKernel {} +impl AddressSpaceProvider for LinuxKernel { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: litebox::platform::address_space::AddressSpaceKind = + litebox::platform::address_space::AddressSpaceKind::Isolated; +} + // TODO: implement pointer validation to ensure the pointers are in user space. type UserConstPtr = litebox::platform::common_providers::userspace_pointers::UserConstPtr< litebox::platform::common_providers::userspace_pointers::NoValidation, diff --git a/litebox_platform_linux_userland/src/lib.rs b/litebox_platform_linux_userland/src/lib.rs index a1ccf7fdc..dcf2f0ee4 100644 --- a/litebox_platform_linux_userland/src/lib.rs +++ b/litebox_platform_linux_userland/src/lib.rs @@ -105,6 +105,8 @@ pub struct LinuxUserland { /// is persistent across multiple process executions, however, it is ephemeral across true /// reboots. boot_id: std::sync::OnceLock>, + /// VA partition allocator for multi-process support. + va_partitions: VaPartitionAllocator, } impl core::fmt::Debug for LinuxUserland { @@ -236,6 +238,7 @@ impl LinuxUserland { reserved_pages, cow_regions: std::sync::RwLock::new(std::collections::BTreeMap::new()), boot_id: std::sync::OnceLock::new(), + va_partitions: VaPartitionAllocator::new(), }; Box::leak(Box::new(platform)) } @@ -416,6 +419,32 @@ impl LinuxUserland { impl litebox::platform::Provider for LinuxUserland {} +impl litebox::platform::AddressSpaceProvider for LinuxUserland { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: litebox::platform::address_space::AddressSpaceKind = + litebox::platform::address_space::AddressSpaceKind::SharedMemory; + + fn create_address_space( + &self, + ) -> Result { + self.va_partitions.allocate() + } + + fn destroy_address_space( + &self, + id: Self::AddressSpaceId, + ) -> Result<(), litebox::platform::address_space::AddressSpaceError> { + self.va_partitions.release(id) + } + + fn address_space_range( + &self, + id: Self::AddressSpaceId, + ) -> Result, litebox::platform::address_space::AddressSpaceError> { + self.va_partitions.range(id) + } +} + impl litebox::platform::SignalProvider for LinuxUserland { type Signal = litebox_common_linux::signal::Signal; @@ -2272,6 +2301,90 @@ impl litebox::mm::linux::VmemPageFaultHandler for LinuxUserland { } } +// --------------------------------------------------------------------------- +// VA Partition Allocator +// --------------------------------------------------------------------------- + +/// Allocates 1 TiB VA partitions from the 47-bit userland address space. +/// +/// The usable VA range `0x1_0000..0x7FFF_FFFF_F000` (~128 TiB) is divided +/// into 1 TiB slots. Slot 0 covers `0x0..0x100_0000_0000` (though only +/// `0x1_0000..` is usable), slot 1 covers `0x100_0000_0000..0x200_0000_0000`, +/// and so on. A simple bitmap tracks which slots are allocated. +struct VaPartitionAllocator { + /// Bitmap of allocated partitions. Bit N = partition N is allocated. + /// 128 bits covers all 128 possible 1-TiB partitions in 47-bit VA. + allocated: std::sync::Mutex, +} + +impl VaPartitionAllocator { + /// Size of each VA partition: 1 TiB. + const PARTITION_SIZE: usize = 1 << 40; // 0x100_0000_0000 + /// Maximum partition index (exclusive). 128 TiB / 1 TiB = 128. + const MAX_PARTITIONS: u32 = 128; + /// First allocatable partition. Partition 0 is reserved for the init + /// process (its range is the platform default `TASK_ADDR_MIN..TASK_ADDR_MAX`). + const FIRST_ALLOC: u32 = 1; + + fn new() -> Self { + // Mark partition 0 as pre-allocated (init process). + Self { + allocated: std::sync::Mutex::new(1), + } + } + + fn allocate(&self) -> Result { + let mut bitmap = self.allocated.lock().unwrap(); + for i in Self::FIRST_ALLOC..Self::MAX_PARTITIONS { + if *bitmap & (1u128 << i) == 0 { + *bitmap |= 1u128 << i; + return Ok(i); + } + } + Err(litebox::platform::address_space::AddressSpaceError::NoSpace) + } + + fn release(&self, id: u32) -> Result<(), litebox::platform::address_space::AddressSpaceError> { + if !(Self::FIRST_ALLOC..Self::MAX_PARTITIONS).contains(&id) { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + let mut bitmap = self.allocated.lock().unwrap(); + if *bitmap & (1u128 << id) == 0 { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + *bitmap &= !(1u128 << id); + Ok(()) + } + + fn range( + &self, + id: u32, + ) -> Result, litebox::platform::address_space::AddressSpaceError> { + if id >= Self::MAX_PARTITIONS { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + // Verify partition is allocated + { + let bitmap = self.allocated.lock().unwrap(); + if *bitmap & (1u128 << id) == 0 { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + } + let start = (id as usize) * Self::PARTITION_SIZE; + let end = start + Self::PARTITION_SIZE; + // Clamp to usable VA space + let start = start.max(0x1_0000); // TASK_ADDR_MIN + let end = end.min(0x7FFF_FFFF_F000); // TASK_ADDR_MAX + // Align to page boundary + let start = (start + 0xFFF) & !0xFFF; + let end = end & !0xFFF; + if start >= end { + return Err(litebox::platform::address_space::AddressSpaceError::InvalidId); + } + Ok(start..end) + } +} + #[cfg(test)] mod tests { use core::sync::atomic::AtomicU32; diff --git a/litebox_platform_windows_userland/src/lib.rs b/litebox_platform_windows_userland/src/lib.rs index 396282544..85ac7a56b 100644 --- a/litebox_platform_windows_userland/src/lib.rs +++ b/litebox_platform_windows_userland/src/lib.rs @@ -333,6 +333,12 @@ impl WindowsUserland { impl litebox::platform::Provider for WindowsUserland {} +impl litebox::platform::AddressSpaceProvider for WindowsUserland { + type AddressSpaceId = u32; + const ADDRESS_SPACE_KIND: litebox::platform::address_space::AddressSpaceKind = + litebox::platform::address_space::AddressSpaceKind::SharedMemory; +} + impl litebox::platform::SignalProvider for WindowsUserland { type Signal = litebox_common_linux::signal::Signal; diff --git a/litebox_runner_linux_on_windows_userland/tests/common/mod.rs b/litebox_runner_linux_on_windows_userland/tests/common/mod.rs index 977150767..6bad0d8bf 100644 --- a/litebox_runner_linux_on_windows_userland/tests/common/mod.rs +++ b/litebox_runner_linux_on_windows_userland/tests/common/mod.rs @@ -83,8 +83,12 @@ impl TestLauncher { ]; let envp = vec![CString::new("PATH=/bin").unwrap()]; let shim = self.shim_builder.build(); + let mut task = self.platform.init_task(); + // Use deterministic guest PID 1 (init) to match process registry. + task.pid = 1; + task.ppid = 0; let program = shim - .load_program(fs, self.platform.init_task(), executable_path, argv, envp) + .load_program(fs, task, executable_path, argv, envp) .unwrap(); unsafe { litebox_platform_windows_userland::run_thread( diff --git a/litebox_runner_linux_userland/src/lib.rs b/litebox_runner_linux_userland/src/lib.rs index 90b24878b..7e170951f 100644 --- a/litebox_runner_linux_userland/src/lib.rs +++ b/litebox_runner_linux_userland/src/lib.rs @@ -380,13 +380,12 @@ pub fn run(cli_args: CliArgs) -> Result<()> { envp }; - let program = shim.load_program( - initial_file_system, - platform.init_task(), - prog_path, - argv, - envp, - )?; + let mut task_params = platform.init_task(); + // Use deterministic guest PIDs starting from 1 (init process). + task_params.pid = 1; + task_params.ppid = 0; + + let program = shim.load_program(initial_file_system, task_params, prog_path, argv, envp)?; #[cfg(feature = "lock_tracing")] litebox::sync::start_recording(); diff --git a/litebox_runner_linux_userland/tests/common/mod.rs b/litebox_runner_linux_userland/tests/common/mod.rs index 60d760aa9..5c2582127 100644 --- a/litebox_runner_linux_userland/tests/common/mod.rs +++ b/litebox_runner_linux_userland/tests/common/mod.rs @@ -98,12 +98,29 @@ fn find_rewriter_source_files() -> Vec { /// Compile C code into an executable with caching pub fn compile(src_path: &str, unique_name: &str, exec_or_lib: bool, nolibc: bool) -> PathBuf { + compile_inner(src_path, unique_name, exec_or_lib, nolibc, false) +} + +#[allow(dead_code)] +pub fn compile_static_pie(src_path: &str, unique_name: &str) -> PathBuf { + compile_inner(src_path, unique_name, true, false, true) +} + +fn compile_inner( + src_path: &str, + unique_name: &str, + exec_or_lib: bool, + nolibc: bool, + static_pie: bool, +) -> PathBuf { let dir_path = std::env::var("OUT_DIR").unwrap(); let path = std::path::Path::new(dir_path.as_str()).join(unique_name); let output = path.to_str().unwrap(); let mut args = vec!["-o", output, src_path]; - if exec_or_lib { + if static_pie { + args.extend_from_slice(&["-static-pie", "-fpie"]); + } else if exec_or_lib { args.push("-static"); } if nolibc { diff --git a/litebox_runner_linux_userland/tests/loader.rs b/litebox_runner_linux_userland/tests/loader.rs index c96168f50..e7d929137 100644 --- a/litebox_runner_linux_userland/tests/loader.rs +++ b/litebox_runner_linux_userland/tests/loader.rs @@ -100,8 +100,12 @@ impl TestLauncher { ]; let fs = std::sync::Arc::new(self.fs); let shim = self.shim_builder.build(); + let mut task = self.platform.init_task(); + // Use deterministic guest PID 1 (init) to match process registry. + task.pid = 1; + task.ppid = 0; let program = shim - .load_program(fs, self.platform.init_task(), executable_path, argv, envp) + .load_program(fs, task, executable_path, argv, envp) .unwrap(); unsafe { litebox_platform_linux_userland::run_thread( diff --git a/litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c b/litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c new file mode 100644 index 000000000..1f7c4e827 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/cat_stdin.c @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: read stdin and write to stdout until EOF, then exit. +#include + +int main(void) { + char buf[256]; + ssize_t n; + while ((n = read(STDIN_FILENO, buf, sizeof(buf))) > 0) { + write(STDOUT_FILENO, buf, n); + } + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/echo_hello.c b/litebox_runner_linux_userland/tests/multiprocess/echo_hello.c new file mode 100644 index 000000000..ccbb2269f --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/echo_hello.c @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: write "hello\n" to stdout and exit. +#include + +int main(void) { + const char msg[] = "hello\n"; + write(STDOUT_FILENO, msg, sizeof(msg) - 1); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/exit_with.c b/litebox_runner_linux_userland/tests/multiprocess/exit_with.c new file mode 100644 index 000000000..b57faac8c --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/exit_with.c @@ -0,0 +1,10 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: exit with the status given as argv[1]. +#include + +int main(int argc, char *argv[]) { + if (argc < 2) return 1; + return atoi(argv[1]); +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c b/litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c new file mode 100644 index 000000000..20e306fed --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/fork_exec_wait.c @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: vfork + exec a helper program, wait for it, verify exit status. +// Uses vfork() explicitly since our fork implementation has vfork semantics. +// The child must only call execve/_exit (no library calls). + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "usage: fork_exec_wait \n"); + return 1; + } + const char *helper = argv[1]; + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + // Child: exec the helper with exit code 42. + execl(helper, helper, "42", (char *)NULL); + // If exec fails, _exit immediately. + _exit(127); + } + // Parent + int wstatus; + pid_t waited = waitpid(pid, &wstatus, 0); + if (waited != pid) { + fprintf(stderr, "waitpid returned %d, expected %d\n", waited, pid); + return 1; + } + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 42) { + fprintf(stderr, "unexpected exit status: 0x%x (WIFEXITED=%d, WEXITSTATUS=%d)\n", + wstatus, WIFEXITED(wstatus), WEXITSTATUS(wstatus)); + return 1; + } + printf("fork_exec_wait: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/kill_test.c b/litebox_runner_linux_userland/tests/multiprocess/kill_test.c new file mode 100644 index 000000000..6724ca8f9 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/kill_test.c @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: vfork + exec a sleeper, kill it with SIGKILL, verify WIFSIGNALED. + +#include +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "usage: kill_test \n"); + return 1; + } + const char *sleeper = argv[1]; + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + execl(sleeper, sleeper, (char *)NULL); + _exit(127); + } + + // Give the child a moment to start, then kill it. + // (In practice the child is already running after vfork returns to parent.) + if (kill(pid, SIGKILL) < 0) { + perror("kill"); + return 1; + } + + int wstatus; + pid_t w = waitpid(pid, &wstatus, 0); + if (w != pid) { + fprintf(stderr, "waitpid returned %d, expected %d\n", w, pid); + return 1; + } + if (!WIFSIGNALED(wstatus) || WTERMSIG(wstatus) != SIGKILL) { + fprintf(stderr, "unexpected status: 0x%x (WIFSIGNALED=%d, WTERMSIG=%d)\n", + wstatus, WIFSIGNALED(wstatus), WTERMSIG(wstatus)); + return 1; + } + + printf("kill_test: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c b/litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c new file mode 100644 index 000000000..5863cf4b5 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/path_exec_test.c @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: execve with a bare name (no '/') triggers PATH resolution in the shim. + +#include +#include +#include +#include +#include +#include + +int main(void) { + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + // Bare name — no '/' — triggers PATH lookup in the shim's sys_execve. + char *const args[] = {"exit_with", "99", NULL}; + char *const envp[] = {"PATH=/out", NULL}; + execve("exit_with", args, envp); + _exit(127); + } + + int wstatus; + pid_t w = waitpid(pid, &wstatus, 0); + if (w != pid) { + fprintf(stderr, "waitpid returned %d, expected %d\n", w, pid); + return 1; + } + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 99) { + fprintf(stderr, "unexpected status: 0x%x (WIFEXITED=%d, WEXITSTATUS=%d)\n", + wstatus, WIFEXITED(wstatus), WEXITSTATUS(wstatus)); + return 1; + } + + printf("path_exec_test: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c b/litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c new file mode 100644 index 000000000..d37a263f7 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/pipe_fork.c @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: pipe between two forked children (simulates `echo hello | cat`). +// Uses vfork + exec so children detach and run concurrently. +// +// Child 1: dup2 pipe write end to stdout, exec "echo_hello" helper +// Child 2: dup2 pipe read end to stdin, exec "cat_stdin" helper +// Parent: close pipe ends, wait for both children. + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 3) { + fprintf(stderr, "usage: pipe_fork \n"); + return 1; + } + const char *echo_path = argv[1]; + const char *cat_path = argv[2]; + + int pipefd[2]; + if (pipe(pipefd) < 0) { + perror("pipe"); + return 1; + } + + // Fork child 1: writer (echo_hello) + pid_t writer = vfork(); + if (writer < 0) { + perror("vfork writer"); + return 1; + } + if (writer == 0) { + // Redirect stdout to pipe write end + dup2(pipefd[1], STDOUT_FILENO); + close(pipefd[0]); + close(pipefd[1]); + execl(echo_path, echo_path, (char *)NULL); + _exit(127); + } + + // Fork child 2: reader (cat_stdin) + pid_t reader = vfork(); + if (reader < 0) { + perror("vfork reader"); + return 1; + } + if (reader == 0) { + // Redirect stdin to pipe read end + dup2(pipefd[0], STDIN_FILENO); + close(pipefd[0]); + close(pipefd[1]); + execl(cat_path, cat_path, (char *)NULL); + _exit(127); + } + + // Parent: close both pipe ends so children get proper EOF + close(pipefd[0]); + close(pipefd[1]); + + // Wait for both children + int wstatus; + for (int i = 0; i < 2; i++) { + pid_t w = wait(&wstatus); + if (w < 0) { + perror("wait"); + return 1; + } + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) { + fprintf(stderr, "child %d exited with status 0x%x\n", w, wstatus); + return 1; + } + } + + printf("pipe_fork: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/sleeper.c b/litebox_runner_linux_userland/tests/multiprocess/sleeper.c new file mode 100644 index 000000000..33e3e82fa --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/sleeper.c @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Helper: block forever until killed by a signal. + +#include + +int main(void) { + pause(); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c b/litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c new file mode 100644 index 000000000..1715b7be8 --- /dev/null +++ b/litebox_runner_linux_userland/tests/multiprocess/wnohang_test.c @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +// Test: waitpid with WNOHANG — poll until child exits. + +#include +#include +#include +#include + +int main(int argc, char *argv[]) { + if (argc < 2) { + fprintf(stderr, "usage: wnohang_test \n"); + return 1; + } + const char *helper = argv[1]; + + pid_t pid = vfork(); + if (pid < 0) { + perror("vfork"); + return 1; + } + if (pid == 0) { + execl(helper, helper, "7", (char *)NULL); + _exit(127); + } + + // Poll with WNOHANG until child exits. + int wstatus; + int attempts = 0; + pid_t w; + while (1) { + w = waitpid(pid, &wstatus, WNOHANG); + if (w < 0) { + perror("waitpid"); + return 1; + } + if (w == pid) { + break; // Child exited. + } + // w == 0 means child still running; keep polling. + attempts++; + if (attempts > 1000000) { + fprintf(stderr, "child did not exit after %d polls\n", attempts); + return 1; + } + } + + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 7) { + fprintf(stderr, "unexpected status: 0x%x (WIFEXITED=%d, WEXITSTATUS=%d)\n", + wstatus, WIFEXITED(wstatus), WEXITSTATUS(wstatus)); + return 1; + } + + printf("wnohang_test: OK\n"); + return 0; +} diff --git a/litebox_runner_linux_userland/tests/run.rs b/litebox_runner_linux_userland/tests/run.rs index f9e3c6b7a..aec32458f 100644 --- a/litebox_runner_linux_userland/tests/run.rs +++ b/litebox_runner_linux_userland/tests/run.rs @@ -604,3 +604,138 @@ fn test_shebang() { "shebang test failed, output: {output_str}" ); } + +// Multi-process tests (fork, pipe, waitpid) + +#[test] +fn test_fork_exec_wait() { + // Compile the main test program (static non-PIE, runs as init process). + let main_target = common::compile( + "./tests/multiprocess/fork_exec_wait.c", + "fork_exec_wait", + true, + false, + ); + // Compile the helper as static-pie so it can load in any VA partition. + let helper_target = common::compile_static_pie("./tests/multiprocess/exit_with.c", "exit_with"); + + // Build a runner with the helper binary added to the guest filesystem. + let mut runner = Runner::new(&main_target, "fork_exec_wait"); + runner.with_fs_path(|out_dir| { + // Rewrite and place the helper binary in the guest filesystem. + let guest_helper = out_dir.join("out/exit_with"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite exit_with helper"); + }); + // Pass the guest path to the helper as an argument. + runner.arg("/out/exit_with"); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("fork_exec_wait: OK"), + "fork_exec_wait test failed, output: {output_str}" + ); +} + +#[test] +fn test_pipe_fork() { + // Compile main test (static non-PIE, runs as init) + let main_target = common::compile("./tests/multiprocess/pipe_fork.c", "pipe_fork", true, false); + // Compile helpers as static-pie (loaded in child VA partitions) + let echo_target = common::compile_static_pie("./tests/multiprocess/echo_hello.c", "echo_hello"); + let cat_target = common::compile_static_pie("./tests/multiprocess/cat_stdin.c", "cat_stdin"); + + let mut runner = Runner::new(&main_target, "pipe_fork"); + runner.with_fs_path(|out_dir| { + let guest_echo = out_dir.join("out/echo_hello"); + let success = common::rewrite_with_cache(&echo_target, &guest_echo, &[]); + assert!(success, "failed to rewrite echo_hello helper"); + + let guest_cat = out_dir.join("out/cat_stdin"); + let success = common::rewrite_with_cache(&cat_target, &guest_cat, &[]); + assert!(success, "failed to rewrite cat_stdin helper"); + }); + runner.arg("/out/echo_hello"); + runner.arg("/out/cat_stdin"); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("hello"), + "pipe_fork test failed — expected 'hello' in output, got: {output_str}" + ); + assert!( + output_str.contains("pipe_fork: OK"), + "pipe_fork test failed, output: {output_str}" + ); +} + +#[test] +fn test_kill_signal() { + let main_target = common::compile("./tests/multiprocess/kill_test.c", "kill_test", true, false); + let helper_target = common::compile_static_pie("./tests/multiprocess/sleeper.c", "sleeper"); + + let mut runner = Runner::new(&main_target, "kill_test"); + runner.with_fs_path(|out_dir| { + let guest_helper = out_dir.join("out/sleeper"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite sleeper helper"); + }); + runner.arg("/out/sleeper"); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("kill_test: OK"), + "kill_test failed, output: {output_str}" + ); +} + +#[test] +fn test_waitpid_wnohang() { + let main_target = common::compile( + "./tests/multiprocess/wnohang_test.c", + "wnohang_test", + true, + false, + ); + let helper_target = + common::compile_static_pie("./tests/multiprocess/exit_with.c", "exit_with_wnohang"); + + let mut runner = Runner::new(&main_target, "wnohang_test"); + runner.with_fs_path(|out_dir| { + let guest_helper = out_dir.join("out/exit_with"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite exit_with helper"); + }); + runner.arg("/out/exit_with"); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("wnohang_test: OK"), + "wnohang_test failed, output: {output_str}" + ); +} + +#[test] +fn test_exec_path_lookup() { + let main_target = common::compile( + "./tests/multiprocess/path_exec_test.c", + "path_exec_test", + true, + false, + ); + let helper_target = + common::compile_static_pie("./tests/multiprocess/exit_with.c", "exit_with_path"); + + let mut runner = Runner::new(&main_target, "path_exec_test"); + runner.with_fs_path(|out_dir| { + let guest_helper = out_dir.join("out/exit_with"); + let success = common::rewrite_with_cache(&helper_target, &guest_helper, &[]); + assert!(success, "failed to rewrite exit_with helper"); + }); + let output = runner.output(); + let output_str = String::from_utf8_lossy(&output); + assert!( + output_str.contains("path_exec_test: OK"), + "path_exec_test failed, output: {output_str}" + ); +} diff --git a/litebox_shim_linux/src/lib.rs b/litebox_shim_linux/src/lib.rs index c1c3d1b47..7718fed77 100644 --- a/litebox_shim_linux/src/lib.rs +++ b/litebox_shim_linux/src/lib.rs @@ -17,6 +17,7 @@ extern crate alloc; use alloc::vec; use alloc::vec::Vec; +use alloc::collections::vec_deque::VecDeque; use alloc::sync::Arc; use core::cell::{Cell, RefCell}; use litebox::{ @@ -101,7 +102,8 @@ impl litebox::shim::EnterShim for LinuxShimEntrypoints { if info.kernel_mode && info.exception == litebox::shim::Exception::PAGE_FAULT { if unsafe { self.task - .global + .process + .borrow() .pm .handle_page_fault(info.cr2, info.error_code.into()) } @@ -183,12 +185,20 @@ impl LinuxShimBuilder { } /// Build the shim. + /// + /// # Panics + /// Panics if the init process cannot be created in the process registry. pub fn build(self) -> LinuxShim { let mut net = Network::new(&self.litebox); net.set_platform_interaction(litebox::net::PlatformInteraction::Manual); + let process_registry = litebox::process::ProcessRegistry::with_max_processes(128); + // Register the init process (PID 1). + process_registry + .create_process(None) + .expect("failed to create init process"); + let global = Arc::new(GlobalState { platform: self.platform, - pm: PageManager::new(&self.litebox), futex_manager: FutexManager::new(), pipes: Pipes::new(&self.litebox), net: litebox::sync::Mutex::new(net), @@ -197,15 +207,21 @@ impl LinuxShimBuilder { next_thread_id: 2.into(), // start from 2, as 1 is used by the main thread litebox: self.litebox, unix_addr_table: litebox::sync::RwLock::new(syscalls::unix::UnixAddrTable::new()), + process_registry, + signal_mailboxes: litebox::sync::Mutex::new(alloc::collections::BTreeMap::new()), + }); + let init_process = Arc::new(ProcessState { + pm: PageManager::new(&global.litebox), + address_space_id: None, }); - LinuxShim(global) + LinuxShim(global, init_process) } } -pub struct LinuxShim(Arc>); +pub struct LinuxShim(Arc>, Arc); impl Clone for LinuxShim { fn clone(&self) -> Self { - Self(self.0.clone()) + Self(self.0.clone(), self.1.clone()) } } @@ -238,6 +254,7 @@ impl LinuxShim { _not_send: core::marker::PhantomData, task: Task { global: self.0.clone(), + process: RefCell::new(self.1.clone()), thread: syscalls::process::ThreadState::new_process(pid), wait_state: wait::WaitState::new(self.0.platform), pid, @@ -254,6 +271,8 @@ impl LinuxShim { fs: Arc::new(syscalls::file::FsState::new()).into(), files: files.into(), signals: syscalls::signal::SignalState::new_process(), + fork_context: RefCell::new(None), + signal_mailbox: self.0.register_signal_mailbox(pid), }, }; @@ -274,9 +293,9 @@ impl LinuxShim { }) } - /// Get the global page manager + /// Get the page manager for the initial process. pub fn page_manager(&self) -> &PageManager { - &self.0.pm + &self.1.pm } /// Perform queued network interactions with the outside world. @@ -948,6 +967,15 @@ impl Task { } SyscallRequest::Getpid => Ok(self.sys_getpid().reinterpret_as_unsigned() as usize), SyscallRequest::Getppid => Ok(self.sys_getppid().reinterpret_as_unsigned() as usize), + SyscallRequest::Setpgid { pid, pgid } => { + self.sys_setpgid(pid, pgid)?; + Ok(0) + } + SyscallRequest::Getpgid { pid } => { + Ok(self.sys_getpgid(pid)?.reinterpret_as_unsigned() as usize) + } + SyscallRequest::Getpgrp => Ok(self.sys_getpgrp()?.reinterpret_as_unsigned() as usize), + SyscallRequest::Setsid => Ok(self.sys_setsid()?.reinterpret_as_unsigned() as usize), SyscallRequest::Getuid => Ok(self.sys_getuid() as usize), SyscallRequest::Getgid => Ok(self.sys_getgid() as usize), SyscallRequest::Geteuid => Ok(self.sys_geteuid() as usize), @@ -991,6 +1019,11 @@ impl Task { SyscallRequest::Tgkill { tgid, tid, sig } => self.sys_tgkill(tgid, tid, sig), SyscallRequest::Sigaltstack { ss, old_ss } => self.sys_sigaltstack(ss, old_ss, ctx), SyscallRequest::Alarm { seconds } => syscall!(sys_alarm(seconds)), + SyscallRequest::Wait4 { + pid, + wstatus, + options, + } => self.sys_wait4(pid, wstatus, options), _ => { log_unsupported!("{request:?}"); Err(Errno::ENOSYS) @@ -999,14 +1032,12 @@ impl Task { } } -/// Global shim state, shared across all tasks. +/// Global shim state, shared across all tasks and all processes. struct GlobalState { /// The platform instance used throughout the shim. platform: &'static Platform, /// The LiteBox instance used throughout the shim. litebox: litebox::LiteBox, - /// The page manager for managing virtual memory. - pm: litebox::mm::PageManager, /// The futex manager for handling futex operations. futex_manager: FutexManager, /// The anonymous pipe implementation. @@ -1022,10 +1053,94 @@ struct GlobalState { next_thread_id: core::sync::atomic::AtomicI32, /// UNIX domain socket address table unix_addr_table: litebox::sync::RwLock>, + /// Process registry for tracking parent-child relationships and exit status. + process_registry: litebox::process::ProcessRegistry, + /// Cross-process signal mailboxes, keyed by PID. + /// Used for delivering signals (e.g., SIGCHLD) between processes. + #[allow(clippy::type_complexity)] + signal_mailboxes: litebox::sync::Mutex< + Platform, + alloc::collections::BTreeMap< + i32, + Arc< + litebox::sync::Mutex< + Platform, + VecDeque<( + litebox_common_linux::signal::Signal, + litebox_common_linux::signal::Siginfo, + )>, + >, + >, + >, + >, +} + +impl GlobalState { + /// Register a signal mailbox for a process. + fn register_signal_mailbox( + &self, + pid: i32, + ) -> Arc< + litebox::sync::Mutex< + Platform, + VecDeque<( + litebox_common_linux::signal::Signal, + litebox_common_linux::signal::Siginfo, + )>, + >, + > { + let mailbox = Arc::new(litebox::sync::Mutex::new(VecDeque::new())); + self.signal_mailboxes.lock().insert(pid, mailbox.clone()); + mailbox + } + + /// Deregister a signal mailbox for a process. + fn deregister_signal_mailbox(&self, pid: i32) { + self.signal_mailboxes.lock().remove(&pid); + } + + /// Send a signal to a process by PID. Returns true if the target mailbox exists. + fn send_signal_to_process( + &self, + target_pid: i32, + signal: litebox_common_linux::signal::Signal, + siginfo: litebox_common_linux::signal::Siginfo, + ) -> bool { + const MAX_MAILBOX_SIZE: usize = 256; + // Clone the Arc and drop the outer lock before acquiring the mailbox + // lock to prevent nested lock acquisition (deadlock risk). + let mailbox = { + let mailboxes = self.signal_mailboxes.lock(); + mailboxes.get(&target_pid).cloned() + }; + if let Some(mailbox) = mailbox { + let mut mbox = mailbox.lock(); + // Cap mailbox size to prevent unbounded memory growth. + if mbox.len() >= MAX_MAILBOX_SIZE { + mbox.pop_front(); + } + mbox.push_back((signal, siginfo)); + true + } else { + false + } + } +} + +/// Per-process state, shared among threads of the same process. +struct ProcessState { + /// The page manager for this process's virtual memory / address space. + pm: litebox::mm::PageManager, + /// Address space ID for child processes that have exec'd into their own + /// VA partition. `None` for the init process (which uses the default + /// platform address space) and for vfork children that haven't exec'd yet. + #[allow(dead_code)] + address_space_id: Option<::AddressSpaceId>, } struct Task { global: Arc>, + process: RefCell>, wait_state: wait::WaitState, thread: syscalls::process::ThreadState, /// Process ID @@ -1045,6 +1160,18 @@ struct Task { files: RefCell>>, /// Signal state signals: syscalls::signal::SignalState, + /// Fork context: present on vfork children, used to signal parent on exec/exit. + fork_context: RefCell>, + /// Cross-process signal mailbox for this process (shared with GlobalState). + signal_mailbox: Arc< + litebox::sync::Mutex< + Platform, + VecDeque<( + litebox_common_linux::signal::Signal, + litebox_common_linux::signal::Siginfo, + )>, + >, + >, } impl Drop for Task { @@ -1071,6 +1198,10 @@ mod test_utils { Task { wait_state: wait::WaitState::new(self.platform), thread: syscalls::process::ThreadState::new_process(pid), + process: RefCell::new(Arc::new(ProcessState { + pm: PageManager::new(&self.litebox), + address_space_id: None, + })), pid, ppid: 0, tid: pid, @@ -1084,6 +1215,8 @@ mod test_utils { fs: Arc::new(syscalls::file::FsState::new()).into(), files: files.into(), signals: syscalls::signal::SignalState::new_process(), + fork_context: RefCell::new(None), + signal_mailbox: self.register_signal_mailbox(pid), global: self, } } @@ -1099,6 +1232,7 @@ mod test_utils { let task = Task { wait_state: wait::WaitState::new(self.global.platform), global: self.global.clone(), + process: self.process.clone(), thread: self.thread.new_thread(tid)?, pid: self.pid, ppid: self.ppid, @@ -1108,6 +1242,8 @@ mod test_utils { fs: self.fs.clone(), files: self.files.clone(), signals: self.signals.clone_for_new_task(), + fork_context: RefCell::new(None), + signal_mailbox: self.signal_mailbox.clone(), }; Some(task) } diff --git a/litebox_shim_linux/src/loader/elf.rs b/litebox_shim_linux/src/loader/elf.rs index 0d62030a8..3d7d40d4b 100644 --- a/litebox_shim_linux/src/loader/elf.rs +++ b/litebox_shim_linux/src/loader/elf.rs @@ -72,13 +72,15 @@ impl litebox_common_linux::loader::MapMemory for ElfFile<'_, FS> { type Error = Errno; fn reserve(&mut self, len: usize, align: usize) -> Result { + // Compute a hint address within this process's VA partition. + let hint = self.task.process.borrow().pm.addr_min() + super::PIE_LOAD_OFFSET; // Allocate a mapping large enough that even if it's maximally misaligned we can // still fit `len` bytes. let mapping_len = len + (align.max(PAGE_SIZE) - PAGE_SIZE); let mapping_ptr = self .task .sys_mmap( - super::DEFAULT_LOW_ADDR, + hint, mapping_len, litebox_common_linux::ProtFlags::PROT_NONE, litebox_common_linux::MapFlags::MAP_ANONYMOUS @@ -202,6 +204,7 @@ impl<'a, FS: ShimFS> ElfLoader<'a, FS> { mut aux: AuxVec, ) -> Result { let global = &self.main.file.task.global; + let process = self.main.file.task.process.borrow(); // Load the main ELF file first so that it gets privileged addresses. let info = self @@ -220,7 +223,7 @@ impl<'a, FS: ShimFS> ElfLoader<'a, FS> { None }; - global.pm.set_initial_brk(info.brk); + process.pm.set_initial_brk(info.brk); aux.insert(AuxKey::AT_PAGESZ, PAGE_SIZE); aux.insert(AuxKey::AT_PHDR, info.phdrs_addr); aux.insert(AuxKey::AT_PHENT, info.phent_size()); @@ -236,7 +239,7 @@ impl<'a, FS: ShimFS> ElfLoader<'a, FS> { let sp = unsafe { let length = litebox::mm::linux::NonZeroPageSize::new(super::DEFAULT_STACK_SIZE) .expect("DEFAULT_STACK_SIZE is not page-aligned"); - global + process .pm .create_stack_pages(None, length, CreatePagesFlags::empty()) .map_err(ElfLoaderError::MappingError)? diff --git a/litebox_shim_linux/src/loader/mod.rs b/litebox_shim_linux/src/loader/mod.rs index a7e370cd3..924d747ac 100644 --- a/litebox_shim_linux/src/loader/mod.rs +++ b/litebox_shim_linux/src/loader/mod.rs @@ -10,6 +10,7 @@ mod stack; pub(crate) const DEFAULT_STACK_SIZE: usize = 8 * 1024 * 1024; // 8 MB -/// A default low address is used for the binary (which grows upwards) to avoid -/// conflicts with the kernel's memory mappings (which grows downwards). -pub(crate) const DEFAULT_LOW_ADDR: usize = 0x1000_0000; +/// Offset added to the process's `addr_min` when computing the PIE load hint. +/// This places binaries low in the partition (growing upwards), leaving +/// room for top-down allocations (stack, mmap) at the high end. +pub(crate) const PIE_LOAD_OFFSET: usize = 0x1000_0000; // 256 MiB diff --git a/litebox_shim_linux/src/syscalls/file.rs b/litebox_shim_linux/src/syscalls/file.rs index 7b900243d..8a52f86f8 100644 --- a/litebox_shim_linux/src/syscalls/file.rs +++ b/litebox_shim_linux/src/syscalls/file.rs @@ -76,6 +76,27 @@ impl FilesState { } } + /// Clone the file descriptor table for fork. + /// + /// The child gets its own `RawDescriptorStorage` with independent `OwnedFd` + /// instances (so close in the child does not poison the parent's FDs). + /// The underlying open file descriptions are shared via Arc in the global + /// descriptor table, tracked by `process_refcount`. + pub(crate) fn clone_for_fork( + &self, + descriptors: &mut litebox::fd::Descriptors, + ) -> Self { + let cloned_rds = self + .raw_descriptor_store + .read() + .clone_for_child(descriptors, None); + Self { + fs: self.fs.clone(), + raw_descriptor_store: litebox::sync::RwLock::new(cloned_rds), + max_fd: AtomicUsize::new(self.max_fd.load(Ordering::Relaxed)), + } + } + pub(crate) fn set_max_fd(&self, max_fd: usize) { self.max_fd.store(max_fd, Ordering::Relaxed); } @@ -428,7 +449,7 @@ impl Task { ) .flatten(); if let Err(Errno::EPIPE) = res { - unimplemented!("send SIGPIPE to the current task"); + self.raise_sigpipe(); } res } @@ -589,47 +610,78 @@ impl Task { let iovs: &[IoReadVec>] = &iovec.to_owned_slice(iovcnt).ok_or(Errno::EFAULT)?; let files = self.files.borrow(); let mut total_read = 0; - let mut kernel_buffer = vec![ - 0u8; - iovs.iter() - .map(|i| i.iov_len) - .max() - .unwrap_or_default() - .min(super::super::MAX_KERNEL_BUF_SIZE) - ]; - for iov in iovs { - if iov.iov_len == 0 { - continue; + + // Check once whether this FD is a pipe to avoid per-iov lock acquisition. + let pipe_fd = { + let rds = files.raw_descriptor_store.read(); + rds.fd_from_raw_integer::>(raw_fd) + .ok() + }; + + if let Some(pipe_fd) = pipe_fd { + // Pipe-specific readv path: avoids borrow conflict with kernel_buffer. + for iov in iovs { + if iov.iov_len == 0 { + continue; + } + let Ok(_iov_len) = isize::try_from(iov.iov_len) else { + return Err(Errno::EINVAL); + }; + let mut pipe_buf = vec![0u8; iov.iov_len.min(super::super::MAX_KERNEL_BUF_SIZE)]; + let n = self + .global + .pipes + .read(&self.wait_cx(), &pipe_fd, &mut pipe_buf) + .map_err(Errno::from)?; + iov.iov_base + .copy_from_slice(0, &pipe_buf[..n]) + .ok_or(Errno::EFAULT)?; + total_read += n; + if n < iov.iov_len { + break; + } } - let Ok(_iov_len) = isize::try_from(iov.iov_len) else { - return Err(Errno::EINVAL); - }; - // TODO: The data transfers performed by readv() and writev() are atomic: the data - // written by writev() is written as a single block that is not intermingled with - // output from writes in other processes - let size = files - .run_on_raw_fd( - raw_fd, - |fd| { - files - .fs - .read(fd, &mut kernel_buffer, None) - .map_err(Errno::from) - }, - |_fd| todo!("net"), - |_fd| todo!("pipes"), - |_fd| todo!("eventfd"), - |_fd| Err(Errno::EINVAL), - |_fd| todo!("unix"), - ) - .flatten()?; - iov.iov_base - .copy_from_slice(0, &kernel_buffer[..size]) - .ok_or(Errno::EFAULT)?; - total_read += size; - if size < iov.iov_len { - // Okay to transfer fewer bytes than requested - break; + } else { + let mut kernel_buffer = vec![ + 0u8; + iovs.iter() + .map(|i| i.iov_len) + .max() + .unwrap_or_default() + .min(super::super::MAX_KERNEL_BUF_SIZE) + ]; + for iov in iovs { + if iov.iov_len == 0 { + continue; + } + let Ok(_iov_len) = isize::try_from(iov.iov_len) else { + return Err(Errno::EINVAL); + }; + // TODO: The data transfers performed by readv() and writev() are atomic + let size = files + .run_on_raw_fd( + raw_fd, + |fd| { + files + .fs + .read(fd, &mut kernel_buffer, None) + .map_err(Errno::from) + }, + |_fd| todo!("net"), + |_fd| unreachable!(), // pipes handled above + |_fd| todo!("eventfd"), + |_fd| Err(Errno::EINVAL), + |_fd| todo!("unix"), + ) + .flatten()?; + iov.iov_base + .copy_from_slice(0, &kernel_buffer[..size]) + .ok_or(Errno::EFAULT)?; + total_read += size; + if size < iov.iov_len { + // Okay to transfer fewer bytes than requested + break; + } } } Ok(total_read) @@ -695,14 +747,21 @@ impl Task { ) }) }, - |_fd| todo!("pipes"), + |fd| { + write_to_iovec(iovs, |buf| { + self.global + .pipes + .write(&self.wait_cx(), fd, buf) + .map_err(Errno::from) + }) + }, |_fd| todo!("eventfd"), |_fd| Err(Errno::EINVAL), |_fd| todo!("unix"), ) .flatten(); if let Err(Errno::EPIPE) = res { - unimplemented!("send SIGPIPE to the current task"); + self.raise_sigpipe(); } res } diff --git a/litebox_shim_linux/src/syscalls/mm.rs b/litebox_shim_linux/src/syscalls/mm.rs index ce6c3513c..501b4645a 100644 --- a/litebox_shim_linux/src/syscalls/mm.rs +++ b/litebox_shim_linux/src/syscalls/mm.rs @@ -45,7 +45,7 @@ impl Task { op: impl FnOnce(MutPtr) -> Result, ) -> Result, MappingError> { litebox_common_linux::mm::do_mmap( - &self.global.pm, + &self.process.borrow().pm, suggested_addr, len, prot, @@ -175,7 +175,7 @@ impl Task { // SAFETY: ptr is the freshly CoW-mapped region of exactly `len` bytes with // `permissions`. unsafe { - self.global.pm.register_existing_mapping( + self.process.borrow().pm.register_existing_mapping( range, permissions, true, @@ -303,7 +303,7 @@ impl Task { /// Handle syscall `munmap` #[inline] pub(crate) fn sys_munmap(&self, addr: crate::MutPtr, len: usize) -> Result<(), Errno> { - litebox_common_linux::mm::sys_munmap(&self.global.pm, addr, len) + litebox_common_linux::mm::sys_munmap(&self.process.borrow().pm, addr, len) } /// Handle syscall `mprotect` @@ -314,7 +314,7 @@ impl Task { len: usize, prot: ProtFlags, ) -> Result<(), Errno> { - litebox_common_linux::mm::sys_mprotect(&self.global.pm, addr, len, prot) + litebox_common_linux::mm::sys_mprotect(&self.process.borrow().pm, addr, len, prot) } #[inline] @@ -327,7 +327,7 @@ impl Task { new_addr: usize, ) -> Result, Errno> { litebox_common_linux::mm::sys_mremap( - &self.global.pm, + &self.process.borrow().pm, old_addr, old_size, new_size, @@ -339,7 +339,7 @@ impl Task { /// Handle syscall `brk` #[inline] pub(crate) fn sys_brk(&self, addr: MutPtr) -> Result { - litebox_common_linux::mm::sys_brk(&self.global.pm, addr) + litebox_common_linux::mm::sys_brk(&self.process.borrow().pm, addr) } /// Handle syscall `madvise` @@ -350,7 +350,7 @@ impl Task { len: usize, advice: litebox_common_linux::MadviseBehavior, ) -> Result<(), Errno> { - litebox_common_linux::mm::sys_madvise(&self.global.pm, addr, len, advice) + litebox_common_linux::mm::sys_madvise(&self.process.borrow().pm, addr, len, advice) } } diff --git a/litebox_shim_linux/src/syscalls/net.rs b/litebox_shim_linux/src/syscalls/net.rs index d17574814..63af1b111 100644 --- a/litebox_shim_linux/src/syscalls/net.rs +++ b/litebox_shim_linux/src/syscalls/net.rs @@ -751,8 +751,8 @@ impl GlobalState { let is_nonblock = self.get_status(fd).contains(OFlags::NONBLOCK) || flags.contains(SendFlags::DONTWAIT); - let ret = cx - .with_timeout(timeout) + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) + cx.with_timeout(timeout) .wait_on_events( is_nonblock, Events::OUT, @@ -766,13 +766,8 @@ impl GlobalState { Err(e) => Err(TryOpError::Other(Errno::from(e))), }, ) - .map_err(Errno::from); - if let Err(Errno::EPIPE) = ret - && !flags.contains(SendFlags::NOSIGNAL) - { - unimplemented!("send signal SIGPIPE on EPIPE"); - } - ret + .map_err(Errno::from) + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) } /// Receive data via socket channel (lock-free path). @@ -1340,7 +1335,7 @@ impl Task { flags: SendFlags, sockaddr: Option, ) -> Result { - self.files.borrow().with_socket( + let ret = self.files.borrow().with_socket( &self.global, sockfd, |fd| { @@ -1358,7 +1353,13 @@ impl Task { .transpose()?; file.sendto(self, buf, flags, addr) }, - ) + ); + if let Err(Errno::EPIPE) = ret + && !flags.contains(SendFlags::NOSIGNAL) + { + self.raise_sigpipe(); + } + ret } /// Handle syscall `sendmsg` @@ -1399,7 +1400,7 @@ impl Task { .msg_iov .to_owned_slice(msg.msg_iovlen) .ok_or(Errno::EFAULT)?; - self.files.borrow().with_socket( + let ret = self.files.borrow().with_socket( &self.global, sockfd, |fd| { @@ -1440,7 +1441,13 @@ impl Task { } Ok(total_sent) }, - ) + ); + if let Err(Errno::EPIPE) = ret + && !flags.contains(SendFlags::NOSIGNAL) + { + self.raise_sigpipe(); + } + ret } /// Handle syscall `recvfrom` diff --git a/litebox_shim_linux/src/syscalls/process.rs b/litebox_shim_linux/src/syscalls/process.rs index 69a41b70b..036ce0fb9 100644 --- a/litebox_shim_linux/src/syscalls/process.rs +++ b/litebox_shim_linux/src/syscalls/process.rs @@ -8,7 +8,7 @@ use alloc::boxed::Box; use alloc::collections::btree_map::BTreeMap; use alloc::sync::Arc; use alloc::vec::Vec; -use core::cell::Cell; +use core::cell::{Cell, RefCell}; use core::mem::offset_of; use core::ops::Range; use core::sync::atomic::{AtomicBool, Ordering}; @@ -19,7 +19,7 @@ use litebox::platform::ThreadProvider; use litebox::platform::{Instant as _, SystemTime as _, TimeProvider}; use litebox::platform::{ PunchthroughProvider as _, PunchthroughToken as _, RawConstPointer as _, RawMutex as _, - ThreadLocalStorageProvider as _, + RawMutexProvider, ThreadLocalStorageProvider as _, }; use litebox::platform::{RawMutPointer as _, TimerHandle, TimerProvider}; use litebox::sync::Mutex; @@ -29,6 +29,49 @@ use litebox_common_linux::{ }; use litebox_platform_multiplex::Platform; +/// One-shot signal from a vfork child to the parent, indicating the child +/// has called `execve` or `_exit` and the parent may resume. +/// +/// On userland, all forks are treated as vfork: the parent is suspended +/// while the child runs in the shared address space. When the child performs +/// exec (detaching to its own VA partition) or exits, it signals the parent +/// via this structure. +pub(crate) struct VforkDone { + /// 0 = not done, 1 = done. + futex: ::RawMutex, +} + +impl VforkDone { + fn new() -> Self { + Self { + futex: ::RawMutex::INIT, + } + } + + /// Signal that the child is done (called by child on exec or exit). + pub(crate) fn signal(&self) { + self.futex.underlying_atomic().store(1, Ordering::Release); + self.futex.wake_all(); + } + + /// Block until the child signals done (called by parent after spawning child). + fn wait(&self) { + loop { + if self.futex.underlying_atomic().load(Ordering::Acquire) != 0 { + return; + } + let _ = self.futex.block(0); + } + } +} + +/// Context carried by a fork child task so that exec and exit know +/// to signal the parent and (on exec) detach to a new address space. +pub(crate) struct ForkContext { + /// Signaled on exec or exit to wake the parent. + pub(crate) vfork_done: Arc, +} + /// Process-management-related state on [`Task`]. pub(crate) struct ThreadState { init_state: Cell, @@ -322,6 +365,12 @@ enum ThreadInitState { tls: Option, set_child_tid: Option>, }, + /// A fork child: starts with parent's register state, return value 0. + NewForkChild { + /// The guest FS base (TLS pointer) inherited from the parent. + #[cfg(target_arch = "x86_64")] + guest_fsbase: usize, + }, } /// Credentials of a process @@ -500,6 +549,19 @@ impl Task { pub(crate) fn prepare_for_exit(&mut self) { self.thread.detach_from_process(); + // Close all file descriptors when the process leader exits. + // Only the process leader (pid == tid) closes FDs, not worker threads + // (which share the same FD table and may exit during exec). + if self.pid == self.tid { + let files = self.files.borrow(); + let live_fds: alloc::vec::Vec = + files.raw_descriptor_store.read().iter_alive().collect(); + drop(files); + for fd in live_fds { + let _ = self.do_close(fd); + } + } + if let Some(clear_child_tid) = self.thread.clear_child_tid.take() { // Clear the child TID if requested // TODO: if we are the last thread, we don't need to clear it @@ -515,6 +577,78 @@ impl Task { if let Some(robust_list) = self.thread.robust_list.take() { let _ = wake_robust_list(robust_list); } + + // If this is the process leader (pid == tid) and it's exiting, + // record the exit in the process registry so waitpid can collect it. + // This must happen BEFORE signaling vfork_done, so that the parent + // can immediately waitpid after being unblocked. + if self.pid == self.tid + && let Some(process_id) = litebox::process::ProcessId::new(self.pid.cast_unsigned()) + { + // Get the exit status from the process thread group. + let exit_status = self.thread.process.inner.lock().exit_status; + let wait_status = match exit_status { + ExitStatus::Exit(code) => (u32::from(code.cast_unsigned()) & 0xff) << 8, + ExitStatus::Signal(sig) => sig.as_i32().cast_unsigned() & 0x7f, + }; + let notification = + self.global + .process_registry + .exit_process(process_id, wait_status, |orphan| { + // Reparent orphaned children to init (pid 1). + if let Some(init_pid) = litebox::process::ProcessId::new(1) { + let zombie_status = + self.global.process_registry.reparent(orphan, init_pid); + // If the orphan is already a zombie, notify init with SIGCHLD. + if let Some(exit_status) = zombie_status { + use litebox_common_linux::signal::Signal; + let siginfo = super::signal::siginfo_chld( + orphan.as_u32().cast_signed(), + exit_status, + ); + self.global.send_signal_to_process( + 1, // init pid + Signal::SIGCHLD, + siginfo, + ); + } + } + }); + + // Deliver SIGCHLD to the parent process. + if let Some(notif) = notification { + use litebox_common_linux::signal::Signal; + let siginfo = super::signal::siginfo_chld( + notif.child_pid.as_u32().cast_signed(), + notif.exit_status, + ); + self.global.send_signal_to_process( + notif.parent_pid.as_u32().cast_signed(), + Signal::SIGCHLD, + siginfo, + ); + } + } + + // Deregister the signal mailbox for this process. + if self.pid == self.tid { + self.global.deregister_signal_mailbox(self.pid); + } + + // NOTE: VA partition reclamation is deferred for future work. + // We cannot release the partition here because: + // 1. The thread is still executing (unwinding through host code) + // 2. Guest pages are still mapped in the partition's VA range + // 3. Releasing the ID would allow another fork to reuse the same range + // With 128 partitions, this is acceptable for the minimal implementation. + // Future: move cleanup to zombie reaping (waitpid/remove_process) and + // unmap all guest pages before releasing the partition ID. + + // If this is a vfork child that never exec'd, signal the parent. + // Done after exit recording so parent's waitpid sees the exit. + if let Some(fc) = self.fork_context.borrow_mut().take() { + fc.vfork_done.signal(); + } } pub(crate) fn sys_exit(&self, status: i32) { @@ -527,6 +661,53 @@ impl Task { // Tear down occurs similarly to `sys_exit`. self.exit_group(ExitStatus::Exit(status.truncate())); } + + /// wait4(pid, wstatus, options, rusage) — wait for a child process. + pub(crate) fn sys_wait4( + &self, + pid: i32, + wstatus: Option>, + options: i32, + ) -> Result { + const WNOHANG: i32 = 1; + + let parent_pid = + litebox::process::ProcessId::new(self.pid.cast_unsigned()).ok_or(Errno::ESRCH)?; + + loop { + // Pick up any cross-process signals (e.g., SIGCHLD from exiting children). + self.drain_cross_process_signals(); + + // Snapshot the exit epoch BEFORE try_wait to avoid a race where a + // child exits between try_wait and the blocking call. + let epoch = self.global.process_registry.exit_epoch(); + + match self.global.process_registry.try_wait(parent_pid, pid) { + Err(()) => { + // No matching children at all — ECHILD. + return Err(Errno::ECHILD); + } + Ok(Some((child_pid, status))) => { + // Reaped a child. + if let Some(wstatus) = wstatus { + let _ = wstatus.write_at_offset(0, status.cast_signed()); + } + return Ok(child_pid.as_u32() as usize); + } + Ok(None) => { + // Children exist but none exited yet. + if options & WNOHANG != 0 { + return Ok(0); + } + // Block until some child exits (using epoch snapshot from + // before try_wait to avoid missed wakeups). + self.global + .process_registry + .wait_for_child_exit_since(epoch); + } + } + } + } } /// A descriptor for thread-local storage (TLS). @@ -577,12 +758,180 @@ impl Task { /// Creates a new thread or process. /// - /// Note we currently only support creating threads with the VM, FS, and FILES flags set. + /// If `CLONE_THREAD` is set, creates a new thread in the current process. + /// Otherwise, treats the clone as a fork (vfork semantics: parent is + /// suspended until child calls exec or exits). fn do_clone( &self, ctx: &litebox_common_linux::PtRegs, args: &litebox_common_linux::CloneArgs, clone3: bool, + ) -> Result { + let litebox_common_linux::CloneArgs { mut flags, .. } = *args; + + // `CLONE_DETACHED` is ignored but has been reserved for reuse with + // `clone3` or in combination with `CLONE_PIDFD`. + if !clone3 && !flags.contains(CloneFlags::PIDFD) { + flags.remove(CloneFlags::DETACHED); + } + + if !flags.contains(CloneFlags::THREAD) { + // This is a fork (or vfork). Route to fork path. + return self.do_fork(ctx, args); + } + + // Thread clone path — requires VM, THREAD, SIGHAND, FILES. + self.do_thread_clone(ctx, args, clone3) + } + + /// Fork: create a new child process with vfork semantics. + /// + /// On userland, all forks are treated as vfork: the parent is suspended + /// while the child runs in the parent's shared address space. When the + /// child calls `execve` (detaching to its own VA partition) or `_exit`, + /// the parent is woken. + /// + /// The child gets: + /// - A new PID/TID + /// - Its own cloned FD table (close in child doesn't affect parent) + /// - The parent's ProcessState (shared memory, shared PageManager) + /// - A `ForkContext` so exec/exit can signal the parent + fn do_fork( + &self, + ctx: &litebox_common_linux::PtRegs, + args: &litebox_common_linux::CloneArgs, + ) -> Result { + const MAX_SIGNAL_NUMBER: u64 = 64; + let litebox_common_linux::CloneArgs { + exit_signal, + set_tid, + set_tid_size, + cgroup, + .. + } = *args; + + if cgroup != 0 { + log_unsupported!("fork with cgroup"); + return Err(Errno::EINVAL); + } + if set_tid != 0 || set_tid_size != 0 { + log_unsupported!("fork with set_tid"); + return Err(Errno::EINVAL); + } + + // Validate exit_signal (typically SIGCHLD for fork). + if exit_signal > MAX_SIGNAL_NUMBER { + return Err(Errno::EINVAL); + } + + // Guard: fork from a multi-threaded process is not supported. + { + let inner = self.thread.process.inner.lock(); + if inner.threads.len() > 1 { + log_unsupported!( + "fork from multi-threaded process (pid={}, {} threads) is not supported", + self.pid, + inner.threads.len() + ); + return Err(Errno::ENOSYS); + } + } + + // Register the child process in the process registry. + let parent_process_id = + litebox::process::ProcessId::new(self.pid.cast_unsigned()).expect("parent PID is 0"); + let child_process_id = self + .global + .process_registry + .create_process(Some(parent_process_id)) + .map_err(|_| Errno::EAGAIN)?; + let child_pid = child_process_id.as_u32().cast_signed(); + + // Advance the thread ID counter past the child PID to avoid collisions + // between PIDs and TIDs. Use saturating_add to prevent overflow. + let _ = self + .global + .next_thread_id + .fetch_max(child_pid.saturating_add(1), Ordering::Relaxed); + + // Clone the FD table for the child, incrementing process_refcounts in the global descriptor table. + let child_files = { + let mut dt = self.global.litebox.descriptor_table_mut(); + Arc::new(self.files.borrow().clone_for_fork(&mut dt)) + }; + + // Capture the parent's guest FS base for the child. + #[cfg(target_arch = "x86_64")] + let guest_fsbase = { + let punchthrough = litebox_common_linux::PunchthroughSyscall::GetFsBase; + let token = self + .global + .platform + .get_punchthrough_token_for(punchthrough) + .ok_or(Errno::ENOSYS)?; + token.execute().map_err(|_| Errno::EFAULT)? + }; + + // Create the vfork synchronization. + let vfork_done = Arc::new(VforkDone::new()); + + // Build the child task. The child shares the parent's ProcessState + // (and thus PageManager / address space) until it execs. + let child_thread = ThreadState::new_process(child_pid); + child_thread.init_state.set(ThreadInitState::NewForkChild { + #[cfg(target_arch = "x86_64")] + guest_fsbase, + }); + + let child_task = Task { + global: self.global.clone(), + process: RefCell::new(self.process.borrow().clone()), // shared address space + wait_state: crate::wait::WaitState::new(self.global.platform), + thread: child_thread, + pid: child_pid, + ppid: self.pid, + tid: child_pid, + credentials: self.credentials.clone(), + comm: self.comm.clone(), + fs: RefCell::new((*self.fs.borrow()).clone()), + files: RefCell::new(child_files), + signals: self.signals.clone_for_fork(), + fork_context: RefCell::new(Some(ForkContext { + vfork_done: vfork_done.clone(), + })), + signal_mailbox: self.global.register_signal_mailbox(child_pid), + }; + + // Spawn the child as a new host thread. + let r = unsafe { + self.global + .platform + .spawn_thread(ctx, Box::new(NewThreadArgs { task: child_task })) + }; + if let Err(err) = r { + litebox_util_log::error!(err:% = err; "failed to spawn fork child"); + // The child_task was dropped by spawn_thread's failure path, which + // triggered prepare_for_exit (closing FDs, recording exit, deregistering + // mailbox). Clean up the zombie registry entry so the parent doesn't + // see a phantom child from a failed fork. + self.global + .process_registry + .remove_process(child_process_id); + return Err(Errno::ENOMEM); + } + + // Parent blocks here until child execs or exits. + vfork_done.wait(); + + Ok(usize::try_from(child_pid).unwrap()) + } + + /// Creates a new thread within the current process. + fn do_thread_clone( + &self, + ctx: &litebox_common_linux::PtRegs, + args: &litebox_common_linux::CloneArgs, + clone3: bool, ) -> Result { const MAX_SIGNAL_NUMBER: u64 = 64; @@ -689,6 +1038,10 @@ impl Task { }; let child_tid = self.global.next_thread_id.fetch_add(1, Ordering::Relaxed); + // Keep PID counter in sync with TID counter to avoid namespace collisions. + self.global + .process_registry + .advance_next_pid(child_tid.saturating_add(1).cast_unsigned()); if let Some(parent_tid_ptr) = set_parent_tid { let _ = parent_tid_ptr.write_at_offset(0, child_tid); } @@ -717,6 +1070,7 @@ impl Task { Box::new(NewThreadArgs { task: Task { global: self.global.clone(), + process: RefCell::new(self.process.borrow().clone()), wait_state: crate::wait::WaitState::new(self.global.platform), thread, pid: self.pid, @@ -727,6 +1081,8 @@ impl Task { fs: fs.into(), files: self.files.clone(), // TODO: !CLONE_FILES support signals: self.signals.clone_for_new_task(), + fork_context: RefCell::new(None), + signal_mailbox: self.signal_mailbox.clone(), // share parent's mailbox }, }), ) @@ -1183,6 +1539,65 @@ impl Task { self.ppid } + /// Handle syscall `setpgid`. + #[allow(clippy::similar_names)] + pub(crate) fn sys_setpgid(&self, pid: i32, pgid: i32) -> Result<(), Errno> { + let target_pid = if pid == 0 { self.pid } else { pid }; + let target_pgid = if pgid == 0 { target_pid } else { pgid }; + let Some(target) = litebox::process::ProcessId::new(target_pid.cast_unsigned()) else { + return Err(Errno::ESRCH); + }; + let Some(pg) = litebox::process::ProcessId::new(target_pgid.cast_unsigned()) else { + return Err(Errno::EINVAL); + }; + // POSIX: can only setpgid on self or a child process. + let my_pid = + litebox::process::ProcessId::new(self.pid.cast_unsigned()).ok_or(Errno::ESRCH)?; + if target != my_pid { + let is_child = self + .global + .process_registry + .with_context(my_pid, |ctx| ctx.children().contains(&target)) + .unwrap_or(false); + if !is_child { + return Err(Errno::ESRCH); + } + } + self.global + .process_registry + .set_pgid(target, pg) + .map_err(|()| Errno::ESRCH) + } + + /// Handle syscall `getpgid`. + pub(crate) fn sys_getpgid(&self, pid: i32) -> Result { + let target_pid = if pid == 0 { self.pid } else { pid }; + let Some(target) = litebox::process::ProcessId::new(target_pid.cast_unsigned()) else { + return Err(Errno::ESRCH); + }; + self.global + .process_registry + .with_context(target, |ctx| ctx.pgid.as_u32().cast_signed()) + .ok_or(Errno::ESRCH) + } + + /// Handle syscall `getpgrp` (equivalent to getpgid(0)). + pub(crate) fn sys_getpgrp(&self) -> Result { + self.sys_getpgid(0) + } + + /// Handle syscall `setsid`. + pub(crate) fn sys_setsid(&self) -> Result { + let Some(pid) = litebox::process::ProcessId::new(self.pid.cast_unsigned()) else { + return Err(Errno::EPERM); + }; + self.global + .process_registry + .setsid(pid) + .map_err(|()| Errno::EPERM)?; + Ok(self.pid) + } + /// Handle syscall `getuid`. pub(crate) fn sys_getuid(&self) -> u32 { self.credentials.uid @@ -1342,6 +1757,40 @@ fn parse_shebang(buf: &[u8]) -> Option<(&str, Option<&str>)> { } impl Task { + /// Search $PATH for a binary name (no '/' in path). + /// Returns the full path if found, or ENOENT. + fn resolve_path_lookup( + &self, + name: &str, + envp: &[alloc::ffi::CString], + ) -> Result { + let path_env = envp + .iter() + .find_map(|e| { + let s = e.to_str().ok()?; + s.strip_prefix("PATH=") + }) + .unwrap_or("/usr/bin:/bin"); + + for dir in path_env.split(':') { + let candidate = if dir.is_empty() { + alloc::format!("./{name}") + } else { + alloc::format!("{dir}/{name}") + }; + // Check if the file exists by trying to open it. + if let Ok(fd) = self.sys_open( + candidate.as_str(), + litebox::fs::OFlags::RDONLY, + litebox::fs::Mode::empty(), + ) { + let _ = self.do_close(fd as usize); + return Ok(candidate); + } + } + Err(Errno::ENOENT) + } + /// Resolve shebang (`#!`) chains for the given path and argv if the file starts with a shebang line. /// Otherwise, returns the original path and argv. pub(crate) fn resolve_shebang( @@ -1386,6 +1835,29 @@ impl Task { Err(Errno::ELOOP) } + /// Detach from the parent's shared address space to a new VA partition. + /// + /// Called during exec of a vfork child. Creates a new address space via the + /// platform, builds a new `ProcessState` with a `PageManager` scoped to that + /// partition's VA range, and replaces `self.process`. + fn detach_to_new_address_space(&self) -> Result<(), Errno> { + use litebox::platform::AddressSpaceProvider; + + let platform = self.global.platform; + let as_id = platform.create_address_space().map_err(|_| Errno::ENOMEM)?; + let range = platform + .address_space_range(as_id) + .map_err(|_| Errno::ENOMEM)?; + + let new_process = Arc::new(crate::ProcessState { + pm: litebox::mm::PageManager::new_with_range(&self.global.litebox, range), + address_space_id: Some(as_id), + }); + + *self.process.borrow_mut() = new_process; + Ok(()) + } + /// Handle syscall `execve`. pub(crate) fn sys_execve( &self, @@ -1443,7 +1915,14 @@ impl Task { copy_vector(envp, "envp")? }; - let (path, argv_vec) = self.resolve_shebang(alloc::string::String::from(path), argv_vec)?; + // PATH resolution: if path doesn't contain '/', search $PATH. + let path = if path.contains('/') { + alloc::string::String::from(path) + } else { + self.resolve_path_lookup(path, &envp_vec)? + }; + + let (path, argv_vec) = self.resolve_shebang(path, argv_vec)?; let loader = crate::loader::elf::ElfLoader::new(self, &path)?; @@ -1467,9 +1946,25 @@ impl Task { self.signals.reset_for_exec(); + // If this is a vfork child, detach to a new address space before + // releasing memory (so we don't destroy the parent's mappings). + let vfork_done = self + .fork_context + .borrow_mut() + .take() + .map(|fc| fc.vfork_done); + if let Some(ref vd) = vfork_done + && let Err(e) = self.detach_to_new_address_space() + { + // Signal the parent before returning error — otherwise parent + // hangs forever waiting on vfork_done. + vd.signal(); + return Err(e); + } + // Don't release reserved mappings. let release = |_r: Range, vm: VmFlags| !vm.is_empty(); - unsafe { self.global.pm.release_memory(release) } + unsafe { self.process.borrow().pm.release_memory(release) } .expect("failed to release memory mappings"); litebox_platform_multiplex::Platform::clear_guest_thread_local_storage(); @@ -1477,6 +1972,12 @@ impl Task { self.load_program(loader, argv_vec, envp_vec) .expect("TODO: terminate the process cleanly"); + // Signal the parent that the vfork child has exec'd and detached. + // The parent's address space is intact; it can safely resume. + if let Some(vd) = vfork_done { + vd.signal(); + } + self.init_thread_context(ctx); Ok(0) } @@ -1575,6 +2076,19 @@ impl Task { let _ = child_tid_ptr.write_at_offset(0, self.tid); } } + ThreadInitState::NewForkChild { + #[cfg(target_arch = "x86_64")] + guest_fsbase, + } => { + // Fork child: return 0 from the fork syscall. + #[cfg(target_arch = "x86_64")] + { + ctx.rax = 0; + // Restore the parent's guest FS base (TLS) on this new host thread. + self.sys_arch_prctl(ArchPrctlArg::SetFs(guest_fsbase)) + .expect("failed to set guest fsbase for fork child"); + } + } } } } diff --git a/litebox_shim_linux/src/syscalls/signal/mod.rs b/litebox_shim_linux/src/syscalls/signal/mod.rs index af5764700..f78d97357 100644 --- a/litebox_shim_linux/src/syscalls/signal/mod.rs +++ b/litebox_shim_linux/src/syscalls/signal/mod.rs @@ -91,6 +91,45 @@ impl SignalState { } } + /// Clone signal state for a fork child (new process). + /// + /// Unlike `clone_for_new_task` (for threads within the same process), fork creates + /// a new process that gets: + /// - Independent signal handlers (deep-cloned, not shared) + /// - Fresh process-wide pending signals (new process, no inherited pending) + /// - Parent's blocked signal mask (inherited) + /// - Fresh per-thread pending (as with new_task) + /// - Fresh altstack + pub fn clone_for_fork(&self) -> Self { + // Deep-clone handlers: copy the inner data into a new Arc + let parent_handlers = self.handlers.borrow(); + let cloned_handlers_inner = parent_handlers.inner.lock().clone(); + let new_handlers = Arc::new(SignalHandlers { + inner: Mutex::new(cloned_handlers_inner), + }); + + Self { + pending: RefCell::new(PendingSignals::new()), + shared_pending: Arc::new(Mutex::new(PendingSignals::new())), + blocked: Cell::new(self.blocked.get()), + handlers: RefCell::new(new_handlers), + altstack: SigAltStack { + flags: SsFlags::DISABLE, + sp: 0, + size: 0, + #[cfg(target_arch = "x86_64")] + __pad: 0, + } + .into(), + last_exception: Cell::new(litebox::shim::ExceptionInfo { + exception: litebox::shim::Exception(0), + error_code: 0, + cr2: 0, + kernel_mode: false, + }), + } + } + /// Resets signal state for an `execve` call. pub(crate) fn reset_for_exec(&self) { let mut handlers = self.handlers.borrow_mut(); @@ -293,6 +332,39 @@ pub(crate) fn siginfo_kill(signal: Signal) -> Siginfo { } } +/// Creates a `Siginfo` for SIGCHLD when a child process exits. +/// `wait_status` is the wait-encoded status: `(code & 0xff) << 8` for normal exit, +/// `sig & 0x7f` for signal death. +pub(crate) fn siginfo_chld(child_pid: i32, wait_status: u32) -> Siginfo { + const CLD_EXITED: i32 = 1; + const CLD_KILLED: i32 = 2; + + // Decode wait_status: bits 6..0 == 0 means normal exit (status in bits 15..8), + // otherwise killed by signal (signal number in bits 6..0). + let (code, si_status) = if wait_status.trailing_zeros() >= 7 { + // Normal exit: status is in bits 15..8 + (CLD_EXITED, (wait_status >> 8) & 0xff) + } else { + // Killed by signal: signal number is in bits 6..0 + (CLD_KILLED, wait_status & 0x7f) + }; + + // Build sigchld data: { pid: i32, uid: u32, status: i32, utime: i64, stime: i64 } + let mut data = SiginfoData::new_zeroed(); + // Layout: pid at offset 0, uid at offset 4, status at offset 8 + data.pad[0] = child_pid.cast_unsigned(); + data.pad[1] = 0; // uid + data.pad[2] = si_status; + Siginfo { + signo: Signal::SIGCHLD.as_i32(), + errno: 0, + code, + #[cfg(target_arch = "x86_64")] + __pad: 0, + data, + } +} + impl SignalState { /// Updates the blocked signal mask. fn set_signal_mask(&self, mask: SigSet) { @@ -518,10 +590,60 @@ impl Task { fn do_kill(&self, pid: Option, tid: Option, signal: i32) -> Result { let signal = Signal::try_from(signal)?; if pid.is_none_or(|pid| pid == self.pid) && tid.is_none_or(|tid| tid == self.tid) { + // Signal to self self.send_signal(signal, siginfo_kill(signal)); Ok(0) + } else if tid.is_none() + && let Some(target_pid) = pid + { + // Process-directed signal to another process. + // pid > 0: send to specific process + // pid == 0: send to own process group (TODO: process groups) + // pid == -1: send to all processes (TODO) + // pid < -1: send to process group |pid| (TODO: process groups) + if target_pid > 0 { + if self + .global + .send_signal_to_process(target_pid, signal, siginfo_kill(signal)) + { + Ok(0) + } else { + Err(Errno::ESRCH) + } + } else if target_pid == 0 || target_pid < -1 { + // kill(0, sig) -> own process group + // kill(-pgid, sig) -> specific process group + let pgid_raw = if target_pid == 0 { + // Get own pgid + let my_pid = litebox::process::ProcessId::new(self.pid.cast_unsigned()) + .ok_or(Errno::ESRCH)?; + self.global + .process_registry + .with_context(my_pid, |ctx| ctx.pgid) + .ok_or(Errno::ESRCH)? + } else { + litebox::process::ProcessId::new((-target_pid).cast_unsigned()) + .ok_or(Errno::ESRCH)? + }; + let pids = self.global.process_registry.pids_in_group(pgid_raw); + if pids.is_empty() { + return Err(Errno::ESRCH); + } + for pid in pids { + self.global.send_signal_to_process( + pid.as_u32().cast_signed(), + signal, + siginfo_kill(signal), + ); + } + Ok(0) + } else { + // pid == -1: send to all processes (not supported) + log_unsupported!("sys_kill with pid=-1 (broadcast) not yet supported"); + Err(Errno::ESRCH) + } } else { - log_unsupported!("sys_{{t|tg}}kill with remote pid/tid"); + log_unsupported!("sys_tgkill with remote pid/tid"); Err(Errno::ESRCH) } } @@ -676,6 +798,23 @@ impl Task { .push(&self.process().limits, signal, siginfo); } + /// Raise SIGPIPE on the current task (used when write/send gets EPIPE). + pub(crate) fn raise_sigpipe(&self) { + use zerocopy::FromZeros; + let data = SiginfoData::new_zeroed(); + self.send_signal( + Signal::SIGPIPE, + Siginfo { + signo: Signal::SIGPIPE.as_i32(), + errno: 0, + code: SI_KERNEL, + #[cfg(target_arch = "x86_64")] + __pad: 0, + data, + }, + ); + } + /// Sends a process-directed signal (stored in shared_pending). pub(crate) fn send_shared_signal(&self, signal: Signal, siginfo: Siginfo) { if self.is_signal_ignored(signal) { @@ -687,6 +826,16 @@ impl Task { .push(&self.process().limits, signal, siginfo); } + /// Drain cross-process signals from the mailbox into local shared_pending. + /// Call this periodically (e.g., on wait/syscall return) to pick up signals + /// from other processes (e.g., SIGCHLD from exiting children). + pub(crate) fn drain_cross_process_signals(&self) { + let mut mailbox = self.signal_mailbox.lock(); + while let Some((signal, siginfo)) = mailbox.pop_front() { + self.send_shared_signal(signal, siginfo); + } + } + /// Forces a signal to be delivered on next call to `check_for_signals`. fn force_signal(&self, signal: Signal, force_exit: bool) { let siginfo = Siginfo { diff --git a/litebox_shim_linux/src/syscalls/unix.rs b/litebox_shim_linux/src/syscalls/unix.rs index 6aec592d8..522783892 100644 --- a/litebox_shim_linux/src/syscalls/unix.rs +++ b/litebox_shim_linux/src/syscalls/unix.rs @@ -1241,21 +1241,15 @@ impl UnixSocket { let is_nonblocking = flags.contains(SendFlags::DONTWAIT) || self.get_status().contains(OFlags::NONBLOCK); let timeout = self.options.lock().send_timeout; - let ret = match &self.inner { + // Note: SIGPIPE is sent at the Task level (do_sendto/sys_sendmsg) + match &self.inner { UnixSocketInner::Stream(stream) => { stream.sendto(&task.wait_cx(), timeout, buf, is_nonblocking, addr) } UnixSocketInner::Datagram(datagram) => { datagram.sendto(task, timeout, buf, is_nonblocking, addr) } - }; - if let Err(Errno::EPIPE) = ret - && !flags.contains(SendFlags::NOSIGNAL) - { - // TODO: send SIGPIPE signal - unimplemented!("send SIGPIPE on EPIPE"); } - ret } pub(super) fn recvfrom( diff --git a/litebox_shim_linux/src/wait.rs b/litebox_shim_linux/src/wait.rs index 7ab43cee6..f2eb518d8 100644 --- a/litebox_shim_linux/src/wait.rs +++ b/litebox_shim_linux/src/wait.rs @@ -42,6 +42,7 @@ impl Task { self.queue_signals(signal); }); self.check_alarm_deadline(); + self.drain_cross_process_signals(); self.process_signals(ctx); !self.is_exiting() }) @@ -55,6 +56,7 @@ impl litebox::event::wait::CheckForInterrupt for Task { self.queue_signals(sig); }); self.check_alarm_deadline(); + self.drain_cross_process_signals(); self.is_exiting() || self.has_pending_signals() } }